mirror_zfs/module/zcommon/zfs_valstr.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2024, Klara Inc.
 */

#include <sys/fs/zfs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/string.h>
#include <sys/debug.h>
#include "zfs_valstr.h"

/*
 * Each bit in a bitfield has three possible string representations:
 * - single char
 * - two-char pair
 * - full name
 */
typedef struct {
        const char vb_bit;
        const char vb_pair[2];
        const char *vb_name;
} valstr_bit_t;
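
/*
 * For example, the zio_flag table below describes the SCRUB flag as
 * { '.', "SC", "SCRUB" }: '.' in the fixed-width bit view, "SC" in the
 * compact pair view, and "SCRUB" in the full-name view.
 */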

/*
 * Emits a character for each bit in `bits`, up to the number of elements
 * in the table. Set bits get the character in vb_bit, clear bits get a
 * space. This results in all strings having the same width, for easier
 * visual comparison.
 */
static size_t
valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
    uint64_t bits, char *out, size_t outlen)
{
        ASSERT(out);
        size_t n = 0;
        for (int b = 0; b < nelems; b++) {
                if (n == outlen)
                        break;
                uint64_t mask = (1ULL << b);
                out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
        }
        if (n < outlen)
                out[n++] = '\0';
        return (n);
}
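
/*
 * Illustrative sketch (hypothetical table, not compiled): with vb_bit
 * characters 'a', 'b' and 'c', bits = 0x5 would emit "a c" -- set bits 0
 * and 2 show their characters, clear bit 1 becomes a space, and the output
 * is always one character per table entry, plus a trailing NUL when it fits.
 */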

/*
 * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
 * separated by a `|` character. This gives a concise representation of the
 * whole value.
 */
static size_t
valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
    uint64_t bits, char *out, size_t outlen)
{
        ASSERT(out);
        size_t n = 0;
        for (int b = 0; b < nelems; b++) {
                ASSERT3U(n, <=, outlen);
                if (n == outlen)
                        break;
                uint64_t mask = (1ULL << b);
                if (bits & mask) {
                        size_t len = (n > 0) ? 3 : 2;
                        if (n > outlen - len)
                                break;
                        if (n > 0)
                                out[n++] = '|';
                        out[n++] = table[b].vb_pair[0];
                        out[n++] = table[b].vb_pair[1];
                }
        }
        if (n < outlen)
                out[n++] = '\0';
        return (n);
}
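
/*
 * Illustrative sketch (hypothetical table): with vb_pair values "AA", "BB"
 * and "CC", bits = 0x5 would emit "AA|CC" -- only set bits are printed,
 * joined by '|', and output stops early if the next pair (plus separator)
 * would not fit in outlen.
 */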

/*
 * Emits the full name for each bit set in `bits`, taken from vb_name, and
 * separated by a space. This unambiguously shows the entire set of bits, but
 * can get very long.
 */
static size_t
valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
    uint64_t bits, char *out, size_t outlen)
{
        ASSERT(out);
        size_t n = 0;
        for (int b = 0; b < nelems; b++) {
                ASSERT3U(n, <=, outlen);
                if (n == outlen)
                        break;
                uint64_t mask = (1ULL << b);
                if (bits & mask) {
                        size_t len = strlen(table[b].vb_name);
                        if (n > 0)
                                len++;
                        if (n > outlen - len)
                                break;
                        if (n > 0) {
                                out[n++] = ' ';
                                len--;
                        }
                        memcpy(&out[n], table[b].vb_name, len);
                        n += len;
                }
        }
        if (n < outlen)
                out[n++] = '\0';
        return (n);
}
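
/*
 * Illustrative sketch (hypothetical table): with vb_name values "ALPHA",
 * "BETA" and "GAMMA", bits = 0x5 would emit "ALPHA GAMMA" -- the full name
 * of each set bit, separated by single spaces, truncated at the last name
 * that fits completely.
 */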

/*
 * Emits the name of the given enum value in the table.
 */
static size_t
valstr_enum_str(const char **table, const size_t nelems,
    int v, char *out, size_t outlen)
{
        ASSERT(out);
        ASSERT3U(v, <, nelems);
        if (v >= nelems)
                return (0);
        return (MIN(strlcpy(out, table[v], outlen), outlen));
}
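
/*
 * Illustrative sketch: with the zio_priority table below, v = 0 copies
 * "SYNC_READ" into out and v = 4 copies "SCRUB"; an out-of-range v trips
 * the ASSERT3U in debug builds and otherwise returns 0 without touching out.
 */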

/*
 * These macros create the string tables for the given name, and implement
 * the public functions described in zfs_valstr.h.
 */
#define	_VALSTR_BITFIELD_IMPL(name, ...) \
static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ }; \
size_t \
zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \
{ \
        return (valstr_bitfield_bits(valstr_ ## name ## _table, \
            ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
 \
size_t \
zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \
{ \
        return (valstr_bitfield_pairs(valstr_ ## name ## _table, \
            ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
} \
 \
size_t \
zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \
{ \
        return (valstr_bitfield_str(valstr_ ## name ## _table, \
            ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
}

#define	_VALSTR_ENUM_IMPL(name, ...) \
static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \
size_t \
zfs_valstr_ ## name(int v, char *out, size_t outlen) \
{ \
        return (valstr_enum_str(valstr_ ## name ## _table, \
            ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \
}

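/*
 * Usage sketch (caller side; buffer name and size are arbitrary): for the
 * zio_flag table below, the macro above generates zfs_valstr_zio_flag(),
 * zfs_valstr_zio_flag_bits() and zfs_valstr_zio_flag_pairs(), used as:
 *
 *	char buf[64];
 *	zfs_valstr_zio_flag_pairs(zio->io_flags, buf, sizeof (buf));
 *
 * which might leave something like "SC|CF" in buf for a zio with the
 * SCRUB and CANFAIL flags set.
 */
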
/* String tables */
/* ZIO flags: zio_flag_t, typically zio->io_flags */
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_flag,
{ '.', "DA", "DONT_AGGREGATE" },
{ '.', "RP", "IO_REPAIR" },
{ '.', "SH", "SELF_HEAL" },
{ '.', "RS", "RESILVER" },
{ '.', "SC", "SCRUB" },
{ '.', "ST", "SCAN_THREAD" },
{ '.', "PH", "PHYSICAL" },
{ '.', "CF", "CANFAIL" },
{ '.', "SP", "SPECULATIVE" },
{ '.', "CW", "CONFIG_WRITER" },
{ '.', "DR", "DONT_RETRY" },
{ '?', "??", "[UNUSED 11]" },
{ '.', "ND", "NODATA" },
{ '.', "ID", "INDUCE_DAMAGE" },
{ '.', "AL", "IO_ALLOCATING" },
{ '.', "RE", "IO_RETRY" },
{ '.', "PR", "PROBE" },
{ '.', "TH", "TRYHARD" },
{ '.', "OP", "OPTIONAL" },
{ '.', "RD", "DIO_READ" },
{ '.', "DQ", "DONT_QUEUE" },
{ '.', "DP", "DONT_PROPAGATE" },
{ '.', "BY", "IO_BYPASS" },
{ '.', "RW", "IO_REWRITE" },
{ '.', "CM", "RAW_COMPRESS" },
{ '.', "EN", "RAW_ENCRYPT" },
{ '.', "GG", "GANG_CHILD" },
{ '.', "DD", "DDT_CHILD" },
{ '.', "GF", "GODFATHER" },
{ '.', "NP", "NOPWRITE" },
{ '.', "EX", "REEXECUTED" },
{ '.', "DG", "DELEGATED" },
{ '.', "DC", "DIO_CHKSUM_ERR" },
)
/* END CSTYLED */

/*
 * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
 * zio->io_pipeline.
 */
/* BEGIN CSTYLED */
_VALSTR_BITFIELD_IMPL(zio_stage,
{ 'O', "O ", "OPEN" },
{ 'I', "RI", "READ_BP_INIT" },
{ 'I', "WI", "WRITE_BP_INIT" },
{ 'I', "FI", "FREE_BP_INIT" },
{ 'A', "IA", "ISSUE_ASYNC" },
{ 'W', "WC", "WRITE_COMPRESS" },
{ 'E', "EN", "ENCRYPT" },
{ 'C', "CG", "CHECKSUM_GENERATE" },
{ 'N', "NW", "NOP_WRITE" },
{ 'B', "BF", "BRT_FREE" },
{ 'd', "dS", "DDT_READ_START" },
{ 'd', "dD", "DDT_READ_DONE" },
{ 'd', "dW", "DDT_WRITE" },
{ 'd', "dF", "DDT_FREE" },
{ 'G', "GA", "GANG_ASSEMBLE" },
{ 'G', "GI", "GANG_ISSUE" },
{ 'D', "DT", "DVA_THROTTLE" },
{ 'D', "DA", "DVA_ALLOCATE" },
{ 'D', "DF", "DVA_FREE" },
{ 'D', "DC", "DVA_CLAIM" },
{ 'R', "R ", "READY" },
{ 'V', "VS", "VDEV_IO_START" },
{ 'V', "VD", "VDEV_IO_DONE" },
{ 'V', "VA", "VDEV_IO_ASSESS" },
{ 'C', "CV", "CHECKSUM_VERIFY" },
{ 'C', "DC", "DIO_CHECKSUM_VERIFY" },
{ 'X', "X ", "DONE" },
)
/* END CSTYLED */
/* ZIO priority: zio_priority_t, typically zio->io_priority */
/* BEGIN CSTYLED */
_VALSTR_ENUM_IMPL(zio_priority,
"SYNC_READ",
"SYNC_WRITE",
"ASYNC_READ",
"ASYNC_WRITE",
"SCRUB",
"REMOVAL",
"INITIALIZING",
"TRIM",
"REBUILD",
"[NUM_QUEUEABLE]",
"NOW",
)
/* END CSTYLED */
#undef _VALSTR_BITFIELD_IMPL
#undef _VALSTR_ENUM_IMPL