Linux/vnops: implement STATX_DIOALIGN

This statx(2) mask returns the alignment restrictions for O_DIRECT
access on the given file.

We're expected to return both memory and IO alignment. For memory, it's
always PAGE_SIZE. For IO, we return the current block size for the file,
which is the required alignment for an arbitrary block, and for the
first block we'll fall back to the ARC when necessary, so it should
always work.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16972
This commit is contained in:
Rob Norris 2025-03-14 04:15:14 +11:00 committed by GitHub
parent 0433523ca2
commit 13ec35ce3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 548 additions and 2 deletions

View File

@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
#ifndef _SYS_FS_ZFS_VNOPS_H
@ -42,6 +43,8 @@ extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
extern int zfs_get_direct_alignment(znode_t *, uint64_t *);
extern int mappedread(znode_t *, int, zfs_uio_t *);
extern int mappedread_sf(znode_t *, int, zfs_uio_t *);
extern void update_pages(znode_t *, int64_t, int, objset_t *);

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
@ -30,6 +31,7 @@
#include <sys/zfs_vnops.h>
#include <sys/zfs_znode.h>
#include <sys/dmu_objset.h>
#include <sys/spa_impl.h>
#include <sys/vfs.h>
#include <sys/zpl.h>
#include <sys/file.h>
@ -490,6 +492,17 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
}
#endif
#ifdef STATX_DIOALIGN
if (request_mask & STATX_DIOALIGN) {
uint64_t align;
if (zfs_get_direct_alignment(zp, &align) == 0) {
stat->dio_mem_align = PAGE_SIZE;
stat->dio_offset_align = align;
stat->result_mask |= STATX_DIOALIGN;
}
}
#endif
#ifdef STATX_ATTR_IMMUTABLE
if (zp->z_pflags & ZFS_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;

View File

@ -25,6 +25,7 @@
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*/
/* Portions Copyright 2007 Jeremy Teo */
@ -1083,6 +1084,44 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
return (error);
}
/*
 * Report the IO alignment a caller should use for direct IO on this file so
 * that writes do not incur a read-modify-write penalty. On success the value
 * is stored in *alignp; it is never smaller than PAGE_SIZE.
 *
 * Returns 0 on success, or EOPNOTSUPP when direct IO is turned off, either by
 * the zfs_dio_enabled tunable or because the objset's os_direct setting is
 * ZFS_DIRECT_DISABLED.
 */
int
zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t blksz;

	if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED)
		return (SET_ERROR(EOPNOTSUPP));

	/*
	 * Once a file has more than one block its block size is fixed
	 * forever, so that block size is the ideal alignment.
	 *
	 * A file that still fits in a single block is different: that block
	 * may yet grow, up to the dataset maximum (ie, the recordsize). For
	 * such a file we report the maximum instead, so that a program that
	 * queries the alignment right after creating the file gets a value
	 * that stays valid once the file grows into its second block and
	 * beyond.
	 *
	 * There is no cheap block count available here, so "single block that
	 * can still grow" is detected indirectly: the apparent file size does
	 * not exceed the current block size, and the current block size is
	 * still below the dataset maximum.
	 */
	blksz = zp->z_blksz;
	if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz)
		blksz = zfsvfs->z_max_blksz;

	*alignp = MAX(blksz, PAGE_SIZE);
	return (0);
}
#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

View File

@ -981,7 +981,7 @@ tests = ['sparse_001_pos']
tags = ['functional', 'sparse']
[tests/functional/stat]
tests = ['stat_001_pos']
tests = ['stat_001_pos', 'statx_dioalign']
tags = ['functional', 'stat']
[tests/functional/suid]

View File

@ -128,6 +128,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+'
#
cfr_reason = 'Kernel copy_file_range support required'
#
# Some statx fields are not supported by all kernels
#
statx_reason = 'Needed statx(2) field not supported on this kernel'
if sys.platform.startswith('freebsd'):
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
else:
@ -293,7 +298,8 @@ if sys.platform.startswith('freebsd'):
'block_cloning/block_cloning_cross_enc_dataset':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_cross_dataset':
['SKIP', cfr_cross_reason]
['SKIP', cfr_cross_reason],
'stat/statx_dioalign': ['SKIP', 'na_reason'],
})
elif sys.platform.startswith('linux'):
maybe.update({
@ -361,6 +367,7 @@ elif sys.platform.startswith('linux'):
'mmp/mmp_active_import': ['FAIL', known_reason],
'mmp/mmp_exported_import': ['FAIL', known_reason],
'mmp/mmp_inactive_import': ['FAIL', known_reason],
'stat/statx_dioalign': ['SKIP', 'statx_reason'],
})

View File

@ -36,6 +36,7 @@
/rename_dir
/rm_lnkcnt_zero_file
/send_doall
/statx
/stride_dd
/threadsappend
/user_ns_exec
@ -54,3 +55,4 @@
/skein_test
/sha2_test
/idmap_util
/statx

View File

@ -126,6 +126,7 @@ if BUILD_LINUX
scripts_zfs_tests_bin_PROGRAMS += %D%/getversion
scripts_zfs_tests_bin_PROGRAMS += %D%/user_ns_exec
scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2
scripts_zfs_tests_bin_PROGRAMS += %D%/statx
scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest
scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet
scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util

304
tests/zfs-tests/cmd/statx.c Normal file
View File

@ -0,0 +1,304 @@
/*
* SPDX-License-Identifier: MIT
*
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>
/*
* statx() may be available in the kernel, but not in the libc, so we build
* our own wrapper if we can't link one.
*/
#ifndef __NR_statx
#if defined(__x86_64__)
#define __NR_statx (332)
#elif defined(__i386__)
#define __NR_statx (383)
#elif defined(__s390__)
#define __NR_statx (379)
#elif defined(__arm__)
#define __NR_statx (397)
#elif defined(__aarch64__)
#define __NR_statx (291)
#elif defined(__powerpc__)
#define __NR_statx (383)
#else
#error "no definition of __NR_statx for this platform"
#endif
#endif /* __NR_statx */
/*
 * Weak reference to a libc statx() wrapper. If nothing provides the symbol at
 * link time its address is NULL, which _statx() below checks for.
 */
int
statx(int, const char *, int, unsigned int, void *)
    __attribute__((weak));

/*
 * Call statx(2): through the libc wrapper when one was linked in, otherwise
 * through a raw syscall(2) using __NR_statx. Arguments and return value are
 * passed straight through in both cases.
 */
static inline int
_statx(int fd, const char *path, int flags, unsigned int mask, void *stx)
{
	if (statx == NULL)
		return (syscall(__NR_statx, fd, path, flags, mask, stx));

	return (statx(fd, path, flags, mask, stx));
}
#ifndef STATX_TYPE
#define STATX_TYPE (1<<0)
#endif
#ifndef STATX_MODE
#define STATX_MODE (1<<1)
#endif
#ifndef STATX_NLINK
#define STATX_NLINK (1<<2)
#endif
#ifndef STATX_UID
#define STATX_UID (1<<3)
#endif
#ifndef STATX_GID
#define STATX_GID (1<<4)
#endif
#ifndef STATX_ATIME
#define STATX_ATIME (1<<5)
#endif
#ifndef STATX_MTIME
#define STATX_MTIME (1<<6)
#endif
#ifndef STATX_CTIME
#define STATX_CTIME (1<<7)
#endif
#ifndef STATX_INO
#define STATX_INO (1<<8)
#endif
#ifndef STATX_SIZE
#define STATX_SIZE (1<<9)
#endif
#ifndef STATX_BLOCKS
#define STATX_BLOCKS (1<<10)
#endif
#ifndef STATX_BTIME
#define STATX_BTIME (1<<11)
#endif
#ifndef STATX_MNT_ID
#define STATX_MNT_ID (1<<12)
#endif
#ifndef STATX_DIOALIGN
#define STATX_DIOALIGN (1<<13)
#endif
/*
 * Timestamp as embedded in stx_t: whole seconds, nanoseconds, and explicit
 * padding. The assertion below pins the total size at 16 bytes.
 * NOTE(review): presumably mirrors the kernel's struct statx_timestamp;
 * confirm against <linux/stat.h>.
 */
typedef struct {
int64_t tv_sec;
uint32_t tv_nsec;
int32_t _pad;
} stx_timestamp_t;
_Static_assert(sizeof (stx_timestamp_t) == 0x10,
"stx_timestamp_t not 16 bytes");
/*
 * Result buffer handed to statx(2) via _statx(). It is declared locally
 * rather than taken from a system header, so the layout has to be kept right
 * by hand; the assertion below pins the total size at 256 bytes.
 * NOTE(review): presumably mirrors the kernel's struct statx field-for-field;
 * confirm against <linux/stat.h>.
 */
typedef struct {
/* Bitmask of STATX_* fields; main() checks it against the requested mask. */
uint32_t stx_mask;
uint32_t stx_blksize;
uint64_t stx_attributes;
uint32_t stx_nlink;
uint32_t stx_uid;
uint32_t stx_gid;
/* File type and permission bits together; main() splits them with S_IFMT. */
uint16_t stx_mode;
uint16_t _pad1;
uint64_t stx_ino;
uint64_t stx_size;
uint64_t stx_blocks;
uint64_t stx_attributes_mask;
stx_timestamp_t stx_atime;
stx_timestamp_t stx_btime;
stx_timestamp_t stx_ctime;
stx_timestamp_t stx_mtime;
uint32_t stx_rdev_major;
uint32_t stx_rdev_minor;
uint32_t stx_dev_major;
uint32_t stx_dev_minor;
uint64_t stx_mnt_id;
/* Printed together by main() for the "dioalign" field. */
uint32_t stx_dio_mem_align;
uint32_t stx_dio_offset_align;
/* Spare space; brings the struct up to the 256 bytes asserted below. */
uint64_t _pad2[12];
} stx_t;
_Static_assert(sizeof (stx_t) == 0x100, "stx_t not 256 bytes");
/*
 * Associates a field name, as accepted on the command line, with the STATX_*
 * mask bit used to request it.
 */
typedef struct {
const char *name;
unsigned int mask;
} stx_field_t;
/*
 * Table of all fields this program understands. usage() and main() walk it
 * until they reach the entry whose name is NULL, so that sentinel must stay
 * last.
 */
stx_field_t fields[] = {
{ "type", STATX_TYPE },
{ "mode", STATX_MODE },
{ "nlink", STATX_NLINK },
{ "uid", STATX_UID },
{ "gid", STATX_GID },
{ "atime", STATX_ATIME },
{ "mtime", STATX_MTIME },
{ "ctime", STATX_CTIME },
{ "ino", STATX_INO },
{ "size", STATX_SIZE },
{ "blocks", STATX_BLOCKS },
{ "btime", STATX_BTIME },
{ "mnt_id", STATX_MNT_ID },
{ "dioalign", STATX_DIOALIGN },
/* End-of-table sentinel. */
{ NULL },
};
/*
 * Print the usage line followed by the names of all known fields to stdout.
 * The name list is wrapped once a line would grow past 60 characters, and
 * each line of names is indented by one extra space.
 *
 * Always returns 1, so callers can return the result as the exit status.
 */
static int
usage(void)
{
	int col = 0;

	fputs("usage: statx <field[,field,field]> <file>\n"
	    "available fields:\n", stdout);

	for (size_t i = 0; fields[i].name != NULL; i++) {
		const char *name = fields[i].name;

		/* Wrap before a name that would overflow the current line. */
		if (col > 0 && (col + strlen(name) + 1) > 60) {
			putchar('\n');
			col = 0;
		}

		/* Extra indent at the start of each line; not counted in col. */
		if (col == 0)
			putchar(' ');

		col += printf(" %s", name);
	}

	/* Terminate the last, partially filled line. */
	if (col > 0)
		putchar('\n');

	return (1);
}
/*
 * statx <field[,field,field]> <file>
 *
 * Requests the named fields for <file> with statx(2) and prints each one as
 * "<name>: <value>" on its own line, in table order.
 *
 * Exit status:
 *   0  all requested fields were returned and printed
 *   1  bad usage, or open()/statx() failed
 *   2  the kernel did not return one or more of the requested fields
 */
int
main(int argc, char **argv)
{
	if (argc < 3)
		return (usage());

	/*
	 * Turn the comma-separated field list into a request mask. strsep()
	 * advances a private cursor so that argv[1] itself is not overwritten.
	 */
	unsigned int mask = 0;
	char *list = argv[1];
	char *name;
	while ((name = strsep(&list, ",")) != NULL) {
		stx_field_t *f;
		for (f = fields; f->name != NULL; f++) {
			if (strcmp(name, f->name) == 0) {
				mask |= f->mask;
				break;
			}
		}
		if (f->name == NULL) {
			fprintf(stderr, "unknown field name: %s\n", name);
			return (usage());
		}
	}

	/*
	 * O_PATH gives us a handle on the file without opening it for IO;
	 * statx is then asked about the descriptor itself (AT_EMPTY_PATH).
	 */
	int fd = open(argv[2], O_PATH);
	if (fd < 0) {
		fprintf(stderr, "open: %s: %s\n", argv[2], strerror(errno));
		return (1);
	}

	stx_t stx = {0};
	int err = _statx(fd, "",
	    AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, mask, &stx);
	/* The descriptor is not needed again; keep errno safe across close. */
	int saved_errno = errno;
	(void) close(fd);
	if (err < 0) {
		fprintf(stderr, "statx: %s: %s\n", argv[2],
		    strerror(saved_errno));
		return (1);
	}

	/*
	 * Report every requested field the kernel left out before printing
	 * anything, so a partial result is never mistaken for a full one.
	 */
	int rc = 0;
	for (stx_field_t *f = fields; f->name != NULL; f++) {
		if (!(mask & f->mask))
			continue;
		if (!(stx.stx_mask & f->mask)) {
			printf("statx: kernel did not return field: %s\n",
			    f->name);
			rc = 2;
		}
	}
	if (rc > 0)
		return (rc);

	/*
	 * 64-bit values are cast to (unsigned) long long and printed with
	 * %lld/%llu, because plain "long" is only 32 bits wide on some of the
	 * platforms this program builds for. Nanoseconds are zero-padded to
	 * nine digits so that "sec.nsec" reads as a correct decimal number.
	 */
	for (stx_field_t *f = fields; f->name != NULL; f++) {
		if (!(mask & f->mask))
			continue;
		switch (f->mask) {
		case STATX_TYPE:
			printf("type: %u\n",
			    (unsigned int)(stx.stx_mode & S_IFMT));
			break;
		case STATX_MODE:
			printf("mode: %u\n",
			    (unsigned int)(stx.stx_mode & ~S_IFMT));
			break;
		case STATX_NLINK:
			printf("nlink: %u\n", stx.stx_nlink);
			break;
		case STATX_UID:
			printf("uid: %u\n", stx.stx_uid);
			break;
		case STATX_GID:
			printf("gid: %u\n", stx.stx_gid);
			break;
		case STATX_ATIME:
			printf("atime: %lld.%09u\n",
			    (long long)stx.stx_atime.tv_sec,
			    stx.stx_atime.tv_nsec);
			break;
		case STATX_MTIME:
			printf("mtime: %lld.%09u\n",
			    (long long)stx.stx_mtime.tv_sec,
			    stx.stx_mtime.tv_nsec);
			break;
		case STATX_CTIME:
			printf("ctime: %lld.%09u\n",
			    (long long)stx.stx_ctime.tv_sec,
			    stx.stx_ctime.tv_nsec);
			break;
		case STATX_INO:
			printf("ino: %llu\n",
			    (unsigned long long)stx.stx_ino);
			break;
		case STATX_SIZE:
			printf("size: %llu\n",
			    (unsigned long long)stx.stx_size);
			break;
		case STATX_BLOCKS:
			printf("blocks: %llu\n",
			    (unsigned long long)stx.stx_blocks);
			break;
		case STATX_BTIME:
			printf("btime: %lld.%09u\n",
			    (long long)stx.stx_btime.tv_sec,
			    stx.stx_btime.tv_nsec);
			break;
		case STATX_MNT_ID:
			printf("mnt_id: %llu\n",
			    (unsigned long long)stx.stx_mnt_id);
			break;
		case STATX_DIOALIGN:
			printf("dioalign: %u %u\n",
			    stx.stx_dio_mem_align, stx.stx_dio_offset_align);
			break;
		default:
			break;
		}
	}

	return (0);
}

View File

@ -218,6 +218,7 @@ export ZFSTEST_FILES='badsend
rename_dir
rm_lnkcnt_zero_file
send_doall
statx
threadsappend
user_ns_exec
write_dos_attributes

View File

@ -2059,6 +2059,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/stat/cleanup.ksh \
functional/stat/setup.ksh \
functional/stat/stat_001_pos.ksh \
functional/stat/statx_dioalign.ksh \
functional/suid/cleanup.ksh \
functional/suid/setup.ksh \
functional/suid/suid_write_to_none.ksh \

View File

@ -0,0 +1,175 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
#
#
# Uses the statx helper to test the results of the STATX_DIOALIGN request as we
# manipulate DIO enable, dataset recordsize and file size and structure.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "both"
if ! is_linux ; then
log_unsupported "statx(2) only available on Linux"
fi
if [[ $(linux_version) -lt $(linux_version "6.1") ]] ; then
log_unsupported "STATX_DIOALIGN not available before Linux 6.1"
fi
CLAIM="STATX_DIOALIGN returns useful values when Direct IO is available."
TESTDS=${TESTPOOL}/${TESTFS}
TESTFILE=${TESTDIR}/${TESTFILE0}
log_must save_tunable DIO_ENABLED
typeset recordsize_saved=$(get_prop recordsize $TESTDS)
typeset direct_saved=$(get_prop direct $TESTDS)
# Registered with log_onexit below. Removes the test file, puts the dataset's
# recordsize and direct properties back to the values saved above, and restores
# the DIO_ENABLED tunable.
function cleanup
{
rm -f ${TESTFILE}
zfs set recordsize=$recordsize_saved $TESTDS
zfs set direct=$direct_saved $TESTDS
restore_tunable DIO_ENABLED
}
log_onexit cleanup
# assert_dioalign <file> <memalign> <ioalign>
#
# Run the statx helper for the dioalign field of <file> and require that the
# two numbers it prints (memory alignment, then IO alignment) equal the
# expected values.
function assert_dioalign
{
	typeset path=$1
	typeset -i want_mem=$2
	typeset -i want_io=$3

	# Drop the leading "dioalign:" label; what remains is "<mem> <io>".
	typeset -a got=($(statx dioalign $path | cut -f2- -d' '))

	log_note "statx dioalign returned: $path: mem=${got[0]} io=${got[1]}"
	log_must [ ${got[0]} -eq $want_mem -a ${got[1]} -eq $want_io ]
}
# assert_dioalign_failed <file>
#
# Require that the statx helper exits non-zero when asked for the dioalign
# field of <file>.
function assert_dioalign_failed
{
	log_mustnot statx dioalign $1
}
log_assert $CLAIM
# The mem alignment will always be PAGE_SIZE, so we need to know what that is.
typeset -i PAGE_SIZE=$(getconf PAGE_SIZE)
# Set recordsize to 128K, and make a 64K file (so only one block) for the
# sizing tests below.
log_must zfs set recordsize=128K $TESTDS
log_must dd if=/dev/urandom of=$TESTFILE bs=64k count=1
log_must zpool sync
# when DIO is disabled via tunable, statx will not return the dioalign result
# and the program fails
log_must set_tunable32 DIO_ENABLED 0
for d in disabled standard always ; do
log_must zfs set direct=$d $TESTDS
assert_dioalign_failed $TESTFILE
done
# when DIO is enabled via tunable, behaviour is dependent on the direct=
# property.
log_must set_tunable32 DIO_ENABLED 1
# when DIO is disabled via property, statx fails
log_must zfs set direct=disabled $TESTDS
assert_dioalign_failed $TESTFILE
# when DIO is enabled, the result should be mem=pagesize, io=recordsize
for d in standard always ; do
log_must zfs set direct=$d $TESTDS
assert_dioalign $TESTFILE $PAGE_SIZE 131072
done
# The IO size is the file's blocksize, unless it is in its first block and
# could grow to the recordsize. Our test file is currently a single 64K block,
# so any recordsize equal or larger than that should be used for the alignment.
for krs in 64 128 256 512 ; do
typeset -i rs=$((krs * 1024))
log_must zfs set recordsize=$rs $TESTDS
for d in standard always ; do
log_must zfs set direct=$d $TESTDS
assert_dioalign $TESTFILE $PAGE_SIZE $rs
done
done
# If the recordsize is no larger than the file's existing block size, then the
# file's block size will always be used.
for krs in 4 8 16 32 64 ; do
typeset -i rs=$((krs * 1024))
log_must zfs set recordsize=$rs $TESTDS
for d in standard always ; do
log_must zfs set direct=$d $TESTDS
assert_dioalign $TESTFILE $PAGE_SIZE 65536
done
done
# Now we extend the file into its second block. This effectively locks in its
# block size, which will always be returned regardless of recordsize changes.
log_must zfs set recordsize=128K $TESTDS
log_must dd if=/dev/urandom of=$TESTFILE bs=192K count=1
log_must zpool sync
# Confirm that no matter how we change the recordsize, the alignment remains at
# the block size.
for krs in 4 8 16 32 64 128 256 512 ; do
typeset -i rs=$((krs * 1024))
log_must zfs set recordsize=$rs $TESTDS
for d in standard always ; do
log_must zfs set direct=$d $TESTDS
assert_dioalign $TESTFILE $PAGE_SIZE 131072
done
done
# reset for write tests
log_must zfs set recordsize=16K $TESTDS
log_must zfs set direct=standard $TESTDS
# create an empty file, and fetch its alignment (which we know, so just test
# for it). then, do some O_DIRECT writes with that alignment. they should
# succeed.
log_must rm -f $TESTFILE
log_must touch $TESTFILE
log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_must dd if=/dev/urandom of=$TESTFILE bs=16384 count=16 oflag=direct
# same again, but writing with incorrect alignment, which should fail.
log_must rm -f $TESTFILE
log_must touch $TESTFILE
log_must zpool sync
assert_dioalign $TESTFILE $PAGE_SIZE 16384
log_mustnot dd if=/dev/urandom of=$TESTFILE bs=1024 count=256 oflag=direct
log_pass $CLAIM