From e73ada771d3e1b04b396df1c01facdb9a0aea7cc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 6 Mar 2026 13:07:46 +1100 Subject: [PATCH] libzpool: lift zfs_file ops out to separate source file So its easier to remove and replace on non-Unix platforms. Sponsored-by: TrueNAS Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Rob Norris Closes #18281 --- lib/libzpool/Makefile.am | 1 + lib/libzpool/kernel.c | 388 ---------------------------------- lib/libzpool/zfs_file_os.c | 420 +++++++++++++++++++++++++++++++++++++ 3 files changed, 421 insertions(+), 388 deletions(-) create mode 100644 lib/libzpool/zfs_file_os.c diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index f8f128268..819255307 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -16,6 +16,7 @@ dist_libzpool_la_SOURCES = \ %D%/kernel.c \ %D%/util.c \ %D%/vdev_label_os.c \ + %D%/zfs_file_os.c \ %D%/zfs_racct.c \ %D%/zfs_debug.c diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 197e0d178..e132d50a5 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -62,9 +62,6 @@ uint32_t hostid; -/* If set, all blocks read will be copied to the specified directory. */ -char *vn_dumpdir = NULL; - uint32_t zone_get_hostid(void *zonep) { @@ -495,391 +492,6 @@ zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, (void) spa, (void) oldname, (void) newname, (void) async; } -/* - * Open file - * - * path - fully qualified path to file - * flags - file attributes O_READ / O_WRITE / O_EXCL - * fpp - pointer to return file pointer - * - * Returns 0 on success underlying error on failure. - */ -int -zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) -{ - int fd; - int dump_fd; - int err; - int old_umask = 0; - zfs_file_t *fp; - struct stat64 st; - - if (!(flags & O_CREAT) && stat64(path, &st) == -1) - return (errno); - - if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) - flags |= O_DIRECT; - - if (flags & O_CREAT) - old_umask = umask(0); - - fd = open64(path, flags, mode); - if (fd == -1) - return (errno); - - if (flags & O_CREAT) - (void) umask(old_umask); - - if (vn_dumpdir != NULL) { - char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); - const char *inpath = zfs_basename(path); - - (void) snprintf(dumppath, MAXPATHLEN, - "%s/%s", vn_dumpdir, inpath); - dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); - umem_free(dumppath, MAXPATHLEN); - if (dump_fd == -1) { - err = errno; - close(fd); - return (err); - } - } else { - dump_fd = -1; - } - - (void) fcntl(fd, F_SETFD, FD_CLOEXEC); - - fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); - fp->f_fd = fd; - fp->f_dump_fd = dump_fd; - *fpp = fp; - - return (0); -} - -void -zfs_file_close(zfs_file_t *fp) -{ - close(fp->f_fd); - if (fp->f_dump_fd != -1) - close(fp->f_dump_fd); - - umem_free(fp, sizeof (zfs_file_t)); -} - -/* - * Stateful write - use os internal file pointer to determine where to - * write and update on successful completion. - * - * fp - pointer to file (pipe, socket, etc) to write to - * buf - buffer to write - * count - # of bytes to write - * resid - pointer to count of unwritten bytes (if short write) - * - * Returns 0 on success errno on failure. - */ -int -zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) -{ - ssize_t rc; - - rc = write(fp->f_fd, buf, count); - if (rc < 0) - return (errno); - - if (resid) { - *resid = count - rc; - } else if (rc != count) { - return (EIO); - } - - return (0); -} - -/* - * Stateless write - os internal file pointer is not updated. - * - * fp - pointer to file (pipe, socket, etc) to write to - * buf - buffer to write - * count - # of bytes to write - * off - file offset to write to (only valid for seekable types) - * resid - pointer to count of unwritten bytes - * - * Returns 0 on success errno on failure. - */ -int -zfs_file_pwrite(zfs_file_t *fp, const void *buf, - size_t count, loff_t pos, uint8_t ashift, ssize_t *resid) -{ - ssize_t rc, split, done; - int sectors; - - /* - * To simulate partial disk writes, we split writes into two - * system calls so that the process can be killed in between. - * This is used by ztest to simulate realistic failure modes. - */ - sectors = count >> ashift; - split = (sectors > 0 ? rand() % sectors : 0) << ashift; - rc = pwrite64(fp->f_fd, buf, split, pos); - if (rc != -1) { - done = rc; - rc = pwrite64(fp->f_fd, (char *)buf + split, - count - split, pos + split); - } -#ifdef __linux__ - if (rc == -1 && errno == EINVAL) { - /* - * Under Linux, this most likely means an alignment issue - * (memory or disk) due to O_DIRECT, so we abort() in order - * to catch the offender. - */ - abort(); - } -#endif - - if (rc < 0) - return (errno); - - done += rc; - - if (resid) { - *resid = count - done; - } else if (done != count) { - return (EIO); - } - - return (0); -} - -/* - * Stateful read - use os internal file pointer to determine where to - * read and update on successful completion. - * - * fp - pointer to file (pipe, socket, etc) to read from - * buf - buffer to write - * count - # of bytes to read - * resid - pointer to count of unread bytes (if short read) - * - * Returns 0 on success errno on failure. - */ -int -zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) -{ - int rc; - - rc = read(fp->f_fd, buf, count); - if (rc < 0) - return (errno); - - if (resid) { - *resid = count - rc; - } else if (rc != count) { - return (EIO); - } - - return (0); -} - -/* - * Stateless read - os internal file pointer is not updated. - * - * fp - pointer to file (pipe, socket, etc) to read from - * buf - buffer to write - * count - # of bytes to write - * off - file offset to read from (only valid for seekable types) - * resid - pointer to count of unwritten bytes (if short write) - * - * Returns 0 on success errno on failure. - */ -int -zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, - ssize_t *resid) -{ - ssize_t rc; - - rc = pread64(fp->f_fd, buf, count, off); - if (rc < 0) { -#ifdef __linux__ - /* - * Under Linux, this most likely means an alignment issue - * (memory or disk) due to O_DIRECT, so we abort() in order to - * catch the offender. - */ - if (errno == EINVAL) - abort(); -#endif - return (errno); - } - - if (fp->f_dump_fd != -1) { - int status; - - status = pwrite64(fp->f_dump_fd, buf, rc, off); - ASSERT(status != -1); - } - - if (resid) { - *resid = count - rc; - } else if (rc != count) { - return (EIO); - } - - return (0); -} - -/* - * lseek - set / get file pointer - * - * fp - pointer to file (pipe, socket, etc) to read from - * offp - value to seek to, returns current value plus passed offset - * whence - see man pages for standard lseek whence values - * - * Returns 0 on success errno on failure (ESPIPE for non seekable types) - */ -int -zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) -{ - loff_t rc; - - rc = lseek(fp->f_fd, *offp, whence); - if (rc < 0) - return (errno); - - *offp = rc; - - return (0); -} - -/* - * Get file attributes - * - * filp - file pointer - * zfattr - pointer to file attr structure - * - * Currently only used for fetching size and file mode - * - * Returns 0 on success or error code of underlying getattr call on failure. - */ -int -zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) -{ - struct stat64 st; - - if (fstat64_blk(fp->f_fd, &st) == -1) - return (errno); - - zfattr->zfa_size = st.st_size; - zfattr->zfa_mode = st.st_mode; - - return (0); -} - -/* - * Sync file to disk - * - * filp - file pointer - * flags - O_SYNC and or O_DSYNC - * - * Returns 0 on success or error code of underlying sync call on failure. - */ -int -zfs_file_fsync(zfs_file_t *fp, int flags) -{ - (void) flags; - - if (fsync(fp->f_fd) < 0) - return (errno); - - return (0); -} - -/* - * deallocate - zero and/or deallocate file storage - * - * fp - file pointer - * offset - offset to start zeroing or deallocating - * len - length to zero or deallocate - */ -int -zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) -{ - int rc; -#if defined(__linux__) - rc = fallocate(fp->f_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); -#elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029) - struct spacectl_range rqsr = { - .r_offset = offset, - .r_len = len, - }; - rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr); -#else - (void) fp, (void) offset, (void) len; - rc = EOPNOTSUPP; -#endif - if (rc) - return (SET_ERROR(rc)); - return (0); -} - -/* - * Request current file pointer offset - * - * fp - pointer to file - * - * Returns current file offset. - */ -loff_t -zfs_file_off(zfs_file_t *fp) -{ - return (lseek(fp->f_fd, SEEK_CUR, 0)); -} - -/* - * unlink file - * - * path - fully qualified file path - * - * Returns 0 on success. - * - * OPTIONAL - */ -int -zfs_file_unlink(const char *path) -{ - return (remove(path)); -} - -/* - * Get reference to file pointer - * - * fd - input file descriptor - * - * Returns pointer to file struct or NULL. - * Unsupported in user space. - */ -zfs_file_t * -zfs_file_get(int fd) -{ - (void) fd; - abort(); - return (NULL); -} -/* - * Drop reference to file pointer - * - * fp - pointer to file struct - * - * Unsupported in user space. - */ -void -zfs_file_put(zfs_file_t *fp) -{ - abort(); - (void) fp; -} - void zfsvfs_update_fromname(const char *oldname, const char *newname) { diff --git a/lib/libzpool/zfs_file_os.c b/lib/libzpool/zfs_file_os.c new file mode 100644 index 000000000..7a15e4963 --- /dev/null +++ b/lib/libzpool/zfs_file_os.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. + */ + +#include +#include +#include +#include + +/* If set, all blocks read will be copied to the specified directory. */ +char *vn_dumpdir = NULL; + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. + */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + int fd; + int dump_fd; + int err; + int old_umask = 0; + zfs_file_t *fp; + struct stat64 st; + + if (!(flags & O_CREAT) && stat64(path, &st) == -1) + return (errno); + + if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) + flags |= O_DIRECT; + + if (flags & O_CREAT) + old_umask = umask(0); + + fd = open64(path, flags, mode); + if (fd == -1) + return (errno); + + if (flags & O_CREAT) + (void) umask(old_umask); + + if (vn_dumpdir != NULL) { + char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); + const char *inpath = zfs_basename(path); + + (void) snprintf(dumppath, MAXPATHLEN, + "%s/%s", vn_dumpdir, inpath); + dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); + umem_free(dumppath, MAXPATHLEN); + if (dump_fd == -1) { + err = errno; + close(fd); + return (err); + } + } else { + dump_fd = -1; + } + + (void) fcntl(fd, F_SETFD, FD_CLOEXEC); + + fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); + fp->f_fd = fd; + fp->f_dump_fd = dump_fd; + *fpp = fp; + + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + close(fp->f_fd); + if (fp->f_dump_fd != -1) + close(fp->f_dump_fd); + + umem_free(fp, sizeof (zfs_file_t)); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + ssize_t rc; + + rc = write(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, + size_t count, loff_t pos, uint8_t ashift, ssize_t *resid) +{ + ssize_t rc, split, done; + int sectors; + + /* + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + * This is used by ztest to simulate realistic failure modes. + */ + sectors = count >> ashift; + split = (sectors > 0 ? rand() % sectors : 0) << ashift; + rc = pwrite64(fp->f_fd, buf, split, pos); + if (rc != -1) { + done = rc; + rc = pwrite64(fp->f_fd, (char *)buf + split, + count - split, pos + split); + } +#ifdef __linux__ + if (rc == -1 && errno == EINVAL) { + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order + * to catch the offender. + */ + abort(); + } +#endif + + if (rc < 0) + return (errno); + + done += rc; + + if (resid) { + *resid = count - done; + } else if (done != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + int rc; + + rc = read(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless read - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to write + * count - # of bytes to write + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = pread64(fp->f_fd, buf, count, off); + if (rc < 0) { +#ifdef __linux__ + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order to + * catch the offender. + */ + if (errno == EINVAL) + abort(); +#endif + return (errno); + } + + if (fp->f_dump_fd != -1) { + int status; + + status = pwrite64(fp->f_dump_fd, buf, rc, off); + ASSERT(status != -1); + } + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success errno on failure (ESPIPE for non seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + loff_t rc; + + rc = lseek(fp->f_fd, *offp, whence); + if (rc < 0) + return (errno); + + *offp = rc; + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct stat64 st; + + if (fstat64_blk(fp->f_fd, &st) == -1) + return (errno); + + zfattr->zfa_size = st.st_size; + zfattr->zfa_mode = st.st_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + (void) flags; + + if (fsync(fp->f_fd) < 0) + return (errno); + + return (0); +} + +/* + * deallocate - zero and/or deallocate file storage + * + * fp - file pointer + * offset - offset to start zeroing or deallocating + * len - length to zero or deallocate + */ +int +zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) +{ + int rc; +#if defined(__linux__) + rc = fallocate(fp->f_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); +#elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029) + struct spacectl_range rqsr = { + .r_offset = offset, + .r_len = len, + }; + rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr); +#else + (void) fp, (void) offset, (void) len; + rc = EOPNOTSUPP; +#endif + if (rc) + return (SET_ERROR(rc)); + return (0); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (lseek(fp->f_fd, SEEK_CUR, 0)); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. + * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (remove(path)); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * + * Returns pointer to file struct or NULL. + * Unsupported in user space. + */ +zfs_file_t * +zfs_file_get(int fd) +{ + (void) fd; + abort(); + return (NULL); +} +/* + * Drop reference to file pointer + * + * fp - pointer to file struct + * + * Unsupported in user space. + */ +void +zfs_file_put(zfs_file_t *fp) +{ + abort(); + (void) fp; +}