// SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. * Copyright (c) 2025, Klara, Inc. * Copyright (c) 2025, Rob Norris * Copyright (c) 2026, TrueNAS. */ #include #include #include #include #include #include #include #include #include #include /* * What to do when the last reference to an inode is released. If 0, the kernel * will cache it on the superblock. If 1, the inode will be freed immediately. * See zpl_drop_inode(). */ int zfs_delete_inode = 0; /* * What to do when the last reference to a dentry is released. If 0, the kernel * will cache it until the entry (file) is destroyed. If 1, the dentry will be * marked for cleanup, at which time its inode reference will be released. See * zpl_dentry_delete(). 
*/ int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) { struct inode *ip; VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); inode_set_iversion(ip, 1); return (ip); } #ifdef HAVE_SOPS_FREE_INODE static void zpl_inode_free(struct inode *ip) { ASSERT0(atomic_read(&ip->i_count)); zfs_inode_free(ip); } #endif static void zpl_inode_destroy(struct inode *ip) { ASSERT0(atomic_read(&ip->i_count)); zfs_inode_destroy(ip); } /* * Called from __mark_inode_dirty() to reflect that something in the * inode has changed. We use it to ensure the znode system attributes * are always strictly update to date with respect to the inode. */ static void zpl_dirty_inode(struct inode *ip, int flags) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, flags); spl_fstrans_unmark(cookie); } /* * ->drop_inode() is called when the last reference to an inode is released. * Its return value indicates if the inode should be destroyed immediately, or * cached on the superblock structure. * * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns * "destroy immediately" if the inode is unhashed and has no links (roughly: no * longer exists on disk). On datasets with millions of rarely-accessed files, * this can cause a large amount of memory to be "pinned" by cached inodes, * which in turn pin their associated dnodes and dbufs, until the kernel starts * reporting memory pressure and requests OpenZFS release some memory (see * zfs_prune()). * * When set to 1, we call generic_delete_inode(), which always returns "destroy * immediately", resulting in inodes being destroyed immediately, releasing * their associated dnodes and dbufs to the dbuf cached and the ARC to be * evicted as normal. * * Note that the "last reference" doesn't always mean the last _userspace_ * reference; the dentry cache also holds a reference, so "busy" inodes will * still be kept alive that way (subject to dcache tuning). 
*/ static int zpl_drop_inode(struct inode *ip) { if (zfs_delete_inode) return (generic_delete_inode(ip)); return (generic_drop_inode(ip)); } /* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the * truncate in evict(). For earlier kernels clear_inode() maps to * end_writeback() which is responsible for completing all outstanding * write back. In either case, once this is done it is safe to cleanup * any remaining inode specific data via zfs_inactive(). * remaining filesystem specific data. */ static void zpl_evict_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); truncate_setsize(ip, 0); clear_inode(ip); zfs_inactive(ip); spl_fstrans_unmark(cookie); } static void zpl_put_super(struct super_block *sb) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_umount(sb); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); } /* * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2) * syscalls, via sb->s_op->sync_fs(). * * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() -> * sync_filesystem() would ignore the return from sync_fs(), instead only * considing the error from syncing the underlying block device (sb->s_dev). * Since OpenZFS doesn't _have_ an underlying block device, there's no way for * us to report a sync directly. * * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra * error store `s_wb_err`, to carry errors seen on page writeback since the * last call to syncfs(). If sync_filesystem() does not return an error, any * existing writeback error on the superblock will be used instead (and cleared * either way). We don't use this (page writeback is a different thing for us), * so for 5.8-5.17 we can use that instead to get syncfs() to return the error. 
*
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return. Otherwise, we'll use s_wb_err if it was detected at
 * configure (5.8-5.16 + vendor backports), and if it's unavailable, we will
 * block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
*/
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	/* Hold the caller's credentials for the duration of the sync. */
	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	/* Record the error on the superblock's s_wb_err error store. */
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	/*
	 * No way to report the failure: wait for the pool to sync, and
	 * only then report success.
	 */
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

/*
 * ->statfs(): fill in statp via zfs_statvfs() for the dentry's inode, then
 * adjust the result for 32-bit callers. Returns 0 or a negative errno.
 */
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		/* Keep the used-object count; shrink only the free count. */
		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

/*
 * Write the dataset name of the given zfsvfs to seq, escaping spaces.
 * Returns 0, or the error from zpl_enter() if the filesystem could not be
 * entered.
 */
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

/* ->show_devname(): emit the dataset name for the superblock behind root. */
static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

/*
 * Write the ZFS-specific mount options of the given zfsvfs to seq, each with
 * a leading comma: the xattr setting, the ACL type (only when the kernel is
 * built with CONFIG_FS_POSIX_ACL) and the case sensitivity. Always returns 0.
 */
static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

/* ->show_options(): emit the mount options for the superblock behind root. */
static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

/*
 * Superblock comparison callback passed to sget(): data is the objset being
 * mounted. Returns non-zero if the superblock already belongs to it.
 */
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

/*
 * ->kill_sb(): run the ZFS pre-unmount step, then let the kernel tear down
 * the anonymous superblock.
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

/*
 * Ask zfs_prune() to scan up to nr_to_scan objects on the superblock passed
 * as arg. Nothing is done if s_umount cannot be taken for read without
 * blocking, or if the superblock is not fully set up or is going away.
 */
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

/*
 * Mount option parsing.
 *
 * The kernel receives a set of "stringy" mount options, typically a
 * comma-separated list through mount(2) or fsconfig(2). These are split into a
 * set of struct fs_parameter, and then vfs_parse_fs_param() is called for
 * each. That function will handle (and consume) some options directly, and
 * other subsystems (mainly security modules) are given the opportunity to
 * consume them too. Any left over are passed to zpl_parse_param(). Our job is
 * to use them to fill in the vfs_t we've attached previously to
 * fc->fs_private, ready for the mount or remount call when it comes.
 *
 * Historically, mount options have been generated, removed, modified and
 * otherwise complicated by multiple different actors over a long time: the
 * kernel itself, the original mount(8) utility and later libmount,
 * mount.zfs(8), libzfs and the ZFS tools that use it, and any program using
 * the various mount APIs that have come and gone over the years. This is
 * further complicated by cross-pollination between OpenSolaris/illumos, Linux
 * and FreeBSD. Long story short: we could see all sorts of things, and we need
 * to at least try not to break old userspace programs.
 *
 * At time of writing, this is my best understanding of all the options we
 * might reasonably see, and where and how they're handled.
 *
 *
 * These are common options for all filesystems that are processed by the
 * kernel directly, without zpl_parse_param() being called.
They're a bit of a * mixed bag, but are ultimately all available to us via either sb->s_flags or * fc->sb_flags: * * dirsync: set SB_DIRSYNC * lazytime: set SB_LAZYTIME * mand: set SB_MANDLOCK * ro: set SB_RDONLY * sync: set SB_SYNCHRONOUS * * async: clear SB_SYNCHRONOUS * nolazytime: clear SB_LAZYTIME * nomand: clear SB_MANDLOCK * rw: clear SB_RDONLY * * Fortunately, almost all of these are handled directly by the kernel. 'mand' * and 'nomand' are swallowed by the kernel ('mand' emits a warning in the * kernel log), but it and the corresponding dataset property have been a no-op * in OpenZFS for years, so there's nothing for us to do there. * * The only tricky one is SB_RDONLY ('ro'/'rw'), which can be both a mount and * a superblock option. While we won't receive the "stringy" options, the * kernel will set it for us in fc->sb_flags, and we've always had special * handling for it at mount and remount time (eg handling snapshot mounts), so * it's not a problem to do nothing here because we will sort it out later. * * * These are options that we may receive as "stringy" options but also as mount * flags. * * exec: clear MS_NOEXEC * noexec: set MS_NOEXEC * suid: clear MS_NOSUID * nosuid: set MS_NOSUID * dev: clear MS_NODEV * nodev: set MS_NODEV * atime: clear MS_NOATIME * noatime: set MS_NOATIME * relatime: set MS_RELATIME * norelatime: clear MS_RELATIME * * In testing, it appears that recent libmount will convert them, but our own * mount code (libzfs_mount) may not. We will be called for the stringy * versions, but not for the flags. The flags will later be available on * vfsmount->mnt_flags, not set on the vfs_t. This tends not to matter in * practice, as almost all mounts come through libzfs (via zfs-mount(8) or * mount.zfs(8)) and so as strings, and when they do come through flags, they * will still be reported correctly via mountinfo and by zfs-get(8), which has * special handling for "temporary" properties. 
Also, we never use these
 * internally for any decisions; 'exec', 'suid' and 'dev' are handled in the
 * kernel, and the kernel provides helpers for 'atime' and 'relatime'. The
 * only place the difference is observable is through zfs_get_temporary_prop(),
 * which is only used by the zfs.get_prop() Lua call.
 *
 * This is fixable by getting at vfsmount->mnt_flags, but this is not readily
 * available until after the mount operation is completed, and with some
 * effort. This is all very low impact, so it's left for future improvement.
 *
 *
 * These are true OpenZFS-specific mount options. They give the equivalent
 * of temporarily setting the pool properties as follows:
 *
 *   strictatime    atime=on, relatime=off
 *
 *   xattr:         xattr=sa
 *   saxattr:       xattr=sa
 *   dirxattr:      xattr=dir
 *   noxattr:       xattr=off
 *
 *
 * mntpoint=<path> provides the canonical mount point for a snapshot mount.
 * This is an assist for the snapshot automounter call out to userspace, to
 * understand where the snapshot is mounted even when triggered from an
 * alternate mount namespace (eg inside a chroot).
 *
 *   mntpoint=<path>    vfs->vfs_mntpoint=...
 *
 *
 * These are used for coordination inside libzfs, and should not make it
 * to the kernel, but it does not strip them, so we handle them and ignore
 * them.
 *
 *   defaults
 *   zfsutil
 *   remount
 *
 *
 * These are specific to SELinux. When that security module is running, it
 * will consume them, but if not, they will be passed through to us. libzfs
 * adds them unconditionally, so we will always see them when SELinux is not
 * running, and ignore them.
 *
 *   fscontext
 *   defcontext
 *   rootcontext
 *   context
 *
 *
 * When preparing a remount, libmount will read /proc/self/mountinfo and add
 * any unrecognised flags it finds there to the options. So, we have to accept
 * anything that __zpl_show_options() can produce.
 *
 *   posixacl
 *   noacl
 *   casesensitive
 *   caseinsensitive
 *   casemixed
 *
 *
 * mount(8) has a notion of "sloppy" options.
According to the documentation,
 * when the -s switch is provided, unrecognised mount options will be ignored.
 * Only the Linux NFS and SMB filesystems support it, and traditionally
 * OpenZFS has too. However, it appears massively underspecified and
 * inconsistent. Depending on the interplay between mount(8), the mount helper
 * (eg mount.zfs(8)) and libmount, -s may cause unknown options to be filtered
 * in userspace, _or_ an additional option 'sloppy' to be passed to the kernel
 * either before or after the "unknown" option, _or_ nothing at all happens
 * and the unknown option is passed through to the kernel as-is. The
 * kernel NFS and SMB filesystems both expect to see an explicit option
 * 'sloppy' and use this to either ignore or reject unknown options, but as
 * described, it's very easy for that option to not appear, or appear too late.
 *
 * OpenZFS has a test for this in the test suite, and it's documented in
 * mount.zfs(8), so to support it we accept 'sloppy' and ignore it, and all
 * other unknown options produce a notice in the kernel log, and are also
 * ignored. This allows the "feature" to continue to work, while avoiding
 * the additional housekeeping for the 'sloppy' option.
 *
 *   sloppy
 *
 *
 * Finally, all filesystems get automatic handling for the 'source' option,
 * that is, the "name" of the filesystem (the first column of df(1)'s output).
 * However, this only happens if the handler does not otherwise handle
 * the 'source' option. Since we handle _all_ options because of 'sloppy', we
 * deal with this explicitly by calling into the kernel's helper for this,
 * vfs_parse_fs_param_source(), which sets up fc->source.
 *
 *   source
 *
 *
 * Thank you for reading this far. I hope you find what you are looking for,
 * in this life or the next.
* * -- robn, 2026-03-26 */ enum { Opt_exec, Opt_suid, Opt_dev, Opt_atime, Opt_relatime, Opt_strictatime, Opt_saxattr, Opt_dirxattr, Opt_noxattr, Opt_mntpoint, Opt_ignore, Opt_warn, }; static const struct fs_parameter_spec zpl_param_spec[] = { fsparam_flag_no("exec", Opt_exec), fsparam_flag_no("suid", Opt_suid), fsparam_flag_no("dev", Opt_dev), fsparam_flag_no("atime", Opt_atime), fsparam_flag_no("relatime", Opt_relatime), fsparam_flag("strictatime", Opt_strictatime), fsparam_flag("xattr", Opt_saxattr), fsparam_flag("saxattr", Opt_saxattr), fsparam_flag("dirxattr", Opt_dirxattr), fsparam_flag("noxattr", Opt_noxattr), fsparam_string("mntpoint", Opt_mntpoint), fsparam_flag("defaults", Opt_ignore), fsparam_flag("zfsutil", Opt_ignore), fsparam_flag("remount", Opt_ignore), fsparam_string("fscontext", Opt_ignore), fsparam_string("defcontext", Opt_ignore), fsparam_string("rootcontext", Opt_ignore), fsparam_string("context", Opt_ignore), fsparam_flag("posixacl", Opt_ignore), fsparam_flag("noacl", Opt_ignore), fsparam_flag("casesensitive", Opt_ignore), fsparam_flag("caseinsensitive", Opt_ignore), fsparam_flag("casemixed", Opt_ignore), fsparam_flag("sloppy", Opt_ignore), {} }; static int zpl_parse_param(struct fs_context *fc, struct fs_parameter *param) { vfs_t *vfs = fc->fs_private; /* Handle 'source' explicitly so we don't trip on it as an unknown. */ int opt = vfs_parse_fs_param_source(fc, param); if (opt != -ENOPARAM) return (opt); struct fs_parse_result result; opt = fs_parse(fc, zpl_param_spec, param, &result); if (opt == -ENOPARAM) { /* * Convert unknowns to warnings, to work around the whole * "sloppy option" mess. 
*/ opt = Opt_warn; } if (opt < 0) return (opt); switch (opt) { case Opt_exec: vfs->vfs_exec = !result.negated; vfs->vfs_do_exec = B_TRUE; break; case Opt_suid: vfs->vfs_setuid = !result.negated; vfs->vfs_do_setuid = B_TRUE; break; case Opt_dev: vfs->vfs_devices = !result.negated; vfs->vfs_do_devices = B_TRUE; break; case Opt_atime: vfs->vfs_atime = !result.negated; vfs->vfs_do_atime = B_TRUE; break; case Opt_relatime: vfs->vfs_relatime = !result.negated; vfs->vfs_do_relatime = B_TRUE; break; case Opt_strictatime: vfs->vfs_atime = B_TRUE; vfs->vfs_do_atime = B_TRUE; vfs->vfs_relatime = B_FALSE; vfs->vfs_do_relatime = B_TRUE; break; case Opt_saxattr: vfs->vfs_xattr = ZFS_XATTR_SA; vfs->vfs_do_xattr = B_TRUE; break; case Opt_dirxattr: vfs->vfs_xattr = ZFS_XATTR_DIR; vfs->vfs_do_xattr = B_TRUE; break; case Opt_noxattr: vfs->vfs_xattr = ZFS_XATTR_OFF; vfs->vfs_do_xattr = B_TRUE; break; case Opt_mntpoint: if (vfs->vfs_mntpoint != NULL) kmem_strfree(vfs->vfs_mntpoint); vfs->vfs_mntpoint = kmem_strdup(param->string); break; case Opt_ignore: break; case Opt_warn: cmn_err(CE_NOTE, "ZFS: ignoring unknown mount option: %s", param->key); break; default: return (-SET_ERROR(EINVAL)); } return (0); } /* * Before Linux 5.8, the kernel's individual parameter parsing had a list of * "forbidden" options that would always be rejected early. These were options * that should be specified by MS_* flags, to be set on the superblock * directly. However, it was inconsistently applied (eg it had various "*atime" * options but not "atime", and also caused problems when it was not in sync * with the version of libmount in use. It was deemed needlessly restrictive * and was dropped in torvalds/linux@9193ae87a8af. * * Unfortunately, some of the options on this list are used by OpenZFS, so * we need to see them. These include the aforementioned "*atime", "dev", * "exec" and "suid". 
*
 * There is no easy compile-time check available to detect this, so we use
 * a simple version check that should make it available everywhere needed,
 * most notably RHEL8's 4.18+extras, which has backported fs_context support
 * but does not include the 5.8 commit.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0)
#define HAVE_FORBIDDEN_SB_FLAGS 1
#endif

#ifdef HAVE_FORBIDDEN_SB_FLAGS

/*
 * The typical path for options parsing through mount(2) is:
 *
 *   ksys_mount
 *     do_mount
 *       generic_parse_monolithic
 *         vfs_parse_fs_string
 *           vfs_parse_fs_param
 *             zpl_parse_param
 *
 * vfs_parse_fs_param() calls the internal vfs_parse_sb_flag(), which is
 * where the "forbidden" flags are applied. If it makes it through there,
 * it will later call fc->parse_param() ie zpl_parse_param(). We can't
 * intercept this chain in the middle anywhere; the earliest thing we can
 * override is generic_parse_monolithic(), substituting our own by setting
 * fc->parse_monolithic and doing the parsing work ourselves.
 *
 * Fortunately, generic_parse_monolithic() is almost entirely splitting the
 * incoming parameter string on comma and handing off to the rest of the
 * pipeline. This is easily replaced (almost entirely by reviving a few bits
 * of our old options parser).
 *
 * To keep the change as narrow as possible, we reuse zpl_param_spec and
 * zpl_parse_param() as much as possible. Once we've parsed the option, we call
 * fs_parse(zpl_param_spec) to find out if the option is actually one we
 * explicitly care about. If it is, we call zpl_parse_param() directly,
 * avoiding vfs_parse_fs_param() and so the risk of being rejected. If it is
 * not one we explicitly care about, we call vfs_parse_fs_param() as normal,
 * letting the kernel reject it if it wishes. If it doesn't, it will end up
 * back in zpl_parse_param() via fc->parse_param, and we can ignore or warn
 * about it as we normally would.
*/ static int zpl_parse_monolithic(struct fs_context *fc, void *data) { char *mntopts = data; if (mntopts == NULL) return (0); /* * Because we supply a .parse_monolithic callback, the kernel does * no consideration of the options blob at all. Because of this, we * have to give LSMs a first look at it. They will remove any options * of interest to them (eg the SELinux *context= options). */ int err = security_sb_eat_lsm_opts(mntopts, &fc->security); if (err) return (err); char *key; while ((key = strsep(&mntopts, ",")) != NULL) { if (!*key) continue; struct fs_parameter param = { .key = key, }; char *value = strchr(key, '='); if (value != NULL) { /* Key starts with '='. Kernel ignores, we will too. */ if (value == key) continue; *value++ = '\0'; /* key=value is a "string" type, set up for that */ param.string = value; param.type = fs_value_is_string; param.size = strlen(value); } else { /* unadorned key is a "flag" type */ param.type = fs_value_is_flag; } /* Check if this is one of our options. */ struct fs_parse_result result; int opt = fs_parse(fc, zpl_param_spec, ¶m, &result); if (opt >= 0) { /* * We already know this one of our options, so a * failure here would be nonsensical. */ VERIFY0(zpl_parse_param(fc, ¶m)); } else { /* * Not one of our option, send it through the kernel's * standard parameter handling. */ err = vfs_parse_fs_param(fc, ¶m); if (err < 0) return (err); } } return (0); } #endif /* HAVE_FORBIDDEN_SB_FLAGS */ static int zpl_get_tree(struct fs_context *fc) { struct super_block *sb; objset_t *os; boolean_t issnap = B_FALSE; int err; err = dmu_objset_hold(fc->source, FTAG, &os); if (err) return (-err); /* * The dsl pool lock must be released prior to calling sget(). * It is possible sget() may block on the lock in grab_super() * while deactivate_super() holds that same lock and waits for * a txg sync. If the dsl_pool lock is held over sget() * this can prevent the pool sync and cause a deadlock. 
*/ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); sb = sget(fc->fs_type, zpl_test_super, set_anon_super, fc->sb_flags, os); /* * Recheck with the lock held to prevent mounting the wrong dataset * since z_os can be stale when the teardown lock is held. * * We can't do this in zpl_test_super in since it's under spinlock and * also s_umount lock is not held there so it would race with * zfs_umount and zfsvfs can be freed. */ if (!IS_ERR(sb) && sb->s_fs_info != NULL) { zfsvfs_t *zfsvfs = sb->s_fs_info; if (zpl_enter(zfsvfs, FTAG) == 0) { if (os != zfsvfs->z_os) err = SET_ERROR(EBUSY); issnap = zfsvfs->z_issnap; zpl_exit(zfsvfs, FTAG); } else { err = SET_ERROR(EBUSY); } } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(sb)) return (PTR_ERR(sb)); if (err) { deactivate_locked_super(sb); return (-err); } if (sb->s_root == NULL) { vfs_t *vfs = fc->fs_private; /* Apply readonly flag as mount option */ if (fc->sb_flags & SB_RDONLY) { vfs->vfs_readonly = B_TRUE; vfs->vfs_do_readonly = B_TRUE; } fstrans_cookie_t cookie = spl_fstrans_mark(); err = zfs_domount(sb, fc->source, vfs, fc->sb_flags & SB_SILENT ? 1 : 0); spl_fstrans_unmark(cookie); if (err) { deactivate_locked_super(sb); return (-err); } /* * zfsvfs has taken ownership of the mount options, so we * need to ensure we don't free them. */ fc->fs_private = NULL; sb->s_flags |= SB_ACTIVE; } else if (!issnap && ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)) { /* * Skip ro check for snap since snap is always ro regardless * ro flag is passed by mount or not. 
*/ deactivate_locked_super(sb); return (-SET_ERROR(EBUSY)); } struct dentry *root = dget(sb->s_root); if (IS_ERR(root)) return (PTR_ERR(root)); fc->root = root; return (0); } static int zpl_reconfigure(struct fs_context *fc) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_remount(fc->root->d_sb, fc->fs_private, fc->sb_flags); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); if (error == 0) { /* * zfsvfs has taken ownership of the mount options, so we * need to ensure we don't free them. */ fc->fs_private = NULL; } return (error); } static int zpl_dup_fc(struct fs_context *fc, struct fs_context *src_fc) { vfs_t *src_vfs = src_fc->fs_private; if (src_vfs == NULL) return (0); vfs_t *vfs = zfsvfs_vfs_alloc(); if (vfs == NULL) return (-SET_ERROR(ENOMEM)); /* * This is annoying, but a straight memcpy() would require us to * reinitialise the lock. */ vfs->vfs_xattr = src_vfs->vfs_xattr; vfs->vfs_readonly = src_vfs->vfs_readonly; vfs->vfs_do_readonly = src_vfs->vfs_do_readonly; vfs->vfs_setuid = src_vfs->vfs_setuid; vfs->vfs_do_setuid = src_vfs->vfs_do_setuid; vfs->vfs_exec = src_vfs->vfs_exec; vfs->vfs_do_exec = src_vfs->vfs_do_exec; vfs->vfs_devices = src_vfs->vfs_devices; vfs->vfs_do_devices = src_vfs->vfs_do_devices; vfs->vfs_do_xattr = src_vfs->vfs_do_xattr; vfs->vfs_atime = src_vfs->vfs_atime; vfs->vfs_do_atime = src_vfs->vfs_do_atime; vfs->vfs_relatime = src_vfs->vfs_relatime; vfs->vfs_do_relatime = src_vfs->vfs_do_relatime; vfs->vfs_nbmand = src_vfs->vfs_nbmand; vfs->vfs_do_nbmand = src_vfs->vfs_do_nbmand; mutex_enter(&src_vfs->vfs_mntpt_lock); if (src_vfs->vfs_mntpoint != NULL) vfs->vfs_mntpoint = kmem_strdup(src_vfs->vfs_mntpoint); mutex_exit(&src_vfs->vfs_mntpt_lock); fc->fs_private = vfs; return (0); } static void zpl_free_fc(struct fs_context *fc) { zfsvfs_vfs_free(fc->fs_private); } const struct fs_context_operations zpl_fs_context_operations = { #ifdef HAVE_FORBIDDEN_SB_FLAGS .parse_monolithic = zpl_parse_monolithic, #endif 
.parse_param = zpl_parse_param, .get_tree = zpl_get_tree, .reconfigure = zpl_reconfigure, .dup = zpl_dup_fc, .free = zpl_free_fc, }; static int zpl_init_fs_context(struct fs_context *fc) { fc->fs_private = zfsvfs_vfs_alloc(); if (fc->fs_private == NULL) return (-SET_ERROR(ENOMEM)); fc->ops = &zpl_fs_context_operations; return (0); } const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, #ifdef HAVE_SOPS_FREE_INODE .free_inode = zpl_inode_free, #endif .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .show_devname = zpl_show_devname, .show_options = zpl_show_options, .show_stats = NULL, }; /* * ->d_delete() is called when the last reference to a dentry is released. Its * return value indicates if the dentry should be destroyed immediately, or * retained in the dentry cache. * * By default (zfs_delete_dentry=0) the kernel will always cache unused * entries. Each dentry holds an inode reference, so cached dentries can hold * the final inode reference indefinitely, leading to the inode and its related * data being pinned (see zpl_drop_inode()). * * When set to 1, we signal that the dentry should be destroyed immediately and * never cached. This reduces memory usage, at the cost of higher overheads to * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be * reloaded and reinflated. * * Note that userspace does not have direct control over dentry references and * reclaim; rather, this is part of the kernel's caching and reclaim subsystems * (eg vm.vfs_cache_pressure). */ static int zpl_dentry_delete(const struct dentry *dentry) { return (zfs_delete_dentry ? 
1 : 0); } const struct dentry_operations zpl_dentry_operations = { .d_delete = zpl_dentry_delete, }; struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, #if defined(HAVE_IDMAP_MNT_API) .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, #else .fs_flags = FS_USERNS_MOUNT, #endif .init_fs_context = zpl_init_fs_context, .kill_sb = zpl_kill_sb, }; ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, "Delete inodes as soon as the last reference is released."); ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, "Delete dentries from dentry cache as soon as the last reference is " "released.");