Linux compat 2.6.39: mount_nodev()

The .get_sb callback has been replaced by a .mount callback in the file_system_type structure. When using the new interface the caller must now use the mount_nodev() helper. Unfortunately, the new interface no longer passes the vfsmount down to the zfs layers. This poses a problem for the existing implementation because we currently save this pointer in the super block for later use. It provides our only entry point into the namespace layer for manipulating certain mount options. This needed to be done originally to allow commands like 'zfs set atime=off tank' to work properly. It also allowed me to keep more of the original Solaris code unmodified. Under Solaris there is a 1-to-1 mapping between a mount point and a file system so this is a fairly natural thing to do. However, under Linux there may be multiple entries in the namespace which reference the same filesystem. Thus keeping a back reference from the filesystem to the namespace is complicated. Rather than introduce some ugly hack to get the vfsmount and continue as before, I'm leveraging this API change to update the ZFS code to do things in a more natural way for Linux. This has the upside that it resolves the compatibility issue for the long term and fixes several other minor bugs which have been reported. This commit updates the code to remove this vfsmount back reference entirely. All modifications to filesystem mount options are now passed in to the kernel via a '-o remount'. This is the expected Linux mechanism and allows the namespace to properly handle any options which apply to it before passing them on to the file system itself. Aside from fixing the compatibility issue, removing the vfsmount has had the benefit of simplifying the code. This change, while fairly involved, has turned out nicely. Closes #246 Closes #217 Closes #187 Closes #248 Closes #231
2026-05-22 02:27:36 +03:00 · 2011-05-19 11:44:07 -07:00
parent 5c03efc379
commit 2cf7f52bc4
73 changed files with 347 additions and 220 deletions
@@ -2136,8 +2136,7 @@ top:
 static int
 zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
 {
-	if ((v4_mode & WRITE_MASK) &&
-	    (ZTOZSB(zp)->z_vfs->mnt_flags & MNT_READONLY) &&
+	if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
 	    (!S_ISDEV(ZTOI(zp)->i_mode) ||
 	    (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
 		return (EROFS);
@@ -1045,7 +1045,7 @@ top:
 		return (ENOENT);
 	}

-	if (zsb->z_vfs->mnt_flags & MNT_READONLY) {
+	if (zfs_is_readonly(zsb)) {
 		zfs_dirent_unlock(dl);
 		return (EROFS);
 	}
@@ -1107,8 +1107,9 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp)

 	mutex_enter(&os->os_user_ptr_lock);
 	*zsbp = dmu_objset_get_user(os);
-	if (*zsbp) {
-		mntget((*zsbp)->z_vfs);
+	if (*zsbp && (*zsbp)->z_sb) {
+		if (!atomic_inc_not_zero(&((*zsbp)->z_sb->s_active)))
+			error = ESRCH;	/* s_active was 0: no ref taken */
 	} else {
 		error = ESRCH;
 	}
@@ -1119,7 +1120,7 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp)

 /*
 * Find a zfs_sb_t for a mounted filesystem, or create our own, in which
- * case its z_vfs will be NULL, and it will be opened as the owner.
+ * case its z_sb will be NULL, and it will be opened as the owner.
 */
 static int
 zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer)
@@ -1149,8 +1150,8 @@ zfs_sb_rele(zfs_sb_t *zsb, void *tag)
 {
 	rrw_exit(&zsb->z_teardown_lock, tag);

-	if (zsb->z_vfs) {
-		mntput(zsb->z_vfs);
+	if (zsb->z_sb) {
+		deactivate_super(zsb->z_sb);
 	} else {
 		dmu_objset_disown(zsb->z_os, zsb);
 		zfs_sb_free(zsb);
@@ -3239,7 +3240,7 @@ zfs_ioc_rollback(zfs_cmd_t *zc)
 			resume_err = zfs_resume_fs(zsb, zc->zc_name);
 			error = error ? error : resume_err;
 		}
-		mntput(zsb->z_vfs);
+		deactivate_super(zsb->z_sb);
 	} else {
 		if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
 			error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
@@ -3724,7 +3725,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 			if (error == 0)
 				error = zfs_resume_fs(zsb, tofs);
 			error = error ? error : end_err;
-			mntput(zsb->z_vfs);
+			deactivate_super(zsb->z_sb);
 		} else {
 			error = dmu_recv_end(&drc);
 		}
@@ -4137,7 +4138,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 		}
 		if (error == 0)
 			error = dmu_objset_userspace_upgrade(zsb->z_os);
-		mntput(zsb->z_vfs);
+		deactivate_super(zsb->z_sb);
 	} else {
 		/* XXX kind of reading contents without owning */
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
@@ -122,22 +122,17 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
 }
 EXPORT_SYMBOL(zfs_sync);

+boolean_t
+zfs_is_readonly(zfs_sb_t *zsb)
+{
+	return (!!(zsb->z_sb->s_flags & MS_RDONLY));
+}
+EXPORT_SYMBOL(zfs_is_readonly);
+
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
-	zfs_sb_t *zsb = arg;
-	struct super_block *sb = zsb->z_sb;
-	struct vfsmount *vfs = zsb->z_vfs;
-
-	if (newval == TRUE) {
-		vfs->mnt_flags &= ~MNT_NOATIME;
-		sb->s_flags &= ~MS_NOATIME;
-		zsb->z_atime = TRUE;
-	} else {
-		vfs->mnt_flags |= MNT_NOATIME;
-		sb->s_flags |= MS_NOATIME;
-		zsb->z_atime = FALSE;
-	}
+	((zfs_sb_t *)arg)->z_atime = newval;
 }

 static void
@@ -145,11 +140,10 @@ xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfs_sb_t *zsb = arg;

-	if (newval == TRUE) {
-		zsb->z_flags |= ZSB_XATTR_USER;
-	} else {
-		zsb->z_flags &= ~ZSB_XATTR_USER;
-	}
+	if (newval == TRUE)
+		zsb->z_flags |= ZSB_XATTR;
+	else
+		zsb->z_flags &= ~ZSB_XATTR;
 }

 static void
@@ -169,84 +163,44 @@ readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfs_sb_t *zsb = arg;
 	struct super_block *sb = zsb->z_sb;
-	struct vfsmount *vfs = zsb->z_vfs;

-	if (newval) {
-		vfs->mnt_flags |= MNT_READONLY;
+	if (sb == NULL)
+		return;
+
+	if (newval)
 		sb->s_flags |= MS_RDONLY;
-	} else {
-		vfs->mnt_flags &= ~MNT_READONLY;
+	else
 		sb->s_flags &= ~MS_RDONLY;
-	}
 }

 static void
 devices_changed_cb(void *arg, uint64_t newval)
 {
-	zfs_sb_t *zsb = arg;
-	struct super_block *sb = zsb->z_sb;
-	struct vfsmount *vfs = zsb->z_vfs;
-
-	if (newval == FALSE) {
-		vfs->mnt_flags |= MNT_NODEV;
-		sb->s_flags |= MS_NODEV;
-	} else {
-		vfs->mnt_flags &= ~MNT_NODEV;
-		sb->s_flags &= ~MS_NODEV;
-	}
 }

 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
-	zfs_sb_t *zsb = arg;
-	struct super_block *sb = zsb->z_sb;
-	struct vfsmount *vfs = zsb->z_vfs;
-
-	if (newval == FALSE) {
-		vfs->mnt_flags |= MNT_NOSUID;
-		sb->s_flags |= MS_NOSUID;
-	} else {
-		vfs->mnt_flags &= ~MNT_NOSUID;
-		sb->s_flags &= ~MS_NOSUID;
-	}
 }

 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
-	zfs_sb_t *zsb = arg;
-	struct super_block *sb = zsb->z_sb;
-	struct vfsmount *vfs = zsb->z_vfs;
-
-	if (newval == FALSE) {
-		vfs->mnt_flags |= MNT_NOEXEC;
-		sb->s_flags |= MS_NOEXEC;
-	} else {
-		vfs->mnt_flags &= ~MNT_NOEXEC;
-		sb->s_flags &= ~MS_NOEXEC;
-	}
 }

-/*
- * The nbmand mount option can be changed at mount time.
- * We can't allow it to be toggled on live file systems or incorrect
- * behavior may be seen from cifs clients
- *
- * This property isn't registered via dsl_prop_register(), but this callback
- * will be called when a file system is first mounted
- */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfs_sb_t *zsb = arg;
 	struct super_block *sb = zsb->z_sb;

-	if (newval == TRUE) {
+	if (sb == NULL)
+		return;
+
+	if (newval == TRUE)
 		sb->s_flags |= MS_MANDLOCK;
-	} else {
+	else
 		sb->s_flags &= ~MS_MANDLOCK;
-	}
 }

 static void
@@ -270,58 +224,12 @@ acl_inherit_changed_cb(void *arg, uint64_t newval)
 int
 zfs_register_callbacks(zfs_sb_t *zsb)
 {
-	struct vfsmount *vfsp = zsb->z_vfs;
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = zsb->z_os;
-	uint64_t nbmand;
-	boolean_t readonly = B_FALSE;
-	boolean_t setuid = B_TRUE;
-	boolean_t exec = B_TRUE;
-	boolean_t devices = B_TRUE;
-	boolean_t xattr = B_TRUE;
-	boolean_t atime = B_TRUE;
-	char osname[MAXNAMELEN];
 	int error = 0;

-	/*
-	 * While Linux allows multiple vfs mounts per super block we have
-	 * limited it artificially to one in zfs_fill_super.  Thus it is
-	 * safe for us to modify the vfs mount fails through the callbacks.
-	 */
-	if ((vfsp->mnt_flags & MNT_READONLY) ||
-	    !spa_writeable(dmu_objset_spa(os)))
-		readonly = B_TRUE;
-
-	if (vfsp->mnt_flags & MNT_NOSUID) {
-		devices = B_FALSE;
-		setuid = B_FALSE;
-	} else {
-		if (vfsp->mnt_flags & MNT_NODEV)
-			devices = B_FALSE;
-	}
-
-	if (vfsp->mnt_flags & MNT_NOEXEC)
-		exec = B_FALSE;
-
-	if (vfsp->mnt_flags & MNT_NOATIME)
-		atime = B_FALSE;
-
-	/*
-	 * nbmand is a special property which may only be changed at
-	 * mount time.  Unfortunately, Linux does not have a VFS mount
-	 * flag instead this is a super block flag.  So setting this
-	 * option at mount time will have to wait until we can parse
-	 * the mount option string.  For now we rely on the nbmand
-	 * value stored with the object set.  Additional mount option
-	 * string to be handled:
-	 *
-	 *   case: sensitive|insensitive|mixed
-	 *   zerocopy: on|off
-	 */
-
-	dmu_objset_name(os, osname);
-	if ((error = dsl_prop_get_integer(osname, "nbmand", &nbmand, NULL)))
-		return (error);
+	if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os)))
+		readonly_changed_cb(zsb, B_TRUE);

 	/*
 	 * Register property callbacks.
@@ -351,20 +259,11 @@ zfs_register_callbacks(zfs_sb_t *zsb)
 	    "aclinherit", acl_inherit_changed_cb, zsb);
 	error = error ? error : dsl_prop_register(ds,
 	    "vscan", vscan_changed_cb, zsb);
+	error = error ? error : dsl_prop_register(ds,
+	    "nbmand", nbmand_changed_cb, zsb);
 	if (error)
 		goto unregister;

-	/*
-	 * Invoke our callbacks to set required flags.
-	 */
-	readonly_changed_cb(zsb, readonly);
-	setuid_changed_cb(zsb, setuid);
-	exec_changed_cb(zsb, exec);
-	devices_changed_cb(zsb, devices);
-	xattr_changed_cb(zsb, xattr);
-	atime_changed_cb(zsb, atime);
-	nbmand_changed_cb(zsb, nbmand);
-
 	return (0);

 unregister:
@@ -384,6 +283,7 @@ unregister:
 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
 	    zsb);
 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb);
+	(void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb);

 	return (error);
 }
@@ -694,7 +594,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
-	zsb->z_vfs = NULL;
+	zsb->z_sb = NULL;
 	zsb->z_parent = zsb;
 	zsb->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
@@ -840,9 +740,9 @@ zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting)
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
-		readonly = zsb->z_vfs->mnt_flags & MNT_READONLY;
+		readonly = zfs_is_readonly(zsb);
 		if (readonly != 0)
-			zsb->z_vfs->mnt_flags &= ~MNT_READONLY;
+			readonly_changed_cb(zsb, B_FALSE);
 		else
 			zfs_unlinked_drain(zsb);

@@ -883,7 +783,10 @@ zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting)
 				zsb->z_replay = B_FALSE;
 			}
 		}
-		zsb->z_vfs->mnt_flags |= readonly; /* restore readonly bit */
+
+		/* restore readonly bit */
+		if (readonly != 0)
+			readonly_changed_cb(zsb, B_TRUE);
 	}

 	return (0);
@@ -954,6 +857,9 @@ zfs_unregister_callbacks(zfs_sb_t *zsb)

 		VERIFY(dsl_prop_unregister(ds, "vscan",
 		    vscan_changed_cb, zsb) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "nbmand",
+		    nbmand_changed_cb, zsb) == 0);
 	}
 }
 EXPORT_SYMBOL(zfs_unregister_callbacks);
@@ -1164,7 +1070,7 @@ zfsvfs_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 	 * Evict cached data
 	 */
 	if (dmu_objset_is_dirty_anywhere(zsb->z_os))
-		if (!(zsb->z_vfs->mnt_flags & MNT_READONLY))
+		if (!zfs_is_readonly(zsb))
 			txg_wait_synced(dmu_objset_pool(zsb->z_os), 0);
 	(void) dmu_objset_evict_dbufs(zsb->z_os);

@@ -1181,17 +1087,6 @@ zfs_domount(struct super_block *sb, void *data, int silent)
 	uint64_t recordsize;
 	int error;

-	/*
-	 * Linux allows multiple vfs mounts per super block.  However, the
-	 * zfs_sb_t only contains a pointer for a single vfs mount.  This
-	 * back reference in the long term could be extended to a list of
-	 * vfs mounts if a hook were added to the kernel to notify us when
-	 * a vfsmount is destroyed.  Until then we must limit the number
-	 * of mounts per super block to one.
-	 */
-	if (atomic_read(&sb->s_active) > 1)
-		return (EBUSY);
-
 	error = zfs_sb_create(osname, &zsb);
 	if (error)
 		return (error);
@@ -1201,7 +1096,6 @@ zfs_domount(struct super_block *sb, void *data, int silent)
 		goto out;

 	zsb->z_sb = sb;
-	zsb->z_vfs = zmd->z_vfs;
 	sb->s_fs_info = zsb;
 	sb->s_magic = ZFS_SUPER_MAGIC;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1298,47 +1192,18 @@ EXPORT_SYMBOL(zfs_umount);
 int
 zfs_remount(struct super_block *sb, int *flags, char *data)
 {
-	zfs_sb_t *zsb = sb->s_fs_info;
-	boolean_t readonly = B_FALSE;
-	boolean_t setuid = B_TRUE;
-	boolean_t exec = B_TRUE;
-	boolean_t devices = B_TRUE;
-	boolean_t atime = B_TRUE;
-
-	if (*flags & MS_RDONLY)
-		readonly = B_TRUE;
-
-	if (*flags & MS_NOSUID) {
-		devices = B_FALSE;
-		setuid = B_FALSE;
-	} else {
-		if (*flags & MS_NODEV)
-			devices = B_FALSE;
-	}
-
-	if (*flags & MS_NOEXEC)
-		exec = B_FALSE;
-
-	if (*flags & MS_NOATIME)
-		atime = B_FALSE;
-
 	/*
-	 * Invoke our callbacks to set required flags.
+	 * All namespace flags (MNT_*) and super block flags (MS_*) will
+	 * be handled by the Linux VFS.  Only handle custom options here.
 	 */
-	readonly_changed_cb(zsb, readonly);
-	setuid_changed_cb(zsb, setuid);
-	exec_changed_cb(zsb, exec);
-	devices_changed_cb(zsb, devices);
-	atime_changed_cb(zsb, atime);
-
 	return (0);
 }
 EXPORT_SYMBOL(zfs_remount);

 int
-zfs_vget(struct vfsmount *vfsp, struct inode **ipp, fid_t *fidp)
+zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
 {
-	zfs_sb_t	*zsb = VTOZSB(vfsp);
+	zfs_sb_t	*zsb = sb->s_fs_info;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
@@ -1122,7 +1122,7 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
-		if (!(zsb->z_flags & ZSB_XATTR_USER)) {
+		if (!(zsb->z_flags & ZSB_XATTR)) {
 			ZFS_EXIT(zsb);
 			return (EINVAL);
 		}
@@ -2420,7 +2420,7 @@ top:
 	aclp = NULL;

 	/* Can this be moved to before the top label? */
-	if (zsb->z_vfs->mnt_flags & MNT_READONLY) {
+	if (zfs_is_readonly(zsb)) {
 		err = EROFS;
 		goto out3;
 	}
@@ -73,8 +73,6 @@ static struct dentry *
 zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
    int fh_len, int fh_type)
 {
-	zfs_sb_t *zsb = sb->s_fs_info;
-	struct vfsmount *vfs = zsb->z_vfs;
 	fid_t *fid = (fid_t *)fh;
 	struct inode *ip;
 	int len_bytes, rc;
@@ -86,7 +84,7 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
 	    len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
 		return ERR_PTR(-EINVAL);

-	rc = zfs_vget(vfs, &ip, fid);
+	rc = zfs_vget(sb, &ip, fid);

 	if (rc != 0)
 		return ERR_PTR(-rc);
@@ -150,8 +150,7 @@ zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp)
 	 * MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, MNT_NOATIME, MNT_READONLY
 	 */

-	if (zsb->z_flags & ZSB_XATTR_USER)
-		seq_printf(seq, ",%s", "xattr");
+	seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

 	return (0);
 }
@@ -167,14 +166,25 @@ zpl_fill_super(struct super_block *sb, void *data, int silent)
 	return (error);
 }

+#ifdef HAVE_MOUNT_NODEV
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+    const char *osname, void *data)
+{
+	zpl_mount_data_t zmd = { osname, data };
+
+	return mount_nodev(fs_type, flags, &zmd, zpl_fill_super);
+}
+#else
 static int
 zpl_get_sb(struct file_system_type *fs_type, int flags,
    const char *osname, void *data, struct vfsmount *mnt)
 {
-	zpl_mount_data_t zmd = { osname, data, mnt };
+	zpl_mount_data_t zmd = { osname, data };

 	return get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt);
 }
+#endif /* HAVE_MOUNT_NODEV */

 static void
 zpl_kill_sb(struct super_block *sb)
@@ -213,6 +223,10 @@ const struct super_operations zpl_super_operations = {
 struct file_system_type zpl_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= ZFS_DRIVER,
+#ifdef HAVE_MOUNT_NODEV
+	.mount		= zpl_mount,
+#else
 	.get_sb		= zpl_get_sb,
+#endif /* HAVE_MOUNT_NODEV */
 	.kill_sb	= zpl_kill_sb,
 };
@@ -83,7 +83,7 @@ zpl_xattr_filldir(void *arg, const char *name, int name_len,
 	xattr_filldir_t *xf = arg;

 	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		if (!(ITOZSB(xf->inode)->z_flags & ZSB_XATTR_USER))
+		if (!(ITOZSB(xf->inode)->z_flags & ZSB_XATTR))
 			return (0);

 	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
@@ -281,7 +281,7 @@ __zpl_xattr_user_get(struct inode *ip, const char *name,
 	if (strcmp(name, "") == 0)
 		return -EINVAL;

-	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR_USER))
+	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
 		return -EOPNOTSUPP;

 	xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
@@ -302,7 +302,7 @@ __zpl_xattr_user_set(struct inode *ip, const char *name,
 	if (strcmp(name, "") == 0)
 		return -EINVAL;

-	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR_USER))
+	if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
 		return -EOPNOTSUPP;

 	xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);