zfs_rename: support RENAME_* flags

Implement support for Linux's RENAME_* flags (for renameat2). Aside from being quite useful for userspace (providing race-free ways to exchange paths and implement mv --no-clobber), they are used by overlayfs and are thus required in order to use overlayfs-on-ZFS. In order for us to represent the new renameat2(2) flags in the ZIL, we create two new transaction types for the two flags which need transactional-level support (RENAME_EXCHANGE and RENAME_WHITEOUT). RENAME_NOREPLACE does not need any ZIL support because we know that if the operation succeeded before creating the ZIL entry, there was no file to be clobbered and thus it can be treated as a regular TX_RENAME. Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Pavel Snajdr <snajpa@snajpa.net> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> Closes #12209 Closes #14070
2026-05-22 02:27:36 +03:00 · 2019-06-22 10:35:11 +10:00
parent e015d6cc0b
commit dbf6108b4d
33 changed files with 932 additions and 74 deletions
@@ -3420,7 +3420,7 @@ out:

 int
 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
-    cred_t *cr, int flags, zuserns_t *mnt_ns)
+    cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
 {
 	struct componentname scn, tcn;
 	vnode_t *sdvp, *tdvp;
@@ -3428,6 +3428,9 @@ zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
 	int error;
 	svp = tvp = NULL;

+	if (rflags != 0 || wo_vap != NULL)
+		return (SET_ERROR(EINVAL));
+
 	sdvp = ZTOV(sdzp);
 	tdvp = ZTOV(tdzp);
 	error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
@@ -1035,7 +1035,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 		}

 		/* The only error is !zfs_dirempty() and we checked earlier. */
-		ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0);
+		error = zfs_drop_nlink_locked(zp, tx, &unlinked);
+		ASSERT3U(error, ==, 0);
 		mutex_exit(&zp->z_lock);
 	} else {
 		error = zfs_dropname(dl, zp, dzp, tx, flag);
@@ -2655,6 +2655,8 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		flags	- case flags
+ *		rflags  - RENAME_* flags
+ *		wa_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
 *		mnt_ns	- user namespace of the mount
 *
 *	RETURN:	0 on success, error code on failure.
@@ -2664,7 +2666,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 */
 int
 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
-    cred_t *cr, int flags, zuserns_t *mnt_ns)
+    cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
 {
 	znode_t		*szp, *tzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(sdzp);
@@ -2676,10 +2678,33 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
 	int		error = 0;
 	int		zflg = 0;
 	boolean_t	waited = B_FALSE;
+	/* Needed for whiteout inode creation. */
+	boolean_t	fuid_dirtied;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	have_acl = B_FALSE;
+	znode_t		*wzp = NULL;
+

 	if (snm == NULL || tnm == NULL)
 		return (SET_ERROR(EINVAL));

+	if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return (SET_ERROR(EINVAL));
+
+	/* Already checked by Linux VFS, but just to make sure. */
+	if (rflags & RENAME_EXCHANGE &&
+	    (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
+	 * right kind of vattr_t for the whiteout file. These are set
+	 * internally by ZFS so should never be incorrect.
+	 */
+	VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
+	VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
+	VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
+
 	if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
@@ -2856,7 +2881,6 @@ top:
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
-
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
 		goto out;

@@ -2873,16 +2897,22 @@ top:
 	 * Does target exist?
 	 */
 	if (tzp) {
-		/*
-		 * Source and target must be the same type.
-		 */
-		boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
-		boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
-
-		if (s_is_dir != t_is_dir) {
-			error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
+		if (rflags & RENAME_NOREPLACE) {
+			error = SET_ERROR(EEXIST);
 			goto out;
 		}
+		/*
+		 * Source and target must be the same type (unless exchanging).
+		 */
+		if (!(rflags & RENAME_EXCHANGE)) {
+			boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
+			boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
+
+			if (s_is_dir != t_is_dir) {
+				error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
+				goto out;
+			}
+		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
@@ -2892,12 +2922,43 @@ top:
 			error = 0;
 			goto out;
 		}
+	} else if (rflags & RENAME_EXCHANGE) {
+		/* Target must exist for RENAME_EXCHANGE. */
+		error = SET_ERROR(ENOENT);
+		goto out;
+	}
+
+	/* Set up inode creation for RENAME_WHITEOUT. */
+	if (rflags & RENAME_WHITEOUT) {
+		/*
+		 * Whiteout files are not regular files or directories, so to
+		 * match zfs_create() we do not inherit the project id.
+		 */
+		uint64_t wo_projid = ZFS_DEFAULT_PROJID;
+
+		error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
+		if (error)
+			goto out;
+
+		if (!have_acl) {
+			error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
+			    &acl_ids, mnt_ns);
+			if (error)
+				goto out;
+			have_acl = B_TRUE;
+		}
+
+		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
+			error = SET_ERROR(EDQUOT);
+			goto out;
+		}
 	}

 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
-	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+	dmu_tx_hold_zap(tx, sdzp->z_id,
+	    (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
@@ -2907,7 +2968,21 @@ top:
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
+	if (rflags & RENAME_WHITEOUT) {
+		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+		    ZFS_SA_BASE_ATTR_SIZE);

+		dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
+		dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+		if (!zfsvfs->z_use_sa &&
+		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, acl_ids.z_aclp->z_acl_bytes);
+		}
+	}
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
@@ -2946,7 +3021,7 @@ top:

 	error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 	    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
-	ASSERT0(error);
+	VERIFY0(error);

 	error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 	if (error)
@@ -2956,13 +3031,30 @@ top:
 	 * Unlink the target.
 	 */
 	if (tzp) {
-		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+		int tzflg = zflg;
+
+		if (rflags & RENAME_EXCHANGE) {
+			/* This inode will be re-linked soon. */
+			tzflg |= ZRENAMING;
+
+			tzp->z_pflags |= ZFS_AV_MODIFIED;
+			if (sdzp->z_pflags & ZFS_PROJINHERIT)
+				tzp->z_pflags |= ZFS_PROJINHERIT;
+
+			error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
+		}
+		error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
 		if (error)
 			goto commit_link_szp;
 	}

 	/*
-	 * Create a new link at the target.
+	 * Create the new target links:
+	 *   * We always link the target.
+	 *   * RENAME_EXCHANGE: Link the old target to the source.
+	 *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
 	 */
 	error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 	if (error) {
@@ -2975,18 +3067,55 @@ top:
 		goto commit_link_tzp;
 	}

-	zfs_log_rename(zilog, tx, TX_RENAME |
-	    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
-	    sdl->dl_name, tdzp, tdl->dl_name, szp);
+	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+	case RENAME_EXCHANGE:
+		error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
+		/*
+		 * The same argument as zfs_link_create() failing for
+		 * szp applies here, since the source directory must
+		 * have had an entry we are replacing.
+		 */
+		ASSERT0(error);
+		if (error)
+			goto commit_unlink_td_szp;
+		break;
+	case RENAME_WHITEOUT:
+		zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
+		error = zfs_link_create(sdl, wzp, tx, ZNEW);
+		if (error) {
+			zfs_znode_delete(wzp, tx);
+			remove_inode_hash(ZTOI(wzp));
+			goto commit_unlink_td_szp;
+		}
+		break;
+	}
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+	case RENAME_EXCHANGE:
+		zfs_log_rename_exchange(zilog, tx,
+		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+		    tdzp, tdl->dl_name, szp);
+		break;
+	case RENAME_WHITEOUT:
+		zfs_log_rename_whiteout(zilog, tx,
+		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+		    tdzp, tdl->dl_name, szp, wzp);
+		break;
+	default:
+		ASSERT0(rflags & ~RENAME_NOREPLACE);
+		zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
+		    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+		break;
+	}

 commit:
 	dmu_tx_commit(tx);
 out:
-	if (zl != NULL)
-		zfs_rename_unlock(&zl);
-
-	zfs_dirent_unlock(sdl);
-	zfs_dirent_unlock(tdl);
+	if (have_acl)
+		zfs_acl_ids_free(&acl_ids);

 	zfs_znode_update_vfs(sdzp);
 	if (sdzp == tdzp)
@@ -2997,11 +3126,21 @@ out:

 	zfs_znode_update_vfs(szp);
 	zrele(szp);
+	if (wzp) {
+		zfs_znode_update_vfs(wzp);
+		zrele(wzp);
+	}
 	if (tzp) {
 		zfs_znode_update_vfs(tzp);
 		zrele(tzp);
 	}

+	if (zl != NULL)
+		zfs_rename_unlock(&zl);
+
+	zfs_dirent_unlock(sdl);
+	zfs_dirent_unlock(tdl);
+
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);

@@ -3012,23 +3151,31 @@ out:
 	 * Clean-up path for broken link state.
 	 *
 	 * At this point we are in a (very) bad state, so we need to do our
-	 * best to correct the state. In particular, the nlink of szp is wrong
-	 * because we were destroying and creating links with ZRENAMING.
+	 * best to correct the state. In particular, all of the nlinks are
+	 * wrong because we were destroying and creating links with ZRENAMING.
 	 *
-	 * link_create()s are allowed to fail (though they shouldn't because we
-	 * only just unlinked them and are putting the entries back during
-	 * clean-up). But if they fail, we can just forcefully drop the nlink
-	 * value to (at the very least) avoid broken nlink values -- though in
-	 * the case of non-empty directories we will have to panic.
+	 * In some form, all of these operations have to resolve the state:
+	 *
+	 *  * link_destroy() *must* succeed. Fortunately, this is very likely
+	 *    since we only just created it.
+	 *
+	 *  * link_create()s are allowed to fail (though they shouldn't because
+	 *    we only just unlinked them and are putting the entries back
+	 *    during clean-up). But if they fail, we can just forcefully drop
+	 *    the nlink value to (at the very least) avoid broken nlink values
+	 *    -- though in the case of non-empty directories we will have to
+	 *    panic (otherwise we'd have a leaked directory with a broken ..).
 	 */
+commit_unlink_td_szp:
+	VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
 commit_link_tzp:
 	if (tzp) {
 		if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
-			VERIFY3U(zfs_drop_nlink(tzp, tx, NULL), ==, 0);
+			VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
 	}
 commit_link_szp:
 	if (zfs_link_create(sdl, szp, tx, ZRENAMING))
-		VERIFY3U(zfs_drop_nlink(szp, tx, NULL), ==, 0);
+		VERIFY0(zfs_drop_nlink(szp, tx, NULL));
 	goto commit;
 }

@@ -422,7 +422,12 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
 		break;

 	case S_IFDIR:
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+		ip->i_flags |= S_IOPS_WRAPPER;
+		ip->i_op = &zpl_dir_inode_operations.ops;
+#else
 		ip->i_op = &zpl_dir_inode_operations;
+#endif
 		ip->i_fop = &zpl_dir_file_operations;
 		ITOZ(ip)->z_zn_prefetch = B_TRUE;
 		break;
@@ -24,6 +24,7 @@
 */


+#include <sys/sysmacros.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
@@ -498,35 +499,42 @@ static int
 #ifdef HAVE_IOPS_RENAME_USERNS
 zpl_rename2(struct user_namespace *user_ns, struct inode *sdip,
    struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
-    unsigned int flags)
+    unsigned int rflags)
 #else
 zpl_rename2(struct inode *sdip, struct dentry *sdentry,
-    struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+    struct inode *tdip, struct dentry *tdentry, unsigned int rflags)
 #endif
 {
 	cred_t *cr = CRED();
+	vattr_t *wo_vap = NULL;
 	int error;
 	fstrans_cookie_t cookie;
 #ifndef HAVE_IOPS_RENAME_USERNS
 	zuserns_t *user_ns = NULL;
 #endif

-	/* We don't have renameat2(2) support */
-	if (flags)
-		return (-EINVAL);
-
 	crhold(cr);
+	if (rflags & RENAME_WHITEOUT) {
+		wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+		zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns);
+		wo_vap->va_rdev = makedevice(0, 0);
+	}
+
 	cookie = spl_fstrans_mark();
 	error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
-	    dname(tdentry), cr, 0, user_ns);
+	    dname(tdentry), cr, 0, rflags, wo_vap, user_ns);
 	spl_fstrans_unmark(cookie);
+	if (wo_vap)
+		kmem_free(wo_vap, sizeof (vattr_t));
 	crfree(cr);
 	ASSERT3S(error, <=, 0);

 	return (error);
 }

-#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
+#if !defined(HAVE_IOPS_RENAME_USERNS) && \
+	!defined(HAVE_RENAME_WANTS_FLAGS) && \
+	!defined(HAVE_RENAME2)
 static int
 zpl_rename(struct inode *sdip, struct dentry *sdentry,
    struct inode *tdip, struct dentry *tdentry)
@@ -745,7 +753,12 @@ const struct inode_operations zpl_inode_operations = {
 #endif /* CONFIG_FS_POSIX_ACL */
 };

+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+const struct inode_operations_wrapper zpl_dir_inode_operations = {
+	.ops = {
+#else
 const struct inode_operations zpl_dir_inode_operations = {
+#endif
 	.create		= zpl_create,
 	.lookup		= zpl_lookup,
 	.link		= zpl_link,
@@ -754,7 +767,9 @@ const struct inode_operations zpl_dir_inode_operations = {
 	.mkdir		= zpl_mkdir,
 	.rmdir		= zpl_rmdir,
 	.mknod		= zpl_mknod,
-#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
+#ifdef HAVE_RENAME2
+	.rename2	= zpl_rename2,
+#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
 	.rename		= zpl_rename2,
 #else
 	.rename		= zpl_rename,
@@ -776,6 +791,10 @@ const struct inode_operations zpl_dir_inode_operations = {
 #endif /* HAVE_SET_ACL */
 	.get_acl	= zpl_get_acl,
 #endif /* CONFIG_FS_POSIX_ACL */
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+	},
+	.rename2	= zpl_rename2,
+#endif
 };

 const struct inode_operations zpl_symlink_inode_operations = {