zfs_rename: support RENAME_* flags

Implement support for Linux's RENAME_* flags (for renameat2). Aside from
being quite useful for userspace (providing race-free ways to exchange
paths and implement mv --no-clobber), they are used by overlayfs and are
thus required in order to use overlayfs-on-ZFS.

In order to represent the new renameat2(2) flags in the ZIL, we create
two new transaction types for the two flags which need
transaction-level support (RENAME_EXCHANGE and RENAME_WHITEOUT).
RENAME_NOREPLACE does not need any ZIL support because we know that if
the operation succeeded before creating the ZIL entry, there was no file
to be clobbered and thus it can be treated as a regular TX_RENAME.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Pavel Snajdr <snajpa@snajpa.net>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Closes #12209
Closes #14070
This commit is contained in:
Aleksa Sarai
2019-06-22 10:35:11 +10:00
committed by Brian Behlendorf
parent e015d6cc0b
commit dbf6108b4d
33 changed files with 932 additions and 74 deletions
+96 -10
View File
@@ -643,18 +643,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
}
static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname,
char *tname, uint64_t rflags, vattr_t *wo_vap)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
znode_t *sdzp, *tdzp;
int error;
int vflg = 0;
int error, vflg = 0;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
/* Only Linux currently supports RENAME_* flags. */
#ifdef __linux__
VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT));
/* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */
VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
#else
VERIFY0(rflags);
#endif
if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
return (error);
@@ -667,13 +670,94 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
if (lr->lr_common.lrc_txtype & TX_CI)
vflg |= FIGNORECASE;
error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, NULL);
error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags,
wo_vap, NULL);
zrele(tdzp);
zrele(sdzp);
return (error);
}
/*
 * Replay handler for a plain rename record.
 *
 * arg1 is the zfsvfs_t being replayed into and arg2 is the lr_rename_t log
 * record. The source and target names are NUL-terminated strings stored
 * back to back immediately after the fixed-size record. The record header is
 * byteswapped in place when requested, then the work is handed to
 * do_zfs_replay_rename() with no rename flags and no whiteout attributes.
 *
 * Returns whatever do_zfs_replay_rename() returns.
 */
static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
	zfsvfs_t *fs = arg1;
	lr_rename_t *rec = arg2;
	/* First string right behind the header, second right behind that. */
	char *src_name = (char *)&rec[1];
	char *dst_name = src_name + strlen(src_name) + 1;

	if (byteswap) {
		/* Only the fixed-size header is swapped, not the names. */
		byteswap_uint64_array(rec, sizeof (*rec));
	}

	return (do_zfs_replay_rename(fs, rec, src_name, dst_name, 0, NULL));
}
/*
 * Replay handler for TX_RENAME_EXCHANGE records.
 *
 * On builds without __linux__ defined this always fails with ENOTSUP.
 * Otherwise it locates the source and target names stored directly after the
 * lr_rename_t header, byteswaps the header in place if requested, and
 * forwards to do_zfs_replay_rename() with RENAME_EXCHANGE and no whiteout
 * attributes.
 */
static int
zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
{
#ifndef __linux__
	return (SET_ERROR(ENOTSUP));
#else
	zfsvfs_t *fs = arg1;
	lr_rename_t *rec = arg2;
	/* The two NUL-terminated names trail the fixed-size record. */
	char *src_name = (char *)&rec[1];
	char *dst_name = src_name + strlen(src_name) + 1;

	if (byteswap) {
		byteswap_uint64_array(rec, sizeof (*rec));
	}

	return (do_zfs_replay_rename(fs, rec, src_name, dst_name,
	    RENAME_EXCHANGE, NULL));
#endif
}
/*
 * Replay handler for TX_RENAME_WHITEOUT records.
 *
 * On builds without __linux__ defined this always fails with ENOTSUP.
 * Otherwise arg2 is an lr_rename_whiteout_t followed by the NUL-terminated
 * source and target names. The whiteout's attributes are rebuilt from the
 * record, the logged object number is claimed with dnode_try_claim(), and
 * the rename is replayed through do_zfs_replay_rename() with
 * RENAME_WHITEOUT.
 *
 * Returns the dnode_try_claim() error if the claim fails, otherwise the
 * result of do_zfs_replay_rename().
 */
static int
zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
{
#ifndef __linux__
	return (SET_ERROR(ENOTSUP));
#else
	zfsvfs_t *fs = arg1;
	lr_rename_whiteout_t *rec = arg2;
	/* The two names trail the fixed-size lr_rename_whiteout_t. */
	char *src_name = (char *)&rec[1];
	char *dst_name = src_name + strlen(src_name) + 1;
	/* Attributes describing the whiteout file to be created. */
	xvattr_t wo_xva;
	vattr_t *wo_attrs = &wo_xva.xva_vattr;
	uint64_t wo_obj, wo_dnsize;
	int err;

	if (byteswap) {
		byteswap_uint64_array(rec, sizeof (*rec));
	}

	/* lr_wfoid packs both the object number and the dnode slot count. */
	wo_obj = LR_FOID_GET_OBJ(rec->lr_wfoid);
	wo_dnsize = LR_FOID_GET_SLOTS(rec->lr_wfoid) << DNODE_SHIFT;

	xva_init(&wo_xva);
	zfs_init_vattr(wo_attrs, ATTR_MODE | ATTR_UID | ATTR_GID,
	    rec->lr_wmode, rec->lr_wuid, rec->lr_wgid, rec->lr_wrdev, wo_obj);

	/*
	 * Like TX_CREATE, a RENAME_WHITEOUT replay reaches zfs_mknode(),
	 * which sets the new object's creation time, generation number and
	 * dnode slot count. zfs_rename() has no parameters for those, so they
	 * travel in the vattr's otherwise unused va_ctime, va_nblocks and
	 * va_fsid fields.
	 */
	ZFS_TIME_DECODE(&wo_attrs->va_ctime, rec->lr_wcrtime);
	wo_attrs->va_nblocks = rec->lr_wgen;
	wo_attrs->va_fsid = wo_dnsize;

	/* Claim the logged object number before replaying the rename. */
	err = dnode_try_claim(fs->z_os, wo_obj, wo_dnsize >> DNODE_SHIFT);
	if (err != 0) {
		return (err);
	}

	return (do_zfs_replay_rename(fs, &rec->lr_rename, src_name, dst_name,
	    RENAME_WHITEOUT, wo_attrs));
#endif
}
static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
@@ -1069,4 +1153,6 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
zfs_replay_setsaxattr, /* TX_SETSAXATTR */
zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */
zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
};