2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2017-02-03 01:13:41 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2017-02-17 22:48:20 +03:00
|
|
|
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
|
2017-02-03 01:13:41 +03:00
|
|
|
* Copyright 2017 Nexenta Systems, Inc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/sysmacros.h>
|
|
|
|
#include <sys/vfs.h>
|
|
|
|
#include <sys/vnode.h>
|
|
|
|
#include <sys/file.h>
|
|
|
|
#include <sys/kmem.h>
|
|
|
|
#include <sys/uio.h>
|
|
|
|
#include <sys/pathname.h>
|
|
|
|
#include <sys/cmn_err.h>
|
|
|
|
#include <sys/errno.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/sunddi.h>
|
|
|
|
#include <sys/random.h>
|
|
|
|
#include <sys/policy.h>
|
|
|
|
#include <sys/zfs_dir.h>
|
|
|
|
#include <sys/zfs_acl.h>
|
2014-08-05 00:30:20 +04:00
|
|
|
#include <sys/zfs_vnops.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/fs/zfs.h>
|
|
|
|
#include <sys/zap.h>
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
#include <sys/atomic.h>
|
2011-11-11 11:15:53 +04:00
|
|
|
#include <sys/zfs_ctldir.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_fuid.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/sa.h>
|
|
|
|
#include <sys/zfs_sa.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2019-09-03 03:56:41 +03:00
|
|
|
* zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
|
2008-11-20 23:01:55 +03:00
|
|
|
* of names after deciding which is the appropriate lookup interface.
|
|
|
|
*/
|
|
|
|
static int
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt,
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
|
|
|
|
{
|
2012-01-28 01:43:23 +04:00
|
|
|
boolean_t conflict = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_norm) {
|
2008-11-20 23:01:55 +03:00
|
|
|
size_t bufsz = 0;
|
|
|
|
char *buf = NULL;
|
|
|
|
|
|
|
|
if (rpnp) {
|
|
|
|
buf = rpnp->pn_buf;
|
|
|
|
bufsz = rpnp->pn_bufsize;
|
|
|
|
}
|
2017-02-03 01:13:41 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* In the non-mixed case we only expect there would ever
|
|
|
|
* be one match, but we need to use the normalizing lookup.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
|
2008-11-20 23:01:55 +03:00
|
|
|
zoid, mt, buf, bufsz, &conflict);
|
|
|
|
} else {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2012-01-28 01:43:23 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Allow multiple entries provided the first entry is
|
|
|
|
* the object id. Non-zpl consumers may safely make
|
|
|
|
* use of the additional space.
|
|
|
|
*
|
|
|
|
* XXX: This should be a feature flag for compatibility
|
|
|
|
*/
|
|
|
|
if (error == EOVERFLOW)
|
|
|
|
error = 0;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_norm && !error && deflags)
|
2012-01-28 01:43:23 +04:00
|
|
|
*deflags = conflict ? ED_CASE_CONFLICT : 0;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
*zoid = ZFS_DIRENT_OBJ(*zoid);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lock a directory entry. A dirlock on <dzp, name> protects that name
|
|
|
|
* in dzp's directory zap object. As long as you hold a dirlock, you can
|
|
|
|
* assume two things: (1) dzp cannot be reaped, and (2) no other thread
|
|
|
|
* can change the zap entry for (i.e. link or unlink) this name.
|
|
|
|
*
|
|
|
|
* Input arguments:
|
|
|
|
* dzp - znode for directory
|
|
|
|
* name - name of entry to lock
|
|
|
|
* flag - ZNEW: if the entry already exists, fail with EEXIST.
|
|
|
|
* ZEXISTS: if the entry does not exist, fail with ENOENT.
|
|
|
|
* ZSHARED: allow concurrent access with other ZSHARED callers.
|
|
|
|
* ZXATTR: we want dzp's xattr directory
|
|
|
|
* ZCILOOK: On a mixed sensitivity file system,
|
|
|
|
* this lookup should be case-insensitive.
|
|
|
|
* ZCIEXACT: On a purely case-insensitive file system,
|
|
|
|
* this lookup should be case-sensitive.
|
|
|
|
* ZRENAMING: we are locking for renaming, force narrow locks
|
2010-05-29 00:45:14 +04:00
|
|
|
* ZHAVELOCK: Don't grab the z_name_lock for this call. The
|
|
|
|
* current thread already holds it.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Output arguments:
|
|
|
|
* zpp - pointer to the znode for the entry (NULL if there isn't one)
|
|
|
|
* dlpp - pointer to the dirlock for this entry (NULL on error)
|
|
|
|
* direntflags - (case-insensitive lookup only)
|
|
|
|
* flags if multiple case-sensitive matches exist in directory
|
|
|
|
* realpnp - (case-insensitive lookup only)
|
|
|
|
* actual name matched within the directory
|
|
|
|
*
|
|
|
|
* Return value: 0 on success or errno on failure.
|
|
|
|
*
|
|
|
|
* NOTE: Always checks for, and rejects, '.' and '..'.
|
|
|
|
* NOTE: For case-insensitive file systems we take wide locks (see below),
|
|
|
|
* but return znode pointers to a single match.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
|
|
|
|
int flag, int *direntflags, pathname_t *realpnp)
|
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
boolean_t update;
|
2017-02-03 01:13:41 +03:00
|
|
|
matchtype_t mt = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t zoid;
|
|
|
|
int error = 0;
|
|
|
|
int cmpflags;
|
|
|
|
|
|
|
|
*zpp = NULL;
|
|
|
|
*dlpp = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that we are not trying to lock '.', '..', or '.zfs'
|
|
|
|
*/
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((name[0] == '.' &&
|
|
|
|
(name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
|
|
|
|
(zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EEXIST));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Case sensitivity and normalization preferences are set when
|
|
|
|
* the file system is created. These are stored in the
|
2017-03-08 03:21:37 +03:00
|
|
|
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
|
2008-11-20 23:01:55 +03:00
|
|
|
* affect what vnodes can be cached in the DNLC, how we
|
|
|
|
* perform zap lookups, and the "width" of our dirlocks.
|
|
|
|
*
|
|
|
|
* A normal dirlock locks a single name. Note that with
|
|
|
|
* normalization a name can be composed multiple ways, but
|
|
|
|
* when normalized, these names all compare equal. A wide
|
|
|
|
* dirlock locks multiple names. We need these when the file
|
|
|
|
* system is supporting mixed-mode access. It is sometimes
|
|
|
|
* necessary to lock all case permutations of file name at
|
|
|
|
* once so that simultaneous case-insensitive/case-sensitive
|
|
|
|
* behaves as rationally as possible.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2017-02-03 01:13:41 +03:00
|
|
|
* When matching we may need to normalize & change case according to
|
|
|
|
* FS settings.
|
|
|
|
*
|
|
|
|
* Note that a normalized match is necessary for a case insensitive
|
|
|
|
* filesystem when the lookup request is not exact because normalization
|
|
|
|
* can fold case independent of normalizing code point sequences.
|
|
|
|
*
|
|
|
|
* See the table above zfs_dropname().
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_norm != 0) {
|
2017-02-03 01:13:41 +03:00
|
|
|
mt = MT_NORMALIZE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine if the match needs to honor the case specified in
|
|
|
|
* lookup, and if so keep track of that so that during
|
|
|
|
* normalization we don't fold case.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
|
2017-02-03 01:13:41 +03:00
|
|
|
(flag & ZCIEXACT)) ||
|
2017-03-08 03:21:37 +03:00
|
|
|
(zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
|
2017-02-03 01:13:41 +03:00
|
|
|
mt |= MT_MATCH_CASE;
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only look in or update the DNLC if we are looking for the
|
|
|
|
* name on a file system that does not require normalization
|
|
|
|
* or case folding. We can also look there if we happen to be
|
|
|
|
* on a non-normalizing, mixed sensitivity file system IF we
|
|
|
|
* are looking for the exact name.
|
|
|
|
*
|
|
|
|
* Maybe can add TO-UPPERed version of name to dnlc in ci-only
|
|
|
|
* case for performance improvement?
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
update = !zfsvfs->z_norm ||
|
|
|
|
(zfsvfs->z_case == ZFS_CASE_MIXED &&
|
|
|
|
!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ZRENAMING indicates we are in a situation where we should
|
|
|
|
* take narrow locks regardless of the file system's
|
|
|
|
* preferences for normalizing and case folding. This will
|
|
|
|
* prevent us deadlocking trying to grab the same wide lock
|
|
|
|
* twice if the two names happen to be case-insensitive
|
|
|
|
* matches.
|
|
|
|
*/
|
|
|
|
if (flag & ZRENAMING)
|
|
|
|
cmpflags = 0;
|
|
|
|
else
|
2017-03-08 03:21:37 +03:00
|
|
|
cmpflags = zfsvfs->z_norm;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait until there are no locks on this name.
|
2010-05-29 00:45:14 +04:00
|
|
|
*
|
2019-09-03 03:56:41 +03:00
|
|
|
* Don't grab the lock if it is already held. However, cannot
|
2010-05-29 00:45:14 +04:00
|
|
|
* have both ZSHARED and ZHAVELOCK together.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
|
|
|
|
if (!(flag & ZHAVELOCK))
|
|
|
|
rw_enter(&dzp->z_name_lock, RW_READER);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dzp->z_lock);
|
|
|
|
for (;;) {
|
2016-10-13 03:30:46 +03:00
|
|
|
if (dzp->z_unlinked && !(flag & ZXATTR)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&dzp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!(flag & ZHAVELOCK))
|
|
|
|
rw_exit(&dzp->z_name_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
|
|
|
|
if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
|
|
|
|
U8_UNICODE_LATEST, &error) == 0) || error != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (error != 0) {
|
|
|
|
mutex_exit(&dzp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!(flag & ZHAVELOCK))
|
|
|
|
rw_exit(&dzp->z_name_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (dl == NULL) {
|
|
|
|
/*
|
|
|
|
* Allocate a new dirlock and add it to the list.
|
|
|
|
*/
|
|
|
|
dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
|
|
|
|
cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
|
|
|
|
dl->dl_name = name;
|
|
|
|
dl->dl_sharecnt = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
dl->dl_namelock = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
dl->dl_namesize = 0;
|
|
|
|
dl->dl_dzp = dzp;
|
|
|
|
dl->dl_next = dzp->z_dirlocks;
|
|
|
|
dzp->z_dirlocks = dl;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
|
|
|
|
break;
|
|
|
|
cv_wait(&dl->dl_cv, &dzp->z_lock);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* If the z_name_lock was NOT held for this dirlock record it.
|
|
|
|
*/
|
|
|
|
if (flag & ZHAVELOCK)
|
|
|
|
dl->dl_namelock = 1;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
|
|
|
|
/*
|
|
|
|
* We're the second shared reference to dl. Make a copy of
|
|
|
|
* dl_name in case the first thread goes away before we do.
|
|
|
|
* Note that we initialize the new name before storing its
|
|
|
|
* pointer into dl_name, because the first thread may load
|
2017-02-17 22:48:20 +03:00
|
|
|
* dl->dl_name at any time. It'll either see the old value,
|
|
|
|
* which belongs to it, or the new shared copy; either is OK.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
dl->dl_namesize = strlen(dl->dl_name) + 1;
|
|
|
|
name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
|
|
|
|
bcopy(dl->dl_name, name, dl->dl_namesize);
|
|
|
|
dl->dl_name = name;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have a dirlock on the name. (Note that it is the dirlock,
|
|
|
|
* not the dzp's z_lock, that protects the name in the zap object.)
|
|
|
|
* See if there's an object by this name; if so, put a hold on it.
|
|
|
|
*/
|
|
|
|
if (flag & ZXATTR) {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
|
2010-05-29 00:45:14 +04:00
|
|
|
sizeof (zoid));
|
|
|
|
if (error == 0)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_match_find(zfsvfs, dzp, name, mt,
|
2011-02-08 22:16:06 +03:00
|
|
|
update, direntflags, realpnp, &zoid);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (error) {
|
|
|
|
if (error != ENOENT || (flag & ZEXISTS)) {
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (flag & ZNEW) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EEXIST));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs, zoid, zpp);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*dlpp = dl;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unlock this directory entry and wake anyone who was waiting for it.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zfs_dirent_unlock(zfs_dirlock_t *dl)
|
|
|
|
{
|
|
|
|
znode_t *dzp = dl->dl_dzp;
|
|
|
|
zfs_dirlock_t **prev_dl, *cur_dl;
|
|
|
|
|
|
|
|
mutex_enter(&dzp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (!dl->dl_namelock)
|
|
|
|
rw_exit(&dzp->z_name_lock);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dl->dl_sharecnt > 1) {
|
|
|
|
dl->dl_sharecnt--;
|
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
prev_dl = &dzp->z_dirlocks;
|
|
|
|
while ((cur_dl = *prev_dl) != dl)
|
|
|
|
prev_dl = &cur_dl->dl_next;
|
|
|
|
*prev_dl = dl->dl_next;
|
|
|
|
cv_broadcast(&dl->dl_cv);
|
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
|
|
|
|
if (dl->dl_namesize != 0)
|
|
|
|
kmem_free(dl->dl_name, dl->dl_namesize);
|
|
|
|
cv_destroy(&dl->dl_cv);
|
|
|
|
kmem_free(dl, sizeof (*dl));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look up an entry in a directory.
|
|
|
|
*
|
|
|
|
* NOTE: '.' and '..' are handled as special cases because
|
|
|
|
* no directory entries are actually stored for them. If this is
|
|
|
|
* the root of a filesystem, then '.zfs' is also treated as a
|
|
|
|
* special pseudo-directory.
|
|
|
|
*/
|
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags,
|
2008-11-20 23:01:55 +03:00
|
|
|
int *deflg, pathname_t *rpnp)
|
|
|
|
{
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
znode_t *zp;
|
|
|
|
int error = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t parent;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
|
2011-02-08 22:16:06 +03:00
|
|
|
*ipp = ZTOI(dzp);
|
|
|
|
igrab(*ipp);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If we are a snapshot mounted under .zfs, return
|
2011-11-11 11:15:53 +04:00
|
|
|
* the inode pointer for the snapshot directory.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = sa_lookup(dzp->z_sa_hdl,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
2011-11-11 11:15:53 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
|
|
|
|
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
|
2011-11-11 11:15:53 +04:00
|
|
|
"snapshot", ipp, 0, kcred, NULL, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
rw_enter(&dzp->z_parent_lock, RW_READER);
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs, parent, &zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error == 0)
|
2011-02-08 22:16:06 +03:00
|
|
|
*ipp = ZTOI(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
rw_exit(&dzp->z_parent_lock);
|
|
|
|
} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
|
2011-02-08 22:16:06 +03:00
|
|
|
*ipp = zfsctl_root(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
int zf;
|
|
|
|
|
|
|
|
zf = ZEXISTS | ZSHARED;
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zf |= ZCILOOK;
|
|
|
|
|
|
|
|
error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
|
|
|
|
if (error == 0) {
|
2011-02-08 22:16:06 +03:00
|
|
|
*ipp = ZTOI(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
|
|
|
|
}
|
|
|
|
rpnp = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((flags & FIGNORECASE) && rpnp && !error)
|
|
|
|
(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* unlinked Set (formerly known as the "delete queue") Error Handling
|
|
|
|
*
|
|
|
|
* When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
|
|
|
|
* don't specify the name of the entry that we will be manipulating. We
|
|
|
|
* also fib and say that we won't be adding any new entries to the
|
|
|
|
* unlinked set, even though we might (this is to lower the minimum file
|
|
|
|
* size that can be deleted in a full filesystem). So on the small
|
|
|
|
* chance that the nlink list is using a fat zap (ie. has more than
|
|
|
|
* 2000 entries), we *may* not pre-read a block that's needed.
|
|
|
|
* Therefore it is remotely possible for some of the assertions
|
|
|
|
* regarding the unlinked set below to fail due to i/o error. On a
|
|
|
|
* nondebug system, this will result in the space being leaked.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
|
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT(zp->z_unlinked);
|
2016-07-14 17:44:38 +03:00
|
|
|
ASSERT(ZTOI(zp)->i_nlink == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
VERIFY3U(0, ==,
|
2017-03-08 03:21:37 +03:00
|
|
|
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
|
2019-02-12 21:41:15 +03:00
|
|
|
|
|
|
|
dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-07-31 22:19:47 +04:00
|
|
|
/*
|
|
|
|
* Clean up any znodes that had no links when we either crashed or
|
|
|
|
* (force) umounted the file system.
|
|
|
|
*/
|
2019-02-12 21:41:15 +03:00
|
|
|
static void
|
|
|
|
zfs_unlinked_drain_task(void *arg)
|
2014-07-31 22:19:47 +04:00
|
|
|
{
|
2019-02-12 21:41:15 +03:00
|
|
|
zfsvfs_t *zfsvfs = arg;
|
2014-07-31 22:19:47 +04:00
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t zap;
|
|
|
|
dmu_object_info_t doi;
|
|
|
|
znode_t *zp;
|
|
|
|
int error;
|
|
|
|
|
2019-02-12 21:41:15 +03:00
|
|
|
ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
|
|
|
|
|
2014-07-31 22:19:47 +04:00
|
|
|
/*
|
|
|
|
* Iterate over the contents of the unlinked set.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
|
2019-02-12 21:41:15 +03:00
|
|
|
zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
|
2014-07-31 22:19:47 +04:00
|
|
|
zap_cursor_advance(&zc)) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See what kind of object we have in list
|
|
|
|
*/
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
error = dmu_object_info(zfsvfs->z_os,
|
|
|
|
zap.za_first_integer, &doi);
|
2014-07-31 22:19:47 +04:00
|
|
|
if (error != 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
|
|
|
|
(doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
|
|
|
|
/*
|
|
|
|
* We need to re-mark these list entries for deletion,
|
|
|
|
* so we pull them back into core and set zp->z_unlinked.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
|
2014-07-31 22:19:47 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We may pick up znodes that are already marked for deletion.
|
|
|
|
* This could happen during the purge of an extended attribute
|
|
|
|
* directory. All we need to do is skip over them, since they
|
|
|
|
* are already in the system marked z_unlinked.
|
|
|
|
*/
|
|
|
|
if (error != 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
zp->z_unlinked = B_TRUE;
|
2019-02-12 21:41:15 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* iput() is Linux's equivalent to illumos' VN_RELE(). It will
|
|
|
|
* decrement the inode's ref count and may cause the inode to be
|
|
|
|
* synchronously freed. We interrupt freeing of this inode, by
|
|
|
|
* checking the return value of dmu_objset_zfs_unmounting() in
|
|
|
|
* dmu_free_long_range(), when an unmount is requested.
|
|
|
|
*/
|
2014-07-31 22:19:47 +04:00
|
|
|
iput(ZTOI(zp));
|
2019-02-12 21:41:15 +03:00
|
|
|
ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
|
2014-07-31 22:19:47 +04:00
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
2019-02-12 21:41:15 +03:00
|
|
|
|
|
|
|
zfsvfs->z_draining = B_FALSE;
|
|
|
|
zfsvfs->z_drain_task = TASKQID_INVALID;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sets z_draining then tries to dispatch async unlinked drain.
|
|
|
|
* If that fails executes synchronous unlinked drain.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
|
|
|
|
{
|
|
|
|
ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
|
|
|
|
ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
|
|
|
|
|
|
|
|
zfsvfs->z_draining = B_TRUE;
|
|
|
|
zfsvfs->z_drain_cancel = B_FALSE;
|
|
|
|
|
|
|
|
zfsvfs->z_drain_task = taskq_dispatch(
|
|
|
|
dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
|
|
|
|
zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
|
|
|
|
if (zfsvfs->z_drain_task == TASKQID_INVALID) {
|
|
|
|
zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
|
|
|
|
zfs_unlinked_drain_task(zfsvfs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for the unlinked drain taskq task to stop. This will interrupt the
|
|
|
|
* unlinked set processing if it is in progress.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
|
|
|
|
{
|
|
|
|
ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
|
|
|
|
|
|
|
|
if (zfsvfs->z_draining) {
|
|
|
|
zfsvfs->z_drain_cancel = B_TRUE;
|
|
|
|
taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
|
|
|
|
dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
|
|
|
|
zfsvfs->z_drain_task = TASKQID_INVALID;
|
|
|
|
zfsvfs->z_draining = B_FALSE;
|
|
|
|
}
|
2014-07-31 22:19:47 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Delete the entire contents of a directory. Return a count
|
|
|
|
* of the number of entries that could not be deleted. If we encounter
|
|
|
|
* an error, return a count of at least one so that the directory stays
|
|
|
|
* in the unlinked set.
|
|
|
|
*
|
|
|
|
* NOTE: this function assumes that the directory is inactive,
|
|
|
|
* so there is no need to lock its entries before deletion.
|
|
|
|
* Also, it assumes the directory contents is *only* regular
|
|
|
|
* files.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zfs_purgedir(znode_t *dzp)
|
|
|
|
{
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t zap;
|
|
|
|
znode_t *xzp;
|
|
|
|
dmu_tx_t *tx;
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirlock_t dl;
|
|
|
|
int skipped = 0;
|
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
|
2008-11-20 23:01:55 +03:00
|
|
|
(error = zap_cursor_retrieve(&zc, &zap)) == 0;
|
|
|
|
zap_cursor_advance(&zc)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
|
|
|
|
if (error) {
|
|
|
|
skipped += 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-07-31 22:19:47 +04:00
|
|
|
ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
|
|
|
|
S_ISLNK(ZTOI(xzp)->i_mode));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Is this really needed ? */
|
|
|
|
zfs_sa_upgrade_txholds(tx, xzp);
|
2014-07-07 23:49:36 +04:00
|
|
|
dmu_tx_mark_netfree(tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
2014-08-05 00:30:20 +04:00
|
|
|
zfs_iput_async(ZTOI(xzp));
|
2008-11-20 23:01:55 +03:00
|
|
|
skipped += 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
bzero(&dl, sizeof (dl));
|
|
|
|
dl.dl_dzp = dzp;
|
|
|
|
dl.dl_name = zap.za_name;
|
|
|
|
|
|
|
|
error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
|
|
|
|
if (error)
|
|
|
|
skipped += 1;
|
|
|
|
dmu_tx_commit(tx);
|
2016-07-14 17:44:38 +03:00
|
|
|
|
2014-08-05 00:30:20 +04:00
|
|
|
zfs_iput_async(ZTOI(xzp));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
if (error != ENOENT)
|
|
|
|
skipped += 1;
|
|
|
|
return (skipped);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zfs_rmnode(znode_t *zp)
|
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
|
|
|
objset_t *os = zfsvfs->z_os;
|
2008-11-20 23:01:55 +03:00
|
|
|
znode_t *xzp = NULL;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
uint64_t acl_obj;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t xattr_obj;
|
2016-07-14 17:44:38 +03:00
|
|
|
uint64_t links;
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
2016-07-14 17:44:38 +03:00
|
|
|
ASSERT(ZTOI(zp)->i_nlink == 0);
|
2011-02-08 22:16:06 +03:00
|
|
|
ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this is an attribute directory, purge its contents.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
|
2014-07-31 22:19:47 +04:00
|
|
|
if (zfs_purgedir(zp) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2014-07-31 22:19:47 +04:00
|
|
|
* Not enough space to delete some xattrs.
|
|
|
|
* Leave it in the unlinked set.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2013-01-17 22:05:42 +04:00
|
|
|
zfs_znode_dmu_fini(zp);
|
2014-07-31 22:19:47 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
2015-12-21 22:57:18 +03:00
|
|
|
* Free up all the data in the file. We don't do this for directories
|
|
|
|
* because we need truncate and remove to be in the same tx, like in
|
|
|
|
* zfs_znode_delete(). Otherwise, if we crash here we'll end up with
|
|
|
|
* an inconsistent truncated zap object in the delete queue. Note a
|
|
|
|
* truncated file is harmless since it only contains user data.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2015-12-21 22:57:18 +03:00
|
|
|
if (S_ISREG(ZTOI(zp)->i_mode)) {
|
|
|
|
error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
|
|
|
|
if (error) {
|
|
|
|
/*
|
2017-01-27 22:46:39 +03:00
|
|
|
* Not enough space or we were interrupted by unmount.
|
|
|
|
* Leave the file in the unlinked set.
|
2015-12-21 22:57:18 +03:00
|
|
|
*/
|
|
|
|
zfs_znode_dmu_fini(zp);
|
|
|
|
return;
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If the file has extended attributes, we're going to unlink
|
|
|
|
* the xattr dir.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&xattr_obj, sizeof (xattr_obj));
|
|
|
|
if (error == 0 && xattr_obj) {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(error == 0);
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
acl_obj = zfs_external_acl(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Set up the final transaction.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
tx = dmu_tx_create(os);
|
|
|
|
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (xzp) {
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (acl_obj)
|
|
|
|
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
/*
|
|
|
|
* Not enough space to delete the file. Leave it in the
|
|
|
|
* unlinked set, leaking it until the fs is remounted (at
|
|
|
|
* which point we'll call zfs_unlinked_drain() to process it).
|
|
|
|
*/
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
zfs_znode_dmu_fini(zp);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xzp) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(error == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&xzp->z_lock);
|
|
|
|
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
|
2016-07-14 17:44:38 +03:00
|
|
|
clear_nlink(ZTOI(xzp)); /* no more links to it */
|
|
|
|
links = 0;
|
2017-03-08 03:21:37 +03:00
|
|
|
VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
|
2016-07-14 17:44:38 +03:00
|
|
|
&links, sizeof (links), tx));
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&xzp->z_lock);
|
|
|
|
zfs_unlinked_add(xzp, tx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remove this znode from the unlinked set */
|
2008-12-03 23:09:06 +03:00
|
|
|
VERIFY3U(0, ==,
|
2017-03-08 03:21:37 +03:00
|
|
|
zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-02-12 21:41:15 +03:00
|
|
|
dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_znode_delete(zp, tx);
|
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
out:
|
|
|
|
if (xzp)
|
2014-08-05 00:30:20 +04:00
|
|
|
zfs_iput_async(ZTOI(xzp));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_dirent(znode_t *zp, uint64_t mode)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
uint64_t de = zp->z_id;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
|
2010-05-29 00:45:14 +04:00
|
|
|
de |= IFTODT(mode) << 60;
|
2008-11-20 23:01:55 +03:00
|
|
|
return (de);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
* Link zp into dl. Can fail in the following cases :
|
|
|
|
* - if zp has been unlinked.
|
|
|
|
* - if the number of entries with the same hash (aka. colliding entries)
|
|
|
|
* exceed the capacity of a leaf-block of fatzap and splitting of the
|
|
|
|
* leaf-block does not help.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
|
|
|
|
{
|
|
|
|
znode_t *dzp = dl->dl_dzp;
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t value;
|
2011-02-08 22:16:06 +03:00
|
|
|
int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_bulk_attr_t bulk[5];
|
|
|
|
uint64_t mtime[2], ctime[2];
|
2016-07-14 17:44:38 +03:00
|
|
|
uint64_t links;
|
2010-05-29 00:45:14 +04:00
|
|
|
int count = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
mutex_enter(&zp->z_lock);
|
|
|
|
|
|
|
|
if (!(flag & ZRENAMING)) {
|
|
|
|
if (zp->z_unlinked) { /* no new links to unlinked zp */
|
|
|
|
ASSERT(!(flag & (ZNEW | ZEXISTS)));
|
|
|
|
mutex_exit(&zp->z_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-07-14 17:44:38 +03:00
|
|
|
if (!(flag & ZNEW)) {
|
|
|
|
/*
|
|
|
|
* ZNEW nodes come from zfs_mknode() where the link
|
|
|
|
* count has already been initialised
|
|
|
|
*/
|
|
|
|
inc_nlink(ZTOI(zp));
|
|
|
|
links = ZTOI(zp)->i_nlink;
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
|
|
|
|
NULL, &links, sizeof (links));
|
2016-07-14 17:44:38 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
|
|
|
|
value = zfs_dirent(zp, zp->z_mode);
|
|
|
|
error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
|
|
|
|
&value, tx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zap_add could fail to add the entry if it exceeds the capacity of the
|
|
|
|
* leaf-block and zap_leaf_split() failed to help.
|
|
|
|
* The caller of this routine is responsible for failing the transaction
|
|
|
|
* which will rollback the SA updates done above.
|
|
|
|
*/
|
|
|
|
if (error != 0) {
|
|
|
|
if (!(flag & ZRENAMING) && !(flag & ZNEW))
|
|
|
|
drop_nlink(ZTOI(zp));
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&dzp->z_id, sizeof (dzp->z_id));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&zp->z_pflags, sizeof (zp->z_pflags));
|
|
|
|
|
|
|
|
if (!(flag & ZNEW)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
ctime, sizeof (ctime));
|
|
|
|
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside, z_atime. So you will have now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all gone after umount. And you'll
leave with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
ctime);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
|
|
|
ASSERT(error == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
|
|
|
mutex_enter(&dzp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
dzp->z_size++;
|
2016-07-14 17:44:38 +03:00
|
|
|
if (zp_is_dir)
|
|
|
|
inc_nlink(ZTOI(dzp));
|
|
|
|
links = ZTOI(dzp)->i_nlink;
|
2010-05-29 00:45:14 +04:00
|
|
|
count = 0;
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&dzp->z_size, sizeof (dzp->z_size));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
|
2016-07-14 17:44:38 +03:00
|
|
|
&links, sizeof (links));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
mtime, sizeof (mtime));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
ctime, sizeof (ctime));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&dzp->z_pflags, sizeof (dzp->z_pflags));
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside, z_atime. So you will have now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all gone after umount. And you'll
leave with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
|
2010-05-29 00:45:14 +04:00
|
|
|
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
|
|
|
|
ASSERT(error == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2017-02-03 01:13:41 +03:00
|
|
|
/*
|
|
|
|
* The match type in the code for this function should conform to:
|
|
|
|
*
|
|
|
|
* ------------------------------------------------------------------------
|
|
|
|
* fs type | z_norm | lookup type | match type
|
|
|
|
* ---------|-------------|-------------|----------------------------------
|
|
|
|
* CS !norm | 0 | 0 | 0 (exact)
|
|
|
|
* CS norm | formX | 0 | MT_NORMALIZE
|
|
|
|
* CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
|
|
|
|
* CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
|
|
|
|
* CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
|
|
|
|
* CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
|
|
|
|
* CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
|
|
|
|
* CM !norm | upper | ZCILOOK | MT_NORMALIZE
|
|
|
|
* CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
|
|
|
|
* CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
|
|
|
|
*
|
|
|
|
* Abbreviations:
|
|
|
|
* CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
|
|
|
|
* upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
|
|
|
|
* formX = unicode normalization form set on fs creation
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
static int
|
|
|
|
zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
|
|
|
|
int flag)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (ZTOZSB(zp)->z_norm) {
|
2017-02-03 01:13:41 +03:00
|
|
|
matchtype_t mt = MT_NORMALIZE;
|
|
|
|
|
|
|
|
if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
|
2010-05-29 00:45:14 +04:00
|
|
|
(flag & ZCIEXACT)) ||
|
2017-02-03 01:13:41 +03:00
|
|
|
(ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
|
|
|
|
!(flag & ZCILOOK))) {
|
|
|
|
mt |= MT_MATCH_CASE;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
|
|
|
|
dl->dl_name, mt, tx);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
2017-02-03 01:13:41 +03:00
|
|
|
error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
|
|
|
|
tx);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2012-08-26 02:01:39 +04:00
|
|
|
* Unlink zp from dl, and mark zp for deletion if this was the last link. Can
|
|
|
|
* fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
|
2008-11-20 23:01:55 +03:00
|
|
|
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
|
|
|
|
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
|
|
|
|
* and it's the caller's job to do it.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
|
2017-01-12 20:42:11 +03:00
|
|
|
boolean_t *unlinkedp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
znode_t *dzp = dl->dl_dzp;
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2011-02-08 22:16:06 +03:00
|
|
|
int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t unlinked = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_bulk_attr_t bulk[5];
|
|
|
|
uint64_t mtime[2], ctime[2];
|
2016-07-14 17:44:38 +03:00
|
|
|
uint64_t links;
|
2010-05-29 00:45:14 +04:00
|
|
|
int count = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
|
|
|
if (!(flag & ZRENAMING)) {
|
|
|
|
mutex_enter(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (zp_is_dir && !zfs_dirempty(zp)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&zp->z_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOTEMPTY));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we get here, we are going to try to remove the object.
|
|
|
|
* First try removing the name from the directory; if that
|
|
|
|
* fails, return the error.
|
|
|
|
*/
|
|
|
|
error = zfs_dropname(dl, zp, dzp, tx, flag);
|
|
|
|
if (error != 0) {
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2016-07-14 17:44:38 +03:00
|
|
|
if (ZTOI(zp)->i_nlink <= zp_is_dir) {
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_panic_recover("zfs: link count on %lu is %u, "
|
|
|
|
"should be at least %u", zp->z_id,
|
2016-07-14 17:44:38 +03:00
|
|
|
(int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
|
|
|
|
set_nlink(ZTOI(zp), zp_is_dir + 1);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-07-14 17:44:38 +03:00
|
|
|
drop_nlink(ZTOI(zp));
|
|
|
|
if (ZTOI(zp)->i_nlink == zp_is_dir) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zp->z_unlinked = B_TRUE;
|
2016-07-14 17:44:38 +03:00
|
|
|
clear_nlink(ZTOI(zp));
|
2008-11-20 23:01:55 +03:00
|
|
|
unlinked = B_TRUE;
|
|
|
|
} else {
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &ctime, sizeof (ctime));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &zp->z_pflags, sizeof (zp->z_pflags));
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside, z_atime. So you will have now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all gone after umount. And you'll
leave with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
|
|
|
|
ctime);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-07-14 17:44:38 +03:00
|
|
|
links = ZTOI(zp)->i_nlink;
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
|
2016-07-14 17:44:38 +03:00
|
|
|
NULL, &links, sizeof (links));
|
2010-05-29 00:45:14 +04:00
|
|
|
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
|
|
|
count = 0;
|
|
|
|
ASSERT(error == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
|
|
|
error = zfs_dropname(dl, zp, dzp, tx, flag);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&dzp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
dzp->z_size--; /* one dirent removed */
|
2016-07-14 17:44:38 +03:00
|
|
|
if (zp_is_dir)
|
|
|
|
drop_nlink(ZTOI(dzp)); /* ".." link from zp */
|
|
|
|
links = ZTOI(dzp)->i_nlink;
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
|
2016-07-14 17:44:38 +03:00
|
|
|
NULL, &links, sizeof (links));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &dzp->z_size, sizeof (dzp->z_size));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, ctime, sizeof (ctime));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, mtime, sizeof (mtime));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside, z_atime. So you will have now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all gone after umount. And you'll
leave with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
|
2010-05-29 00:45:14 +04:00
|
|
|
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(error == 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&dzp->z_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (unlinkedp != NULL)
|
|
|
|
*unlinkedp = unlinked;
|
|
|
|
else if (unlinked)
|
|
|
|
zfs_unlinked_add(zp, tx);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Indicate whether the directory is empty. Works with or without z_lock
|
|
|
|
* held, but can only be consider a hint in the latter case. Returns true
|
|
|
|
* if only "." and ".." remain and there's no work in progress.
|
2018-01-08 21:57:47 +03:00
|
|
|
*
|
|
|
|
* The internal ZAP size, rather than zp->z_size, needs to be checked since
|
|
|
|
* some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zfs_dirempty(znode_t *dzp)
|
|
|
|
{
|
2018-01-08 21:57:47 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
|
|
|
uint64_t count;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (dzp->z_dirlocks != NULL)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
|
|
|
|
if (error != 0 || count != 0)
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
return (B_TRUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
znode_t *xzp;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_t acl_ids;
|
|
|
|
boolean_t fuid_dirtied;
|
2011-02-08 22:16:06 +03:00
|
|
|
#ifdef DEBUG
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t parent;
|
2011-02-08 22:16:06 +03:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
*xipp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
|
|
|
|
&acl_ids)) != 0)
|
|
|
|
return (error);
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EDQUOT));
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
|
|
|
|
ZFS_SA_BASE_ATTR_SIZE);
|
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
|
2017-03-08 03:21:37 +03:00
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
2013-11-23 03:13:18 +04:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_abort(tx);
|
|
|
|
return (error);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#ifdef DEBUG
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&parent, sizeof (parent));
|
|
|
|
ASSERT(error == 0 && parent == zp->z_id);
|
|
|
|
#endif
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
|
2010-05-29 00:45:14 +04:00
|
|
|
sizeof (xzp->z_id), tx));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-10-13 03:30:46 +03:00
|
|
|
if (!zp->z_unlinked)
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
|
2016-10-13 03:30:46 +03:00
|
|
|
xzp, "", NULL, acl_ids.z_fuidp, vap);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
*xipp = ZTOI(xzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return a znode for the extended attribute directory for zp.
|
|
|
|
* ** If the directory does not already exist, it is created **
|
|
|
|
*
|
|
|
|
* IN: zp - znode to obtain attribute directory from
|
|
|
|
* cr - credentials of caller
|
|
|
|
* flags - flags from the VOP_LOOKUP call
|
|
|
|
*
|
2011-02-08 22:16:06 +03:00
|
|
|
* OUT: xipp - pointer to extended attribute znode
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 on success
|
|
|
|
* error number on failure
|
|
|
|
*/
|
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
znode_t *xzp;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
vattr_t va;
|
|
|
|
int error;
|
|
|
|
top:
|
|
|
|
error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (xzp != NULL) {
|
2011-02-08 22:16:06 +03:00
|
|
|
*xipp = ZTOI(xzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(flags & CREATE_XATTR_DIR)) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_is_readonly(zfsvfs)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EROFS));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The ability to 'create' files in an attribute
|
|
|
|
* directory comes from the write_xattr permission on the base file.
|
|
|
|
*
|
|
|
|
* The ability to 'search' an attribute directory requires
|
|
|
|
* read_xattr permission on the base file.
|
|
|
|
*
|
|
|
|
* Once in a directory the ability to read/write attributes
|
|
|
|
* is controlled by the permissions on the attribute file.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
|
2008-11-20 23:01:55 +03:00
|
|
|
va.va_mode = S_IFDIR | S_ISVTX | 0777;
|
|
|
|
zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
|
|
|
|
|
2011-04-17 21:42:33 +04:00
|
|
|
va.va_dentry = NULL;
|
2011-02-08 22:16:06 +03:00
|
|
|
error = zfs_make_xattrdir(zp, &va, xipp, cr);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/* NB: we already did dmu_tx_wait() if necessary */
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decide whether it is okay to remove within a sticky directory.
|
|
|
|
*
|
|
|
|
* In sticky directories, write access is not sufficient;
|
|
|
|
* you can remove entries from a directory only if:
|
|
|
|
*
|
|
|
|
* you own the directory,
|
|
|
|
* you own the entry,
|
2017-12-04 22:55:57 +03:00
|
|
|
* you have write access to the entry,
|
2008-11-20 23:01:55 +03:00
|
|
|
* or you are privileged (checked in secpolicy...).
|
|
|
|
*
|
|
|
|
* The function returns 0 if remove access is granted.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
|
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
uid_t uid;
|
2010-08-27 01:24:34 +04:00
|
|
|
uid_t downer;
|
|
|
|
uid_t fowner;
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zdp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_replay)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zdp->z_mode & S_ISVTX) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
|
2016-05-22 14:15:57 +03:00
|
|
|
cr, ZFS_OWNER);
|
2017-03-08 03:21:37 +03:00
|
|
|
fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
|
2016-05-22 14:15:57 +03:00
|
|
|
cr, ZFS_OWNER);
|
2010-08-27 01:24:34 +04:00
|
|
|
|
|
|
|
if ((uid = crgetuid(cr)) == downer || uid == fowner ||
|
2017-12-04 22:55:57 +03:00
|
|
|
zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
else
|
|
|
|
return (secpolicy_vnode_remove(cr));
|
|
|
|
}
|