mirror_zfs/include/sys/trace_acl.h
Chunwei Chen 0df9673f01 Fix atime handling and relatime
The problem for atime:

We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.

zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.

Also, zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale), but it doesn't touch z_atime. So after read(2) and write(2),
you'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.

Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.

The problem for relatime:

We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seem to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.

How Linux handles atime:

The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().

And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.

What this patch tries to do:

We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().

After this patch, only the MS_RELATIME flag will have an effect. The setting in
the dataset won't do anything. We will make zfsutil mount ZFS with MS_RELATIME
set according to the setting in the dataset in a future patch.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-04-05 18:54:55 -07:00

163 lines
5.5 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
#undef TRACE_SYSTEM
#define TRACE_SYSTEM zfs
#undef TRACE_SYSTEM_VAR
#define TRACE_SYSTEM_VAR zfs_acl
#if !defined(_TRACE_ACL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ACL_H
#include <linux/tracepoint.h>
#include <sys/types.h>
/*
 * Generic support for three argument tracepoints of the form:
 *
 * DTRACE_PROBE3(...,
 *     znode_t *, ...,
 *     zfs_ace_hdr_t *, ...,
 *     uint32_t, ...);
 *
 * The event class below records, for every such probe, a snapshot of
 * selected znode fields, selected fields of the inode embedded in the
 * znode (zn->z_inode), the type/flags/access mask of the ACE header, and
 * the mask_matched argument.
 *
 * The three sections (entry layout, assignment, print format) list the
 * fields in the same order; keep them in step when adding or removing a
 * field.
 */
DECLARE_EVENT_CLASS(zfs_ace_class,
TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched),
TP_ARGS(zn, ace, mask_matched),
/* Layout of one trace record. */
TP_STRUCT__entry(
/* Fields copied from the znode itself. */
__field(uint64_t, z_id)
__field(uint8_t, z_unlinked)
__field(uint8_t, z_atime_dirty)
__field(uint8_t, z_zn_prefetch)
__field(uint8_t, z_moved)
__field(uint_t, z_blksz)
__field(uint_t, z_seq)
__field(uint64_t, z_mapcnt)
__field(uint64_t, z_gen)
__field(uint64_t, z_size)
__field(uint64_t, z_links)
__field(uint64_t, z_pflags)
__field(uint64_t, z_uid)
__field(uint64_t, z_gid)
__field(uint32_t, z_sync_cnt)
__field(mode_t, z_mode)
__field(boolean_t, z_is_sa)
__field(boolean_t, z_is_zvol)
__field(boolean_t, z_is_mapped)
__field(boolean_t, z_is_ctldir)
__field(boolean_t, z_is_stale)
/* Fields copied from the inode embedded in the znode (zn->z_inode). */
__field(unsigned long, i_ino)
__field(unsigned int, i_nlink)
__field(u64, i_version)
__field(loff_t, i_size)
__field(unsigned int, i_blkbits)
__field(unsigned short, i_bytes)
__field(umode_t, i_mode)
__field(__u32, i_generation)
/* Fields copied from the ACE header. */
__field(uint16_t, z_type)
__field(uint16_t, z_flags)
__field(uint32_t, z_access_mask)
/* The third probe argument, stored as passed. */
__field(uint32_t, mask_matched)
),
/* Fill the record from the probe arguments. */
TP_fast_assign(
__entry->z_id = zn->z_id;
__entry->z_unlinked = zn->z_unlinked;
__entry->z_atime_dirty = zn->z_atime_dirty;
__entry->z_zn_prefetch = zn->z_zn_prefetch;
__entry->z_moved = zn->z_moved;
__entry->z_blksz = zn->z_blksz;
__entry->z_seq = zn->z_seq;
__entry->z_mapcnt = zn->z_mapcnt;
__entry->z_gen = zn->z_gen;
__entry->z_size = zn->z_size;
__entry->z_links = zn->z_links;
__entry->z_pflags = zn->z_pflags;
__entry->z_uid = zn->z_uid;
__entry->z_gid = zn->z_gid;
__entry->z_sync_cnt = zn->z_sync_cnt;
__entry->z_mode = zn->z_mode;
__entry->z_is_sa = zn->z_is_sa;
__entry->z_is_zvol = zn->z_is_zvol;
__entry->z_is_mapped = zn->z_is_mapped;
__entry->z_is_ctldir = zn->z_is_ctldir;
__entry->z_is_stale = zn->z_is_stale;
/* z_inode is accessed with '.', i.e. it is a member of the znode. */
__entry->i_ino = zn->z_inode.i_ino;
__entry->i_nlink = zn->z_inode.i_nlink;
__entry->i_version = zn->z_inode.i_version;
__entry->i_size = zn->z_inode.i_size;
__entry->i_blkbits = zn->z_inode.i_blkbits;
__entry->i_bytes = zn->z_inode.i_bytes;
__entry->i_mode = zn->z_inode.i_mode;
__entry->i_generation = zn->z_inode.i_generation;
__entry->z_type = ace->z_type;
__entry->z_flags = ace->z_flags;
__entry->z_access_mask = ace->z_access_mask;
__entry->mask_matched = mask_matched;
),
/*
 * Text rendering of a record. The conversions in the format string are
 * consumed, in order, by the __entry arguments that follow it, so the two
 * lists must stay aligned one-to-one.
 */
TP_printk("zn { id %llu unlinked %u atime_dirty %u "
"zn_prefetch %u moved %u blksz %u seq %u "
"mapcnt %llu gen %llu size %llu "
"links %llu pflags %llu uid %llu gid %llu "
"sync_cnt %u mode 0x%x is_sa %d is_zvol %d "
"is_mapped %d is_ctldir %d is_stale %d inode { "
"ino %lu nlink %u version %llu size %lli blkbits %u "
"bytes %u mode 0x%x generation %x } } ace { type %u "
"flags %u access_mask %u } mask_matched %u",
__entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
__entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz,
__entry->z_seq, __entry->z_mapcnt, __entry->z_gen,
__entry->z_size,
__entry->z_links, __entry->z_pflags, __entry->z_uid,
__entry->z_gid, __entry->z_sync_cnt, __entry->z_mode,
__entry->z_is_sa, __entry->z_is_zvol, __entry->z_is_mapped,
__entry->z_is_ctldir, __entry->z_is_stale, __entry->i_ino,
__entry->i_nlink, __entry->i_version, __entry->i_size,
__entry->i_blkbits, __entry->i_bytes, __entry->i_mode,
__entry->i_generation, __entry->z_type, __entry->z_flags,
__entry->z_access_mask, __entry->mask_matched)
);
/*
 * Declare a single event belonging to zfs_ace_class. Every ACE event uses
 * the same (znode, ACE header, matched mask) prototype as the class, so
 * only the event name varies.
 */
#define	DEFINE_ACE_EVENT(evname)					\
	DEFINE_EVENT(zfs_ace_class, evname,				\
	    TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace,			\
		uint32_t mask_matched),					\
	    TP_ARGS(zn, ace, mask_matched))

/* Events for the ACE "denies" and "allows" probes. */
DEFINE_ACE_EVENT(zfs_zfs__ace__denies);
DEFINE_ACE_EVENT(zfs_zfs__ace__allows);
#endif /* _TRACE_ACL_H */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH sys
#define TRACE_INCLUDE_FILE trace_acl
#include <trace/define_trace.h>
#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */