mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-02-27 09:18:20 +03:00

The Linux 3.1 kernel has introduced the concept of per-filesystem shrinkers which are directly associated with a super block. Prior to this change there was one shared global shrinker. The zfs code relied on being able to call the global shrinker when the arc_meta_limit was exceeded. This would cause the VFS to drop references on a fraction of the dentries in the dcache. The ARC could then safely reclaim the memory used by these entries and honor the arc_meta_limit. Unfortunately, when per-filesystem shrinkers were added the old interfaces were made unavailable. This change adds support to use the new per-filesystem shrinker interface so we can continue to honor the arc_meta_limit. The major benefit of the new interface is that we can now target only the zfs filesystem for dentry and inode pruning. Thus we can minimize any impact on the caching of other filesystems. In the context of making this change several other important issues related to managing the ARC were addressed. They include: * The dnlc_reduce_cache() function which was called by the ARC to drop dentries for the POSIX layer was replaced with a generic zfs_prune_t callback. The ZPL layer now registers a callback to drop these dentries, removing a layering violation which dates back to the Solaris code. This callback can also be used by other ARC consumers such as Lustre. arc_add_prune_callback() arc_remove_prune_callback() * The arc_reduce_dnlc_percent module option has been changed to arc_meta_prune for clarity. The dnlc functions are specific to Solaris's VFS and have already been largely eliminated. The replacement tunable now represents the number of bytes the prune callback will request when invoked. * Less aggressively invoke the prune callback. We used to call this whenever we exceeded the arc_meta_limit; however, that's not strictly correct since it results in overzealous reclaim of dentries and inodes. 
It is now only called once the arc_meta_limit is exceeded and every effort has been made to evict other data from the ARC cache. * More promptly manage exceeding the arc_meta_limit. When reading metadata into the cache, if a buffer was unable to be recycled, notify the arc_reclaim thread to invoke the required prune. * Added arcstat_prune kstat which is incremented when the ARC is forced to request that a consumer prune its cache. Remember this will only occur when the ARC has no other choice. If it can evict buffers safely without invoking the prune callback it will. * This change is also expected to resolve the unexpected collapses of the ARC cache. This would occur because when just the arc_meta_limit was exceeded, reclaim pressure would be exerted on the arc_c value via arc_shrink(). This effectively shrunk the entire cache when really we just needed to reclaim metadata. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #466 Closes #292
209 lines
7.7 KiB
C
209 lines
7.7 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_FS_ZFS_VFSOPS_H
|
|
#define _SYS_FS_ZFS_VFSOPS_H
|
|
|
|
#include <sys/isa_defs.h>
|
|
#include <sys/types32.h>
|
|
#include <sys/list.h>
|
|
#include <sys/vfs.h>
|
|
#include <sys/zil.h>
|
|
#include <sys/sa.h>
|
|
#include <sys/rrwlock.h>
|
|
#include <sys/zfs_ioctl.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
struct zfs_sb;
|
|
struct znode;
|
|
|
|
typedef struct zfs_sb {
|
|
struct super_block *z_sb; /* generic super_block */
|
|
struct backing_dev_info z_bdi; /* generic backing dev info */
|
|
struct zfs_sb *z_parent; /* parent fs */
|
|
objset_t *z_os; /* objset reference */
|
|
uint64_t z_flags; /* super_block flags */
|
|
uint64_t z_root; /* id of root znode */
|
|
uint64_t z_unlinkedobj; /* id of unlinked zapobj */
|
|
uint64_t z_max_blksz; /* maximum block size for files */
|
|
uint64_t z_fuid_obj; /* fuid table object number */
|
|
uint64_t z_fuid_size; /* fuid table size */
|
|
avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
|
|
avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
|
|
krwlock_t z_fuid_lock; /* fuid lock */
|
|
boolean_t z_fuid_loaded; /* fuid tables are loaded */
|
|
boolean_t z_fuid_dirty; /* need to sync fuid table ? */
|
|
struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
|
|
zilog_t *z_log; /* intent log pointer */
|
|
uint_t z_acl_inherit; /* acl inheritance behavior */
|
|
zfs_case_t z_case; /* case-sense */
|
|
boolean_t z_utf8; /* utf8-only */
|
|
int z_norm; /* normalization flags */
|
|
boolean_t z_atime; /* enable atimes mount option */
|
|
boolean_t z_unmounted; /* unmounted */
|
|
rrwlock_t z_teardown_lock;
|
|
krwlock_t z_teardown_inactive_lock;
|
|
list_t z_all_znodes; /* all znodes in the fs */
|
|
uint64_t z_nr_znodes; /* number of znodes in the fs */
|
|
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
|
|
struct inode *z_ctldir; /* .zfs directory inode */
|
|
boolean_t z_show_ctldir; /* expose .zfs in the root dir */
|
|
boolean_t z_issnap; /* true if this is a snapshot */
|
|
boolean_t z_vscan; /* virus scan on/off */
|
|
boolean_t z_use_fuids; /* version allows fuids */
|
|
boolean_t z_replay; /* set during ZIL replay */
|
|
boolean_t z_use_sa; /* version allow system attributes */
|
|
boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */
|
|
uint64_t z_version; /* ZPL version */
|
|
uint64_t z_shares_dir; /* hidden shares dir */
|
|
kmutex_t z_lock;
|
|
uint64_t z_userquota_obj;
|
|
uint64_t z_groupquota_obj;
|
|
uint64_t z_replay_eof; /* New end of file - replay only */
|
|
sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
|
|
#define ZFS_OBJ_MTX_SZ 64
|
|
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
|
|
} zfs_sb_t;
|
|
|
|
/* Magic number identifying a ZFS super block. */
#define	ZFS_SUPER_MAGIC	0x2fc12fc1

/* Flag bit (see the z_flags field of zfs_sb_t). */
#define	ZSB_XATTR	0x0001		/* Enable user xattrs */
|
|
|
|
|
|
/*
 * Minimal snapshot helpers, the bulk of the Linux snapshot implementation
 * lives in the zpl_snap.c file which is part of the zpl source.
 */
#define	ZFS_CTLDIR_NAME		".zfs"

/*
 * zfs_has_ctldir(zdp): non-zero when zdp is the root znode of its
 * filesystem (z_id equals the super block's z_root) and that super block
 * has a control directory inode (z_ctldir != NULL).
 *
 * zfs_show_ctldir(zdp): as above, and additionally the super block's
 * z_show_ctldir flag is set.
 *
 * Both macros evaluate their argument more than once; do not pass an
 * expression with side effects.
 */
#define	zfs_has_ctldir(zdp)	\
	((zdp)->z_id == ZTOZSB(zdp)->z_root && \
	(ZTOZSB(zdp)->z_ctldir != NULL))
#define	zfs_show_ctldir(zdp)	\
	(zfs_has_ctldir(zdp) && \
	(ZTOZSB(zdp)->z_show_ctldir))
|
|
|
|
/*
 * Fixed ZFSCTL_INO_* constants.
 * NOTE(review): presumably the inode numbers of the .zfs control
 * directory entries - confirm against the users of these macros.
 */
#define	ZFSCTL_INO_ROOT		0x1
#define	ZFSCTL_INO_SNAPDIR	0x2
#define	ZFSCTL_INO_SHARES	0x3
|
|
|
|
/*
 * Allow a maximum number of links.  While ZFS does not internally limit
 * this, most Linux filesystems do.  It's probably a good idea to limit
 * this to a large value until it is validated that this is safe.
 */
#define	ZFS_LINK_MAX		65536
|
|
|
|
/*
 * Normal filesystems (those not under .zfs/snapshot) have a total
 * file ID size limited to 12 bytes (including the length field) due to
 * NFSv2 protocol's limitation of 32 bytes for a filehandle.  For historical
 * reasons, this same limit is being imposed by the Solaris NFSv3 implementation
 * (although the NFSv3 protocol actually permits a maximum of 64 bytes).  It
 * is not possible to expand beyond 12 bytes without abandoning support
 * of NFSv2.
 *
 * For normal filesystems, we partition up the available space as follows:
 *	2 bytes		fid length (required)
 *	6 bytes		object number (48 bits)
 *	4 bytes		generation number (32 bits)
 *
 * We reserve only 48 bits for the object number, as this is the limit
 * currently defined and imposed by the DMU.
 */
typedef struct zfid_short {
	uint16_t	zf_len;		/* fid length (see layout above) */
	uint8_t		zf_object[6];	/* obj[i] = obj >> (8 * i) */
	uint8_t		zf_gen[4];	/* gen[i] = gen >> (8 * i) */
} zfid_short_t;
|
|
|
|
/*
|
|
* Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
|
|
* (including the length field). This makes files under .zfs/snapshot
|
|
* accessible by NFSv3 and NFSv4, but not NFSv2.
|
|
*
|
|
* For files under .zfs/snapshot, we partition up the available space
|
|
* as follows:
|
|
* 2 bytes fid length (required)
|
|
* 6 bytes object number (48 bits)
|
|
* 4 bytes generation number (32 bits)
|
|
* 6 bytes objset id (48 bits)
|
|
* 4 bytes currently just zero (32 bits)
|
|
*
|
|
* We reserve only 48 bits for the object number and objset id, as these are
|
|
* the limits currently defined and imposed by the DMU.
|
|
*/
|
|
typedef struct zfid_long {
|
|
zfid_short_t z_fid;
|
|
uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
|
|
uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */
|
|
} zfid_long_t;
|
|
|
|
/*
 * Payload length of each file ID form: the structure size minus the
 * leading uint16_t length field.
 */
#define	SHORT_FID_LEN	(sizeof (zfid_short_t) - sizeof (uint16_t))
#define	LONG_FID_LEN	(sizeof (zfid_long_t) - sizeof (uint16_t))
|
|
|
|
extern uint_t zfs_fsyncer_key;
|
|
|
|
extern int zfs_suspend_fs(zfs_sb_t *zsb);
|
|
extern int zfs_resume_fs(zfs_sb_t *zsb, const char *osname);
|
|
extern int zfs_userspace_one(zfs_sb_t *zsb, zfs_userquota_prop_t type,
|
|
const char *domain, uint64_t rid, uint64_t *valuep);
|
|
extern int zfs_userspace_many(zfs_sb_t *zsb, zfs_userquota_prop_t type,
|
|
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
|
|
extern int zfs_set_userquota(zfs_sb_t *zsb, zfs_userquota_prop_t type,
|
|
const char *domain, uint64_t rid, uint64_t quota);
|
|
extern boolean_t zfs_owner_overquota(zfs_sb_t *zsb, struct znode *,
|
|
boolean_t isgroup);
|
|
extern boolean_t zfs_fuid_overquota(zfs_sb_t *zsb, boolean_t isgroup,
|
|
uint64_t fuid);
|
|
extern int zfs_set_version(zfs_sb_t *zsb, uint64_t newvers);
|
|
extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop,
|
|
uint64_t *value);
|
|
extern int zfs_sb_create(const char *name, zfs_sb_t **zsbp);
|
|
extern int zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting);
|
|
extern void zfs_sb_free(zfs_sb_t *zsb);
|
|
extern int zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan,
|
|
int *objects);
|
|
extern int zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting);
|
|
extern int zfs_check_global_label(const char *dsname, const char *hexsl);
|
|
extern boolean_t zfs_is_readonly(zfs_sb_t *zsb);
|
|
|
|
extern int zfs_register_callbacks(zfs_sb_t *zsb);
|
|
extern void zfs_unregister_callbacks(zfs_sb_t *zsb);
|
|
extern int zfs_domount(struct super_block *sb, void *data, int silent);
|
|
extern int zfs_umount(struct super_block *sb);
|
|
extern int zfs_remount(struct super_block *sb, int *flags, char *data);
|
|
extern int zfs_root(zfs_sb_t *zsb, struct inode **ipp);
|
|
extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp);
|
|
extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_FS_ZFS_VFSOPS_H */
|