mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-12 19:20:28 +03:00
Restructure per-filesystem reclaim
Originally when the ARC prune callback was introduced the idea was
to register a single callback for the ZPL. The ARC could invoke this
callback if it needed the ZPL to drop dentries, inodes, or other
cache objects which might be pinning buffers in the ARC. The ZPL
would iterate over all ZFS super blocks and perform the reclaim.
For the most part this design has worked well but due to limitations
in 2.6.35 and earlier kernels there were some problems. This patch
is designed to address those issues.
1) iterate_supers_type() is not provided by all kernels which makes
it impossible to safely iterate over all zpl_fs_type filesystems in
a single callback. The most straightforward and portable way to
resolve this is to register a callback per-filesystem during mount.
The arc_*_prune_callback() functions have always supported multiple
callbacks so this is functionally a very small change.
2) Commit 050d22b removed the non-portable shrink_dcache_memory()
and shrink_icache_memory() functions and didn't replace them with
equivalent functionality. This meant that for Linux 3.1 and older
kernels the ARC had no mechanism to drop dentries and inodes from
the caches if needed. This patch adds that missing functionality
by calling shrink_dcache_parent() to release dentries which may be
pinning inodes. This will result in all unused cache entries being
dropped, which is a bit heavy-handed, but it's the only interface
available for old kernels.
3) A zpl_drop_inode() callback is registered for kernels older than
2.6.35 which do not support the .evict_inode callback. This ensures
that when the last reference on an inode is dropped it is immediately
removed from the cache. If this isn't done then inodes can end up on
the global unused LRU with no mechanism available to ZFS to drop them.
Since the ARC buffers are not dropped the hottest inodes can still
be recreated without performing disk IO.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Pavel Snajdr <snajpa@snajpa.net>
Issue #3160
This commit is contained in:
parent
596a8935a1
commit
2cbb06b561
@ -73,6 +73,7 @@ typedef struct zfs_sb {
|
|||||||
uint64_t z_nr_znodes; /* number of znodes in the fs */
|
uint64_t z_nr_znodes; /* number of znodes in the fs */
|
||||||
unsigned long z_rollback_time; /* last online rollback time */
|
unsigned long z_rollback_time; /* last online rollback time */
|
||||||
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
|
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
|
||||||
|
arc_prune_t *z_arc_prune; /* called by ARC to prune caches */
|
||||||
struct inode *z_ctldir; /* .zfs directory inode */
|
struct inode *z_ctldir; /* .zfs directory inode */
|
||||||
avl_tree_t z_ctldir_snaps; /* .zfs/snapshot entries */
|
avl_tree_t z_ctldir_snaps; /* .zfs/snapshot entries */
|
||||||
kmutex_t z_ctldir_lock; /* .zfs ctldir lock */
|
kmutex_t z_ctldir_lock; /* .zfs ctldir lock */
|
||||||
|
@ -63,7 +63,7 @@ extern const struct file_operations zpl_file_operations;
|
|||||||
extern const struct file_operations zpl_dir_file_operations;
|
extern const struct file_operations zpl_dir_file_operations;
|
||||||
|
|
||||||
/* zpl_super.c */
|
/* zpl_super.c */
|
||||||
extern void zpl_prune_sbs(int64_t bytes_to_scan, void *private);
|
extern void zpl_prune_sb(int64_t nr_to_scan, void *arg);
|
||||||
|
|
||||||
typedef struct zpl_mount_data {
|
typedef struct zpl_mount_data {
|
||||||
const char *z_osname; /* Dataset name */
|
const char *z_osname; /* Dataset name */
|
||||||
|
@ -386,7 +386,11 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable.
|
|||||||
\fBzfs_arc_meta_limit\fR (ulong)
|
\fBzfs_arc_meta_limit\fR (ulong)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Meta limit for arc size
|
The maximum allowed size in bytes that meta data buffers are allowed to
|
||||||
|
consume in the ARC. When this limit is reached meta data buffers will
|
||||||
|
be reclaimed even if the overall arc_c_max has not been reached. This
|
||||||
|
value defaults to 0 which indicates that 3/4 of the ARC may be used
|
||||||
|
for meta data.
|
||||||
.sp
|
.sp
|
||||||
Default value: \fB0\fR.
|
Default value: \fB0\fR.
|
||||||
.RE
|
.RE
|
||||||
@ -397,9 +401,14 @@ Default value: \fB0\fR.
|
|||||||
\fBzfs_arc_meta_prune\fR (int)
|
\fBzfs_arc_meta_prune\fR (int)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
Bytes of meta data to prune
|
The number of dentries and inodes to be scanned looking for entries
|
||||||
|
which can be dropped. This may be required when the ARC reaches the
|
||||||
|
\fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
|
||||||
|
in the ARC. Increasing this value will cause the dentry and inode caches
|
||||||
|
to be pruned more aggressively. Setting this value to 0 will disable
|
||||||
|
pruning the inode and dentry caches.
|
||||||
.sp
|
.sp
|
||||||
Default value: \fB1,048,576\fR.
|
Default value: \fB10,000\fR.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
|
@ -158,8 +158,8 @@ static kmutex_t arc_reclaim_thr_lock;
|
|||||||
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
|
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
|
||||||
static uint8_t arc_thread_exit;
|
static uint8_t arc_thread_exit;
|
||||||
|
|
||||||
/* number of bytes to prune from caches when at arc_meta_limit is reached */
|
/* number of objects to prune from caches when arc_meta_limit is reached */
|
||||||
int zfs_arc_meta_prune = 1048576;
|
int zfs_arc_meta_prune = 10000;
|
||||||
|
|
||||||
typedef enum arc_reclaim_strategy {
|
typedef enum arc_reclaim_strategy {
|
||||||
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
|
ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
|
||||||
@ -5607,7 +5607,7 @@ module_param(zfs_arc_meta_limit, ulong, 0644);
|
|||||||
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
|
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
|
||||||
|
|
||||||
module_param(zfs_arc_meta_prune, int, 0644);
|
module_param(zfs_arc_meta_prune, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
|
MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
|
||||||
|
|
||||||
module_param(zfs_arc_grow_retry, int, 0644);
|
module_param(zfs_arc_grow_retry, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
|
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
|
||||||
|
@ -1068,29 +1068,52 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(zfs_root);
|
EXPORT_SYMBOL(zfs_root);
|
||||||
|
|
||||||
#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
|
/*
|
||||||
|
* The ARC has requested that the filesystem drop entries from the dentry
|
||||||
|
* and inode caches. This can occur when the ARC needs to free meta data
|
||||||
|
* blocks but can't because they are all pinned by entries in these caches.
|
||||||
|
*/
|
||||||
int
|
int
|
||||||
zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
|
zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
|
||||||
{
|
{
|
||||||
zfs_sb_t *zsb = sb->s_fs_info;
|
zfs_sb_t *zsb = sb->s_fs_info;
|
||||||
|
int error = 0;
|
||||||
|
#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
|
||||||
struct shrinker *shrinker = &sb->s_shrink;
|
struct shrinker *shrinker = &sb->s_shrink;
|
||||||
struct shrink_control sc = {
|
struct shrink_control sc = {
|
||||||
.nr_to_scan = nr_to_scan,
|
.nr_to_scan = nr_to_scan,
|
||||||
.gfp_mask = GFP_KERNEL,
|
.gfp_mask = GFP_KERNEL,
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
ZFS_ENTER(zsb);
|
ZFS_ENTER(zsb);
|
||||||
#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
|
|
||||||
|
#if defined(HAVE_SPLIT_SHRINKER_CALLBACK)
|
||||||
*objects = (*shrinker->scan_objects)(shrinker, &sc);
|
*objects = (*shrinker->scan_objects)(shrinker, &sc);
|
||||||
#else
|
#elif defined(HAVE_SHRINK)
|
||||||
*objects = (*shrinker->shrink)(shrinker, &sc);
|
*objects = (*shrinker->shrink)(shrinker, &sc);
|
||||||
|
#else
|
||||||
|
/*
|
||||||
|
* Linux kernels older than 3.1 do not support a per-filesystem
|
||||||
|
* shrinker. Therefore, we must fall back to the only available
|
||||||
|
* interface which is to discard all unused dentries and inodes.
|
||||||
|
* This behavior clearly isn't ideal but it's required so the ARC
|
||||||
|
* may free memory. The performance impact is mitigated by the
|
||||||
|
* fact that the frequently accessed dentry and inode buffers will
|
||||||
|
* still be in the ARC making them relatively cheap to recreate.
|
||||||
|
*/
|
||||||
|
*objects = 0;
|
||||||
|
shrink_dcache_parent(sb->s_root);
|
||||||
#endif
|
#endif
|
||||||
ZFS_EXIT(zsb);
|
ZFS_EXIT(zsb);
|
||||||
|
|
||||||
return (0);
|
dprintf_ds(zsb->z_os->os_dsl_dataset,
|
||||||
|
"pruning, nr_to_scan=%lu objects=%d error=%d\n",
|
||||||
|
nr_to_scan, *objects, error);
|
||||||
|
|
||||||
|
return (error);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(zfs_sb_prune);
|
EXPORT_SYMBOL(zfs_sb_prune);
|
||||||
#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Teardown the zfs_sb_t.
|
* Teardown the zfs_sb_t.
|
||||||
@ -1286,6 +1309,8 @@ zfs_domount(struct super_block *sb, void *data, int silent)
|
|||||||
|
|
||||||
if (!zsb->z_issnap)
|
if (!zsb->z_issnap)
|
||||||
zfsctl_create(zsb);
|
zfsctl_create(zsb);
|
||||||
|
|
||||||
|
zsb->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
|
||||||
out:
|
out:
|
||||||
if (error) {
|
if (error) {
|
||||||
dmu_objset_disown(zsb->z_os, zsb);
|
dmu_objset_disown(zsb->z_os, zsb);
|
||||||
@ -1324,6 +1349,7 @@ zfs_umount(struct super_block *sb)
|
|||||||
zfs_sb_t *zsb = sb->s_fs_info;
|
zfs_sb_t *zsb = sb->s_fs_info;
|
||||||
objset_t *os;
|
objset_t *os;
|
||||||
|
|
||||||
|
arc_remove_prune_callback(zsb->z_arc_prune);
|
||||||
VERIFY(zfs_sb_teardown(zsb, B_TRUE) == 0);
|
VERIFY(zfs_sb_teardown(zsb, B_TRUE) == 0);
|
||||||
os = zsb->z_os;
|
os = zsb->z_os;
|
||||||
bdi_destroy(sb->s_bdi);
|
bdi_destroy(sb->s_bdi);
|
||||||
@ -1682,7 +1708,6 @@ zfs_init(void)
|
|||||||
zfs_znode_init();
|
zfs_znode_init();
|
||||||
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
|
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
|
||||||
register_filesystem(&zpl_fs_type);
|
register_filesystem(&zpl_fs_type);
|
||||||
(void) arc_add_prune_callback(zpl_prune_sbs, NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -109,6 +109,12 @@ zpl_evict_inode(struct inode *ip)
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
static void
|
||||||
|
zpl_drop_inode(struct inode *ip)
|
||||||
|
{
|
||||||
|
generic_delete_inode(ip);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
zpl_clear_inode(struct inode *ip)
|
zpl_clear_inode(struct inode *ip)
|
||||||
{
|
{
|
||||||
@ -125,7 +131,6 @@ zpl_inode_delete(struct inode *ip)
|
|||||||
truncate_setsize(ip, 0);
|
truncate_setsize(ip, 0);
|
||||||
clear_inode(ip);
|
clear_inode(ip);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* HAVE_EVICT_INODE */
|
#endif /* HAVE_EVICT_INODE */
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -276,37 +281,13 @@ zpl_kill_sb(struct super_block *sb)
|
|||||||
#endif /* HAVE_S_INSTANCES_LIST_HEAD */
|
#endif /* HAVE_S_INSTANCES_LIST_HEAD */
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
|
|
||||||
/*
|
|
||||||
* Linux 3.1 - 3.x API
|
|
||||||
*
|
|
||||||
* The Linux 3.1 API introduced per-sb cache shrinkers to replace the
|
|
||||||
* global ones. This allows us a mechanism to cleanly target a specific
|
|
||||||
* zfs file system when the dnode and inode caches grow too large.
|
|
||||||
*
|
|
||||||
* In addition, the 3.0 kernel added the iterate_supers_type() helper
|
|
||||||
* function which is used to safely walk all of the zfs file systems.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
zpl_prune_sb(struct super_block *sb, void *arg)
|
|
||||||
{
|
|
||||||
int objects = 0;
|
|
||||||
int error;
|
|
||||||
|
|
||||||
error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects);
|
|
||||||
ASSERT3S(error, <=, 0);
|
|
||||||
}
|
|
||||||
#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */
|
|
||||||
|
|
||||||
void
|
void
|
||||||
zpl_prune_sbs(int64_t bytes_to_scan, void *private)
|
zpl_prune_sb(int64_t nr_to_scan, void *arg)
|
||||||
{
|
{
|
||||||
#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
|
struct super_block *sb = (struct super_block *)arg;
|
||||||
unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t));
|
int objects = 0;
|
||||||
|
|
||||||
iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan);
|
(void) -zfs_sb_prune(sb, nr_to_scan, &objects);
|
||||||
kmem_reap();
|
|
||||||
#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_NR_CACHED_OBJECTS
|
#ifdef HAVE_NR_CACHED_OBJECTS
|
||||||
@ -343,10 +324,10 @@ const struct super_operations zpl_super_operations = {
|
|||||||
.destroy_inode = zpl_inode_destroy,
|
.destroy_inode = zpl_inode_destroy,
|
||||||
.dirty_inode = zpl_dirty_inode,
|
.dirty_inode = zpl_dirty_inode,
|
||||||
.write_inode = NULL,
|
.write_inode = NULL,
|
||||||
.drop_inode = NULL,
|
|
||||||
#ifdef HAVE_EVICT_INODE
|
#ifdef HAVE_EVICT_INODE
|
||||||
.evict_inode = zpl_evict_inode,
|
.evict_inode = zpl_evict_inode,
|
||||||
#else
|
#else
|
||||||
|
.drop_inode = zpl_drop_inode,
|
||||||
.clear_inode = zpl_clear_inode,
|
.clear_inode = zpl_clear_inode,
|
||||||
.delete_inode = zpl_inode_delete,
|
.delete_inode = zpl_inode_delete,
|
||||||
#endif /* HAVE_EVICT_INODE */
|
#endif /* HAVE_EVICT_INODE */
|
||||||
|
Loading…
Reference in New Issue
Block a user