mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-09-15 13:50:11 +03:00
enforce arc_dnode_limit
The Linux kernel shrinker, when run in the context of the null/root memcg, does not scan dentry and inode caches added by a task running in a non-root memcg. For ZFS this means that the dnode cache routinely overflows, evicting valuable meta/data and putting additional memory pressure on the system. This patch restores zfs_prune_aliases as a fallback when the kernel shrinker does nothing, enabling ZFS to actually free dnodes. Moreover, it (indirectly) calls arc_evict when dnode_size > dnode_limit.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Gionatan Danti <g.danti@assyoma.it>
Closes #17487
Closes #17542
This commit is contained in:
parent
30fa92bff3
commit
4808641e71
@ -954,7 +954,7 @@ typedef struct arc_sums {
|
|||||||
wmsum_t arcstat_data_size;
|
wmsum_t arcstat_data_size;
|
||||||
wmsum_t arcstat_metadata_size;
|
wmsum_t arcstat_metadata_size;
|
||||||
wmsum_t arcstat_dbuf_size;
|
wmsum_t arcstat_dbuf_size;
|
||||||
wmsum_t arcstat_dnode_size;
|
aggsum_t arcstat_dnode_size;
|
||||||
wmsum_t arcstat_bonus_size;
|
wmsum_t arcstat_bonus_size;
|
||||||
wmsum_t arcstat_l2_hits;
|
wmsum_t arcstat_l2_hits;
|
||||||
wmsum_t arcstat_l2_misses;
|
wmsum_t arcstat_l2_misses;
|
||||||
|
@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
|
|||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dentry and inode caches referenced by a task in non-root memcg are
|
||||||
|
* not going to be scanned by the kernel-provided shrinker. So, if
|
||||||
|
* kernel prunes nothing, fall back to this manual walk to free dnodes.
|
||||||
|
* To avoid scanning the same znodes multiple times they are always rotated
|
||||||
|
* to the end of the z_all_znodes list. New znodes are inserted at the
|
||||||
|
* end of the list so we're always scanning the oldest znodes first.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
|
||||||
|
{
|
||||||
|
znode_t **zp_array, *zp;
|
||||||
|
int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
|
||||||
|
int objects = 0;
|
||||||
|
int i = 0, j = 0;
|
||||||
|
|
||||||
|
zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
|
||||||
|
|
||||||
|
mutex_enter(&zfsvfs->z_znodes_lock);
|
||||||
|
while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
|
||||||
|
|
||||||
|
if ((i++ > nr_to_scan) || (j >= max_array))
|
||||||
|
break;
|
||||||
|
|
||||||
|
ASSERT(list_link_active(&zp->z_link_node));
|
||||||
|
list_remove(&zfsvfs->z_all_znodes, zp);
|
||||||
|
list_insert_tail(&zfsvfs->z_all_znodes, zp);
|
||||||
|
|
||||||
|
/* Skip active znodes and .zfs entries */
|
||||||
|
if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (igrab(ZTOI(zp)) == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
zp_array[j] = zp;
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
mutex_exit(&zfsvfs->z_znodes_lock);
|
||||||
|
|
||||||
|
for (i = 0; i < j; i++) {
|
||||||
|
zp = zp_array[i];
|
||||||
|
|
||||||
|
ASSERT3P(zp, !=, NULL);
|
||||||
|
d_prune_aliases(ZTOI(zp));
|
||||||
|
|
||||||
|
if (atomic_read(&ZTOI(zp)->i_count) == 1)
|
||||||
|
objects++;
|
||||||
|
|
||||||
|
zrele(zp);
|
||||||
|
}
|
||||||
|
|
||||||
|
vmem_free(zp_array, max_array * sizeof (znode_t *));
|
||||||
|
|
||||||
|
return (objects);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The ARC has requested that the filesystem drop entries from the dentry
|
* The ARC has requested that the filesystem drop entries from the dentry
|
||||||
* and inode caches. This can occur when the ARC needs to free meta data
|
* and inode caches. This can occur when the ARC needs to free meta data
|
||||||
@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
|
|||||||
*objects = (*shrinker->scan_objects)(shrinker, &sc);
|
*objects = (*shrinker->scan_objects)(shrinker, &sc);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fall back to zfs_prune_aliases if kernel's shrinker did nothing
|
||||||
|
* due to dentry and inode caches being referenced by a task running
|
||||||
|
* in non-root memcg.
|
||||||
|
*/
|
||||||
|
if (*objects == 0)
|
||||||
|
*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
|
||||||
|
|
||||||
zfs_exit(zfsvfs, FTAG);
|
zfs_exit(zfsvfs, FTAG);
|
||||||
|
|
||||||
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
|
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
|
||||||
|
@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
|||||||
ARCSTAT_INCR(arcstat_bonus_size, space);
|
ARCSTAT_INCR(arcstat_bonus_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_DNODE:
|
case ARC_SPACE_DNODE:
|
||||||
ARCSTAT_INCR(arcstat_dnode_size, space);
|
aggsum_add(&arc_sums.arcstat_dnode_size, space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_DBUF:
|
case ARC_SPACE_DBUF:
|
||||||
ARCSTAT_INCR(arcstat_dbuf_size, space);
|
ARCSTAT_INCR(arcstat_dbuf_size, space);
|
||||||
@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
|||||||
ARCSTAT_INCR(arcstat_bonus_size, -space);
|
ARCSTAT_INCR(arcstat_bonus_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_DNODE:
|
case ARC_SPACE_DNODE:
|
||||||
ARCSTAT_INCR(arcstat_dnode_size, -space);
|
aggsum_add(&arc_sums.arcstat_dnode_size, -space);
|
||||||
break;
|
break;
|
||||||
case ARC_SPACE_DBUF:
|
case ARC_SPACE_DBUF:
|
||||||
ARCSTAT_INCR(arcstat_dbuf_size, -space);
|
ARCSTAT_INCR(arcstat_dbuf_size, -space);
|
||||||
@ -4490,7 +4490,7 @@ arc_evict(void)
|
|||||||
* target is not evictable or if they go over arc_dnode_limit.
|
* target is not evictable or if they go over arc_dnode_limit.
|
||||||
*/
|
*/
|
||||||
int64_t prune = 0;
|
int64_t prune = 0;
|
||||||
int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
|
int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
|
||||||
int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
|
int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
|
||||||
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
|
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
|
||||||
- zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
|
- zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
|
||||||
@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
|
|||||||
* in the ARC. In practice, that's in the tens of MB, which is low
|
* in the ARC. In practice, that's in the tens of MB, which is low
|
||||||
* enough to be safe.
|
* enough to be safe.
|
||||||
*/
|
*/
|
||||||
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
|
int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
|
||||||
zfs_max_recordsize;
|
zfs_max_recordsize;
|
||||||
|
int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
|
||||||
|
arc_dnode_limit;
|
||||||
|
|
||||||
/* Always allow at least one block of overflow. */
|
/* Always allow at least one block of overflow. */
|
||||||
if (over < 0)
|
if (arc_over < 0 && dn_over <= 0)
|
||||||
return (ARC_OVF_NONE);
|
return (ARC_OVF_NONE);
|
||||||
|
|
||||||
/* If we are under memory pressure, report severe overflow. */
|
/* If we are under memory pressure, report severe overflow. */
|
||||||
@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
|
|||||||
int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
|
int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
|
||||||
if (use_reserve)
|
if (use_reserve)
|
||||||
overflow *= 3;
|
overflow *= 3;
|
||||||
return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
|
return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
|
||||||
}
|
}
|
||||||
|
|
||||||
static abd_t *
|
static abd_t *
|
||||||
@ -7324,7 +7326,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
|||||||
#if defined(COMPAT_FREEBSD11)
|
#if defined(COMPAT_FREEBSD11)
|
||||||
as->arcstat_other_size.value.ui64 =
|
as->arcstat_other_size.value.ui64 =
|
||||||
wmsum_value(&arc_sums.arcstat_bonus_size) +
|
wmsum_value(&arc_sums.arcstat_bonus_size) +
|
||||||
wmsum_value(&arc_sums.arcstat_dnode_size) +
|
aggsum_value(&arc_sums.arcstat_dnode_size) +
|
||||||
wmsum_value(&arc_sums.arcstat_dbuf_size);
|
wmsum_value(&arc_sums.arcstat_dbuf_size);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -7366,7 +7368,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
|||||||
&as->arcstat_uncached_evictable_metadata);
|
&as->arcstat_uncached_evictable_metadata);
|
||||||
|
|
||||||
as->arcstat_dnode_size.value.ui64 =
|
as->arcstat_dnode_size.value.ui64 =
|
||||||
wmsum_value(&arc_sums.arcstat_dnode_size);
|
aggsum_value(&arc_sums.arcstat_dnode_size);
|
||||||
as->arcstat_bonus_size.value.ui64 =
|
as->arcstat_bonus_size.value.ui64 =
|
||||||
wmsum_value(&arc_sums.arcstat_bonus_size);
|
wmsum_value(&arc_sums.arcstat_bonus_size);
|
||||||
as->arcstat_l2_hits.value.ui64 =
|
as->arcstat_l2_hits.value.ui64 =
|
||||||
@ -7736,7 +7738,7 @@ arc_state_init(void)
|
|||||||
wmsum_init(&arc_sums.arcstat_data_size, 0);
|
wmsum_init(&arc_sums.arcstat_data_size, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
|
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
|
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_dnode_size, 0);
|
aggsum_init(&arc_sums.arcstat_dnode_size, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
|
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
|
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
|
||||||
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
|
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
|
||||||
@ -7895,7 +7897,7 @@ arc_state_fini(void)
|
|||||||
wmsum_fini(&arc_sums.arcstat_data_size);
|
wmsum_fini(&arc_sums.arcstat_data_size);
|
||||||
wmsum_fini(&arc_sums.arcstat_metadata_size);
|
wmsum_fini(&arc_sums.arcstat_metadata_size);
|
||||||
wmsum_fini(&arc_sums.arcstat_dbuf_size);
|
wmsum_fini(&arc_sums.arcstat_dbuf_size);
|
||||||
wmsum_fini(&arc_sums.arcstat_dnode_size);
|
aggsum_fini(&arc_sums.arcstat_dnode_size);
|
||||||
wmsum_fini(&arc_sums.arcstat_bonus_size);
|
wmsum_fini(&arc_sums.arcstat_bonus_size);
|
||||||
wmsum_fini(&arc_sums.arcstat_l2_hits);
|
wmsum_fini(&arc_sums.arcstat_l2_hits);
|
||||||
wmsum_fini(&arc_sums.arcstat_l2_misses);
|
wmsum_fini(&arc_sums.arcstat_l2_misses);
|
||||||
|
Loading…
Reference in New Issue
Block a user