From 4808641e71bbc81e45491a0d4266c9de216eaf24 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Mon, 21 Jul 2025 19:32:01 +0200 Subject: [PATCH] enforce arc_dnode_limit The Linux kernel shrinker in the context of the null/root memcg does not scan dentry and inode caches added by a task running in a non-root memcg. For ZFS this means that the dnode cache routinely overflows, evicting valuable metadata and data and putting additional memory pressure on the system. This patch restores zfs_prune_aliases as a fallback when the kernel shrinker does nothing, enabling ZFS to actually free dnodes. Moreover, it (indirectly) calls arc_evict when dnode_size > dnode_limit. Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Gionatan Danti Closes #17487 Closes #17542 --- include/sys/arc_impl.h | 2 +- module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++ module/zfs/arc.c | 22 ++++++----- 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1b3038910..b55d5da33 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -954,7 +954,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - wmsum_t arcstat_dnode_size; + aggsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 56af4fe0a..7961549e6 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) return (error); } +/* + * Dentry and inode caches referenced by a task in non-root memcg are + * not going to be scanned by the kernel-provided shrinker. So, if + * kernel prunes nothing, fall back to this manual walk to free dnodes. 
+ * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. + */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + vmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data @@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = (*shrinker->scan_objects)(shrinker, &sc); #endif + /* + * Fall back to zfs_prune_aliases if kernel's shrinker did nothing + * due to dentry and inode caches being referenced by a task running + * in non-root memcg. 
+ */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 998bb7cf6..5e70d95e5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, space); + aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, -space); + aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -4490,7 +4490,7 @@ arc_evict(void) * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; - int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) @@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - + int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - zfs_max_recordsize; + int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - + arc_dnode_limit; /* Always allow at least one block of overflow. */ - if (over < 0) + if (arc_over < 0 && dn_over <= 0) return (ARC_OVF_NONE); /* If we are under memory pressure, report severe overflow. 
*/ @@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; if (use_reserve) overflow *= 3; - return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); + return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * @@ -7324,7 +7326,7 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - wmsum_value(&arc_sums.arcstat_dnode_size) + + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif @@ -7366,7 +7368,7 @@ arc_kstat_update(kstat_t *ksp, int rw) &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - wmsum_value(&arc_sums.arcstat_dnode_size); + aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7736,7 +7738,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - wmsum_init(&arc_sums.arcstat_dnode_size, 0); + aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7895,7 +7897,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - wmsum_fini(&arc_sums.arcstat_dnode_size); + aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses);