2020-04-14 21:36:28 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2020-04-14 21:36:28 +03:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/spa_impl.h>
|
2020-08-01 07:30:31 +03:00
|
|
|
#include <sys/counter.h>
|
2020-04-14 21:36:28 +03:00
|
|
|
#include <sys/zio_compress.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/arc.h>
|
2022-09-20 03:21:45 +03:00
|
|
|
#include <sys/arc_os.h>
|
2020-07-30 02:35:33 +03:00
|
|
|
#include <sys/zfs_refcount.h>
|
2020-04-14 21:36:28 +03:00
|
|
|
#include <sys/vdev.h>
|
|
|
|
#include <sys/vdev_trim.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
|
|
|
#include <sys/dsl_pool.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
|
|
|
#include <sys/multilist.h>
|
|
|
|
#include <sys/abd.h>
|
|
|
|
#include <sys/zil.h>
|
|
|
|
#include <sys/fm/fs/zfs.h>
|
|
|
|
#include <sys/eventhandler.h>
|
|
|
|
#include <sys/callb.h>
|
|
|
|
#include <sys/kstat.h>
|
|
|
|
#include <sys/zthr.h>
|
|
|
|
#include <zfs_fletcher.h>
|
|
|
|
#include <sys/arc_impl.h>
|
|
|
|
#include <sys/sdt.h>
|
|
|
|
#include <sys/aggsum.h>
|
2020-07-01 19:10:08 +03:00
|
|
|
#include <sys/vnode.h>
|
2020-04-14 21:36:28 +03:00
|
|
|
#include <cityhash.h>
|
2020-07-25 20:49:49 +03:00
|
|
|
#include <machine/vmparam.h>
|
2020-08-01 07:30:31 +03:00
|
|
|
#include <sys/vm.h>
|
|
|
|
#include <sys/vmmeter.h>
|
2020-04-14 21:36:28 +03:00
|
|
|
|
|
|
|
/*
 * Defined elsewhere.  NOTE(review): nothing in the visible part of this file
 * references zfs_vfsops -- confirm whether the declaration is still needed.
 */
extern struct vfsops zfs_vfsops;
|
|
|
|
|
|
|
|
/*
 * Free-page threshold (in pages) that arc_available_memory() compares
 * freemem against.  Starts at 0 and is overwritten from
 * vm_cnt.v_free_target by arc_free_target_init().
 */
uint_t zfs_arc_free_target = 0;
|
|
|
|
|
2020-04-15 21:14:47 +03:00
|
|
|
static void
|
|
|
|
arc_free_target_init(void *unused __unused)
|
|
|
|
{
|
|
|
|
zfs_arc_free_target = vm_cnt.v_free_target;
|
|
|
|
}
|
|
|
|
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
|
|
|
|
arc_free_target_init, NULL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't have a tunable for arc_free_target due to the dependency on
|
|
|
|
* pagedaemon initialisation.
|
|
|
|
*/
|
2022-08-09 12:05:47 +03:00
|
|
|
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target,
|
|
|
|
param_set_arc_free_target, 0, CTLFLAG_RW,
|
2022-01-21 19:07:15 +03:00
|
|
|
"Desired number of free pages below which ARC triggers reclaim");
|
2022-08-09 12:05:47 +03:00
|
|
|
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift,
|
|
|
|
param_set_arc_no_grow_shift, 0, ZMOD_RW,
|
|
|
|
"log2(fraction of ARC which must be free to allow growing)");
|
2020-04-15 21:14:47 +03:00
|
|
|
|
2020-04-14 21:36:28 +03:00
|
|
|
int64_t
|
|
|
|
arc_available_memory(void)
|
|
|
|
{
|
|
|
|
int64_t lowest = INT64_MAX;
|
|
|
|
int64_t n __unused;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cooperate with pagedaemon when it's time for it to scan
|
|
|
|
* and reclaim some pages.
|
|
|
|
*/
|
|
|
|
n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
}
|
2024-06-07 04:11:00 +03:00
|
|
|
#if !defined(UMA_MD_SMALL_ALLOC) && !defined(UMA_USE_DMAP)
|
2020-04-14 21:36:28 +03:00
|
|
|
/*
|
2024-06-07 04:11:00 +03:00
|
|
|
* If we're on a platform without a direct map, it's possible that we'll
|
|
|
|
* exhaust the kernel heap space before we ever run out of available
|
|
|
|
* physical memory. Most checks of the size of the heap_area compare
|
|
|
|
* against tune.t_minarmem, which is the minimum available real memory
|
|
|
|
* that we can have in the system. However, this is generally fixed at
|
|
|
|
* 25 pages which is so low that it's useless. In this comparison, we
|
|
|
|
* seek to calculate the total heap-size, and reclaim if more than
|
|
|
|
* 3/4ths of the heap is allocated. (Or, in the calculation, if less
|
|
|
|
* than 1/4th is free)
|
2020-04-14 21:36:28 +03:00
|
|
|
*/
|
|
|
|
n = uma_avail() - (long)(uma_limit() / 4);
|
|
|
|
if (n < lowest) {
|
|
|
|
lowest = n;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, it's measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 07:10:52 +03:00
|
|
|
DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
|
2020-04-14 21:36:28 +03:00
|
|
|
return (lowest);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pick a default upper bound for the ARC from the amount of physical
 * memory.
 *
 * min     value used in place of "allmem - 1 GiB" when allmem is below
 *         1 GiB
 * allmem  total memory, in bytes
 *
 * Returns the larger of 5/8 of allmem and (allmem - 1 GiB); when allmem is
 * below 1 GiB the second candidate is min instead.
 */
uint64_t
arc_default_max(uint64_t min, uint64_t allmem)
{
	const uint64_t one_gib = (uint64_t)1 << 30;
	const uint64_t five_eighths = allmem * 5 / 8;
	const uint64_t all_but_gib = (allmem >= one_gib) ?
	    allmem - one_gib : min;

	return ((five_eighths > all_but_gib) ? five_eighths : all_but_gib);
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
arc_all_memory(void)
|
|
|
|
{
|
2020-07-25 20:47:18 +03:00
|
|
|
return (ptob(physmem));
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
|
|
|
|
{
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
arc_free_memory(void)
|
|
|
|
{
|
2020-07-25 20:47:18 +03:00
|
|
|
return (ptob(freemem));
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tag returned by EVENTHANDLER_REGISTER() for the vm_lowmem handler; set in
 * arc_lowmem_init() and consumed by arc_lowmem_fini().  NULL means no
 * handler has been registered.
 */
static eventhandler_tag arc_event_lowmem = NULL;
|
|
|
|
|
|
|
|
static void
|
|
|
|
arc_lowmem(void *arg __unused, int howto __unused)
|
|
|
|
{
|
Several improvements to ARC shrinking (#16197)
- When receiving memory pressure signal from OS be more strict
trying to free some memory. Otherwise kernel may come again and
request much more. Return as result how much arc_c was actually
reduced due to this request, that may be less than requested.
- On Linux when receiving direct reclaim from some file system
(that may be ZFS) instead of ignoring request completely, just
shrink the ARC, but do not wait for eviction. Waiting there may
cause deadlock. Ignoring it as before may put extra pressure on
other caches and/or swap, and cause OOM if nothing help. While
not waiting may result in more ARC evicted later, and may be too
late if OOM killer activate right now, but I hope it to be better
than doing nothing at all.
- On Linux set arc_no_grow before waiting for reclaim, not after,
or it may grow back while we are waiting.
- On Linux add new parameter zfs_arc_shrinker_seeks to balance
ARC eviction cost, relative to page cache and other subsystems.
- Slightly update Linux arc_set_sys_free() math for new kernels.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 20:31:14 +03:00
|
|
|
int64_t can_free, free_memory, to_free;
|
2020-04-14 21:36:28 +03:00
|
|
|
|
|
|
|
arc_no_grow = B_TRUE;
|
|
|
|
arc_warm = B_TRUE;
|
|
|
|
arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
|
Several improvements to ARC shrinking (#16197)
- When receiving memory pressure signal from OS be more strict
trying to free some memory. Otherwise kernel may come again and
request much more. Return as result how much arc_c was actually
reduced due to this request, that may be less than requested.
- On Linux when receiving direct reclaim from some file system
(that may be ZFS) instead of ignoring request completely, just
shrink the ARC, but do not wait for eviction. Waiting there may
cause deadlock. Ignoring it as before may put extra pressure on
other caches and/or swap, and cause OOM if nothing help. While
not waiting may result in more ARC evicted later, and may be too
late if OOM killer activate right now, but I hope it to be better
than doing nothing at all.
- On Linux set arc_no_grow before waiting for reclaim, not after,
or it may grow back while we are waiting.
- On Linux add new parameter zfs_arc_shrinker_seeks to balance
ARC eviction cost, relative to page cache and other subsystems.
- Slightly update Linux arc_set_sys_free() math for new kernels.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 20:31:14 +03:00
|
|
|
|
2020-04-14 21:36:28 +03:00
|
|
|
free_memory = arc_available_memory();
|
Several improvements to ARC shrinking (#16197)
- When receiving memory pressure signal from OS be more strict
trying to free some memory. Otherwise kernel may come again and
request much more. Return as result how much arc_c was actually
reduced due to this request, that may be less than requested.
- On Linux when receiving direct reclaim from some file system
(that may be ZFS) instead of ignoring request completely, just
shrink the ARC, but do not wait for eviction. Waiting there may
cause deadlock. Ignoring it as before may put extra pressure on
other caches and/or swap, and cause OOM if nothing help. While
not waiting may result in more ARC evicted later, and may be too
late if OOM killer activate right now, but I hope it to be better
than doing nothing at all.
- On Linux set arc_no_grow before waiting for reclaim, not after,
or it may grow back while we are waiting.
- On Linux add new parameter zfs_arc_shrinker_seeks to balance
ARC eviction cost, relative to page cache and other subsystems.
- Slightly update Linux arc_set_sys_free() math for new kernels.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 20:31:14 +03:00
|
|
|
can_free = arc_c - arc_c_min;
|
|
|
|
to_free = (MAX(can_free, 0) >> arc_shrink_shift) - MIN(free_memory, 0);
|
2020-04-14 21:36:28 +03:00
|
|
|
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
|
Several improvements to ARC shrinking (#16197)
- When receiving memory pressure signal from OS be more strict
trying to free some memory. Otherwise kernel may come again and
request much more. Return as result how much arc_c was actually
reduced due to this request, that may be less than requested.
- On Linux when receiving direct reclaim from some file system
(that may be ZFS) instead of ignoring request completely, just
shrink the ARC, but do not wait for eviction. Waiting there may
cause deadlock. Ignoring it as before may put extra pressure on
other caches and/or swap, and cause OOM if nothing help. While
not waiting may result in more ARC evicted later, and may be too
late if OOM killer activate right now, but I hope it to be better
than doing nothing at all.
- On Linux set arc_no_grow before waiting for reclaim, not after,
or it may grow back while we are waiting.
- On Linux add new parameter zfs_arc_shrinker_seeks to balance
ARC eviction cost, relative to page cache and other subsystems.
- Slightly update Linux arc_set_sys_free() math for new kernels.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 20:31:14 +03:00
|
|
|
to_free = arc_reduce_target_size(to_free);
|
2020-04-14 21:36:28 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It is unsafe to block here in arbitrary threads, because we can come
|
|
|
|
* here from ARC itself and may hold ARC locks and thus risk a deadlock
|
|
|
|
* with ARC reclaim thread.
|
|
|
|
*/
|
2024-08-01 22:25:42 +03:00
|
|
|
if (curproc == pageproc) {
|
Several improvements to ARC shrinking (#16197)
- When receiving memory pressure signal from OS be more strict
trying to free some memory. Otherwise kernel may come again and
request much more. Return as result how much arc_c was actually
reduced due to this request, that may be less than requested.
- On Linux when receiving direct reclaim from some file system
(that may be ZFS) instead of ignoring request completely, just
shrink the ARC, but do not wait for eviction. Waiting there may
cause deadlock. Ignoring it as before may put extra pressure on
other caches and/or swap, and cause OOM if nothing help. While
not waiting may result in more ARC evicted later, and may be too
late if OOM killer activate right now, but I hope it to be better
than doing nothing at all.
- On Linux set arc_no_grow before waiting for reclaim, not after,
or it may grow back while we are waiting.
- On Linux add new parameter zfs_arc_shrinker_seeks to balance
ARC eviction cost, relative to page cache and other subsystems.
- Slightly update Linux arc_set_sys_free() math for new kernels.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 20:31:14 +03:00
|
|
|
arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
|
2024-08-01 22:25:42 +03:00
|
|
|
ARCSTAT_BUMP(arcstat_memory_indirect_count);
|
|
|
|
} else {
|
|
|
|
ARCSTAT_BUMP(arcstat_memory_direct_count);
|
|
|
|
}
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_lowmem_init(void)
|
|
|
|
{
|
|
|
|
arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
arc_lowmem_fini(void)
|
|
|
|
{
|
|
|
|
if (arc_event_lowmem != NULL)
|
|
|
|
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
|
|
|
|
}
|
2020-12-11 01:09:23 +03:00
|
|
|
|
|
|
|
/*
 * Hotplug registration hook.  Intentionally a no-op in this
 * implementation.
 */
void
arc_register_hotplug(void)
{
	/* Nothing to register. */
}
|
|
|
|
|
|
|
|
/*
 * Hotplug deregistration hook.  Intentionally a no-op in this
 * implementation.
 */
void
arc_unregister_hotplug(void)
{
	/* Nothing to unregister. */
}
|