Add support for POSIX_FADV_DONTNEED

For now, it only evicts the specified data from the dbuf cache.
Even though the dbuf cache is small, this may still reduce eviction of
more useful data from it, and slightly accelerate ARC evictions
by making the blocks there evictable a bit sooner.

On FreeBSD this also adds support for POSIX_FADV_NOREUSE, since the
kernel translates it into POSIX_FADV_DONTNEED after every read/write.
This is not as efficient as it could be for ZFS, but it is currently
the only way the FreeBSD kernel allows POSIX_FADV_NOREUSE to be handled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18399
This commit is contained in:
Alexander Motin
2026-04-07 11:56:54 -04:00
committed by Tony Hutter
parent 6f14581e1a
commit 4bb7592745
9 changed files with 169 additions and 4 deletions
+2
View File
@@ -411,6 +411,8 @@ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type,
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_evict_range(struct dnode *dn, uint64_t start_blkid,
uint64_t end_blkid);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+2
View File
@@ -963,6 +963,8 @@ void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);
void dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t len);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
+3 -1
View File
@@ -6748,10 +6748,12 @@ zfs_freebsd_advise(struct vop_advise_args *ap)
dmu_prefetch(os, zp->z_id, 0, start, len,
ZIO_PRIORITY_ASYNC_READ);
break;
case POSIX_FADV_DONTNEED:
dmu_evict_range(os, zp->z_id, start, len);
break;
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
case POSIX_FADV_DONTNEED:
case POSIX_FADV_NOREUSE:
/* ignored for now */
break;
+8 -2
View File
@@ -789,11 +789,17 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
}
}
zfs_exit(zfsvfs, FTAG);
#ifdef HAVE_GENERIC_FADVISE
error = generic_fadvise(filp, offset, len, advice);
#endif
if (error == 0 && advice == POSIX_FADV_DONTNEED) {
loff_t rlen = len ? len : i_size_read(ip) - offset;
dmu_evict_range(os, zp->z_id, offset, rlen);
}
zfs_exit(zfsvfs, FTAG);
return (error);
}
+60
View File
@@ -2082,6 +2082,65 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
kmem_free(db_search, sizeof (dmu_buf_impl_t));
}
/*
 * Advisory eviction of level-0 dbufs in [start_blkid, end_blkid] (both
 * ends inclusive) for the given dnode.
 *
 * Dbufs that have no holds and are not already in DB_EVICTING state are
 * destroyed immediately.  Every other dbuf in the range is only flagged
 * with db_pending_evict.  Dirty dbufs carry a reference, so they will be
 * evicted once their sync is completed.
 *
 * dn->dn_dbufs_mtx is taken here and held for the whole walk; a scratch
 * dbuf used as search key / list marker is allocated with KM_SLEEP and
 * freed before returning.  Nothing is returned to the caller.
 */
void
dbuf_evict_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid)
{
dmu_buf_impl_t *db_marker;
dmu_buf_impl_t *db, *db_next;
avl_index_t where;
/*
 * Build a DB_SEARCH key for (level 0, start_blkid).  It is used only to
 * find a position in dn_dbufs, never left in the tree in this state.
 */
db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
db_marker->db_level = 0;
db_marker->db_blkid = start_blkid;
db_marker->db_state = DB_SEARCH;
mutex_enter(&dn->dn_dbufs_mtx);
/*
 * The lookup itself is expected to miss (asserted below); what matters
 * is the insertion point "where", from which avl_nearest() yields the
 * first dbuf sorting after the key.
 */
db = avl_find(&dn->dn_dbufs, db_marker, &where);
ASSERT0P(db);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = db_next) {
/*
 * Stop at the first dbuf that is not level 0 or lies past the end
 * of the range.
 * NOTE(review): this presumes dn_dbufs is ordered by level, then
 * blkid -- confirm against the tree's comparator.
 */
if (db->db_level != 0 || db->db_blkid > end_blkid)
break;
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING &&
zfs_refcount_is_zero(&db->db_holds)) {
/*
 * Clean and unreferenced: evict immediately.
 * Use the marker pattern from dnode_evict_dbufs()
 * because dbuf_destroy() may recursively remove
 * the parent indirect dbuf from dn_dbufs, which
 * could be the node db_next would point to.
 *
 * The marker copies db's level/blkid and is inserted
 * directly before db, so the successor can be looked
 * up from the marker after db is gone.
 * NOTE(review): db_parent is set to an address just
 * below db, presumably as a tie-breaker that sorts the
 * marker ahead of db -- confirm against the comparator.
 */
db_marker->db_level = db->db_level;
db_marker->db_blkid = db->db_blkid;
db_marker->db_state = DB_MARKER;
db_marker->db_parent =
(void *)((uintptr_t)db - 1);
avl_insert_here(&dn->dn_dbufs, db_marker,
db, AVL_BEFORE);
/*
 * db_mtx is not released in this branch.
 * NOTE(review): dbuf_destroy() presumably consumes the
 * held db_mtx -- confirm.
 */
dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
avl_remove(&dn->dn_dbufs, db_marker);
} else {
/*
 * Referenced (possibly dirty) or already being evicted:
 * only flag it, so it is evicted when released.
 */
db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
}
mutex_exit(&dn->dn_dbufs_mtx);
kmem_free(db_marker, sizeof (dmu_buf_impl_t));
}
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
@@ -5462,6 +5521,7 @@ EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_evict_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
+29
View File
@@ -898,6 +898,35 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
rw_exit(&dn->dn_struct_rwlock);
}
/*
 * Advisory cache eviction for a byte range of an object.
 *
 * os, object:	objset and object number identifying the dnode.
 * offset, len:	byte range to evict; a zero len is a no-op.
 *
 * This is best effort only: nothing is returned, and a failure to hold
 * the dnode is silently ignored.
 */
void
dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;

	if (len == 0)
		return;
	if (dnode_hold(os, object, FTAG, &dn) != 0)
		return;

	/*
	 * Exclude the last block if the range end is not block-aligned:
	 * a sequential access may continue into that block.  The first
	 * block is included even when partially covered since backwards
	 * access patterns are rare.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift != 0) {
		uint64_t start = dbuf_whichblock(dn, 0, offset);
		uint64_t end = dbuf_whichblock(dn, 0, offset + len);

		if (end > start)
			dbuf_evict_range(dn, start, end - 1);
	} else if (offset < dn->dn_datablksz &&
	    len >= dn->dn_datablksz - offset) {
		/*
		 * The data block size is not a power of two, so the object
		 * has only block 0 and dbuf_whichblock() must not be asked
		 * about offsets at or beyond dn_datablksz.  Apply the same
		 * rule as above by hand: evict block 0 only when the range
		 * starts inside it and reaches its end.  The comparison is
		 * written as a subtraction so that offset + len cannot
		 * overflow.
		 */
		dbuf_evict_range(dn, 0, 0);
	}
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
}
/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crash in the
+1 -1
View File
@@ -741,7 +741,7 @@ tests = ['exec_001_pos', 'exec_002_neg']
tags = ['functional', 'exec']
[tests/functional/fadvise]
tests = ['fadvise_willneed']
tests = ['fadvise_dontneed', 'fadvise_willneed']
tags = ['functional', 'fadvise']
[tests/functional/failmode]
+1
View File
@@ -1566,6 +1566,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/exec/exec_002_neg.ksh \
functional/exec/setup.ksh \
functional/fadvise/cleanup.ksh \
functional/fadvise/fadvise_dontneed.ksh \
functional/fadvise/fadvise_willneed.ksh \
functional/fadvise/setup.ksh \
functional/failmode/cleanup.ksh \
@@ -0,0 +1,63 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Test that POSIX_FADV_DONTNEED evicts data from the ZFS dbuf cache.
#
# STRATEGY:
# 1. Write blocks to a file and sync, so they land in the dbuf LRU cache
# 2. Record cache_count from dbufstats
# 3. Call file_fadvise with POSIX_FADV_DONTNEED on the file
# 4. Verify that cache_count decreased
#
verify_runnable "global"

TARGET=$TESTDIR/$TESTFILE0
RECSIZE=$(get_prop recordsize $TESTPOOL)

# Remove whatever the test left behind under $TESTDIR.
function cleanup
{
	[[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
}

log_assert "Ensure POSIX_FADV_DONTNEED evicts data from the dbuf cache"
log_onexit cleanup

# 1. Write 100 recordsize-sized blocks and sync the pool.
log_must file_write -o create -f $TARGET -b $RECSIZE -c 100
sync_pool $TESTPOOL

# 2-3. Sample dbufstats.cache_count around the DONTNEED advice.
cached_before=$(kstat dbufstats.cache_count)
log_must file_fadvise -f $TARGET -a POSIX_FADV_DONTNEED
cached_after=$(kstat dbufstats.cache_count)

# 4. The advice must have shrunk the dbuf cache.
log_note "cache_count before=$cached_before after=$cached_after"
log_must [ $cached_before -gt $cached_after ]

log_pass "POSIX_FADV_DONTNEED evicts data from the dbuf cache"