Add support for POSIX_FADV_DONTNEED

For now, make it only evict the specified data from the dbuf cache.
Even though the dbuf cache is small, this may still reduce eviction of
more useful data from there, and slightly accelerate ARC evictions
by making the blocks there evictable a bit sooner.

On FreeBSD this also adds support for POSIX_FADV_NOREUSE, since the
kernel translates it into POSIX_FADV_DONTNEED after every read/write.
This is not as efficient as it could be for ZFS, but it is currently the
only way the FreeBSD kernel allows POSIX_FADV_NOREUSE to be handled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18399
This commit is contained in:
Alexander Motin
2026-04-07 11:56:54 -04:00
committed by Tony Hutter
parent 6f14581e1a
commit 4bb7592745
9 changed files with 169 additions and 4 deletions
+3 -1
View File
@@ -6748,10 +6748,12 @@ zfs_freebsd_advise(struct vop_advise_args *ap)
dmu_prefetch(os, zp->z_id, 0, start, len,
ZIO_PRIORITY_ASYNC_READ);
break;
case POSIX_FADV_DONTNEED:
dmu_evict_range(os, zp->z_id, start, len);
break;
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
case POSIX_FADV_DONTNEED:
case POSIX_FADV_NOREUSE:
/* ignored for now */
break;
+8 -2
View File
@@ -789,11 +789,17 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
}
}
zfs_exit(zfsvfs, FTAG);
#ifdef HAVE_GENERIC_FADVISE
error = generic_fadvise(filp, offset, len, advice);
#endif
if (error == 0 && advice == POSIX_FADV_DONTNEED) {
loff_t rlen = len ? len : i_size_read(ip) - offset;
dmu_evict_range(os, zp->z_id, offset, rlen);
}
zfs_exit(zfsvfs, FTAG);
return (error);
}
+60
View File
@@ -2082,6 +2082,65 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
kmem_free(db_search, sizeof (dmu_buf_impl_t));
}
/*
* Advisory eviction of level-0 dbufs in [start_blkid, end_blkid] for
* the given dnode. Both bounds are inclusive. Dirty dbufs carry a
* reference, so they will be evicted once their sync is completed.
*
* dn		dnode whose dn_dbufs tree is walked
* start_blkid	first level-0 block id to consider
* end_blkid	last level-0 block id to consider
*
* Takes and releases dn->dn_dbufs_mtx. A temporary dmu_buf_impl_t is
* allocated with KM_SLEEP; it serves first as the search key and then
* as the iteration marker, and is freed before returning.
*/
void
dbuf_evict_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid)
{
dmu_buf_impl_t *db_marker;
dmu_buf_impl_t *db, *db_next;
avl_index_t where;
/* Build a search key positioned at (level 0, start_blkid). */
db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
db_marker->db_level = 0;
db_marker->db_blkid = start_blkid;
db_marker->db_state = DB_SEARCH;
mutex_enter(&dn->dn_dbufs_mtx);
/*
* The search key is asserted never to match a real dbuf; the lookup
* is only used to obtain an insertion point, from which the first
* dbuf after that point is taken as the start of the walk.
*/
db = avl_find(&dn->dn_dbufs, db_marker, &where);
ASSERT0P(db);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = db_next) {
/* Stop at the first dbuf that is not level 0 or is past the end. */
if (db->db_level != 0 || db->db_blkid > end_blkid)
break;
mutex_enter(&db->db_mtx);
if (db->db_state != DB_EVICTING &&
zfs_refcount_is_zero(&db->db_holds)) {
/*
* Clean and unreferenced: evict immediately.
* Use the marker pattern from dnode_evict_dbufs()
* because dbuf_destroy() may recursively remove
* the parent indirect dbuf from dn_dbufs, which
* could be the node db_next would point to.
*/
db_marker->db_level = db->db_level;
db_marker->db_blkid = db->db_blkid;
db_marker->db_state = DB_MARKER;
/*
* NOTE(review): db_parent is set to one below db's address,
* presumably so the marker orders immediately before db in
* dn_dbufs -- confirm against the dbuf comparator.
*/
db_marker->db_parent =
(void *)((uintptr_t)db - 1);
avl_insert_here(&dn->dn_dbufs, db_marker,
db, AVL_BEFORE);
/*
* db_mtx is still held here and is not dropped by this
* function on this path; presumably dbuf_destroy() consumes
* it -- confirm.
*/
dbuf_destroy(db);
/* Resume from whatever now follows the marker, then unlink it. */
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
avl_remove(&dn->dn_dbufs, db_marker);
} else {
/*
* Referenced (possibly dirty), or already in DB_EVICTING
* state: flag it so it is evicted when released.
*/
db->db_pending_evict = TRUE;
db->db_partial_read = FALSE;
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
}
mutex_exit(&dn->dn_dbufs_mtx);
kmem_free(db_marker, sizeof (dmu_buf_impl_t));
}
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
@@ -5462,6 +5521,7 @@ EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_evict_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
+29
View File
@@ -898,6 +898,35 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
rw_exit(&dn->dn_struct_rwlock);
}
/*
 * Advise the DMU that the byte range starting at "offset" and spanning
 * "len" bytes of the given object is no longer needed in cache.
 *
 * This is best-effort and reports nothing to the caller: a zero length
 * or a failed dnode hold simply results in no work being done.
 */
void
dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t first_blkid, limit_blkid;

	/* Nothing to do for an empty range or an object we cannot hold. */
	if (len == 0 || dnode_hold(os, object, FTAG, &dn) != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/*
	 * The block containing the end of the range is left alone: when
	 * the end is not block-aligned, a sequential accessor may well
	 * continue into it.  The block containing the start is always a
	 * candidate, even if only partially covered, because backwards
	 * access patterns are rare.
	 */
	first_blkid = dbuf_whichblock(dn, 0, offset);
	limit_blkid = dbuf_whichblock(dn, 0, offset + len);
	if (first_blkid < limit_blkid)
		dbuf_evict_range(dn, first_blkid, limit_blkid - 1);

	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
}
/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crash in the