mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Add support for POSIX_FADV_DONTNEED
For now, make it evict only the specified data from the dbuf cache. Even though the dbuf cache is small, this may still reduce eviction of more useful data from there, and slightly accelerate ARC evictions by making the blocks there evictable a bit sooner.

On FreeBSD this also adds support for POSIX_FADV_NOREUSE, since the kernel translates it into POSIX_FADV_DONTNEED after every read/write. This is not as efficient as it could be for ZFS, but it is currently the only way the FreeBSD kernel allows POSIX_FADV_NOREUSE to be handled.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18399
This commit is contained in:
committed by
Tony Hutter
parent
6f14581e1a
commit
4bb7592745
@@ -6748,10 +6748,12 @@ zfs_freebsd_advise(struct vop_advise_args *ap)
|
||||
dmu_prefetch(os, zp->z_id, 0, start, len,
|
||||
ZIO_PRIORITY_ASYNC_READ);
|
||||
break;
|
||||
case POSIX_FADV_DONTNEED:
|
||||
dmu_evict_range(os, zp->z_id, start, len);
|
||||
break;
|
||||
case POSIX_FADV_NORMAL:
|
||||
case POSIX_FADV_RANDOM:
|
||||
case POSIX_FADV_SEQUENTIAL:
|
||||
case POSIX_FADV_DONTNEED:
|
||||
case POSIX_FADV_NOREUSE:
|
||||
/* ignored for now */
|
||||
break;
|
||||
|
||||
@@ -789,11 +789,17 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
|
||||
}
|
||||
}
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
|
||||
#ifdef HAVE_GENERIC_FADVISE
|
||||
error = generic_fadvise(filp, offset, len, advice);
|
||||
#endif
|
||||
|
||||
if (error == 0 && advice == POSIX_FADV_DONTNEED) {
|
||||
loff_t rlen = len ? len : i_size_read(ip) - offset;
|
||||
dmu_evict_range(os, zp->z_id, offset, rlen);
|
||||
}
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
|
||||
@@ -2082,6 +2082,65 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
|
||||
kmem_free(db_search, sizeof (dmu_buf_impl_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Advisory eviction of level-0 dbufs in [start_blkid, end_blkid] for
|
||||
* the given dnode. Dirty dbufs carry a reference, so they will be
|
||||
* evicted once their sync is completed.
|
||||
*/
|
||||
void
|
||||
dbuf_evict_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid)
|
||||
{
|
||||
dmu_buf_impl_t *db_marker;
|
||||
dmu_buf_impl_t *db, *db_next;
|
||||
avl_index_t where;
|
||||
|
||||
db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
|
||||
db_marker->db_level = 0;
|
||||
db_marker->db_blkid = start_blkid;
|
||||
db_marker->db_state = DB_SEARCH;
|
||||
|
||||
mutex_enter(&dn->dn_dbufs_mtx);
|
||||
db = avl_find(&dn->dn_dbufs, db_marker, &where);
|
||||
ASSERT0P(db);
|
||||
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
|
||||
|
||||
for (; db != NULL; db = db_next) {
|
||||
if (db->db_level != 0 || db->db_blkid > end_blkid)
|
||||
break;
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (db->db_state != DB_EVICTING &&
|
||||
zfs_refcount_is_zero(&db->db_holds)) {
|
||||
/*
|
||||
* Clean and unreferenced: evict immediately.
|
||||
* Use the marker pattern from dnode_evict_dbufs()
|
||||
* because dbuf_destroy() may recursively remove
|
||||
* the parent indirect dbuf from dn_dbufs, which
|
||||
* could be the node db_next would point to.
|
||||
*/
|
||||
db_marker->db_level = db->db_level;
|
||||
db_marker->db_blkid = db->db_blkid;
|
||||
db_marker->db_state = DB_MARKER;
|
||||
db_marker->db_parent =
|
||||
(void *)((uintptr_t)db - 1);
|
||||
avl_insert_here(&dn->dn_dbufs, db_marker,
|
||||
db, AVL_BEFORE);
|
||||
dbuf_destroy(db);
|
||||
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
|
||||
avl_remove(&dn->dn_dbufs, db_marker);
|
||||
} else {
|
||||
/* Referenced (possibly dirty): evict when released. */
|
||||
db->db_pending_evict = TRUE;
|
||||
db->db_partial_read = FALSE;
|
||||
mutex_exit(&db->db_mtx);
|
||||
db_next = AVL_NEXT(&dn->dn_dbufs, db);
|
||||
}
|
||||
}
|
||||
mutex_exit(&dn->dn_dbufs_mtx);
|
||||
|
||||
kmem_free(db_marker, sizeof (dmu_buf_impl_t));
|
||||
}
|
||||
|
||||
void
|
||||
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
||||
{
|
||||
@@ -5462,6 +5521,7 @@ EXPORT_SYMBOL(dbuf_whichblock);
|
||||
EXPORT_SYMBOL(dbuf_read);
|
||||
EXPORT_SYMBOL(dbuf_unoverride);
|
||||
EXPORT_SYMBOL(dbuf_free_range);
|
||||
EXPORT_SYMBOL(dbuf_evict_range);
|
||||
EXPORT_SYMBOL(dbuf_new_size);
|
||||
EXPORT_SYMBOL(dbuf_release_bp);
|
||||
EXPORT_SYMBOL(dbuf_dirty);
|
||||
|
||||
@@ -898,6 +898,35 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Advisory cache eviction for a byte range of an object.
|
||||
*/
|
||||
void
|
||||
dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
|
||||
{
|
||||
dnode_t *dn;
|
||||
|
||||
if (len == 0)
|
||||
return;
|
||||
if (dnode_hold(os, object, FTAG, &dn) != 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Exclude the last block if the range end is not block-aligned:
|
||||
* a sequential access may continue into that block. The first
|
||||
* block is included even when partially covered since backwards
|
||||
* access patterns are rare.
|
||||
*/
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
uint64_t start = dbuf_whichblock(dn, 0, offset);
|
||||
uint64_t end = dbuf_whichblock(dn, 0, offset + len);
|
||||
if (end > start)
|
||||
dbuf_evict_range(dn, start, end - 1);
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
dnode_rele(dn, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the next "chunk" of file data to free. We traverse the file from
|
||||
* the end so that the file gets shorter over time (if we crash in the
|
||||
|
||||
Reference in New Issue
Block a user