diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index baf3b1508..0b379ee79 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -411,6 +411,8 @@ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type,
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
+void dbuf_evict_range(struct dnode *dn, uint64_t start_blkid,
+    uint64_t end_blkid);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index bb623e404..81c293e72 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -963,6 +963,8 @@ void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
 void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
 int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size);
+void dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t len);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 1e8d6eb1b..b865c1639 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6748,10 +6748,12 @@ zfs_freebsd_advise(struct vop_advise_args *ap)
 		dmu_prefetch(os, zp->z_id, 0, start, len,
 		    ZIO_PRIORITY_ASYNC_READ);
 		break;
+	case POSIX_FADV_DONTNEED:
+		dmu_evict_range(os, zp->z_id, start, len);
+		break;
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_SEQUENTIAL:
-	case POSIX_FADV_DONTNEED:
 	case POSIX_FADV_NOREUSE:
 		/* ignored for now */
 		break;
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index b154d045f..ffe227796 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -789,11 +789,19 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
 		}
 	}
 
-	zfs_exit(zfsvfs, FTAG);
-
 #ifdef HAVE_GENERIC_FADVISE
 	error = generic_fadvise(filp, offset, len, advice);
 #endif
+
+	if (error == 0 && advice == POSIX_FADV_DONTNEED) {
+		loff_t rlen = len ? len : i_size_read(ip) - offset;
+		/* A zero len means "to EOF"; skip ranges starting past it. */
+		if (rlen > 0)
+			dmu_evict_range(os, zp->z_id, offset, rlen);
+	}
+
+	zfs_exit(zfsvfs, FTAG);
+
 	return (error);
 }
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 39a7b7bce..ee9a2270a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2082,6 +2082,67 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
 	kmem_free(db_search, sizeof (dmu_buf_impl_t));
 }
 
+/*
+ * Advisory eviction of level-0 dbufs in [start_blkid, end_blkid]
+ * (inclusive) for the given dnode.  Unreferenced dbufs are destroyed
+ * immediately; referenced ones are only flagged with db_pending_evict.
+ * Dirty dbufs carry a reference, so they will be evicted once their
+ * sync is completed.  Takes dn_dbufs_mtx for the duration of the walk.
+ */
+void
+dbuf_evict_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid)
+{
+	dmu_buf_impl_t *db_marker;
+	dmu_buf_impl_t *db, *db_next;
+	avl_index_t where;
+
+	db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+	db_marker->db_level = 0;
+	db_marker->db_blkid = start_blkid;
+	db_marker->db_state = DB_SEARCH;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	db = avl_find(&dn->dn_dbufs, db_marker, &where);
+	ASSERT0P(db);
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+	for (; db != NULL; db = db_next) {
+		if (db->db_level != 0 || db->db_blkid > end_blkid)
+			break;
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state != DB_EVICTING &&
+		    zfs_refcount_is_zero(&db->db_holds)) {
+			/*
+			 * Clean and unreferenced: evict immediately.
+			 * Use the marker pattern from dnode_evict_dbufs()
+			 * because dbuf_destroy() may recursively remove
+			 * the parent indirect dbuf from dn_dbufs, which
+			 * could be the node db_next would point to.
+			 */
+			db_marker->db_level = db->db_level;
+			db_marker->db_blkid = db->db_blkid;
+			db_marker->db_state = DB_MARKER;
+			db_marker->db_parent =
+			    (void *)((uintptr_t)db - 1);
+			avl_insert_here(&dn->dn_dbufs, db_marker,
+			    db, AVL_BEFORE);
+			dbuf_destroy(db);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
+			avl_remove(&dn->dn_dbufs, db_marker);
+		} else {
+			/* Referenced (possibly dirty): evict when released. */
+			db->db_pending_evict = TRUE;
+			db->db_partial_read = FALSE;
+			mutex_exit(&db->db_mtx);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	kmem_free(db_marker, sizeof (dmu_buf_impl_t));
+}
+
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
@@ -5462,6 +5523,7 @@ EXPORT_SYMBOL(dbuf_whichblock);
 EXPORT_SYMBOL(dbuf_read);
 EXPORT_SYMBOL(dbuf_unoverride);
 EXPORT_SYMBOL(dbuf_free_range);
+EXPORT_SYMBOL(dbuf_evict_range);
 EXPORT_SYMBOL(dbuf_new_size);
 EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 5cb02831a..19b8b0594 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -898,6 +898,55 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
+/*
+ * Advisory cache eviction for a byte range of an object.
+ *
+ * Asks the dbuf layer to drop the cached level-0 blocks of `object'
+ * that fall in [offset, offset + len).  This is only a hint: a zero
+ * `len' or a failed dnode_hold() silently does nothing.
+ */
+void
+dmu_evict_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+	dnode_t *dn;
+	uint64_t range_end;
+
+	if (len == 0)
+		return;
+	if (dnode_hold(os, object, FTAG, &dn) != 0)
+		return;
+
+	/* Saturate instead of wrapping when offset + len overflows. */
+	range_end = offset + len;
+	if (range_end < offset)
+		range_end = DMU_OBJECT_END;
+
+	/*
+	 * Exclude the last block if the range end is not block-aligned:
+	 * a sequential access may continue into that block.  The first
+	 * block is included even when partially covered since backwards
+	 * access patterns are rare.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift != 0) {
+		uint64_t start = dbuf_whichblock(dn, 0, offset);
+		uint64_t end = dbuf_whichblock(dn, 0, range_end);
+		if (end > start)
+			dbuf_evict_range(dn, start, end - 1);
+	} else if (offset < dn->dn_datablksz &&
+	    range_end >= dn->dn_datablksz) {
+		/*
+		 * Single-block object with a non-power-of-two block size:
+		 * dbuf_whichblock() expects offsets below dn_datablksz in
+		 * this case, so apply the same rule to block 0 by hand.
+		 */
+		dbuf_evict_range(dn, 0, 0);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crash in the
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 69752e07a..623496916 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -741,7 +741,7 @@ tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/fadvise]
-tests = ['fadvise_willneed']
+tests = ['fadvise_dontneed', 'fadvise_willneed']
 tags = ['functional', 'fadvise']
 
 [tests/functional/failmode]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 97b644c31..27a6edc31 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1566,6 +1566,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/exec/exec_002_neg.ksh \
 	functional/exec/setup.ksh \
 	functional/fadvise/cleanup.ksh \
+	functional/fadvise/fadvise_dontneed.ksh \
 	functional/fadvise/fadvise_willneed.ksh \
 	functional/fadvise/setup.ksh \
 	functional/failmode/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/fadvise/fadvise_dontneed.ksh b/tests/zfs-tests/tests/functional/fadvise/fadvise_dontneed.ksh
new file mode 100755
index 000000000..b19f576ad
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fadvise/fadvise_dontneed.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Test that POSIX_FADV_DONTNEED evicts data from the ZFS dbuf cache.
+#
+# STRATEGY:
+# 1. Write blocks to a file and sync, so they land in the dbuf LRU cache
+# 2. Record cache_count from dbufstats
+# 3. Call file_fadvise with POSIX_FADV_DONTNEED on the file
+# 4. Verify that cache_count decreased
+#
+
+verify_runnable "global"
+
+FILE=$TESTDIR/$TESTFILE0
+BLKSZ=$(get_prop recordsize $TESTPOOL)
+
+function cleanup
+{
+	[[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/*
+}
+
+log_assert "Ensure POSIX_FADV_DONTNEED evicts data from the dbuf cache"
+
+log_onexit cleanup
+
+log_must file_write -o create -f $FILE -b $BLKSZ -c 100
+sync_pool $TESTPOOL
+
+evicts1=$(kstat dbufstats.cache_count)
+
+log_must file_fadvise -f $FILE -a POSIX_FADV_DONTNEED
+
+evicts2=$(kstat dbufstats.cache_count)
+log_note "cache_count before=$evicts1 after=$evicts2"
+
+log_must [ $evicts1 -gt $evicts2 ]
+
+log_pass "POSIX_FADV_DONTNEED evicts data from the dbuf cache"