Define sops->free_inode() to prevent use-after-free during lookup

On Linux, when doing path lookup with LOOKUP_RCU, dentry and inode can
be dereferenced without refcounts and locks. For this reason, dentry and
inode must only be freed after RCU grace period.

However, zfs currently frees inode in zfs_inode_destroy synchronously
and we can't use GPL-only call_rcu() in zfs directly. Fortunately, on
Linux 5.2 and after, if we define sops->free_inode(), the kernel will do
call_rcu() for us.

This issue may be triggered more easily with init_on_free=1 boot
parameter:

BUG: kernel NULL pointer dereference, address: 0000000000000020
RIP: 0010:selinux_inode_permission+0x10e/0x1c0
Call Trace:
 ? show_trace_log_lvl+0x1be/0x2d9
 ? show_trace_log_lvl+0x1be/0x2d9
 ? show_trace_log_lvl+0x1be/0x2d9
 ? security_inode_permission+0x37/0x60
 ? __die_body.cold+0x8/0xd
 ? no_context+0x113/0x220
 ? exc_page_fault+0x6d/0x130
 ? asm_exc_page_fault+0x1e/0x30
 ? selinux_inode_permission+0x10e/0x1c0
 security_inode_permission+0x37/0x60
 link_path_walk.part.0.constprop.0+0xb5/0x360
 ? path_init+0x27d/0x3c0
 path_lookupat+0x3e/0x1a0
 filename_lookup+0xc0/0x1d0
 ? __check_object_size.part.0+0x123/0x150
 ? strncpy_from_user+0x4e/0x130
 ? getname_flags.part.0+0x4b/0x1c0
 vfs_statx+0x72/0x120
 ? ioctl_has_perm.constprop.0.isra.0+0xbd/0x120
 __do_sys_newlstat+0x39/0x70
 ? __x64_sys_ioctl+0x8d/0xd0
 do_syscall_64+0x30/0x40
 entry_SYSCALL_64_after_hwframe+0x62/0xc7

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Co-authored-by: Chunwei Chen <david.chen@nutanix.com>
Closes #17546
This commit is contained in:
Chunwei Chen 2025-07-18 08:45:13 -07:00 committed by Alexander Motin
parent 347d68048a
commit c79d5e4f33
5 changed files with 54 additions and 2 deletions

View File

@ -0,0 +1,24 @@
dnl #
dnl # Linux 5.2 API change
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [
ZFS_LINUX_TEST_SRC([super_operations_free_inode], [
#include <linux/fs.h>
static void free_inode(struct inode *) { }
static struct super_operations sops __attribute__ ((unused)) = {
.free_inode = free_inode,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [
AC_MSG_CHECKING([whether sops->free_inode() exists])
ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -132,6 +132,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_PIN_USER_PAGES ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
ZFS_AC_KERNEL_SRC_TIMER ZFS_AC_KERNEL_SRC_TIMER
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR
ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
case "$host_cpu" in case "$host_cpu" in
powerpc*) powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@ -248,6 +249,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_PIN_USER_PAGES ZFS_AC_KERNEL_PIN_USER_PAGES
ZFS_AC_KERNEL_TIMER ZFS_AC_KERNEL_TIMER
ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR
ZFS_AC_KERNEL_SOPS_FREE_INODE
case "$host_cpu" in case "$host_cpu" in
powerpc*) powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_CPU_HAS_FEATURE

View File

@ -157,6 +157,7 @@ struct znode;
extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_sync(struct super_block *, int, cred_t *);
extern int zfs_inode_alloc(struct super_block *, struct inode **ip); extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
extern void zfs_inode_free(struct inode *);
extern void zfs_inode_destroy(struct inode *); extern void zfs_inode_destroy(struct inode *);
extern void zfs_mark_inode_dirty(struct inode *); extern void zfs_mark_inode_dirty(struct inode *);
extern boolean_t zfs_relatime_need_update(const struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *);

View File

@ -371,6 +371,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
return (0); return (0);
} }
void
zfs_inode_free(struct inode *ip)
{
kmem_cache_free(znode_cache, ITOZ(ip));
}
/* /*
* Called in multiple places when an inode should be destroyed. * Called in multiple places when an inode should be destroyed.
*/ */
@ -395,8 +401,15 @@ zfs_inode_destroy(struct inode *ip)
nvlist_free(zp->z_xattr_cached); nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL; zp->z_xattr_cached = NULL;
} }
#ifndef HAVE_SOPS_FREE_INODE
kmem_cache_free(znode_cache, zp); /*
* inode needs to be freed in RCU callback. If we have
* super_operations->free_inode, Linux kernel will do call_rcu
* for us. But if we don't have it, since call_rcu is GPL-only
* symbol, we can only free synchronously and accept the risk.
*/
zfs_inode_free(ip);
#endif
} }
static void static void

View File

@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb)
return (ip); return (ip);
} }
#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
ASSERT(atomic_read(&ip->i_count) == 0);
zfs_inode_free(ip);
}
#endif
static void static void
zpl_inode_destroy(struct inode *ip) zpl_inode_destroy(struct inode *ip)
{ {
@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
const struct super_operations zpl_super_operations = { const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc, .alloc_inode = zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
.free_inode = zpl_inode_free,
#endif
.destroy_inode = zpl_inode_destroy, .destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode, .dirty_inode = zpl_dirty_inode,
.write_inode = NULL, .write_inode = NULL,