Improve fstat(2) performance

There is at most a factor of 3x performance improvement to be had by using the Linux generic_fillattr() helper. However, to use it safely we need to ensure the values in a cached inode are kept rigerously up to date. Unfortunately, this isn't the case for the blksize, blocks, and atime fields. At the moment the authoritative values are still stored in the znode. This patch introduces an optimized zfs_getattr_fast() call. The idea is to use the up to date values from the inode and the blksize, block, and atime fields from the znode. At some latter date we should be able to strictly use the inode values and further improve performance. The remaining overhead in the zfs_getattr_fast() call can be attributed to having to take the znode mutex. This overhead is unavoidable until the inode is kept strictly up to date. The the careful reader will notice the we do not use the customary ZFS_ENTER()/ZFS_EXIT() macros. These macro's are designed to ensure the filesystem is not torn down in the middle of an operation. However, in this case the VFS is holding a reference on the active inode so we know this is impossible. =================== Performance Tests ======================== This test calls the fstat(2) system call 10,000,000 times on an open file description in a tight loop. The test results show the zfs stat(2) performance is now only 22% slower than ext4. This is a 2.5x improvement and there is a clear long term plan to get to parity with ext4. filesystem | test-1 test-2 test-3 | average | times-ext4 --------------+-------------------------+---------+----------- ext4 | 7.785s 7.899s 7.284s | 7.656s | 1.000x zfs-0.6.0-rc4 | 24.052s 22.531s 23.857s | 23.480s | 3.066x zfs-faststat | 9.224s 9.398s 9.485s | 9.369s | 1.223x The second test is to run 'du' of a copy of the /usr tree which contains 110514 files. The test is run multiple times both using both a cold cache (/proc/sys/vm/drop_caches) and a hot cache. As expected this change signigicantly improved the zfs hot cache performance and doesn't quite bring zfs to parity with ext4. A little surprisingly the zfs cold cache performance is better than ext4. This can probably be attributed to the zfs allocation policy of co-locating all the meta data on disk which minimizes seek times. By default the ext4 allocator will spread the data over the entire disk only co-locating each directory. filesystem | cold | hot --------------+---------+-------- ext4 | 13.318s | 1.040s zfs-0.6.0-rc4 | 4.982s | 1.762s zfs-faststat | 4.933s | 1.345s
2025-08-02 13:07:42 +03:00 · 2011-07-09 15:44:16 -07:00 · 2011-07-09 15:44:16 -07:00 · 057e8eee35
commit 057e8eee35
parent abd8610cd5
3 changed files with 40 additions and 27 deletions
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@ -54,6 +54,7 @@ extern int zfs_readdir(struct inode *ip, void *dirent, filldir_t filldir,
    loff_t *pos, cred_t *cr);
 extern int zfs_fsync(struct inode *ip, int syncflag, cred_t *cr);
 extern int zfs_getattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr);
+extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp);
 extern int zfs_setattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr);
 extern int zfs_rename(struct inode *sdip, char *snm, struct inode *tdip,
    char *tnm, cred_t *cr, int flags);
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@ -2282,6 +2282,44 @@ zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
 }
 EXPORT_SYMBOL(zfs_getattr);

+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure.  The inode is assumed to be the authoritative source
+ * for most of the attributes.  However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ *	IN:	ip	- inode of file.
+ *
+ *	OUT:	sp	- kstat values.
+ *
+ *	RETURN:	0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+	znode_t *zp = ITOZ(ip);
+	zfs_sb_t *zsb = ITOZSB(ip);
+
+	mutex_enter(&zp->z_lock);
+
+	generic_fillattr(ip, sp);
+	ZFS_TIME_DECODE(&sp->atime, zp->z_atime);
+
+	sa_object_size(zp->z_sa_hdl, (uint32_t *)&sp->blksize, &sp->blocks);
+	if (unlikely(zp->z_blksz == 0)) {
+		/*
+		 * Block size hasn't been set; suggest maximal I/O transfers.
+		 */
+		sp->blksize = zsb->z_max_blksz;
+	}
+
+	mutex_exit(&zp->z_lock);
+
+	return (0);
+}
+EXPORT_SYMBOL(zfs_getattr_fast);
+
 /*
 * Set the file attributes to the values contained in the
 * vattr structure.
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@ -165,35 +165,9 @@ zpl_rmdir(struct inode * dir, struct dentry *dentry)
 static int
 zpl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
-	cred_t *cr = CRED();
-	vattr_t *vap;
-	struct inode *ip;
 	int error;

-	ip = dentry->d_inode;
-	crhold(cr);
-	vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP);
-
-	error = -zfs_getattr(ip, vap, 0, cr);
-	if (error)
-		goto out;
-
-	stat->ino = ip->i_ino;
-	stat->dev = ip->i_sb->s_dev;
-	stat->mode = vap->va_mode;
-	stat->nlink = vap->va_nlink;
-	stat->uid = vap->va_uid;
-	stat->gid = vap->va_gid;
-	stat->rdev = vap->va_rdev;
-	stat->size = vap->va_size;
-	stat->atime = vap->va_atime;
-	stat->mtime = vap->va_mtime;
-	stat->ctime = vap->va_ctime;
-	stat->blksize = vap->va_blksize;
-	stat->blocks = vap->va_nblocks;
-out:
-	kmem_free(vap, sizeof(vattr_t));
-	crfree(cr);
+	error = -zfs_getattr_fast(dentry->d_inode, stat);
 	ASSERT3S(error, <=, 0);

 	return (error);