OpenZFS 9689 - zfs range lock code should not be zpl-specific

The ZFS range locking code in zfs_rlock.c/h depends on ZPL-specific data structures, specifically znode_t. However, it's also used by the ZVOL code, which uses a "dummy" znode_t to pass to the range locking code. We should clean this up so that the range locking code is generic and can be used equally by ZPL and ZVOL, and also can be used by future consumers that may need to run in userland (libzpool) as well as the kernel. Porting notes: * Added missing sys/avl.h include to sys/zfs_rlock.h. * Removed 'dbuf is within the locked range' ASSERTs from dmu_sync(). This was needed because ztest does not yet use a locked_range_t. * Removed "Approved by:" tag requirement from OpenZFS commit check to prevent needless warnings when integrating changes which have not been merged to illumos. * Reverted free_list range lock changes which were originally needed to defer the cv_destroy() which was called immediately after cv_broadcast(). With d2733258 this should be safe but if not we may need to reintroduce this logic. * Reverts: The following two commits were reverted and squashed into this change in order to make it easier to apply OpenZFS 9689. - d88895a0, which removed the dummy znode from zvol_state - e3a07cd0, which updated ztest to use range locks * Preserved optimized rangelock comparison function. Preserved the rangelock free list. The cv_destroy() function will block waiting for all processes in cv_wait() to be scheduled and drop their reference. This is done to ensure it's safe to free the condition variable. However, blocking while holding the rl->rl_lock mutex can result in a deadlock on Linux. A free list is introduced to defer the cv_destroy() and kmem_free() until after the mutex is released. 
Authored by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Brad Lewis <brad.lewis@delphix.com> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://illumos.org/issues/9689 OpenZFS-commit: https://github.com/openzfs/openzfs/pull/680 External-issue: DLPX-58662 Closes #7980
2026-05-24 11:18:52 +03:00 · 2018-10-01 15:13:12 -07:00
parent 50a343d85c
commit 5d43cc9a59
10 changed files with 484 additions and 595 deletions
@@ -20,7 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

 /* Portions Copyright 2007 Jeremy Teo */
@@ -91,6 +91,37 @@ static kmem_cache_t *znode_cache = NULL;
 static kmem_cache_t *znode_hold_cache = NULL;
 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ *
+ * 'new' is the range being acquired; its lr_type, lr_offset and lr_length
+ * may be rewritten in place.  'arg' is interpreted as the znode_t whose
+ * z_size, z_blksz and ZTOZSB(zp)->z_max_blksz drive the adjustment.
+ * Nothing is returned.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+	znode_t *zp = arg;
+
+	/*
+	 * If in append mode, convert to writer and lock starting at the
+	 * current end of file.  The offset supplied by the caller is
+	 * discarded in favor of zp->z_size; lr_length is left as is.
+	 */
+	if (new->lr_type == RL_APPEND) {
+		new->lr_offset = zp->z_size;
+		new->lr_type = RL_WRITER;
+	}
+
+	/*
+	 * If we need to grow the block size then lock the whole file range.
+	 * That is the case when the range (or the file itself) ends beyond
+	 * the current block size and that block size is either not a power
+	 * of two or still below ZTOZSB(zp)->z_max_blksz.
+	 *
+	 * NOTE(review): lr_offset + lr_length is not checked for wraparound
+	 * here; presumably callers never pass a sum exceeding UINT64_MAX
+	 * on this path -- confirm.
+	 */
+	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+		new->lr_offset = 0;
+		new->lr_length = UINT64_MAX;
+	}
+}
+
 /*ARGSUSED*/
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
@@ -106,7 +137,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

-	zfs_rlock_init(&zp->z_range_lock);
+	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
@@ -128,7 +159,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
-	zfs_rlock_destroy(&zp->z_range_lock);
+	rangelock_fini(&zp->z_rangelock);

 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
@@ -577,9 +608,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
 	zp->z_is_stale = B_FALSE;
-	zp->z_range_lock.zr_size = &zp->z_size;
-	zp->z_range_lock.zr_blksz = &zp->z_blksz;
-	zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;

 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

@@ -1475,20 +1503,20 @@ zfs_extend(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	dmu_tx_t *tx;
-	rl_t *rl;
+	locked_range_t *lr;
 	uint64_t newblksz;
 	int error;

 	/*
 	 * We will change zp_size, lock the whole file.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end <= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
@@ -1518,7 +1546,7 @@ zfs_extend(znode_t *zp, uint64_t end)
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}

@@ -1530,7 +1558,7 @@ zfs_extend(znode_t *zp, uint64_t end)
 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
 	    &zp->z_size, sizeof (zp->z_size), tx));

-	zfs_range_unlock(rl);
+	rangelock_exit(lr);

 	dmu_tx_commit(tx);

@@ -1593,19 +1621,19 @@ static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
-	rl_t *rl;
+	locked_range_t *lr;
 	int error;

 	/*
 	 * Lock the range being freed.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (off >= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}

@@ -1655,7 +1683,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 				    page_len);
 		}
 	}
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);

 	return (error);
 }
@@ -1673,7 +1701,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	dmu_tx_t *tx;
-	rl_t *rl;
+	locked_range_t *lr;
 	int error;
 	sa_bulk_attr_t bulk[2];
 	int count = 0;
@@ -1681,20 +1709,20 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	/*
 	 * We will change zp_size, lock the whole file.
 	 */
-	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
+	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end >= zp->z_size) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (0);
 	}

 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
 	    DMU_OBJECT_END);
 	if (error) {
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
@@ -1704,7 +1732,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_range_unlock(rl);
+		rangelock_exit(lr);
 		return (error);
 	}

@@ -1720,8 +1748,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

 	dmu_tx_commit(tx);
-
-	zfs_range_unlock(rl);
+	rangelock_exit(lr);

 	return (0);
 }