ZIL: Clean up sync and commit handling

ZVOL: - Mark all ZVOL ZIL transactions as sync. Since ZVOLs have only one object, it makes no sense to maintain an async queue and merge it into the sync queue on each commit. A single sync queue is simply cheaper, and it changes nothing until an actual commit request arrives. - Remove zsd_sync_cnt and the zil_async_to_sync() calls, since we are no longer switching between sync and async queues. ZFS: - Mark write transactions as sync based only on the number of sync opens (z_sync_cnt). We cannot randomly jump between sync and async unless we want data corruption due to write reordering. - When a file is first opened with O_SYNC (z_sync_cnt incremented to 1), call zil_async_to_sync() for it to preserve correct ordering between past and future writes. - Drop the zfs_fsyncer_key logic. It looks like it was an optimization for workloads heavily intermixing async writes with tons of fsyncs. But first, it was broken 8 years ago due to the Linux tsd implementation not allowing data storage between syscalls, and second, I doubt it is safe to switch from async to sync so often and without calling zil_async_to_sync(). - Rename the sync argument of *_log_write() to commit; it now only signals the caller's intent to call zil_commit() soon after. This allows the WR_COPIED optimization without carrying any additional meaning. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: George Wilson <george.wilson@delphix.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15366
2026-05-22 02:27:36 +03:00 · 2023-10-30 17:51:56 -04:00
parent 043c6ee3b6
commit c3773de168
12 changed files with 65 additions and 82 deletions
@@ -244,9 +244,15 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 		return (SET_ERROR(EPERM));
 	}

-	/* Keep a count of the synchronous opens in the znode */
-	if (flag & O_SYNC)
-		atomic_inc_32(&zp->z_sync_cnt);
+	/*
+	 * Keep a count of the synchronous opens in the znode.  On first
+	 * synchronous open we must convert all previous async transactions
+	 * into sync to keep correct ordering.
+	 */
+	if (flag & O_SYNC) {
+		if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
+			zil_async_to_sync(zfsvfs->z_log, zp->z_id);
+	}

 	zfs_exit(zfsvfs, FTAG);
 	return (0);
@@ -4201,6 +4207,10 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 	}
 	zfs_vmobject_wunlock(object);

+	boolean_t commit = (flags & (zfs_vm_pagerput_sync |
+	    zfs_vm_pagerput_inval)) != 0 ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;
+
 	if (ncount == 0)
 		goto out;

@@ -4253,7 +4263,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 		 * but that would make the locking messier
 		 */
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
-		    len, 0, NULL, NULL);
+		    len, commit, NULL, NULL);

 		zfs_vmobject_wlock(object);
 		for (i = 0; i < ncount; i++) {
@@ -4268,8 +4278,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,

 out:
 	zfs_rangelock_exit(lr);
-	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
-	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+	if (commit)
 		zil_commit(zfsvfs->z_log, zp->z_id);

 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
@@ -123,7 +123,6 @@ struct zvol_state_os {
 		/* volmode=dev */
 		struct zvol_state_dev {
 			struct cdev *zsd_cdev;
-			uint64_t zsd_sync_cnt;
 			struct selinfo zsd_selinfo;
 		} _zso_dev;

@@ -669,7 +668,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 	int error = 0;
 	boolean_t doread = B_FALSE;
 	boolean_t is_dumpified;
-	boolean_t sync;
+	boolean_t commit;

 	if (bp->bio_to)
 		zv = bp->bio_to->private;
@@ -696,7 +695,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 		}
 		zvol_ensure_zilog(zv);
 		if (bp->bio_cmd == BIO_FLUSH)
-			goto sync;
+			goto commit;
 		break;
 	default:
 		error = SET_ERROR(EOPNOTSUPP);
@@ -718,7 +717,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 	}

 	is_dumpified = B_FALSE;
-	sync = !doread && !is_dumpified &&
+	commit = !doread && !is_dumpified &&
 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

 	/*
@@ -734,7 +733,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
-			zvol_log_truncate(zv, tx, off, resid, sync);
+			zvol_log_truncate(zv, tx, off, resid);
 			dmu_tx_commit(tx);
 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 			    off, resid);
@@ -755,7 +754,7 @@ zvol_geom_bio_strategy(struct bio *bp)
 				dmu_tx_abort(tx);
 			} else {
 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
-				zvol_log_write(zv, tx, off, size, sync);
+				zvol_log_write(zv, tx, off, size, commit);
 				dmu_tx_commit(tx);
 			}
 		}
@@ -793,8 +792,8 @@ unlock:
 		break;
 	}

-	if (sync) {
-sync:
+	if (commit) {
+commit:
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	}
 resume:
@@ -866,7 +865,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 	uint64_t volsize;
 	zfs_locked_range_t *lr;
 	int error = 0;
-	boolean_t sync;
+	boolean_t commit;
 	zfs_uio_t uio;

 	zv = dev->si_drv2;
@@ -880,7 +879,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 		return (SET_ERROR(EIO));

 	ssize_t start_resid = zfs_uio_resid(&uio);
-	sync = (ioflag & IO_SYNC) ||
+	commit = (ioflag & IO_SYNC) ||
 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
@@ -904,7 +903,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 		if (error == 0)
-			zvol_log_write(zv, tx, off, bytes, sync);
+			zvol_log_write(zv, tx, off, bytes, commit);
 		dmu_tx_commit(tx);

 		if (error)
@@ -913,7 +912,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
 	zfs_rangelock_exit(lr);
 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
-	if (sync)
+	if (commit)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	rw_exit(&zv->zv_suspend_lock);
 	return (error);
@@ -923,7 +922,6 @@ static int
 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	zvol_state_t *zv;
-	struct zvol_state_dev *zsd;
 	int err = 0;
 	boolean_t drop_suspend = B_FALSE;

@@ -1017,13 +1015,6 @@ retry:
 	}

 	zv->zv_open_count++;
-	if (flags & O_SYNC) {
-		zsd = &zv->zv_zso->zso_dev;
-		zsd->zsd_sync_cnt++;
-		if (zsd->zsd_sync_cnt == 1 &&
-		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
-			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
-	}
 out_opened:
 	if (zv->zv_open_count == 0) {
 		zvol_last_close(zv);
@@ -1041,7 +1032,6 @@ static int
 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	zvol_state_t *zv;
-	struct zvol_state_dev *zsd;
 	boolean_t drop_suspend = B_TRUE;

 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
@@ -1091,10 +1081,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 	 * You may get multiple opens, but only one close.
 	 */
 	zv->zv_open_count--;
-	if (flags & O_SYNC) {
-		zsd = &zv->zv_zso->zso_dev;
-		zsd->zsd_sync_cnt--;
-	}

 	if (zv->zv_open_count == 0) {
 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
@@ -1163,7 +1149,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
 			dmu_tx_abort(tx);
 		} else {
 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
-			zvol_log_truncate(zv, tx, offset, length, sync);
+			zvol_log_truncate(zv, tx, offset, length);
 			dmu_tx_commit(tx);
 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
 			    offset, length);