From 384f8a09f8423d951bb81d9ca945e588de14f95f Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 22 Nov 2013 15:13:18 -0800 Subject: [PATCH] Illumos #4347 ZPL can use dmu_tx_assign(TXG_WAIT) Fix a lock contention issue by allowing threads not holding ZPL locks to block when waiting to assign a transaction. Porting Notes: zfs_putpage() still uses TXG_NOWAIT, unlike the upstream version. This case may be a contention point just like zfs_write(), however it is not safe to block here since it may be called during memory reclaim. Reviewed by: George Wilson Reviewed by: Adam Leventhal Reviewed by: Dan McDonald Reviewed by: Boris Protopopov Approved by: Dan McDonald References: https://www.illumos.org/issues/4347 illumos/illumos-gate@e722410c49fe67cbf0f639cbcc288bd6cbcf7dd1 Ported-by: Ned Bass Signed-off-by: Brian Behlendorf --- module/zfs/zfs_dir.c | 8 +------- module/zfs/zfs_vnops.c | 32 +++++++++++++++----------------- module/zfs/zfs_znode.c | 15 ++------------- 3 files changed, 18 insertions(+), 37 deletions(-) diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index 17c77d035..448a8727e 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -973,7 +973,6 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) return (SET_ERROR(EDQUOT)); } -top: tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); @@ -982,13 +981,8 @@ top: fuid_dirtied = zsb->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zsb, tx); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index abf3747db..1552b61e0 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -106,11 +106,18 @@ * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). - * This is critical because we don't want to block while holding locks. - * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing to - * use a non-blocking assign can deadlock the system. The scenario: + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. @@ -712,7 +719,6 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) while (n > 0) { abuf = NULL; woff = uio->uio_loffset; -again: if (zfs_owner_overquota(zsb, zp, B_FALSE) || zfs_owner_overquota(zsb, zp, B_TRUE)) { if (abuf != NULL) @@ -762,13 +768,8 @@ again: dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto again; - } dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); @@ -2833,12 +2834,9 @@ top: zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_NOWAIT); - if (err) { - if (err == ERESTART) - dmu_tx_wait(tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) goto out; - } count = 0; /* diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index c141c9367..9b44912cc 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -1205,7 +1205,6 @@ zfs_extend(znode_t *zp, uint64_t end) zfs_range_unlock(rl); return (0); } -top: tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); @@ -1225,13 +1224,8 @@ top: newblksz = 0; } - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); zfs_range_unlock(rl); return (error); @@ -1419,13 +1413,8 @@ log: tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto log; - } dmu_tx_abort(tx); return (error); }