Use big transactions for small recordsize writes.

When ZFS appends files in chunks bigger than recordsize, it borrows
buffer from ARC and fills it before opening transaction.  This
supposed to help in case of page faults to not hold transaction open
indefinitely.  The problem appears when recordsize is set lower than
default 128KB. Since each block is committed in separate transaction,
per-transaction overhead becomes significant, and what is even worse,
active use of of per-dataset and per-pool locks to protect space use
accounting for each transaction badly hurts the code SMP scalability.
The same transaction size limitation applies in case of file rewrite,
but without even excuse of buffer borrowing.

To address the issue, disable the borrowing mechanism if recordsize
is smaller than default and the write request is 4x bigger than it.
In such case writes up to 32MB are executed in single transaction,
that dramatically reduces overhead and lock contention.  Since the
borrowing mechanism is not used for file rewrites, and it was never
used by zvols, which seem to work fine, I don't think this change
should create significant problems, partially because in addition to
the borrowing mechanism there are also used pre-faults.

My tests with 4/8 threads writing several files same time on datasets
with 32KB recordsize in 1MB requests show reduction of CPU usage by
the user threads by 25-35%.  I would measure it in GB/s, but at that
block size we are now limited by the lock contention of single write
issue taskqueue, which is a separate problem we are going to work on.

Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14964
This commit is contained in:
Alexander Motin 2023-06-27 20:00:30 -04:00 committed by GitHub
parent bc9d0084ea
commit b0cbc1aa9a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
} }
const uint64_t max_blksz = zfsvfs->z_max_blksz;
/* /*
* Pre-fault the pages to ensure slow (eg NFS) pages * Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg. * don't hold up txg.
* Skip this if uio contains loaned arc_buf.
*/ */
if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
if (zfs_uio_prefaultpages(pfbytes, uio)) {
zfs_exit(zfsvfs, FTAG); zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFAULT)); return (SET_ERROR(EFAULT));
} }
@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break; break;
} }
uint64_t blksz;
if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
if (zp->z_blksz > zfsvfs->z_max_blksz &&
!ISP2(zp->z_blksz)) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
blksz = 1 << highbit64(zp->z_blksz);
} else {
blksz = zfsvfs->z_max_blksz;
}
blksz = MIN(blksz, P2ROUNDUP(end_size,
SPA_MINBLOCKSIZE));
blksz = MAX(blksz, zp->z_blksz);
} else {
blksz = zp->z_blksz;
}
arc_buf_t *abuf = NULL; arc_buf_t *abuf = NULL;
if (n >= max_blksz && woff >= zp->z_size && ssize_t nbytes = n;
P2PHASE(woff, max_blksz) == 0 && if (n >= blksz && woff >= zp->z_size &&
zp->z_blksz == max_blksz) { P2PHASE(woff, blksz) == 0 &&
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/* /*
* This write covers a full block. "Borrow" a buffer * This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter * from the dmu so that we can fill it before we enter
@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* holding up the transaction if the data copy hangs * holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping). * up on a pagefault (e.g., from an NFS server mapping).
*/ */
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz); blksz);
ASSERT(abuf != NULL); ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz); ASSERT(arc_buf_size(abuf) == blksz);
if ((error = zfs_uiocopy(abuf->b_data, max_blksz, if ((error = zfs_uiocopy(abuf->b_data, blksz,
UIO_WRITE, uio, &cbytes))) { UIO_WRITE, uio, &nbytes))) {
dmu_return_arcbuf(abuf); dmu_return_arcbuf(abuf);
break; break;
} }
ASSERT3S(cbytes, ==, max_blksz); ASSERT3S(nbytes, ==, blksz);
} else {
nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
P2PHASE(woff, blksz));
if (pfbytes < nbytes) {
if (zfs_uio_prefaultpages(nbytes, uio)) {
error = SET_ERROR(EFAULT);
break;
}
pfbytes = nbytes;
}
} }
/* /*
@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
MIN(n, max_blksz));
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT); error = dmu_tx_assign(tx, TXG_WAIT);
@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* shrink down lr_length to the appropriate size. * shrink down lr_length to the appropriate size.
*/ */
if (lr->lr_length == UINT64_MAX) { if (lr->lr_length == UINT64_MAX) {
uint64_t new_blksz; zfs_grow_blocksize(zp, blksz, tx);
if (zp->z_blksz > max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
zfs_rangelock_reduce(lr, woff, n); zfs_rangelock_reduce(lr, woff, n);
} }
/*
* XXX - should we really limit each write to z_max_blksz?
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/
const ssize_t nbytes =
MIN(n, max_blksz - P2PHASE(woff, max_blksz));
ssize_t tx_bytes; ssize_t tx_bytes;
if (abuf == NULL) { if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio); tx_bytes = zfs_uio_resid(uio);
@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* zfs_uio_prefaultpages, or prefaultpages may * zfs_uio_prefaultpages, or prefaultpages may
* error, and we may break the loop early. * error, and we may break the loop early.
*/ */
if (tx_bytes != zfs_uio_resid(uio)) n -= tx_bytes - zfs_uio_resid(uio);
n -= tx_bytes - zfs_uio_resid(uio); pfbytes -= tx_bytes - zfs_uio_resid(uio);
if (zfs_uio_prefaultpages(MIN(n, max_blksz),
uio)) {
break;
}
continue; continue;
} }
#endif #endif
@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
} }
tx_bytes -= zfs_uio_resid(uio); tx_bytes -= zfs_uio_resid(uio);
} else { } else {
/* Implied by abuf != NULL: */
ASSERT3S(n, >=, max_blksz);
ASSERT0(P2PHASE(woff, max_blksz));
/*
* We can simplify nbytes to MIN(n, max_blksz) since
* P2PHASE(woff, max_blksz) is 0, and knowing
* n >= max_blksz lets us simplify further:
*/
ASSERT3S(nbytes, ==, max_blksz);
/* /*
* Thus, we're writing a full block at a block-aligned * Thus, we're writing a full block at a block-aligned
* offset and extending the file past EOF. * offset and extending the file past EOF.
@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break; break;
ASSERT3S(tx_bytes, ==, nbytes); ASSERT3S(tx_bytes, ==, nbytes);
n -= nbytes; n -= nbytes;
pfbytes -= nbytes;
if (n > 0) {
if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
error = SET_ERROR(EFAULT);
break;
}
}
} }
zfs_znode_update_vfs(zp); zfs_znode_update_vfs(zp);