Fix range locking in ZIL commit codepath

Since OpenZFS 7578 (1b7c1e5) if we have a ZVOL with logbias=throughput
we will force WR_INDIRECT itxs in zvol_log_write() setting itx->itx_lr
offset and length to the offset and length of the BIO from
zvol_write()->zvol_log_write(): these offset and length are later used
to take a range lock in zillog->zl_get_data function: zvol_get_data().

Now suppose we have a ZVOL with blocksize=8K and push 4K writes to
offset 0: we will only be range-locking 0-4096. This means the
ASSERTion we make in dbuf_unoverride() is no longer valid because now
dmu_sync() is called from zilog's get_data functions holding a partial
lock on the dbuf.

Fix this by taking a range lock on the whole block in zvol_get_data().

Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #6238 
Closes #6315 
Closes #6356 
Closes #6477
This commit is contained in:
LOLi
2017-08-21 17:59:48 +02:00
committed by Tony Hutter
parent 3468fdbd34
commit ae5b4a05ff
7 changed files with 123 additions and 14 deletions
+6
View File
@@ -49,6 +49,7 @@
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/trace_dmu.h>
#include <sys/zfs_rlock.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <sys/zfs_znode.h>
@@ -1715,6 +1716,11 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
ASSERT(pio != NULL);
ASSERT(txg != 0);
/* dbuf is within the locked range */
ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
ASSERT3U(db->db.db_offset + db->db.db_size, <=,
zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid);
+1 -1
View File
@@ -1050,7 +1050,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and it's checksum is being calculated
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
+37 -11
View File
@@ -821,6 +821,7 @@ zvol_discard(void *arg)
uint64_t start = BIO_BI_SECTOR(bio) << 9;
uint64_t size = BIO_BI_SIZE(bio);
uint64_t end = start + size;
boolean_t sync;
int error = 0;
dmu_tx_t *tx;
unsigned long start_jif;
@@ -830,9 +831,11 @@ zvol_discard(void *arg)
start_jif = jiffies;
generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
goto out;
goto unlock;
}
/*
@@ -848,7 +851,7 @@ zvol_discard(void *arg)
}
if (start >= end)
goto out;
goto unlock;
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
@@ -861,9 +864,11 @@ zvol_discard(void *arg)
error = dmu_free_long_range(zv->zv_objset,
ZVOL_OBJ, start, size);
}
out:
unlock:
zfs_range_unlock(zvr->rl);
if (error == 0 && sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
@@ -933,6 +938,8 @@ zvol_request(struct request_queue *q, struct bio *bio)
}
if (rw == WRITE) {
boolean_t need_sync = B_FALSE;
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
BIO_END_IO(bio, -SET_ERROR(EROFS));
goto out;
@@ -966,13 +973,24 @@ zvol_request(struct request_queue *q, struct bio *bio)
*/
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_WRITER);
/*
* Sync writes and discards execute zil_commit() which may need
* to take a RL_READER lock on the whole block being modified
* via its zillog->zl_get_data(): to avoid circular dependency
* issues with taskq threads execute these requests
* synchronously here in zvol_request().
*/
need_sync = bio_is_fua(bio) ||
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_discard, zvr, TQ_SLEEP) == TASKQID_INVALID)
if (zvol_request_sync || need_sync ||
taskq_dispatch(zvol_taskq, zvol_discard, zvr,
TQ_SLEEP) == TASKQID_INVALID)
zvol_discard(zvr);
} else {
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_write, zvr, TQ_SLEEP) == TASKQID_INVALID)
if (zvol_request_sync || need_sync ||
taskq_dispatch(zvol_taskq, zvol_write, zvr,
TQ_SLEEP) == TASKQID_INVALID)
zvol_write(zvr);
}
} else {
@@ -1030,8 +1048,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -1041,11 +1057,21 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else {
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change
* the data. Contrarily to zfs_get_data we need not re-check
* blocksize after we get the lock because it cannot be changed.
*/
size = zv->zv_volblocksize;
offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {