Clean up zvol request processing to pass uio and fix porting regressions

In illumos-gate, `zvol_read` and `zvol_write` are both passed a uio_t
rather than a struct bio. Since we are translating from bio to uio for
both anyway, we might as well unify the logic and keep the code closer
to its illumos counterpart. At the same time, we can fix some
regressions introduced relative to the original illumos-gate code.
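
For context, the translation simply points an illumos-style uio at the
bio's bvec array. A minimal sketch, mirroring the zvol_request() change
in the diff below; the helper name zvol_bio_to_uio is hypothetical, and
the uio_t here is the ZFSonLinux UIO_BVEC variant rather than the stock
illumos structure:

	/* Hypothetical helper: map a Linux bio onto a UIO_BVEC uio_t. */
	static void
	zvol_bio_to_uio(struct bio *bio, uio_t *uio)
	{
		/* First unprocessed bvec, and the byte offset into it. */
		uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
		uio->uio_skip = BIO_BI_SKIP(bio);
		uio->uio_resid = BIO_BI_SIZE(bio);	/* bytes remaining */
		uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
		uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; /* sectors to bytes */
		uio->uio_limit = MAXOFFSET_T;
		uio->uio_segflg = UIO_BVEC;
	}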

We refactor zvol_write to take uio and also correct the
following problems:

1. We did `dnode_hold()` on each IO when we already had a hold.
2. We would attempt to send writes that exceeded `DMU_MAX_ACCESS` to the
DMU.
3. We could call `zil_commit()` twice. This happens because Linux uses
the `->write` function to send flushes and can aggregate a flush with a
write, so when a synchronous write arrived together with a flush we
effectively flushed twice even though once would have sufficed.

zvol_read suffers from the first two problems as well. Other platforms
also suffer from the first, so we leave that fix for a second patch so
that there is a discrete patch for them to cherry-pick. The shape of
the corrected write path is sketched below.
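
In outline, the corrected write path becomes a chunked loop (a
simplified sketch of the new zvol_write() in the diff below, with the
range locking elided for brevity):

	static int
	zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
	{
		uint64_t volsize = zv->zv_volsize;
		int error = 0;

		while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
			/* Cap each pass at half of DMU_MAX_ACCESS (fixes 2). */
			uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
			uint64_t off = uio->uio_loffset;
			dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

			if (bytes > volsize - off)	/* don't write past the end */
				bytes = volsize - off;

			dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
				break;
			}
			/* Reuse the dbuf hold taken at open instead of a
			   per-IO dnode_hold() (fixes 1). */
			error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
			if (error == 0)
				zvol_log_write(zv, tx, off, bytes, sync);
			dmu_tx_commit(tx);
			if (error)
				break;
		}
		/* The caller folds FLUSH, FUA and sync=always into one
		   flag, so the log is committed at most once (fixes 3). */
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		return (error);
	}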

Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Chunwei Chen <tuxoko@gmail.com>
Closes #4316
Author:    Richard Yao
Date:      2016-02-05 20:36:07 -05:00
Committer: Brian Behlendorf
Parent:    c7e7ec1997
Commit:    a765a34a31

@@ -601,59 +601,42 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 }
 
 static int
-zvol_write(struct bio *bio)
+zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
 {
-	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
-	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
-	uint64_t size = BIO_BI_SIZE(bio);
-	int error = 0;
-	dmu_tx_t *tx;
+	uint64_t volsize = zv->zv_volsize;
 	rl_t *rl;
-	uio_t uio;
-
-	if (bio->bi_rw & VDEV_REQ_FLUSH)
-		zil_commit(zv->zv_zilog, ZVOL_OBJ);
-
-	/*
-	 * Some requests are just for flush and nothing else.
-	 */
-	if (size == 0)
-		goto out;
-
-	uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
-	uio.uio_skip = BIO_BI_SKIP(bio);
-	uio.uio_resid = size;
-	uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
-	uio.uio_loffset = offset;
-	uio.uio_limit = MAXOFFSET_T;
-	uio.uio_segflg = UIO_BVEC;
-
-	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+	int error = 0;
 
-	tx = dmu_tx_create(zv->zv_objset);
-	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);
+	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+	    RL_WRITER);
 
-	/* This will only fail for ENOSPC */
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-		zfs_range_unlock(rl);
-		goto out;
-	}
+	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
+		uint64_t off = uio->uio_loffset;
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 
-	error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, &uio, size, tx);
-	if (error == 0)
-		zvol_log_write(zv, tx, offset, size,
-		    !!(bio->bi_rw & VDEV_REQ_FUA));
+		if (bytes > volsize - off)	/* don't write past the end */
+			bytes = volsize - off;
 
-	dmu_tx_commit(tx);
+		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+
+		/* This will only fail for ENOSPC */
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			break;
+		}
+
+		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
+		if (error == 0)
+			zvol_log_write(zv, tx, off, bytes, sync);
+		dmu_tx_commit(tx);
+
+		if (error)
+			break;
+	}
 	zfs_range_unlock(rl);
 
-	if ((bio->bi_rw & VDEV_REQ_FUA) ||
-	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
+	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
-out:
 	return (error);
 }
@@ -733,64 +716,65 @@ zvol_discard(struct bio *bio)
 }
 
 static int
-zvol_read(struct bio *bio)
+zvol_read(zvol_state_t *zv, uio_t *uio)
 {
-	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
-	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
-	uint64_t size = BIO_BI_SIZE(bio);
-	int error;
+	uint64_t volsize = zv->zv_volsize;
 	rl_t *rl;
-	uio_t uio;
-
-	if (size == 0)
-		return (0);
-
-	uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
-	uio.uio_skip = BIO_BI_SKIP(bio);
-	uio.uio_resid = size;
-	uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
-	uio.uio_loffset = offset;
-	uio.uio_limit = MAXOFFSET_T;
-	uio.uio_segflg = UIO_BVEC;
+	int error = 0;
 
-	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+	    RL_READER);
 
-	error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, &uio, size);
+	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
+		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 
-	zfs_range_unlock(rl);
+		/* don't read past the end */
+		if (bytes > volsize - uio->uio_loffset)
+			bytes = volsize - uio->uio_loffset;
 
-	/* convert checksum errors into IO errors */
-	if (error == ECKSUM)
-		error = SET_ERROR(EIO);
+		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+		if (error) {
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			break;
+		}
+	}
+	zfs_range_unlock(rl);
 
 	return (error);
 }
 
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 {
+	uio_t uio;
 	zvol_state_t *zv = q->queuedata;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
-	uint64_t offset = BIO_BI_SECTOR(bio);
-	unsigned int sectors = bio_sectors(bio);
 	int rw = bio_data_dir(bio);
 #ifdef HAVE_GENERIC_IO_ACCT
 	unsigned long start = jiffies;
 #endif
 	int error = 0;
 
-	if (bio_has_data(bio) && offset + sectors >
-	    get_capacity(zv->zv_disk)) {
+	uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
+	uio.uio_skip = BIO_BI_SKIP(bio);
+	uio.uio_resid = BIO_BI_SIZE(bio);
+	uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
+	uio.uio_loffset = BIO_BI_SECTOR(bio) << 9;
+	uio.uio_limit = MAXOFFSET_T;
+	uio.uio_segflg = UIO_BVEC;
+
+	if (bio_has_data(bio) && uio.uio_loffset + uio.uio_resid >
+	    zv->zv_volsize) {
 		printk(KERN_INFO
-		    "%s: bad access: block=%llu, count=%lu\n",
+		    "%s: bad access: offset=%llu, size=%lu\n",
 		    zv->zv_disk->disk_name,
-		    (long long unsigned)offset,
-		    (long unsigned)sectors);
+		    (long long unsigned)uio.uio_loffset,
+		    (long unsigned)uio.uio_resid);
 		error = SET_ERROR(EIO);
 		goto out1;
 	}
 
-	generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);
+	generic_start_io_acct(rw, bio_sectors(bio), &zv->zv_disk->part0);
 
 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@@ -803,9 +787,20 @@ zvol_request(struct request_queue *q, struct bio *bio)
 			goto out2;
 		}
 
-		error = zvol_write(bio);
+		/*
+		 * Some requests are just for flush and nothing else.
+		 */
+		if (uio.uio_resid == 0) {
+			if (bio->bi_rw & VDEV_REQ_FLUSH)
+				zil_commit(zv->zv_zilog, ZVOL_OBJ);
+			goto out2;
+		}
+
+		error = zvol_write(zv, &uio,
+		    ((bio->bi_rw & (VDEV_REQ_FUA|VDEV_REQ_FLUSH)) ||
+		    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS));
 	} else
-		error = zvol_read(bio);
+		error = zvol_read(zv, &uio);
 
 out2:
 	generic_end_io_acct(rw, &zv->zv_disk->part0, start);