mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
Fix sync behavior for disk vdevs
Prior to b39c22b
, which was first generally available in the 0.6.5 release as b39c22b
, ZoL never actually submitted synchronous read or write requests to the Linux block layer. This means the vdev_disk_dio_is_sync() function had always returned false and, therefore, the completion in dio_request_t.dr_comp was never actually used. In b39c22b
, synchronous ZIO operations were translated to synchronous BIO requests in vdev_disk_io_start(). The follow-on commits 5592404
and aa159af
fixed several problems introduced by b39c22b
. In particular, 5592404
introduced the new flag parameter "wait" to __vdev_disk_physio() but under ZoL, since vdev_disk_physio() is never actually used, the wait flag was always zero so the new code had no effect other than to cause a bug in the use of the dio_request_t.dr_comp which was fixed by aa159af
. The original rationale for introducing synchronous operations in b39c22b
was to hurry certain requests through the BIO layer which would have otherwise been subject to its unplug timer which would increase the latency. This behavior of the unplug timer, however, went away during the transition of the plug/unplug system between kernels 2.6.32 and 2.6.39. To handle the unplug timer behavior on 2.6.32-2.6.35 kernels the BIO_RW_UNPLUG flag is used as a hint to suppress the plugging behavior. For kernels 2.6.36-2.6.38, the REQ_UNPLUG macro will be available and is used for the same purpose. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4858
This commit is contained in:
parent
273ff9b5cc
commit
e6603b7c1f
23
config/kernel-blk-queue-unplug.m4
Normal file
23
config/kernel-blk-queue-unplug.m4
Normal file
@ -0,0 +1,23 @@
|
||||
dnl #
|
||||
dnl # 2.6.32-2.6.35 API - The BIO_RW_UNPLUG enum can be used as a hint
|
||||
dnl # to unplug the queue.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BIO_RW_UNPLUG], [
|
||||
AC_MSG_CHECKING([whether the BIO_RW_UNPLUG enum is available])
|
||||
tmp_flags="$EXTRA_KCFLAGS"
|
||||
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
|
||||
ZFS_LINUX_TRY_COMPILE([
|
||||
#include <linux/blkdev.h>
|
||||
],[
|
||||
extern enum bio_rw_flags rw;
|
||||
|
||||
rw = BIO_RW_UNPLUG;
|
||||
],[
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG, 1,
|
||||
[BIO_RW_UNPLUG is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
EXTRA_KCFLAGS="$tmp_flags"
|
||||
])
|
@ -27,6 +27,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BIO_RW_UNPLUG
|
||||
ZFS_AC_KERNEL_GET_DISK_RO
|
||||
ZFS_AC_KERNEL_GET_GENDISK
|
||||
ZFS_AC_KERNEL_DISCARD_GRANULARITY
|
||||
|
@ -37,9 +37,11 @@ typedef struct vdev_disk {
|
||||
struct block_device *vd_bdev;
|
||||
} vdev_disk_t;
|
||||
|
||||
#ifndef __linux__
|
||||
extern int vdev_disk_physio(struct block_device *, caddr_t,
|
||||
size_t, uint64_t, int);
|
||||
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
|
||||
#endif
|
||||
|
||||
#endif /* _KERNEL */
|
||||
#endif /* _SYS_VDEV_DISK_H */
|
||||
|
@ -3971,7 +3971,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
return (0);
|
||||
}
|
||||
|
||||
#ifdef _KERNEL
|
||||
#if defined(_KERNEL) && !defined(__linux__)
|
||||
/*
|
||||
* Get the root pool information from the root disk, then import the root pool
|
||||
* during the system boot up time.
|
||||
@ -4174,7 +4174,7 @@ out:
|
||||
return (error);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif /* defined(_KERNEL) && !defined(__linux__) */
|
||||
|
||||
/*
|
||||
* Import a non-root pool into the system.
|
||||
@ -7038,7 +7038,6 @@ EXPORT_SYMBOL(spa_open);
|
||||
EXPORT_SYMBOL(spa_open_rewind);
|
||||
EXPORT_SYMBOL(spa_get_stats);
|
||||
EXPORT_SYMBOL(spa_create);
|
||||
EXPORT_SYMBOL(spa_import_rootpool);
|
||||
EXPORT_SYMBOL(spa_import);
|
||||
EXPORT_SYMBOL(spa_tryimport);
|
||||
EXPORT_SYMBOL(spa_destroy);
|
||||
|
@ -41,10 +41,8 @@ static void *zfs_vdev_holder = VDEV_HOLDER;
|
||||
* Virtual device vector for disks.
|
||||
*/
|
||||
typedef struct dio_request {
|
||||
struct completion dr_comp; /* Completion for sync IO */
|
||||
zio_t *dr_zio; /* Parent ZIO */
|
||||
atomic_t dr_ref; /* References */
|
||||
int dr_wait; /* Wait for IO */
|
||||
int dr_error; /* Bio error */
|
||||
int dr_bio_count; /* Count of bio's */
|
||||
struct bio *dr_bio[0]; /* Attached bio's */
|
||||
@ -363,7 +361,6 @@ vdev_disk_dio_alloc(int bio_count)
|
||||
dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
if (dr) {
|
||||
init_completion(&dr->dr_comp);
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
@ -425,7 +422,6 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
||||
{
|
||||
dio_request_t *dr = bio->bi_private;
|
||||
int rc;
|
||||
int wait;
|
||||
|
||||
if (dr->dr_error == 0) {
|
||||
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
@ -438,13 +434,8 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
||||
#endif
|
||||
}
|
||||
|
||||
wait = dr->dr_wait;
|
||||
/* Drop reference aquired by __vdev_disk_physio */
|
||||
rc = vdev_disk_dio_put(dr);
|
||||
|
||||
/* Wake up synchronous waiter this is the last outstanding bio */
|
||||
if (wait && rc == 1)
|
||||
complete(&dr->dr_comp);
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
@ -511,7 +502,7 @@ vdev_submit_bio(int rw, struct bio *bio)
|
||||
|
||||
static int
|
||||
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
|
||||
size_t kbuf_size, uint64_t kbuf_offset, int flags, int wait)
|
||||
size_t kbuf_size, uint64_t kbuf_offset, int flags)
|
||||
{
|
||||
dio_request_t *dr;
|
||||
caddr_t bio_ptr;
|
||||
@ -531,7 +522,6 @@ retry:
|
||||
|
||||
rw = flags;
|
||||
dr->dr_zio = zio;
|
||||
dr->dr_wait = wait;
|
||||
|
||||
/*
|
||||
* When the IO size exceeds the maximum bio size for the request
|
||||
@ -593,32 +583,20 @@ retry:
|
||||
if (dr->dr_bio[i])
|
||||
vdev_submit_bio(rw, dr->dr_bio[i]);
|
||||
|
||||
/*
|
||||
* On synchronous blocking requests we wait for all bio the completion
|
||||
* callbacks to run. We will be woken when the last callback runs
|
||||
* for this dio. We are responsible for putting the last dio_request
|
||||
* reference will in turn put back the last bio references. The
|
||||
* only synchronous consumer is vdev_disk_read_rootlabel() all other
|
||||
* IO originating from vdev_disk_io_start() is asynchronous.
|
||||
*/
|
||||
if (wait) {
|
||||
wait_for_completion(&dr->dr_comp);
|
||||
error = dr->dr_error;
|
||||
ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
|
||||
}
|
||||
|
||||
(void) vdev_disk_dio_put(dr);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
#ifndef __linux__
|
||||
int
|
||||
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
|
||||
size_t size, uint64_t offset, int flags)
|
||||
{
|
||||
bio_set_flags_failfast(bdev, &flags);
|
||||
return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags, 1));
|
||||
return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags));
|
||||
}
|
||||
#endif
|
||||
|
||||
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, rc)
|
||||
{
|
||||
@ -667,7 +645,6 @@ vdev_disk_io_start(zio_t *zio)
|
||||
{
|
||||
vdev_t *v = zio->io_vd;
|
||||
vdev_disk_t *vd = v->vdev_tsd;
|
||||
zio_priority_t pri = zio->io_priority;
|
||||
int flags, error;
|
||||
|
||||
switch (zio->io_type) {
|
||||
@ -707,17 +684,23 @@ vdev_disk_io_start(zio_t *zio)
|
||||
zio_execute(zio);
|
||||
return;
|
||||
case ZIO_TYPE_WRITE:
|
||||
if ((pri == ZIO_PRIORITY_SYNC_WRITE) && (v->vdev_nonrot))
|
||||
flags = WRITE_SYNC;
|
||||
else
|
||||
flags = WRITE;
|
||||
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
|
||||
flags = WRITE | (1 << BIO_RW_UNPLUG);
|
||||
#elif defined(REQ_UNPLUG)
|
||||
flags = WRITE | REQ_UNPLUG;
|
||||
#else
|
||||
flags = WRITE;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case ZIO_TYPE_READ:
|
||||
if ((pri == ZIO_PRIORITY_SYNC_READ) && (v->vdev_nonrot))
|
||||
flags = READ_SYNC;
|
||||
else
|
||||
flags = READ;
|
||||
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
|
||||
flags = READ | (1 << BIO_RW_UNPLUG);
|
||||
#elif defined(REQ_UNPLUG)
|
||||
flags = READ | REQ_UNPLUG;
|
||||
#else
|
||||
flags = READ;
|
||||
#endif
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -728,7 +711,7 @@ vdev_disk_io_start(zio_t *zio)
|
||||
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
|
||||
zio->io_size, zio->io_offset, flags, 0);
|
||||
zio->io_size, zio->io_offset, flags);
|
||||
if (error) {
|
||||
zio->io_error = error;
|
||||
zio_interrupt(zio);
|
||||
@ -798,6 +781,7 @@ vdev_ops_t vdev_disk_ops = {
|
||||
B_TRUE /* leaf vdev */
|
||||
};
|
||||
|
||||
#ifndef __linux__
|
||||
/*
|
||||
* Given the root disk device devid or pathname, read the label from
|
||||
* the device, and construct a configuration nvlist.
|
||||
@ -860,6 +844,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
|
||||
|
||||
return (0);
|
||||
}
|
||||
#endif /* __linux__ */
|
||||
|
||||
module_param(zfs_vdev_scheduler, charp, 0644);
|
||||
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
|
||||
|
Loading…
Reference in New Issue
Block a user