Merge branch 'zvol'

Performance improvements for zvols.

Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3720
This commit is contained in:
Brian Behlendorf 2015-09-04 13:02:48 -07:00
commit e20cd6f7a8
21 changed files with 292 additions and 704 deletions

View File

@ -0,0 +1,25 @@
dnl #
dnl # Interface for issuing a barrier bio:
dnl # 2.6.28-2.6.35: BIO_RW_BARRIER
dnl # 2.6.36-3.x: REQ_BARRIER
dnl #
dnl # Since REQ_BARRIER is a preprocessor definition, there is no need for an
dnl # autotools check for it. Also, REQ_BARRIER existed in the request layer
dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
dnl # request layer and bio layer flags, so it would be wrong to assume that
dnl # the APIs are mutually exclusive contrary to the typical case.
dnl #
dnl # Defines HAVE_BIO_RW_BARRIER when a test program that includes
dnl # <linux/bio.h> and assigns BIO_RW_BARRIER to an int compiles.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_BARRIER], [
AC_MSG_CHECKING([whether BIO_RW_BARRIER is defined])
ZFS_LINUX_TRY_COMPILE([
#include <linux/bio.h>
],[
int flags __attribute__ ((unused));
flags = BIO_RW_BARRIER;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BIO_RW_BARRIER, 1, [BIO_RW_BARRIER is defined])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -0,0 +1,25 @@
dnl #
dnl # Interface for issuing a discard bio:
dnl # 2.6.28-2.6.35: BIO_RW_DISCARD
dnl # 2.6.36-3.x: REQ_DISCARD
dnl #
dnl # Since REQ_DISCARD is a preprocessor definition, there is no need for an
dnl # autotools check for it. Also, REQ_DISCARD existed in the request layer
dnl # until torvalds/linux@7b6d91daee5cac6402186ff224c3af39d79f4a0e unified the
dnl # request layer and bio layer flags, so it would be wrong to assume that
dnl # the APIs are mutually exclusive contrary to the typical case.
dnl #
dnl # Defines HAVE_BIO_RW_DISCARD when a test program that includes
dnl # <linux/bio.h> and assigns BIO_RW_DISCARD to an int compiles.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BIO_RW_DISCARD], [
AC_MSG_CHECKING([whether BIO_RW_DISCARD is defined])
ZFS_LINUX_TRY_COMPILE([
#include <linux/bio.h>
],[
int flags __attribute__ ((unused));
flags = BIO_RW_DISCARD;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BIO_RW_DISCARD, 1, [BIO_RW_DISCARD is defined])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -1,40 +0,0 @@
dnl #
dnl # 2.6.31 API change
dnl # In 2.6.29 kernels blk_end_request() was a GPL-only symbol, this was
dnl # changed in 2.6.31 so it may be used by non-GPL modules.
dnl #
dnl # Two checks are run with EXTRA_KCFLAGS temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE and restored afterwards:
dnl # 1. HAVE_BLK_END_REQUEST is defined when a call to blk_end_request()
dnl #    compiles.
dnl # 2. The same call is built in a module that declares
dnl #    MODULE_LICENSE with ZFS_META_LICENSE.  Note the branches are
dnl #    inverted: a successful build reports "no", a failed build reports
dnl #    "yes" and defines HAVE_BLK_END_REQUEST_GPL_ONLY.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_END_REQUEST], [
AC_MSG_CHECKING([whether blk_end_request() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request *req = NULL;
(void) blk_end_request(req, 0, 0);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_END_REQUEST, 1,
[blk_end_request() is available])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether blk_end_request() is GPL-only])
ZFS_LINUX_TRY_COMPILE([
#include <linux/module.h>
#include <linux/blkdev.h>
MODULE_LICENSE("$ZFS_META_LICENSE");
],[
struct request *req = NULL;
(void) blk_end_request(req, 0, 0);
],[
AC_MSG_RESULT(no)
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_END_REQUEST_GPL_ONLY, 1,
[blk_end_request() is GPL-only])
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,25 +0,0 @@
dnl #
dnl # 2.6.31 API change
dnl # Request queue peek/retrieval interface cleanup, the blk_fetch_request()
dnl # function replaces the elv_next_request() and blkdev_dequeue_request()
dnl # functions. The blk_fetch_request() function returns the next
dnl # available request and removes it from the request queue.
dnl #
dnl # Defines HAVE_BLK_FETCH_REQUEST when a call to blk_fetch_request()
dnl # compiles.  EXTRA_KCFLAGS is temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE for the test and restored afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_FETCH_REQUEST], [
AC_MSG_CHECKING([whether blk_fetch_request() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q = NULL;
(void) blk_fetch_request(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_FETCH_REQUEST, 1,
[blk_fetch_request() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,22 +0,0 @@
dnl #
dnl # 2.6.32 API change
dnl # Discard requests were moved to the normal I/O path.
dnl #
dnl # Defines HAVE_BLK_QUEUE_DISCARD when a use of blk_queue_discard() on a
dnl # struct request_queue pointer compiles.  EXTRA_KCFLAGS is temporarily
dnl # replaced by NO_UNUSED_BUT_SET_VARIABLE for the test and restored
dnl # afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [
AC_MSG_CHECKING([whether blk_queue_discard() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q = NULL;
(void) blk_queue_discard(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1,
[blk_queue_discard() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,25 +0,0 @@
dnl #
dnl # 2.6.27 API change
dnl # The blk_queue_nonrot() function and QUEUE_FLAG_NONROT flag were
dnl # added so non-rotational devices could be identified. These devices
dnl # have no seek time which the higher level elevator uses to optimize
dnl # how the I/O is issued to the device.
dnl #
dnl # Defines HAVE_BLK_QUEUE_NONROT when a use of blk_queue_nonrot() on a
dnl # struct request_queue pointer compiles.  EXTRA_KCFLAGS is temporarily
dnl # replaced by NO_UNUSED_BUT_SET_VARIABLE for the test and restored
dnl # afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_NONROT], [
AC_MSG_CHECKING([whether blk_queue_nonrot() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q = NULL;
(void) blk_queue_nonrot(q);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_QUEUE_NONROT, 1,
[blk_queue_nonrot() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,25 +0,0 @@
dnl #
dnl # 2.6.31 API change
dnl # Request queue peek/retrieval interface cleanup, the
dnl # elv_requeue_request() function has been replaced with the
dnl # blk_requeue_request() function.
dnl #
dnl # Defines HAVE_BLK_REQUEUE_REQUEST when a call to
dnl # blk_requeue_request() taking a queue and a request compiles.
dnl # EXTRA_KCFLAGS is temporarily replaced by NO_UNUSED_BUT_SET_VARIABLE
dnl # for the test and restored afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_REQUEUE_REQUEST], [
AC_MSG_CHECKING([whether blk_requeue_request() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request_queue *q = NULL;
struct request *req = NULL;
blk_requeue_request(q, req);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_REQUEUE_REQUEST, 1,
[blk_requeue_request() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,41 +0,0 @@
dnl #
dnl # 2.6.29 API change
dnl # In the 2.6.29 kernel blk_rq_bytes() was available as a GPL-only symbol.
dnl # So we need to check the symbol license as well. As of 2.6.31 the
dnl # blk_rq_bytes() helper was changed to a static inline which we can use.
dnl #
dnl # Two checks are run with EXTRA_KCFLAGS temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE and restored afterwards:
dnl # 1. HAVE_BLK_RQ_BYTES is defined when a call to blk_rq_bytes()
dnl #    compiles.
dnl # 2. The same call is built in a module that declares
dnl #    MODULE_LICENSE with ZFS_META_LICENSE.  Note the branches are
dnl #    inverted: a successful build reports "no", a failed build reports
dnl #    "yes" and defines HAVE_BLK_RQ_BYTES_GPL_ONLY.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_BYTES], [
AC_MSG_CHECKING([whether blk_rq_bytes() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request *req = NULL;
(void) blk_rq_bytes(req);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_RQ_BYTES, 1,
[blk_rq_bytes() is available])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether blk_rq_bytes() is GPL-only])
ZFS_LINUX_TRY_COMPILE([
#include <linux/module.h>
#include <linux/blkdev.h>
MODULE_LICENSE("$ZFS_META_LICENSE");
],[
struct request *req = NULL;
(void) blk_rq_bytes(req);
],[
AC_MSG_RESULT(no)
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_RQ_BYTES_GPL_ONLY, 1,
[blk_rq_bytes() is GPL-only])
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,21 +0,0 @@
dnl #
dnl # 2.6.31 API change
dnl #
dnl # Defines HAVE_BLK_RQ_POS when a call to blk_rq_pos() on a struct
dnl # request pointer compiles.  EXTRA_KCFLAGS is temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE for the test and restored afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_POS], [
AC_MSG_CHECKING([whether blk_rq_pos() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request *req = NULL;
(void) blk_rq_pos(req);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_RQ_POS, 1,
[blk_rq_pos() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,21 +0,0 @@
dnl #
dnl # 2.6.31 API change
dnl #
dnl # Defines HAVE_BLK_RQ_SECTORS when a call to blk_rq_sectors() on a
dnl # struct request pointer compiles.  EXTRA_KCFLAGS is temporarily
dnl # replaced by NO_UNUSED_BUT_SET_VARIABLE for the test and restored
dnl # afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLK_RQ_SECTORS], [
AC_MSG_CHECKING([whether blk_rq_sectors() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request *req = NULL;
(void) blk_rq_sectors(req);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_RQ_SECTORS, 1,
[blk_rq_sectors() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -0,0 +1,33 @@
dnl #
dnl # 2.6.34 API change
dnl # current->bio_tail and current->bio_list were struct bio pointers prior to
dnl # Linux 2.6.34. They were refactored into a struct bio_list pointer called
dnl # current->bio_list in Linux 2.6.34.
dnl #
dnl # The checks are tried in order and stop at the first that compiles:
dnl # 1. Assigning a struct bio ** to current->bio_tail defines
dnl #    HAVE_CURRENT_BIO_TAIL.
dnl # 2. Otherwise, assigning a struct bio_list * to current->bio_list
dnl #    defines HAVE_CURRENT_BIO_LIST.
dnl # If neither compiles, configure stops with an error asking for a bug
dnl # report.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_CURRENT_BIO_TAIL], [
AC_MSG_CHECKING([whether current->bio_tail exists])
ZFS_LINUX_TRY_COMPILE([
#include <linux/sched.h>
],[
current->bio_tail = (struct bio **) NULL;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_CURRENT_BIO_TAIL, 1,
[current->bio_tail exists])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether current->bio_list exists])
ZFS_LINUX_TRY_COMPILE([
#include <linux/sched.h>
],[
current->bio_list = (struct bio_list *) NULL;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_CURRENT_BIO_LIST, 1,
[current->bio_list exists])
],[
AC_MSG_ERROR(no - Please file a bug report at
https://github.com/zfsonlinux/zfs/issues/new)
])
])
])

View File

@ -0,0 +1,43 @@
dnl #
dnl # Linux 3.2 API Change
dnl # make_request_fn returns void instead of int.
dnl #
dnl # The checks are tried in order and stop at the first that compiles:
dnl # 1. Registering an int-returning handler with blk_queue_make_request()
dnl #    defines MAKE_REQUEST_FN_RET as int and also defines
dnl #    HAVE_MAKE_REQUEST_FN_RET_INT.
dnl # 2. Otherwise, registering a void-returning handler defines
dnl #    MAKE_REQUEST_FN_RET as void.
dnl # If neither compiles, configure stops with an error asking for a bug
dnl # report.
dnl #
dnl # NOTE(review): telling the two cases apart relies on a handler with the
dnl # wrong return type failing to compile, presumably because
dnl # ZFS_LINUX_TRY_COMPILE treats the pointer-type warning as an error;
dnl # confirm against that macro.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
AC_MSG_CHECKING([whether make_request_fn() returns int])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
int make_request(struct request_queue *q, struct bio *bio)
{
return (0);
}
],[
blk_queue_make_request(NULL, &make_request);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(MAKE_REQUEST_FN_RET, int,
[make_request_fn() returns int])
AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_INT, 1,
[Noting that make_request_fn() returns int])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether make_request_fn() returns void])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
void make_request(struct request_queue *q, struct bio *bio)
{
return;
}
],[
blk_queue_make_request(NULL, &make_request);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(MAKE_REQUEST_FN_RET, void,
[make_request_fn() returns void])
],[
AC_MSG_ERROR(no - Please file a bug report at
https://github.com/zfsonlinux/zfs/issues/new)
])
])
])

View File

@ -1,47 +0,0 @@
dnl #
dnl # 2.6.x API change
dnl #
dnl # 3.14 API change
dnl #
dnl # Probes which iterator type rq_for_each_segment() accepts.  Both checks
dnl # always run, with EXTRA_KCFLAGS temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE and restored afterwards:
dnl # 1. If the loop compiles with a struct bio_vec pointer, defines
dnl #    HAVE_RQ_FOR_EACH_SEGMENT and HAVE_RQ_FOR_EACH_SEGMENT_BVP.
dnl # 2. If the loop compiles with a struct bio_vec value, defines
dnl #    HAVE_RQ_FOR_EACH_SEGMENT and HAVE_RQ_FOR_EACH_SEGMENT_BV.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec *])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct bio_vec *bv;
struct req_iterator iter;
struct request *req = NULL;
rq_for_each_segment(bv, req, iter) { }
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1,
[rq_for_each_segment() is available])
AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BVP, 1,
[rq_for_each_segment() wants bio_vec *])
],[
AC_MSG_RESULT(no)
])
AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec])
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct bio_vec bv;
struct req_iterator iter;
struct request *req = NULL;
rq_for_each_segment(bv, req, iter) { }
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1,
[rq_for_each_segment() is available])
AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BV, 1,
[rq_for_each_segment() wants bio_vec])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -1,21 +0,0 @@
dnl #
dnl # 2.6.x API change
dnl #
dnl # Defines HAVE_RQ_IS_SYNC when a use of rq_is_sync() on a struct request
dnl # pointer compiles.  EXTRA_KCFLAGS is temporarily replaced by
dnl # NO_UNUSED_BUT_SET_VARIABLE for the test and restored afterwards.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_RQ_IS_SYNC], [
AC_MSG_CHECKING([whether rq_is_sync() is available])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct request *req = NULL;
(void) rq_is_sync(req);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RQ_IS_SYNC, 1,
[rq_is_sync() is available])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -7,6 +7,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_TEST_MODULE ZFS_AC_TEST_MODULE
ZFS_AC_KERNEL_CONFIG ZFS_AC_KERNEL_CONFIG
ZFS_AC_KERNEL_DECLARE_EVENT_CLASS ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
ZFS_AC_KERNEL_CURRENT_BIO_TAIL
ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS
ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
ZFS_AC_KERNEL_TYPE_FMODE_T ZFS_AC_KERNEL_TYPE_FMODE_T
@ -22,24 +23,16 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_BIO_FAILFAST_DTD ZFS_AC_KERNEL_BIO_FAILFAST_DTD
ZFS_AC_KERNEL_REQ_FAILFAST_MASK ZFS_AC_KERNEL_REQ_FAILFAST_MASK
ZFS_AC_KERNEL_BIO_END_IO_T_ARGS ZFS_AC_KERNEL_BIO_END_IO_T_ARGS
ZFS_AC_KERNEL_BIO_RW_BARRIER
ZFS_AC_KERNEL_BIO_RW_DISCARD
ZFS_AC_KERNEL_BIO_RW_SYNC ZFS_AC_KERNEL_BIO_RW_SYNC
ZFS_AC_KERNEL_BIO_RW_SYNCIO ZFS_AC_KERNEL_BIO_RW_SYNCIO
ZFS_AC_KERNEL_REQ_SYNC ZFS_AC_KERNEL_REQ_SYNC
ZFS_AC_KERNEL_BLK_END_REQUEST
ZFS_AC_KERNEL_BLK_QUEUE_FLUSH ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
ZFS_AC_KERNEL_BLK_QUEUE_NONROT
ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
ZFS_AC_KERNEL_BLK_FETCH_REQUEST
ZFS_AC_KERNEL_BLK_REQUEUE_REQUEST
ZFS_AC_KERNEL_BLK_RQ_BYTES
ZFS_AC_KERNEL_BLK_RQ_POS
ZFS_AC_KERNEL_BLK_RQ_SECTORS
ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GET_DISK_RO
ZFS_AC_KERNEL_GET_GENDISK ZFS_AC_KERNEL_GET_GENDISK
ZFS_AC_KERNEL_RQ_IS_SYNC
ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT
ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_DISCARD_GRANULARITY
ZFS_AC_KERNEL_CONST_XATTR_HANDLER ZFS_AC_KERNEL_CONST_XATTR_HANDLER
ZFS_AC_KERNEL_XATTR_HANDLER_GET ZFS_AC_KERNEL_XATTR_HANDLER_GET
@ -100,6 +93,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_RW_ITERATE
ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
ZFS_AC_KERNEL_MAKE_REQUEST_FN
AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ" KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"

View File

@ -36,102 +36,6 @@
typedef unsigned __bitwise__ fmode_t; typedef unsigned __bitwise__ fmode_t;
#endif /* HAVE_FMODE_T */ #endif /* HAVE_FMODE_T */
#ifndef HAVE_BLK_FETCH_REQUEST
/*
 * Compatibility blk_fetch_request() for kernels that lack it: peek at the
 * next request with elv_next_request() and, when there is one, dequeue it
 * with blkdev_dequeue_request() before handing it back.  Returns NULL when
 * elv_next_request() yields nothing.
 */
static inline struct request *
blk_fetch_request(struct request_queue *q)
{
	struct request *next = elv_next_request(q);

	if (next == NULL)
		return (NULL);

	blkdev_dequeue_request(next);
	return (next);
}
#endif /* HAVE_BLK_FETCH_REQUEST */
#ifndef HAVE_BLK_REQUEUE_REQUEST
/*
 * Compatibility blk_requeue_request() for kernels that lack it; simply
 * forwards the queue and request to elv_requeue_request().
 */
static inline void
blk_requeue_request(request_queue_t *q, struct request *req)
{
elv_requeue_request(q, req);
}
#endif /* HAVE_BLK_REQUEUE_REQUEST */
#ifndef HAVE_BLK_END_REQUEST
/*
 * Compatibility __blk_end_request() built on end_request().  The whole
 * request is ended in one call; the function always returns 0.
 */
static inline bool
__blk_end_request(struct request *req, int error, unsigned int nr_bytes)
{
LIST_HEAD(list);
/*
* Request has already been dequeued but 2.6.18 version of
* end_request() unconditionally dequeues the request so we
* add it to a local list to prevent hitting the BUG_ON.
*/
list_add(&req->queuelist, &list);
/*
* The old API required the driver to end each segment and not
* the entire request. In our case we always need to end the
* entire request; partial requests are not supported.
*/
req->hard_cur_sectors = nr_bytes >> 9;
/* end_request() is passed 1 when error is 0, otherwise error itself. */
end_request(req, ((error == 0) ? 1 : error));
return (0);
}
/*
 * Compatibility blk_end_request(): runs __blk_end_request() with the
 * request queue's queue_lock held via spin_lock_irq().
 */
static inline bool
blk_end_request(struct request *req, int error, unsigned int nr_bytes)
{
struct request_queue *q = req->q;
bool rc;
spin_lock_irq(q->queue_lock);
rc = __blk_end_request(req, error, nr_bytes);
spin_unlock_irq(q->queue_lock);
return (rc);
}
#else
#ifdef HAVE_BLK_END_REQUEST_GPL_ONLY
/*
* Define required to avoid conflicting 2.6.29 non-static prototype for a
* GPL-only version of the helper. As of 2.6.31 the helper is available
* to non-GPL modules and is not explicitly exported GPL-only.
*/
#define __blk_end_request __blk_end_request_x
#define blk_end_request blk_end_request_x
/*
 * Replacement for the GPL-only __blk_end_request(); unlike the fallback
 * above it does not first add the request to a local list.  Always
 * returns 0.
 */
static inline bool
__blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
{
/*
* The old API required the driver to end each segment and not
* the entire request. In our case we always need to end the
* entire request; partial requests are not supported.
*/
req->hard_cur_sectors = nr_bytes >> 9;
/* end_request() is passed 1 when error is 0, otherwise error itself. */
end_request(req, ((error == 0) ? 1 : error));
return (0);
}
/*
 * Replacement for the GPL-only blk_end_request(): runs
 * __blk_end_request_x() with the request queue's queue_lock held via
 * spin_lock_irq().
 */
static inline bool
blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
{
struct request_queue *q = req->q;
bool rc;
spin_lock_irq(q->queue_lock);
rc = __blk_end_request_x(req, error, nr_bytes);
spin_unlock_irq(q->queue_lock);
return (rc);
}
#endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */
#endif /* HAVE_BLK_END_REQUEST */
/* /*
* 2.6.36 API change, * 2.6.36 API change,
* The blk_queue_flush() interface has replaced blk_queue_ordered() * The blk_queue_flush() interface has replaced blk_queue_ordered()
@ -148,37 +52,6 @@ __blk_queue_flush(struct request_queue *q, unsigned int flags)
q->flush_flags = flags & (REQ_FLUSH | REQ_FUA); q->flush_flags = flags & (REQ_FLUSH | REQ_FUA);
} }
#endif /* HAVE_BLK_QUEUE_FLUSH && HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ #endif /* HAVE_BLK_QUEUE_FLUSH && HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */
#ifndef HAVE_BLK_RQ_POS
/*
 * Compatibility blk_rq_pos() for kernels that lack it; returns the
 * request's sector field.
 */
static inline sector_t
blk_rq_pos(struct request *req)
{
return (req->sector);
}
#endif /* HAVE_BLK_RQ_POS */
#ifndef HAVE_BLK_RQ_SECTORS
/*
 * Compatibility blk_rq_sectors() for kernels that lack it; returns the
 * request's nr_sectors field.
 */
static inline unsigned int
blk_rq_sectors(struct request *req)
{
return (req->nr_sectors);
}
#endif /* HAVE_BLK_RQ_SECTORS */
#if !defined(HAVE_BLK_RQ_BYTES) || defined(HAVE_BLK_RQ_BYTES_GPL_ONLY)
/*
* Define required to avoid conflicting 2.6.29 non-static prototype for a
* GPL-only version of the helper. As of 2.6.31 the helper is available
* to non-GPL modules in the form of a static inline in the header.
*
* The replacement returns blk_rq_sectors() shifted left by 9, i.e. the
* sector count multiplied by 512.
*/
#define blk_rq_bytes __blk_rq_bytes
static inline unsigned int
__blk_rq_bytes(struct request *req)
{
return (blk_rq_sectors(req) << 9);
}
#endif /* !HAVE_BLK_RQ_BYTES || HAVE_BLK_RQ_BYTES_GPL_ONLY */
/* /*
* Most of the blk_* macros were removed in 2.6.36. Ostensibly this was * Most of the blk_* macros were removed in 2.6.36. Ostensibly this was
* done to improve readability and allow easier grepping. However, from * done to improve readability and allow easier grepping. However, from
@ -241,64 +114,20 @@ get_disk_ro(struct gendisk *disk)
} }
#endif /* HAVE_GET_DISK_RO */ #endif /* HAVE_GET_DISK_RO */
#ifndef HAVE_RQ_IS_SYNC
/*
 * Compatibility rq_is_sync() for kernels that lack it; reports whether
 * REQ_RW_SYNC is set in the request's flags field.
 */
static inline bool
rq_is_sync(struct request *req)
{
return (req->flags & REQ_RW_SYNC);
}
#endif /* HAVE_RQ_IS_SYNC */
#ifndef HAVE_RQ_FOR_EACH_SEGMENT
/*
 * Fallback request iteration helpers for kernels that do not provide
 * rq_for_each_segment().  The fallback iterates with a struct bio_vec
 * pointer, so HAVE_RQ_FOR_EACH_SEGMENT_BVP is defined alongside it.
 *
 * Every macro argument is parenthesized so that callers may pass
 * arbitrary expressions (for example "reqs + 1" or "&iter_array[i]").
 */
struct req_iterator {
	int i;
	struct bio *bio;
};

/* Continue from the current value of _bio along the bi_next chain. */
#define	for_each_bio(_bio)						\
	for (; (_bio); (_bio) = (_bio)->bi_next)

/* Visit every bio chained off rq->bio; does nothing when it is NULL. */
#define	__rq_for_each_bio(_bio, rq)					\
	if ((rq)->bio)							\
		for ((_bio) = (rq)->bio; (_bio); (_bio) = (_bio)->bi_next)

/* Visit every segment of every bio of _rq, tracking position in _iter. */
#define	rq_for_each_segment(bvl, _rq, _iter)				\
	__rq_for_each_bio((_iter).bio, _rq)				\
		bio_for_each_segment(bvl, (_iter).bio, (_iter).i)

#define	HAVE_RQ_FOR_EACH_SEGMENT_BVP 1
#endif /* HAVE_RQ_FOR_EACH_SEGMENT */
/*
* 3.14 API change
* rq_for_each_segment changed from taking bio_vec * to taking bio_vec.
* We provide rq_for_each_segment4 which takes both.
* You should not modify the fields in @bv and @bvp.
*
* Note: the if-else is just to inject the assignment before the loop body.
* The condition is a comma expression that performs the assignment and then
* evaluates to 0, so the empty "if" arm is never taken and the caller's
* loop body becomes the "else" arm.
*/
#ifdef HAVE_RQ_FOR_EACH_SEGMENT_BVP
/* Iterator yields a pointer: copy the pointed-to bio_vec into bv. */
#define rq_for_each_segment4(bv, bvp, rq, iter) \
rq_for_each_segment(bvp, rq, iter) \
if ((bv = *bvp), 0) \
; \
else
#else
/* Iterator yields a value: point bvp at bv. */
#define rq_for_each_segment4(bv, bvp, rq, iter) \
rq_for_each_segment(bv, rq, iter) \
if ((bvp = &bv), 0) \
; \
else
#endif
#ifdef HAVE_BIO_BVEC_ITER #ifdef HAVE_BIO_BVEC_ITER
#define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector
#define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size #define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size
#define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx #define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx
#define bio_for_each_segment4(bv, bvp, b, i) \
bio_for_each_segment((bv), (b), (i))
typedef struct bvec_iter bvec_iterator_t;
#else #else
#define BIO_BI_SECTOR(bio) (bio)->bi_sector #define BIO_BI_SECTOR(bio) (bio)->bi_sector
#define BIO_BI_SIZE(bio) (bio)->bi_size #define BIO_BI_SIZE(bio) (bio)->bi_size
#define BIO_BI_IDX(bio) (bio)->bi_idx #define BIO_BI_IDX(bio) (bio)->bi_idx
#define bio_for_each_segment4(bv, bvp, b, i) \
bio_for_each_segment((bvp), (b), (i))
typedef int bvec_iterator_t;
#endif #endif
/* /*
@ -457,17 +286,30 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
#define VDEV_REQ_FUA REQ_FUA #define VDEV_REQ_FUA REQ_FUA
#else #else
#define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER #define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER
#ifdef HAVE_BIO_RW_BARRIER
#define VDEV_REQ_FLUSH (1 << BIO_RW_BARRIER)
#define VDEV_REQ_FUA (1 << BIO_RW_BARRIER)
#else
#define VDEV_REQ_FLUSH REQ_HARDBARRIER #define VDEV_REQ_FLUSH REQ_HARDBARRIER
#define VDEV_REQ_FUA REQ_HARDBARRIER #define VDEV_REQ_FUA REQ_FUA
#endif
#endif #endif
/* /*
* 2.6.32 API change * 2.6.32 API change
* Use the normal I/O path for discards. * Use the normal I/O path for discards.
*/ */
#ifdef REQ_DISCARD #ifdef QUEUE_FLAG_DISCARD
#ifdef HAVE_BIO_RW_DISCARD
#define VDEV_REQ_DISCARD (1 << BIO_RW_DISCARD)
#else
#define VDEV_REQ_DISCARD REQ_DISCARD #define VDEV_REQ_DISCARD REQ_DISCARD
#endif #endif
#else
#error "Allowing the build will cause discard requests to become writes "
"potentially triggering the DMU_MAX_ACCESS assertion. Please file a "
"an issue report at: https://github.com/zfsonlinux/zfs/issues/new"
#endif
/* /*
* 2.6.33 API change * 2.6.33 API change

View File

@ -710,8 +710,8 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx);
#ifdef _KERNEL #ifdef _KERNEL
#include <linux/blkdev_compat.h> #include <linux/blkdev_compat.h>
int dmu_read_req(objset_t *os, uint64_t object, struct request *req); int dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio);
int dmu_write_req(objset_t *os, uint64_t object, struct request *req, int dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio,
dmu_tx_t *tx); dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);

View File

@ -1591,17 +1591,6 @@ Max number of blocks to discard at once
Default value: \fB16,384\fR. Default value: \fB16,384\fR.
.RE .RE
.sp
.ne 2
.na
\fBzvol_threads\fR (uint)
.ad
.RS 12n
Max number of threads to handle zvol I/O requests
.sp
Default value: \fB32\fR.
.RE
.SH ZFS I/O SCHEDULER .SH ZFS I/O SCHEDULER
ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
The I/O scheduler determines when and in what order those operations are The I/O scheduler determines when and in what order those operations are

View File

@ -1049,15 +1049,16 @@ xuio_stat_wbuf_nocopy()
* return value is the number of bytes successfully copied to arg_buf. * return value is the number of bytes successfully copied to arg_buf.
*/ */
static int static int
dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) dmu_bio_copy(void *arg_buf, int size, struct bio *bio, size_t bio_offset)
{ {
struct bio_vec bv, *bvp; struct bio_vec bv, *bvp = &bv;
struct req_iterator iter; bvec_iterator_t iter;
char *bv_buf; char *bv_buf;
int tocpy, bv_len, bv_offset; int tocpy, bv_len, bv_offset;
int offset = 0; int offset = 0;
rq_for_each_segment4(bv, bvp, req, iter) { bio_for_each_segment4(bv, bvp, bio, iter) {
/* /*
* Fully consumed the passed arg_buf. We use goto here because * Fully consumed the passed arg_buf. We use goto here because
* rq_for_each_segment is a double loop * rq_for_each_segment is a double loop
@ -1066,23 +1067,23 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset)
if (size == offset) if (size == offset)
goto out; goto out;
/* Skip already copied bv */ /* Skip already copied bvp */
if (req_offset >= bv.bv_len) { if (bio_offset >= bvp->bv_len) {
req_offset -= bv.bv_len; bio_offset -= bvp->bv_len;
continue; continue;
} }
bv_len = bv.bv_len - req_offset; bv_len = bvp->bv_len - bio_offset;
bv_offset = bv.bv_offset + req_offset; bv_offset = bvp->bv_offset + bio_offset;
req_offset = 0; bio_offset = 0;
tocpy = MIN(bv_len, size - offset); tocpy = MIN(bv_len, size - offset);
ASSERT3S(tocpy, >=, 0); ASSERT3S(tocpy, >=, 0);
bv_buf = page_address(bv.bv_page) + bv_offset; bv_buf = page_address(bvp->bv_page) + bv_offset;
ASSERT3P(bv_buf, !=, NULL); ASSERT3P(bv_buf, !=, NULL);
if (rq_data_dir(req) == WRITE) if (bio_data_dir(bio) == WRITE)
memcpy(arg_buf + offset, bv_buf, tocpy); memcpy(arg_buf + offset, bv_buf, tocpy);
else else
memcpy(bv_buf, arg_buf + offset, tocpy); memcpy(bv_buf, arg_buf + offset, tocpy);
@ -1094,13 +1095,13 @@ out:
} }
int int
dmu_read_req(objset_t *os, uint64_t object, struct request *req) dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio)
{ {
uint64_t size = blk_rq_bytes(req); uint64_t offset = BIO_BI_SECTOR(bio) << 9;
uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = BIO_BI_SIZE(bio);
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs, i, err; int numbufs, i, err;
size_t req_offset; size_t bio_offset;
/* /*
* NB: we could do this block-at-a-time, but it's nice * NB: we could do this block-at-a-time, but it's nice
@ -1111,7 +1112,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
if (err) if (err)
return (err); return (err);
req_offset = 0; bio_offset = 0;
for (i = 0; i < numbufs; i++) { for (i = 0; i < numbufs; i++) {
uint64_t tocpy; uint64_t tocpy;
int64_t bufoff; int64_t bufoff;
@ -1125,8 +1126,8 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
if (tocpy == 0) if (tocpy == 0)
break; break;
didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
req_offset); bio_offset);
if (didcpy < tocpy) if (didcpy < tocpy)
err = EIO; err = EIO;
@ -1136,7 +1137,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
size -= tocpy; size -= tocpy;
offset += didcpy; offset += didcpy;
req_offset += didcpy; bio_offset += didcpy;
err = 0; err = 0;
} }
dmu_buf_rele_array(dbp, numbufs, FTAG); dmu_buf_rele_array(dbp, numbufs, FTAG);
@ -1145,13 +1146,13 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req)
} }
int int
dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio, dmu_tx_t *tx)
{ {
uint64_t size = blk_rq_bytes(req); uint64_t offset = BIO_BI_SECTOR(bio) << 9;
uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = BIO_BI_SIZE(bio);
dmu_buf_t **dbp; dmu_buf_t **dbp;
int numbufs, i, err; int numbufs, i, err;
size_t req_offset; size_t bio_offset;
if (size == 0) if (size == 0)
return (0); return (0);
@ -1161,7 +1162,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
if (err) if (err)
return (err); return (err);
req_offset = 0; bio_offset = 0;
for (i = 0; i < numbufs; i++) { for (i = 0; i < numbufs; i++) {
uint64_t tocpy; uint64_t tocpy;
int64_t bufoff; int64_t bufoff;
@ -1182,8 +1183,8 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
else else
dmu_buf_will_dirty(db, tx); dmu_buf_will_dirty(db, tx);
didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
req_offset); bio_offset);
if (tocpy == db->db_size) if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx); dmu_buf_fill_done(db, tx);
@ -1196,7 +1197,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
size -= tocpy; size -= tocpy;
offset += didcpy; offset += didcpy;
req_offset += didcpy; bio_offset += didcpy;
err = 0; err = 0;
} }

View File

@ -496,6 +496,22 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
return (bio_size); return (bio_size);
} }
/*
 * Submit a bio with the current task's pending-bio bookkeeping hidden.
 *
 * Depending on the kernel, either current->bio_tail or current->bio_list
 * is saved, cleared to NULL for the duration of submit_bio(), and then
 * put back exactly as it was.
 *
 * NOTE(review): presumably this makes submit_bio() dispatch the bio
 * immediately instead of deferring it onto the caller's pending list when
 * invoked from within another make_request path -- confirm against the
 * kernel's generic_make_request().
 */
static inline void
vdev_submit_bio(int rw, struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **saved_tail = current->bio_tail;

	current->bio_tail = NULL;
	submit_bio(rw, bio);
	current->bio_tail = saved_tail;
#else
	struct bio_list *saved_list = current->bio_list;

	current->bio_list = NULL;
	submit_bio(rw, bio);
	current->bio_list = saved_list;
#endif
}
static int static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
size_t kbuf_size, uint64_t kbuf_offset, int flags) size_t kbuf_size, uint64_t kbuf_offset, int flags)
@ -571,7 +587,7 @@ retry:
bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
} }
/* Extra reference to protect dio_request during submit_bio */ /* Extra reference to protect dio_request during vdev_submit_bio */
vdev_disk_dio_get(dr); vdev_disk_dio_get(dr);
if (zio) if (zio)
zio->io_delay = jiffies_64; zio->io_delay = jiffies_64;
@ -579,7 +595,7 @@ retry:
/* Submit all bio's associated with this dio */ /* Submit all bio's associated with this dio */
for (i = 0; i < dr->dr_bio_count; i++) for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i]) if (dr->dr_bio[i])
submit_bio(dr->dr_rw, dr->dr_bio[i]); vdev_submit_bio(dr->dr_rw, dr->dr_bio[i]);
/* /*
* On synchronous blocking requests we wait for all bio the completion * On synchronous blocking requests we wait for all bio the completion
@ -645,7 +661,7 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
bio->bi_private = zio; bio->bi_private = zio;
bio->bi_bdev = bdev; bio->bi_bdev = bdev;
zio->io_delay = jiffies_64; zio->io_delay = jiffies_64;
submit_bio(VDEV_WRITE_FLUSH_FUA, bio); vdev_submit_bio(VDEV_WRITE_FLUSH_FUA, bio);
invalidate_bdev(bdev); invalidate_bdev(bdev);
return (0); return (0);

View File

@ -50,10 +50,8 @@
unsigned int zvol_inhibit_dev = 0; unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_threads = 32;
unsigned long zvol_max_discard_blocks = 16384; unsigned long zvol_max_discard_blocks = 16384;
static taskq_t *zvol_taskq;
static kmutex_t zvol_state_lock; static kmutex_t zvol_state_lock;
static list_t zvol_state_list; static list_t zvol_state_list;
static char *zvol_tag = "zvol_tag"; static char *zvol_tag = "zvol_tag";
@ -590,34 +588,24 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
} }
} }
/* static int
* Common write path running under the zvol taskq context. This function zvol_write(struct bio *bio)
* is responsible for copying the request structure data in to the DMU and
* signaling the request queue with the result of the copy.
*/
static void
zvol_write(void *arg)
{ {
struct request *req = (struct request *)arg; zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
struct request_queue *q = req->q; uint64_t offset = BIO_BI_SECTOR(bio) << 9;
zvol_state_t *zv = q->queuedata; uint64_t size = BIO_BI_SIZE(bio);
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = blk_rq_pos(req) << 9;
uint64_t size = blk_rq_bytes(req);
int error = 0; int error = 0;
dmu_tx_t *tx; dmu_tx_t *tx;
rl_t *rl; rl_t *rl;
if (req->cmd_flags & VDEV_REQ_FLUSH) if (bio->bi_rw & VDEV_REQ_FLUSH)
zil_commit(zv->zv_zilog, ZVOL_OBJ); zil_commit(zv->zv_zilog, ZVOL_OBJ);
/* /*
* Some requests are just for flush and nothing else. * Some requests are just for flush and nothing else.
*/ */
if (size == 0) { if (size == 0)
error = 0;
goto out; goto out;
}
rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
@ -632,96 +620,82 @@ zvol_write(void *arg)
goto out; goto out;
} }
error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
if (error == 0) if (error == 0)
zvol_log_write(zv, tx, offset, size, zvol_log_write(zv, tx, offset, size,
req->cmd_flags & VDEV_REQ_FUA); !!(bio->bi_rw & VDEV_REQ_FUA));
dmu_tx_commit(tx); dmu_tx_commit(tx);
zfs_range_unlock(rl); zfs_range_unlock(rl);
if ((req->cmd_flags & VDEV_REQ_FUA) || if ((bio->bi_rw & VDEV_REQ_FUA) ||
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zv->zv_zilog, ZVOL_OBJ); zil_commit(zv->zv_zilog, ZVOL_OBJ);
out: out:
blk_end_request(req, -error, size); return (error);
spl_fstrans_unmark(cookie);
} }
#ifdef HAVE_BLK_QUEUE_DISCARD static int
static void zvol_discard(struct bio *bio)
zvol_discard(void *arg)
{ {
struct request *req = (struct request *)arg; zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
struct request_queue *q = req->q; uint64_t start = BIO_BI_SECTOR(bio) << 9;
zvol_state_t *zv = q->queuedata; uint64_t size = BIO_BI_SIZE(bio);
fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t end = start + size;
uint64_t start = blk_rq_pos(req) << 9;
uint64_t end = start + blk_rq_bytes(req);
int error; int error;
rl_t *rl; rl_t *rl;
if (end > zv->zv_volsize) { if (end > zv->zv_volsize)
error = EIO; return (SET_ERROR(EIO));
goto out;
}
/* /*
* Align the request to volume block boundaries. If we don't, * Align the request to volume block boundaries when REQ_SECURE is
* then this will force dnode_free_range() to zero out the * available, but not requested. If we don't, then this will force
* unaligned parts, which is slow (read-modify-write) and * dnode_free_range() to zero out the unaligned parts, which is slow
* useless since we are not freeing any space by doing so. * (read-modify-write) and useless since we are not freeing any space
* by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
* 2.6.35) will not receive this optimization.
*/ */
#ifdef REQ_SECURE
if (!(bio->bi_rw & REQ_SECURE)) {
start = P2ROUNDUP(start, zv->zv_volblocksize); start = P2ROUNDUP(start, zv->zv_volblocksize);
end = P2ALIGN(end, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize);
if (start >= end) {
error = 0;
goto out;
} }
#endif
rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER); if (start >= end)
return (0);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start); rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);
/* /*
* TODO: maybe we should add the operation to the log. * TODO: maybe we should add the operation to the log.
*/ */
zfs_range_unlock(rl); zfs_range_unlock(rl);
out:
blk_end_request(req, -error, blk_rq_bytes(req));
spl_fstrans_unmark(cookie);
}
#endif /* HAVE_BLK_QUEUE_DISCARD */
/* return (error);
* Common read path running under the zvol taskq context. This function }
* is responsible for copying the requested data out of the DMU and in to
* a linux request structure. It then must signal the request queue with static int
* an error code describing the result of the copy. zvol_read(struct bio *bio)
*/
static void
zvol_read(void *arg)
{ {
struct request *req = (struct request *)arg; zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
struct request_queue *q = req->q; uint64_t offset = BIO_BI_SECTOR(bio) << 9;
zvol_state_t *zv = q->queuedata; uint64_t len = BIO_BI_SIZE(bio);
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = blk_rq_pos(req) << 9;
uint64_t size = blk_rq_bytes(req);
int error; int error;
rl_t *rl; rl_t *rl;
if (size == 0) { if (len == 0)
error = 0; return (0);
goto out;
}
rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req); rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
zfs_range_unlock(rl); zfs_range_unlock(rl);
@ -729,91 +703,50 @@ zvol_read(void *arg)
if (error == ECKSUM) if (error == ECKSUM)
error = SET_ERROR(EIO); error = SET_ERROR(EIO);
out: return (error);
blk_end_request(req, -error, size);
spl_fstrans_unmark(cookie);
} }
/* static MAKE_REQUEST_FN_RET
* Request will be added back to the request queue and retried if zvol_request(struct request_queue *q, struct bio *bio)
* it cannot be immediately dispatched to the taskq for handling
*/
static inline void
zvol_dispatch(task_func_t func, struct request *req)
{
if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
blk_requeue_request(req->q, req);
}
/*
* Common request path. Rather than registering a custom make_request()
* function we use the generic Linux version. This is done because it allows
* us to easily merge read requests which would otherwise we performed
* synchronously by the DMU. This is less critical in write case where the
* DMU will perform the correct merging within a transaction group. Using
* the generic make_request() also let's use leverage the fact that the
* elevator with ensure correct ordering in regards to barrior IOs. On
* the downside it means that in the write case we end up doing request
* merging twice once in the elevator and once in the DMU.
*
* The request handler is called under a spin lock so all the real work
* is handed off to be done in the context of the zvol taskq. This function
* simply performs basic request sanity checking and hands off the request.
*/
static void
zvol_request(struct request_queue *q)
{ {
zvol_state_t *zv = q->queuedata; zvol_state_t *zv = q->queuedata;
struct request *req; fstrans_cookie_t cookie = spl_fstrans_mark();
unsigned int size; uint64_t offset = BIO_BI_SECTOR(bio);
unsigned int sectors = bio_sectors(bio);
int error = 0;
while ((req = blk_fetch_request(q)) != NULL) { if (bio_has_data(bio) && offset + sectors >
size = blk_rq_bytes(req);
if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) >
get_capacity(zv->zv_disk)) { get_capacity(zv->zv_disk)) {
printk(KERN_INFO printk(KERN_INFO
"%s: bad access: block=%llu, count=%lu\n", "%s: bad access: block=%llu, count=%lu\n",
req->rq_disk->disk_name, zv->zv_disk->disk_name,
(long long unsigned)blk_rq_pos(req), (long long unsigned)offset,
(long unsigned)blk_rq_sectors(req)); (long unsigned)sectors);
__blk_end_request(req, -EIO, size); error = SET_ERROR(EIO);
continue; goto out;
} }
if (!blk_fs_request(req)) { if (bio_data_dir(bio) == WRITE) {
printk(KERN_INFO "%s: non-fs cmd\n",
req->rq_disk->disk_name);
__blk_end_request(req, -EIO, size);
continue;
}
switch ((int)rq_data_dir(req)) {
case READ:
zvol_dispatch(zvol_read, req);
break;
case WRITE:
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
__blk_end_request(req, -EROFS, size); error = SET_ERROR(EROFS);
break; goto out;
} }
#ifdef HAVE_BLK_QUEUE_DISCARD if (bio->bi_rw & VDEV_REQ_DISCARD) {
if (req->cmd_flags & VDEV_REQ_DISCARD) { error = zvol_discard(bio);
zvol_dispatch(zvol_discard, req); goto out;
break;
} }
#endif /* HAVE_BLK_QUEUE_DISCARD */
zvol_dispatch(zvol_write, req); error = zvol_write(bio);
break; } else
default: error = zvol_read(bio);
printk(KERN_INFO "%s: unknown cmd: %d\n",
req->rq_disk->disk_name, (int)rq_data_dir(req)); out:
__blk_end_request(req, -EIO, size); bio_endio(bio, -error);
break; spl_fstrans_unmark(cookie);
} #ifdef HAVE_MAKE_REQUEST_FN_RET_INT
} return (0);
#endif
} }
static void static void
@ -1259,25 +1192,17 @@ static zvol_state_t *
zvol_alloc(dev_t dev, const char *name) zvol_alloc(dev_t dev, const char *name)
{ {
zvol_state_t *zv; zvol_state_t *zv;
int error = 0;
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
spin_lock_init(&zv->zv_lock); spin_lock_init(&zv->zv_lock);
list_link_init(&zv->zv_next); list_link_init(&zv->zv_next);
zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock); zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
if (zv->zv_queue == NULL) if (zv->zv_queue == NULL)
goto out_kmem; goto out_kmem;
#ifdef HAVE_ELEVATOR_CHANGE blk_queue_make_request(zv->zv_queue, zvol_request);
error = elevator_change(zv->zv_queue, "noop");
#endif /* HAVE_ELEVATOR_CHANGE */
if (error) {
printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n",
"noop", name, error);
goto out_queue;
}
#ifdef HAVE_BLK_QUEUE_FLUSH #ifdef HAVE_BLK_QUEUE_FLUSH
blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA); blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
@ -1418,13 +1343,11 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
#ifdef HAVE_BLK_QUEUE_DISCARD
blk_queue_max_discard_sectors(zv->zv_queue, blk_queue_max_discard_sectors(zv->zv_queue,
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
#endif #ifdef QUEUE_FLAG_NONROT
#ifdef HAVE_BLK_QUEUE_NONROT
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
#endif #endif
#ifdef QUEUE_FLAG_ADD_RANDOM #ifdef QUEUE_FLAG_ADD_RANDOM
@ -1651,7 +1574,6 @@ zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
int int
zvol_init(void) zvol_init(void)
{ {
int threads = MIN(MAX(zvol_threads, 1), 1024);
int error; int error;
list_create(&zvol_state_list, sizeof (zvol_state_t), list_create(&zvol_state_list, sizeof (zvol_state_t),
@ -1659,18 +1581,10 @@ zvol_init(void)
mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
printk(KERN_INFO "ZFS: taskq_create() failed\n");
error = -ENOMEM;
goto out1;
}
error = register_blkdev(zvol_major, ZVOL_DRIVER); error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) { if (error) {
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
goto out2; goto out;
} }
blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
@ -1678,9 +1592,7 @@ zvol_init(void)
return (0); return (0);
out2: out:
taskq_destroy(zvol_taskq);
out1:
mutex_destroy(&zvol_state_lock); mutex_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list); list_destroy(&zvol_state_list);
@ -1693,7 +1605,6 @@ zvol_fini(void)
zvol_remove_minors(NULL); zvol_remove_minors(NULL);
blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
unregister_blkdev(zvol_major, ZVOL_DRIVER); unregister_blkdev(zvol_major, ZVOL_DRIVER);
taskq_destroy(zvol_taskq);
mutex_destroy(&zvol_state_lock); mutex_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list); list_destroy(&zvol_state_list);
} }
@ -1704,8 +1615,5 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
module_param(zvol_major, uint, 0444); module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
module_param(zvol_max_discard_blocks, ulong, 0444); module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");