2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
|
|
|
|
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
|
|
|
|
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
|
|
|
* LLNL-CODE-403049.
|
|
|
|
*
|
|
|
|
* ZFS volume emulation driver.
|
|
|
|
*
|
|
|
|
* Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
|
|
|
|
* Volumes are accessed through the symbolic links named:
|
|
|
|
*
|
|
|
|
* /dev/<pool_name>/<dataset_name>
|
|
|
|
*
|
|
|
|
* Volumes are persistent through reboot and module load. No user command
|
|
|
|
* needs to be run before opening and using a device.
|
2015-08-02 16:01:14 +03:00
|
|
|
*
|
|
|
|
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
|
2016-02-16 22:52:55 +03:00
|
|
|
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
|
2017-04-14 22:59:18 +03:00
|
|
|
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
/*
|
|
|
|
* Note on locking of zvol state structures.
|
|
|
|
*
|
|
|
|
* These structures are used to maintain internal state used to emulate block
|
|
|
|
* devices on top of zvols. In particular, management of device minor number
|
|
|
|
* operations - create, remove, rename, and set_snapdev - involves access to
|
|
|
|
* these structures. The zvol_state_lock is primarily used to protect the
|
|
|
|
* zvol_state_list. The zv->zv_state_lock is used to protect the contents
|
|
|
|
* of the zvol_state_t structures, as well as to make sure that when the
|
|
|
|
* time comes to remove the structure from the list, it is not in use, and
|
|
|
|
* therefore, it can be taken off zvol_state_list and freed.
|
|
|
|
*
|
2017-06-13 19:03:44 +03:00
|
|
|
* The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
|
|
|
|
* e.g. for the duration of receive and rollback operations. This lock can be
|
|
|
|
* held for significant periods of time. Given that it is undesirable to hold
|
|
|
|
* mutexes for long periods of time, the following lock ordering applies:
|
|
|
|
* - take zvol_state_lock if necessary, to protect zvol_state_list
|
|
|
|
* - take zv_suspend_lock if necessary, by the code path in question
|
|
|
|
* - take zv_state_lock to protect zvol_state_t
|
|
|
|
*
|
|
|
|
* The minor operations are issued to spa->spa_zvol_taskq queues, that are
|
2017-05-10 20:51:29 +03:00
|
|
|
* single-threaded (to preserve order of minor operations), and are executed
|
|
|
|
* through the zvol_task_cb that dispatches the specific operations. Therefore,
|
|
|
|
* these operations are serialized per pool. Consequently, we can be certain
|
|
|
|
* that for a given zvol, there is only one operation at a time in progress.
|
|
|
|
* That is why one can be sure that first, zvol_state_t for a given zvol is
|
|
|
|
* allocated and placed on zvol_state_list, and then other minor operations
|
|
|
|
* for this zvol are going to proceed in the order of issue.
|
|
|
|
*
|
|
|
|
* It is also worth keeping in mind that once add_disk() is called, the zvol is
|
|
|
|
* announced to the world, and zvol_open()/zvol_release() can be called at any
|
|
|
|
* time. Incidentally, add_disk() itself calls zvol_open()->zvol_first_open()
|
|
|
|
* and zvol_release()->zvol_last_close() directly as well.
|
|
|
|
*/
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
#include <sys/dbuf.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/dmu_traverse.h>
|
|
|
|
#include <sys/dsl_dataset.h>
|
|
|
|
#include <sys/dsl_prop.h>
|
2014-03-22 13:07:14 +04:00
|
|
|
#include <sys/dsl_dir.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/zap.h>
|
2015-08-25 00:18:48 +03:00
|
|
|
#include <sys/zfeature.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/zil_impl.h>
|
2015-08-02 16:01:14 +03:00
|
|
|
#include <sys/dmu_tx.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/zfs_rlock.h>
|
|
|
|
#include <sys/zfs_znode.h>
|
2014-03-22 13:07:14 +04:00
|
|
|
#include <sys/spa_impl.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/zvol.h>
|
2011-02-22 23:15:13 +03:00
|
|
|
#include <linux/blkdev_compat.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2012-06-02 05:49:10 +04:00
|
|
|
unsigned int zvol_inhibit_dev = 0;
|
2010-08-26 22:45:02 +04:00
|
|
|
unsigned int zvol_major = ZVOL_MAJOR;
|
2017-02-23 03:08:04 +03:00
|
|
|
unsigned int zvol_threads = 32;
|
2017-05-03 03:37:14 +03:00
|
|
|
unsigned int zvol_request_sync = 0;
|
2015-08-18 23:51:20 +03:00
|
|
|
unsigned int zvol_prefetch_bytes = (128 * 1024);
|
Limit the number of blocks to discard at once.
The number of blocks that can be discarded in one BLKDISCARD ioctl on a
zvol is currently unlimited. Some applications, such as mkfs, discard
the whole volume at once and they use the maximum possible discard size
to do that. As a result, several gigabytes discard requests are not
uncommon.
Unfortunately, if a large amount of data is allocated in the zvol, ZFS
can be quite slow to process discard requests. This is especially true
if the volblocksize is low (e.g. the 8K default). As a result, very
large discard requests can take a very long time (seconds to minutes
under heavy load) to complete. This can cause a number of problems, most
notably if the zvol is accessed remotely (e.g. via iSCSI), in which case
the client has a high probability of timing out on the request.
This patch solves the issue by adding a new tunable module parameter:
zvol_max_discard_blocks. This indicates the maximum possible range, in
zvol blocks, of one discard operation. It is set by default to 16384
blocks, which appears to be a good tradeoff. Using the default
volblocksize of 8K this is equivalent to 128 MB. When using the maximum
volblocksize of 128K this is equivalent to 2 GB.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #858
2012-07-31 12:45:37 +04:00
|
|
|
unsigned long zvol_max_discard_blocks = 16384;
|
2017-07-12 23:05:37 +03:00
|
|
|
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
static taskq_t *zvol_taskq;
|
2010-08-26 22:45:02 +04:00
|
|
|
static kmutex_t zvol_state_lock;
|
|
|
|
static list_t zvol_state_list;
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
#define ZVOL_HT_SIZE 1024
|
|
|
|
static struct hlist_head *zvol_htable;
|
|
|
|
#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
|
2017-02-08 20:27:48 +03:00
|
|
|
|
|
|
|
static struct ida zvol_ida;
|
2016-12-01 00:56:50 +03:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* The in-core state of each volume.
|
|
|
|
*/
|
2017-01-20 00:56:36 +03:00
|
|
|
struct zvol_state {
|
2011-02-22 13:58:44 +03:00
|
|
|
char zv_name[MAXNAMELEN]; /* name */
|
2013-12-13 01:04:40 +04:00
|
|
|
uint64_t zv_volsize; /* advertised space */
|
|
|
|
uint64_t zv_volblocksize; /* volume block size */
|
2010-08-26 22:45:02 +04:00
|
|
|
objset_t *zv_objset; /* objset handle */
|
|
|
|
uint32_t zv_flags; /* ZVOL_* flags */
|
|
|
|
uint32_t zv_open_count; /* open counts */
|
|
|
|
uint32_t zv_changed; /* disk changed */
|
|
|
|
zilog_t *zv_zilog; /* ZIL handle */
|
2016-04-12 00:53:48 +03:00
|
|
|
zfs_rlock_t zv_range_lock; /* range lock */
|
2017-06-13 19:18:08 +03:00
|
|
|
dnode_t *zv_dn; /* dnode hold */
|
2010-08-26 22:45:02 +04:00
|
|
|
dev_t zv_dev; /* device id */
|
|
|
|
struct gendisk *zv_disk; /* generic disk */
|
|
|
|
struct request_queue *zv_queue; /* request queue */
|
|
|
|
list_node_t zv_next; /* next zvol_state_t linkage */
|
2016-12-01 00:56:50 +03:00
|
|
|
uint64_t zv_hash; /* name hash */
|
|
|
|
struct hlist_node zv_hlink; /* hash link */
|
2017-05-10 20:51:29 +03:00
|
|
|
kmutex_t zv_state_lock; /* protects zvol_state_t */
|
2017-01-20 00:56:36 +03:00
|
|
|
atomic_t zv_suspend_ref; /* refcount for suspend */
|
|
|
|
krwlock_t zv_suspend_lock; /* suspend lock */
|
|
|
|
};
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
/*
 * Kinds of asynchronous minor operations that can be requested for a zvol.
 * ZVOL_ASYNC_MAX is the last enumerator and does not name an operation.
 */
typedef enum {
	ZVOL_ASYNC_CREATE_MINORS,
	ZVOL_ASYNC_REMOVE_MINORS,
	ZVOL_ASYNC_RENAME_MINORS,
	ZVOL_ASYNC_SET_SNAPDEV,
	ZVOL_ASYNC_SET_VOLMODE,
	ZVOL_ASYNC_MAX
} zvol_async_op_t;
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
zvol_async_op_t op;
|
|
|
|
char pool[MAXNAMELEN];
|
|
|
|
char name1[MAXNAMELEN];
|
|
|
|
char name2[MAXNAMELEN];
|
|
|
|
zprop_source_t source;
|
2017-07-12 23:05:37 +03:00
|
|
|
uint64_t value;
|
2014-03-22 13:07:14 +04:00
|
|
|
} zvol_task_t;
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/* zv_flags bit; zvol_set_volblocksize() fails with EROFS when it is set. */
#define ZVOL_RDONLY 0x1
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
static uint64_t
|
|
|
|
zvol_name_hash(const char *name)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2016-12-01 00:56:50 +03:00
|
|
|
int i;
|
|
|
|
uint64_t crc = -1ULL;
|
|
|
|
uint8_t *p = (uint8_t *)name;
|
|
|
|
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
|
|
for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2016-12-01 00:56:50 +03:00
|
|
|
return (crc);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2017-06-13 19:03:44 +03:00
|
|
|
* Find a zvol_state_t given the full major+minor dev_t. If found,
|
|
|
|
* return with zv_state_lock taken, otherwise, return (NULL) without
|
|
|
|
* taking zv_state_lock.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
static zvol_state_t *
|
|
|
|
zvol_find_by_dev(dev_t dev)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zvol_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
for (zv = list_head(&zvol_state_list); zv != NULL;
|
2013-12-13 01:04:40 +04:00
|
|
|
zv = list_next(&zvol_state_list, zv)) {
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
if (zv->zv_dev == dev) {
|
|
|
|
mutex_exit(&zvol_state_lock);
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zv);
|
2017-06-13 19:03:44 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_exit(&zvol_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (NULL);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-12-01 00:56:50 +03:00
|
|
|
* Find a zvol_state_t given the name and hash generated by zvol_name_hash.
|
2017-06-13 19:03:44 +03:00
|
|
|
* If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
|
|
|
|
* return (NULL) without the taking locks. The zv_suspend_lock is always taken
|
|
|
|
* before zv_state_lock. The mode argument indicates the mode (including none)
|
|
|
|
* for zv_suspend_lock to be taken.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
static zvol_state_t *
|
2017-06-13 19:03:44 +03:00
|
|
|
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
2017-11-19 01:08:00 +03:00
|
|
|
struct hlist_node *p = NULL;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zvol_state_lock);
|
2016-12-01 00:56:50 +03:00
|
|
|
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
|
|
|
|
zv = hlist_entry(p, zvol_state_t, zv_hlink);
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2016-12-01 00:56:50 +03:00
|
|
|
if (zv->zv_hash == hash &&
|
2017-06-13 19:03:44 +03:00
|
|
|
strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
|
|
|
|
/*
|
|
|
|
* this is the right zvol, take the locks in the
|
|
|
|
* right order
|
|
|
|
*/
|
|
|
|
if (mode != RW_NONE &&
|
|
|
|
!rw_tryenter(&zv->zv_suspend_lock, mode)) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
rw_enter(&zv->zv_suspend_lock, mode);
|
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
/*
|
|
|
|
* zvol cannot be renamed as we continue
|
|
|
|
* to hold zvol_state_lock
|
|
|
|
*/
|
|
|
|
ASSERT(zv->zv_hash == hash &&
|
|
|
|
strncmp(zv->zv_name, name, MAXNAMELEN)
|
|
|
|
== 0);
|
|
|
|
}
|
|
|
|
mutex_exit(&zvol_state_lock);
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zv);
|
2017-06-13 19:03:44 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_exit(&zvol_state_lock);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (NULL);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
/*
|
2017-06-13 19:03:44 +03:00
|
|
|
* Find a zvol_state_t given the name.
|
|
|
|
* If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
|
|
|
|
* return (NULL) without the taking locks. The zv_suspend_lock is always taken
|
|
|
|
* before zv_state_lock. The mode argument indicates the mode (including none)
|
|
|
|
* for zv_suspend_lock to be taken.
|
2016-12-01 00:56:50 +03:00
|
|
|
*/
|
|
|
|
static zvol_state_t *
|
2017-06-13 19:03:44 +03:00
|
|
|
zvol_find_by_name(const char *name, int mode)
|
2016-12-01 00:56:50 +03:00
|
|
|
{
|
2017-06-13 19:03:44 +03:00
|
|
|
return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
|
2016-12-01 00:56:50 +03:00
|
|
|
}
|
|
|
|
|
2012-12-17 05:33:57 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Given a path, return TRUE if path is a ZVOL.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zvol_is_zvol(const char *device)
|
|
|
|
{
|
|
|
|
struct block_device *bdev;
|
|
|
|
unsigned int major;
|
|
|
|
|
2016-10-26 20:30:43 +03:00
|
|
|
bdev = vdev_lookup_bdev(device);
|
2012-12-17 05:33:57 +04:00
|
|
|
if (IS_ERR(bdev))
|
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
major = MAJOR(bdev->bd_dev);
|
|
|
|
bdput(bdev);
|
|
|
|
|
|
|
|
if (major == zvol_major)
|
2013-12-13 01:04:40 +04:00
|
|
|
return (B_TRUE);
|
2012-12-17 05:33:57 +04:00
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
zfs_creat_t *zct = arg;
|
|
|
|
nvlist_t *nvprops = zct->zct_props;
|
|
|
|
int error;
|
|
|
|
uint64_t volblocksize, volsize;
|
|
|
|
|
|
|
|
VERIFY(nvlist_lookup_uint64(nvprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
|
|
|
|
if (nvlist_lookup_uint64(nvprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
|
|
|
|
volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These properties must be removed from the list so the generic
|
|
|
|
* property setting step won't apply to them.
|
|
|
|
*/
|
|
|
|
VERIFY(nvlist_remove_all(nvprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
|
|
|
|
(void) nvlist_remove_all(nvprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
|
|
|
|
|
|
|
|
error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
|
|
|
|
DMU_OT_NONE, 0, tx);
|
|
|
|
ASSERT(error == 0);
|
|
|
|
|
|
|
|
error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
|
|
|
|
DMU_OT_NONE, 0, tx);
|
|
|
|
ASSERT(error == 0);
|
|
|
|
|
|
|
|
error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
|
|
|
|
ASSERT(error == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ZFS_IOC_OBJSET_STATS entry point.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zvol_get_stats(objset_t *os, nvlist_t *nv)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
dmu_object_info_t *doi;
|
|
|
|
uint64_t val;
|
|
|
|
|
|
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
|
|
|
|
if (error)
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
|
2013-12-13 01:04:40 +04:00
|
|
|
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
|
2010-08-26 22:45:02 +04:00
|
|
|
error = dmu_object_info(os, ZVOL_OBJ, doi);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
|
|
|
|
doi->doi_data_block_size);
|
|
|
|
}
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
kmem_free(doi, sizeof (dmu_object_info_t));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
static void
|
|
|
|
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
|
|
|
|
{
|
|
|
|
struct block_device *bdev;
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
bdev = bdget_disk(zv->zv_disk, 0);
|
|
|
|
if (bdev == NULL)
|
|
|
|
return;
|
2017-06-13 19:03:44 +03:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
set_capacity(zv->zv_disk, volsize >> 9);
|
|
|
|
zv->zv_volsize = volsize;
|
|
|
|
check_disk_size_change(zv->zv_disk, bdev);
|
|
|
|
|
|
|
|
bdput(bdev);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
 * Sanity check a volume size against the volume block size.
 *
 * Returns EINVAL when the size is zero or not a multiple of blocksize,
 * EOVERFLOW on _ILP32 builds when the size exceeds SPEC_MAXOFFSET_T + 1,
 * and 0 otherwise.
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0 || (volsize % blocksize) != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif

	return (0);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ensure the zap is flushed then inform the VFS of the capacity change.
|
|
|
|
*/
|
|
|
|
static int
|
2014-01-14 02:27:33 +04:00
|
|
|
zvol_update_volsize(uint64_t volsize, objset_t *os)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
2016-02-26 10:33:44 +03:00
|
|
|
uint64_t txg;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2011-02-25 10:36:01 +03:00
|
|
|
tx = dmu_tx_create(os);
|
2010-08-26 22:45:02 +04:00
|
|
|
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
|
2014-07-07 23:49:36 +04:00
|
|
|
dmu_tx_mark_netfree(tx);
|
2010-08-26 22:45:02 +04:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2016-02-26 10:33:44 +03:00
|
|
|
txg = dmu_tx_get_txg(tx);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2011-02-25 10:36:01 +03:00
|
|
|
error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
|
2010-08-26 22:45:02 +04:00
|
|
|
&volsize, tx);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
2016-02-26 10:33:44 +03:00
|
|
|
txg_wait_synced(dmu_objset_pool(os), txg);
|
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
if (error == 0)
|
|
|
|
error = dmu_free_long_range(os,
|
|
|
|
ZVOL_OBJ, volsize, DMU_OBJECT_END);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
return (error);
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
static int
|
|
|
|
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
|
|
|
|
{
|
|
|
|
zvol_size_changed(zv, volsize);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
/*
|
|
|
|
* We should post a event here describing the expansion. However,
|
|
|
|
* the zfs_ereport_post() interface doesn't nicely support posting
|
|
|
|
* events for zvols, it assumes events relate to vdevs or zios.
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set ZFS_PROP_VOLSIZE set entry point.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zvol_set_volsize(const char *name, uint64_t volsize)
|
|
|
|
{
|
2014-01-14 02:27:33 +04:00
|
|
|
zvol_state_t *zv = NULL;
|
2010-08-26 22:45:02 +04:00
|
|
|
objset_t *os = NULL;
|
|
|
|
int error;
|
2014-01-14 02:27:33 +04:00
|
|
|
dmu_object_info_t *doi;
|
|
|
|
uint64_t readonly;
|
|
|
|
boolean_t owned = B_FALSE;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
error = dsl_prop_get_integer(name,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
|
|
|
|
if (error != 0)
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2013-09-04 16:00:57 +04:00
|
|
|
if (readonly)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EROFS));
|
2013-09-04 16:00:57 +04:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
zv = zvol_find_by_name(name, RW_READER);
|
|
|
|
|
|
|
|
ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
|
|
|
|
RW_READ_HELD(&zv->zv_suspend_lock)));
|
2014-01-14 02:27:33 +04:00
|
|
|
|
|
|
|
if (zv == NULL || zv->zv_objset == NULL) {
|
2017-06-13 19:03:44 +03:00
|
|
|
if (zv != NULL)
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
|
2014-01-14 02:27:33 +04:00
|
|
|
FTAG, &os)) != 0) {
|
2017-05-10 20:51:29 +03:00
|
|
|
if (zv != NULL)
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2014-01-14 02:27:33 +04:00
|
|
|
return (SET_ERROR(error));
|
|
|
|
}
|
|
|
|
owned = B_TRUE;
|
|
|
|
if (zv != NULL)
|
|
|
|
zv->zv_objset = os;
|
|
|
|
} else {
|
|
|
|
os = zv->zv_objset;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
|
|
|
|
(error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
|
2014-01-14 02:27:33 +04:00
|
|
|
goto out;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
error = zvol_update_volsize(volsize, os);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
if (error == 0 && zv != NULL)
|
|
|
|
error = zvol_update_live_volsize(zv, volsize);
|
|
|
|
out:
|
2017-05-31 22:52:12 +03:00
|
|
|
kmem_free(doi, sizeof (dmu_object_info_t));
|
|
|
|
|
2014-01-14 02:27:33 +04:00
|
|
|
if (owned) {
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_objset_disown(os, B_TRUE, FTAG);
|
2014-01-14 02:27:33 +04:00
|
|
|
if (zv != NULL)
|
|
|
|
zv->zv_objset = NULL;
|
2017-01-20 00:56:36 +03:00
|
|
|
} else {
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2014-01-14 02:27:33 +04:00
|
|
|
}
|
2017-05-10 20:51:29 +03:00
|
|
|
|
|
|
|
if (zv != NULL)
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-06-13 19:03:44 +03:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanity check volume block size.
|
|
|
|
*/
|
|
|
|
int
|
2015-08-25 00:18:48 +03:00
|
|
|
zvol_check_volblocksize(const char *name, uint64_t volblocksize)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2015-08-25 00:18:48 +03:00
|
|
|
/* Record sizes above 128k need the feature to be enabled */
|
|
|
|
if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
|
|
|
|
spa_t *spa;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if ((error = spa_open(name, &spa, FTAG)) != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
|
|
|
|
spa_close(spa, FTAG);
|
|
|
|
return (SET_ERROR(ENOTSUP));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't allow setting the property above 1MB,
|
|
|
|
* unless the tunable has been changed.
|
|
|
|
*/
|
|
|
|
if (volblocksize > zfs_max_recordsize)
|
|
|
|
return (SET_ERROR(EDOM));
|
|
|
|
|
|
|
|
spa_close(spa, FTAG);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
if (volblocksize < SPA_MINBLOCKSIZE ||
|
|
|
|
volblocksize > SPA_MAXBLOCKSIZE ||
|
|
|
|
!ISP2(volblocksize))
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EDOM));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set ZFS_PROP_VOLBLOCKSIZE set entry point.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zvol_set_volblocksize(const char *name, uint64_t volblocksize)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
zv = zvol_find_by_name(name, RW_READER);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
if (zv == NULL)
|
2017-05-10 20:51:29 +03:00
|
|
|
return (SET_ERROR(ENXIO));
|
2017-06-13 19:03:44 +03:00
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
|
|
|
|
RW_READ_HELD(&zv->zv_suspend_lock));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-07 02:20:22 +04:00
|
|
|
if (zv->zv_flags & ZVOL_RDONLY) {
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-06-13 19:03:44 +03:00
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2017-05-10 20:51:29 +03:00
|
|
|
return (SET_ERROR(EROFS));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
tx = dmu_tx_create(zv->zv_objset);
|
|
|
|
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
|
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
} else {
|
|
|
|
error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
|
|
|
|
volblocksize, 0, tx);
|
|
|
|
if (error == ENOTSUP)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EBUSY);
|
2010-08-26 22:45:02 +04:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
if (error == 0)
|
|
|
|
zv->zv_volblocksize = volblocksize;
|
|
|
|
}
|
2017-05-10 20:51:29 +03:00
|
|
|
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-06-13 19:03:44 +03:00
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2015-08-02 16:01:14 +03:00
|
|
|
/*
|
|
|
|
* Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
|
|
|
|
* implement DKIOCFREE/free-long-range.
|
|
|
|
*/
|
|
|
|
static int
|
2017-10-27 22:46:35 +03:00
|
|
|
zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
|
2015-08-02 16:01:14 +03:00
|
|
|
{
|
2017-10-27 22:46:35 +03:00
|
|
|
zvol_state_t *zv = arg1;
|
|
|
|
lr_truncate_t *lr = arg2;
|
2015-08-02 16:01:14 +03:00
|
|
|
uint64_t offset, length;
|
|
|
|
|
|
|
|
if (byteswap)
|
|
|
|
byteswap_uint64_array(lr, sizeof (*lr));
|
|
|
|
|
|
|
|
offset = lr->lr_offset;
|
|
|
|
length = lr->lr_length;
|
|
|
|
|
|
|
|
return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* Replay a TX_WRITE ZIL transaction that didn't get committed
|
|
|
|
* after a system failure
|
|
|
|
*/
|
|
|
|
static int
|
2017-10-27 22:46:35 +03:00
|
|
|
zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2017-10-27 22:46:35 +03:00
|
|
|
zvol_state_t *zv = arg1;
|
|
|
|
lr_write_t *lr = arg2;
|
2010-08-26 22:45:02 +04:00
|
|
|
objset_t *os = zv->zv_objset;
|
2017-09-09 01:07:00 +03:00
|
|
|
char *data = (char *)(lr + 1); /* data follows lr_write_t */
|
|
|
|
uint64_t offset, length;
|
2010-08-26 22:45:02 +04:00
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (byteswap)
|
|
|
|
byteswap_uint64_array(lr, sizeof (*lr));
|
|
|
|
|
2017-09-09 01:07:00 +03:00
|
|
|
offset = lr->lr_offset;
|
|
|
|
length = lr->lr_length;
|
|
|
|
|
|
|
|
/* If it's a dmu_sync() block, write the whole block */
|
|
|
|
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
|
|
|
|
uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
|
|
|
|
if (length < blocksize) {
|
|
|
|
offset -= offset % blocksize;
|
|
|
|
length = blocksize;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
tx = dmu_tx_create(os);
|
2017-09-09 01:07:00 +03:00
|
|
|
dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
|
2010-08-26 22:45:02 +04:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
} else {
|
2017-09-09 01:07:00 +03:00
|
|
|
dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
|
2010-08-26 22:45:02 +04:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
}
|
|
|
|
|
2017-09-09 01:07:00 +03:00
|
|
|
return (error);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2017-10-27 22:46:35 +03:00
|
|
|
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOTSUP));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Callback vectors for replaying records.
|
2015-08-02 16:01:14 +03:00
|
|
|
* Only TX_WRITE and TX_TRUNCATE are needed for zvol.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
2017-10-27 22:46:35 +03:00
|
|
|
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
|
|
|
|
zvol_replay_err, /* no such transaction type */
|
|
|
|
zvol_replay_err, /* TX_CREATE */
|
|
|
|
zvol_replay_err, /* TX_MKDIR */
|
|
|
|
zvol_replay_err, /* TX_MKXATTR */
|
|
|
|
zvol_replay_err, /* TX_SYMLINK */
|
|
|
|
zvol_replay_err, /* TX_REMOVE */
|
|
|
|
zvol_replay_err, /* TX_RMDIR */
|
|
|
|
zvol_replay_err, /* TX_LINK */
|
|
|
|
zvol_replay_err, /* TX_RENAME */
|
|
|
|
zvol_replay_write, /* TX_WRITE */
|
|
|
|
zvol_replay_truncate, /* TX_TRUNCATE */
|
|
|
|
zvol_replay_err, /* TX_SETATTR */
|
|
|
|
zvol_replay_err, /* TX_ACL */
|
|
|
|
zvol_replay_err, /* TX_CREATE_ATTR */
|
|
|
|
zvol_replay_err, /* TX_CREATE_ACL_ATTR */
|
|
|
|
zvol_replay_err, /* TX_MKDIR_ACL */
|
|
|
|
zvol_replay_err, /* TX_MKDIR_ATTR */
|
|
|
|
zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
|
|
|
|
zvol_replay_err, /* TX_WRITE2 */
|
2010-08-26 22:45:02 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
|
|
|
|
*
|
|
|
|
* We store data in the log buffers if it's small enough.
|
|
|
|
* Otherwise we will later flush the data out via dmu_sync().
|
|
|
|
*/
|
|
|
|
ssize_t zvol_immediate_write_sz = 32768;
|
|
|
|
|
|
|
|
static void
|
2013-12-13 01:04:40 +04:00
|
|
|
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
|
|
|
|
uint64_t size, int sync)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
uint32_t blocksize = zv->zv_volblocksize;
|
|
|
|
zilog_t *zilog = zv->zv_zilog;
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
itx_wr_state_t write_state;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
if (zil_replaying(zilog, tx))
|
|
|
|
return;
|
|
|
|
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
|
|
|
|
write_state = WR_INDIRECT;
|
|
|
|
else if (!spa_has_slogs(zilog->zl_spa) &&
|
|
|
|
size >= blocksize && blocksize > zvol_immediate_write_sz)
|
|
|
|
write_state = WR_INDIRECT;
|
|
|
|
else if (sync)
|
|
|
|
write_state = WR_COPIED;
|
|
|
|
else
|
|
|
|
write_state = WR_NEED_COPY;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
while (size) {
|
|
|
|
itx_t *itx;
|
|
|
|
lr_write_t *lr;
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
itx_wr_state_t wr_state = write_state;
|
|
|
|
ssize_t len = size;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA)
|
|
|
|
wr_state = WR_NEED_COPY;
|
|
|
|
else if (wr_state == WR_INDIRECT)
|
|
|
|
len = MIN(blocksize - P2PHASE(offset, blocksize), size);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
(wr_state == WR_COPIED ? len : 0));
|
2010-08-26 22:45:02 +04:00
|
|
|
lr = (lr_write_t *)&itx->itx_lr;
|
2017-06-13 19:18:08 +03:00
|
|
|
if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
|
|
|
|
offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
|
2010-08-26 22:45:02 +04:00
|
|
|
zil_itx_destroy(itx);
|
|
|
|
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
|
|
|
|
lr = (lr_write_t *)&itx->itx_lr;
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
wr_state = WR_NEED_COPY;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
OpenZFS 7578 - Fix/improve some aspects of ZIL writing
- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.
- Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
- Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size. In some
cases efficiency stopped to almost as low as 50%. In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas. This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently. Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.
- While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.
Sponsored by: iXsystems, Inc.
Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
2017-06-09 19:15:37 +03:00
|
|
|
itx->itx_wr_state = wr_state;
|
2010-08-26 22:45:02 +04:00
|
|
|
lr->lr_foid = ZVOL_OBJ;
|
|
|
|
lr->lr_offset = offset;
|
|
|
|
lr->lr_length = len;
|
|
|
|
lr->lr_blkoff = 0;
|
|
|
|
BP_ZERO(&lr->lr_blkptr);
|
|
|
|
|
|
|
|
itx->itx_private = zv;
|
|
|
|
itx->itx_sync = sync;
|
|
|
|
|
|
|
|
(void) zil_itx_assign(zilog, itx, tx);
|
|
|
|
|
|
|
|
offset += len;
|
|
|
|
size -= len;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
/*
 * Per-request context handed to the zvol I/O handlers through their
 * void *arg parameter.
 */
typedef struct zv_request {
	zvol_state_t	*zv;	/* volume the request targets */
	struct bio	*bio;	/* block I/O request being serviced */
	rl_t		*rl;	/* range lock; zvol_write() drops it */
} zv_request_t;
|
|
|
|
|
|
|
|
static void
|
|
|
|
uio_from_bio(uio_t *uio, struct bio *bio)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2017-02-23 03:08:04 +03:00
|
|
|
uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
|
|
|
|
uio->uio_skip = BIO_BI_SKIP(bio);
|
|
|
|
uio->uio_resid = BIO_BI_SIZE(bio);
|
|
|
|
uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
|
|
|
|
uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
|
|
|
|
uio->uio_limit = MAXOFFSET_T;
|
|
|
|
uio->uio_segflg = UIO_BVEC;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zvol_write(void *arg)
|
|
|
|
{
|
|
|
|
zv_request_t *zvr = arg;
|
|
|
|
struct bio *bio = zvr->bio;
|
|
|
|
uio_t uio;
|
|
|
|
zvol_state_t *zv = zvr->zv;
|
2016-02-06 04:36:07 +03:00
|
|
|
uint64_t volsize = zv->zv_volsize;
|
2017-02-23 03:08:04 +03:00
|
|
|
boolean_t sync;
|
2016-02-06 04:36:07 +03:00
|
|
|
int error = 0;
|
2017-02-23 03:08:04 +03:00
|
|
|
unsigned long start_jif;
|
|
|
|
|
|
|
|
uio_from_bio(&uio, bio);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
start_jif = jiffies;
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
|
|
|
|
&zv->zv_disk->part0);
|
2011-09-05 13:11:38 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
|
|
|
|
|
|
|
|
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
|
|
|
|
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
|
|
|
|
uint64_t off = uio.uio_loffset;
|
2016-02-06 04:36:07 +03:00
|
|
|
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
|
2011-09-05 13:11:38 +04:00
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
if (bytes > volsize - off) /* don't write past the end */
|
|
|
|
bytes = volsize - off;
|
2015-12-08 23:37:24 +03:00
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
/* This will only fail for ENOSPC */
|
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
break;
|
|
|
|
}
|
2017-06-13 19:18:08 +03:00
|
|
|
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
|
2016-02-06 04:36:07 +03:00
|
|
|
if (error == 0)
|
|
|
|
zvol_log_write(zv, tx, off, bytes, sync);
|
|
|
|
dmu_tx_commit(tx);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
if (error)
|
|
|
|
break;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2017-02-23 03:08:04 +03:00
|
|
|
zfs_range_unlock(zvr->rl);
|
2016-02-06 04:36:07 +03:00
|
|
|
if (sync)
|
2010-08-26 22:45:02 +04:00
|
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
2017-02-23 03:08:04 +03:00
|
|
|
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
|
|
|
|
start_jif);
|
2017-02-23 03:08:04 +03:00
|
|
|
BIO_END_IO(bio, -error);
|
|
|
|
kmem_free(zvr, sizeof (zv_request_t));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2015-08-02 16:01:14 +03:00
|
|
|
/*
|
|
|
|
* Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
|
|
|
|
boolean_t sync)
|
|
|
|
{
|
|
|
|
itx_t *itx;
|
|
|
|
lr_truncate_t *lr;
|
|
|
|
zilog_t *zilog = zv->zv_zilog;
|
|
|
|
|
|
|
|
if (zil_replaying(zilog, tx))
|
|
|
|
return;
|
|
|
|
|
|
|
|
itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
|
|
|
|
lr = (lr_truncate_t *)&itx->itx_lr;
|
|
|
|
lr->lr_foid = ZVOL_OBJ;
|
|
|
|
lr->lr_offset = off;
|
|
|
|
lr->lr_length = len;
|
|
|
|
|
|
|
|
itx->itx_sync = sync;
|
|
|
|
zil_itx_assign(zilog, itx, tx);
|
|
|
|
}
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
static void
|
|
|
|
zvol_discard(void *arg)
|
2011-09-02 17:23:12 +04:00
|
|
|
{
|
2017-02-23 03:08:04 +03:00
|
|
|
zv_request_t *zvr = arg;
|
|
|
|
struct bio *bio = zvr->bio;
|
|
|
|
zvol_state_t *zv = zvr->zv;
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
uint64_t start = BIO_BI_SECTOR(bio) << 9;
|
|
|
|
uint64_t size = BIO_BI_SIZE(bio);
|
|
|
|
uint64_t end = start + size;
|
2017-08-21 18:59:48 +03:00
|
|
|
boolean_t sync;
|
2017-02-23 03:08:04 +03:00
|
|
|
int error = 0;
|
2015-08-02 16:01:14 +03:00
|
|
|
dmu_tx_t *tx;
|
2017-02-23 03:08:04 +03:00
|
|
|
unsigned long start_jif;
|
2011-09-02 17:23:12 +04:00
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
start_jif = jiffies;
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio),
|
|
|
|
&zv->zv_disk->part0);
|
2017-02-23 03:08:04 +03:00
|
|
|
|
2017-08-21 18:59:48 +03:00
|
|
|
sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
if (end > zv->zv_volsize) {
|
|
|
|
error = SET_ERROR(EIO);
|
2017-08-21 18:59:48 +03:00
|
|
|
goto unlock;
|
2017-02-23 03:08:04 +03:00
|
|
|
}
|
2011-09-02 17:23:12 +04:00
|
|
|
|
2012-10-04 12:38:55 +04:00
|
|
|
/*
|
2016-08-09 21:22:30 +03:00
|
|
|
* Align the request to volume block boundaries when a secure erase is
|
|
|
|
* not required. This will prevent dnode_free_range() from zeroing out
|
|
|
|
* the unaligned parts which is slow (read-modify-write) and useless
|
|
|
|
* since we are not freeing any space by doing so.
|
2012-10-04 12:38:55 +04:00
|
|
|
*/
|
2016-08-09 21:22:30 +03:00
|
|
|
if (!bio_is_secure_erase(bio)) {
|
2014-10-10 19:23:23 +04:00
|
|
|
start = P2ROUNDUP(start, zv->zv_volblocksize);
|
|
|
|
end = P2ALIGN(end, zv->zv_volblocksize);
|
2015-09-18 15:32:52 +03:00
|
|
|
size = end - start;
|
2014-10-10 19:23:23 +04:00
|
|
|
}
|
2012-10-04 12:38:55 +04:00
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
if (start >= end)
|
2017-08-21 18:59:48 +03:00
|
|
|
goto unlock;
|
2011-09-02 17:23:12 +04:00
|
|
|
|
2015-08-02 16:01:14 +03:00
|
|
|
tx = dmu_tx_create(zv->zv_objset);
|
|
|
|
dmu_tx_mark_netfree(tx);
|
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error != 0) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
} else {
|
|
|
|
zvol_log_truncate(zv, tx, start, size, B_TRUE);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
error = dmu_free_long_range(zv->zv_objset,
|
|
|
|
ZVOL_OBJ, start, size);
|
|
|
|
}
|
2017-08-21 18:59:48 +03:00
|
|
|
unlock:
|
2017-02-23 03:08:04 +03:00
|
|
|
zfs_range_unlock(zvr->rl);
|
2017-08-21 18:59:48 +03:00
|
|
|
if (error == 0 && sync)
|
|
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_end_io_acct(zv->zv_queue, WRITE, &zv->zv_disk->part0,
|
|
|
|
start_jif);
|
2017-02-23 03:08:04 +03:00
|
|
|
BIO_END_IO(bio, -error);
|
|
|
|
kmem_free(zvr, sizeof (zv_request_t));
|
2011-09-02 17:23:12 +04:00
|
|
|
}
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
static void
|
|
|
|
zvol_read(void *arg)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2017-02-23 03:08:04 +03:00
|
|
|
zv_request_t *zvr = arg;
|
|
|
|
struct bio *bio = zvr->bio;
|
|
|
|
uio_t uio;
|
|
|
|
zvol_state_t *zv = zvr->zv;
|
2016-02-06 04:36:07 +03:00
|
|
|
uint64_t volsize = zv->zv_volsize;
|
|
|
|
int error = 0;
|
2017-02-23 03:08:04 +03:00
|
|
|
unsigned long start_jif;
|
|
|
|
|
|
|
|
uio_from_bio(&uio, bio);
|
2011-09-05 13:11:38 +04:00
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
start_jif = jiffies;
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_start_io_acct(zv->zv_queue, READ, bio_sectors(bio),
|
|
|
|
&zv->zv_disk->part0);
|
2017-02-23 03:08:04 +03:00
|
|
|
|
|
|
|
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
|
|
|
|
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
/* don't read past the end */
|
2017-02-23 03:08:04 +03:00
|
|
|
if (bytes > volsize - uio.uio_loffset)
|
|
|
|
bytes = volsize - uio.uio_loffset;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:18:08 +03:00
|
|
|
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
|
2016-02-06 04:36:07 +03:00
|
|
|
if (error) {
|
|
|
|
/* convert checksum errors into IO errors */
|
|
|
|
if (error == ECKSUM)
|
|
|
|
error = SET_ERROR(EIO);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2017-02-23 03:08:04 +03:00
|
|
|
zfs_range_unlock(zvr->rl);
|
|
|
|
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2017-09-16 21:00:19 +03:00
|
|
|
blk_generic_end_io_acct(zv->zv_queue, READ, &zv->zv_disk->part0,
|
|
|
|
start_jif);
|
2017-02-23 03:08:04 +03:00
|
|
|
BIO_END_IO(bio, -error);
|
|
|
|
kmem_free(zvr, sizeof (zv_request_t));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
static MAKE_REQUEST_FN_RET
|
|
|
|
zvol_request(struct request_queue *q, struct bio *bio)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv = q->queuedata;
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
fstrans_cookie_t cookie = spl_fstrans_mark();
|
2017-02-23 03:08:04 +03:00
|
|
|
uint64_t offset = BIO_BI_SECTOR(bio) << 9;
|
|
|
|
uint64_t size = BIO_BI_SIZE(bio);
|
2015-09-07 19:03:19 +03:00
|
|
|
int rw = bio_data_dir(bio);
|
2017-02-23 03:08:04 +03:00
|
|
|
zv_request_t *zvr;
|
2016-02-06 04:36:07 +03:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
printk(KERN_INFO
|
2016-02-06 04:36:07 +03:00
|
|
|
"%s: bad access: offset=%llu, size=%lu\n",
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
zv->zv_disk->disk_name,
|
2017-02-23 03:08:04 +03:00
|
|
|
(long long unsigned)offset,
|
|
|
|
(long unsigned)size);
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
BIO_END_IO(bio, -SET_ERROR(EIO));
|
|
|
|
goto out;
|
|
|
|
}
|
2015-09-07 19:03:19 +03:00
|
|
|
|
|
|
|
if (rw == WRITE) {
|
2017-08-21 18:59:48 +03:00
|
|
|
boolean_t need_sync = B_FALSE;
|
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
|
2017-02-23 03:08:04 +03:00
|
|
|
BIO_END_IO(bio, -SET_ERROR(EROFS));
|
|
|
|
goto out;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
/*
|
|
|
|
* To be released in the I/O function. See the comment on
|
|
|
|
* zfs_range_lock below.
|
|
|
|
*/
|
|
|
|
rw_enter(&zv->zv_suspend_lock, RW_READER);
|
|
|
|
|
|
|
|
/* bio marked as FLUSH need to flush before write */
|
|
|
|
if (bio_is_flush(bio))
|
|
|
|
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
|
|
|
|
|
|
|
/* Some requests are just for flush and nothing else. */
|
|
|
|
if (size == 0) {
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
|
|
|
BIO_END_IO(bio, 0);
|
|
|
|
goto out;
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
|
|
|
|
zvr->zv = zv;
|
|
|
|
zvr->bio = bio;
|
|
|
|
|
2016-02-06 04:36:07 +03:00
|
|
|
/*
|
2017-02-23 03:08:04 +03:00
|
|
|
* To be released in the I/O function. Since the I/O functions
|
|
|
|
* are asynchronous, we take it here synchronously to make
|
|
|
|
* sure overlapped I/Os are properly ordered.
|
2016-02-06 04:36:07 +03:00
|
|
|
*/
|
2017-02-23 03:08:04 +03:00
|
|
|
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
|
|
|
|
RL_WRITER);
|
2017-08-21 18:59:48 +03:00
|
|
|
/*
|
|
|
|
* Sync writes and discards execute zil_commit() which may need
|
|
|
|
* to take a RL_READER lock on the whole block being modified
|
|
|
|
* via its zillog->zl_get_data(): to avoid circular dependency
|
|
|
|
* issues with taskq threads execute these requests
|
|
|
|
* synchronously here in zvol_request().
|
|
|
|
*/
|
|
|
|
need_sync = bio_is_fua(bio) ||
|
|
|
|
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
|
2017-02-23 03:08:04 +03:00
|
|
|
if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
|
2017-08-21 18:59:48 +03:00
|
|
|
if (zvol_request_sync || need_sync ||
|
|
|
|
taskq_dispatch(zvol_taskq, zvol_discard, zvr,
|
|
|
|
TQ_SLEEP) == TASKQID_INVALID)
|
2017-02-23 03:08:04 +03:00
|
|
|
zvol_discard(zvr);
|
|
|
|
} else {
|
2017-08-21 18:59:48 +03:00
|
|
|
if (zvol_request_sync || need_sync ||
|
|
|
|
taskq_dispatch(zvol_taskq, zvol_write, zvr,
|
|
|
|
TQ_SLEEP) == TASKQID_INVALID)
|
2017-02-23 03:08:04 +03:00
|
|
|
zvol_write(zvr);
|
2016-02-06 04:36:07 +03:00
|
|
|
}
|
2017-02-23 03:08:04 +03:00
|
|
|
} else {
|
|
|
|
zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
|
|
|
|
zvr->zv = zv;
|
|
|
|
zvr->bio = bio;
|
2016-02-06 04:36:07 +03:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
rw_enter(&zv->zv_suspend_lock, RW_READER);
|
2011-09-02 17:23:12 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
|
|
|
|
RL_READER);
|
|
|
|
if (zvol_request_sync || taskq_dispatch(zvol_taskq,
|
|
|
|
zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
|
|
|
|
zvol_read(zvr);
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() provides is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
spl_fstrans_unmark(cookie);
|
|
|
|
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
|
|
|
|
return (0);
|
2015-11-24 01:47:29 +03:00
|
|
|
#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
|
|
|
|
return (BLK_QC_T_NONE);
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the generic_make_request()
layer is that we no longer call the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zvol_get_done(zgd_t *zgd, int error)
|
|
|
|
{
|
|
|
|
if (zgd->zgd_db)
|
|
|
|
dmu_buf_rele(zgd->zgd_db, zgd);
|
|
|
|
|
|
|
|
zfs_range_unlock(zgd->zgd_rl);
|
|
|
|
|
|
|
|
if (error == 0 && zgd->zgd_bp)
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
kmem_free(zgd, sizeof (zgd_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get data to generate a TX_WRITE intent log record.
|
|
|
|
*/
|
|
|
|
static int
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv = arg;
|
|
|
|
uint64_t offset = lr->lr_offset;
|
|
|
|
uint64_t size = lr->lr_length;
|
|
|
|
dmu_buf_t *db;
|
|
|
|
zgd_t *zgd;
|
|
|
|
int error;
|
|
|
|
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(lwb, !=, NULL);
|
|
|
|
ASSERT3P(zio, !=, NULL);
|
|
|
|
ASSERT3U(size, !=, 0);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently oustanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularites.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new allgorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
oustanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
zgd->zgd_lwb = lwb;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Write records come in two flavors: immediate and indirect.
|
|
|
|
* For small writes it's cheaper to store the data with the
|
|
|
|
* log record (immediate); for large writes it's cheaper to
|
|
|
|
* sync the data and get a pointer to it (indirect) so that
|
|
|
|
* we don't have to write the data twice.
|
|
|
|
*/
|
|
|
|
if (buf != NULL) { /* immediate write */
|
2017-08-21 18:59:48 +03:00
|
|
|
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
|
|
|
|
RL_READER);
|
2017-06-13 19:18:08 +03:00
|
|
|
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
|
2010-08-26 22:45:02 +04:00
|
|
|
DMU_READ_NO_PREFETCH);
|
2017-08-21 18:59:48 +03:00
|
|
|
} else { /* indirect write */
|
|
|
|
/*
|
|
|
|
* Have to lock the whole block to ensure when it's written out
|
|
|
|
* and its checksum is being calculated that no one can change
|
|
|
|
* the data. Contrarily to zfs_get_data we need not re-check
|
|
|
|
* blocksize after we get the lock because it cannot be changed.
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
size = zv->zv_volblocksize;
|
|
|
|
offset = P2ALIGN_TYPED(offset, size, uint64_t);
|
2017-08-21 18:59:48 +03:00
|
|
|
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
|
|
|
|
RL_READER);
|
2017-06-13 19:18:08 +03:00
|
|
|
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
|
2010-08-26 22:45:02 +04:00
|
|
|
DMU_READ_NO_PREFETCH);
|
|
|
|
if (error == 0) {
|
2017-04-14 22:59:18 +03:00
|
|
|
blkptr_t *bp = &lr->lr_blkptr;
|
2013-05-10 23:47:54 +04:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
zgd->zgd_db = db;
|
2017-04-14 22:59:18 +03:00
|
|
|
zgd->zgd_bp = bp;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
ASSERT(db != NULL);
|
|
|
|
ASSERT(db->db_offset == offset);
|
|
|
|
ASSERT(db->db_size == size);
|
|
|
|
|
|
|
|
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
|
|
|
zvol_get_done, zgd);
|
|
|
|
|
|
|
|
if (error == 0)
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
zvol_get_done(zgd, error);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-12-01 00:56:50 +03:00
|
|
|
* The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
static void
|
2016-12-01 00:56:50 +03:00
|
|
|
zvol_insert(zvol_state_t *zv)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&zvol_state_lock));
|
2016-12-01 00:56:50 +03:00
|
|
|
ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
|
|
|
|
list_insert_head(&zvol_state_list, zv);
|
|
|
|
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Simply remove the zvol from to list of zvols.
|
|
|
|
*/
|
|
|
|
static void
|
2016-12-01 00:56:50 +03:00
|
|
|
zvol_remove(zvol_state_t *zv)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&zvol_state_lock));
|
2016-12-01 00:56:50 +03:00
|
|
|
list_remove(&zvol_state_list, zv);
|
|
|
|
hlist_del(&zv->zv_hlink);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
/*
|
|
|
|
* Setup zv after we just own the zv->objset
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2017-01-20 00:56:36 +03:00
|
|
|
zvol_setup_zv(zvol_state_t *zv)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
uint64_t volsize;
|
|
|
|
int error;
|
|
|
|
uint64_t ro;
|
2017-01-20 00:56:36 +03:00
|
|
|
objset_t *os = zv->zv_objset;
|
2015-09-23 19:34:51 +03:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
|
|
|
|
RW_LOCK_HELD(&zv->zv_suspend_lock));
|
|
|
|
|
2015-09-23 19:34:51 +03:00
|
|
|
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
|
|
|
|
if (error)
|
2017-01-20 00:56:36 +03:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
|
2015-09-23 19:34:51 +03:00
|
|
|
if (error)
|
2017-01-20 00:56:36 +03:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:18:08 +03:00
|
|
|
error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn);
|
2015-09-23 19:34:51 +03:00
|
|
|
if (error)
|
2017-01-20 00:56:36 +03:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
set_capacity(zv->zv_disk, volsize >> 9);
|
|
|
|
zv->zv_volsize = volsize;
|
|
|
|
zv->zv_zilog = zil_open(os, zvol_get_data);
|
|
|
|
|
2013-03-03 09:57:39 +04:00
|
|
|
if (ro || dmu_objset_is_snapshot(os) ||
|
|
|
|
!spa_writeable(dmu_objset_spa(os))) {
|
2013-01-18 21:44:09 +04:00
|
|
|
set_disk_ro(zv->zv_disk, 1);
|
|
|
|
zv->zv_flags |= ZVOL_RDONLY;
|
2010-08-26 22:45:02 +04:00
|
|
|
} else {
|
2013-01-18 21:44:09 +04:00
|
|
|
set_disk_ro(zv->zv_disk, 0);
|
|
|
|
zv->zv_flags &= ~ZVOL_RDONLY;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2017-01-20 00:56:36 +03:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
/*
|
|
|
|
* Shutdown every zv_objset related stuff except zv_objset itself.
|
|
|
|
* The is the reverse of zvol_setup_zv.
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
static void
|
2017-01-20 00:56:36 +03:00
|
|
|
zvol_shutdown_zv(zvol_state_t *zv)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
|
|
|
|
RW_LOCK_HELD(&zv->zv_suspend_lock));
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
zil_close(zv->zv_zilog);
|
|
|
|
zv->zv_zilog = NULL;
|
2012-08-24 18:12:46 +04:00
|
|
|
|
2017-06-13 19:18:08 +03:00
|
|
|
dnode_rele(zv->zv_dn, FTAG);
|
|
|
|
zv->zv_dn = NULL;
|
2012-08-24 18:12:46 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Evict cached data
|
|
|
|
*/
|
|
|
|
if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
|
|
|
|
!(zv->zv_flags & ZVOL_RDONLY))
|
|
|
|
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
|
|
|
|
(void) dmu_objset_evict_dbufs(zv->zv_objset);
|
2017-01-20 00:56:36 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* return the proper tag for rollback and recv
|
|
|
|
*/
|
|
|
|
void *
|
|
|
|
zvol_tag(zvol_state_t *zv)
|
|
|
|
{
|
|
|
|
ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
|
|
|
|
return (zv->zv_open_count > 0 ? zv : NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Suspend the zvol for recv and rollback.
|
|
|
|
*/
|
|
|
|
zvol_state_t *
|
|
|
|
zvol_suspend(const char *name)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
zv = zvol_find_by_name(name, RW_WRITER);
|
|
|
|
|
|
|
|
if (zv == NULL)
|
2017-05-10 20:51:29 +03:00
|
|
|
return (NULL);
|
2012-08-24 18:12:46 +04:00
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
/* block all I/O, release in zvol_resume. */
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
|
|
|
|
RW_WRITE_HELD(&zv->zv_suspend_lock));
|
2017-01-20 00:56:36 +03:00
|
|
|
|
|
|
|
atomic_inc(&zv->zv_suspend_ref);
|
|
|
|
|
|
|
|
if (zv->zv_open_count > 0)
|
|
|
|
zvol_shutdown_zv(zv);
|
2017-05-10 20:51:29 +03:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
/*
|
|
|
|
* do not hold zv_state_lock across suspend/resume to
|
|
|
|
* avoid locking up zvol lookups
|
|
|
|
*/
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-06-13 19:03:44 +03:00
|
|
|
|
|
|
|
/* zv_suspend_lock is released in zvol_resume() */
|
2017-01-20 00:56:36 +03:00
|
|
|
return (zv);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
zvol_resume(zvol_state_t *zv)
|
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
|
2017-05-27 03:50:25 +03:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
if (zv->zv_open_count > 0) {
|
|
|
|
VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
|
|
|
|
VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
|
|
|
|
VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
|
|
|
|
dmu_objset_rele(zv->zv_objset, zv);
|
|
|
|
|
|
|
|
error = zvol_setup_zv(zv);
|
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
|
|
|
/*
|
|
|
|
* We need this because we don't hold zvol_state_lock while releasing
|
|
|
|
* zv_suspend_lock. zvol_remove_minors_impl thus cannot check
|
|
|
|
* zv_suspend_lock to determine it is safe to free because rwlock is
|
|
|
|
* not inherent atomic.
|
|
|
|
*/
|
|
|
|
atomic_dec(&zv->zv_suspend_ref);
|
|
|
|
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2018-02-21 03:27:31 +03:00
|
|
|
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
|
2017-01-20 00:56:36 +03:00
|
|
|
{
|
|
|
|
objset_t *os;
|
2017-05-11 23:40:33 +03:00
|
|
|
int error, locked = 0;
|
2018-02-21 03:27:31 +03:00
|
|
|
boolean_t ro;
|
2017-05-11 23:40:33 +03:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
|
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
|
2017-05-11 23:40:33 +03:00
|
|
|
/*
|
|
|
|
* In all other cases the spa_namespace_lock is taken before the
|
|
|
|
* bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
|
|
|
|
* function calls fops->open() with the bdev->bd_mutex lock held.
|
|
|
|
* This deadlock can be easily observed with zvols used as vdevs.
|
|
|
|
*
|
|
|
|
* To avoid a potential lock inversion deadlock we preemptively
|
|
|
|
* try to take the spa_namespace_lock(). Normally it will not
|
|
|
|
* be contended and this is safe because spa_open_common() handles
|
|
|
|
* the case where the caller already holds the spa_namespace_lock.
|
|
|
|
*
|
|
|
|
* When it is contended we risk a lock inversion if we were to
|
|
|
|
* block waiting for the lock. Luckily, the __blkdev_get()
|
|
|
|
* function allows us to return -ERESTARTSYS which will result in
|
|
|
|
* bdev->bd_mutex being dropped, reacquired, and fops->open() being
|
|
|
|
* called again. This process can be repeated safely until both
|
|
|
|
* locks are acquired.
|
|
|
|
*/
|
|
|
|
if (!mutex_owned(&spa_namespace_lock)) {
|
|
|
|
locked = mutex_tryenter(&spa_namespace_lock);
|
|
|
|
if (!locked)
|
|
|
|
return (-SET_ERROR(ERESTARTSYS));
|
|
|
|
}
|
2017-01-20 00:56:36 +03:00
|
|
|
|
2018-02-21 03:27:31 +03:00
|
|
|
ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
|
|
|
|
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
|
2017-01-20 00:56:36 +03:00
|
|
|
if (error)
|
2017-05-11 23:40:33 +03:00
|
|
|
goto out_mutex;
|
2017-01-20 00:56:36 +03:00
|
|
|
|
|
|
|
zv->zv_objset = os;
|
|
|
|
|
|
|
|
error = zvol_setup_zv(zv);
|
|
|
|
|
|
|
|
if (error) {
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_objset_disown(os, 1, zv);
|
2017-01-20 00:56:36 +03:00
|
|
|
zv->zv_objset = NULL;
|
|
|
|
}
|
|
|
|
|
2017-05-11 23:40:33 +03:00
|
|
|
out_mutex:
|
|
|
|
if (locked)
|
|
|
|
mutex_exit(&spa_namespace_lock);
|
2017-01-20 00:56:36 +03:00
|
|
|
return (SET_ERROR(-error));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zvol_last_close(zvol_state_t *zv)
|
|
|
|
{
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
|
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
zvol_shutdown_zv(zv);
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_objset_disown(zv->zv_objset, 1, zv);
|
2010-08-26 22:45:02 +04:00
|
|
|
zv->zv_objset = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zvol_open(struct block_device *bdev, fmode_t flag)
|
|
|
|
{
|
2016-02-16 22:52:55 +03:00
|
|
|
zvol_state_t *zv;
|
2017-06-13 19:03:44 +03:00
|
|
|
int error = 0;
|
2017-08-09 21:10:47 +03:00
|
|
|
boolean_t drop_suspend = B_TRUE;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-08-09 21:10:47 +03:00
|
|
|
ASSERT(!MUTEX_HELD(&zvol_state_lock));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zvol_state_lock);
|
2016-02-16 22:52:55 +03:00
|
|
|
/*
|
2017-06-13 19:03:44 +03:00
|
|
|
* Obtain a copy of private_data under the zvol_state_lock to make
|
|
|
|
* sure that either the result of zvol free code path setting
|
2016-02-16 22:52:55 +03:00
|
|
|
* bdev->bd_disk->private_data to NULL is observed, or zvol_free()
|
|
|
|
* is not called on this zv because of the positive zv_open_count.
|
|
|
|
*/
|
|
|
|
zv = bdev->bd_disk->private_data;
|
|
|
|
if (zv == NULL) {
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_exit(&zvol_state_lock);
|
|
|
|
return (SET_ERROR(-ENXIO));
|
2016-02-16 22:52:55 +03:00
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2017-06-13 19:03:44 +03:00
|
|
|
/*
|
|
|
|
* make sure zvol is not suspended during first open
|
2017-08-09 21:10:47 +03:00
|
|
|
* (hold zv_suspend_lock) and respect proper lock acquisition
|
|
|
|
* ordering - zv_suspend_lock before zv_state_lock
|
2017-06-13 19:03:44 +03:00
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
if (zv->zv_open_count == 0) {
|
2017-08-09 21:10:47 +03:00
|
|
|
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
rw_enter(&zv->zv_suspend_lock, RW_READER);
|
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
/* check to see if zv_suspend_lock is needed */
|
|
|
|
if (zv->zv_open_count != 0) {
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
|
|
|
drop_suspend = B_FALSE;
|
|
|
|
}
|
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
} else {
|
2017-08-09 21:10:47 +03:00
|
|
|
drop_suspend = B_FALSE;
|
2017-06-13 19:03:44 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zvol_state_lock);
|
2017-01-20 00:56:36 +03:00
|
|
|
|
2017-08-09 21:10:47 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
if (zv->zv_open_count == 0) {
|
2018-02-21 03:27:31 +03:00
|
|
|
error = zvol_first_open(zv, !(flag & FMODE_WRITE));
|
2010-08-26 22:45:02 +04:00
|
|
|
if (error)
|
|
|
|
goto out_mutex;
|
|
|
|
}
|
|
|
|
|
2018-02-21 03:27:31 +03:00
|
|
|
if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
|
2010-08-26 22:45:02 +04:00
|
|
|
error = -EROFS;
|
|
|
|
goto out_open_count;
|
|
|
|
}
|
|
|
|
|
|
|
|
zv->zv_open_count++;
|
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
check_disk_change(bdev);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
out_open_count:
|
|
|
|
if (zv->zv_open_count == 0)
|
|
|
|
zvol_last_close(zv);
|
|
|
|
out_mutex:
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-01-20 00:56:36 +03:00
|
|
|
if (drop_suspend)
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2017-07-06 18:38:24 +03:00
|
|
|
if (error == -ERESTARTSYS)
|
|
|
|
schedule();
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-06-03 10:58:52 +04:00
|
|
|
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
|
|
|
|
static void
|
|
|
|
#else
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2013-06-03 10:58:52 +04:00
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
zvol_release(struct gendisk *disk, fmode_t mode)
|
|
|
|
{
|
2017-05-10 20:51:29 +03:00
|
|
|
zvol_state_t *zv;
|
2017-08-09 21:10:47 +03:00
|
|
|
boolean_t drop_suspend = B_TRUE;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-08-09 21:10:47 +03:00
|
|
|
ASSERT(!MUTEX_HELD(&zvol_state_lock));
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zvol_state_lock);
|
|
|
|
zv = disk->private_data;
|
2017-06-13 19:03:44 +03:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2017-08-09 21:10:47 +03:00
|
|
|
ASSERT(zv->zv_open_count > 0);
|
2017-06-13 19:03:44 +03:00
|
|
|
/*
|
|
|
|
* make sure zvol is not suspended during last close
|
2017-08-09 21:10:47 +03:00
|
|
|
* (hold zv_suspend_lock) and respect proper lock acquisition
|
|
|
|
* ordering - zv_suspend_lock before zv_state_lock
|
2017-06-13 19:03:44 +03:00
|
|
|
*/
|
2017-08-09 21:10:47 +03:00
|
|
|
if (zv->zv_open_count == 1) {
|
|
|
|
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
rw_enter(&zv->zv_suspend_lock, RW_READER);
|
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
/* check to see if zv_suspend_lock is needed */
|
|
|
|
if (zv->zv_open_count != 1) {
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
|
|
|
drop_suspend = B_FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
drop_suspend = B_FALSE;
|
|
|
|
}
|
|
|
|
mutex_exit(&zvol_state_lock);
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
|
2017-01-20 00:56:36 +03:00
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
zv->zv_open_count--;
|
2017-06-13 19:03:44 +03:00
|
|
|
if (zv->zv_open_count == 0)
|
2016-02-16 22:52:55 +03:00
|
|
|
zvol_last_close(zv);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
if (drop_suspend)
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
|
|
|
|
2013-06-03 10:58:52 +04:00
|
|
|
#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
|
2010-08-26 22:45:02 +04:00
|
|
|
return (0);
|
2013-06-03 10:58:52 +04:00
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zvol_ioctl(struct block_device *bdev, fmode_t mode,
|
2013-12-13 01:04:40 +04:00
|
|
|
unsigned int cmd, unsigned long arg)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv = bdev->bd_disk->private_data;
|
|
|
|
int error = 0;
|
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
switch (cmd) {
|
|
|
|
case BLKFLSBUF:
|
2017-03-10 04:43:36 +03:00
|
|
|
fsync_bdev(bdev);
|
|
|
|
invalidate_bdev(bdev);
|
|
|
|
rw_enter(&zv->zv_suspend_lock, RW_READER);
|
|
|
|
|
|
|
|
if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
|
|
|
|
!(zv->zv_flags & ZVOL_RDONLY))
|
|
|
|
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
|
|
|
|
|
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
break;
|
2017-03-10 04:43:36 +03:00
|
|
|
|
2011-02-22 13:58:44 +03:00
|
|
|
case BLKZNAME:
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2011-02-22 13:58:44 +03:00
|
|
|
error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2011-02-22 13:58:44 +03:00
|
|
|
break;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
default:
|
|
|
|
error = -ENOTTY;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Compat ioctl entry point.  With CONFIG_COMPAT it forwards unchanged to
 * zvol_ioctl(); without it the name is defined as NULL.
 */
#ifndef CONFIG_COMPAT
#define	zvol_compat_ioctl	NULL
#else
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	int rc = zvol_ioctl(bdev, mode, cmd, arg);

	return (rc);
}
#endif
|
|
|
|
|
|
|
|
static int zvol_media_changed(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv = disk->private_data;
|
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zv->zv_changed);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int zvol_revalidate_disk(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv = disk->private_data;
|
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
zv->zv_changed = 0;
|
|
|
|
set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Provide a simple virtual geometry for legacy compatibility. For devices
|
|
|
|
* smaller than 1 MiB a small head and sector count is used to allow very
|
|
|
|
* tiny devices. For devices over 1 Mib a standard head and sector count
|
|
|
|
* is used to keep the cylinders count reasonable.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv = bdev->bd_disk->private_data;
|
2016-02-16 22:52:55 +03:00
|
|
|
sector_t sectors;
|
|
|
|
|
|
|
|
ASSERT(zv && zv->zv_open_count > 0);
|
|
|
|
|
|
|
|
sectors = get_capacity(zv->zv_disk);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
if (sectors > 2048) {
|
|
|
|
geo->heads = 16;
|
|
|
|
geo->sectors = 63;
|
|
|
|
} else {
|
|
|
|
geo->heads = 2;
|
|
|
|
geo->sectors = 4;
|
|
|
|
}
|
|
|
|
|
|
|
|
geo->start = 0;
|
|
|
|
geo->cylinders = sectors / (geo->heads * geo->sectors);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct kobject *
|
|
|
|
zvol_probe(dev_t dev, int *part, void *arg)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
|
|
|
struct kobject *kobj;
|
|
|
|
|
|
|
|
zv = zvol_find_by_dev(dev);
|
2018-03-05 23:44:35 +03:00
|
|
|
kobj = zv ? get_disk_and_module(zv->zv_disk) : NULL;
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
if (zv)
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (kobj);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
|
|
|
|
static struct block_device_operations zvol_ops = {
|
2013-12-13 01:04:40 +04:00
|
|
|
.open = zvol_open,
|
|
|
|
.release = zvol_release,
|
|
|
|
.ioctl = zvol_ioctl,
|
|
|
|
.compat_ioctl = zvol_compat_ioctl,
|
|
|
|
.media_changed = zvol_media_changed,
|
|
|
|
.revalidate_disk = zvol_revalidate_disk,
|
|
|
|
.getgeo = zvol_getgeo,
|
|
|
|
.owner = THIS_MODULE,
|
2010-08-26 22:45:02 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
|
|
|
|
|
|
|
|
static int
|
|
|
|
zvol_open_by_inode(struct inode *inode, struct file *file)
|
|
|
|
{
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zvol_open(inode->i_bdev, file->f_mode));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zvol_release_by_inode(struct inode *inode, struct file *file)
|
|
|
|
{
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
zvol_ioctl_by_inode(struct inode *inode, struct file *file,
|
2013-12-13 01:04:40 +04:00
|
|
|
unsigned int cmd, unsigned long arg)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2010-10-29 23:13:52 +04:00
|
|
|
if (file == NULL || inode == NULL)
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(-EINVAL));
|
|
|
|
|
|
|
|
return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
/*
 * File-based compat ioctl wrapper.  With CONFIG_COMPAT it forwards to
 * zvol_compat_ioctl() using the block device behind the file's inode and
 * returns -EINVAL for a NULL file; without it the name is defined as NULL.
 */
#ifndef CONFIG_COMPAT
#define	zvol_compat_ioctl_by_inode	NULL
#else
static long
zvol_compat_ioctl_by_inode(struct file *file,
    unsigned int cmd, unsigned long arg)
{
	struct block_device *bdev;

	if (file == NULL)
		return (SET_ERROR(-EINVAL));

	bdev = file->f_dentry->d_inode->i_bdev;
	return (zvol_compat_ioctl(bdev, file->f_mode, cmd, arg));
}
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
static struct block_device_operations zvol_ops = {
|
2013-12-13 01:04:40 +04:00
|
|
|
.open = zvol_open_by_inode,
|
|
|
|
.release = zvol_release_by_inode,
|
|
|
|
.ioctl = zvol_ioctl_by_inode,
|
|
|
|
.compat_ioctl = zvol_compat_ioctl_by_inode,
|
|
|
|
.media_changed = zvol_media_changed,
|
|
|
|
.revalidate_disk = zvol_revalidate_disk,
|
|
|
|
.getgeo = zvol_getgeo,
|
|
|
|
.owner = THIS_MODULE,
|
2010-08-26 22:45:02 +04:00
|
|
|
};
|
|
|
|
#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate memory for a new zvol_state_t and setup the required
|
|
|
|
* request queue and generic disk structures for the block device.
|
|
|
|
*/
|
|
|
|
static zvol_state_t *
|
|
|
|
zvol_alloc(dev_t dev, const char *name)
|
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
2017-07-12 23:05:37 +03:00
|
|
|
uint64_t volmode;
|
|
|
|
|
|
|
|
if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
if (volmode == ZFS_VOLMODE_DEFAULT)
|
|
|
|
volmode = zvol_volmode;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-07-31 21:07:05 +03:00
|
|
|
if (volmode == ZFS_VOLMODE_NONE)
|
|
|
|
return (NULL);
|
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-07-02 22:59:10 +04:00
|
|
|
list_link_init(&zv->zv_next);
|
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
|
2010-08-26 22:45:02 +04:00
|
|
|
if (zv->zv_queue == NULL)
|
|
|
|
goto out_kmem;
|
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
blk_queue_make_request(zv->zv_queue, zvol_request);
|
2016-08-09 21:22:30 +03:00
|
|
|
blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE);
|
2011-09-05 13:11:38 +04:00
|
|
|
|
2014-07-11 22:35:58 +04:00
|
|
|
/* Limit read-ahead to a single page to prevent over-prefetching. */
|
|
|
|
blk_queue_set_read_ahead(zv->zv_queue, 1);
|
|
|
|
|
2017-03-18 07:51:36 +03:00
|
|
|
/* Disable write merging in favor of the ZIO pipeline. */
|
2018-04-10 20:32:14 +03:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zv->zv_queue);
|
2017-03-18 07:51:36 +03:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
zv->zv_disk = alloc_disk(ZVOL_MINORS);
|
|
|
|
if (zv->zv_disk == NULL)
|
|
|
|
goto out_queue;
|
|
|
|
|
|
|
|
zv->zv_queue->queuedata = zv;
|
|
|
|
zv->zv_dev = dev;
|
|
|
|
zv->zv_open_count = 0;
|
2011-02-22 13:58:44 +03:00
|
|
|
strlcpy(zv->zv_name, name, MAXNAMELEN);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-04-12 00:53:48 +03:00
|
|
|
zfs_rlock_init(&zv->zv_range_lock);
|
2017-01-20 00:56:36 +03:00
|
|
|
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
|
2011-02-08 22:29:50 +03:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
zv->zv_disk->major = zvol_major;
|
2017-07-12 23:05:37 +03:00
|
|
|
if (volmode == ZFS_VOLMODE_DEV) {
|
|
|
|
/*
|
|
|
|
* ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
|
|
|
|
* gendisk->minors = 1 as noted in include/linux/genhd.h.
|
|
|
|
* Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
|
|
|
|
* and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
|
|
|
|
* setting gendisk->flags accordingly.
|
|
|
|
*/
|
|
|
|
zv->zv_disk->minors = 1;
|
|
|
|
#if defined(GENHD_FL_EXT_DEVT)
|
|
|
|
zv->zv_disk->flags &= ~GENHD_FL_EXT_DEVT;
|
|
|
|
#endif
|
|
|
|
#if defined(GENHD_FL_NO_PART_SCAN)
|
|
|
|
zv->zv_disk->flags |= GENHD_FL_NO_PART_SCAN;
|
|
|
|
#endif
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
zv->zv_disk->first_minor = (dev & MINORMASK);
|
|
|
|
zv->zv_disk->fops = &zvol_ops;
|
|
|
|
zv->zv_disk->private_data = zv;
|
|
|
|
zv->zv_disk->queue = zv->zv_queue;
|
2011-02-22 13:58:44 +03:00
|
|
|
snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
|
|
|
|
ZVOL_DEV_NAME, (dev & MINORMASK));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (zv);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
out_queue:
|
|
|
|
blk_cleanup_queue(zv->zv_queue);
|
|
|
|
out_kmem:
|
|
|
|
kmem_free(zv, sizeof (zvol_state_t));
|
2013-06-29 15:07:45 +04:00
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (NULL);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2017-05-10 20:51:29 +03:00
|
|
|
* Cleanup then free a zvol_state_t which was created by zvol_alloc().
|
|
|
|
* At this time, the structure is not opened by anyone, is taken off
|
|
|
|
* the zvol_state_list, and has its private data set to NULL.
|
|
|
|
* The zvol_state_lock is dropped.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
static void
|
2017-05-10 20:51:29 +03:00
|
|
|
zvol_free(void *arg)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2016-12-01 00:56:50 +03:00
|
|
|
zvol_state_t *zv = arg;
|
2017-05-10 20:51:29 +03:00
|
|
|
|
|
|
|
ASSERT(!MUTEX_HELD(&zvol_state_lock));
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
|
|
|
|
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
|
2016-02-16 22:52:55 +03:00
|
|
|
ASSERT(zv->zv_open_count == 0);
|
2017-05-10 20:51:29 +03:00
|
|
|
ASSERT(zv->zv_disk->private_data == NULL);
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2017-01-20 00:56:36 +03:00
|
|
|
rw_destroy(&zv->zv_suspend_lock);
|
2016-04-12 00:53:48 +03:00
|
|
|
zfs_rlock_destroy(&zv->zv_range_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
del_gendisk(zv->zv_disk);
|
|
|
|
blk_cleanup_queue(zv->zv_queue);
|
|
|
|
put_disk(zv->zv_disk);
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_destroy(&zv->zv_state_lock);
|
|
|
|
|
|
|
|
kmem_free(zv, sizeof (zvol_state_t));
|
2016-12-01 00:56:50 +03:00
|
|
|
}
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
/*
|
|
|
|
* Create a block device minor node and setup the linkage between it
|
|
|
|
* and the specified volume. Once this function returns the block
|
|
|
|
* device is live and ready for use.
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_create_minor_impl(const char *name)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv;
|
|
|
|
objset_t *os;
|
|
|
|
dmu_object_info_t *doi;
|
|
|
|
uint64_t volsize;
|
2015-08-18 23:51:20 +03:00
|
|
|
uint64_t len;
|
2010-08-26 22:45:02 +04:00
|
|
|
unsigned minor = 0;
|
|
|
|
int error = 0;
|
2016-12-01 00:56:50 +03:00
|
|
|
int idx;
|
|
|
|
uint64_t hash = zvol_name_hash(name);
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
if (zvol_inhibit_dev)
|
|
|
|
return (0);
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
|
|
|
|
if (idx < 0)
|
|
|
|
return (SET_ERROR(-idx));
|
|
|
|
minor = idx << ZVOL_MINOR_BITS;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
zv = zvol_find_by_name_hash(name, hash, RW_NONE);
|
2010-08-26 22:45:02 +04:00
|
|
|
if (zv) {
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2017-05-27 03:50:25 +03:00
|
|
|
ida_simple_remove(&zvol_ida, idx);
|
2017-05-10 20:51:29 +03:00
|
|
|
return (SET_ERROR(EEXIST));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2014-11-21 03:09:39 +03:00
|
|
|
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
|
2010-08-26 22:45:02 +04:00
|
|
|
if (error)
|
|
|
|
goto out_doi;
|
|
|
|
|
|
|
|
error = dmu_object_info(os, ZVOL_OBJ, doi);
|
|
|
|
if (error)
|
|
|
|
goto out_dmu_objset_disown;
|
|
|
|
|
|
|
|
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
|
|
|
|
if (error)
|
|
|
|
goto out_dmu_objset_disown;
|
|
|
|
|
|
|
|
zv = zvol_alloc(MKDEV(zvol_major, minor), name);
|
|
|
|
if (zv == NULL) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EAGAIN);
|
2010-08-26 22:45:02 +04:00
|
|
|
goto out_dmu_objset_disown;
|
|
|
|
}
|
2016-12-01 00:56:50 +03:00
|
|
|
zv->zv_hash = hash;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
if (dmu_objset_is_snapshot(os))
|
|
|
|
zv->zv_flags |= ZVOL_RDONLY;
|
|
|
|
|
|
|
|
zv->zv_volblocksize = doi->doi_data_block_size;
|
|
|
|
zv->zv_volsize = volsize;
|
|
|
|
zv->zv_objset = os;
|
|
|
|
|
|
|
|
set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
|
|
|
|
|
2015-08-28 03:01:59 +03:00
|
|
|
blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
|
2011-09-05 17:15:45 +04:00
|
|
|
blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
|
|
|
|
blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
|
|
|
|
blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
|
|
|
|
blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
|
Limit the number of blocks to discard at once.
The number of blocks that can be discarded in one BLKDISCARD ioctl on a
zvol is currently unlimited. Some applications, such as mkfs, discard
the whole volume at once and they use the maximum possible discard size
to do that. As a result, several gigabytes discard requests are not
uncommon.
Unfortunately, if a large amount of data is allocated in the zvol, ZFS
can be quite slow to process discard requests. This is especially true
if the volblocksize is low (e.g. the 8K default). As a result, very
large discard requests can take a very long time (seconds to minutes
under heavy load) to complete. This can cause a number of problems, most
notably if the zvol is accessed remotely (e.g. via iSCSI), in which case
the client has a high probability of timing out on the request.
This patch solves the issue by adding a new tunable module parameter:
zvol_max_discard_blocks. This indicates the maximum possible range, in
zvol blocks, of one discard operation. It is set by default to 16384
blocks, which appears to be a good tradeoff. Using the default
volblocksize of 8K this is equivalent to 128 MB. When using the maximum
volblocksize of 128K this is equivalent to 2 GB.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #858
2012-07-31 12:45:37 +04:00
|
|
|
blk_queue_max_discard_sectors(zv->zv_queue,
|
|
|
|
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
|
2012-08-01 12:29:59 +04:00
|
|
|
blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
|
2018-04-10 20:32:14 +03:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_queue);
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
#ifdef QUEUE_FLAG_NONROT
|
2018-04-10 20:32:14 +03:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_queue);
|
2011-09-05 17:15:45 +04:00
|
|
|
#endif
|
2015-08-29 19:49:55 +03:00
|
|
|
#ifdef QUEUE_FLAG_ADD_RANDOM
|
2018-04-10 20:32:14 +03:00
|
|
|
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
|
2015-08-29 19:49:55 +03:00
|
|
|
#endif
|
2011-09-05 17:15:45 +04:00
|
|
|
|
2013-03-03 09:57:39 +04:00
|
|
|
if (spa_writeable(dmu_objset_spa(os))) {
|
|
|
|
if (zil_replay_disable)
|
|
|
|
zil_destroy(dmu_objset_zil(os), B_FALSE);
|
|
|
|
else
|
|
|
|
zil_replay(os, zv, zvol_replay_vector);
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2015-08-18 23:51:20 +03:00
|
|
|
/*
|
|
|
|
* When udev detects the addition of the device it will immediately
|
|
|
|
* invoke blkid(8) to determine the type of content on the device.
|
|
|
|
* Prefetching the blocks commonly scanned by blkid(8) will speed
|
|
|
|
* up this process.
|
|
|
|
*/
|
|
|
|
len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
|
|
|
|
if (len > 0) {
|
2015-12-22 04:31:57 +03:00
|
|
|
dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
|
|
|
|
dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
|
2016-12-12 21:46:26 +03:00
|
|
|
ZIO_PRIORITY_SYNC_READ);
|
2015-08-18 23:51:20 +03:00
|
|
|
}
|
|
|
|
|
2012-11-28 02:02:49 +04:00
|
|
|
zv->zv_objset = NULL;
|
2010-08-26 22:45:02 +04:00
|
|
|
out_dmu_objset_disown:
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_objset_disown(os, B_TRUE, FTAG);
|
2010-08-26 22:45:02 +04:00
|
|
|
out_doi:
|
2013-12-13 01:04:40 +04:00
|
|
|
kmem_free(doi, sizeof (dmu_object_info_t));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
if (error == 0) {
|
2017-05-10 20:51:29 +03:00
|
|
|
mutex_enter(&zvol_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
zvol_insert(zv);
|
2016-02-16 22:52:55 +03:00
|
|
|
mutex_exit(&zvol_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
add_disk(zv->zv_disk);
|
2014-03-22 13:07:14 +04:00
|
|
|
} else {
|
2016-12-01 00:56:50 +03:00
|
|
|
ida_simple_remove(&zvol_ida, idx);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-12-07 02:20:22 +04:00
|
|
|
/*
|
|
|
|
* Rename a block device minor mode for the specified volume.
|
|
|
|
*/
|
|
|
|
static void
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_rename_minor(zvol_state_t *zv, const char *newname)
|
2013-12-07 02:20:22 +04:00
|
|
|
{
|
|
|
|
int readonly = get_disk_ro(zv->zv_disk);
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&zvol_state_lock));
|
2017-06-13 19:03:44 +03:00
|
|
|
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
2013-12-07 02:20:22 +04:00
|
|
|
|
|
|
|
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
|
2017-01-20 00:56:36 +03:00
|
|
|
|
|
|
|
/* move to new hashtable entry */
|
|
|
|
zv->zv_hash = zvol_name_hash(zv->zv_name);
|
|
|
|
hlist_del(&zv->zv_hlink);
|
|
|
|
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
|
2013-12-07 02:20:22 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The block device's read-only state is briefly changed causing
|
|
|
|
* a KOBJ_CHANGE uevent to be issued. This ensures udev detects
|
|
|
|
* the name change and fixes the symlinks. This does not change
|
|
|
|
* ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
|
|
|
|
* changes. This would normally be done using kobject_uevent() but
|
|
|
|
* that is a GPL-only symbol which is why we need this workaround.
|
|
|
|
*/
|
|
|
|
set_disk_ro(zv->zv_disk, !readonly);
|
|
|
|
set_disk_ro(zv->zv_disk, readonly);
|
|
|
|
}
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
typedef struct minors_job {
|
|
|
|
list_t *list;
|
|
|
|
list_node_t link;
|
|
|
|
/* input */
|
|
|
|
char *name;
|
|
|
|
/* output */
|
|
|
|
int error;
|
|
|
|
} minors_job_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prefetch zvol dnodes for the minors_job
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zvol_prefetch_minors_impl(void *arg)
|
|
|
|
{
|
|
|
|
minors_job_t *job = arg;
|
|
|
|
char *dsname = job->name;
|
|
|
|
objset_t *os = NULL;
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
|
|
|
|
FTAG, &os);
|
2016-12-01 00:56:50 +03:00
|
|
|
if (job->error == 0) {
|
|
|
|
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
dmu_objset_disown(os, B_TRUE, FTAG);
|
2016-12-01 00:56:50 +03:00
|
|
|
}
|
|
|
|
}
|
2014-03-22 13:07:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Mask errors to continue dmu_objset_find() traversal
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zvol_create_snap_minor_cb(const char *dsname, void *arg)
|
|
|
|
{
|
2016-12-01 00:56:50 +03:00
|
|
|
minors_job_t *j = arg;
|
|
|
|
list_t *minors_list = j->list;
|
|
|
|
const char *name = j->name;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
2015-09-23 19:34:51 +03:00
|
|
|
ASSERT0(MUTEX_HELD(&spa_namespace_lock));
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
/* skip the designated dataset */
|
|
|
|
if (name && strcmp(dsname, name) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/* at this point, the dsname should name a snapshot */
|
|
|
|
if (strchr(dsname, '@') == 0) {
|
|
|
|
dprintf("zvol_create_snap_minor_cb(): "
|
2016-12-12 21:46:26 +03:00
|
|
|
"%s is not a shapshot name\n", dsname);
|
2014-03-22 13:07:14 +04:00
|
|
|
} else {
|
2016-12-01 00:56:50 +03:00
|
|
|
minors_job_t *job;
|
|
|
|
char *n = strdup(dsname);
|
|
|
|
if (n == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
|
|
|
|
job->name = n;
|
|
|
|
job->list = minors_list;
|
|
|
|
job->error = 0;
|
|
|
|
list_insert_tail(minors_list, job);
|
|
|
|
/* don't care if dispatch fails, because job->error is 0 */
|
|
|
|
taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
|
|
|
|
TQ_SLEEP);
|
2014-03-22 13:07:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mask errors to continue dmu_objset_find() traversal
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2013-09-04 16:00:57 +04:00
|
|
|
zvol_create_minors_cb(const char *dsname, void *arg)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2014-03-22 13:07:14 +04:00
|
|
|
uint64_t snapdev;
|
|
|
|
int error;
|
2016-12-01 00:56:50 +03:00
|
|
|
list_t *minors_list = arg;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
2015-09-23 19:34:51 +03:00
|
|
|
ASSERT0(MUTEX_HELD(&spa_namespace_lock));
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
|
|
|
|
if (error)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given the name and the 'snapdev' property, create device minor nodes
|
|
|
|
* with the linkages to zvols/snapshots as needed.
|
|
|
|
* If the name represents a zvol, create a minor node for the zvol, then
|
|
|
|
* check if its snapshots are 'visible', and if so, iterate over the
|
|
|
|
* snapshots and create device minor nodes for those.
|
|
|
|
*/
|
|
|
|
if (strchr(dsname, '@') == 0) {
|
2016-12-01 00:56:50 +03:00
|
|
|
minors_job_t *job;
|
|
|
|
char *n = strdup(dsname);
|
|
|
|
if (n == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
|
|
|
|
job->name = n;
|
|
|
|
job->list = minors_list;
|
|
|
|
job->error = 0;
|
|
|
|
list_insert_tail(minors_list, job);
|
|
|
|
/* don't care if dispatch fails, because job->error is 0 */
|
|
|
|
taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
|
|
|
|
TQ_SLEEP);
|
|
|
|
|
|
|
|
if (snapdev == ZFS_SNAPDEV_VISIBLE) {
|
2014-03-22 13:07:14 +04:00
|
|
|
/*
|
|
|
|
* traverse snapshots only, do not traverse children,
|
|
|
|
* and skip the 'dsname'
|
|
|
|
*/
|
|
|
|
error = dmu_objset_find((char *)dsname,
|
2016-12-01 00:56:50 +03:00
|
|
|
zvol_create_snap_minor_cb, (void *)job,
|
2014-03-22 13:07:14 +04:00
|
|
|
DS_FIND_SNAPSHOTS);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
|
2016-12-12 21:46:26 +03:00
|
|
|
dsname);
|
2014-03-22 13:07:14 +04:00
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2011-02-16 20:40:29 +03:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-03-22 13:07:14 +04:00
|
|
|
* Create minors for the specified dataset, including children and snapshots.
|
|
|
|
* Pay attention to the 'snapdev' property and iterate over the snapshots
|
|
|
|
* only if they are 'visible'. This approach allows one to assure that the
|
|
|
|
* snapshot metadata is read from disk only if it is needed.
|
|
|
|
*
|
|
|
|
* The name can represent a dataset to be recursively scanned for zvols and
|
|
|
|
* their snapshots, or a single zvol snapshot. If the name represents a
|
|
|
|
* dataset, the scan is performed in two nested stages:
|
|
|
|
* - scan the dataset for zvols, and
|
|
|
|
* - for each zvol, create a minor node, then check if the zvol's snapshots
|
|
|
|
* are 'visible', and only then iterate over the snapshots if needed
|
|
|
|
*
|
2017-01-03 20:31:18 +03:00
|
|
|
* If the name represents a snapshot, a check is performed if the snapshot is
|
2014-03-22 13:07:14 +04:00
|
|
|
* 'visible' (which also verifies that the parent is a zvol), and if so,
|
|
|
|
* a minor node for that snapshot is created.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
2014-03-22 13:07:14 +04:00
|
|
|
static int
|
|
|
|
zvol_create_minors_impl(const char *name)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
int error = 0;
|
2016-02-16 22:52:55 +03:00
|
|
|
fstrans_cookie_t cookie;
|
2014-03-22 13:07:14 +04:00
|
|
|
char *atp, *parent;
|
2016-12-01 00:56:50 +03:00
|
|
|
list_t minors_list;
|
|
|
|
minors_job_t *job;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
if (zvol_inhibit_dev)
|
|
|
|
return (0);
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
/*
|
|
|
|
* This is the list for prefetch jobs. Whenever we found a match
|
|
|
|
* during dmu_objset_find, we insert a minors_job to the list and do
|
|
|
|
* taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
|
|
|
|
* any lock because all list operation is done on the current thread.
|
|
|
|
*
|
|
|
|
* We will use this list to do zvol_create_minor_impl after prefetch
|
|
|
|
* so we don't have to traverse using dmu_objset_find again.
|
|
|
|
*/
|
|
|
|
list_create(&minors_list, sizeof (minors_job_t),
|
|
|
|
offsetof(minors_job_t, link));
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
|
|
|
|
(void) strlcpy(parent, name, MAXPATHLEN);
|
|
|
|
|
|
|
|
if ((atp = strrchr(parent, '@')) != NULL) {
|
|
|
|
uint64_t snapdev;
|
|
|
|
|
|
|
|
*atp = '\0';
|
|
|
|
error = dsl_prop_get_integer(parent, "snapdev",
|
|
|
|
&snapdev, NULL);
|
|
|
|
|
|
|
|
if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
|
|
|
|
error = zvol_create_minor_impl(name);
|
|
|
|
} else {
|
|
|
|
cookie = spl_fstrans_mark();
|
|
|
|
error = dmu_objset_find(parent, zvol_create_minors_cb,
|
2016-12-01 00:56:50 +03:00
|
|
|
&minors_list, DS_FIND_CHILDREN);
|
2014-03-22 13:07:14 +04:00
|
|
|
spl_fstrans_unmark(cookie);
|
|
|
|
}
|
|
|
|
|
|
|
|
kmem_free(parent, MAXPATHLEN);
|
2016-12-01 00:56:50 +03:00
|
|
|
taskq_wait_outstanding(system_taskq, 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prefetch is completed, we can do zvol_create_minor_impl
|
|
|
|
* sequentially.
|
|
|
|
*/
|
|
|
|
while ((job = list_head(&minors_list)) != NULL) {
|
|
|
|
list_remove(&minors_list, job);
|
|
|
|
if (!job->error)
|
|
|
|
zvol_create_minor_impl(job->name);
|
|
|
|
strfree(job->name);
|
|
|
|
kmem_free(job, sizeof (minors_job_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
list_destroy(&minors_list);
|
2013-12-07 02:20:22 +04:00
|
|
|
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove minors for specified dataset including children and snapshots.
|
|
|
|
*/
|
2014-03-22 13:07:14 +04:00
|
|
|
static void
|
|
|
|
zvol_remove_minors_impl(const char *name)
|
2013-12-07 02:20:22 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv, *zv_next;
|
|
|
|
int namelen = ((name) ? strlen(name) : 0);
|
2016-12-01 00:56:50 +03:00
|
|
|
taskqid_t t, tid = TASKQID_INVALID;
|
2017-05-10 20:51:29 +03:00
|
|
|
list_t free_list;
|
2013-12-07 02:20:22 +04:00
|
|
|
|
2012-06-02 05:49:10 +04:00
|
|
|
if (zvol_inhibit_dev)
|
2013-12-07 02:20:22 +04:00
|
|
|
return;
|
2012-06-02 05:49:10 +04:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
list_create(&free_list, sizeof (zvol_state_t),
|
|
|
|
offsetof(zvol_state_t, zv_next));
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
mutex_enter(&zvol_state_lock);
|
2013-12-07 02:20:22 +04:00
|
|
|
|
|
|
|
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
|
|
|
|
zv_next = list_next(&zvol_state_list, zv);
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2013-12-07 02:20:22 +04:00
|
|
|
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
|
|
|
|
(strncmp(zv->zv_name, name, namelen) == 0 &&
|
2016-02-16 22:52:55 +03:00
|
|
|
(zv->zv_name[namelen] == '/' ||
|
|
|
|
zv->zv_name[namelen] == '@'))) {
|
2017-05-10 20:51:29 +03:00
|
|
|
/*
|
2017-06-13 19:03:44 +03:00
|
|
|
* By holding zv_state_lock here, we guarantee that no
|
2017-05-10 20:51:29 +03:00
|
|
|
* one is currently using this zv
|
|
|
|
*/
|
2017-06-15 21:08:45 +03:00
|
|
|
|
|
|
|
/* If in use, leave alone */
|
|
|
|
if (zv->zv_open_count > 0 ||
|
|
|
|
atomic_read(&zv->zv_suspend_ref)) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-12-07 02:20:22 +04:00
|
|
|
zvol_remove(zv);
|
2016-12-01 00:56:50 +03:00
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
/*
|
|
|
|
* clear this while holding zvol_state_lock so
|
|
|
|
* zvol_open won't open it
|
|
|
|
*/
|
2016-12-01 00:56:50 +03:00
|
|
|
zv->zv_disk->private_data = NULL;
|
|
|
|
|
2017-06-15 21:08:45 +03:00
|
|
|
/* Drop zv_state_lock before zvol_free() */
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
/* try parallel zv_free, if failed do it in place */
|
2017-05-10 20:51:29 +03:00
|
|
|
t = taskq_dispatch(system_taskq, zvol_free, zv,
|
2016-12-01 00:56:50 +03:00
|
|
|
TQ_SLEEP);
|
|
|
|
if (t == TASKQID_INVALID)
|
2017-05-10 20:51:29 +03:00
|
|
|
list_insert_head(&free_list, zv);
|
2016-12-01 00:56:50 +03:00
|
|
|
else
|
|
|
|
tid = t;
|
2017-06-13 19:03:44 +03:00
|
|
|
} else {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
}
|
2013-12-07 02:20:22 +04:00
|
|
|
mutex_exit(&zvol_state_lock);
|
2017-05-10 20:51:29 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop zvol_state_lock before calling zvol_free()
|
|
|
|
*/
|
|
|
|
while ((zv = list_head(&free_list)) != NULL) {
|
|
|
|
list_remove(&free_list, zv);
|
|
|
|
zvol_free(zv);
|
|
|
|
}
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
if (tid != TASKQID_INVALID)
|
|
|
|
taskq_wait_outstanding(system_taskq, tid);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
/* Remove minor for this specific volume only */
|
2014-03-22 13:07:14 +04:00
|
|
|
static void
|
|
|
|
zvol_remove_minor_impl(const char *name)
|
|
|
|
{
|
2017-06-02 17:17:00 +03:00
|
|
|
zvol_state_t *zv = NULL, *zv_next;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
|
|
|
if (zvol_inhibit_dev)
|
|
|
|
return;
|
|
|
|
|
|
|
|
mutex_enter(&zvol_state_lock);
|
|
|
|
|
|
|
|
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
|
|
|
|
zv_next = list_next(&zvol_state_list, zv);
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
2014-03-22 13:07:14 +04:00
|
|
|
if (strcmp(zv->zv_name, name) == 0) {
|
2017-05-10 20:51:29 +03:00
|
|
|
/*
|
2017-06-13 19:03:44 +03:00
|
|
|
* By holding zv_state_lock here, we guarantee that no
|
2017-05-10 20:51:29 +03:00
|
|
|
* one is currently using this zv
|
|
|
|
*/
|
2017-06-15 21:08:45 +03:00
|
|
|
|
|
|
|
/* If in use, leave alone */
|
|
|
|
if (zv->zv_open_count > 0 ||
|
|
|
|
atomic_read(&zv->zv_suspend_ref)) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
|
|
|
continue;
|
|
|
|
}
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_remove(zv);
|
2017-06-15 21:08:45 +03:00
|
|
|
|
2017-05-10 20:51:29 +03:00
|
|
|
/* clear this so zvol_open won't open it */
|
|
|
|
zv->zv_disk->private_data = NULL;
|
2017-06-15 21:08:45 +03:00
|
|
|
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2014-03-22 13:07:14 +04:00
|
|
|
break;
|
2017-06-13 19:03:44 +03:00
|
|
|
} else {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2014-03-22 13:07:14 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-02 17:17:00 +03:00
|
|
|
/* Drop zvol_state_lock before calling zvol_free() */
|
2014-03-22 13:07:14 +04:00
|
|
|
mutex_exit(&zvol_state_lock);
|
2017-05-10 20:51:29 +03:00
|
|
|
|
2017-06-02 17:17:00 +03:00
|
|
|
if (zv != NULL)
|
|
|
|
zvol_free(zv);
|
2014-03-22 13:07:14 +04:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
|
2013-12-07 02:20:22 +04:00
|
|
|
* Rename minors for specified dataset including children and snapshots.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
2014-03-22 13:07:14 +04:00
|
|
|
static void
|
|
|
|
zvol_rename_minors_impl(const char *oldname, const char *newname)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zvol_state_t *zv, *zv_next;
|
2013-12-07 02:20:22 +04:00
|
|
|
int oldnamelen, newnamelen;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2012-06-02 05:49:10 +04:00
|
|
|
if (zvol_inhibit_dev)
|
|
|
|
return;
|
|
|
|
|
2013-12-07 02:20:22 +04:00
|
|
|
oldnamelen = strlen(oldname);
|
|
|
|
newnamelen = strlen(newname);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
mutex_enter(&zvol_state_lock);
|
2013-12-07 02:20:22 +04:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
|
|
|
|
zv_next = list_next(&zvol_state_list, zv);
|
|
|
|
|
2017-06-13 19:03:44 +03:00
|
|
|
mutex_enter(&zv->zv_state_lock);
|
|
|
|
|
2016-02-16 22:52:55 +03:00
|
|
|
/* If in use, leave alone */
|
2017-06-13 19:03:44 +03:00
|
|
|
if (zv->zv_open_count > 0) {
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2016-02-16 22:52:55 +03:00
|
|
|
continue;
|
2017-06-13 19:03:44 +03:00
|
|
|
}
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2013-12-07 02:20:22 +04:00
|
|
|
if (strcmp(zv->zv_name, oldname) == 0) {
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_rename_minor(zv, newname);
|
2013-12-07 02:20:22 +04:00
|
|
|
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
|
|
|
|
(zv->zv_name[oldnamelen] == '/' ||
|
|
|
|
zv->zv_name[oldnamelen] == '@')) {
|
2017-06-28 20:05:16 +03:00
|
|
|
char *name = kmem_asprintf("%s%c%s", newname,
|
2013-12-07 02:20:22 +04:00
|
|
|
zv->zv_name[oldnamelen],
|
|
|
|
zv->zv_name + oldnamelen + 1);
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_rename_minor(zv, name);
|
2017-06-28 20:05:16 +03:00
|
|
|
kmem_free(name, strlen(name + 1));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2017-06-13 19:03:44 +03:00
|
|
|
|
|
|
|
mutex_exit(&zv->zv_state_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
2013-12-07 02:20:22 +04:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
mutex_exit(&zvol_state_lock);
|
|
|
|
}
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
/*
 * Argument handed to zvol_set_snapdev_cb() through dmu_objset_find().
 */
typedef struct zvol_snapdev_cb_arg {
	/* Compared against ZFS_SNAPDEV_VISIBLE / ZFS_SNAPDEV_HIDDEN. */
	uint64_t	snapdev;
} zvol_snapdev_cb_arg_t;
|
|
|
|
|
2013-02-14 03:11:59 +04:00
|
|
|
static int
|
2017-01-21 00:17:55 +03:00
|
|
|
zvol_set_snapdev_cb(const char *dsname, void *param)
|
|
|
|
{
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_snapdev_cb_arg_t *arg = param;
|
2013-02-14 03:11:59 +04:00
|
|
|
|
|
|
|
if (strchr(dsname, '@') == NULL)
|
2013-12-07 02:20:22 +04:00
|
|
|
return (0);
|
2013-02-14 03:11:59 +04:00
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
switch (arg->snapdev) {
|
2013-02-14 03:11:59 +04:00
|
|
|
case ZFS_SNAPDEV_VISIBLE:
|
2014-03-22 13:07:14 +04:00
|
|
|
(void) zvol_create_minor_impl(dsname);
|
2013-02-14 03:11:59 +04:00
|
|
|
break;
|
|
|
|
case ZFS_SNAPDEV_HIDDEN:
|
2014-03-22 13:07:14 +04:00
|
|
|
(void) zvol_remove_minor_impl(dsname);
|
2013-02-14 03:11:59 +04:00
|
|
|
break;
|
|
|
|
}
|
2013-12-07 02:20:22 +04:00
|
|
|
|
|
|
|
return (0);
|
2013-02-14 03:11:59 +04:00
|
|
|
}
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
static void
|
|
|
|
zvol_set_snapdev_impl(char *name, uint64_t snapdev)
|
|
|
|
{
|
|
|
|
zvol_snapdev_cb_arg_t arg = {snapdev};
|
|
|
|
fstrans_cookie_t cookie = spl_fstrans_mark();
|
|
|
|
/*
|
|
|
|
* The zvol_set_snapdev_sync() sets snapdev appropriately
|
|
|
|
* in the dataset hierarchy. Here, we only scan snapshots.
|
|
|
|
*/
|
|
|
|
dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
|
|
|
|
spl_fstrans_unmark(cookie);
|
|
|
|
}
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
/*
 * Container for a single volmode property value.
 */
typedef struct zvol_volmode_cb_arg {
	/* A volmode value. */
	uint64_t	volmode;
} zvol_volmode_cb_arg_t;
|
|
|
|
|
|
|
|
static void
|
|
|
|
zvol_set_volmode_impl(char *name, uint64_t volmode)
|
|
|
|
{
|
|
|
|
fstrans_cookie_t cookie = spl_fstrans_mark();
|
|
|
|
|
|
|
|
if (strchr(name, '@') != NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It's unfortunate we need to remove minors before we create new ones:
|
|
|
|
* this is necessary because our backing gendisk (zvol_state->zv_disk)
|
|
|
|
* coule be different when we set, for instance, volmode from "geom"
|
|
|
|
* to "dev" (or vice versa).
|
|
|
|
* A possible optimization is to modify our consumers so we don't get
|
|
|
|
* called when "volmode" does not change.
|
|
|
|
*/
|
|
|
|
switch (volmode) {
|
|
|
|
case ZFS_VOLMODE_NONE:
|
|
|
|
(void) zvol_remove_minor_impl(name);
|
|
|
|
break;
|
|
|
|
case ZFS_VOLMODE_GEOM:
|
|
|
|
case ZFS_VOLMODE_DEV:
|
|
|
|
(void) zvol_remove_minor_impl(name);
|
|
|
|
(void) zvol_create_minor_impl(name);
|
|
|
|
break;
|
|
|
|
case ZFS_VOLMODE_DEFAULT:
|
|
|
|
(void) zvol_remove_minor_impl(name);
|
|
|
|
if (zvol_volmode == ZFS_VOLMODE_NONE)
|
|
|
|
break;
|
|
|
|
else /* if zvol_volmode is invalid defaults to "geom" */
|
|
|
|
(void) zvol_create_minor_impl(name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
spl_fstrans_unmark(cookie);
|
|
|
|
}
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
static zvol_task_t *
|
|
|
|
zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
|
2017-07-12 23:05:37 +03:00
|
|
|
uint64_t value)
|
2014-03-22 13:07:14 +04:00
|
|
|
{
|
|
|
|
zvol_task_t *task;
|
|
|
|
char *delim;
|
|
|
|
|
|
|
|
/* Never allow tasks on hidden names. */
|
|
|
|
if (name1[0] == '$')
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
|
|
|
|
task->op = op;
|
2017-07-12 23:05:37 +03:00
|
|
|
task->value = value;
|
2014-03-22 13:07:14 +04:00
|
|
|
delim = strchr(name1, '/');
|
|
|
|
strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
|
|
|
|
|
|
|
|
strlcpy(task->name1, name1, MAXNAMELEN);
|
|
|
|
if (name2 != NULL)
|
|
|
|
strlcpy(task->name2, name2, MAXNAMELEN);
|
|
|
|
|
|
|
|
return (task);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zvol_task_free(zvol_task_t *task)
|
|
|
|
{
|
|
|
|
kmem_free(task, sizeof (zvol_task_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The worker thread function performed asynchronously.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zvol_task_cb(void *param)
|
|
|
|
{
|
|
|
|
zvol_task_t *task = (zvol_task_t *)param;
|
|
|
|
|
|
|
|
switch (task->op) {
|
|
|
|
case ZVOL_ASYNC_CREATE_MINORS:
|
|
|
|
(void) zvol_create_minors_impl(task->name1);
|
|
|
|
break;
|
|
|
|
case ZVOL_ASYNC_REMOVE_MINORS:
|
|
|
|
zvol_remove_minors_impl(task->name1);
|
|
|
|
break;
|
|
|
|
case ZVOL_ASYNC_RENAME_MINORS:
|
|
|
|
zvol_rename_minors_impl(task->name1, task->name2);
|
|
|
|
break;
|
|
|
|
case ZVOL_ASYNC_SET_SNAPDEV:
|
2017-07-12 23:05:37 +03:00
|
|
|
zvol_set_snapdev_impl(task->name1, task->value);
|
|
|
|
break;
|
|
|
|
case ZVOL_ASYNC_SET_VOLMODE:
|
|
|
|
zvol_set_volmode_impl(task->name1, task->value);
|
2014-03-22 13:07:14 +04:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
VERIFY(0);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
zvol_task_free(task);
|
|
|
|
}
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
typedef struct zvol_set_prop_int_arg {
|
2014-03-22 13:07:14 +04:00
|
|
|
const char *zsda_name;
|
|
|
|
uint64_t zsda_value;
|
|
|
|
zprop_source_t zsda_source;
|
|
|
|
dmu_tx_t *zsda_tx;
|
2017-07-12 23:05:37 +03:00
|
|
|
} zvol_set_prop_int_arg_t;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanity check the dataset for safe use by the sync task. No additional
|
|
|
|
* conditions are imposed.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
|
|
|
|
{
|
2017-07-12 23:05:37 +03:00
|
|
|
zvol_set_prop_int_arg_t *zsda = arg;
|
2014-03-22 13:07:14 +04:00
|
|
|
dsl_pool_t *dp = dmu_tx_pool(tx);
|
|
|
|
dsl_dir_t *dd;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
dsl_dir_rele(dd, FTAG);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2017-06-02 17:17:00 +03:00
|
|
|
/* ARGSUSED */
|
2014-03-22 13:07:14 +04:00
|
|
|
static int
|
|
|
|
zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
|
|
|
|
{
|
|
|
|
char dsname[MAXNAMELEN];
|
|
|
|
zvol_task_t *task;
|
2017-06-02 17:17:00 +03:00
|
|
|
uint64_t snapdev;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
|
|
|
dsl_dataset_name(ds, dsname);
|
2017-06-02 17:17:00 +03:00
|
|
|
if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
|
|
|
|
return (0);
|
|
|
|
task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
|
2014-03-22 13:07:14 +04:00
|
|
|
if (task == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
|
2016-12-12 21:46:26 +03:00
|
|
|
task, TQ_SLEEP);
|
2014-03-22 13:07:14 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2017-06-02 17:17:00 +03:00
|
|
|
* Traverse all child datasets and apply snapdev appropriately.
|
|
|
|
* We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
|
|
|
|
* dataset and read the effective "snapdev" on every child in the callback
|
|
|
|
* function: this is because the value is not guaranteed to be the same in the
|
|
|
|
* whole dataset hierarchy.
|
2014-03-22 13:07:14 +04:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
|
|
|
|
{
|
2017-07-12 23:05:37 +03:00
|
|
|
zvol_set_prop_int_arg_t *zsda = arg;
|
2014-03-22 13:07:14 +04:00
|
|
|
dsl_pool_t *dp = dmu_tx_pool(tx);
|
|
|
|
dsl_dir_t *dd;
|
2017-06-02 17:17:00 +03:00
|
|
|
dsl_dataset_t *ds;
|
|
|
|
int error;
|
2014-03-22 13:07:14 +04:00
|
|
|
|
|
|
|
VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
|
|
|
|
zsda->zsda_tx = tx;
|
|
|
|
|
2017-06-02 17:17:00 +03:00
|
|
|
error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
|
|
|
|
if (error == 0) {
|
|
|
|
dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
|
|
|
|
zsda->zsda_source, sizeof (zsda->zsda_value), 1,
|
|
|
|
&zsda->zsda_value, zsda->zsda_tx);
|
|
|
|
dsl_dataset_rele(ds, FTAG);
|
|
|
|
}
|
2014-03-22 13:07:14 +04:00
|
|
|
dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
|
|
|
|
zsda, DS_FIND_CHILDREN);
|
|
|
|
|
|
|
|
dsl_dir_rele(dd, FTAG);
|
|
|
|
}
|
|
|
|
|
2013-02-14 03:11:59 +04:00
|
|
|
int
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
|
|
|
|
{
|
2017-07-12 23:05:37 +03:00
|
|
|
zvol_set_prop_int_arg_t zsda;
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
zsda.zsda_name = ddname;
|
|
|
|
zsda.zsda_source = source;
|
|
|
|
zsda.zsda_value = snapdev;
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
return (dsl_sync_task(ddname, zvol_set_snapdev_check,
|
|
|
|
zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
|
|
|
|
}
|
|
|
|
|
2017-07-12 23:05:37 +03:00
|
|
|
/*
|
|
|
|
* Sanity check the dataset for safe use by the sync task. No additional
|
|
|
|
* conditions are imposed.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
zvol_set_prop_int_arg_t *zsda = arg;
|
|
|
|
dsl_pool_t *dp = dmu_tx_pool(tx);
|
|
|
|
dsl_dir_t *dd;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
dsl_dir_rele(dd, FTAG);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
|
|
|
|
{
|
|
|
|
char dsname[MAXNAMELEN];
|
|
|
|
zvol_task_t *task;
|
|
|
|
uint64_t volmode;
|
|
|
|
|
|
|
|
dsl_dataset_name(ds, dsname);
|
|
|
|
if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
|
|
|
|
return (0);
|
|
|
|
task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
|
|
|
|
if (task == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
|
|
|
|
task, TQ_SLEEP);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Traverse all child datasets and apply volmode appropriately.
|
|
|
|
* We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
|
|
|
|
* dataset and read the effective "volmode" on every child in the callback
|
|
|
|
* function: this is because the value is not guaranteed to be the same in the
|
|
|
|
* whole dataset hierarchy.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
zvol_set_prop_int_arg_t *zsda = arg;
|
|
|
|
dsl_pool_t *dp = dmu_tx_pool(tx);
|
|
|
|
dsl_dir_t *dd;
|
|
|
|
dsl_dataset_t *ds;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
|
|
|
|
zsda->zsda_tx = tx;
|
|
|
|
|
|
|
|
error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
|
|
|
|
if (error == 0) {
|
|
|
|
dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
|
|
|
|
zsda->zsda_source, sizeof (zsda->zsda_value), 1,
|
|
|
|
&zsda->zsda_value, zsda->zsda_tx);
|
|
|
|
dsl_dataset_rele(ds, FTAG);
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
|
|
|
|
zsda, DS_FIND_CHILDREN);
|
|
|
|
|
|
|
|
dsl_dir_rele(dd, FTAG);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
|
|
|
|
{
|
|
|
|
zvol_set_prop_int_arg_t zsda;
|
|
|
|
|
|
|
|
zsda.zsda_name = ddname;
|
|
|
|
zsda.zsda_source = source;
|
|
|
|
zsda.zsda_value = volmode;
|
|
|
|
|
|
|
|
return (dsl_sync_task(ddname, zvol_set_volmode_check,
|
|
|
|
zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
|
|
|
|
}
|
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
void
|
|
|
|
zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
|
|
|
|
{
|
|
|
|
zvol_task_t *task;
|
|
|
|
taskqid_t id;
|
|
|
|
|
|
|
|
task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL);
|
|
|
|
if (task == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
|
2016-10-29 01:40:14 +03:00
|
|
|
if ((async == B_FALSE) && (id != TASKQID_INVALID))
|
2014-03-22 13:07:14 +04:00
|
|
|
taskq_wait_id(spa->spa_zvol_taskq, id);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
|
|
|
|
{
|
|
|
|
zvol_task_t *task;
|
|
|
|
taskqid_t id;
|
|
|
|
|
|
|
|
task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
|
|
|
|
if (task == NULL)
|
|
|
|
return;
|
2016-02-16 22:52:55 +03:00
|
|
|
|
2014-03-22 13:07:14 +04:00
|
|
|
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
|
2016-10-29 01:40:14 +03:00
|
|
|
if ((async == B_FALSE) && (id != TASKQID_INVALID))
|
2014-03-22 13:07:14 +04:00
|
|
|
taskq_wait_id(spa->spa_zvol_taskq, id);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
|
|
|
|
boolean_t async)
|
|
|
|
{
|
|
|
|
zvol_task_t *task;
|
|
|
|
taskqid_t id;
|
|
|
|
|
|
|
|
task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
|
|
|
|
if (task == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
|
2016-10-29 01:40:14 +03:00
|
|
|
if ((async == B_FALSE) && (id != TASKQID_INVALID))
|
2014-03-22 13:07:14 +04:00
|
|
|
taskq_wait_id(spa->spa_zvol_taskq, id);
|
2013-02-14 03:11:59 +04:00
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
int
|
|
|
|
zvol_init(void)
|
|
|
|
{
|
2017-02-23 03:08:04 +03:00
|
|
|
int threads = MIN(MAX(zvol_threads, 1), 1024);
|
2016-12-01 00:56:50 +03:00
|
|
|
int i, error;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-07-02 22:59:10 +04:00
|
|
|
list_create(&zvol_state_list, sizeof (zvol_state_t),
|
2013-12-13 01:04:40 +04:00
|
|
|
offsetof(zvol_state_t, zv_next));
|
2013-07-02 22:59:10 +04:00
|
|
|
mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
2017-02-08 20:27:48 +03:00
|
|
|
ida_init(&zvol_ida);
|
2013-07-02 22:59:10 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
|
|
|
|
threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
|
|
|
|
if (zvol_taskq == NULL) {
|
|
|
|
printk(KERN_INFO "ZFS: taskq_create() failed\n");
|
|
|
|
error = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
|
|
|
|
KM_SLEEP);
|
|
|
|
if (!zvol_htable) {
|
2017-02-23 03:08:04 +03:00
|
|
|
error = -ENOMEM;
|
|
|
|
goto out_taskq;
|
2016-12-01 00:56:50 +03:00
|
|
|
}
|
|
|
|
for (i = 0; i < ZVOL_HT_SIZE; i++)
|
|
|
|
INIT_HLIST_HEAD(&zvol_htable[i]);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
error = register_blkdev(zvol_major, ZVOL_DRIVER);
|
|
|
|
if (error) {
|
|
|
|
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
|
2016-12-01 00:56:50 +03:00
|
|
|
goto out_free;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
|
2013-12-13 01:04:40 +04:00
|
|
|
THIS_MODULE, zvol_probe, NULL, NULL);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
return (0);
|
2013-07-02 22:59:10 +04:00
|
|
|
|
2016-12-01 00:56:50 +03:00
|
|
|
out_free:
|
|
|
|
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
|
2017-02-23 03:08:04 +03:00
|
|
|
out_taskq:
|
|
|
|
taskq_destroy(zvol_taskq);
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
out:
|
2017-05-20 21:01:55 +03:00
|
|
|
ida_destroy(&zvol_ida);
|
2013-07-02 22:59:10 +04:00
|
|
|
mutex_destroy(&zvol_state_lock);
|
|
|
|
list_destroy(&zvol_state_list);
|
|
|
|
|
2013-12-13 01:04:40 +04:00
|
|
|
return (SET_ERROR(error));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zvol_fini(void)
|
|
|
|
{
|
2014-03-22 13:07:14 +04:00
|
|
|
zvol_remove_minors_impl(NULL);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
|
|
|
|
unregister_blkdev(zvol_major, ZVOL_DRIVER);
|
2016-12-01 00:56:50 +03:00
|
|
|
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
|
2014-03-22 13:07:14 +04:00
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
taskq_destroy(zvol_taskq);
|
2010-08-26 22:45:02 +04:00
|
|
|
list_destroy(&zvol_state_list);
|
2014-03-22 13:07:14 +04:00
|
|
|
mutex_destroy(&zvol_state_lock);
|
2016-12-14 20:41:39 +03:00
|
|
|
|
|
|
|
ida_destroy(&zvol_ida);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2012-06-02 05:49:10 +04:00
|
|
|
module_param(zvol_inhibit_dev, uint, 0644);
|
|
|
|
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
|
|
|
|
|
2011-12-07 21:23:44 +04:00
|
|
|
module_param(zvol_major, uint, 0444);
|
2010-08-26 22:45:02 +04:00
|
|
|
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
|
|
|
|
|
2017-02-23 03:08:04 +03:00
|
|
|
module_param(zvol_threads, uint, 0444);
|
|
|
|
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
|
|
|
|
|
|
|
|
module_param(zvol_request_sync, uint, 0644);
|
|
|
|
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
|
|
|
|
|
Limit the number of blocks to discard at once.
The number of blocks that can be discarded in one BLKDISCARD ioctl on a
zvol is currently unlimited. Some applications, such as mkfs, discard
the whole volume at once and they use the maximum possible discard size
to do that. As a result, several gigabytes discard requests are not
uncommon.
Unfortunately, if a large amount of data is allocated in the zvol, ZFS
can be quite slow to process discard requests. This is especially true
if the volblocksize is low (e.g. the 8K default). As a result, very
large discard requests can take a very long time (seconds to minutes
under heavy load) to complete. This can cause a number of problems, most
notably if the zvol is accessed remotely (e.g. via iSCSI), in which case
the client has a high probability of timing out on the request.
This patch solves the issue by adding a new tunable module parameter:
zvol_max_discard_blocks. This indicates the maximum possible range, in
zvol blocks, of one discard operation. It is set by default to 16384
blocks, which appears to be a good tradeoff. Using the default
volblocksize of 8K this is equivalent to 128 MB. When using the maximum
volblocksize of 128K this is equivalent to 2 GB.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #858
2012-07-31 12:45:37 +04:00
|
|
|
module_param(zvol_max_discard_blocks, ulong, 0444);
|
2013-12-13 01:04:40 +04:00
|
|
|
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
|
2015-08-18 23:51:20 +03:00
|
|
|
|
|
|
|
module_param(zvol_prefetch_bytes, uint, 0644);
|
|
|
|
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
|
2017-07-12 23:05:37 +03:00
|
|
|
|
|
|
|
module_param(zvol_volmode, uint, 0644);
|
|
|
|
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|