// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2024, 2025, Klara, Inc.
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot. No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when exposed as plain character devices
 * (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include "zfs_namecheck.h"

#define ZVOL_DUMPSIZE "dumpsize"

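/*
 * With ZVOL_LOCK_DEBUG defined, reader acquisitions of zv_suspend_lock are
 * promoted to writer acquisitions, so lock-ordering and recursion mistakes
 * fail loudly in debug builds.
 */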
#ifdef ZVOL_LOCK_DEBUG
#define ZVOL_RW_READER RW_WRITER
#define ZVOL_RW_READ_HELD RW_WRITE_HELD
#else
#define ZVOL_RW_READER RW_READER
#define ZVOL_RW_READ_HELD RW_READ_HELD
#endif

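/*
 * OS-private per-volume state. Exactly one arm of the union is valid,
 * selected by the volmode property (dev or geom). zso_dying is set once
 * minor removal has begun; open paths check it and fail with ENXIO, and
 * zvol_wait_close() sleeps until the last opener drains.
 */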
struct zvol_state_os {
#define zso_dev _zso_state._zso_dev
#define zso_geom _zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");

static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
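
/*
 * zvol_strategy_impl() splits each request into chunks of at most
 * zvol_maxphys bytes, one DMU transaction per chunk; half of DMU_MAX_ACCESS
 * keeps each transaction comfortably under the DMU's per-access limit.
 */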

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_cdev_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name = "zvol",
	.d_version = D_VERSION,
	.d_flags = D_DISK | D_TRACKCLOSE,
	.d_open = zvol_cdev_open,
	.d_close = zvol_cdev_close,
	.d_ioctl = zvol_cdev_ioctl,
	.d_read = zvol_cdev_read,
	.d_write = zvol_cdev_write,
	.d_strategy = zvol_cdev_bio_strategy,
	.d_kqfilter = zvol_cdev_kqfilter,
};
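
/*
 * D_TRACKCLOSE asks the kernel to call d_close for every close(2) rather
 * than only the last one, which keeps zv_open_count balanced with opens.
 */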

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);

/*
 * GEOM mode implementation
 */

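/*
 * Informal summary of the open/close protocol below: pp->private is read
 * atomically and may go NULL whenever no lock is held, so it is re-read
 * after any blocking acquisition; zv_suspend_lock (reader) is held across
 * first open and last close; and zv_suspend_lock is always taken before
 * zv_state_lock.
 */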
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV. In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev. Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	zv = atomic_load_ptr(&pp->private);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);

			/*
			 * Removal may happen while the locks are down, so
			 * we can't trust zv any longer; we have to start over.
			 */
			zv = atomic_load_ptr(&pp->private);
			if (zv == NULL)
				return (SET_ERROR(ENXIO));

			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			if (zv->zv_zso->zso_dying ||
			    zv->zv_flags & ZVOL_REMOVING) {
				err = SET_ERROR(ENXIO);
				goto out_locked;
			}

			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	zv = atomic_load_ptr(&pp->private);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			/*
			 * Unlike in zvol_geom_open(), we don't check if
			 * removal started here, because we might be one of the
			 * openers that needs to be thrown out! If we're the
			 * last, we need to call zvol_last_close() below to
			 * finish cleanup. So, no special treatment for us.
			 */

			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

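/*
 * Mark the volume dying and give the last opener up to 10 seconds (10*hz)
 * to drain; zvol_geom_close() does a wakeup(zv) when zv_open_count reaches
 * zero.
 */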
void
zvol_wait_close(zvol_state_t *zv)
{
	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

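/*
 * GEOM delivers reference-count deltas in acr/acw/ace; a positive sum is
 * mapped onto an open with that many references, a negative sum onto a
 * close.
 */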
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (atomic_load_ptr(&pp->private) == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
	    THREAD_CAN_SLEEP());
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

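/*
 * kqueue support: only EVFILT_VNODE with NOTE_ATTRIB is accepted, which
 * lets userspace watch a zvol device node for attribute changes such as a
 * volsize update. Illustrative consumer (hypothetical device path):
 *
 *	int kq = kqueue();
 *	int fd = open("/dev/zvol/pool/vol", O_RDONLY);
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */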
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

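/*
 * Common strategy path for both GEOM and cdev I/O. The caller resolves the
 * zvol_state_t into the request; zv_suspend_lock is held (as reader) for
 * the duration so the volume cannot be suspended mid-request.
 */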
static void
|
2025-05-08 22:25:40 +03:00
|
|
|
zvol_strategy_impl(zv_request_t *zvr)
|
2020-04-14 21:36:28 +03:00
|
|
|
{
|
|
|
|
|
zvol_state_t *zv;
|
2025-05-08 22:25:40 +03:00
|
|
|
struct bio *bp;
|
2020-04-14 21:36:28 +03:00
|
|
|
uint64_t off, volsize;
|
|
|
|
|
size_t resid;
|
|
|
|
|
char *addr;
|
|
|
|
|
objset_t *os;
|
|
|
|
|
zfs_locked_range_t *lr;
|
|
|
|
|
int error = 0;
|
2020-06-03 20:45:12 +03:00
|
|
|
boolean_t doread = B_FALSE;
|
2020-04-14 21:36:28 +03:00
|
|
|
boolean_t is_dumpified;
|
2023-10-31 00:51:56 +03:00
|
|
|
boolean_t commit;
|
2020-04-14 21:36:28 +03:00
|
|
|
|
2025-05-08 22:25:40 +03:00
|
|
|
bp = zvr->bio;
|
|
|
|
|
zv = zvr->zv;
|
2020-04-14 21:36:28 +03:00
|
|
|
if (zv == NULL) {
|
|
|
|
|
error = SET_ERROR(ENXIO);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-03 20:45:12 +03:00
|
|
|
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
2020-04-14 21:36:28 +03:00
|
|
|
|
zvol: ensure device minors are properly cleaned up
Currently, if a minor is in use when we try to remove it, we'll skip it
and never come back to it again. Since the zvol state is hung off the
minor in the kernel, this can get us into weird situations if something
tries to use it after the removal fails. It's even worse at pool export,
as there's now a vestigial zvol state with no pool under it. It's
weirder again if the pool is subsequently reimported, as the zvol code
(reasonably) assumes the zvol state has been properly setup, when it's
actually left over from the previous import of the pool.
This commit attempts to tackle that by setting a flag on the zvol if its
minor can't be removed, and then checking that flag when a request is
made and rejecting it, thus stopping new work coming in.
The flag also causes a condvar to be signaled when the last client
finishes. For the case where a single minor is being removed (eg
changing volmode), it will wait for this signal before proceeding.
Meanwhile, when removing all minors, a background task is created for
each minor that couldn't be removed on the spot, and those tasks then
wake and clean up.
Since any new tasks are queued on to the pool's spa_zvol_taskq,
spa_export_common() will continue to wait at export until all minors are
removed.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #14872
Closes #16364
2024-07-18 06:24:05 +03:00
|
|
|
if (zv->zv_flags & ZVOL_REMOVING) {
|
|
|
|
|
error = SET_ERROR(ENXIO);
|
|
|
|
|
goto resume;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-14 21:36:28 +03:00
|
|
|
switch (bp->bio_cmd) {
|
|
|
|
|
case BIO_READ:
|
2020-06-03 20:45:12 +03:00
|
|
|
doread = B_TRUE;
|
|
|
|
|
break;
|
2020-04-14 21:36:28 +03:00
|
|
|
case BIO_WRITE:
|
2020-06-03 20:45:12 +03:00
|
|
|
case BIO_FLUSH:
|
2020-04-14 21:36:28 +03:00
|
|
|
case BIO_DELETE:
|
2020-06-03 20:45:12 +03:00
|
|
|
if (zv->zv_flags & ZVOL_RDONLY) {
|
|
|
|
|
error = SET_ERROR(EROFS);
|
|
|
|
|
goto resume;
|
|
|
|
|
}
|
|
|
|
|
zvol_ensure_zilog(zv);
|
|
|
|
|
if (bp->bio_cmd == BIO_FLUSH)
|
2023-10-31 00:51:56 +03:00
|
|
|
goto commit;
|
2020-04-14 21:36:28 +03:00
|
|
|
break;
|
|
|
|
|
default:
|
2020-11-03 20:21:09 +03:00
|
|
|
error = SET_ERROR(EOPNOTSUPP);
|
2020-06-03 20:45:12 +03:00
|
|
|
goto resume;
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
off = bp->bio_offset;
|
|
|
|
|
volsize = zv->zv_volsize;
|
|
|
|
|
|
|
|
|
|
os = zv->zv_objset;
|
2020-10-21 20:23:08 +03:00
|
|
|
ASSERT3P(os, !=, NULL);
|
2020-04-14 21:36:28 +03:00
|
|
|
|
|
|
|
|
addr = bp->bio_data;
|
|
|
|
|
resid = bp->bio_length;
|
|
|
|
|
|
2020-09-02 19:30:29 +03:00
|
|
|
if (resid > 0 && off >= volsize) {
|
2020-04-14 21:36:28 +03:00
|
|
|
error = SET_ERROR(EIO);
|
2020-06-03 20:45:12 +03:00
|
|
|
goto resume;
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
is_dumpified = B_FALSE;
|
2023-10-31 00:51:56 +03:00
|
|
|
commit = !doread && !is_dumpified &&
|
2020-04-14 21:36:28 +03:00
|
|
|
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* There must be no buffer changes when doing a dmu_sync() because
|
|
|
|
|
* we can't change the data whilst calculating the checksum.
|
|
|
|
|
*/
|
|
|
|
|
lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
|
|
|
|
|
doread ? RL_READER : RL_WRITER);
|
|
|
|
|
|
|
|
|
|
if (bp->bio_cmd == BIO_DELETE) {
|
|
|
|
|
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
|
2025-03-19 02:04:22 +03:00
|
|
|
error = dmu_tx_assign(tx, DMU_TX_WAIT);
|
2020-04-14 21:36:28 +03:00
|
|
|
if (error != 0) {
|
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
|
} else {
|
2023-10-31 00:51:56 +03:00
|
|
|
zvol_log_truncate(zv, tx, off, resid);
|
2020-04-14 21:36:28 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
|
|
|
|
|
off, resid);
|
|
|
|
|
resid = 0;
|
|
|
|
|
}
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, DMU_TX_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
				    tx, DMU_READ_PREFETCH);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

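	/*
	 * Report how much of the request actually completed; a request
	 * that ran past the end of the volume is an error, not a short
	 * transfer.
	 */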
	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

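	/*
	 * Force the ZIL out only when the request required synchronous
	 * semantics; the commit: label is also reached by goto from the
	 * flush handling earlier in this function.
	 */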
	if (error == 0 && commit) {
commit:
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

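/*
 * Taskq callback: service one queued request, then free its wrapper.
 */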
static void
zvol_strategy_task(void *arg)
{
	zv_request_task_t *task = arg;

	zvol_strategy_impl(&task->zvr);
	zv_request_task_free(task);
}

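/*
 * Entry point for GEOM bios. Synchronous requests (and all requests, if
 * zvol_request_sync is set) are serviced inline; the rest are spread
 * across the zvol taskqs by hashing (zvol, CPU, offset), so requests for
 * the same region tend to stay on one queue while distinct volumes and
 * CPUs fan out across all of them.
 */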
static void
zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zv_request_task_t *task;
	zvol_state_t *zv;
	uint_t tq_idx;
	uint_t taskq_hash;
	int error;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		if (bp->bio_to)
			g_io_deliver(bp, error);
		else
			biofinish(bp, NULL, error);
		return;
	}

	zv_request_t zvr = {
		.zv = zv,
		.bio = bp,
	};

	if (sync || zvol_request_sync) {
		zvol_strategy_impl(&zvr);
		return;
	}

	taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
	    ZVOL_TASKQ_OFFSET_SHIFT);
	tq_idx = taskq_hash % ztqs->tqs_cnt;
	task = zv_request_task_create(zvr);
	taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
	    0, &task->ent);
}

static void
zvol_cdev_bio_strategy(struct bio *bp)
{
	zvol_geom_bio_strategy(bp, B_FALSE);
}

/*
 * Character device mode implementation
 */

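/*
 * With volmode=dev the volume is exposed only as a character device, so
 * I/O arrives through the cdevsw read/write/ioctl methods below instead
 * of through a GEOM provider.
 */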
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
		    DMU_READ_PREFETCH);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

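/*
 * Each chunk is logged to the ZIL as it is written; the log is only
 * forced out at the end of the request, and only for synchronous writes
 * (IO_SYNC) or when the volume has sync=always.
 */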
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
		    DMU_READ_PREFETCH);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (error == 0 && commit)
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

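/*
 * Open protocol: zv_state_lock guards the open count and flags, but a
 * first open must also hold zv_suspend_lock (reader) so the volume
 * cannot be suspended under zvol_first_open(). Lock order is
 * zv_suspend_lock before zv_state_lock, hence the drop-and-retake below
 * when the trylock fails, followed by rechecking state that may have
 * changed while neither lock was held.
 */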
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	zv = atomic_load_ptr(&dev->si_drv2);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
				/* Removal started while locks were down. */
				err = SET_ERROR(ENXIO);
				goto out_locked;
			}

			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

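/*
 * Close mirrors open: the last closer takes zv_suspend_lock before
 * zv_state_lock so zvol_last_close() cannot race with a suspend. A
 * close must run to completion even if removal has started, since it
 * may be the final reference the removal path is waiting on.
 */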
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	zv = atomic_load_ptr(&dev->si_drv2);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			/*
			 * Unlike in zvol_cdev_open(), we don't check if
			 * removal started here, because we might be one of the
			 * openers that needs to be thrown out! If we're the
			 * last, we need to call zvol_last_close() below to
			 * finish cleanup. So, no special treatment for us.
			 */

			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

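/*
 * Handler for the disk(4)-style ioctls issued against the character
 * device. As a sketch of the consumer side (illustrative only; the
 * device path is hypothetical), querying the volume size from userspace
 * would look like:
 *
 *	off_t mediasize;
 *	int fd = open("/dev/zvol/pool/vol", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, DIOCGMEDIASIZE, &mediasize) == 0)
 *		printf("%jd bytes\n", (intmax_t)mediasize);
 */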
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = atomic_load_ptr(&dev->si_drv2);
	ASSERT3P(zv, !=, NULL);

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
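	/*
	 * DIOCGDELETE unmaps an offset/length pair, following the same
	 * log-truncate-then-free pattern as BIO_DELETE, with an explicit
	 * zil_commit() when sync=always is in effect.
	 */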
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
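	/*
	 * DIOCGATTR reports GEOM attributes; block counts are expressed
	 * in DEV_BSIZE units, and "GEOM::candelete" advertises delete
	 * support so upper layers will issue BIO_DELETE.
	 */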
	case DIOCGATTR: {
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		rw_exit(&zv->zv_suspend_lock);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
zvol: stop using zvol_state_lock to protect OS-side private data
zvol_state_lock is intended to protect access to the global name->zvol
lists (zvol_find_by_name()), but has also been used to control access to
OS-side private data, accessed through whatever kernel object is used to
represent the volume (gendisk, geom, etc).
This appears to have been necessary to some degree because the OS-side
object is what's used to get a handle on zvol_state_t, so zv_state_lock
and zv_suspend_lock can't be used to manage access, but also, with the
private object and the zvol_state_t being shutdown and destroyed at the
same time in zvol_os_free(), we must ensure that the private object
pointer only ever corresponds to a real zvol_state_t, not one in partial
destruction. Taking the global lock seems like a convenient way to
ensure this.
The problem with this is that zvol_state_lock does not actually protect
access to the zvol_state_t internals, so we need to take zv_state_lock
and/or zv_suspend_lock. If those are contended, this can then cause
OS-side operations (eg zvol_open()) to sleep to wait for them while hold
zvol_state_lock. This then blocks out all other OS-side operations which
want to get the private data, and any ZFS-side control operations that
would take the write half of the lock. It's even worse if ZFS-side
operations induce OS-side calls back into the zvol (eg creating a zvol
triggers a partition probe inside the kernel, and also a userspace
access from udev to set up device links). And it gets even works again
if anything decides to defer those ops to a task and wait on them, which
zvol_remove_minors_impl() will do under high load.
However, since the previous commit, we have a guarantee that the private
data pointer will always be NULL'd out in zvol_os_remove_minor()
_before_ the zvol_state_t is made invalid, but it won't happen until all
users are ejected. So, if we make access to the private object pointer
atomic, we remove the need to take a global lockout to access it, and so
we can remove all acquisitions of zvol_state_lock from the OS side.
While here, I've rewritten much of the locking theory comment at the top
of zvol.c. It wasn't wrong, but it hadn't been followed exactly, so I've
tried to describe the purpose of each lock in a little more detail, and
in particular describe where it should and shouldn't be used.
Sponsored-by: Klara, Inc.
Sponsored-by: Railway Corporation
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Fedor Uporov <fuporov.vstack@gmail.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17625
2025-08-05 07:19:24 +03:00
|
|
|
rw_exit(&zv->zv_suspend_lock);
|
2020-04-14 21:36:28 +03:00
|
|
|
*off = noff;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
default:
|
2020-11-03 20:21:09 +03:00
|
|
|
error = SET_ERROR(ENOIOCTL);
|
2020-04-14 21:36:28 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
|
}
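
/*
 * Illustrative sketch (not part of the driver): the FIOSEEKHOLE and
 * FIOSEEKDATA cases above implement the standard FreeBSD <sys/filio.h>
 * hole/data probing ioctls, so a userspace consumer could do roughly:
 *
 *	off_t off = 0;
 *	int fd = open("/dev/zvol/pool/vol", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, FIOSEEKDATA, &off) == 0)
 *		printf("first data byte at %jd\n", (intmax_t)off);
 *
 * The device path and the error handling are placeholders only.
 */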

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
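
/*
 * Note on the upgrade dance in zvol_ensure_zilog(): rw_tryupgrade() can
 * fail while other readers hold zv_suspend_lock, in which case the lock
 * is dropped and re-acquired as a writer. Another thread may have opened
 * the ZIL in that unlocked window, hence the second zv_zilog == NULL
 * check before calling zil_open().
 */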

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int error = 0;

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
		if (error == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);

	return (error);
}
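
/*
 * Renaming a GEOM-mode zvol is a destroy-and-recreate operation: the old
 * provider is withered with ENXIO, forcibly detaching any consumer still
 * attached under the old name, before the new provider is announced.
 * DEV-mode renames likewise destroy and recreate the cdev, closing out
 * any remaining opens first.
 */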

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static int
zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
    zvol_state_t **zvp)
{
	zvol_state_t *zv;
	uint64_t volmode;
	int error;

	error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
	    &volmode, NULL);
	if (error)
		return (error);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (0);

	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	zv->zv_volsize = volsize;
	zv->zv_volblocksize = volblocksize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error) {
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			kmem_free(zv, sizeof (zvol_state_t));
			return (error);
		}

		dev->si_iosize_max = maxphys;
		zsd->zsd_cdev = dev;
		knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	*zvp = zv;
	return (error);
}
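
/*
 * Note: in GEOM mode zvol_alloc() intentionally returns with the GEOM
 * topology lock still held; zvol_os_create_minor() announces the
 * provider and drops the lock once the volume is fully set up.
 */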

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_remove_minor(zvol_state_t *zv)
{
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT0(atomic_read(&zv->zv_suspend_ref));
	ASSERT(zv->zv_flags & ZVOL_REMOVING);

	struct zvol_state_os *zso = zv->zv_zso;
	zv->zv_zso = NULL;

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		atomic_store_ptr(&pp->private, NULL);
		mutex_exit(&zv->zv_state_lock);

		g_topology_lock();
		g_wither_geom(pp->geom, ENXIO);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			atomic_store_ptr(&dev->si_drv2, NULL);
		mutex_exit(&zv->zv_state_lock);

		if (dev != NULL) {
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	kmem_free(zso, sizeof (struct zvol_state_os));

	mutex_enter(&zv->zv_state_lock);
}
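
/*
 * The ordering above is deliberate: the OS-side private pointer is
 * cleared atomically while zv_state_lock is held, so open/ioctl paths
 * that load it atomically see either a valid zvol_state_t or NULL,
 * never one in partial teardown. The lock is then dropped around the
 * potentially sleeping GEOM/devfs destruction and re-acquired for the
 * caller.
 */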

void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT0P(zv->zv_zso);

	ASSERT0P(zv->zv_objset);
	ASSERT0P(zv->zv_zilog);
	ASSERT0P(zv->zv_dn);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	mutex_destroy(&zv->zv_state_lock);
	cv_destroy(&zv->zv_removing_cv);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}
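
/*
 * By the time zvol_os_free() runs, zvol_os_remove_minor() must already
 * have detached and freed the OS-side state; the ASSERT0P(zv->zv_zso)
 * above enforces that ordering.
 */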

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t hash, len;
	int error;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
	if (error || zv == NULL)
		goto out_dmu_objset_disown;

	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_objset = os;

	ASSERT0P(zv->zv_kstat.dk_kstats);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT0P(zv->zv_zilog);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
		/* geom was locked inside zvol_alloc() function */
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0 && zv) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}
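
/*
 * The ZIL handle opened in zvol_os_create_minor() exists only to replay
 * (or, with zil_replay_disable set, destroy) any outstanding log
 * records; zv_zilog is reset to NULL afterwards, and the first write
 * through the minor reopens the ZIL via zvol_ensure_zilog().
 */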

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}
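
/*
 * Illustrative sketch (assumes the zvol cdev kqfilter accepts
 * EVFILT_VNODE): in ZFS_VOLMODE_DEV mode a consumer can learn about
 * resizes by waiting for the NOTE_ATTRIB posted above, e.g.:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB,
 *	    0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);   (register)
 *	(void) kevent(kq, NULL, 0, &kev, 1, NULL);   (wait for resize)
 *
 * Descriptor setup and error handling are placeholders only.
 */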

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	/*
	 * The ro/rw mode of a ZVOL is switched by zvol_set_ro(), which
	 * toggles the ZVOL_RDONLY flag; no additional FreeBSD-specific
	 * action is required when the readonly zfs property changes.
	 */
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	/*
	 * The ZVOL size/capacity is changed by zvol_set_volsize(). This
	 * method is intentionally empty, because all of the required work
	 * is done by the platform-specific zvol_os_update_volsize().
	 */
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	return (zvol_init_impl());
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}