2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2010-08-26 22:45:02 +04:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
|
|
|
|
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
|
|
|
|
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
|
|
|
* LLNL-CODE-403049.
|
2019-05-05 02:39:10 +03:00
|
|
|
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
2017-09-05 23:41:32 +03:00
|
|
|
#include <sys/spa_impl.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/vdev_disk.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
2019-03-29 19:13:20 +03:00
|
|
|
#include <sys/vdev_trim.h>
|
2016-07-22 18:52:49 +03:00
|
|
|
#include <sys/abd.h>
|
2010-08-26 22:45:02 +04:00
|
|
|
#include <sys/fs/zfs.h>
|
|
|
|
#include <sys/zio.h>
|
2020-09-18 06:03:10 +03:00
|
|
|
#include <linux/blkpg.h>
|
2018-05-31 20:36:37 +03:00
|
|
|
#include <linux/msdos_fs.h>
|
2019-01-11 02:28:44 +03:00
|
|
|
#include <linux/vfs_compat.h>
|
2021-12-04 06:00:10 +03:00
|
|
|
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
|
|
|
|
#include <linux/blk-cgroup.h>
|
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
/*
|
|
|
|
* Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
|
|
|
|
* block_device. Since it carries the block_device inside, its convenient to
|
|
|
|
* just use the handle as a proxy. For pre-6.8, we just emulate this with
|
|
|
|
* a cast, since we don't need any of the other fields inside the handle.
|
|
|
|
*/
|
|
|
|
#ifdef HAVE_BDEV_OPEN_BY_PATH
|
|
|
|
typedef struct bdev_handle zfs_bdev_handle_t;
|
|
|
|
#define BDH_BDEV(bdh) ((bdh)->bdev)
|
|
|
|
#define BDH_IS_ERR(bdh) (IS_ERR(bdh))
|
|
|
|
#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh))
|
|
|
|
#define BDH_ERR_PTR(err) (ERR_PTR(err))
|
|
|
|
#else
|
|
|
|
typedef void zfs_bdev_handle_t;
|
|
|
|
#define BDH_BDEV(bdh) ((struct block_device *)bdh)
|
|
|
|
#define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh)))
|
|
|
|
#define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh)))
|
|
|
|
#define BDH_ERR_PTR(err) (ERR_PTR(err))
|
|
|
|
#endif
|
|
|
|
|
2020-06-16 21:43:33 +03:00
|
|
|
typedef struct vdev_disk {
|
2024-01-23 07:42:57 +03:00
|
|
|
zfs_bdev_handle_t *vd_bdh;
|
2020-06-16 21:43:33 +03:00
|
|
|
krwlock_t vd_lock;
|
|
|
|
} vdev_disk_t;
|
|
|
|
|
2019-12-09 22:09:14 +03:00
|
|
|
/*
|
|
|
|
* Unique identifier for the exclusive vdev holder.
|
|
|
|
*/
|
2013-02-27 05:02:27 +04:00
|
|
|
static void *zfs_vdev_holder = VDEV_HOLDER;
|
2011-02-08 00:54:59 +03:00
|
|
|
|
2019-12-09 22:09:14 +03:00
|
|
|
/*
|
|
|
|
* Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
|
|
|
|
* device is missing. The missing path may be transient since the links
|
|
|
|
* can be briefly removed and recreated in response to udev events.
|
|
|
|
*/
|
2022-11-04 01:02:46 +03:00
|
|
|
static uint_t zfs_vdev_open_timeout_ms = 1000;
|
2019-12-09 22:09:14 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Size of the "reserved" partition, in blocks.
|
|
|
|
*/
|
2018-05-31 20:36:37 +03:00
|
|
|
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/*
|
|
|
|
* Virtual device vector for disks.
|
|
|
|
*/
|
|
|
|
typedef struct dio_request {
|
|
|
|
zio_t *dr_zio; /* Parent ZIO */
|
2015-10-14 00:13:52 +03:00
|
|
|
atomic_t dr_ref; /* References */
|
2010-08-26 22:45:02 +04:00
|
|
|
int dr_error; /* Bio error */
|
|
|
|
int dr_bio_count; /* Count of bio's */
|
2023-01-10 23:14:59 +03:00
|
|
|
struct bio *dr_bio[]; /* Attached bio's */
|
2010-08-26 22:45:02 +04:00
|
|
|
} dio_request_t;
|
|
|
|
|
2022-11-11 00:37:12 +03:00
|
|
|
/*
|
|
|
|
* BIO request failfast mask.
|
|
|
|
*/
|
|
|
|
|
|
|
|
static unsigned int zfs_vdev_failfast_mask = 1;
|
|
|
|
|
2023-08-01 18:37:20 +03:00
|
|
|
#ifdef HAVE_BLK_MODE_T
|
|
|
|
static blk_mode_t
|
|
|
|
#else
|
2010-08-26 22:45:02 +04:00
|
|
|
static fmode_t
|
2023-08-01 18:37:20 +03:00
|
|
|
#endif
|
2023-12-21 22:22:56 +03:00
|
|
|
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2023-08-01 18:37:20 +03:00
|
|
|
#ifdef HAVE_BLK_MODE_T
|
|
|
|
blk_mode_t mode = 0;
|
|
|
|
|
|
|
|
if (spa_mode & SPA_MODE_READ)
|
|
|
|
mode |= BLK_OPEN_READ;
|
|
|
|
|
|
|
|
if (spa_mode & SPA_MODE_WRITE)
|
|
|
|
mode |= BLK_OPEN_WRITE;
|
2023-12-21 22:22:56 +03:00
|
|
|
|
|
|
|
if (exclusive)
|
|
|
|
mode |= BLK_OPEN_EXCL;
|
2023-08-01 18:37:20 +03:00
|
|
|
#else
|
2010-08-26 22:45:02 +04:00
|
|
|
fmode_t mode = 0;
|
|
|
|
|
2019-11-21 20:32:57 +03:00
|
|
|
if (spa_mode & SPA_MODE_READ)
|
2010-08-26 22:45:02 +04:00
|
|
|
mode |= FMODE_READ;
|
|
|
|
|
2019-11-21 20:32:57 +03:00
|
|
|
if (spa_mode & SPA_MODE_WRITE)
|
2010-08-26 22:45:02 +04:00
|
|
|
mode |= FMODE_WRITE;
|
2023-12-21 22:22:56 +03:00
|
|
|
|
|
|
|
if (exclusive)
|
|
|
|
mode |= FMODE_EXCL;
|
2023-08-01 18:37:20 +03:00
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
return (mode);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
/*
|
|
|
|
* Returns the usable capacity (in bytes) for the partition or disk.
|
|
|
|
*/
|
2010-08-26 22:45:02 +04:00
|
|
|
static uint64_t
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
bdev_capacity(struct block_device *bdev)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
return (i_size_read(bdev->bd_inode));
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2020-12-23 00:02:59 +03:00
|
|
|
#if !defined(HAVE_BDEV_WHOLE)
|
|
|
|
static inline struct block_device *
|
|
|
|
bdev_whole(struct block_device *bdev)
|
|
|
|
{
|
|
|
|
return (bdev->bd_contains);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-08-03 21:35:47 +03:00
|
|
|
#if defined(HAVE_BDEVNAME)
|
|
|
|
#define vdev_bdevname(bdev, name) bdevname(bdev, name)
|
|
|
|
#else
|
|
|
|
static inline void
|
|
|
|
vdev_bdevname(struct block_device *bdev, char *name)
|
|
|
|
{
|
|
|
|
snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
/*
|
|
|
|
* Returns the maximum expansion capacity of the block device (in bytes).
|
|
|
|
*
|
|
|
|
* It is possible to expand a vdev when it has been created as a wholedisk
|
|
|
|
* and the containing block device has increased in capacity. Or when the
|
|
|
|
* partition containing the pool has been manually increased in size.
|
|
|
|
*
|
|
|
|
* This function is only responsible for calculating the potential expansion
|
|
|
|
* size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
|
|
|
|
* responsible for verifying the expected partition layout in the wholedisk
|
|
|
|
* case, and updating the partition table if appropriate. Once the partition
|
|
|
|
* size has been increased the additional capacity will be visible using
|
|
|
|
* bdev_capacity().
|
2019-02-23 02:36:34 +03:00
|
|
|
*
|
|
|
|
* The returned maximum expansion capacity is always expected to be larger, or
|
|
|
|
* at the very least equal, to its usable capacity to prevent overestimating
|
|
|
|
* the pool expandsize.
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
|
|
|
|
{
|
|
|
|
uint64_t psize;
|
|
|
|
int64_t available;
|
|
|
|
|
2020-12-23 00:02:59 +03:00
|
|
|
if (wholedisk && bdev != bdev_whole(bdev)) {
|
2018-05-31 20:36:37 +03:00
|
|
|
/*
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* When reporting maximum expansion capacity for a wholedisk
|
|
|
|
* deduct any capacity which is expected to be lost due to
|
|
|
|
* alignment restrictions. Over reporting this value isn't
|
|
|
|
* harmful and would only result in slightly less capacity
|
|
|
|
* than expected post expansion.
|
2019-02-23 02:36:34 +03:00
|
|
|
* The estimated available space may be slightly smaller than
|
|
|
|
* bdev_capacity() for devices where the number of sectors is
|
|
|
|
* not a multiple of the alignment size and the partition layout
|
|
|
|
* is keeping less than PARTITION_END_ALIGNMENT bytes after the
|
|
|
|
* "reserved" EFI partition: in such cases return the device
|
|
|
|
* usable capacity.
|
2018-05-31 20:36:37 +03:00
|
|
|
*/
|
2020-12-23 00:02:59 +03:00
|
|
|
available = i_size_read(bdev_whole(bdev)->bd_inode) -
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
|
|
|
|
PARTITION_END_ALIGNMENT) << SECTOR_BITS);
|
2019-02-23 02:36:34 +03:00
|
|
|
psize = MAX(available, bdev_capacity(bdev));
|
2018-05-31 20:36:37 +03:00
|
|
|
} else {
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
psize = bdev_capacity(bdev);
|
2018-05-31 20:36:37 +03:00
|
|
|
}
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
|
|
|
|
return (psize);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2010-09-28 02:30:14 +04:00
|
|
|
static void
|
|
|
|
vdev_disk_error(zio_t *zio)
|
|
|
|
{
|
2018-11-28 22:29:57 +03:00
|
|
|
/*
|
|
|
|
* This function can be called in interrupt context, for instance while
|
|
|
|
* handling IRQs coming from a misbehaving disk device; use printk()
|
|
|
|
* which is safe from any context.
|
|
|
|
*/
|
|
|
|
printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
|
2022-10-27 19:54:54 +03:00
|
|
|
"offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
|
2018-11-28 22:29:57 +03:00
|
|
|
zio->io_vd->vdev_path, zio->io_error, zio->io_type,
|
|
|
|
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
|
|
|
|
zio->io_flags);
|
2010-09-28 02:30:14 +04:00
|
|
|
}
|
|
|
|
|
2022-09-28 19:48:46 +03:00
|
|
|
static void
|
|
|
|
vdev_disk_kobj_evt_post(vdev_t *v)
|
|
|
|
{
|
|
|
|
vdev_disk_t *vd = v->vdev_tsd;
|
2024-01-23 07:42:57 +03:00
|
|
|
if (vd && vd->vd_bdh) {
|
|
|
|
spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
|
2022-09-28 19:48:46 +03:00
|
|
|
} else {
|
|
|
|
vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
|
|
|
|
v->vdev_path);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
static zfs_bdev_handle_t *
|
|
|
|
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder)
|
2023-08-01 18:37:20 +03:00
|
|
|
{
|
2024-01-23 07:42:57 +03:00
|
|
|
#if defined(HAVE_BDEV_OPEN_BY_PATH)
|
|
|
|
return (bdev_open_by_path(path,
|
|
|
|
vdev_bdev_mode(mode, B_TRUE), holder, NULL));
|
|
|
|
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
|
2023-08-01 18:37:20 +03:00
|
|
|
return (blkdev_get_by_path(path,
|
2024-01-23 07:42:57 +03:00
|
|
|
vdev_bdev_mode(mode, B_TRUE), holder, NULL));
|
2023-08-01 18:37:20 +03:00
|
|
|
#else
|
|
|
|
return (blkdev_get_by_path(path,
|
2023-12-21 22:22:56 +03:00
|
|
|
vdev_bdev_mode(mode, B_TRUE), holder));
|
2023-08-01 18:37:20 +03:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2024-01-23 07:42:57 +03:00
|
|
|
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder)
|
2023-08-01 18:37:20 +03:00
|
|
|
{
|
2024-01-23 07:42:57 +03:00
|
|
|
#if defined(HAVE_BDEV_RELEASE)
|
|
|
|
return (bdev_release(bdh));
|
|
|
|
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
|
|
|
|
return (blkdev_put(BDH_BDEV(bdh), holder));
|
2023-08-01 18:37:20 +03:00
|
|
|
#else
|
2024-01-23 07:42:57 +03:00
|
|
|
return (blkdev_put(BDH_BDEV(bdh),
|
|
|
|
vdev_bdev_mode(mode, B_TRUE)));
|
2023-08-01 18:37:20 +03:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2012-01-24 06:43:32 +04:00
|
|
|
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
|
2020-08-21 22:53:17 +03:00
|
|
|
uint64_t *logical_ashift, uint64_t *physical_ashift)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2024-01-23 07:42:57 +03:00
|
|
|
zfs_bdev_handle_t *bdh;
|
2023-08-01 18:37:20 +03:00
|
|
|
#ifdef HAVE_BLK_MODE_T
|
2023-12-21 22:22:56 +03:00
|
|
|
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
|
2023-08-01 18:37:20 +03:00
|
|
|
#else
|
2023-12-21 22:22:56 +03:00
|
|
|
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
|
2023-08-01 18:37:20 +03:00
|
|
|
#endif
|
2019-12-09 22:09:14 +03:00
|
|
|
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_t *vd;
|
|
|
|
|
|
|
|
/* Must have a pathname and it must be absolute. */
|
|
|
|
if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
|
|
|
|
v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
vdev_dbgmsg(v, "invalid vdev_path");
|
2016-04-19 21:19:12 +03:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2013-02-26 23:25:55 +04:00
|
|
|
/*
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* Reopen the device if it is currently open. When expanding a
|
2020-09-18 06:03:10 +03:00
|
|
|
* partition force re-scanning the partition table if userland
|
|
|
|
* did not take care of this already. We need to do this while closed
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* in order to get an accurate updated block device size. Then
|
|
|
|
* since udev may need to recreate the device links increase the
|
2019-12-09 22:09:14 +03:00
|
|
|
* open retry timeout before reporting the device as unavailable.
|
2013-02-26 23:25:55 +04:00
|
|
|
*/
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
vd = v->vdev_tsd;
|
|
|
|
if (vd) {
|
|
|
|
char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
|
|
|
|
boolean_t reread_part = B_FALSE;
|
2013-02-26 23:25:55 +04:00
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_enter(&vd->vd_lock, RW_WRITER);
|
2024-01-23 07:42:57 +03:00
|
|
|
bdh = vd->vd_bdh;
|
|
|
|
vd->vd_bdh = NULL;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
if (bdh) {
|
|
|
|
struct block_device *bdev = BDH_BDEV(bdh);
|
2020-12-23 00:02:59 +03:00
|
|
|
if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
|
2022-08-03 21:35:47 +03:00
|
|
|
vdev_bdevname(bdev_whole(bdev), disk_name + 5);
|
2020-09-18 06:03:10 +03:00
|
|
|
/*
|
|
|
|
* If userland has BLKPG_RESIZE_PARTITION,
|
|
|
|
* then it should have updated the partition
|
|
|
|
* table already. We can detect this by
|
|
|
|
* comparing our current physical size
|
|
|
|
* with that of the device. If they are
|
|
|
|
* the same, then we must not have
|
|
|
|
* BLKPG_RESIZE_PARTITION or it failed to
|
|
|
|
* update the partition table online. We
|
|
|
|
* fallback to rescanning the partition
|
|
|
|
* table from the kernel below. However,
|
|
|
|
* if the capacity already reflects the
|
|
|
|
* updated partition, then we skip
|
|
|
|
* rescanning the partition table here.
|
|
|
|
*/
|
|
|
|
if (v->vdev_psize == bdev_capacity(bdev))
|
|
|
|
reread_part = B_TRUE;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
}
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (reread_part) {
|
2024-01-23 07:42:57 +03:00
|
|
|
bdh = vdev_blkdev_get_by_path(disk_name, mode,
|
|
|
|
zfs_vdev_holder);
|
|
|
|
if (!BDH_IS_ERR(bdh)) {
|
|
|
|
int error =
|
|
|
|
vdev_bdev_reread_part(BDH_BDEV(bdh));
|
|
|
|
vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
|
2019-12-09 22:09:14 +03:00
|
|
|
if (error == 0) {
|
|
|
|
timeout = MSEC2NSEC(
|
|
|
|
zfs_vdev_open_timeout_ms * 2);
|
|
|
|
}
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
|
|
|
|
|
|
|
|
rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
|
|
|
|
rw_enter(&vd->vd_lock, RW_WRITER);
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Devices are always opened by the path provided at configuration
|
|
|
|
* time. This means that if the provided path is a udev by-id path
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* then drives may be re-cabled without an issue. If the provided
|
2013-03-30 06:27:50 +04:00
|
|
|
* path is a udev by-path path, then the physical location information
|
2010-08-26 22:45:02 +04:00
|
|
|
* will be preserved. This can be critical for more complicated
|
|
|
|
* configurations where drives are located in specific physical
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* locations to maximize the systems tolerance to component failure.
|
|
|
|
*
|
2013-03-30 06:27:50 +04:00
|
|
|
* Alternatively, you can provide your own udev rule to flexibly map
|
2010-08-26 22:45:02 +04:00
|
|
|
* the drives as you see fit. It is not advised that you use the
|
2013-03-30 06:27:50 +04:00
|
|
|
* /dev/[hd]d devices which may be reordered due to probing order.
|
2010-08-26 22:45:02 +04:00
|
|
|
* Devices in the wrong locations will be detected by the higher
|
|
|
|
* level vdev validation.
|
2016-04-19 21:19:12 +03:00
|
|
|
*
|
|
|
|
* The specified paths may be briefly removed and recreated in
|
|
|
|
* response to udev events. This should be exceptionally unlikely
|
|
|
|
* because the zpool command makes every effort to verify these paths
|
|
|
|
* have already settled prior to reaching this point. Therefore,
|
|
|
|
* a ENOENT failure at this point is highly likely to be transient
|
|
|
|
* and it is reasonable to sleep and retry before giving up. In
|
|
|
|
* practice delays have been observed to be on the order of 100ms.
|
2021-12-02 03:07:12 +03:00
|
|
|
*
|
|
|
|
* When ERESTARTSYS is returned it indicates the block device is
|
|
|
|
* a zvol which could not be opened due to the deadlock detection
|
|
|
|
* logic in zvol_open(). Extend the timeout and retry the open
|
|
|
|
* subsequent attempts are expected to eventually succeed.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
2019-12-09 22:09:14 +03:00
|
|
|
hrtime_t start = gethrtime();
|
2024-01-23 07:42:57 +03:00
|
|
|
bdh = BDH_ERR_PTR(-ENXIO);
|
|
|
|
while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
|
|
|
|
bdh = vdev_blkdev_get_by_path(v->vdev_path, mode,
|
|
|
|
zfs_vdev_holder);
|
|
|
|
if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
|
2022-09-28 19:48:46 +03:00
|
|
|
/*
|
|
|
|
* There is no point of waiting since device is removed
|
|
|
|
* explicitly
|
|
|
|
*/
|
|
|
|
if (v->vdev_removed)
|
|
|
|
break;
|
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
schedule_timeout(MSEC_TO_TICK(10));
|
2024-01-23 07:42:57 +03:00
|
|
|
} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
|
2021-12-02 03:07:12 +03:00
|
|
|
timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
|
|
|
|
continue;
|
2024-01-23 07:42:57 +03:00
|
|
|
} else if (BDH_IS_ERR(bdh)) {
|
2016-04-19 21:19:12 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
if (BDH_IS_ERR(bdh)) {
|
|
|
|
int error = -BDH_PTR_ERR(bdh);
|
2019-12-09 22:09:14 +03:00
|
|
|
vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
|
|
|
|
(u_longlong_t)(gethrtime() - start),
|
|
|
|
(u_longlong_t)timeout);
|
2024-01-23 07:42:57 +03:00
|
|
|
vd->vd_bdh = NULL;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
v->vdev_tsd = vd;
|
|
|
|
rw_exit(&vd->vd_lock);
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
} else {
|
2024-01-23 07:42:57 +03:00
|
|
|
vd->vd_bdh = bdh;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
v->vdev_tsd = vd;
|
|
|
|
rw_exit(&vd->vd_lock);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
|
|
|
|
2013-02-26 23:25:55 +04:00
|
|
|
/* Determine the physical block size */
|
2024-01-23 07:42:57 +03:00
|
|
|
int physical_block_size = bdev_physical_block_size(bdev);
|
2020-08-21 22:53:17 +03:00
|
|
|
|
|
|
|
/* Determine the logical block size */
|
2024-01-23 07:42:57 +03:00
|
|
|
int logical_block_size = bdev_logical_block_size(bdev);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
|
|
|
|
v->vdev_nowritecache = B_FALSE;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/* Set when device reports it supports TRIM. */
|
2024-01-23 07:42:57 +03:00
|
|
|
v->vdev_has_trim = bdev_discard_supported(bdev);
|
2019-03-29 19:13:20 +03:00
|
|
|
|
|
|
|
/* Set when device reports it supports secure TRIM. */
|
2024-01-23 07:42:57 +03:00
|
|
|
v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
|
2019-03-29 19:13:20 +03:00
|
|
|
|
2015-08-29 19:01:07 +03:00
|
|
|
/* Inform the ZIO pipeline that we are non-rotational */
|
2024-01-23 07:42:57 +03:00
|
|
|
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
|
2015-08-29 19:01:07 +03:00
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
/* Physical volume size in bytes for the partition */
|
2024-01-23 07:42:57 +03:00
|
|
|
*psize = bdev_capacity(bdev);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
|
|
|
|
/* Physical volume size in bytes including possible expansion space */
|
2024-01-23 07:42:57 +03:00
|
|
|
*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);
|
2012-01-24 06:43:32 +04:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/* Based on the minimum sector size set the block size */
|
2020-08-21 22:53:17 +03:00
|
|
|
*physical_ashift = highbit64(MAX(physical_block_size,
|
|
|
|
SPA_MINBLOCKSIZE)) - 1;
|
|
|
|
|
|
|
|
*logical_ashift = highbit64(MAX(logical_block_size,
|
|
|
|
SPA_MINBLOCKSIZE)) - 1;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_close(vdev_t *v)
|
|
|
|
{
|
|
|
|
vdev_disk_t *vd = v->vdev_tsd;
|
|
|
|
|
2013-02-26 23:25:55 +04:00
|
|
|
if (v->vdev_reopening || vd == NULL)
|
2010-08-26 22:45:02 +04:00
|
|
|
return;
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
if (vd->vd_bdh != NULL) {
|
|
|
|
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
2023-08-01 18:37:20 +03:00
|
|
|
zfs_vdev_holder);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_destroy(&vd->vd_lock);
|
2013-11-01 23:26:11 +04:00
|
|
|
kmem_free(vd, sizeof (vdev_disk_t));
|
2010-08-26 22:45:02 +04:00
|
|
|
v->vdev_tsd = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static dio_request_t *
|
|
|
|
vdev_disk_dio_alloc(int bio_count)
|
|
|
|
{
|
2021-01-28 20:28:20 +03:00
|
|
|
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
2014-11-21 03:09:39 +03:00
|
|
|
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
2021-01-28 20:28:20 +03:00
|
|
|
atomic_set(&dr->dr_ref, 0);
|
|
|
|
dr->dr_bio_count = bio_count;
|
|
|
|
dr->dr_error = 0;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2021-01-28 20:28:20 +03:00
|
|
|
for (int i = 0; i < dr->dr_bio_count; i++)
|
|
|
|
dr->dr_bio[i] = NULL;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
return (dr);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_dio_free(dio_request_t *dr)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < dr->dr_bio_count; i++)
|
|
|
|
if (dr->dr_bio[i])
|
|
|
|
bio_put(dr->dr_bio[i]);
|
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
kmem_free(dr, sizeof (dio_request_t) +
|
|
|
|
sizeof (struct bio *) * dr->dr_bio_count);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_dio_get(dio_request_t *dr)
|
|
|
|
{
|
|
|
|
atomic_inc(&dr->dr_ref);
|
|
|
|
}
|
|
|
|
|
Cleanup of dead code suggested by Clang Static Analyzer (#14380)
I recently gained the ability to run Clang's static analyzer on the
linux kernel modules via a few hacks. This extended coverage to code
that was previously missed since Clang's static analyzer only looked at
code that we built in userspace. Running it against the Linux kernel
modules built from my local branch produced a total of 72 reports
against my local branch. Of those, 50 were reports of logic errors and
22 were reports of dead code. Since we already had cleaned up all of
the previous dead code reports, I felt it would be a good next step to
clean up these dead code reports. Clang did a further breakdown of the
dead code reports into:
Dead assignment 15
Dead increment 2
Dead nested assignment 5
The benefit of cleaning these up, especially in the case of dead nested
assignment, is that they can expose places where our error handling is
incorrect. A number of them were fairly straight forward. However
several were not:
In vdev_disk_physio_completion(), not only were we not using the return
value from the static function vdev_disk_dio_put(), but nothing used it,
so I changed it to return void and removed the existing (void) cast in
the other area where we call it in addition to no longer storing it to a
stack value.
In FSE_createDTable(), the function is dead code. Its helper function
FSE_freeDTable() is also dead code, as are the CPP definitions in
`module/zstd/include/zstd_compat_wrapper.h`. We just delete it all.
In zfs_zevent_wait(), we have an optimization opportunity. cv_wait_sig()
returns 0 if there are waiting signals and 1 if there are none. The
Linux SPL version literally returns `signal_pending(current) ? 0 : 1)`
and FreeBSD implements the same semantics, we can just do
`!cv_wait_sig()` in place of `signal_pending(current)` to avoid
unnecessarily calling it again.
zfs_setattr() on FreeBSD version did not have error handling issue
because the code was removed entirely from FreeBSD version. The error is
from updating the attribute directory's files. After some thought, I
decided to propapage errors on it to userspace.
In zfs_secpolicy_tmp_snapshot(), we ignore a lack of permission from the
first check in favor of checking three other permissions. I assume this
is intentional.
In zfs_create_fs(), the return value of zap_update() was not checked
despite setting an important version number. I see no backward
compatibility reason to permit failures, so we add an assertion to catch
failures. Interestingly, Linux is still using ASSERT(error == 0) from
OpenSolaris while FreeBSD has switched to the improved ASSERT0(error)
from illumos, although illumos has yet to adopt it here. ASSERT(error ==
0) was used on Linux while ASSERT0(error) was used on FreeBSD since the
entire file needs conversion and that should be the subject of
another patch.
dnode_move()'s issue was caused by us not having implemented
POINTER_IS_VALID() on Linux. We have a stub in
`include/os/linux/spl/sys/kmem_cache.h` for it, when it really should be
in `include/os/linux/spl/sys/kmem.h` to be consistent with
Illumos/OpenSolaris. FreeBSD put both `POINTER_IS_VALID()` and
`POINTER_INVALIDATE()` in `include/os/freebsd/spl/sys/kmem.h`, so we
copy what it did.
Whenever a report was in platform-specific code, I checked the FreeBSD
version to see if it also applied to FreeBSD, but it was only relevant a
few times.
Lastly, the patch that enabled Clang's static analyzer to be run on the
Linux kernel modules needs more work before it can be put into a PR. I
plan to do that in the future as part of the on-going static analysis
work that I am doing.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #14380
2023-01-17 20:57:12 +03:00
|
|
|
static void
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_dio_put(dio_request_t *dr)
|
|
|
|
{
|
|
|
|
int rc = atomic_dec_return(&dr->dr_ref);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free the dio_request when the last reference is dropped and
|
|
|
|
* ensure zio_interpret is called only once with the correct zio
|
|
|
|
*/
|
|
|
|
if (rc == 0) {
|
|
|
|
zio_t *zio = dr->dr_zio;
|
|
|
|
int error = dr->dr_error;
|
|
|
|
|
|
|
|
vdev_disk_dio_free(dr);
|
|
|
|
|
|
|
|
if (zio) {
|
|
|
|
zio->io_error = error;
|
2010-09-28 02:30:14 +04:00
|
|
|
ASSERT3S(zio->io_error, >=, 0);
|
|
|
|
if (zio->io_error)
|
|
|
|
vdev_disk_error(zio);
|
2016-07-22 18:52:49 +03:00
|
|
|
|
2016-05-23 20:41:29 +03:00
|
|
|
zio_delay_interrupt(zio);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-23 18:55:15 +03:00
|
|
|
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
dio_request_t *dr = bio->bi_private;
|
|
|
|
|
2015-09-23 18:55:15 +03:00
|
|
|
if (dr->dr_error == 0) {
|
|
|
|
#ifdef HAVE_1ARG_BIO_END_IO_T
|
2017-07-24 05:37:12 +03:00
|
|
|
dr->dr_error = BIO_END_IO_ERROR(bio);
|
2015-09-23 18:55:15 +03:00
|
|
|
#else
|
|
|
|
if (error)
|
|
|
|
dr->dr_error = -(error);
|
|
|
|
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
|
|
dr->dr_error = EIO;
|
|
|
|
#endif
|
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-08-31 09:26:43 +03:00
|
|
|
/* Drop reference acquired by __vdev_disk_physio */
|
Cleanup of dead code suggested by Clang Static Analyzer (#14380)
I recently gained the ability to run Clang's static analyzer on the
linux kernel modules via a few hacks. This extended coverage to code
that was previously missed since Clang's static analyzer only looked at
code that we built in userspace. Running it against the Linux kernel
modules built from my local branch produced a total of 72 reports
against my local branch. Of those, 50 were reports of logic errors and
22 were reports of dead code. Since we already had cleaned up all of
the previous dead code reports, I felt it would be a good next step to
clean up these dead code reports. Clang did a further breakdown of the
dead code reports into:
Dead assignment 15
Dead increment 2
Dead nested assignment 5
The benefit of cleaning these up, especially in the case of dead nested
assignment, is that they can expose places where our error handling is
incorrect. A number of them were fairly straight forward. However
several were not:
In vdev_disk_physio_completion(), not only were we not using the return
value from the static function vdev_disk_dio_put(), but nothing used it,
so I changed it to return void and removed the existing (void) cast in
the other area where we call it in addition to no longer storing it to a
stack value.
In FSE_createDTable(), the function is dead code. Its helper function
FSE_freeDTable() is also dead code, as are the CPP definitions in
`module/zstd/include/zstd_compat_wrapper.h`. We just delete it all.
In zfs_zevent_wait(), we have an optimization opportunity. cv_wait_sig()
returns 0 if there are waiting signals and 1 if there are none. The
Linux SPL version literally returns `signal_pending(current) ? 0 : 1)`
and FreeBSD implements the same semantics, we can just do
`!cv_wait_sig()` in place of `signal_pending(current)` to avoid
unnecessarily calling it again.
zfs_setattr() on FreeBSD version did not have error handling issue
because the code was removed entirely from FreeBSD version. The error is
from updating the attribute directory's files. After some thought, I
decided to propapage errors on it to userspace.
In zfs_secpolicy_tmp_snapshot(), we ignore a lack of permission from the
first check in favor of checking three other permissions. I assume this
is intentional.
In zfs_create_fs(), the return value of zap_update() was not checked
despite setting an important version number. I see no backward
compatibility reason to permit failures, so we add an assertion to catch
failures. Interestingly, Linux is still using ASSERT(error == 0) from
OpenSolaris while FreeBSD has switched to the improved ASSERT0(error)
from illumos, although illumos has yet to adopt it here. ASSERT(error ==
0) was used on Linux while ASSERT0(error) was used on FreeBSD since the
entire file needs conversion and that should be the subject of
another patch.
dnode_move()'s issue was caused by us not having implemented
POINTER_IS_VALID() on Linux. We have a stub in
`include/os/linux/spl/sys/kmem_cache.h` for it, when it really should be
in `include/os/linux/spl/sys/kmem.h` to be consistent with
Illumos/OpenSolaris. FreeBSD put both `POINTER_IS_VALID()` and
`POINTER_INVALIDATE()` in `include/os/freebsd/spl/sys/kmem.h`, so we
copy what it did.
Whenever a report was in platform-specific code, I checked the FreeBSD
version to see if it also applied to FreeBSD, but it was only relevant a
few times.
Lastly, the patch that enabled Clang's static analyzer to be run on the
Linux kernel modules needs more work before it can be put into a PR. I
plan to do that in the future as part of the on-going static analysis
work that I am doing.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #14380
2023-01-17 20:57:12 +03:00
|
|
|
vdev_disk_dio_put(dr);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2016-07-27 05:23:53 +03:00
|
|
|
static inline void
|
2016-07-27 20:55:32 +03:00
|
|
|
vdev_submit_bio_impl(struct bio *bio)
|
2016-07-27 05:23:53 +03:00
|
|
|
{
|
|
|
|
#ifdef HAVE_1ARG_SUBMIT_BIO
|
2021-11-06 03:17:03 +03:00
|
|
|
(void) submit_bio(bio);
|
2016-07-27 05:23:53 +03:00
|
|
|
#else
|
2022-01-26 00:12:49 +03:00
|
|
|
(void) submit_bio(bio_data_dir(bio), bio);
|
2016-07-27 05:23:53 +03:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2020-09-25 23:28:35 +03:00
|
|
|
/*
|
|
|
|
* preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
|
|
|
|
* replace it with preempt_schedule under the following condition:
|
|
|
|
*/
|
|
|
|
#if defined(CONFIG_ARM64) && \
|
|
|
|
defined(CONFIG_PREEMPTION) && \
|
|
|
|
defined(CONFIG_BLK_CGROUP)
|
|
|
|
#define preempt_schedule_notrace(x) preempt_schedule(x)
|
|
|
|
#endif
|
|
|
|
|
2022-05-27 23:28:51 +03:00
|
|
|
/*
|
|
|
|
* As for the Linux 5.18 kernel bio_alloc() expects a block_device struct
|
|
|
|
* as an argument removing the need to set it with bio_set_dev(). This
|
|
|
|
* removes the need for all of the following compatibility code.
|
|
|
|
*/
|
|
|
|
#if !defined(HAVE_BIO_ALLOC_4ARG)
|
|
|
|
|
2019-01-16 21:39:19 +03:00
|
|
|
#ifdef HAVE_BIO_SET_DEV
|
|
|
|
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
|
2020-02-28 19:58:39 +03:00
|
|
|
/*
|
|
|
|
* The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
|
|
|
|
* blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
|
|
|
|
* As a side effect the function was converted to GPL-only. Define our
|
|
|
|
* own version when needed which uses rcu_read_lock_sched().
|
2022-04-02 00:47:36 +03:00
|
|
|
*
|
|
|
|
* The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
|
|
|
|
* part, moving blkg_tryget into the private one. Define our own version.
|
2020-02-28 19:58:39 +03:00
|
|
|
*/
|
2022-04-02 00:47:36 +03:00
|
|
|
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
|
2020-02-28 19:58:39 +03:00
|
|
|
static inline bool
|
|
|
|
vdev_blkg_tryget(struct blkcg_gq *blkg)
|
|
|
|
{
|
|
|
|
struct percpu_ref *ref = &blkg->refcnt;
|
|
|
|
unsigned long __percpu *count;
|
|
|
|
bool rc;
|
|
|
|
|
|
|
|
rcu_read_lock_sched();
|
|
|
|
|
|
|
|
if (__ref_is_percpu(ref, &count)) {
|
|
|
|
this_cpu_inc(*count);
|
|
|
|
rc = true;
|
|
|
|
} else {
|
2020-10-18 17:36:12 +03:00
|
|
|
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
|
|
|
|
rc = atomic_long_inc_not_zero(&ref->data->count);
|
|
|
|
#else
|
2020-02-28 19:58:39 +03:00
|
|
|
rc = atomic_long_inc_not_zero(&ref->count);
|
2020-10-18 17:36:12 +03:00
|
|
|
#endif
|
2020-02-28 19:58:39 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
rcu_read_unlock_sched();
|
|
|
|
|
|
|
|
return (rc);
|
|
|
|
}
|
2022-04-02 00:47:36 +03:00
|
|
|
#else
|
2020-02-28 19:58:39 +03:00
|
|
|
#define vdev_blkg_tryget(bg) blkg_tryget(bg)
|
|
|
|
#endif
|
2021-12-04 05:45:28 +03:00
|
|
|
#ifdef HAVE_BIO_SET_DEV_MACRO
|
2019-01-16 21:39:19 +03:00
|
|
|
/*
|
|
|
|
* The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
|
|
|
|
* GPL-only bio_associate_blkg() symbol thus inadvertently converting
|
|
|
|
* the entire macro. Provide a minimal version which always assigns the
|
|
|
|
* request queue's root_blkg to the bio.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
vdev_bio_associate_blkg(struct bio *bio)
|
|
|
|
{
|
2021-02-23 05:07:51 +03:00
|
|
|
#if defined(HAVE_BIO_BDEV_DISK)
|
|
|
|
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
|
|
|
#else
|
2019-01-16 21:39:19 +03:00
|
|
|
struct request_queue *q = bio->bi_disk->queue;
|
2021-02-23 05:07:51 +03:00
|
|
|
#endif
|
2019-01-16 21:39:19 +03:00
|
|
|
|
|
|
|
ASSERT3P(q, !=, NULL);
|
|
|
|
ASSERT3P(bio->bi_blkg, ==, NULL);
|
|
|
|
|
2020-02-28 19:58:39 +03:00
|
|
|
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
|
2019-01-16 21:39:19 +03:00
|
|
|
bio->bi_blkg = q->root_blkg;
|
|
|
|
}
|
2021-12-04 05:45:28 +03:00
|
|
|
|
2019-01-16 21:39:19 +03:00
|
|
|
#define bio_associate_blkg vdev_bio_associate_blkg
|
2021-12-04 05:45:28 +03:00
|
|
|
#else
|
|
|
|
static inline void
|
|
|
|
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
|
|
|
|
{
|
|
|
|
#if defined(HAVE_BIO_BDEV_DISK)
|
|
|
|
struct request_queue *q = bdev->bd_disk->queue;
|
|
|
|
#else
|
|
|
|
struct request_queue *q = bio->bi_disk->queue;
|
|
|
|
#endif
|
|
|
|
bio_clear_flag(bio, BIO_REMAPPED);
|
|
|
|
if (bio->bi_bdev != bdev)
|
|
|
|
bio_clear_flag(bio, BIO_THROTTLED);
|
|
|
|
bio->bi_bdev = bdev;
|
|
|
|
|
|
|
|
ASSERT3P(q, !=, NULL);
|
|
|
|
ASSERT3P(bio->bi_blkg, ==, NULL);
|
|
|
|
|
|
|
|
if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
|
|
|
|
bio->bi_blkg = q->root_blkg;
|
|
|
|
}
|
|
|
|
#define bio_set_dev vdev_bio_set_dev
|
|
|
|
#endif
|
2019-01-16 21:39:19 +03:00
|
|
|
#endif
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
|
|
|
|
*/
|
2017-09-16 21:00:19 +03:00
|
|
|
static inline void
|
|
|
|
bio_set_dev(struct bio *bio, struct block_device *bdev)
|
|
|
|
{
|
|
|
|
bio->bi_bdev = bdev;
|
|
|
|
}
|
2019-01-16 21:39:19 +03:00
|
|
|
#endif /* HAVE_BIO_SET_DEV */
|
2022-05-27 23:28:51 +03:00
|
|
|
#endif /* !HAVE_BIO_ALLOC_4ARG */
|
2017-09-16 21:00:19 +03:00
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
static inline void
|
2016-07-27 20:55:32 +03:00
|
|
|
vdev_submit_bio(struct bio *bio)
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
{
|
|
|
|
struct bio_list *bio_list = current->bio_list;
|
|
|
|
current->bio_list = NULL;
|
2016-07-27 20:55:32 +03:00
|
|
|
vdev_submit_bio_impl(bio);
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
current->bio_list = bio_list;
|
|
|
|
}
|
|
|
|
|
2022-05-27 23:28:51 +03:00
|
|
|
static inline struct bio *
|
|
|
|
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
|
|
|
unsigned short nr_vecs)
|
|
|
|
{
|
|
|
|
struct bio *bio;
|
|
|
|
|
2022-03-24 17:22:53 +03:00
|
|
|
#ifdef HAVE_BIO_ALLOC_4ARG
|
2022-05-27 23:28:51 +03:00
|
|
|
bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
|
|
|
|
#else
|
|
|
|
bio = bio_alloc(gfp_mask, nr_vecs);
|
|
|
|
if (likely(bio != NULL))
|
|
|
|
bio_set_dev(bio, bdev);
|
2022-03-24 17:22:53 +03:00
|
|
|
#endif
|
|
|
|
|
2022-05-27 23:28:51 +03:00
|
|
|
return (bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int
|
|
|
|
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|
|
|
{
|
|
|
|
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
|
|
|
bio_size, abd_offset);
|
|
|
|
|
|
|
|
#ifdef HAVE_BIO_MAX_SEGS
|
|
|
|
return (bio_max_segs(nr_segs));
|
|
|
|
#else
|
|
|
|
return (MIN(nr_segs, BIO_MAX_PAGES));
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
static int
|
2016-08-31 09:26:43 +03:00
|
|
|
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|
|
|
size_t io_size, uint64_t io_offset, int rw, int flags)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
2013-11-01 23:26:11 +04:00
|
|
|
dio_request_t *dr;
|
2016-08-31 09:26:43 +03:00
|
|
|
uint64_t abd_offset;
|
2010-08-26 22:45:02 +04:00
|
|
|
uint64_t bio_offset;
|
2021-01-28 20:28:20 +03:00
|
|
|
int bio_size;
|
|
|
|
int bio_count = 16;
|
|
|
|
int error = 0;
|
2016-09-29 23:13:31 +03:00
|
|
|
struct blk_plug plug;
|
2022-05-27 23:28:51 +03:00
|
|
|
unsigned short nr_vecs;
|
2019-11-12 19:59:06 +03:00
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
/*
|
|
|
|
* Accessing outside the block device is never allowed.
|
|
|
|
*/
|
|
|
|
if (io_offset + io_size > bdev->bd_inode->i_size) {
|
|
|
|
vdev_dbgmsg(zio->io_vd,
|
|
|
|
"Illegal access %llu size %llu, device size %llu",
|
2021-06-05 14:14:12 +03:00
|
|
|
(u_longlong_t)io_offset,
|
|
|
|
(u_longlong_t)io_size,
|
|
|
|
(u_longlong_t)i_size_read(bdev->bd_inode));
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
return (SET_ERROR(EIO));
|
|
|
|
}
|
2010-11-11 00:36:18 +03:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
retry:
|
|
|
|
dr = vdev_disk_dio_alloc(bio_count);
|
|
|
|
|
2022-12-05 00:46:44 +03:00
|
|
|
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
2022-11-11 00:37:12 +03:00
|
|
|
zio->io_vd->vdev_failfast == B_TRUE) {
|
|
|
|
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
|
|
|
|
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
|
|
|
|
}
|
2010-10-01 21:57:56 +04:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
dr->dr_zio = zio;
|
|
|
|
|
|
|
|
/*
|
2021-01-28 20:28:20 +03:00
|
|
|
* Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
|
|
|
|
* is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
|
|
|
|
* can cover at least 128KB and at most 1MB. When the required number
|
|
|
|
* of iovec's exceeds this, we are forced to break the IO in multiple
|
|
|
|
* bio's and wait for them all to complete. This is likely if the
|
|
|
|
* recordsize property is increased beyond 1MB. The default
|
|
|
|
* bio_count=16 should typically accommodate the maximum-size zio of
|
|
|
|
* 16MB.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
2016-07-22 18:52:49 +03:00
|
|
|
|
2016-08-31 09:26:43 +03:00
|
|
|
abd_offset = 0;
|
|
|
|
bio_offset = io_offset;
|
2021-01-28 20:28:20 +03:00
|
|
|
bio_size = io_size;
|
|
|
|
for (int i = 0; i <= dr->dr_bio_count; i++) {
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/* Finished constructing bio's for given buffer */
|
|
|
|
if (bio_size <= 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
2021-01-28 20:28:20 +03:00
|
|
|
* If additional bio's are required, we have to retry, but
|
|
|
|
* this should be rare - see the comment above.
|
2010-08-26 22:45:02 +04:00
|
|
|
*/
|
|
|
|
if (dr->dr_bio_count == i) {
|
|
|
|
vdev_disk_dio_free(dr);
|
|
|
|
bio_count *= 2;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2022-05-27 23:28:51 +03:00
|
|
|
nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
|
|
|
|
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
2019-07-06 05:52:27 +03:00
|
|
|
if (unlikely(dr->dr_bio[i] == NULL)) {
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_dio_free(dr);
|
2017-08-03 07:16:12 +03:00
|
|
|
return (SET_ERROR(ENOMEM));
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Matching put called by vdev_disk_physio_completion */
|
|
|
|
vdev_disk_dio_get(dr);
|
|
|
|
|
2014-03-28 11:08:21 +04:00
|
|
|
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
2010-08-26 22:45:02 +04:00
|
|
|
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
|
|
|
|
dr->dr_bio[i]->bi_private = dr;
|
2016-07-27 20:55:32 +03:00
|
|
|
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/* Remaining size is returned to become the new size */
|
2020-05-21 04:06:09 +03:00
|
|
|
bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
|
2016-12-12 21:46:26 +03:00
|
|
|
bio_size, abd_offset);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
/* Advance in buffer and construct another bio if needed */
|
2016-08-31 09:26:43 +03:00
|
|
|
abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
|
2014-03-28 11:08:21 +04:00
|
|
|
bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
zvol processing should use struct bio
Internally, zvols are files exposed through the block device API. This
is intended to reduce overhead when things require block devices.
However, the ZoL zvol code emulates a traditional block device in that
it has a top half and a bottom half. This is an unnecessary source of
overhead that does not exist on any other OpenZFS platform does this.
This patch removes it. Early users of this patch reported double digit
performance gains in IOPS on zvols in the range of 50% to 80%.
Comments in the code suggest that the current implementation was done to
obtain IO merging from Linux's IO elevator. However, the DMU already
does write merging while arc_read() should implicitly merge read IOs
because only 1 thread is permitted to fetch the buffer into ARC. In
addition, commercial ZFSOnLinux distributions report that regular files
are more performant than zvols under the current implementation, and the
main consumers of zvols are VMs and iSCSI targets, which have their own
elevators to merge IOs.
Some minor refactoring allows us to register zfs_request() as our
->make_request() handler in place of the generic_make_request()
function. This eliminates the layer of code that broke IO requests on
zvols into a top half and a bottom half. This has several benefits:
1. No per zvol spinlocks.
2. No redundant IO elevator processing.
3. Interrupts are disabled only when actually necessary.
4. No redispatching of IOs when all taskq threads are busy.
5. Linux's page out routines will properly block.
6. Many autotools checks become obsolete.
An unfortunate consequence of eliminating the layer that
generic_make_request() is that we no longer calls the instrumentation
hooks for block IO accounting. Those hooks are GPL-exported, so we
cannot call them ourselves and consequently, we lose the ability to do
IO monitoring via iostat. Since zvols are internally files mapped as
block devices, this should be okay. Anyone who is willing to accept the
performance penalty for the block IO layer's accounting could use the
loop device in between the zvol and its consumer. Alternatively, perf
and ftrace likely could be used. Also, tools like latencytop will still
work. Tools such as latencytop sometimes provide a better view of
performance bottlenecks than the traditional block IO accounting tools
do.
Lastly, if direct reclaim occurs during spacemap loading and swap is on
a zvol, this code will deadlock. That deadlock could already occur with
sync=always on zvols. Given that swap on zvols is not yet production
ready, this is not a blocker.
Signed-off-by: Richard Yao <ryao@gentoo.org>
2014-07-05 02:43:47 +04:00
|
|
|
/* Extra reference to protect dio_request during vdev_submit_bio */
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_dio_get(dr);
|
|
|
|
|
2016-09-29 23:13:31 +03:00
|
|
|
if (dr->dr_bio_count > 1)
|
|
|
|
blk_start_plug(&plug);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
/* Submit all bio's associated with this dio */
|
2021-01-28 20:28:20 +03:00
|
|
|
for (int i = 0; i < dr->dr_bio_count; i++) {
|
2010-08-26 22:45:02 +04:00
|
|
|
if (dr->dr_bio[i])
|
2016-07-27 20:55:32 +03:00
|
|
|
vdev_submit_bio(dr->dr_bio[i]);
|
2021-01-28 20:28:20 +03:00
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2016-09-29 23:13:31 +03:00
|
|
|
if (dr->dr_bio_count > 1)
|
|
|
|
blk_finish_plug(&plug);
|
|
|
|
|
Cleanup of dead code suggested by Clang Static Analyzer (#14380)
I recently gained the ability to run Clang's static analyzer on the
linux kernel modules via a few hacks. This extended coverage to code
that was previously missed since Clang's static analyzer only looked at
code that we built in userspace. Running it against the Linux kernel
modules built from my local branch produced a total of 72 reports
against my local branch. Of those, 50 were reports of logic errors and
22 were reports of dead code. Since we already had cleaned up all of
the previous dead code reports, I felt it would be a good next step to
clean up these dead code reports. Clang did a further breakdown of the
dead code reports into:
Dead assignment 15
Dead increment 2
Dead nested assignment 5
The benefit of cleaning these up, especially in the case of dead nested
assignment, is that they can expose places where our error handling is
incorrect. A number of them were fairly straight forward. However
several were not:
In vdev_disk_physio_completion(), not only were we not using the return
value from the static function vdev_disk_dio_put(), but nothing used it,
so I changed it to return void and removed the existing (void) cast in
the other area where we call it in addition to no longer storing it to a
stack value.
In FSE_createDTable(), the function is dead code. Its helper function
FSE_freeDTable() is also dead code, as are the CPP definitions in
`module/zstd/include/zstd_compat_wrapper.h`. We just delete it all.
In zfs_zevent_wait(), we have an optimization opportunity. cv_wait_sig()
returns 0 if there are waiting signals and 1 if there are none. The
Linux SPL version literally returns `signal_pending(current) ? 0 : 1)`
and FreeBSD implements the same semantics, we can just do
`!cv_wait_sig()` in place of `signal_pending(current)` to avoid
unnecessarily calling it again.
zfs_setattr() on FreeBSD version did not have error handling issue
because the code was removed entirely from FreeBSD version. The error is
from updating the attribute directory's files. After some thought, I
decided to propapage errors on it to userspace.
In zfs_secpolicy_tmp_snapshot(), we ignore a lack of permission from the
first check in favor of checking three other permissions. I assume this
is intentional.
In zfs_create_fs(), the return value of zap_update() was not checked
despite setting an important version number. I see no backward
compatibility reason to permit failures, so we add an assertion to catch
failures. Interestingly, Linux is still using ASSERT(error == 0) from
OpenSolaris while FreeBSD has switched to the improved ASSERT0(error)
from illumos, although illumos has yet to adopt it here. ASSERT(error ==
0) was used on Linux while ASSERT0(error) was used on FreeBSD since the
entire file needs conversion and that should be the subject of
another patch.
dnode_move()'s issue was caused by us not having implemented
POINTER_IS_VALID() on Linux. We have a stub in
`include/os/linux/spl/sys/kmem_cache.h` for it, when it really should be
in `include/os/linux/spl/sys/kmem.h` to be consistent with
Illumos/OpenSolaris. FreeBSD put both `POINTER_IS_VALID()` and
`POINTER_INVALIDATE()` in `include/os/freebsd/spl/sys/kmem.h`, so we
copy what it did.
Whenever a report was in platform-specific code, I checked the FreeBSD
version to see if it also applied to FreeBSD, but it was only relevant a
few times.
Lastly, the patch that enabled Clang's static analyzer to be run on the
Linux kernel modules needs more work before it can be put into a PR. I
plan to do that in the future as part of the on-going static analysis
work that I am doing.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #14380
2023-01-17 20:57:12 +03:00
|
|
|
vdev_disk_dio_put(dr);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
return (error);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2017-07-24 05:37:12 +03:00
|
|
|
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
2010-08-26 22:45:02 +04:00
|
|
|
{
|
|
|
|
zio_t *zio = bio->bi_private;
|
2015-09-23 18:55:15 +03:00
|
|
|
#ifdef HAVE_1ARG_BIO_END_IO_T
|
2017-07-24 05:37:12 +03:00
|
|
|
zio->io_error = BIO_END_IO_ERROR(bio);
|
|
|
|
#else
|
|
|
|
zio->io_error = -error;
|
2015-09-23 18:55:15 +03:00
|
|
|
#endif
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2017-07-24 05:37:12 +03:00
|
|
|
if (zio->io_error && (zio->io_error == EOPNOTSUPP))
|
2010-08-26 22:45:02 +04:00
|
|
|
zio->io_vd->vdev_nowritecache = B_TRUE;
|
|
|
|
|
|
|
|
bio_put(bio);
|
2010-09-28 02:30:14 +04:00
|
|
|
ASSERT3S(zio->io_error, >=, 0);
|
|
|
|
if (zio->io_error)
|
|
|
|
vdev_disk_error(zio);
|
2010-08-26 22:45:02 +04:00
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
|
|
|
|
{
|
|
|
|
struct request_queue *q;
|
|
|
|
struct bio *bio;
|
|
|
|
|
|
|
|
q = bdev_get_queue(bdev);
|
|
|
|
if (!q)
|
2017-08-03 07:16:12 +03:00
|
|
|
return (SET_ERROR(ENXIO));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2022-05-27 23:28:51 +03:00
|
|
|
bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
|
2014-10-21 22:20:10 +04:00
|
|
|
if (unlikely(bio == NULL))
|
2017-08-03 07:16:12 +03:00
|
|
|
return (SET_ERROR(ENOMEM));
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
bio->bi_end_io = vdev_disk_io_flush_completion;
|
|
|
|
bio->bi_private = zio;
|
2016-12-31 01:03:59 +03:00
|
|
|
bio_set_flush(bio);
|
2016-07-27 20:55:32 +03:00
|
|
|
vdev_submit_bio(bio);
|
Invalidate Linux buffer cache on vdevs upon each flush
Userland tools such as blkid, grub2-probe and zdb will go through the
buffer cache. However, ZFS uses on submit_bio() to bypass the buffer
cache when performing IO operations on vdevs for efficiency purposes.
This permits the on-disk state and buffer cache to fall out of
synchronization. That causes seemingly random failures when tools
reading stale metadata from the buffer cache try to access references to
data that is no longer there.
A particularly bad failure this causes involves grub2-probe, which is
used by grub2-mkconfig. Ordinarily, a rootfs might be called
rpool/ROOT/gentoo. However, when a failure occurs in grub2-probe,
grub2-mkconfig will generate a configuration file containing
/ROOT/gentoo, which omits the pool name and causes a boot failure.
This is avoidable by calling invalidate_bdev() on each flush, which is a
simple way to ensure that all non-dirty pages are wiped. Since userland
tools rarely access vdevs directly, this should be a fancy noop >99.999%
of the time and have little impact on IO. We could have tried a finer
grained approach for the rare instances in which the vdevs are accessed
frequently by userland. However, that would require consideration of
corner cases and it is not worth the effort.
Memory-wise, it would have been better to use a Linux kernel API hook to
disable the buffer cache on such devices, but it provides us no way of
doing that, so we opt for this approach instead. We should revisit that
idea in the future when higher priority issues have been tackled.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2150
2014-02-27 23:03:39 +04:00
|
|
|
invalidate_bdev(bdev);
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
return (0);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2024-02-02 22:51:51 +03:00
|
|
|
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
|
|
|
|
defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
|
|
|
|
BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
|
|
|
|
{
|
|
|
|
zio_t *zio = bio->bi_private;
|
|
|
|
#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
|
|
zio->io_error = BIO_END_IO_ERROR(bio);
|
|
|
|
#else
|
|
|
|
zio->io_error = -error;
|
|
|
|
#endif
|
|
|
|
bio_put(bio);
|
|
|
|
if (zio->io_error)
|
|
|
|
vdev_disk_error(zio);
|
|
|
|
zio_interrupt(zio);
|
|
|
|
}
|
|
|
|
|
2022-05-27 22:40:22 +03:00
|
|
|
static int
|
2024-02-02 22:51:51 +03:00
|
|
|
vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
|
2022-05-27 22:40:22 +03:00
|
|
|
{
|
2024-02-02 22:51:51 +03:00
|
|
|
int ret;
|
|
|
|
struct bio *bio = NULL;
|
2022-05-27 22:40:22 +03:00
|
|
|
|
2024-02-02 22:51:51 +03:00
|
|
|
#if defined(BLKDEV_DISCARD_SECURE)
|
|
|
|
ret = - __blkdev_issue_discard(
|
|
|
|
BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
|
|
|
|
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
|
|
|
|
#else
|
|
|
|
(void) flags;
|
|
|
|
ret = - __blkdev_issue_discard(
|
|
|
|
BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
|
|
|
|
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
|
|
|
|
#endif
|
|
|
|
if (!ret && bio) {
|
|
|
|
bio->bi_private = zio;
|
|
|
|
bio->bi_end_io = vdev_disk_discard_end_io;
|
|
|
|
vdev_submit_bio(bio);
|
2022-05-27 22:40:22 +03:00
|
|
|
}
|
2024-02-02 22:51:51 +03:00
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static int
|
|
|
|
vdev_disk_io_trim(zio_t *zio)
|
|
|
|
{
|
2022-05-27 22:40:22 +03:00
|
|
|
unsigned long trim_flags = 0;
|
2024-02-02 22:51:51 +03:00
|
|
|
if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
|
|
|
|
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
|
|
|
|
return (-blkdev_issue_secure_erase(
|
|
|
|
BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
|
|
|
|
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
|
|
|
|
#elif defined(BLKDEV_DISCARD_SECURE)
|
2022-05-27 22:40:22 +03:00
|
|
|
trim_flags |= BLKDEV_DISCARD_SECURE;
|
|
|
|
#endif
|
2024-02-02 22:51:51 +03:00
|
|
|
}
|
|
|
|
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
|
|
|
|
defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
|
|
|
|
return (vdev_issue_discard_trim(zio, trim_flags));
|
|
|
|
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
|
|
|
|
return (-blkdev_issue_discard(
|
|
|
|
BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
|
2022-05-27 22:40:22 +03:00
|
|
|
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
|
|
|
|
#else
|
|
|
|
#error "Unsupported kernel"
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-10-21 02:07:45 +04:00
|
|
|
static void
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_io_start(zio_t *zio)
|
|
|
|
{
|
|
|
|
vdev_t *v = zio->io_vd;
|
|
|
|
vdev_disk_t *vd = v->vdev_tsd;
|
2019-11-12 19:59:06 +03:00
|
|
|
int rw, error;
|
2010-08-26 22:45:02 +04:00
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
/*
|
|
|
|
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
|
|
|
* Nothing to be done here but return failure.
|
|
|
|
*/
|
|
|
|
if (vd == NULL) {
|
|
|
|
zio->io_error = ENXIO;
|
|
|
|
zio_interrupt(zio);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
rw_enter(&vd->vd_lock, RW_READER);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the vdev is closed, it's likely due to a failed reopen and is
|
|
|
|
* in the UNAVAIL state. Nothing to be done here but return failure.
|
|
|
|
*/
|
2024-01-23 07:42:57 +03:00
|
|
|
if (vd->vd_bdh == NULL) {
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
|
|
|
zio->io_error = ENXIO;
|
|
|
|
zio_interrupt(zio);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
switch (zio->io_type) {
|
|
|
|
case ZIO_TYPE_IOCTL:
|
|
|
|
|
|
|
|
if (!vdev_readable(v)) {
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENXIO);
|
2014-10-21 02:07:45 +04:00
|
|
|
zio_interrupt(zio);
|
|
|
|
return;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
switch (zio->io_cmd) {
|
|
|
|
case DKIOCFLUSHWRITECACHE:
|
|
|
|
|
|
|
|
if (zfs_nocacheflush)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (v->vdev_nowritecache) {
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENOTSUP);
|
2010-08-26 22:45:02 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
if (error == 0) {
|
|
|
|
rw_exit(&vd->vd_lock);
|
2014-10-21 02:07:45 +04:00
|
|
|
return;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
}
|
2010-08-26 22:45:02 +04:00
|
|
|
|
|
|
|
zio->io_error = error;
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENOTSUP);
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
2014-10-21 02:07:45 +04:00
|
|
|
zio_execute(zio);
|
|
|
|
return;
|
2010-08-26 22:45:02 +04:00
|
|
|
case ZIO_TYPE_WRITE:
|
2016-07-27 20:55:32 +03:00
|
|
|
rw = WRITE;
|
2010-08-26 22:45:02 +04:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ZIO_TYPE_READ:
|
2016-07-27 20:55:32 +03:00
|
|
|
rw = READ;
|
2010-08-26 22:45:02 +04:00
|
|
|
break;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
case ZIO_TYPE_TRIM:
|
2022-05-27 22:40:22 +03:00
|
|
|
zio->io_error = vdev_disk_io_trim(zio);
|
2019-03-29 19:13:20 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
2024-02-02 22:51:51 +03:00
|
|
|
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
|
|
|
|
if (zio->io_trim_flags & ZIO_TRIM_SECURE)
|
|
|
|
zio_interrupt(zio);
|
|
|
|
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
|
2019-03-29 19:13:20 +03:00
|
|
|
zio_interrupt(zio);
|
2024-02-02 22:51:51 +03:00
|
|
|
#endif
|
2019-03-29 19:13:20 +03:00
|
|
|
return;
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
default:
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
2013-03-08 22:41:28 +04:00
|
|
|
zio->io_error = SET_ERROR(ENOTSUP);
|
2014-10-21 02:07:45 +04:00
|
|
|
zio_interrupt(zio);
|
|
|
|
return;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
|
2016-05-23 20:41:29 +03:00
|
|
|
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
2024-01-23 07:42:57 +03:00
|
|
|
error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
|
2019-11-12 19:59:06 +03:00
|
|
|
zio->io_size, zio->io_offset, rw, 0);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
rw_exit(&vd->vd_lock);
|
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
if (error) {
|
|
|
|
zio->io_error = error;
|
2014-10-21 02:07:45 +04:00
|
|
|
zio_interrupt(zio);
|
|
|
|
return;
|
2010-08-26 22:45:02 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_io_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If the device returned EIO, we revalidate the media. If it is
|
|
|
|
* determined the media has changed this triggers the asynchronous
|
|
|
|
* removal of the device from the configuration.
|
|
|
|
*/
|
|
|
|
if (zio->io_error == EIO) {
|
2013-11-01 23:26:11 +04:00
|
|
|
vdev_t *v = zio->io_vd;
|
2010-08-26 22:45:02 +04:00
|
|
|
vdev_disk_t *vd = v->vdev_tsd;
|
|
|
|
|
2024-01-23 07:42:57 +03:00
|
|
|
if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
|
|
|
|
invalidate_bdev(BDH_BDEV(vd->vd_bdh));
|
2010-08-26 22:45:02 +04:00
|
|
|
v->vdev_remove_wanted = B_TRUE;
|
|
|
|
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_hold(vdev_t *vd)
|
|
|
|
{
|
|
|
|
ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
|
|
|
|
|
|
|
|
/* We must have a pathname, and it must be absolute. */
|
|
|
|
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only prefetch path and devid info if the device has
|
|
|
|
* never been opened.
|
|
|
|
*/
|
|
|
|
if (vd->vdev_tsd != NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vdev_disk_rele(vdev_t *vd)
|
|
|
|
{
|
|
|
|
ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
|
|
|
|
|
|
|
|
/* XXX: Implement me as a vnode rele for the device */
|
|
|
|
}
|
|
|
|
|
|
|
|
vdev_ops_t vdev_disk_ops = {
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
.vdev_op_init = NULL,
|
|
|
|
.vdev_op_fini = NULL,
|
2019-06-21 04:29:02 +03:00
|
|
|
.vdev_op_open = vdev_disk_open,
|
|
|
|
.vdev_op_close = vdev_disk_close,
|
|
|
|
.vdev_op_asize = vdev_default_asize,
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
.vdev_op_min_asize = vdev_default_min_asize,
|
|
|
|
.vdev_op_min_alloc = NULL,
|
2019-06-21 04:29:02 +03:00
|
|
|
.vdev_op_io_start = vdev_disk_io_start,
|
|
|
|
.vdev_op_io_done = vdev_disk_io_done,
|
|
|
|
.vdev_op_state_change = NULL,
|
|
|
|
.vdev_op_need_resilver = NULL,
|
|
|
|
.vdev_op_hold = vdev_disk_hold,
|
|
|
|
.vdev_op_rele = vdev_disk_rele,
|
|
|
|
.vdev_op_remap = NULL,
|
|
|
|
.vdev_op_xlate = vdev_default_xlate,
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
.vdev_op_rebuild_asize = NULL,
|
|
|
|
.vdev_op_metaslab_init = NULL,
|
|
|
|
.vdev_op_config_generate = NULL,
|
|
|
|
.vdev_op_nparity = NULL,
|
|
|
|
.vdev_op_ndisks = NULL,
|
2019-06-21 04:29:02 +03:00
|
|
|
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
|
2022-09-28 19:48:46 +03:00
|
|
|
.vdev_op_leaf = B_TRUE, /* leaf vdev */
|
|
|
|
.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
|
2010-08-26 22:45:02 +04:00
|
|
|
};
|
|
|
|
|
2019-11-27 21:35:49 +03:00
|
|
|
/*
|
|
|
|
* The zfs_vdev_scheduler module option has been deprecated. Setting this
|
|
|
|
* value no longer has any effect. It has not yet been entirely removed
|
|
|
|
* to allow the module to be loaded if this option is specified in the
|
|
|
|
* /etc/modprobe.d/zfs.conf file. The following warning will be logged.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
|
|
|
|
{
|
|
|
|
int error = param_set_charp(val, kp);
|
|
|
|
if (error == 0) {
|
|
|
|
printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
|
|
|
|
"is not supported.\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2022-01-15 02:37:55 +03:00
|
|
|
static const char *zfs_vdev_scheduler = "unused";
|
2017-09-05 23:41:32 +03:00
|
|
|
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
|
|
|
|
param_get_charp, &zfs_vdev_scheduler, 0644);
|
2011-05-04 02:09:28 +04:00
|
|
|
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
|
2020-08-21 22:53:17 +03:00
|
|
|
|
|
|
|
int
|
|
|
|
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
|
|
|
|
{
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
uint_t val;
|
2020-08-21 22:53:17 +03:00
|
|
|
int error;
|
|
|
|
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
error = kstrtouint(buf, 0, &val);
|
2020-08-21 22:53:17 +03:00
|
|
|
if (error < 0)
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
|
|
|
|
if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
|
|
|
|
return (SET_ERROR(-EINVAL));
|
|
|
|
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
error = param_set_uint(buf, kp);
|
2020-08-21 22:53:17 +03:00
|
|
|
if (error < 0)
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
|
|
|
|
{
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
uint_t val;
|
2020-08-21 22:53:17 +03:00
|
|
|
int error;
|
|
|
|
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
error = kstrtouint(buf, 0, &val);
|
2020-08-21 22:53:17 +03:00
|
|
|
if (error < 0)
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
|
|
|
|
if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
|
|
|
|
return (SET_ERROR(-EINVAL));
|
|
|
|
|
Cleanup: 64-bit kernel module parameters should use fixed width types
Various module parameters such as `zfs_arc_max` were originally
`uint64_t` on OpenSolaris/Illumos, but were changed to `unsigned long`
for Linux compatibility because Linux's kernel default module parameter
implementation did not support 64-bit types on 32-bit platforms. This
caused problems when porting OpenZFS to Windows because its LLP64 memory
model made `unsigned long` a 32-bit type on 64-bit, which created the
undesireable situation that parameters that should accept 64-bit values
could not on 64-bit Windows.
Upon inspection, it turns out that the Linux kernel module parameter
interface is extensible, such that we are allowed to define our own
types. Rather than maintaining the original type change via hacks to to
continue shrinking module parameters on 32-bit Linux, we implement
support for 64-bit module parameters on Linux.
After doing a review of all 64-bit kernel parameters (found via the man
page and also proposed changes by Andrew Innes), the kernel module
parameters fell into a few groups:
Parameters that were originally 64-bit on Illumos:
* dbuf_cache_max_bytes
* dbuf_metadata_cache_max_bytes
* l2arc_feed_min_ms
* l2arc_feed_secs
* l2arc_headroom
* l2arc_headroom_boost
* l2arc_write_boost
* l2arc_write_max
* metaslab_aliquot
* metaslab_force_ganging
* zfetch_array_rd_sz
* zfs_arc_max
* zfs_arc_meta_limit
* zfs_arc_meta_min
* zfs_arc_min
* zfs_async_block_max_blocks
* zfs_condense_max_obsolete_bytes
* zfs_condense_min_mapping_bytes
* zfs_deadman_checktime_ms
* zfs_deadman_synctime_ms
* zfs_initialize_chunk_size
* zfs_initialize_value
* zfs_lua_max_instrlimit
* zfs_lua_max_memlimit
* zil_slog_bulk
Parameters that were originally 32-bit on Illumos:
* zfs_per_txg_dirty_frees_percent
Parameters that were originally `ssize_t` on Illumos:
* zfs_immediate_write_sz
Note that `ssize_t` is `int32_t` on 32-bit and `int64_t` on 64-bit. It
has been upgraded to 64-bit.
Parameters that were `long`/`unsigned long` because of Linux/FreeBSD
influence:
* l2arc_rebuild_blocks_min_l2size
* zfs_key_max_salt_uses
* zfs_max_log_walking
* zfs_max_logsm_summary_length
* zfs_metaslab_max_size_cache_sec
* zfs_min_metaslabs_to_flush
* zfs_multihost_interval
* zfs_unflushed_log_block_max
* zfs_unflushed_log_block_min
* zfs_unflushed_log_block_pct
* zfs_unflushed_max_mem_amt
* zfs_unflushed_max_mem_ppm
New parameters that do not exist in Illumos:
* l2arc_trim_ahead
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_arc_sys_free
* zfs_deadman_ziotime_ms
* zfs_delete_blocks
* zfs_history_output_max
* zfs_livelist_max_entries
* zfs_max_async_dedup_frees
* zfs_max_nvlist_src_size
* zfs_rebuild_max_segment
* zfs_rebuild_vdev_limit
* zfs_unflushed_log_txg_max
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
* zfs_vnops_read_chunk_size
* zvol_max_discard_blocks
Rather than clutter the lists with commentary, the module parameters
that need comments are repeated below.
A few parameters were defined in Linux/FreeBSD specific code, where the
use of ulong/long is not an issue for portability, so we leave them
alone:
* zfs_delete_blocks
* zfs_key_max_salt_uses
* zvol_max_discard_blocks
The documentation for a few parameters was found to be incorrect:
* zfs_deadman_checktime_ms - incorrectly documented as int
* zfs_delete_blocks - not documented as Linux only
* zfs_history_output_max - incorrectly documented as int
* zfs_vnops_read_chunk_size - incorrectly documented as long
* zvol_max_discard_blocks - incorrectly documented as ulong
The documentation for these has been fixed, alongside the changes to
document the switch to fixed width types.
In addition, several kernel module parameters were percentages or held
ashift values, so being 64-bit never made sense for them. They have been
downgraded to 32-bit:
* vdev_file_logical_ashift
* vdev_file_physical_ashift
* zfs_arc_dnode_limit_percent
* zfs_arc_dnode_reduce_percent
* zfs_arc_meta_limit_percent
* zfs_per_txg_dirty_frees_percent
* zfs_unflushed_log_block_pct
* zfs_vdev_max_auto_ashift
* zfs_vdev_min_auto_ashift
Of special note are `zfs_vdev_max_auto_ashift` and
`zfs_vdev_min_auto_ashift`, which were already defined as `uint64_t`,
and passed to the kernel as `ulong`. This is inherently buggy on big
endian 32-bit Linux, since the values would not be written to the
correct locations. 32-bit FreeBSD was unaffected because its sysctl code
correctly treated this as a `uint64_t`.
Lastly, a code comment suggests that `zfs_arc_sys_free` is
Linux-specific, but there is nothing to indicate to me that it is
Linux-specific. Nothing was done about that.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Original-patch-by: Andrew Innes <andrew.c12@gmail.com>
Original-patch-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Closes #13984
Closes #14004
2022-10-03 22:06:54 +03:00
|
|
|
error = param_set_uint(buf, kp);
|
2020-08-21 22:53:17 +03:00
|
|
|
if (error < 0)
|
|
|
|
return (SET_ERROR(error));
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
2022-11-04 01:02:46 +03:00
|
|
|
|
|
|
|
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
|
|
|
|
"Timeout before determining that a device is missing");
|
2022-11-11 00:37:12 +03:00
|
|
|
|
|
|
|
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
|
|
|
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|