mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 19:57:43 +03:00
RAID-Z expansion feature
This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs cannot be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it across the expanded set of disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc.). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. a 5-wide RAIDZ2 has 3 data to 2 parity), but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Sponsored-by: iXsystems, Inc. Sponsored-by: vStack Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Authored-by: Matthew Ahrens <mahrens@delphix.com> Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com> Contributions-by: Stuart Maybee <stuart.maybee@comcast.net> Contributions-by: Thorsten Behrens <tbehrens@outlook.com> Contributions-by: Fmstrat <nospam@nowsci.com> Contributions-by: Don Brady <dev.fs.zfs@gmail.com> Signed-off-by: Don Brady <dev.fs.zfs@gmail.com> Closes #15022
This commit is contained in:
@@ -30,6 +30,8 @@
|
||||
#include <sys/kstat.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/abd_impl.h>
|
||||
#include <sys/zfs_rlock.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -102,28 +104,32 @@ typedef struct raidz_impl_ops {
|
||||
char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */
|
||||
} raidz_impl_ops_t;
|
||||
|
||||
|
||||
typedef struct raidz_col {
|
||||
uint64_t rc_devidx; /* child device index for I/O */
|
||||
int rc_devidx; /* child device index for I/O */
|
||||
uint32_t rc_size; /* I/O size */
|
||||
uint64_t rc_offset; /* device offset */
|
||||
uint64_t rc_size; /* I/O size */
|
||||
abd_t rc_abdstruct; /* rc_abd probably points here */
|
||||
abd_t *rc_abd; /* I/O data */
|
||||
abd_t *rc_orig_data; /* pre-reconstruction */
|
||||
int rc_error; /* I/O error for this device */
|
||||
uint8_t rc_tried; /* Did we attempt this I/O column? */
|
||||
uint8_t rc_skipped; /* Did we skip this I/O column? */
|
||||
uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
|
||||
uint8_t rc_force_repair; /* Write good data to this column */
|
||||
uint8_t rc_allow_repair; /* Allow repair I/O to this column */
|
||||
uint8_t rc_tried:1; /* Did we attempt this I/O column? */
|
||||
uint8_t rc_skipped:1; /* Did we skip this I/O column? */
|
||||
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
|
||||
uint8_t rc_force_repair:1; /* Write good data to this column */
|
||||
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
|
||||
int rc_shadow_devidx; /* for double write during expansion */
|
||||
int rc_shadow_error; /* for double write during expansion */
|
||||
uint64_t rc_shadow_offset; /* for double write during expansion */
|
||||
} raidz_col_t;
|
||||
|
||||
typedef struct raidz_row {
|
||||
uint64_t rr_cols; /* Regular column count */
|
||||
uint64_t rr_scols; /* Count including skipped columns */
|
||||
uint64_t rr_bigcols; /* Remainder data column count */
|
||||
uint64_t rr_missingdata; /* Count of missing data devices */
|
||||
uint64_t rr_missingparity; /* Count of missing parity devices */
|
||||
uint64_t rr_firstdatacol; /* First data column/parity count */
|
||||
int rr_cols; /* Regular column count */
|
||||
int rr_scols; /* Count including skipped columns */
|
||||
int rr_bigcols; /* Remainder data column count */
|
||||
int rr_missingdata; /* Count of missing data devices */
|
||||
int rr_missingparity; /* Count of missing parity devices */
|
||||
int rr_firstdatacol; /* First data column/parity count */
|
||||
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
|
||||
int rr_nempty; /* empty sectors included in parity */
|
||||
#ifdef ZFS_DEBUG
|
||||
@@ -138,10 +144,25 @@ typedef struct raidz_map {
|
||||
int rm_nrows; /* Regular row count */
|
||||
int rm_nskip; /* RAIDZ sectors skipped for padding */
|
||||
int rm_skipstart; /* Column index of padding start */
|
||||
int rm_original_width; /* pre-expansion width of raidz vdev */
|
||||
int rm_nphys_cols; /* num entries in rm_phys_col[] */
|
||||
zfs_locked_range_t *rm_lr;
|
||||
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
|
||||
raidz_col_t *rm_phys_col; /* if non-NULL, read i/o aggregation */
|
||||
raidz_row_t *rm_row[0]; /* flexible array of rows */
|
||||
} raidz_map_t;
|
||||
|
||||
/*
|
||||
* Nodes in vdev_raidz_t:vd_expand_txgs.
|
||||
* Blocks with physical birth time of re_txg or later have the specified
|
||||
* logical width (until the next node).
|
||||
*/
|
||||
typedef struct reflow_node {
|
||||
uint64_t re_txg; /* first physical birth txg this width applies to */
|
||||
uint64_t re_logical_width; /* logical width of blocks born at re_txg or later */
|
||||
avl_node_t re_link; /* NOTE(review): presumably AVL linkage in vd_expand_txgs; confirm */
|
||||
} reflow_node_t;
|
||||
|
||||
|
||||
#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user