mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-15 20:50:30 +03:00
b2255edcc0
This patch adds a new top-level vdev type called dRAID, which stands for Distributed parity RAID. This pool configuration allows all dRAID vdevs to participate when rebuilding to a distributed hot spare device. This can substantially reduce the total time required to restore full parity to a pool with a failed device. A dRAID pool can be created using the new top-level `draid` type. Like `raidz`, the desired redundancy is specified after the type: `draid[1,2,3]`. No additional information is required to create the pool and reasonable default values will be chosen based on the number of child vdevs in the dRAID vdev. zpool create <pool> draid[1,2,3] <vdevs...> Unlike raidz, additional optional dRAID configuration values can be provided as part of the draid type as colon separated values. This allows administrators to fully specify a layout for either performance or capacity reasons. The supported options include: zpool create <pool> \ draid[<parity>][:<data>d][:<children>c][:<spares>s] \ <vdevs...> - draid[parity] - Parity level (default 1) - draid[:<data>d] - Data devices per group (default 8) - draid[:<children>c] - Expected number of child vdevs - draid[:<spares>s] - Distributed hot spares (default 0) Abbreviated example `zpool status` output for a 68 disk dRAID pool with two distributed spares using special allocation classes. ``` pool: tank state: ONLINE config: NAME STATE READ WRITE CKSUM slag7 ONLINE 0 0 0 draid2:8d:68c:2s-0 ONLINE 0 0 0 L0 ONLINE 0 0 0 L1 ONLINE 0 0 0 ... U25 ONLINE 0 0 0 U26 ONLINE 0 0 0 spare-53 ONLINE 0 0 0 U27 ONLINE 0 0 0 draid2-0-0 ONLINE 0 0 0 U28 ONLINE 0 0 0 U29 ONLINE 0 0 0 ... U42 ONLINE 0 0 0 U43 ONLINE 0 0 0 special mirror-1 ONLINE 0 0 0 L5 ONLINE 0 0 0 U5 ONLINE 0 0 0 mirror-2 ONLINE 0 0 0 L6 ONLINE 0 0 0 U6 ONLINE 0 0 0 spares draid2-0-0 INUSE currently in use draid2-0-1 AVAIL ``` When adding test coverage for the new dRAID vdev type the following options were added to the ztest command. 
These options are leveraged by zloop.sh to test a wide range of dRAID configurations. -K draid|raidz|random - kind of RAID to test -D <value> - dRAID data drives per group -S <value> - dRAID distributed hot spares -R <value> - RAID parity (raidz or dRAID) The zpool_create, zpool_import, redundancy, replacement and fault test groups have all been updated to provide test coverage for the dRAID feature. Co-authored-by: Isaac Huang <he.huang@intel.com> Co-authored-by: Mark Maybee <mmaybee@cray.com> Co-authored-by: Don Brady <don.brady@delphix.com> Co-authored-by: Matthew Ahrens <mahrens@delphix.com> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mmaybee@cray.com> Reviewed-by: Matt Ahrens <matt@delphix.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #10102
195 lines
7.1 KiB
C
195 lines
7.1 KiB
C
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
 */

#ifndef _SYS_DSL_SCAN_H
#define _SYS_DSL_SCAN_H

#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/bplist.h>

/* Give everything declared in this header C linkage under a C++ compiler. */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Forward declarations. The types are left incomplete here, so this
 * header can only refer to them through pointers.
 */
struct objset;
struct dsl_dir;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;

/*
 * Declared here, defined elsewhere.
 * NOTE(review): judging by the name this is a knob that holds back scan
 * progress -- confirm against the definition.
 */
extern int zfs_scan_suspend_progress;

/*
|
|
* All members of this structure must be uint64_t, for byteswap
|
|
* purposes.
|
|
*/
|
|
typedef struct dsl_scan_phys {
|
|
uint64_t scn_func; /* pool_scan_func_t */
|
|
uint64_t scn_state; /* dsl_scan_state_t */
|
|
uint64_t scn_queue_obj;
|
|
uint64_t scn_min_txg;
|
|
uint64_t scn_max_txg;
|
|
uint64_t scn_cur_min_txg;
|
|
uint64_t scn_cur_max_txg;
|
|
uint64_t scn_start_time;
|
|
uint64_t scn_end_time;
|
|
uint64_t scn_to_examine; /* total bytes to be scanned */
|
|
uint64_t scn_examined; /* bytes scanned so far */
|
|
uint64_t scn_to_process;
|
|
uint64_t scn_processed;
|
|
uint64_t scn_errors; /* scan I/O error count */
|
|
uint64_t scn_ddt_class_max;
|
|
ddt_bookmark_t scn_ddt_bookmark;
|
|
zbookmark_phys_t scn_bookmark;
|
|
uint64_t scn_flags; /* dsl_scan_flags_t */
|
|
} dsl_scan_phys_t;
|
|
|
|
#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
|
|
|
|
/*
 * Bit flags kept in the scn_flags member of dsl_scan_phys_t. Each flag
 * occupies its own bit so they can be OR-ed together.
 */
typedef enum dsl_scan_flags {
	DSF_VISIT_DS_AGAIN = 1 << 0,
	DSF_SCRUB_PAUSED = 1 << 1,
} dsl_scan_flags_t;

/*
 * NOTE(review): the mask covers DSF_VISIT_DS_AGAIN only and leaves out
 * DSF_SCRUB_PAUSED. The users of the mask are not visible from this
 * header, so the value is kept as is -- confirm the omission is intended.
 */
#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN)

/*
|
|
* Every pool will have one dsl_scan_t and this structure will contain
|
|
* in-memory information about the scan and a pointer to the on-disk
|
|
* representation (i.e. dsl_scan_phys_t). Most of the state of the scan
|
|
* is contained on-disk to allow the scan to resume in the event of a reboot
|
|
* or panic. This structure maintains information about the behavior of a
|
|
* running scan, some caching information, and how it should traverse the pool.
|
|
*
|
|
* The following members of this structure direct the behavior of the scan:
|
|
*
|
|
* scn_suspending - a scan that cannot be completed in a single txg or
|
|
* has exceeded its allotted time will need to suspend.
|
|
* When this flag is set the scanner will stop traversing
|
|
* the pool and write out the current state to disk.
|
|
*
|
|
* scn_restart_txg - directs the scanner to either restart or start a
|
|
* a scan at the specified txg value.
|
|
*
|
|
* scn_done_txg - when a scan completes its traversal it will set
|
|
* the completion txg to the next txg. This is necessary
|
|
* to ensure that any blocks that were freed during
|
|
* the scan but have not yet been processed (i.e deferred
|
|
* frees) are accounted for.
|
|
*
|
|
* This structure also maintains information about deferred frees which are
|
|
* a special kind of traversal. Deferred free can exist in either a bptree or
|
|
* a bpobj structure. The scn_is_bptree flag will indicate the type of
|
|
* deferred free that is in progress. If the deferred free is part of an
|
|
* asynchronous destroy then the scn_async_destroying flag will be set.
|
|
*/
|
|
typedef struct dsl_scan {
|
|
struct dsl_pool *scn_dp;
|
|
uint64_t scn_restart_txg;
|
|
uint64_t scn_done_txg;
|
|
uint64_t scn_sync_start_time;
|
|
uint64_t scn_issued_before_pass;
|
|
|
|
/* for freeing blocks */
|
|
boolean_t scn_is_bptree;
|
|
boolean_t scn_async_destroying;
|
|
boolean_t scn_async_stalled;
|
|
uint64_t scn_async_block_min_time_ms;
|
|
|
|
/* flags and stats for controlling scan state */
|
|
boolean_t scn_is_sorted; /* doing sequential scan */
|
|
boolean_t scn_clearing; /* scan is issuing sequential extents */
|
|
boolean_t scn_checkpointing; /* scan is issuing all queued extents */
|
|
boolean_t scn_suspending; /* scan is suspending until next txg */
|
|
uint64_t scn_last_checkpoint; /* time of last checkpoint */
|
|
|
|
/* members for thread synchronization */
|
|
zio_t *scn_zio_root; /* root zio for waiting on IO */
|
|
taskq_t *scn_taskq; /* task queue for issuing extents */
|
|
|
|
/* for controlling scan prefetch, protected by spa_scrub_lock */
|
|
boolean_t scn_prefetch_stop; /* prefetch should stop */
|
|
zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
|
|
avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
|
|
uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
|
|
|
|
/* per txg statistics */
|
|
uint64_t scn_visited_this_txg; /* total bps visited this txg */
|
|
uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */
|
|
uint64_t scn_holes_this_txg;
|
|
uint64_t scn_lt_min_this_txg;
|
|
uint64_t scn_gt_max_this_txg;
|
|
uint64_t scn_ddt_contained_this_txg;
|
|
uint64_t scn_objsets_visited_this_txg;
|
|
uint64_t scn_avg_seg_size_this_txg;
|
|
uint64_t scn_segs_this_txg;
|
|
uint64_t scn_avg_zio_size_this_txg;
|
|
uint64_t scn_zios_this_txg;
|
|
|
|
/* members needed for syncing scan status to disk */
|
|
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
|
|
dsl_scan_phys_t scn_phys_cached;
|
|
avl_tree_t scn_queue; /* queue of datasets to scan */
|
|
uint64_t scn_bytes_pending; /* outstanding data to issue */
|
|
} dsl_scan_t;
|
|
|
|
/*
 * Only the struct tag is declared in this header; the layout of
 * struct dsl_scan_io_queue is not visible from here, so the type can be
 * used through pointers only.
 */
typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;

void scan_init(void);
|
|
void scan_fini(void);
|
|
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
|
|
void dsl_scan_setup_sync(void *, dmu_tx_t *);
|
|
void dsl_scan_fini(struct dsl_pool *dp);
|
|
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
|
|
int dsl_scan_cancel(struct dsl_pool *);
|
|
int dsl_scan(struct dsl_pool *, pool_scan_func_t);
|
|
void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd);
|
|
boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
|
|
int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
|
|
void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg);
|
|
boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
|
|
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
|
|
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
|
|
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
|
|
ddt_entry_t *dde, dmu_tx_t *tx);
|
|
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
|
|
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
|
|
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
|
|
struct dmu_tx *tx);
|
|
boolean_t dsl_scan_active(dsl_scan_t *scn);
|
|
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
|
|
void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
|
|
void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
|
|
void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
|
|
|
|
/* Close the extern "C" block opened near the top of this header. */
#ifdef __cplusplus
}
#endif

#endif /* _SYS_DSL_SCAN_H */