mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-11-17 18:11:00 +03:00
6cb8e5306d
Some work has been done lately to improve the debuggability of the ZFS pool load (and import) process. This includes: 7638 Refactor spa_load_impl into several functions; 8961 SPA load/import should tell us why it failed; 7277 zdb should be able to print zfs_dbgmsg's. To iterate on top of that, there are a few changes that were made to make the import process more resilient and crash-free. One of the first tasks during the pool load process is to parse a config provided from userland that describes what devices the pool is composed of. A vdev tree is generated from that config, and then all the vdevs are opened. The Meta Object Set (MOS) of the pool is accessed, and several metadata objects that are necessary to load the pool are read. The exact configuration of the pool is also stored inside the MOS. Since the configuration provided from userland is external and might not accurately describe the vdev tree of the pool at the txg that is being loaded, it cannot be relied upon to safely operate the pool. For that reason, the configuration in the MOS is read early on. In the past, the two configurations were compared together and if there was a mismatch then the load process was aborted and an error was returned. The latter was a good way to ensure a pool does not get corrupted; however, it made the pool load process needlessly fragile in cases where the vdev configuration changed or the userland configuration was outdated. Since the MOS is stored in 3 copies, the configuration provided by userland doesn't have to be perfect in order to read its contents. Hence, a new approach has been adopted: The pool is first opened with the untrusted userland configuration just so that the real configuration can be read from the MOS. The trusted MOS configuration is then used to generate a new vdev tree and the pool is re-opened. When the pool is opened with an untrusted configuration, writes are disabled to avoid accidentally damaging it. 
During reads, some sanity checks are performed on block pointers to see if each DVA points to a known vdev; when the configuration is untrusted, instead of panicking the system if those checks fail we simply avoid issuing reads to the invalid DVAs. This new two-step pool load process now allows rewinding pools across vdev tree changes such as device replacement, addition, etc. Loading a pool from an external config file in a clustering environment also becomes much safer now since the pool will import even if the config is outdated and didn't, for instance, register a recent device addition. With this code in place, it became relatively easy to implement a long-sought-after feature: the ability to import a pool with missing top-level (i.e. non-redundant) devices. Note that since this almost guarantees some loss of data, this feature is for now restricted to a read-only import. Porting notes (ZTS): * Fix 'make dist' target in zpool_import * The maximum path length allowed by tar is 99 characters. Several of the new test cases exceeded this limit resulting in them not being included in the tarball. Shorten the names slightly. * Set/get tunables using accessor functions. * Get last synced txg via the "zfs_txg_history" mechanism. * Clear zinject handlers in cleanup for import_cache_device_replaced and import_rewind_device_replaced in order that the zpool can be exported if there is an error. * Increase FILESIZE to 8G in zfs-test.sh to allow for a larger ext4 file system to be created on ZFS_DISK2. Also, there's no need to partition ZFS_DISK2 at all. The partitioning had already been disabled for multipath devices. Among other things, the partitioning steals some space from the ext4 file system, makes it difficult to accurately calculate the parameters to parted and can make some of the tests fail. * Increase FS_SIZE and FILE_SIZE in the zpool_import test configuration now that FILESIZE is larger. 
* Write more data so that device evacuation takes longer in a couple of tests. * Use mkdir -p to avoid errors when the directory already exists. * Remove use of sudo in import_rewind_config_changed. Authored by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Andrew Stormont <andyjstormont@gmail.com> Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://illumos.org/issues/9075 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123 Closes #7459
188 lines
7.0 KiB
C
188 lines
7.0 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_VDEV_H
|
|
#define _SYS_VDEV_H
|
|
|
|
#include <sys/spa.h>
|
|
#include <sys/zio.h>
|
|
#include <sys/dmu.h>
|
|
#include <sys/space_map.h>
|
|
#include <sys/fs/zfs.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
 * DTL categories.  A value of this type is passed as the selector
 * argument to the vdev_dtl_*() routines declared in this header.
 *
 * Enumerators are numbered implicitly from 0, so DTL_TYPES (the last
 * entry) equals the number of real categories and must stay last.
 */
typedef enum vdev_dtl_type {
	DTL_MISSING,	/* 0% replication: no copies of the data */
	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
	DTL_TYPES	/* count of the categories above; not a category */
} vdev_dtl_type_t;
|
|
|
|
/*
 * Declared here, defined elsewhere.
 * NOTE(review): the name suggests a switch that suppresses cache
 * flushes -- confirm against the definition before relying on it.
 */
extern int zfs_nocacheflush;
|
|
|
|
extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
|
|
extern void vdev_dbgmsg_print_tree(vdev_t *, int);
|
|
extern int vdev_open(vdev_t *);
|
|
extern void vdev_open_children(vdev_t *);
|
|
extern int vdev_validate(vdev_t *);
|
|
extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
|
|
extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
|
|
extern void vdev_close(vdev_t *);
|
|
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
|
|
extern void vdev_reopen(vdev_t *);
|
|
extern int vdev_validate_aux(vdev_t *vd);
|
|
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
|
|
extern boolean_t vdev_is_concrete(vdev_t *vd);
|
|
extern boolean_t vdev_is_bootable(vdev_t *vd);
|
|
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
|
|
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
|
|
extern int vdev_count_leaves(spa_t *spa);
|
|
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
|
|
uint64_t txg, uint64_t size);
|
|
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
|
|
uint64_t txg, uint64_t size);
|
|
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
|
|
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
|
|
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
|
|
int scrub_done);
|
|
extern boolean_t vdev_dtl_required(vdev_t *vd);
|
|
extern boolean_t vdev_resilver_needed(vdev_t *vd,
|
|
uint64_t *minp, uint64_t *maxp);
|
|
extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
|
|
dmu_tx_t *tx);
|
|
extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
|
|
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
|
|
extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
|
|
extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
|
|
uint64_t size, uint64_t txg);
|
|
extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
|
|
uint64_t offset, uint64_t size, dmu_tx_t *tx);
|
|
|
|
/*
 * NOTE(review): hold/rele read as a paired acquire/release interface;
 * confirm the pairing rules at the definitions.
 */
extern void vdev_hold(vdev_t *vd);
extern void vdev_rele(vdev_t *vd);
|
|
|
|
/*
 * Metaslab setup/teardown for a vdev, plus expand, split and deadman
 * entry points.
 */
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *vd);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
|
|
|
|
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
|
|
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
|
|
extern void vdev_clear_stats(vdev_t *vd);
|
|
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
|
|
extern void vdev_scan_stat_init(vdev_t *vd);
|
|
extern void vdev_propagate_state(vdev_t *vd);
|
|
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
|
|
vdev_aux_t aux);
|
|
extern boolean_t vdev_children_are_offline(vdev_t *vd);
|
|
|
|
/*
 * Apply signed deltas to a vdev's space accounting.  All three deltas
 * are int64_t, so each may be negative.
 */
extern void vdev_space_update(vdev_t *vd,
    int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
|
|
|
|
/*
 * Maps a psize to a 64-bit result for the given vdev.
 * NOTE(review): presumably physical size -> allocated size; confirm.
 */
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
|
|
|
|
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
|
|
extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
|
|
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
|
|
vdev_state_t *);
|
|
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
|
|
extern void vdev_clear(spa_t *spa, vdev_t *vd);
|
|
|
|
extern boolean_t vdev_is_dead(vdev_t *vd);
|
|
extern boolean_t vdev_readable(vdev_t *vd);
|
|
extern boolean_t vdev_writeable(vdev_t *vd);
|
|
extern boolean_t vdev_allocatable(vdev_t *vd);
|
|
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
|
|
|
|
extern void vdev_cache_init(vdev_t *vd);
|
|
extern void vdev_cache_fini(vdev_t *vd);
|
|
extern boolean_t vdev_cache_read(zio_t *zio);
|
|
extern void vdev_cache_write(zio_t *zio);
|
|
extern void vdev_cache_purge(vdev_t *vd);
|
|
|
|
extern void vdev_queue_init(vdev_t *vd);
|
|
extern void vdev_queue_fini(vdev_t *vd);
|
|
extern zio_t *vdev_queue_io(zio_t *zio);
|
|
extern void vdev_queue_io_done(zio_t *zio);
|
|
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
|
|
|
|
extern int vdev_queue_length(vdev_t *vd);
|
|
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
|
|
|
|
/*
 * Config and state dirty/clean marking, plus config sync.
 * vdev_config_sync() takes an array of svdcount vdev pointers.
 */
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);

extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
|
|
|
|
/*
 * Flags accepted by vdev_config_generate().  Every enumerator occupies
 * its own bit (1 << n), so the values never overlap.
 */
typedef enum vdev_config_flag {
	VDEV_CONFIG_SPARE = 1 << 0,
	VDEV_CONFIG_L2CACHE = 1 << 1,
	VDEV_CONFIG_REMOVING = 1 << 2,
	VDEV_CONFIG_MOS = 1 << 3,
	VDEV_CONFIG_MISSING = 1 << 4
} vdev_config_flag_t;
|
|
|
|
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
|
|
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
|
|
boolean_t getstats, vdev_config_flag_t flags);
|
|
|
|
/*
|
|
* Label routines
|
|
*/
|
|
struct uberblock;
|
|
extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
|
|
extern int vdev_label_number(uint64_t psise, uint64_t offset);
|
|
extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
|
|
extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
|
|
extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
|
|
extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
|
|
offset, uint64_t size, zio_done_func_t *done, void *private, int flags);
|
|
|
|
/*
 * Reason a label is being written; passed as the "reason" argument of
 * vdev_label_init().  Values are numbered implicitly from 0.
 */
typedef enum {
	VDEV_LABEL_CREATE,	/* create/add a new device */
	VDEV_LABEL_REPLACE,	/* replace an existing device */
	VDEV_LABEL_SPARE,	/* add a new hot spare */
	VDEV_LABEL_REMOVE,	/* remove an existing device */
	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
} vdev_labeltype_t;
|
|
|
|
/*
 * Label initialization for vd at the given txg; "reason" is one of the
 * VDEV_LABEL_* values.  Returns an int.
 * NOTE(review): presumably 0 on success / errno on failure -- confirm.
 */
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_VDEV_H */
|