mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-21 18:26:47 +03:00
Rebase master to b105
This commit is contained in:
+3
-2
@@ -3491,6 +3491,7 @@ arc_fini(void)
|
||||
mutex_destroy(&arc_mru_ghost->arcs_mtx);
|
||||
mutex_destroy(&arc_mfu->arcs_mtx);
|
||||
mutex_destroy(&arc_mfu_ghost->arcs_mtx);
|
||||
mutex_destroy(&arc_l2c_only->arcs_mtx);
|
||||
|
||||
mutex_destroy(&zfs_write_limit_lock);
|
||||
|
||||
@@ -4457,7 +4458,7 @@ l2arc_fini(void)
|
||||
void
|
||||
l2arc_start(void)
|
||||
{
|
||||
if (!(spa_mode & FWRITE))
|
||||
if (!(spa_mode_global & FWRITE))
|
||||
return;
|
||||
|
||||
(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
|
||||
@@ -4467,7 +4468,7 @@ l2arc_start(void)
|
||||
void
|
||||
l2arc_stop(void)
|
||||
{
|
||||
if (!(spa_mode & FWRITE))
|
||||
if (!(spa_mode_global & FWRITE))
|
||||
return;
|
||||
|
||||
mutex_enter(&l2arc_feed_thr_lock);
|
||||
|
||||
@@ -119,7 +119,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
|
||||
* We only want to visit blocks that have been claimed but not yet
|
||||
* replayed (or, in read-only mode, blocks that *would* be claimed).
|
||||
*/
|
||||
if (claim_txg == 0 && (spa_mode & FWRITE))
|
||||
if (claim_txg == 0 && spa_writeable(td->td_spa))
|
||||
return;
|
||||
|
||||
zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
|
||||
|
||||
@@ -56,6 +56,8 @@ dnode_cons(void *arg, void *unused, int kmflag)
|
||||
rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
|
||||
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
refcount_create(&dn->dn_holds);
|
||||
refcount_create(&dn->dn_tx_holds);
|
||||
|
||||
@@ -84,6 +86,7 @@ dnode_dest(void *arg, void *unused)
|
||||
rw_destroy(&dn->dn_struct_rwlock);
|
||||
mutex_destroy(&dn->dn_mtx);
|
||||
mutex_destroy(&dn->dn_dbufs_mtx);
|
||||
cv_destroy(&dn->dn_notxholds);
|
||||
refcount_destroy(&dn->dn_holds);
|
||||
refcount_destroy(&dn->dn_tx_holds);
|
||||
|
||||
|
||||
@@ -1948,6 +1948,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
|
||||
if (ds->ds_phys->ds_next_snap_obj) {
|
||||
stat->dds_is_snapshot = B_TRUE;
|
||||
stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
|
||||
} else {
|
||||
stat->dds_is_snapshot = B_FALSE;
|
||||
stat->dds_num_clones = 0;
|
||||
}
|
||||
|
||||
/* clone origin is really a dsl_dir thing... */
|
||||
@@ -1959,6 +1962,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
|
||||
ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
|
||||
dsl_dataset_name(ods, stat->dds_origin);
|
||||
dsl_dataset_drop_ref(ods, FTAG);
|
||||
} else {
|
||||
stat->dds_origin[0] = '\0';
|
||||
}
|
||||
rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
|
||||
}
|
||||
|
||||
+32
-29
@@ -391,7 +391,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
|
||||
* We only want to visit blocks that have been claimed but not yet
|
||||
* replayed (or, in read-only mode, blocks that *would* be claimed).
|
||||
*/
|
||||
if (claim_txg == 0 && (spa_mode & FWRITE))
|
||||
if (claim_txg == 0 && spa_writeable(dp->dp_spa))
|
||||
return;
|
||||
|
||||
zilog = zil_alloc(dp->dp_meta_objset, zh);
|
||||
@@ -409,9 +409,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
|
||||
int err;
|
||||
arc_buf_t *buf = NULL;
|
||||
|
||||
if (bp->blk_birth == 0)
|
||||
return;
|
||||
|
||||
if (bp->blk_birth <= dp->dp_scrub_min_txg)
|
||||
return;
|
||||
|
||||
@@ -740,6 +737,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
|
||||
void
|
||||
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = dp->dp_spa;
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
boolean_t complete = B_TRUE;
|
||||
@@ -747,8 +745,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
|
||||
return;
|
||||
|
||||
/* If the spa is not fully loaded, don't bother. */
|
||||
if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
|
||||
/*
|
||||
* If the pool is not loaded, or is trying to unload, leave it alone.
|
||||
*/
|
||||
if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
|
||||
return;
|
||||
|
||||
if (dp->dp_scrub_restart) {
|
||||
@@ -757,13 +757,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
|
||||
}
|
||||
|
||||
if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
|
||||
if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
|
||||
/*
|
||||
* We must have resumed after rebooting; reset the vdev
|
||||
* stats to know that we're doing a scrub (although it
|
||||
* will think we're just starting now).
|
||||
*/
|
||||
vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
|
||||
vdev_scrub_stat_update(spa->spa_root_vdev,
|
||||
dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
|
||||
POOL_SCRUB_EVERYTHING, B_FALSE);
|
||||
}
|
||||
@@ -771,7 +771,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dp->dp_scrub_pausing = B_FALSE;
|
||||
dp->dp_scrub_start_time = lbolt64;
|
||||
dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
|
||||
dp->dp_spa->spa_scrub_active = B_TRUE;
|
||||
spa->spa_scrub_active = B_TRUE;
|
||||
|
||||
if (dp->dp_scrub_bookmark.zb_objset == 0) {
|
||||
/* First do the MOS & ORIGIN */
|
||||
@@ -779,8 +779,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
if (dp->dp_scrub_pausing)
|
||||
goto out;
|
||||
|
||||
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
|
||||
VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
|
||||
if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
|
||||
VERIFY(0 == dmu_objset_find_spa(spa,
|
||||
NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
|
||||
} else {
|
||||
scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
|
||||
@@ -830,15 +830,13 @@ out:
|
||||
VERIFY(0 == zap_update(dp->dp_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
|
||||
&dp->dp_spa->spa_scrub_errors, tx));
|
||||
&spa->spa_scrub_errors, tx));
|
||||
|
||||
/* XXX this is scrub-clean specific */
|
||||
mutex_enter(&dp->dp_spa->spa_scrub_lock);
|
||||
while (dp->dp_spa->spa_scrub_inflight > 0) {
|
||||
cv_wait(&dp->dp_spa->spa_scrub_io_cv,
|
||||
&dp->dp_spa->spa_scrub_lock);
|
||||
}
|
||||
mutex_exit(&dp->dp_spa->spa_scrub_lock);
|
||||
mutex_enter(&spa->spa_scrub_lock);
|
||||
while (spa->spa_scrub_inflight > 0)
|
||||
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
|
||||
mutex_exit(&spa->spa_scrub_lock);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -920,13 +918,17 @@ static int
|
||||
dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
|
||||
const blkptr_t *bp, const zbookmark_t *zb)
|
||||
{
|
||||
size_t size = BP_GET_LSIZE(bp);
|
||||
int d;
|
||||
size_t size = BP_GET_PSIZE(bp);
|
||||
spa_t *spa = dp->dp_spa;
|
||||
boolean_t needs_io;
|
||||
int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
|
||||
int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
|
||||
int zio_priority;
|
||||
|
||||
ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
|
||||
|
||||
if (bp->blk_birth >= dp->dp_scrub_max_txg)
|
||||
return (0);
|
||||
|
||||
count_block(dp->dp_blkstats, bp);
|
||||
|
||||
if (dp->dp_scrub_isresilver == 0) {
|
||||
@@ -945,7 +947,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
|
||||
if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
|
||||
zio_flags |= ZIO_FLAG_SPECULATIVE;
|
||||
|
||||
for (d = 0; d < BP_GET_NDVAS(bp); d++) {
|
||||
for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
|
||||
vdev_t *vd = vdev_lookup_top(spa,
|
||||
DVA_GET_VDEV(&bp->blk_dva[d]));
|
||||
|
||||
@@ -963,16 +965,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
|
||||
if (DVA_GET_GANG(&bp->blk_dva[d])) {
|
||||
/*
|
||||
* Gang members may be spread across multiple
|
||||
* vdevs, so the best we can do is look at the
|
||||
* pool-wide DTL.
|
||||
* vdevs, so the best estimate we have is the
|
||||
* scrub range, which has already been checked.
|
||||
* XXX -- it would be better to change our
|
||||
* allocation policy to ensure that this can't
|
||||
* happen.
|
||||
* allocation policy to ensure that all
|
||||
* gang members reside on the same vdev.
|
||||
*/
|
||||
vd = spa->spa_root_vdev;
|
||||
needs_io = B_TRUE;
|
||||
} else {
|
||||
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
|
||||
bp->blk_birth, 1);
|
||||
}
|
||||
needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
|
||||
bp->blk_birth, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -332,7 +332,8 @@ extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
|
||||
extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
|
||||
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
|
||||
extern int spa_destroy(char *pool);
|
||||
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force);
|
||||
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
|
||||
boolean_t hardforce);
|
||||
extern int spa_reset(char *pool);
|
||||
extern void spa_async_request(spa_t *spa, int flag);
|
||||
extern void spa_async_unrequest(spa_t *spa, int flag);
|
||||
@@ -351,7 +352,8 @@ extern void spa_inject_delref(spa_t *spa);
|
||||
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
|
||||
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
|
||||
int replacing);
|
||||
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
|
||||
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
|
||||
int replace_done);
|
||||
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
|
||||
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
|
||||
|
||||
@@ -475,6 +477,8 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
|
||||
extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
|
||||
extern boolean_t spa_has_slogs(spa_t *spa);
|
||||
extern boolean_t spa_is_root(spa_t *spa);
|
||||
extern boolean_t spa_writeable(spa_t *spa);
|
||||
extern int spa_mode(spa_t *spa);
|
||||
|
||||
/* history logging */
|
||||
typedef enum history_log_type {
|
||||
@@ -545,7 +549,7 @@ _NOTE(CONSTCOND) } while (0)
|
||||
#define dprintf_bp(bp, fmt, ...)
|
||||
#endif
|
||||
|
||||
extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
|
||||
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -170,6 +170,7 @@ struct spa {
|
||||
boolean_t spa_import_faulted; /* allow faulted vdevs */
|
||||
boolean_t spa_is_root; /* pool is root */
|
||||
int spa_minref; /* num refs when first opened */
|
||||
int spa_mode; /* FREAD | FWRITE */
|
||||
spa_log_state_t spa_log_state; /* log state */
|
||||
/*
|
||||
* spa_refcnt & spa_config_lock must be the last elements
|
||||
|
||||
@@ -19,15 +19,13 @@
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_SPACE_MAP_H
|
||||
#define _SYS_SPACE_MAP_H
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/avl.h>
|
||||
#include <sys/dmu.h>
|
||||
|
||||
@@ -58,6 +56,12 @@ typedef struct space_seg {
|
||||
uint64_t ss_end; /* ending offset (non-inclusive) */
|
||||
} space_seg_t;
|
||||
|
||||
typedef struct space_ref {
|
||||
avl_node_t sr_node; /* AVL node */
|
||||
uint64_t sr_offset; /* offset (start or end) */
|
||||
int64_t sr_refcnt; /* associated reference count */
|
||||
} space_ref_t;
|
||||
|
||||
typedef struct space_map_obj {
|
||||
uint64_t smo_object; /* on-disk space map object */
|
||||
uint64_t smo_objsize; /* size of the object */
|
||||
@@ -133,13 +137,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
|
||||
extern void space_map_destroy(space_map_t *sm);
|
||||
extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
|
||||
extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
|
||||
extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
|
||||
extern boolean_t space_map_contains(space_map_t *sm,
|
||||
uint64_t start, uint64_t size);
|
||||
extern void space_map_vacate(space_map_t *sm,
|
||||
space_map_func_t *func, space_map_t *mdest);
|
||||
extern void space_map_walk(space_map_t *sm,
|
||||
space_map_func_t *func, space_map_t *mdest);
|
||||
extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
|
||||
extern void space_map_union(space_map_t *smd, space_map_t *sms);
|
||||
|
||||
extern void space_map_load_wait(space_map_t *sm);
|
||||
extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
|
||||
@@ -155,6 +158,15 @@ extern void space_map_sync(space_map_t *sm, uint8_t maptype,
|
||||
extern void space_map_truncate(space_map_obj_t *smo,
|
||||
objset_t *os, dmu_tx_t *tx);
|
||||
|
||||
extern void space_map_ref_create(avl_tree_t *t);
|
||||
extern void space_map_ref_destroy(avl_tree_t *t);
|
||||
extern void space_map_ref_add_seg(avl_tree_t *t,
|
||||
uint64_t start, uint64_t end, int64_t refcnt);
|
||||
extern void space_map_ref_add_map(avl_tree_t *t,
|
||||
space_map_t *sm, int64_t refcnt);
|
||||
extern void space_map_ref_generate_map(avl_tree_t *t,
|
||||
space_map_t *sm, int64_t minref);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -19,21 +19,24 @@
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_UBERBLOCK_IMPL_H
|
||||
#define _SYS_UBERBLOCK_IMPL_H
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/uberblock.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For zdb use and debugging purposes only
|
||||
*/
|
||||
extern uint64_t ub_max_txg;
|
||||
|
||||
/*
|
||||
* The uberblock version is incremented whenever an incompatible on-disk
|
||||
* format change is made to the SPA, DMU, or ZAP.
|
||||
|
||||
@@ -36,6 +36,14 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum vdev_dtl_type {
|
||||
DTL_MISSING, /* 0% replication: no copies of the data */
|
||||
DTL_PARTIAL, /* less than 100% replication: some copies missing */
|
||||
DTL_SCRUB, /* unable to fully repair during scrub/resilver */
|
||||
DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
|
||||
DTL_TYPES
|
||||
} vdev_dtl_type_t;
|
||||
|
||||
extern boolean_t zfs_nocacheflush;
|
||||
|
||||
extern int vdev_open(vdev_t *);
|
||||
@@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
|
||||
extern boolean_t vdev_is_bootable(vdev_t *vd);
|
||||
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
|
||||
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
|
||||
extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
|
||||
extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
|
||||
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
|
||||
uint64_t txg, uint64_t size);
|
||||
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
|
||||
uint64_t txg, uint64_t size);
|
||||
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
|
||||
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
|
||||
int scrub_done);
|
||||
extern boolean_t vdev_dtl_required(vdev_t *vd);
|
||||
extern boolean_t vdev_resilver_needed(vdev_t *vd,
|
||||
uint64_t *minp, uint64_t *maxp);
|
||||
|
||||
|
||||
@@ -123,8 +123,7 @@ struct vdev {
|
||||
vdev_t *vdev_parent; /* parent vdev */
|
||||
vdev_t **vdev_child; /* array of children */
|
||||
uint64_t vdev_children; /* number of children */
|
||||
space_map_t vdev_dtl_map; /* dirty time log in-core state */
|
||||
space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
|
||||
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
|
||||
vdev_stat_t vdev_stat; /* virtual device statistics */
|
||||
|
||||
/*
|
||||
@@ -149,7 +148,7 @@ struct vdev {
|
||||
* Leaf vdev state.
|
||||
*/
|
||||
uint64_t vdev_psize; /* physical device capacity */
|
||||
space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
|
||||
space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */
|
||||
txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
|
||||
uint64_t vdev_wholedisk; /* true if this is a whole disk */
|
||||
uint64_t vdev_offline; /* persistent offline state */
|
||||
|
||||
@@ -26,8 +26,6 @@
|
||||
#ifndef _SYS_FS_ZFS_VFSOPS_H
|
||||
#define _SYS_FS_ZFS_VFSOPS_H
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/isa_defs.h>
|
||||
#include <sys/types32.h>
|
||||
#include <sys/list.h>
|
||||
@@ -49,7 +47,6 @@ struct zfsvfs {
|
||||
uint64_t z_root; /* id of root znode */
|
||||
uint64_t z_unlinkedobj; /* id of unlinked zapobj */
|
||||
uint64_t z_max_blksz; /* maximum block size for files */
|
||||
uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
|
||||
uint64_t z_fuid_obj; /* fuid table object number */
|
||||
uint64_t z_fuid_size; /* fuid table size */
|
||||
avl_tree_t z_fuid_idx; /* fuid tree keyed by index */
|
||||
@@ -74,6 +71,7 @@ struct zfsvfs {
|
||||
boolean_t z_issnap; /* true if this is a snapshot */
|
||||
boolean_t z_vscan; /* virus scan on/off */
|
||||
boolean_t z_use_fuids; /* version allows fuids */
|
||||
boolean_t z_replay; /* set during ZIL replay */
|
||||
kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */
|
||||
uint64_t z_version; /* ZPL version */
|
||||
#define ZFS_OBJ_MTX_SZ 64
|
||||
|
||||
@@ -335,7 +335,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
|
||||
typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
|
||||
uint64_t txg);
|
||||
typedef int zil_replay_func_t();
|
||||
typedef void zil_replay_cleaner_t();
|
||||
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
|
||||
|
||||
extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
|
||||
@@ -350,9 +349,8 @@ extern void zil_free(zilog_t *zilog);
|
||||
extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
|
||||
extern void zil_close(zilog_t *zilog);
|
||||
|
||||
extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
|
||||
zil_replay_func_t *replay_func[TX_MAX_TYPE],
|
||||
zil_replay_cleaner_t *replay_cleaner);
|
||||
extern void zil_replay(objset_t *os, void *arg,
|
||||
zil_replay_func_t *replay_func[TX_MAX_TYPE]);
|
||||
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
|
||||
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
|
||||
|
||||
|
||||
@@ -19,15 +19,13 @@
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_ZIL_IMPL_H
|
||||
#define _SYS_ZIL_IMPL_H
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/zil.h>
|
||||
#include <sys/dmu_objset.h>
|
||||
|
||||
@@ -74,13 +72,14 @@ struct zilog {
|
||||
uint64_t zl_commit_seq; /* committed upto this number */
|
||||
uint64_t zl_lr_seq; /* log record sequence number */
|
||||
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
|
||||
uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
|
||||
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
|
||||
uint64_t zl_replaying_seq; /* current replay seq number */
|
||||
uint32_t zl_suspend; /* log suspend count */
|
||||
kcondvar_t zl_cv_writer; /* log writer thread completion */
|
||||
kcondvar_t zl_cv_suspend; /* log suspend completion */
|
||||
uint8_t zl_suspending; /* log is currently suspending */
|
||||
uint8_t zl_keep_first; /* keep first log block in destroy */
|
||||
uint8_t zl_stop_replay; /* don't replay any further */
|
||||
uint8_t zl_replay; /* replaying records while set */
|
||||
uint8_t zl_stop_sync; /* for debugging */
|
||||
uint8_t zl_writer; /* boolean: write setup in progress */
|
||||
uint8_t zl_log_error; /* boolean: log write error */
|
||||
|
||||
@@ -132,12 +132,14 @@ enum zio_compress {
|
||||
#define ZIO_FLAG_IO_RETRY 0x00400
|
||||
#define ZIO_FLAG_IO_REWRITE 0x00800
|
||||
|
||||
#define ZIO_FLAG_PROBE 0x01000
|
||||
#define ZIO_FLAG_SELF_HEAL 0x01000
|
||||
#define ZIO_FLAG_RESILVER 0x02000
|
||||
#define ZIO_FLAG_SCRUB 0x04000
|
||||
#define ZIO_FLAG_SCRUB_THREAD 0x08000
|
||||
|
||||
#define ZIO_FLAG_GANG_CHILD 0x10000
|
||||
#define ZIO_FLAG_PROBE 0x10000
|
||||
#define ZIO_FLAG_GANG_CHILD 0x20000
|
||||
#define ZIO_FLAG_RAW 0x40000
|
||||
|
||||
#define ZIO_FLAG_GANG_INHERIT \
|
||||
(ZIO_FLAG_CANFAIL | \
|
||||
@@ -146,6 +148,7 @@ enum zio_compress {
|
||||
ZIO_FLAG_DONT_RETRY | \
|
||||
ZIO_FLAG_DONT_CACHE | \
|
||||
ZIO_FLAG_DONT_AGGREGATE | \
|
||||
ZIO_FLAG_SELF_HEAL | \
|
||||
ZIO_FLAG_RESILVER | \
|
||||
ZIO_FLAG_SCRUB | \
|
||||
ZIO_FLAG_SCRUB_THREAD)
|
||||
@@ -156,6 +159,14 @@ enum zio_compress {
|
||||
ZIO_FLAG_IO_RETRY | \
|
||||
ZIO_FLAG_PROBE)
|
||||
|
||||
#define ZIO_FLAG_AGG_INHERIT \
|
||||
(ZIO_FLAG_DONT_AGGREGATE | \
|
||||
ZIO_FLAG_IO_REPAIR | \
|
||||
ZIO_FLAG_SELF_HEAL | \
|
||||
ZIO_FLAG_RESILVER | \
|
||||
ZIO_FLAG_SCRUB | \
|
||||
ZIO_FLAG_SCRUB_THREAD)
|
||||
|
||||
#define ZIO_PIPELINE_CONTINUE 0x100
|
||||
#define ZIO_PIPELINE_STOP 0x101
|
||||
|
||||
|
||||
+19
-2
@@ -720,6 +720,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
|
||||
vdev_t *vd;
|
||||
int dshift = 3;
|
||||
int all_zero;
|
||||
int zio_lock = B_FALSE;
|
||||
boolean_t allocatable;
|
||||
uint64_t offset = -1ULL;
|
||||
uint64_t asize;
|
||||
uint64_t distance;
|
||||
@@ -778,11 +780,20 @@ top:
|
||||
all_zero = B_TRUE;
|
||||
do {
|
||||
vd = mg->mg_vd;
|
||||
|
||||
/*
|
||||
* Don't allocate from faulted devices.
|
||||
*/
|
||||
if (!vdev_allocatable(vd))
|
||||
if (zio_lock) {
|
||||
spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
|
||||
allocatable = vdev_allocatable(vd);
|
||||
spa_config_exit(spa, SCL_ZIO, FTAG);
|
||||
} else {
|
||||
allocatable = vdev_allocatable(vd);
|
||||
}
|
||||
if (!allocatable)
|
||||
goto next;
|
||||
|
||||
/*
|
||||
* Avoid writing single-copy data to a failing vdev
|
||||
*/
|
||||
@@ -858,6 +869,12 @@ next:
|
||||
goto top;
|
||||
}
|
||||
|
||||
if (!zio_lock) {
|
||||
dshift = 3;
|
||||
zio_lock = B_TRUE;
|
||||
goto top;
|
||||
}
|
||||
|
||||
bzero(&dva[d], sizeof (dva_t));
|
||||
|
||||
return (ENOSPC);
|
||||
@@ -946,7 +963,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
|
||||
|
||||
space_map_claim(&msp->ms_map, offset, size);
|
||||
|
||||
if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */
|
||||
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
|
||||
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
|
||||
vdev_dirty(vd, VDD_METASLAB, msp, txg);
|
||||
space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
|
||||
|
||||
+148
-116
@@ -486,11 +486,12 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
|
||||
* Activate an uninitialized pool.
|
||||
*/
|
||||
static void
|
||||
spa_activate(spa_t *spa)
|
||||
spa_activate(spa_t *spa, int mode)
|
||||
{
|
||||
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
|
||||
|
||||
spa->spa_state = POOL_STATE_ACTIVE;
|
||||
spa->spa_mode = mode;
|
||||
|
||||
spa->spa_normal_class = metaslab_class_create();
|
||||
spa->spa_log_class = metaslab_class_create();
|
||||
@@ -639,11 +640,6 @@ spa_unload(spa_t *spa)
|
||||
cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
|
||||
mutex_exit(&spa->spa_async_root_lock);
|
||||
|
||||
/*
|
||||
* Drop and purge level 2 cache
|
||||
*/
|
||||
spa_l2cache_drop(spa);
|
||||
|
||||
/*
|
||||
* Close the dsl pool.
|
||||
*/
|
||||
@@ -652,6 +648,13 @@ spa_unload(spa_t *spa)
|
||||
spa->spa_dsl_pool = NULL;
|
||||
}
|
||||
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
/*
|
||||
* Drop and purge level 2 cache
|
||||
*/
|
||||
spa_l2cache_drop(spa);
|
||||
|
||||
/*
|
||||
* Close all vdevs.
|
||||
*/
|
||||
@@ -686,6 +689,8 @@ spa_unload(spa_t *spa)
|
||||
spa->spa_l2cache.sav_count = 0;
|
||||
|
||||
spa->spa_async_suspended = 0;
|
||||
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -897,12 +902,9 @@ spa_load_l2cache(spa_t *spa)
|
||||
|
||||
vd = oldvdevs[i];
|
||||
if (vd != NULL) {
|
||||
if ((spa_mode & FWRITE) &&
|
||||
spa_l2cache_exists(vd->vdev_guid, &pool) &&
|
||||
pool != 0ULL &&
|
||||
l2arc_vdev_present(vd)) {
|
||||
if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
|
||||
pool != 0ULL && l2arc_vdev_present(vd))
|
||||
l2arc_remove_vdev(vd);
|
||||
}
|
||||
(void) vdev_close(vd);
|
||||
spa_l2cache_remove(vd);
|
||||
}
|
||||
@@ -1018,8 +1020,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
uint64_t pool_guid;
|
||||
uint64_t version;
|
||||
uint64_t autoreplace = 0;
|
||||
int orig_mode = spa->spa_mode;
|
||||
char *ereport = FM_EREPORT_ZFS_POOL;
|
||||
|
||||
/*
|
||||
* If this is an untrusted config, access the pool in read-only mode.
|
||||
* This prevents things like resilvering recently removed devices.
|
||||
*/
|
||||
if (!mosconfig)
|
||||
spa->spa_mode = FREAD;
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
spa->spa_load_state = state;
|
||||
@@ -1077,12 +1087,13 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
* Validate the labels for all leaf vdevs. We need to grab the config
|
||||
* lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
|
||||
*/
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
error = vdev_validate(rvd);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
|
||||
if (error != 0)
|
||||
goto out;
|
||||
if (mosconfig) {
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
error = vdev_validate(rvd);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
if (error != 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
|
||||
error = ENXIO;
|
||||
@@ -1184,7 +1195,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
spa_config_set(spa, newconfig);
|
||||
spa_unload(spa);
|
||||
spa_deactivate(spa);
|
||||
spa_activate(spa);
|
||||
spa_activate(spa, orig_mode);
|
||||
|
||||
return (spa_load(spa, newconfig, state, B_TRUE));
|
||||
}
|
||||
@@ -1376,10 +1387,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
|
||||
if (spa_writeable(spa)) {
|
||||
dmu_tx_t *tx;
|
||||
int need_update = B_FALSE;
|
||||
int c;
|
||||
|
||||
ASSERT(state != SPA_LOAD_TRYIMPORT);
|
||||
|
||||
/*
|
||||
* Claim log blocks that haven't been committed yet.
|
||||
@@ -1407,7 +1419,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
state == SPA_LOAD_IMPORT)
|
||||
need_update = B_TRUE;
|
||||
|
||||
for (c = 0; c < rvd->vdev_children; c++)
|
||||
for (int c = 0; c < rvd->vdev_children; c++)
|
||||
if (rvd->vdev_child[c]->vdev_ms_array == 0)
|
||||
need_update = B_TRUE;
|
||||
|
||||
@@ -1417,6 +1429,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
|
||||
*/
|
||||
if (need_update)
|
||||
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
|
||||
|
||||
/*
|
||||
* Check all DTLs to see if anything needs resilvering.
|
||||
*/
|
||||
if (vdev_resilver_needed(rvd, NULL, NULL))
|
||||
spa_async_request(spa, SPA_ASYNC_RESILVER);
|
||||
}
|
||||
|
||||
error = 0;
|
||||
@@ -1469,7 +1487,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
|
||||
}
|
||||
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
|
||||
|
||||
spa_activate(spa);
|
||||
spa_activate(spa, spa_mode_global);
|
||||
|
||||
error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
|
||||
|
||||
@@ -1873,11 +1891,9 @@ spa_l2cache_drop(spa_t *spa)
|
||||
vd = sav->sav_vdevs[i];
|
||||
ASSERT(vd != NULL);
|
||||
|
||||
if ((spa_mode & FWRITE) &&
|
||||
spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
|
||||
l2arc_vdev_present(vd)) {
|
||||
if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
|
||||
pool != 0ULL && l2arc_vdev_present(vd))
|
||||
l2arc_remove_vdev(vd);
|
||||
}
|
||||
if (vd->vdev_isl2cache)
|
||||
spa_l2cache_remove(vd);
|
||||
vdev_clear_stats(vd);
|
||||
@@ -1918,7 +1934,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
|
||||
(void) nvlist_lookup_string(props,
|
||||
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
|
||||
spa = spa_add(pool, altroot);
|
||||
spa_activate(spa);
|
||||
spa_activate(spa, spa_mode_global);
|
||||
|
||||
spa->spa_uberblock.ub_txg = txg - 1;
|
||||
|
||||
@@ -2121,7 +2137,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
|
||||
(void) nvlist_lookup_string(props,
|
||||
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
|
||||
spa = spa_add(pool, altroot);
|
||||
spa_activate(spa);
|
||||
spa_activate(spa, spa_mode_global);
|
||||
|
||||
if (allowfaulted)
|
||||
spa->spa_import_faulted = B_TRUE;
|
||||
@@ -2160,7 +2176,8 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
|
||||
VDEV_ALLOC_L2CACHE);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
|
||||
if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
|
||||
if (error != 0 || (props && spa_writeable(spa) &&
|
||||
(error = spa_prop_set(spa, props)))) {
|
||||
if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
|
||||
/*
|
||||
* If we failed to load the pool, but 'allowfaulted' is
|
||||
@@ -2219,7 +2236,7 @@ spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
|
||||
spa->spa_l2cache.sav_sync = B_TRUE;
|
||||
}
|
||||
|
||||
if (spa_mode & FWRITE) {
|
||||
if (spa_writeable(spa)) {
|
||||
/*
|
||||
* Update the config cache to include the newly-imported pool.
|
||||
*/
|
||||
@@ -2367,11 +2384,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
|
||||
char *cdevid, *cpath;
|
||||
uint64_t tmptxg;
|
||||
|
||||
cpath = NULL;
|
||||
cdevid = NULL;
|
||||
if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
|
||||
&cpath) != 0)
|
||||
return (EINVAL);
|
||||
if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID,
|
||||
&cdevid) != 0)
|
||||
&cpath) != 0 && nvlist_lookup_string(child[c],
|
||||
ZPOOL_CONFIG_DEVID, &cdevid) != 0)
|
||||
return (EINVAL);
|
||||
if ((spa_check_rootconf(cpath, cdevid, NULL,
|
||||
&tmptxg) == 0) && (tmptxg > txg)) {
|
||||
@@ -2489,7 +2506,7 @@ spa_tryimport(nvlist_t *tryconfig)
|
||||
*/
|
||||
mutex_enter(&spa_namespace_lock);
|
||||
spa = spa_add(TRYIMPORT_NAME, NULL);
|
||||
spa_activate(spa);
|
||||
spa_activate(spa, FREAD);
|
||||
|
||||
/*
|
||||
* Pass off the heavy lifting to spa_load().
|
||||
@@ -2563,18 +2580,19 @@ spa_tryimport(nvlist_t *tryconfig)
|
||||
* The act of destroying or exporting a pool is very simple. We make sure there
|
||||
* is no more pending I/O and any references to the pool are gone. Then, we
|
||||
* update the pool state and sync all the labels to disk, removing the
|
||||
* configuration from the cache afterwards.
|
||||
* configuration from the cache afterwards. If the 'hardforce' flag is set, then
|
||||
* we don't sync the labels or remove the configuration cache.
|
||||
*/
|
||||
static int
|
||||
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
|
||||
boolean_t force)
|
||||
boolean_t force, boolean_t hardforce)
|
||||
{
|
||||
spa_t *spa;
|
||||
|
||||
if (oldconfig)
|
||||
*oldconfig = NULL;
|
||||
|
||||
if (!(spa_mode & FWRITE))
|
||||
if (!(spa_mode_global & FWRITE))
|
||||
return (EROFS);
|
||||
|
||||
mutex_enter(&spa_namespace_lock);
|
||||
@@ -2635,7 +2653,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
|
||||
* so mark them all dirty. spa_unload() will do the
|
||||
* final sync that pushes these changes out.
|
||||
*/
|
||||
if (new_state != POOL_STATE_UNINITIALIZED) {
|
||||
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
spa->spa_state = new_state;
|
||||
spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
|
||||
@@ -2655,7 +2673,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
|
||||
VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
|
||||
|
||||
if (new_state != POOL_STATE_UNINITIALIZED) {
|
||||
spa_config_sync(spa, B_TRUE, B_TRUE);
|
||||
if (!hardforce)
|
||||
spa_config_sync(spa, B_TRUE, B_TRUE);
|
||||
spa_remove(spa);
|
||||
}
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
@@ -2669,16 +2688,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
|
||||
int
|
||||
spa_destroy(char *pool)
|
||||
{
|
||||
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE));
|
||||
return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
|
||||
B_FALSE, B_FALSE));
|
||||
}
|
||||
|
||||
/*
|
||||
* Export a storage pool.
|
||||
*/
|
||||
int
|
||||
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force)
|
||||
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
|
||||
boolean_t hardforce)
|
||||
{
|
||||
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force));
|
||||
return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
|
||||
force, hardforce));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2689,7 +2711,7 @@ int
|
||||
spa_reset(char *pool)
|
||||
{
|
||||
return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
|
||||
B_FALSE));
|
||||
B_FALSE, B_FALSE));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2705,7 +2727,7 @@ int
|
||||
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
{
|
||||
uint64_t txg;
|
||||
int c, error;
|
||||
int error;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd, *tvd;
|
||||
nvlist_t **spares, **l2cache;
|
||||
@@ -2744,7 +2766,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
|
||||
/*
|
||||
* Transfer each new top-level vdev from vd to rvd.
|
||||
*/
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
tvd = vd->vdev_child[c];
|
||||
vdev_remove_child(vd, tvd);
|
||||
tvd->vdev_id = rvd->vdev_children;
|
||||
@@ -2952,10 +2974,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
|
||||
*/
|
||||
open_txg = txg + TXG_CONCURRENT_STATES - 1;
|
||||
|
||||
mutex_enter(&newvd->vdev_dtl_lock);
|
||||
space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
|
||||
open_txg - TXG_INITIAL + 1);
|
||||
mutex_exit(&newvd->vdev_dtl_lock);
|
||||
vdev_dtl_dirty(newvd, DTL_MISSING,
|
||||
TXG_INITIAL, open_txg - TXG_INITIAL + 1);
|
||||
|
||||
if (newvd->vdev_isspare)
|
||||
spa_spare_activate(newvd);
|
||||
@@ -2999,10 +3019,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
|
||||
* is a replacing vdev.
|
||||
*/
|
||||
int
|
||||
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
|
||||
{
|
||||
uint64_t txg;
|
||||
int c, t, error;
|
||||
int error;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd, *pvd, *cvd, *tvd;
|
||||
boolean_t unspare = B_FALSE;
|
||||
@@ -3021,6 +3041,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
|
||||
pvd = vd->vdev_parent;
|
||||
|
||||
/*
|
||||
* If the parent/child relationship is not as expected, don't do it.
|
||||
* Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
|
||||
* vdev that's replacing B with C. The user's intent in replacing
|
||||
* is to go from M(A,B) to M(A,C). If the user decides to cancel
|
||||
* the replace by detaching C, the expected behavior is to end up
|
||||
* M(A,B). But suppose that right after deciding to detach C,
|
||||
* the replacement of B completes. We would have M(A,C), and then
|
||||
* ask to detach C, which would leave us with just A -- not what
|
||||
* the user wanted. To prevent this, we make sure that the
|
||||
* parent/child relationship hasn't changed -- in this example,
|
||||
* that C's parent is still the replacing vdev R.
|
||||
*/
|
||||
if (pvd->vdev_guid != pguid && pguid != 0)
|
||||
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
|
||||
|
||||
/*
|
||||
* If replace_done is specified, only remove this device if it's
|
||||
* the first child of a replacing vdev. For the 'spare' vdev, either
|
||||
@@ -3047,36 +3083,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
|
||||
|
||||
/*
|
||||
* If there's only one replica, you can't detach it.
|
||||
* If this device has the only valid copy of some data,
|
||||
* we cannot safely detach it.
|
||||
*/
|
||||
if (pvd->vdev_children <= 1)
|
||||
if (vdev_dtl_required(vd))
|
||||
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
|
||||
|
||||
/*
|
||||
* If all siblings have non-empty DTLs, this device may have the only
|
||||
* valid copy of the data, which means we cannot safely detach it.
|
||||
*
|
||||
* XXX -- as in the vdev_offline() case, we really want a more
|
||||
* precise DTL check.
|
||||
*/
|
||||
for (c = 0; c < pvd->vdev_children; c++) {
|
||||
uint64_t dirty;
|
||||
|
||||
cvd = pvd->vdev_child[c];
|
||||
if (cvd == vd)
|
||||
continue;
|
||||
if (vdev_is_dead(cvd))
|
||||
continue;
|
||||
mutex_enter(&cvd->vdev_dtl_lock);
|
||||
dirty = cvd->vdev_dtl_map.sm_space |
|
||||
cvd->vdev_dtl_scrub.sm_space;
|
||||
mutex_exit(&cvd->vdev_dtl_lock);
|
||||
if (!dirty)
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == pvd->vdev_children)
|
||||
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
|
||||
ASSERT(pvd->vdev_children >= 2);
|
||||
|
||||
/*
|
||||
* If we are detaching the second disk from a replacing vdev, then
|
||||
@@ -3102,7 +3115,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
* active spare list for the pool.
|
||||
*/
|
||||
if (pvd->vdev_ops == &vdev_spare_ops &&
|
||||
vd->vdev_id == 0)
|
||||
vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
|
||||
unspare = B_TRUE;
|
||||
|
||||
/*
|
||||
@@ -3128,14 +3141,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
|
||||
/*
|
||||
* If we need to remove the remaining child from the list of hot spares,
|
||||
* do it now, marking the vdev as no longer a spare in the process. We
|
||||
* must do this before vdev_remove_parent(), because that can change the
|
||||
* GUID if it creates a new toplevel GUID.
|
||||
* do it now, marking the vdev as no longer a spare in the process.
|
||||
* We must do this before vdev_remove_parent(), because that can
|
||||
* change the GUID if it creates a new toplevel GUID. For a similar
|
||||
* reason, we must remove the spare now, in the same txg as the detach;
|
||||
* otherwise someone could attach a new sibling, change the GUID, and
|
||||
* the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
|
||||
*/
|
||||
if (unspare) {
|
||||
ASSERT(cvd->vdev_isspare);
|
||||
spa_spare_remove(cvd);
|
||||
unspare_guid = cvd->vdev_guid;
|
||||
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3173,7 +3190,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
* But first make sure we're not on any *other* txg's DTL list, to
|
||||
* prevent vd from being accessed after it's freed.
|
||||
*/
|
||||
for (t = 0; t < TXG_SIZE; t++)
|
||||
for (int t = 0; t < TXG_SIZE; t++)
|
||||
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
|
||||
vd->vdev_detached = B_TRUE;
|
||||
vdev_dirty(tvd, VDD_DTL, vd, txg);
|
||||
@@ -3188,11 +3205,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
|
||||
* list of every other pool.
|
||||
*/
|
||||
if (unspare) {
|
||||
spa_t *myspa = spa;
|
||||
spa = NULL;
|
||||
mutex_enter(&spa_namespace_lock);
|
||||
while ((spa = spa_next(spa)) != NULL) {
|
||||
if (spa->spa_state != POOL_STATE_ACTIVE)
|
||||
continue;
|
||||
if (spa == myspa)
|
||||
continue;
|
||||
spa_open_ref(spa, FTAG);
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
|
||||
@@ -3256,10 +3276,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
|
||||
vdev_t *vd;
|
||||
nvlist_t **spares, **l2cache, *nv;
|
||||
uint_t nspares, nl2cache;
|
||||
uint64_t txg;
|
||||
uint64_t txg = 0;
|
||||
int error = 0;
|
||||
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
|
||||
|
||||
txg = spa_vdev_enter(spa);
|
||||
if (!locked)
|
||||
txg = spa_vdev_enter(spa);
|
||||
|
||||
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
|
||||
|
||||
@@ -3302,7 +3324,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
|
||||
error = ENOENT;
|
||||
}
|
||||
|
||||
return (spa_vdev_exit(spa, NULL, txg, error));
|
||||
if (!locked)
|
||||
return (spa_vdev_exit(spa, NULL, txg, error));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3328,13 +3353,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
|
||||
oldvd = vd->vdev_child[0];
|
||||
newvd = vd->vdev_child[1];
|
||||
|
||||
mutex_enter(&newvd->vdev_dtl_lock);
|
||||
if (newvd->vdev_dtl_map.sm_space == 0 &&
|
||||
newvd->vdev_dtl_scrub.sm_space == 0) {
|
||||
mutex_exit(&newvd->vdev_dtl_lock);
|
||||
if (vdev_dtl_empty(newvd, DTL_MISSING) &&
|
||||
!vdev_dtl_required(oldvd))
|
||||
return (oldvd);
|
||||
}
|
||||
mutex_exit(&newvd->vdev_dtl_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3344,15 +3365,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
|
||||
newvd = vd->vdev_child[0];
|
||||
oldvd = vd->vdev_child[1];
|
||||
|
||||
mutex_enter(&newvd->vdev_dtl_lock);
|
||||
if (newvd->vdev_unspare &&
|
||||
newvd->vdev_dtl_map.sm_space == 0 &&
|
||||
newvd->vdev_dtl_scrub.sm_space == 0) {
|
||||
vdev_dtl_empty(newvd, DTL_MISSING) &&
|
||||
!vdev_dtl_required(oldvd)) {
|
||||
newvd->vdev_unspare = 0;
|
||||
mutex_exit(&newvd->vdev_dtl_lock);
|
||||
return (oldvd);
|
||||
}
|
||||
mutex_exit(&newvd->vdev_dtl_lock);
|
||||
}
|
||||
|
||||
return (NULL);
|
||||
@@ -3361,36 +3379,37 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
|
||||
static void
|
||||
spa_vdev_resilver_done(spa_t *spa)
|
||||
{
|
||||
vdev_t *vd;
|
||||
vdev_t *pvd;
|
||||
uint64_t guid;
|
||||
uint64_t pguid = 0;
|
||||
vdev_t *vd, *pvd, *ppvd;
|
||||
uint64_t guid, sguid, pguid, ppguid;
|
||||
|
||||
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
|
||||
while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
|
||||
pvd = vd->vdev_parent;
|
||||
ppvd = pvd->vdev_parent;
|
||||
guid = vd->vdev_guid;
|
||||
pguid = pvd->vdev_guid;
|
||||
ppguid = ppvd->vdev_guid;
|
||||
sguid = 0;
|
||||
/*
|
||||
* If we have just finished replacing a hot spared device, then
|
||||
* we need to detach the parent's first child (the original hot
|
||||
* spare) as well.
|
||||
*/
|
||||
pvd = vd->vdev_parent;
|
||||
if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
|
||||
pvd->vdev_id == 0) {
|
||||
if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
|
||||
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
|
||||
ASSERT(pvd->vdev_parent->vdev_children == 2);
|
||||
pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
|
||||
ASSERT(ppvd->vdev_children == 2);
|
||||
sguid = ppvd->vdev_child[1]->vdev_guid;
|
||||
}
|
||||
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
||||
if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
|
||||
return;
|
||||
if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
|
||||
if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
|
||||
return;
|
||||
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
||||
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
||||
}
|
||||
|
||||
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
||||
spa_config_exit(spa, SCL_ALL, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3925,9 +3944,22 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
* into config changes that go out with this transaction group.
|
||||
*/
|
||||
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
|
||||
while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
|
||||
vdev_state_clean(vd);
|
||||
vdev_config_dirty(vd);
|
||||
while (list_head(&spa->spa_state_dirty_list) != NULL) {
|
||||
/*
|
||||
* We need the write lock here because, for aux vdevs,
|
||||
* calling vdev_config_dirty() modifies sav_config.
|
||||
* This is ugly and will become unnecessary when we
|
||||
* eliminate the aux vdev wart by integrating all vdevs
|
||||
* into the root vdev tree.
|
||||
*/
|
||||
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
|
||||
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
|
||||
while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
|
||||
vdev_state_clean(vd);
|
||||
vdev_config_dirty(vd);
|
||||
}
|
||||
spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
|
||||
spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
|
||||
}
|
||||
spa_config_exit(spa, SCL_STATE, FTAG);
|
||||
|
||||
|
||||
@@ -208,6 +208,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
|
||||
|
||||
ASSERT(MUTEX_HELD(&spa_namespace_lock));
|
||||
|
||||
if (rootdir == NULL)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Iterate over all cachefiles for the pool, past or present. When the
|
||||
* cachefile is changed, the new one is pushed onto this list, allowing
|
||||
|
||||
+26
-3
@@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock;
|
||||
static avl_tree_t spa_l2cache_avl;
|
||||
|
||||
kmem_cache_t *spa_buffer_pool;
|
||||
int spa_mode;
|
||||
int spa_mode_global;
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
/* Everything except dprintf is on by default in debug builds */
|
||||
@@ -880,8 +880,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
|
||||
txg_wait_synced(spa->spa_dsl_pool, txg);
|
||||
|
||||
if (vd != NULL) {
|
||||
ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
|
||||
ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
|
||||
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
|
||||
vdev_free(vd);
|
||||
spa_config_exit(spa, SCL_ALL, spa);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -912,6 +914,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
|
||||
|
||||
spa_config_exit(spa, SCL_STATE_ALL, spa);
|
||||
|
||||
/*
|
||||
* If anything changed, wait for it to sync. This ensures that,
|
||||
* from the system administrator's perspective, zpool(1M) commands
|
||||
* are synchronous. This is important for things like zpool offline:
|
||||
* when the command completes, you expect no further I/O from ZFS.
|
||||
*/
|
||||
if (vd != NULL)
|
||||
txg_wait_synced(spa->spa_dsl_pool, 0);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
@@ -1351,7 +1362,7 @@ spa_init(int mode)
|
||||
avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
|
||||
offsetof(spa_aux_t, aux_avl));
|
||||
|
||||
spa_mode = mode;
|
||||
spa_mode_global = mode;
|
||||
|
||||
refcount_init();
|
||||
unique_init();
|
||||
@@ -1408,3 +1419,15 @@ spa_is_root(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_is_root);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
spa_writeable(spa_t *spa)
|
||||
{
|
||||
return (!!(spa->spa_mode & FWRITE));
|
||||
}
|
||||
|
||||
int
|
||||
spa_mode(spa_t *spa)
|
||||
{
|
||||
return (spa->spa_mode);
|
||||
}
|
||||
|
||||
+134
-54
@@ -23,8 +23,6 @@
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/spa.h>
|
||||
#include <sys/dmu.h>
|
||||
@@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
|
||||
{
|
||||
bzero(sm, sizeof (*sm));
|
||||
|
||||
cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
avl_create(&sm->sm_root, space_map_seg_compare,
|
||||
sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
|
||||
|
||||
@@ -75,6 +75,7 @@ space_map_destroy(space_map_t *sm)
|
||||
ASSERT(!sm->sm_loaded && !sm->sm_loading);
|
||||
VERIFY3U(sm->sm_space, ==, 0);
|
||||
avl_destroy(&sm->sm_root);
|
||||
cv_destroy(&sm->sm_load_cv);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -180,7 +181,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
|
||||
sm->sm_space -= size;
|
||||
}
|
||||
|
||||
int
|
||||
boolean_t
|
||||
space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
|
||||
{
|
||||
avl_index_t where;
|
||||
@@ -220,59 +221,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
|
||||
{
|
||||
space_seg_t *ss;
|
||||
|
||||
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
|
||||
func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
|
||||
}
|
||||
|
||||
void
|
||||
space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
|
||||
{
|
||||
avl_tree_t *t = &sm->sm_root;
|
||||
avl_index_t where;
|
||||
space_seg_t *ss, search;
|
||||
uint64_t end = start + size;
|
||||
uint64_t rm_start, rm_end;
|
||||
|
||||
ASSERT(MUTEX_HELD(sm->sm_lock));
|
||||
|
||||
search.ss_start = start;
|
||||
search.ss_end = start;
|
||||
|
||||
for (;;) {
|
||||
ss = avl_find(t, &search, &where);
|
||||
|
||||
if (ss == NULL)
|
||||
ss = avl_nearest(t, where, AVL_AFTER);
|
||||
|
||||
if (ss == NULL || ss->ss_start >= end)
|
||||
break;
|
||||
|
||||
rm_start = MAX(ss->ss_start, start);
|
||||
rm_end = MIN(ss->ss_end, end);
|
||||
|
||||
space_map_remove(sm, rm_start, rm_end - rm_start);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Replace smd with the union of smd and sms.
|
||||
*/
|
||||
void
|
||||
space_map_union(space_map_t *smd, space_map_t *sms)
|
||||
{
|
||||
avl_tree_t *t = &sms->sm_root;
|
||||
space_seg_t *ss;
|
||||
|
||||
ASSERT(MUTEX_HELD(smd->sm_lock));
|
||||
|
||||
/*
|
||||
* For each source segment, remove any intersections with the
|
||||
* destination, then add the source segment to the destination.
|
||||
*/
|
||||
for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
|
||||
space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
|
||||
space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
|
||||
}
|
||||
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
|
||||
func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -504,3 +456,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
|
||||
smo->smo_objsize = 0;
|
||||
smo->smo_alloc = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Space map reference trees.
|
||||
*
|
||||
* A space map is a collection of integers. Every integer is either
|
||||
* in the map, or it's not. A space map reference tree generalizes
|
||||
* the idea: it allows its members to have arbitrary reference counts,
|
||||
* as opposed to the implicit reference count of 0 or 1 in a space map.
|
||||
* This representation comes in handy when computing the union or
|
||||
* intersection of multiple space maps. For example, the union of
|
||||
* N space maps is the subset of the reference tree with refcnt >= 1.
|
||||
* The intersection of N space maps is the subset with refcnt >= N.
|
||||
*
|
||||
* [It's very much like a Fourier transform. Unions and intersections
|
||||
* are hard to perform in the 'space map domain', so we convert the maps
|
||||
* into the 'reference count domain', where it's trivial, then invert.]
|
||||
*
|
||||
* vdev_dtl_reassess() uses computations of this form to determine
|
||||
* DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
|
||||
* has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
|
||||
* has an outage wherever refcnt >= vdev_children.
|
||||
*/
|
||||
static int
|
||||
space_map_ref_compare(const void *x1, const void *x2)
|
||||
{
|
||||
const space_ref_t *sr1 = x1;
|
||||
const space_ref_t *sr2 = x2;
|
||||
|
||||
if (sr1->sr_offset < sr2->sr_offset)
|
||||
return (-1);
|
||||
if (sr1->sr_offset > sr2->sr_offset)
|
||||
return (1);
|
||||
|
||||
if (sr1 < sr2)
|
||||
return (-1);
|
||||
if (sr1 > sr2)
|
||||
return (1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
space_map_ref_create(avl_tree_t *t)
|
||||
{
|
||||
avl_create(t, space_map_ref_compare,
|
||||
sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
|
||||
}
|
||||
|
||||
void
|
||||
space_map_ref_destroy(avl_tree_t *t)
|
||||
{
|
||||
space_ref_t *sr;
|
||||
void *cookie = NULL;
|
||||
|
||||
while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
|
||||
kmem_free(sr, sizeof (*sr));
|
||||
|
||||
avl_destroy(t);
|
||||
}
|
||||
|
||||
static void
|
||||
space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
|
||||
{
|
||||
space_ref_t *sr;
|
||||
|
||||
sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
|
||||
sr->sr_offset = offset;
|
||||
sr->sr_refcnt = refcnt;
|
||||
|
||||
avl_add(t, sr);
|
||||
}
|
||||
|
||||
void
|
||||
space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
|
||||
int64_t refcnt)
|
||||
{
|
||||
space_map_ref_add_node(t, start, refcnt);
|
||||
space_map_ref_add_node(t, end, -refcnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert (or add) a space map into a reference tree.
|
||||
*/
|
||||
void
|
||||
space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
|
||||
{
|
||||
space_seg_t *ss;
|
||||
|
||||
ASSERT(MUTEX_HELD(sm->sm_lock));
|
||||
|
||||
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
|
||||
space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a reference tree into a space map. The space map will contain
|
||||
* all members of the reference tree for which refcnt >= minref.
|
||||
*/
|
||||
void
|
||||
space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
|
||||
{
|
||||
uint64_t start = -1ULL;
|
||||
int64_t refcnt = 0;
|
||||
space_ref_t *sr;
|
||||
|
||||
ASSERT(MUTEX_HELD(sm->sm_lock));
|
||||
|
||||
space_map_vacate(sm, NULL, NULL);
|
||||
|
||||
for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
|
||||
refcnt += sr->sr_refcnt;
|
||||
if (refcnt >= minref) {
|
||||
if (start == -1ULL) {
|
||||
start = sr->sr_offset;
|
||||
}
|
||||
} else {
|
||||
if (start != -1ULL) {
|
||||
uint64_t end = sr->sr_offset;
|
||||
ASSERT(start <= end);
|
||||
if (end > start)
|
||||
space_map_add(sm, start, end - start);
|
||||
start = -1ULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT(refcnt == 0);
|
||||
ASSERT(start == -1ULL);
|
||||
}
|
||||
|
||||
@@ -63,6 +63,12 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
|
||||
rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
|
||||
mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
|
||||
cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
tx->tx_open_txg = txg;
|
||||
}
|
||||
|
||||
@@ -80,6 +86,12 @@ txg_fini(dsl_pool_t *dp)
|
||||
rw_destroy(&tx->tx_suspend);
|
||||
mutex_destroy(&tx->tx_sync_lock);
|
||||
|
||||
cv_destroy(&tx->tx_sync_more_cv);
|
||||
cv_destroy(&tx->tx_sync_done_cv);
|
||||
cv_destroy(&tx->tx_quiesce_more_cv);
|
||||
cv_destroy(&tx->tx_quiesce_done_cv);
|
||||
cv_destroy(&tx->tx_exit_cv);
|
||||
|
||||
for (c = 0; c < max_ncpus; c++) {
|
||||
int i;
|
||||
|
||||
|
||||
+259
-109
@@ -316,8 +316,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
|
||||
space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
|
||||
&vd->vdev_dtl_lock);
|
||||
}
|
||||
txg_list_create(&vd->vdev_ms_list,
|
||||
offsetof(struct metaslab, ms_txg_node));
|
||||
txg_list_create(&vd->vdev_dtl_list,
|
||||
@@ -474,7 +476,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
|
||||
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
|
||||
if (alloctype == VDEV_ALLOC_LOAD) {
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
|
||||
&vd->vdev_dtl.smo_object);
|
||||
&vd->vdev_dtl_smo.smo_object);
|
||||
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
|
||||
&vd->vdev_unspare);
|
||||
}
|
||||
@@ -566,12 +568,14 @@ vdev_free(vdev_t *vd)
|
||||
|
||||
txg_list_destroy(&vd->vdev_ms_list);
|
||||
txg_list_destroy(&vd->vdev_dtl_list);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_unload(&vd->vdev_dtl_map);
|
||||
space_map_destroy(&vd->vdev_dtl_map);
|
||||
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
||||
space_map_destroy(&vd->vdev_dtl_scrub);
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
space_map_unload(&vd->vdev_dtl[t]);
|
||||
space_map_destroy(&vd->vdev_dtl[t]);
|
||||
}
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
mutex_destroy(&vd->vdev_dtl_lock);
|
||||
mutex_destroy(&vd->vdev_stat_lock);
|
||||
mutex_destroy(&vd->vdev_probe_lock);
|
||||
@@ -709,14 +713,18 @@ vdev_remove_parent(vdev_t *cvd)
|
||||
|
||||
vdev_remove_child(mvd, cvd);
|
||||
vdev_remove_child(pvd, mvd);
|
||||
|
||||
/*
|
||||
* If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
|
||||
* Otherwise, we could have detached an offline device, and when we
|
||||
* go to import the pool we'll think we have two top-level vdevs,
|
||||
* instead of a different version of the same top-level vdev.
|
||||
*/
|
||||
if (mvd->vdev_top == mvd)
|
||||
cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid;
|
||||
if (mvd->vdev_top == mvd) {
|
||||
uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
|
||||
cvd->vdev_guid += guid_delta;
|
||||
cvd->vdev_guid_sum += guid_delta;
|
||||
}
|
||||
cvd->vdev_id = mvd->vdev_id;
|
||||
vdev_add_child(pvd, cvd);
|
||||
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
|
||||
@@ -815,6 +823,7 @@ typedef struct vdev_probe_stats {
|
||||
static void
|
||||
vdev_probe_done(zio_t *zio)
|
||||
{
|
||||
spa_t *spa = zio->io_spa;
|
||||
vdev_probe_stats_t *vps = zio->io_private;
|
||||
vdev_t *vd = vps->vps_vd;
|
||||
|
||||
@@ -822,7 +831,7 @@ vdev_probe_done(zio_t *zio)
|
||||
ASSERT(zio->io_vd == vd);
|
||||
if (zio->io_error == 0)
|
||||
vps->vps_readable = 1;
|
||||
if (zio->io_error == 0 && (spa_mode & FWRITE)) {
|
||||
if (zio->io_error == 0 && spa_writeable(spa)) {
|
||||
zio_nowait(zio_write_phys(vps->vps_root, vd,
|
||||
zio->io_offset, zio->io_size, zio->io_data,
|
||||
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
|
||||
@@ -843,12 +852,12 @@ vdev_probe_done(zio_t *zio)
|
||||
vd->vdev_cant_write |= !vps->vps_writeable;
|
||||
|
||||
if (vdev_readable(vd) &&
|
||||
(vdev_writeable(vd) || !(spa_mode & FWRITE))) {
|
||||
(vdev_writeable(vd) || !spa_writeable(spa))) {
|
||||
zio->io_error = 0;
|
||||
} else {
|
||||
ASSERT(zio->io_error != 0);
|
||||
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
|
||||
zio->io_spa, vd, NULL, 0, 0);
|
||||
spa, vd, NULL, 0, 0);
|
||||
zio->io_error = ENXIO;
|
||||
}
|
||||
kmem_free(vps, sizeof (*vps));
|
||||
@@ -916,12 +925,15 @@ vdev_probe(vdev_t *vd, zio_t *pio)
|
||||
int
|
||||
vdev_open(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
int error;
|
||||
int c;
|
||||
uint64_t osize = 0;
|
||||
uint64_t asize, psize;
|
||||
uint64_t ashift = 0;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
|
||||
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
|
||||
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
|
||||
vd->vdev_state == VDEV_STATE_OFFLINE);
|
||||
@@ -1055,16 +1067,12 @@ vdev_open(vdev_t *vd)
|
||||
|
||||
/*
|
||||
* If a leaf vdev has a DTL, and seems healthy, then kick off a
|
||||
* resilver. But don't do this if we are doing a reopen for a
|
||||
* scrub, since this would just restart the scrub we are already
|
||||
* doing.
|
||||
* resilver. But don't do this if we are doing a reopen for a scrub,
|
||||
* since this would just restart the scrub we are already doing.
|
||||
*/
|
||||
if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
|
||||
spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
}
|
||||
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
|
||||
vdev_resilver_needed(vd, NULL, NULL))
|
||||
spa_async_request(spa, SPA_ASYNC_RESILVER);
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -1165,6 +1173,10 @@ vdev_validate(vdev_t *vd)
|
||||
void
|
||||
vdev_close(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
|
||||
vd->vdev_ops->vdev_op_close(vd);
|
||||
|
||||
vdev_cache_purge(vd);
|
||||
@@ -1283,34 +1295,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
|
||||
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
|
||||
}
|
||||
|
||||
/*
|
||||
* DTLs.
|
||||
*
|
||||
* A vdev's DTL (dirty time log) is the set of transaction groups for which
|
||||
* the vdev has less than perfect replication. There are three kinds of DTL:
|
||||
*
|
||||
* DTL_MISSING: txgs for which the vdev has no valid copies of the data
|
||||
*
|
||||
* DTL_PARTIAL: txgs for which data is available, but not fully replicated
|
||||
*
|
||||
* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
|
||||
* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
|
||||
* txgs that was scrubbed.
|
||||
*
|
||||
* DTL_OUTAGE: txgs which cannot currently be read, whether due to
|
||||
* persistent errors or just some device being offline.
|
||||
* Unlike the other three, the DTL_OUTAGE map is not generally
|
||||
* maintained; it's only computed when needed, typically to
|
||||
* determine whether a device can be detached.
|
||||
*
|
||||
* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
|
||||
* either has the data or it doesn't.
|
||||
*
|
||||
* For interior vdevs such as mirror and RAID-Z the picture is more complex.
|
||||
* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
|
||||
* if any child is less than fully replicated, then so is its parent.
|
||||
* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
|
||||
* comprising only those txgs which appear in 'maxfaults' or more children;
|
||||
* those are the txgs we don't have enough replication to read. For example,
|
||||
* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
|
||||
* thus, its DTL_MISSING consists of the set of txgs that appear in more than
|
||||
* two child DTL_MISSING maps.
|
||||
*
|
||||
* It should be clear from the above that to compute the DTLs and outage maps
|
||||
* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
|
||||
* Therefore, that is all we keep on disk. When loading the pool, or after
|
||||
* a configuration change, we generate all other DTLs from first principles.
|
||||
*/
|
||||
void
|
||||
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
|
||||
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
|
||||
{
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
|
||||
ASSERT(t < DTL_TYPES);
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
if (!space_map_contains(sm, txg, size))
|
||||
space_map_add(sm, txg, size);
|
||||
mutex_exit(sm->sm_lock);
|
||||
}
|
||||
|
||||
int
|
||||
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
|
||||
boolean_t
|
||||
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
|
||||
{
|
||||
int dirty;
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
boolean_t dirty = B_FALSE;
|
||||
|
||||
/*
|
||||
* Quick test without the lock -- covers the common case that
|
||||
* there are no dirty time segments.
|
||||
*/
|
||||
if (sm->sm_space == 0)
|
||||
return (0);
|
||||
ASSERT(t < DTL_TYPES);
|
||||
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
dirty = space_map_contains(sm, txg, size);
|
||||
if (sm->sm_space != 0)
|
||||
dirty = space_map_contains(sm, txg, size);
|
||||
mutex_exit(sm->sm_lock);
|
||||
|
||||
return (dirty);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
|
||||
{
|
||||
space_map_t *sm = &vd->vdev_dtl[t];
|
||||
boolean_t empty;
|
||||
|
||||
mutex_enter(sm->sm_lock);
|
||||
empty = (sm->sm_space == 0);
|
||||
mutex_exit(sm->sm_lock);
|
||||
|
||||
return (empty);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reassess DTLs after a config change or scrub completion.
|
||||
*/
|
||||
@@ -1318,11 +1384,19 @@ void
|
||||
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
int c;
|
||||
avl_tree_t reftree;
|
||||
int minref;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
|
||||
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
|
||||
|
||||
if (vd->vdev_children == 0) {
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_dtl_reassess(vd->vdev_child[c], txg,
|
||||
scrub_txg, scrub_done);
|
||||
|
||||
if (vd == spa->spa_root_vdev)
|
||||
return;
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
if (scrub_txg != 0 &&
|
||||
(spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
|
||||
@@ -1333,12 +1407,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
* will be valid, so excise the old region and
|
||||
* fold in the scrub dtl. Otherwise, leave the
|
||||
* dtl as-is if there was an error.
|
||||
*
|
||||
* There's a little trick here: to excise the beginning
|
||||
* of the DTL_MISSING map, we put it into a reference
|
||||
* tree and then add a segment with refcnt -1 that
|
||||
* covers the range [0, scrub_txg). This means
|
||||
* that each txg in that range has refcnt -1 or 0.
|
||||
* We then add DTL_SCRUB with a refcnt of 2, so that
|
||||
* entries in the range [0, scrub_txg) will have a
|
||||
* positive refcnt -- either 1 or 2. We then convert
|
||||
* the reference tree into the new DTL_MISSING map.
|
||||
*/
|
||||
space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
|
||||
space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
|
||||
space_map_ref_create(&reftree);
|
||||
space_map_ref_add_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
|
||||
space_map_ref_add_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_SCRUB], 2);
|
||||
space_map_ref_generate_map(&reftree,
|
||||
&vd->vdev_dtl[DTL_MISSING], 1);
|
||||
space_map_ref_destroy(&reftree);
|
||||
}
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
|
||||
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
|
||||
space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
|
||||
if (scrub_done)
|
||||
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
|
||||
space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
|
||||
if (!vdev_readable(vd))
|
||||
space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
|
||||
else
|
||||
space_map_walk(&vd->vdev_dtl[DTL_MISSING],
|
||||
space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
if (txg != 0)
|
||||
@@ -1346,35 +1446,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure the DTLs are always correct under the scrub lock.
|
||||
*/
|
||||
if (vd == spa->spa_root_vdev)
|
||||
mutex_enter(&spa->spa_scrub_lock);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
|
||||
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
|
||||
space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
if (t == DTL_SCRUB)
|
||||
continue; /* leaf vdevs only */
|
||||
if (t == DTL_PARTIAL)
|
||||
minref = 1; /* i.e. non-zero */
|
||||
else if (vd->vdev_nparity != 0)
|
||||
minref = vd->vdev_nparity + 1; /* RAID-Z */
|
||||
else
|
||||
minref = vd->vdev_children; /* any kind of mirror */
|
||||
space_map_ref_create(&reftree);
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
mutex_enter(&cvd->vdev_dtl_lock);
|
||||
space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
|
||||
mutex_exit(&cvd->vdev_dtl_lock);
|
||||
}
|
||||
space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
|
||||
space_map_ref_destroy(&reftree);
|
||||
}
|
||||
|
||||
if (vd == spa->spa_root_vdev)
|
||||
mutex_exit(&spa->spa_scrub_lock);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
}
|
||||
|
||||
static int
|
||||
vdev_dtl_load(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl_smo;
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
dmu_buf_t *db;
|
||||
int error;
|
||||
@@ -1392,7 +1491,8 @@ vdev_dtl_load(vdev_t *vd)
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
|
||||
error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
|
||||
NULL, SM_ALLOC, smo, mos);
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
|
||||
return (error);
|
||||
@@ -1402,8 +1502,8 @@ void
|
||||
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl;
|
||||
space_map_t *sm = &vd->vdev_dtl_map;
|
||||
space_map_obj_t *smo = &vd->vdev_dtl_smo;
|
||||
space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
|
||||
objset_t *mos = spa->spa_meta_objset;
|
||||
space_map_t smsync;
|
||||
kmutex_t smlock;
|
||||
@@ -1460,6 +1560,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine whether the specified vdev can be offlined/detached/removed
|
||||
* without losing data.
|
||||
*/
|
||||
boolean_t
|
||||
vdev_dtl_required(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
vdev_t *tvd = vd->vdev_top;
|
||||
uint8_t cant_read = vd->vdev_cant_read;
|
||||
boolean_t required;
|
||||
|
||||
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
|
||||
|
||||
if (vd == spa->spa_root_vdev || vd == tvd)
|
||||
return (B_TRUE);
|
||||
|
||||
/*
|
||||
* Temporarily mark the device as unreadable, and then determine
|
||||
* whether this results in any DTL outages in the top-level vdev.
|
||||
* If not, we can safely offline/detach/remove the device.
|
||||
*/
|
||||
vd->vdev_cant_read = B_TRUE;
|
||||
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
|
||||
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
|
||||
vd->vdev_cant_read = cant_read;
|
||||
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
|
||||
|
||||
return (required);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if resilver is needed, and if so the txg range.
|
||||
*/
|
||||
@@ -1472,19 +1603,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
|
||||
|
||||
if (vd->vdev_children == 0) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
|
||||
if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
|
||||
vdev_writeable(vd)) {
|
||||
space_seg_t *ss;
|
||||
|
||||
ss = avl_first(&vd->vdev_dtl_map.sm_root);
|
||||
ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
|
||||
thismin = ss->ss_start - 1;
|
||||
ss = avl_last(&vd->vdev_dtl_map.sm_root);
|
||||
ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
|
||||
thismax = ss->ss_end;
|
||||
needed = B_TRUE;
|
||||
}
|
||||
mutex_exit(&vd->vdev_dtl_lock);
|
||||
} else {
|
||||
int c;
|
||||
for (c = 0; c < vd->vdev_children; c++) {
|
||||
for (int c = 0; c < vd->vdev_children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
uint64_t cmin, cmax;
|
||||
|
||||
@@ -1506,12 +1637,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
|
||||
void
|
||||
vdev_load(vdev_t *vd)
|
||||
{
|
||||
int c;
|
||||
|
||||
/*
|
||||
* Recursively load all children.
|
||||
*/
|
||||
for (c = 0; c < vd->vdev_children; c++)
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
vdev_load(vd->vdev_child[c]);
|
||||
|
||||
/*
|
||||
@@ -1731,11 +1860,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
|
||||
vd->vdev_parent->vdev_child[0] == vd)
|
||||
vd->vdev_unspare = B_TRUE;
|
||||
|
||||
(void) spa_vdev_state_exit(spa, vd, 0);
|
||||
|
||||
VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
|
||||
|
||||
return (0);
|
||||
return (spa_vdev_state_exit(spa, vd, 0));
|
||||
}
|
||||
|
||||
int
|
||||
@@ -1756,13 +1881,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
|
||||
*/
|
||||
if (!vd->vdev_offline) {
|
||||
/*
|
||||
* If this device's top-level vdev has a non-empty DTL,
|
||||
* don't allow the device to be offlined.
|
||||
*
|
||||
* XXX -- make this more precise by allowing the offline
|
||||
* as long as the remaining devices don't have any DTL holes.
|
||||
* If this device has the only valid copy of some data,
|
||||
* don't allow it to be offlined.
|
||||
*/
|
||||
if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
|
||||
if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
|
||||
return (spa_vdev_state_exit(spa, NULL, EBUSY));
|
||||
|
||||
/*
|
||||
@@ -1772,7 +1894,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
|
||||
*/
|
||||
vd->vdev_offline = B_TRUE;
|
||||
vdev_reopen(vd->vdev_top);
|
||||
if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
|
||||
if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
|
||||
vd->vdev_offline = B_FALSE;
|
||||
vdev_reopen(vd->vdev_top);
|
||||
return (spa_vdev_state_exit(spa, NULL, EBUSY));
|
||||
@@ -1852,13 +1974,17 @@ vdev_writeable(vdev_t *vd)
|
||||
boolean_t
|
||||
vdev_allocatable(vdev_t *vd)
|
||||
{
|
||||
uint64_t state = vd->vdev_state;
|
||||
|
||||
/*
|
||||
* We currently allow allocations from vdevs which maybe in the
|
||||
* We currently allow allocations from vdevs which may be in the
|
||||
* process of reopening (i.e. VDEV_STATE_CLOSED). If the device
|
||||
* fails to reopen then we'll catch it later when we're holding
|
||||
* the proper locks.
|
||||
* the proper locks. Note that we have to get the vdev state
|
||||
* in a local variable because although it changes atomically,
|
||||
* we're asking two separate questions about it.
|
||||
*/
|
||||
return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) &&
|
||||
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
|
||||
!vd->vdev_cant_write);
|
||||
}
|
||||
|
||||
@@ -1928,7 +2054,8 @@ vdev_clear_stats(vdev_t *vd)
|
||||
void
|
||||
vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
{
|
||||
vdev_t *rvd = zio->io_spa->spa_root_vdev;
|
||||
spa_t *spa = zio->io_spa;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
|
||||
vdev_t *pvd;
|
||||
uint64_t txg = zio->io_txg;
|
||||
@@ -1961,21 +2088,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
return;
|
||||
|
||||
ASSERT(vd == zio->io_vd);
|
||||
if (!(flags & ZIO_FLAG_IO_BYPASS)) {
|
||||
mutex_enter(&vd->vdev_stat_lock);
|
||||
vs->vs_ops[type]++;
|
||||
vs->vs_bytes[type] += psize;
|
||||
mutex_exit(&vd->vdev_stat_lock);
|
||||
}
|
||||
|
||||
if (flags & ZIO_FLAG_IO_BYPASS)
|
||||
return;
|
||||
|
||||
mutex_enter(&vd->vdev_stat_lock);
|
||||
|
||||
if (flags & ZIO_FLAG_IO_REPAIR) {
|
||||
ASSERT(zio->io_delegate_list == NULL);
|
||||
mutex_enter(&vd->vdev_stat_lock);
|
||||
if (flags & ZIO_FLAG_SCRUB_THREAD)
|
||||
vs->vs_scrub_repaired += psize;
|
||||
else
|
||||
if (flags & ZIO_FLAG_SELF_HEAL)
|
||||
vs->vs_self_healed += psize;
|
||||
mutex_exit(&vd->vdev_stat_lock);
|
||||
}
|
||||
|
||||
vs->vs_ops[type]++;
|
||||
vs->vs_bytes[type] += psize;
|
||||
|
||||
mutex_exit(&vd->vdev_stat_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1993,19 +2122,39 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
|
||||
vs->vs_write_errors++;
|
||||
mutex_exit(&vd->vdev_stat_lock);
|
||||
|
||||
if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) {
|
||||
if (flags & ZIO_FLAG_SCRUB_THREAD) {
|
||||
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
|
||||
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
|
||||
vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
|
||||
}
|
||||
if (!(flags & ZIO_FLAG_IO_REPAIR)) {
|
||||
if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
|
||||
if (type == ZIO_TYPE_WRITE && txg != 0 &&
|
||||
(!(flags & ZIO_FLAG_IO_REPAIR) ||
|
||||
(flags & ZIO_FLAG_SCRUB_THREAD))) {
|
||||
/*
|
||||
* This is either a normal write (not a repair), or it's a
|
||||
* repair induced by the scrub thread. In the normal case,
|
||||
* we commit the DTL change in the same txg as the block
|
||||
* was born. In the scrub-induced repair case, we know that
|
||||
* scrubs run in first-pass syncing context, so we commit
|
||||
* the DTL change in spa->spa_syncing_txg.
|
||||
*
|
||||
* We currently do not make DTL entries for failed spontaneous
|
||||
* self-healing writes triggered by normal (non-scrubbing)
|
||||
* reads, because we have no transactional context in which to
|
||||
* do so -- and it's not clear that it'd be desirable anyway.
|
||||
*/
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
uint64_t commit_txg = txg;
|
||||
if (flags & ZIO_FLAG_SCRUB_THREAD) {
|
||||
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
|
||||
ASSERT(spa_sync_pass(spa) == 1);
|
||||
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
|
||||
commit_txg = spa->spa_syncing_txg;
|
||||
}
|
||||
ASSERT(commit_txg >= spa->spa_syncing_txg);
|
||||
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
|
||||
return;
|
||||
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
|
||||
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
|
||||
vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
|
||||
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
|
||||
vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
|
||||
vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
|
||||
}
|
||||
if (vd != rvd)
|
||||
vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2218,7 +2367,8 @@ vdev_state_clean(vdev_t *vd)
|
||||
void
|
||||
vdev_propagate_state(vdev_t *vd)
|
||||
{
|
||||
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
vdev_t *rvd = spa->spa_root_vdev;
|
||||
int degraded = 0, faulted = 0;
|
||||
int corrupted = 0;
|
||||
int c;
|
||||
@@ -2229,7 +2379,7 @@ vdev_propagate_state(vdev_t *vd)
|
||||
child = vd->vdev_child[c];
|
||||
|
||||
if (!vdev_readable(child) ||
|
||||
(!vdev_writeable(child) && (spa_mode & FWRITE))) {
|
||||
(!vdev_writeable(child) && spa_writeable(spa))) {
|
||||
/*
|
||||
* Root special: if there is a top-level log
|
||||
* device, treat the root vdev as if it were
|
||||
|
||||
@@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
|
||||
*/
|
||||
ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
|
||||
error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
|
||||
spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
|
||||
spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
|
||||
|
||||
if (error) {
|
||||
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
|
||||
@@ -105,7 +105,8 @@ vdev_file_close(vdev_t *vd)
|
||||
|
||||
if (vf->vf_vnode != NULL) {
|
||||
(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
|
||||
(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
|
||||
(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
|
||||
kcred, NULL);
|
||||
VN_RELE(vf->vf_vnode);
|
||||
}
|
||||
|
||||
|
||||
@@ -277,9 +277,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
||||
vd->vdev_islog) == 0);
|
||||
}
|
||||
|
||||
if (vd->vdev_dtl.smo_object != 0)
|
||||
if (vd->vdev_dtl_smo.smo_object != 0)
|
||||
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
|
||||
vd->vdev_dtl.smo_object) == 0);
|
||||
vd->vdev_dtl_smo.smo_object) == 0);
|
||||
|
||||
if (getstats) {
|
||||
vdev_stat_t vs;
|
||||
@@ -520,9 +520,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
|
||||
vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
|
||||
return (EBUSY);
|
||||
|
||||
ASSERT(reason != VDEV_LABEL_REMOVE ||
|
||||
vdev_inuse(vd, crtxg, reason, NULL, NULL));
|
||||
|
||||
/*
|
||||
* If this is a request to add or replace a spare or l2cache device
|
||||
* that is in use elsewhere on the system, then we must update the
|
||||
@@ -704,6 +701,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
|
||||
* ==========================================================================
|
||||
*/
|
||||
|
||||
/*
|
||||
* For use by zdb and debugging purposes only
|
||||
*/
|
||||
uint64_t ub_max_txg = UINT64_MAX;
|
||||
|
||||
/*
|
||||
* Consider the following situation: txg is safely synced to disk. We've
|
||||
* written the first uberblock for txg + 1, and then we lose power. When we
|
||||
@@ -741,7 +743,8 @@ vdev_uberblock_load_done(zio_t *zio)
|
||||
|
||||
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
|
||||
mutex_enter(&rio->io_lock);
|
||||
if (vdev_uberblock_compare(ub, ubbest) > 0)
|
||||
if (ub->ub_txg <= ub_max_txg &&
|
||||
vdev_uberblock_compare(ub, ubbest) > 0)
|
||||
*ubbest = *ub;
|
||||
mutex_exit(&rio->io_lock);
|
||||
}
|
||||
|
||||
@@ -225,7 +225,7 @@ vdev_mirror_child_select(zio_t *zio)
|
||||
mc->mc_skipped = 1;
|
||||
continue;
|
||||
}
|
||||
if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
|
||||
if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
|
||||
return (c);
|
||||
mc->mc_error = ESTALE;
|
||||
mc->mc_skipped = 1;
|
||||
@@ -282,20 +282,10 @@ vdev_mirror_io_start(zio_t *zio)
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
|
||||
/*
|
||||
* If this is a resilvering I/O to a replacing vdev,
|
||||
* only the last child should be written -- unless the
|
||||
* first child happens to have a DTL entry here as well.
|
||||
* All other writes go to all children.
|
||||
* Writes go to all children.
|
||||
*/
|
||||
if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
|
||||
!vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
|
||||
zio->io_txg, 1)) {
|
||||
c = mm->mm_children - 1;
|
||||
children = 1;
|
||||
} else {
|
||||
c = 0;
|
||||
children = mm->mm_children;
|
||||
}
|
||||
c = 0;
|
||||
children = mm->mm_children;
|
||||
}
|
||||
|
||||
while (children--) {
|
||||
@@ -398,7 +388,7 @@ vdev_mirror_io_done(zio_t *zio)
|
||||
ASSERT(zio->io_error != 0);
|
||||
}
|
||||
|
||||
if (good_copies && (spa_mode & FWRITE) &&
|
||||
if (good_copies && spa_writeable(zio->io_spa) &&
|
||||
(unexpected_errors ||
|
||||
(zio->io_flags & ZIO_FLAG_RESILVER) ||
|
||||
((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
|
||||
@@ -419,7 +409,7 @@ vdev_mirror_io_done(zio_t *zio)
|
||||
if (mc->mc_tried)
|
||||
continue;
|
||||
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
|
||||
!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
|
||||
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
|
||||
zio->io_txg, 1))
|
||||
continue;
|
||||
mc->mc_error = ESTALE;
|
||||
@@ -429,7 +419,8 @@ vdev_mirror_io_done(zio_t *zio)
|
||||
mc->mc_vd, mc->mc_offset,
|
||||
zio->io_data, zio->io_size,
|
||||
ZIO_TYPE_WRITE, zio->io_priority,
|
||||
ZIO_FLAG_IO_REPAIR, NULL, NULL));
|
||||
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
|
||||
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+27
-15
@@ -176,6 +176,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
|
||||
zio_t *fio, *lio, *aio, *dio;
|
||||
avl_tree_t *tree;
|
||||
uint64_t size;
|
||||
int flags;
|
||||
|
||||
ASSERT(MUTEX_HELD(&vq->vq_lock));
|
||||
|
||||
@@ -187,21 +188,32 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
|
||||
|
||||
tree = fio->io_vdev_tree;
|
||||
size = fio->io_size;
|
||||
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
|
||||
|
||||
while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
|
||||
!((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
|
||||
size + dio->io_size <= zfs_vdev_aggregation_limit) {
|
||||
dio->io_delegate_next = fio;
|
||||
fio = dio;
|
||||
size += dio->io_size;
|
||||
}
|
||||
|
||||
while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
|
||||
!((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
|
||||
size + dio->io_size <= zfs_vdev_aggregation_limit) {
|
||||
lio->io_delegate_next = dio;
|
||||
lio = dio;
|
||||
size += dio->io_size;
|
||||
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
|
||||
/*
|
||||
* We can aggregate I/Os that are adjacent and of the
|
||||
* same flavor, as expressed by the AGG_INHERIT flags.
|
||||
* The latter is necessary so that certain attributes
|
||||
* of the I/O, such as whether it's a normal I/O or a
|
||||
* scrub/resilver, can be preserved in the aggregate.
|
||||
*/
|
||||
while ((dio = AVL_PREV(tree, fio)) != NULL &&
|
||||
IS_ADJACENT(dio, fio) &&
|
||||
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
|
||||
size + dio->io_size <= zfs_vdev_aggregation_limit) {
|
||||
dio->io_delegate_next = fio;
|
||||
fio = dio;
|
||||
size += dio->io_size;
|
||||
}
|
||||
while ((dio = AVL_NEXT(tree, lio)) != NULL &&
|
||||
IS_ADJACENT(lio, dio) &&
|
||||
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
|
||||
size + dio->io_size <= zfs_vdev_aggregation_limit) {
|
||||
lio->io_delegate_next = dio;
|
||||
lio = dio;
|
||||
size += dio->io_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (fio != lio) {
|
||||
@@ -212,7 +224,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
|
||||
|
||||
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
|
||||
buf, size, fio->io_type, ZIO_PRIORITY_NOW,
|
||||
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
||||
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
||||
vdev_queue_agg_io_done, NULL);
|
||||
|
||||
aio->io_delegate_list = fio;
|
||||
|
||||
@@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio)
|
||||
rc->rc_skipped = 1;
|
||||
continue;
|
||||
}
|
||||
if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
|
||||
if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
|
||||
if (c >= rm->rm_firstdatacol)
|
||||
rm->rm_missingdata++;
|
||||
else
|
||||
@@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio)
|
||||
done:
|
||||
zio_checksum_verified(zio);
|
||||
|
||||
if (zio->io_error == 0 && (spa_mode & FWRITE) &&
|
||||
if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
|
||||
(unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
|
||||
/*
|
||||
* Use the good data we have in hand to repair damaged children.
|
||||
@@ -1180,7 +1180,8 @@ done:
|
||||
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
|
||||
rc->rc_offset, rc->rc_data, rc->rc_size,
|
||||
ZIO_TYPE_WRITE, zio->io_priority,
|
||||
ZIO_FLAG_IO_REPAIR, NULL, NULL));
|
||||
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
|
||||
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2148,12 +2148,12 @@ top:
|
||||
}
|
||||
}
|
||||
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
mutex_exit(&zp->z_acl_lock);
|
||||
mutex_exit(&zp->z_lock);
|
||||
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -2208,7 +2208,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
|
||||
|
||||
*check_privs = B_TRUE;
|
||||
|
||||
if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
|
||||
if (zfsvfs->z_replay) {
|
||||
*working_mode = 0;
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -19,12 +19,10 @@
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "%Z%%M% %I% %E% SMI"
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/vfs.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
@@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
|
||||
|
||||
while (ptr < end) {
|
||||
if (zfs_layout) {
|
||||
/*
|
||||
* Avoid overrun. Embedded aces can have one
|
||||
* of several sizes. We don't know exactly
|
||||
* how many are present, only the size of the
|
||||
* buffer containing them. That size may be
|
||||
* larger than needed to hold the aces
|
||||
* present. As long as we do not do any
|
||||
* swapping beyond the end of our block we are
|
||||
* okay. It is safe to swap any non-ace data
|
||||
* within the block since it is just zeros.
|
||||
*/
|
||||
if (ptr + sizeof (zfs_ace_hdr_t) > end) {
|
||||
break;
|
||||
}
|
||||
zacep = (zfs_ace_t *)ptr;
|
||||
zacep->z_hdr.z_access_mask =
|
||||
BSWAP_32(zacep->z_hdr.z_access_mask);
|
||||
@@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
|
||||
BSWAP_16(zacep->z_hdr.z_type);
|
||||
entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
|
||||
} else {
|
||||
/* Overrun avoidance */
|
||||
if (ptr + sizeof (ace_t) > end) {
|
||||
break;
|
||||
}
|
||||
acep = (ace_t *)ptr;
|
||||
acep->a_access_mask = BSWAP_32(acep->a_access_mask);
|
||||
acep->a_flags = BSWAP_16(acep->a_flags);
|
||||
@@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
|
||||
break;
|
||||
case ACE_IDENTIFIER_GROUP:
|
||||
default:
|
||||
/* Overrun avoidance */
|
||||
if (zfs_layout) {
|
||||
zacep->z_fuid = BSWAP_64(zacep->z_fuid);
|
||||
if (ptr + sizeof (zfs_ace_t) <= end) {
|
||||
zacep->z_fuid = BSWAP_64(zacep->z_fuid);
|
||||
} else {
|
||||
entry_size = sizeof (zfs_ace_t);
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch (ace_type) {
|
||||
case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
|
||||
@@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size)
|
||||
if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
|
||||
zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
|
||||
ZFS_ACE_SPACE);
|
||||
} else
|
||||
} else {
|
||||
zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
|
||||
ACE_SLOT_CNT);
|
||||
}
|
||||
}
|
||||
|
||||
+4
-22
@@ -561,24 +561,6 @@ zfs_rmnode(znode_t *zp)
|
||||
ASSERT(ZTOV(zp)->v_count == 0);
|
||||
ASSERT(zp->z_phys->zp_links == 0);
|
||||
|
||||
/*
|
||||
* If this is a ZIL replay then leave the object in the unlinked set.
|
||||
* Otherwise we can get a deadlock, because the delete can be
|
||||
* quite large and span multiple tx's and txgs, but each replay
|
||||
* creates a tx to atomically run the replay function and mark the
|
||||
* replay record as complete. We deadlock trying to start a tx in
|
||||
* a new txg to further the deletion but can't because the replay
|
||||
* tx hasn't finished.
|
||||
*
|
||||
* We actually delete the object if we get a failure to create an
|
||||
* object in zil_replay_log_record(), or after calling zil_replay().
|
||||
*/
|
||||
if (zfsvfs->z_assign >= TXG_INITIAL) {
|
||||
zfs_znode_dmu_fini(zp);
|
||||
zfs_znode_free(zp);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is an attribute directory, purge its contents.
|
||||
*/
|
||||
@@ -845,9 +827,9 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
|
||||
FUID_SIZE_ESTIMATE(zfsvfs));
|
||||
}
|
||||
}
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
|
||||
if (error == ERESTART)
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
return (error);
|
||||
@@ -930,7 +912,7 @@ top:
|
||||
error = zfs_make_xattrdir(zp, &va, xvpp, cr);
|
||||
zfs_dirent_unlock(dl);
|
||||
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
/* NB: we already did dmu_tx_wait() if necessary */
|
||||
goto top;
|
||||
}
|
||||
@@ -959,7 +941,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
|
||||
uid_t fowner;
|
||||
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
|
||||
|
||||
if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
|
||||
if (zdp->z_zfsvfs->z_replay)
|
||||
return (0);
|
||||
|
||||
if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
|
||||
|
||||
@@ -519,7 +519,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
|
||||
uint32_t rid;
|
||||
idmap_stat status;
|
||||
uint64_t idx;
|
||||
boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
|
||||
zfs_fuid_t *zfuid = NULL;
|
||||
zfs_fuid_info_t *fuidp;
|
||||
|
||||
@@ -534,7 +533,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
|
||||
if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
|
||||
return (id);
|
||||
|
||||
if (is_replay) {
|
||||
if (zfsvfs->z_replay) {
|
||||
fuidp = zfsvfs->z_fuid_replay;
|
||||
|
||||
/*
|
||||
@@ -584,7 +583,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
|
||||
|
||||
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
|
||||
|
||||
if (!is_replay)
|
||||
if (!zfsvfs->z_replay)
|
||||
zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
|
||||
else if (zfuid != NULL) {
|
||||
list_remove(&fuidp->z_fuids, zfuid);
|
||||
|
||||
@@ -856,9 +856,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
|
||||
{
|
||||
int error;
|
||||
boolean_t force = (boolean_t)zc->zc_cookie;
|
||||
boolean_t hardforce = (boolean_t)zc->zc_guid;
|
||||
|
||||
zfs_log_history(zc);
|
||||
error = spa_export(zc->zc_name, NULL, force);
|
||||
error = spa_export(zc->zc_name, NULL, force, hardforce);
|
||||
return (error);
|
||||
}
|
||||
|
||||
@@ -1162,7 +1163,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
|
||||
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
|
||||
return (error);
|
||||
|
||||
error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
|
||||
error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
|
||||
|
||||
spa_close(spa, FTAG);
|
||||
return (error);
|
||||
|
||||
+43
-5
@@ -45,13 +45,33 @@
|
||||
#include <sys/spa.h>
|
||||
#include <sys/zfs_fuid.h>
|
||||
#include <sys/ddi.h>
|
||||
#include <sys/dsl_dataset.h>
|
||||
|
||||
#define ZFS_HANDLE_REPLAY(zilog, tx) \
|
||||
if (zilog->zl_replay) { \
|
||||
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
|
||||
zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
|
||||
zilog->zl_replaying_seq; \
|
||||
return; \
|
||||
}
|
||||
|
||||
/*
|
||||
* All the functions in this file are used to construct the log entries
|
||||
* to record transactions. They allocate * an intent log transaction
|
||||
* structure (itx_t) and save within it all the information necessary to
|
||||
* possibly replay the transaction. The itx is then assigned a sequence
|
||||
* number and inserted in the in-memory list anchored in the zilog.
|
||||
* These zfs_log_* functions must be called within a dmu tx, in one
|
||||
* of 2 contexts depending on zilog->zl_replay:
|
||||
*
|
||||
* Non replay mode
|
||||
* ---------------
|
||||
* We need to record the transaction so that if it is committed to
|
||||
* the Intent Log then it can be replayed. An intent log transaction
|
||||
* structure (itx_t) is allocated and all the information necessary to
|
||||
* possibly replay the transaction is saved in it. The itx is then assigned
|
||||
* a sequence number and inserted in the in-memory list anchored in the zilog.
|
||||
*
|
||||
* Replay mode
|
||||
* -----------
|
||||
* We need to mark the intent log record as replayed in the log header.
|
||||
* This is done in the same transaction as the replay so that they
|
||||
* commit atomically.
|
||||
*/
|
||||
|
||||
int
|
||||
@@ -231,6 +251,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
/*
|
||||
* If we have FUIDs present then add in space for
|
||||
* domains and ACE fuid's if any.
|
||||
@@ -334,6 +356,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
|
||||
lr = (lr_remove_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
@@ -358,6 +382,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
|
||||
lr = (lr_link_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
@@ -385,6 +411,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
|
||||
lr = (lr_create_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
@@ -419,6 +447,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
|
||||
lr = (lr_rename_t *)&itx->itx_lr;
|
||||
lr->lr_sdoid = sdzp->z_id;
|
||||
@@ -451,6 +481,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
/*
|
||||
* Writes are handled in three different ways:
|
||||
*
|
||||
@@ -549,6 +581,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr));
|
||||
lr = (lr_truncate_t *)&itx->itx_lr;
|
||||
lr->lr_foid = zp->z_id;
|
||||
@@ -578,6 +612,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
/*
|
||||
* If XVATTR set, then log record size needs to allow
|
||||
* for lr_attr_t + xvattr mask, mapsize and create time
|
||||
@@ -644,6 +680,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
|
||||
|
||||
txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
|
||||
TX_ACL_V0 : TX_ACL;
|
||||
|
||||
|
||||
+40
-12
@@ -583,21 +583,50 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
|
||||
* allow replays to succeed.
|
||||
*/
|
||||
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
|
||||
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
|
||||
if (readonly != 0)
|
||||
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
|
||||
else
|
||||
zfs_unlinked_drain(zfsvfs);
|
||||
|
||||
/*
|
||||
* Parse and replay the intent log.
|
||||
*/
|
||||
zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
|
||||
zfs_replay_vector, zfs_unlinked_drain);
|
||||
|
||||
zfs_unlinked_drain(zfsvfs);
|
||||
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
|
||||
if (zil_disable) {
|
||||
zil_destroy(zfsvfs->z_log, 0);
|
||||
zfsvfs->z_log = NULL;
|
||||
} else {
|
||||
/*
|
||||
* Parse and replay the intent log.
|
||||
*
|
||||
* Because of ziltest, this must be done after
|
||||
* zfs_unlinked_drain(). (Further note: ziltest
|
||||
* doesn't use readonly mounts, where
|
||||
* zfs_unlinked_drain() isn't called.) This is because
|
||||
* ziltest causes spa_sync() to think it's committed,
|
||||
* but actually it is not, so the intent log contains
|
||||
* many txg's worth of changes.
|
||||
*
|
||||
* In particular, if object N is in the unlinked set in
|
||||
* the last txg to actually sync, then it could be
|
||||
* actually freed in a later txg and then reallocated
|
||||
* in a yet later txg. This would write a "create
|
||||
* object N" record to the intent log. Normally, this
|
||||
* would be fine because the spa_sync() would have
|
||||
* written out the fact that object N is free, before
|
||||
* we could write the "create object N" intent log
|
||||
* record.
|
||||
*
|
||||
* But when we are in ziltest mode, we advance the "open
|
||||
* txg" without actually spa_sync()-ing the changes to
|
||||
* disk. So we would see that object N is still
|
||||
* allocated and in the unlinked set, and there is an
|
||||
* intent log record saying to allocate it.
|
||||
*/
|
||||
zfsvfs->z_replay = B_TRUE;
|
||||
zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
|
||||
zfsvfs->z_replay = B_FALSE;
|
||||
}
|
||||
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
|
||||
}
|
||||
|
||||
if (!zil_disable)
|
||||
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -634,7 +663,6 @@ zfs_domount(vfs_t *vfsp, char *osname)
|
||||
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
|
||||
zfsvfs->z_vfs = vfsp;
|
||||
zfsvfs->z_parent = zfsvfs;
|
||||
zfsvfs->z_assign = TXG_NOWAIT;
|
||||
zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
|
||||
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
|
||||
|
||||
|
||||
+127
-51
@@ -105,9 +105,7 @@
|
||||
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
|
||||
* as they can span dmu_tx_assign() calls.
|
||||
*
|
||||
* (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
|
||||
* In normal operation, this will be TXG_NOWAIT. During ZIL replay,
|
||||
* it will be a specific txg. Either way, dmu_tx_assign() never blocks.
|
||||
* (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
|
||||
* This is critical because we don't want to block while holding locks.
|
||||
* Note, in particular, that if a lock is sometimes acquired before
|
||||
* the tx assigns, and sometimes after (e.g. z_lock), then failing to
|
||||
@@ -124,6 +122,8 @@
|
||||
* (5) If the operation succeeded, generate the intent log entry for it
|
||||
* before dropping locks. This ensures that the ordering of events
|
||||
* in the intent log matches the order in which they actually occurred.
|
||||
* During ZIL replay the zfs_log_* functions will update the sequence
|
||||
* number to indicate the zil transaction has replayed.
|
||||
*
|
||||
* (6) At the end of each vnode op, the DMU tx must always commit,
|
||||
* regardless of whether there were any errors.
|
||||
@@ -139,12 +139,12 @@
|
||||
* rw_enter(...); // grab any other locks you need
|
||||
* tx = dmu_tx_create(...); // get DMU tx
|
||||
* dmu_tx_hold_*(); // hold each object you might modify
|
||||
* error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
|
||||
* error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
|
||||
* if (error) {
|
||||
* rw_exit(...); // drop locks
|
||||
* zfs_dirent_unlock(dl); // unlock directory entry
|
||||
* VN_RELE(...); // release held vnodes
|
||||
* if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
* if (error == ERESTART) {
|
||||
* dmu_tx_wait(tx);
|
||||
* dmu_tx_abort(tx);
|
||||
* goto top;
|
||||
@@ -698,10 +698,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (error == ERESTART &&
|
||||
zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
continue;
|
||||
@@ -807,7 +806,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
|
||||
* If we're in replay mode, or we made no progress, return error.
|
||||
* Otherwise, it's at least a partial write, so it's successful.
|
||||
*/
|
||||
if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
|
||||
if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
|
||||
ZFS_EXIT(zfsvfs);
|
||||
return (error);
|
||||
}
|
||||
@@ -1233,11 +1232,10 @@ top:
|
||||
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
||||
0, SPA_MAXBLOCKSIZE);
|
||||
}
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
if (error == ERESTART &&
|
||||
zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -1449,11 +1447,11 @@ top:
|
||||
/* charge as an update -- would be nice not to charge at all */
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
||||
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
VN_RELE(vp);
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -1659,10 +1657,10 @@ top:
|
||||
if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
|
||||
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
||||
0, SPA_MAXBLOCKSIZE);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -1789,13 +1787,13 @@ top:
|
||||
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
rw_exit(&zp->z_parent_lock);
|
||||
rw_exit(&zp->z_name_lock);
|
||||
zfs_dirent_unlock(dl);
|
||||
VN_RELE(vp);
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -2342,6 +2340,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
|
||||
zilog_t *zilog;
|
||||
dmu_tx_t *tx;
|
||||
vattr_t oldva;
|
||||
xvattr_t tmpxvattr;
|
||||
uint_t mask = vap->va_mask;
|
||||
uint_t saved_mask;
|
||||
int trim_mask = 0;
|
||||
@@ -2396,6 +2395,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
|
||||
*/
|
||||
xoap = xva_getxoptattr(xvap);
|
||||
|
||||
xva_init(&tmpxvattr);
|
||||
|
||||
/*
|
||||
* Immutable files can only alter immutable bit and atime
|
||||
*/
|
||||
@@ -2518,28 +2519,78 @@ top:
|
||||
oldva.va_mode = pzp->zp_mode;
|
||||
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
|
||||
if (mask & AT_XVATTR) {
|
||||
if ((need_policy == FALSE) &&
|
||||
(XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
|
||||
xoap->xoa_appendonly !=
|
||||
((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
|
||||
xoap->xoa_nounlink !=
|
||||
((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
|
||||
xoap->xoa_immutable !=
|
||||
((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
|
||||
xoap->xoa_nodump !=
|
||||
((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
|
||||
xoap->xoa_av_modified !=
|
||||
((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
|
||||
((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
|
||||
((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
|
||||
xoap->xoa_av_quarantined !=
|
||||
((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
|
||||
(XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
|
||||
/*
|
||||
* Update xvattr mask to include only those attributes
|
||||
* that are actually changing.
|
||||
*
|
||||
* The bits will be restored prior to actually setting
|
||||
* the attributes so the caller thinks they were set.
|
||||
*/
|
||||
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
|
||||
if (xoap->xoa_appendonly !=
|
||||
((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
|
||||
}
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
|
||||
if (xoap->xoa_nounlink !=
|
||||
((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
|
||||
}
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
|
||||
if (xoap->xoa_immutable !=
|
||||
((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
|
||||
}
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
|
||||
if (xoap->xoa_nodump !=
|
||||
((pzp->zp_flags & ZFS_NODUMP) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_NODUMP);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
|
||||
}
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
|
||||
if (xoap->xoa_av_modified !=
|
||||
((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
|
||||
}
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
|
||||
if ((vp->v_type != VREG &&
|
||||
xoap->xoa_av_quarantined) ||
|
||||
xoap->xoa_av_quarantined !=
|
||||
((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
|
||||
need_policy = TRUE;
|
||||
} else {
|
||||
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
|
||||
XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
|
||||
}
|
||||
}
|
||||
|
||||
if (need_policy == FALSE &&
|
||||
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
|
||||
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
|
||||
need_policy = TRUE;
|
||||
}
|
||||
}
|
||||
@@ -2649,7 +2700,7 @@ top:
|
||||
dmu_tx_hold_bonus(tx, attrzp->z_id);
|
||||
}
|
||||
|
||||
err = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
err = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (err) {
|
||||
if (attrzp)
|
||||
VN_RELE(ZTOV(attrzp));
|
||||
@@ -2659,7 +2710,7 @@ top:
|
||||
aclp = NULL;
|
||||
}
|
||||
|
||||
if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (err == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -2732,6 +2783,31 @@ top:
|
||||
*/
|
||||
|
||||
if (xoap && (mask & AT_XVATTR)) {
|
||||
|
||||
/*
|
||||
* restore trimmed off masks
|
||||
* so that return masks can be set for caller.
|
||||
*/
|
||||
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
|
||||
XVA_SET_REQ(xvap, XAT_APPENDONLY);
|
||||
}
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
|
||||
XVA_SET_REQ(xvap, XAT_NOUNLINK);
|
||||
}
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
|
||||
XVA_SET_REQ(xvap, XAT_IMMUTABLE);
|
||||
}
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
|
||||
XVA_SET_REQ(xvap, XAT_NODUMP);
|
||||
}
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
|
||||
XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
|
||||
}
|
||||
if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
|
||||
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
|
||||
}
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
|
||||
size_t len;
|
||||
dmu_object_info_t doi;
|
||||
@@ -3104,7 +3180,7 @@ top:
|
||||
if (tzp)
|
||||
dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (zl != NULL)
|
||||
zfs_rename_unlock(&zl);
|
||||
@@ -3113,7 +3189,7 @@ top:
|
||||
VN_RELE(ZTOV(szp));
|
||||
if (tzp)
|
||||
VN_RELE(ZTOV(tzp));
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -3242,10 +3318,10 @@ top:
|
||||
FUID_SIZE_ESTIMATE(zfsvfs));
|
||||
}
|
||||
}
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -3462,10 +3538,10 @@ top:
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, szp->z_id);
|
||||
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -3547,7 +3623,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
|
||||
len = PAGESIZE;
|
||||
/*
|
||||
* If our blocksize is bigger than the page size, try to kluster
|
||||
* muiltiple pages so that we write a full block (thus avoiding
|
||||
* multiple pages so that we write a full block (thus avoiding
|
||||
* a read-modify-write).
|
||||
*/
|
||||
if (off < filesz && zp->z_blksz > PAGESIZE) {
|
||||
@@ -3589,9 +3665,9 @@ top:
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_write(tx, zp->z_id, off, len);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
err = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
err = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (err != 0) {
|
||||
if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (err == ERESTART) {
|
||||
zfs_range_unlock(rl);
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
|
||||
@@ -734,7 +734,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
|
||||
|
||||
ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
|
||||
|
||||
if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
|
||||
if (zfsvfs->z_replay) {
|
||||
obj = vap->va_nodeid;
|
||||
flag |= IS_REPLAY;
|
||||
now = vap->va_ctime; /* see zfs_replay_create() */
|
||||
@@ -1254,9 +1254,9 @@ top:
|
||||
newblksz = 0;
|
||||
}
|
||||
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -1358,9 +1358,9 @@ zfs_trunc(znode_t *zp, uint64_t end)
|
||||
top:
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto top;
|
||||
@@ -1456,9 +1456,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
|
||||
log:
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
error = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
if (error) {
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
if (error == ERESTART) {
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
goto log;
|
||||
@@ -1562,7 +1562,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
|
||||
bzero(&zfsvfs, sizeof (zfsvfs_t));
|
||||
|
||||
zfsvfs.z_os = os;
|
||||
zfsvfs.z_assign = TXG_NOWAIT;
|
||||
zfsvfs.z_parent = &zfsvfs;
|
||||
zfsvfs.z_version = version;
|
||||
zfsvfs.z_use_fuids = USE_FUIDS(version, os);
|
||||
|
||||
+33
-95
@@ -351,14 +351,20 @@ zil_create(zilog_t *zilog)
|
||||
blk = zh->zh_log;
|
||||
|
||||
/*
|
||||
* If we don't already have an initial log block, allocate one now.
|
||||
* If we don't already have an initial log block or we have one
|
||||
* but it's the wrong endianness then allocate one.
|
||||
*/
|
||||
if (BP_IS_HOLE(&blk)) {
|
||||
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
|
||||
tx = dmu_tx_create(zilog->zl_os);
|
||||
(void) dmu_tx_assign(tx, TXG_WAIT);
|
||||
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
|
||||
txg = dmu_tx_get_txg(tx);
|
||||
|
||||
if (!BP_IS_HOLE(&blk)) {
|
||||
zio_free_blk(zilog->zl_spa, &blk, txg);
|
||||
BP_ZERO(&blk);
|
||||
}
|
||||
|
||||
error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
|
||||
NULL, txg);
|
||||
|
||||
@@ -1214,7 +1220,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
|
||||
|
||||
ASSERT(zilog->zl_stop_sync == 0);
|
||||
|
||||
zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
|
||||
zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
|
||||
|
||||
if (zilog->zl_destroy_txg == txg) {
|
||||
blkptr_t blk = zh->zh_log;
|
||||
@@ -1223,7 +1229,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
|
||||
ASSERT(spa_sync_pass(spa) == 1);
|
||||
|
||||
bzero(zh, sizeof (zil_header_t));
|
||||
bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
|
||||
bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
|
||||
|
||||
if (zilog->zl_keep_first) {
|
||||
/*
|
||||
@@ -1460,9 +1466,7 @@ zil_resume(zilog_t *zilog)
|
||||
typedef struct zil_replay_arg {
|
||||
objset_t *zr_os;
|
||||
zil_replay_func_t **zr_replay;
|
||||
zil_replay_cleaner_t *zr_replay_cleaner;
|
||||
void *zr_arg;
|
||||
uint64_t *zr_txgp;
|
||||
boolean_t zr_byteswap;
|
||||
char *zr_lrbuf;
|
||||
} zil_replay_arg_t;
|
||||
@@ -1475,9 +1479,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
|
||||
uint64_t reclen = lr->lrc_reclen;
|
||||
uint64_t txtype = lr->lrc_txtype;
|
||||
char *name;
|
||||
int pass, error, sunk;
|
||||
int pass, error;
|
||||
|
||||
if (zilog->zl_stop_replay)
|
||||
if (!zilog->zl_replay) /* giving up */
|
||||
return;
|
||||
|
||||
if (lr->lrc_txg < claim_txg) /* already committed */
|
||||
@@ -1489,6 +1493,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
|
||||
/* Strip case-insensitive bit, still present in log record */
|
||||
txtype &= ~TX_CI;
|
||||
|
||||
if (txtype == 0 || txtype >= TX_MAX_TYPE) {
|
||||
error = EINVAL;
|
||||
goto bad;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make a copy of the data so we can revise and extend it.
|
||||
*/
|
||||
@@ -1538,70 +1547,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Replay of large truncates can end up needing additional txs
|
||||
* and a different txg. If they are nested within the replay tx
|
||||
* as below then a hang is possible. So we do the truncate here
|
||||
* and redo the truncate later (a no-op) and update the sequence
|
||||
* number whilst in the replay tx. Fortunately, it's safe to repeat
|
||||
* a truncate if we crash and the truncate commits. A create over
|
||||
* an existing file will also come in as a TX_TRUNCATE record.
|
||||
*
|
||||
* Note, remove of large files and renames over large files is
|
||||
* handled by putting the deleted object on a stable list
|
||||
* and if necessary force deleting the object outside of the replay
|
||||
* transaction using the zr_replay_cleaner.
|
||||
*/
|
||||
if (txtype == TX_TRUNCATE) {
|
||||
*zr->zr_txgp = TXG_NOWAIT;
|
||||
error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
|
||||
zr->zr_byteswap);
|
||||
if (error)
|
||||
goto bad;
|
||||
zr->zr_byteswap = 0; /* only byteswap once */
|
||||
}
|
||||
|
||||
/*
|
||||
* We must now do two things atomically: replay this log record,
|
||||
* and update the log header to reflect the fact that we did so.
|
||||
* We use the DMU's ability to assign into a specific txg to do this.
|
||||
* and update the log header sequence number to reflect the fact that
|
||||
* we did so. At the end of each replay function the sequence number
|
||||
* is updated if we are in replay mode.
|
||||
*/
|
||||
for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
|
||||
uint64_t replay_txg;
|
||||
dmu_tx_t *replay_tx;
|
||||
|
||||
replay_tx = dmu_tx_create(zr->zr_os);
|
||||
error = dmu_tx_assign(replay_tx, TXG_WAIT);
|
||||
if (error) {
|
||||
dmu_tx_abort(replay_tx);
|
||||
break;
|
||||
}
|
||||
|
||||
replay_txg = dmu_tx_get_txg(replay_tx);
|
||||
|
||||
if (txtype == 0 || txtype >= TX_MAX_TYPE) {
|
||||
error = EINVAL;
|
||||
} else {
|
||||
/*
|
||||
* On the first pass, arrange for the replay vector
|
||||
* to fail its dmu_tx_assign(). That's the only way
|
||||
* to ensure that those code paths remain well tested.
|
||||
*
|
||||
* Only byteswap (if needed) on the 1st pass.
|
||||
*/
|
||||
*zr->zr_txgp = replay_txg - (pass == 1);
|
||||
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
|
||||
zr->zr_byteswap && pass == 1);
|
||||
*zr->zr_txgp = TXG_NOWAIT;
|
||||
}
|
||||
|
||||
if (error == 0) {
|
||||
dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
|
||||
zilog->zl_replay_seq[replay_txg & TXG_MASK] =
|
||||
lr->lrc_seq;
|
||||
}
|
||||
|
||||
dmu_tx_commit(replay_tx);
|
||||
for (pass = 1; pass <= 2; pass++) {
|
||||
zilog->zl_replaying_seq = lr->lrc_seq;
|
||||
/* Only byteswap (if needed) on the 1st pass. */
|
||||
error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
|
||||
zr->zr_byteswap && pass == 1);
|
||||
|
||||
if (!error)
|
||||
return;
|
||||
@@ -1609,37 +1565,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
|
||||
/*
|
||||
* The DMU's dnode layer doesn't see removes until the txg
|
||||
* commits, so a subsequent claim can spuriously fail with
|
||||
* EEXIST. So if we receive any error other than ERESTART
|
||||
* we try syncing out any removes then retrying the
|
||||
* transaction.
|
||||
* EEXIST. So if we receive any error we try syncing out
|
||||
* any removes then retry the transaction.
|
||||
*/
|
||||
if (error != ERESTART && !sunk) {
|
||||
if (zr->zr_replay_cleaner)
|
||||
zr->zr_replay_cleaner(zr->zr_arg);
|
||||
if (pass == 1)
|
||||
txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
|
||||
sunk = B_TRUE;
|
||||
continue; /* retry */
|
||||
}
|
||||
|
||||
if (error != ERESTART)
|
||||
break;
|
||||
|
||||
if (pass != 1)
|
||||
txg_wait_open(spa_get_dsl(zilog->zl_spa),
|
||||
replay_txg + 1);
|
||||
|
||||
dprintf("pass %d, retrying\n", pass);
|
||||
}
|
||||
|
||||
bad:
|
||||
ASSERT(error && error != ERESTART);
|
||||
ASSERT(error);
|
||||
name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
|
||||
dmu_objset_name(zr->zr_os, name);
|
||||
cmn_err(CE_WARN, "ZFS replay transaction error %d, "
|
||||
"dataset %s, seq 0x%llx, txtype %llu %s\n",
|
||||
error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
|
||||
(lr->lrc_txtype & TX_CI) ? "CI" : "");
|
||||
zilog->zl_stop_replay = 1;
|
||||
zilog->zl_replay = B_FALSE;
|
||||
kmem_free(name, MAXNAMELEN);
|
||||
}
|
||||
|
||||
@@ -1654,9 +1595,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
|
||||
* If this dataset has a non-empty intent log, replay it and destroy it.
|
||||
*/
|
||||
void
|
||||
zil_replay(objset_t *os, void *arg, uint64_t *txgp,
|
||||
zil_replay_func_t *replay_func[TX_MAX_TYPE],
|
||||
zil_replay_cleaner_t *replay_cleaner)
|
||||
zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
|
||||
{
|
||||
zilog_t *zilog = dmu_objset_zil(os);
|
||||
const zil_header_t *zh = zilog->zl_header;
|
||||
@@ -1669,9 +1608,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
|
||||
|
||||
zr.zr_os = os;
|
||||
zr.zr_replay = replay_func;
|
||||
zr.zr_replay_cleaner = replay_cleaner;
|
||||
zr.zr_arg = arg;
|
||||
zr.zr_txgp = txgp;
|
||||
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
|
||||
zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
|
||||
|
||||
@@ -1680,7 +1617,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
|
||||
*/
|
||||
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
||||
|
||||
zilog->zl_stop_replay = 0;
|
||||
zilog->zl_replay = B_TRUE;
|
||||
zilog->zl_replay_time = lbolt;
|
||||
ASSERT(zilog->zl_replay_blks == 0);
|
||||
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
|
||||
@@ -1689,6 +1626,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
|
||||
|
||||
zil_destroy(zilog, B_FALSE);
|
||||
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
|
||||
zilog->zl_replay = B_FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
+27
-3
@@ -767,7 +767,8 @@ zio_read_bp_init(zio_t *zio)
|
||||
{
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
|
||||
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
|
||||
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
|
||||
zio->io_logical == zio && !(zio->io_flags & ZIO_FLAG_RAW)) {
|
||||
uint64_t csize = BP_GET_PSIZE(bp);
|
||||
void *cbuf = zio_buf_alloc(csize);
|
||||
|
||||
@@ -1790,7 +1791,30 @@ zio_vdev_io_start(zio_t *zio)
|
||||
|
||||
ASSERT(P2PHASE(zio->io_offset, align) == 0);
|
||||
ASSERT(P2PHASE(zio->io_size, align) == 0);
|
||||
ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
|
||||
ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
|
||||
|
||||
/*
|
||||
* If this is a repair I/O, and there's no self-healing involved --
|
||||
* that is, we're just resilvering what we expect to resilver --
|
||||
* then don't do the I/O unless zio's txg is actually in vd's DTL.
|
||||
* This prevents spurious resilvering with nested replication.
|
||||
* For example, given a mirror of mirrors, (A+B)+(C+D), if only
|
||||
* A is out of date, we'll read from C+D, then use the data to
|
||||
* resilver A+B -- but we don't actually want to resilver B, just A.
|
||||
* The top-level mirror has no way to know this, so instead we just
|
||||
* discard unnecessary repairs as we work our way down the vdev tree.
|
||||
* The same logic applies to any form of nested replication:
|
||||
* ditto + mirror, RAID-Z + replacing, etc. This covers them all.
|
||||
*/
|
||||
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
|
||||
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
|
||||
zio->io_txg != 0 && /* not a delegated i/o */
|
||||
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
ASSERT(zio->io_delegate_list == NULL);
|
||||
zio_vdev_io_bypass(zio);
|
||||
return (ZIO_PIPELINE_CONTINUE);
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf &&
|
||||
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
|
||||
@@ -1806,7 +1830,6 @@ zio_vdev_io_start(zio_t *zio)
|
||||
zio_interrupt(zio);
|
||||
return (ZIO_PIPELINE_STOP);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return (vd->vdev_ops->vdev_op_io_start(zio));
|
||||
@@ -2157,6 +2180,7 @@ zio_done(zio_t *zio)
|
||||
if ((zio->io_type == ZIO_TYPE_READ ||
|
||||
zio->io_type == ZIO_TYPE_FREE) &&
|
||||
zio->io_error == ENXIO &&
|
||||
spa->spa_load_state == SPA_LOAD_NONE &&
|
||||
spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
|
||||
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
|
||||
|
||||
|
||||
+12
-4
@@ -75,6 +75,7 @@
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/zvol.h>
|
||||
#include <sys/dumphdr.h>
|
||||
#include <sys/zil_impl.h>
|
||||
|
||||
#include "zfs_namecheck.h"
|
||||
|
||||
@@ -113,7 +114,6 @@ typedef struct zvol_state {
|
||||
uint32_t zv_total_opens; /* total open count */
|
||||
zilog_t *zv_zilog; /* ZIL handle */
|
||||
list_t zv_extents; /* List of extents for dump */
|
||||
uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
|
||||
znode_t zv_znode; /* for range locking */
|
||||
} zvol_state_t;
|
||||
|
||||
@@ -381,7 +381,7 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
|
||||
|
||||
tx = dmu_tx_create(os);
|
||||
dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
|
||||
error = dmu_tx_assign(tx, zv->zv_txg_assign);
|
||||
error = dmu_tx_assign(tx, TXG_WAIT);
|
||||
if (error) {
|
||||
dmu_tx_abort(tx);
|
||||
} else {
|
||||
@@ -558,7 +558,7 @@ zvol_create_minor(const char *name, major_t maj)
|
||||
ASSERT(error == 0);
|
||||
zv->zv_volblocksize = doi.doi_data_block_size;
|
||||
|
||||
zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
|
||||
zil_replay(os, zv, zvol_replay_vector);
|
||||
zvol_size_changed(zv, maj);
|
||||
|
||||
/* XXX this should handle the possible i/o error */
|
||||
@@ -971,8 +971,16 @@ static void
|
||||
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
|
||||
{
|
||||
uint32_t blocksize = zv->zv_volblocksize;
|
||||
zilog_t *zilog = zv->zv_zilog;
|
||||
lr_write_t *lr;
|
||||
|
||||
if (zilog->zl_replay) {
|
||||
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
|
||||
zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
|
||||
zilog->zl_replaying_seq;
|
||||
return;
|
||||
}
|
||||
|
||||
while (len) {
|
||||
ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
|
||||
itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
|
||||
@@ -987,7 +995,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
|
||||
lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
|
||||
BP_ZERO(&lr->lr_blkptr);
|
||||
|
||||
(void) zil_itx_assign(zv->zv_zilog, itx, tx);
|
||||
(void) zil_itx_assign(zilog, itx, tx);
|
||||
len -= nbytes;
|
||||
off += nbytes;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user