2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Eric Schrock <Eric.Schrock@delphix.com>

References:
  illumos/illumos-gate@53089ab7c8
  illumos/illumos-gate@ad135b5d64
  illumos changeset: 13700:2889e2596bd6
  https://www.illumos.org/issues/2619
  https://www.illumos.org/issues/2747

NOTE: The grub specific changes were not ported.  This change
must be made to the Linux grub packages.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Christopher Siden
2012-12-13 15:24:15 -08:00
committed by Brian Behlendorf
parent 15313c5e18
commit 9ae529ec5d
67 changed files with 4267 additions and 467 deletions
+126 -59
View File
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
@@ -35,6 +35,7 @@
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
@@ -102,7 +103,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
if (BP_IS_HOLE(bp))
return;
ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
if (ds == NULL) {
/*
* Account for the meta-objset space in its placeholder
@@ -119,7 +120,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
ds->ds_phys->ds_used_bytes += used;
ds->ds_phys->ds_referenced_bytes += used;
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
@@ -215,8 +216,8 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
}
}
mutex_enter(&ds->ds_lock);
ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
ds->ds_phys->ds_used_bytes -= used;
ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
ds->ds_phys->ds_referenced_bytes -= used;
ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
ds->ds_phys->ds_compressed_bytes -= compressed;
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
@@ -823,8 +824,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
origin->ds_phys->ds_creation_txg;
dsphys->ds_used_bytes =
origin->ds_phys->ds_used_bytes;
dsphys->ds_referenced_bytes =
origin->ds_phys->ds_referenced_bytes;
dsphys->ds_compressed_bytes =
origin->ds_phys->ds_compressed_bytes;
dsphys->ds_uncompressed_bytes =
@@ -938,7 +939,6 @@ dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
dsl_dataset_t *ds;
int err;
err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
if (err == 0) {
@@ -1088,19 +1088,23 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
goto out_free;
/*
* remove the objects in open context, so that we won't
* have too much to do in syncing context.
* If async destruction is not enabled try to remove all objects
* while in the open context so that there is less work to do in
* the syncing context.
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
ds->ds_phys->ds_prev_snap_txg)) {
/*
* Ignore errors, if there is not enough disk space
* we will deal with it in dsl_dataset_destroy_sync().
*/
(void) dmu_free_object(os, obj);
if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
ds->ds_phys->ds_prev_snap_txg)) {
/*
* Ignore errors, if there is not enough disk space
* we will deal with it in dsl_dataset_destroy_sync().
*/
(void) dmu_free_object(os, obj);
}
if (err != ESRCH)
goto out_free;
}
if (err != ESRCH)
goto out_free;
/*
* Only the ZIL knows how to free log blocks.
@@ -1261,7 +1265,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
ASSERT(!dsl_dataset_is_snapshot(ds));
if (ds->ds_phys->ds_prev_snap_obj != 0)
mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
else
mrs_used = 0;
@@ -1269,7 +1273,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
ASSERT3U(dlused, <=, mrs_used);
ds->ds_phys->ds_unique_bytes =
ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
@@ -1627,12 +1631,36 @@ process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
ds_next->ds_phys->ds_deadlist_obj);
}
static int
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
int err;
struct killarg ka;
/*
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
* NB: this should be very quick, because we already
* freed all the objects in open context.
*/
ka.ds = ds;
ka.tx = tx;
err = traverse_dataset(ds,
ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
return (err);
}
void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
struct dsl_ds_destroyarg *dsda = arg1;
dsl_dataset_t *ds = dsda->ds;
int err;
int err = 0;
int after_branch_point = FALSE;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
objset_t *mos = dp->dp_meta_objset;
@@ -1773,7 +1801,6 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
tx);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
DD_USED_HEAD, used, comp, uncomp, tx);
dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
/* Merge our deadlist into next's and free it. */
dsl_deadlist_merge(&ds_next->ds_deadlist,
@@ -1849,32 +1876,54 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
}
dsl_dataset_rele(ds_next, FTAG);
} else {
zfeature_info_t *async_destroy =
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
/*
* There's no next snapshot, so this is a head dataset.
* Destroy the deadlist. Unless it's a clone, the
* deadlist should be empty. (If it's a clone, it's
* safe to ignore the deadlist contents.)
*/
struct killarg ka;
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj = 0;
/*
* Free everything that we point to (that's born after
* the previous snapshot, if we are a clone)
*
* NB: this should be very quick, because we already
* freed all the objects in open context.
*/
ka.ds = ds;
ka.tx = tx;
err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
TRAVERSE_POST, kill_blkptr, &ka);
ASSERT3U(err, ==, 0);
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == 0);
if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
err = old_synchronous_dataset_destroy(ds, tx);
} else {
/*
* Move the bptree into the pool's list of trees to
* clean up and update space accounting information.
*/
uint64_t used, comp, uncomp;
ASSERT(err == 0 || err == EBUSY);
if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
spa_feature_incr(dp->dp_spa, async_destroy, tx);
dp->dp_bptree_obj = bptree_alloc(
dp->dp_meta_objset, tx);
VERIFY(zap_add(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj, tx) == 0);
}
used = ds->ds_dir->dd_phys->dd_used_bytes;
comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
ds->ds_phys->ds_unique_bytes == used);
bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
&ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
used, comp, uncomp, tx);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-used, -comp, -uncomp, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
used, comp, uncomp, tx);
}
if (ds->ds_prev != NULL) {
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
@@ -2065,7 +2114,7 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = crtxg;
dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
dsphys->ds_flags = ds->ds_phys->ds_flags;
@@ -2189,10 +2238,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
zap_cursor_advance(&zc)) {
dsl_dataset_t *clone;
char buf[ZFS_MAXNAMELEN];
/*
* Even though we hold the dp_config_rwlock, the dataset
* may fail to open, returning ENOENT. If there is a
* thread concurrently attempting to destroy this
* dataset, it will have the ds_rwlock held for
* RW_WRITER. Our call to dsl_dataset_hold_obj() ->
* dsl_dataset_hold_ref() will fail its
* rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
* dp_config_rwlock, and wait for the destroy progress
* and signal ds_exclusive_cv. If the destroy was
* successful, we will see that
* DSL_DATASET_IS_DESTROYED(), and return ENOENT.
*/
if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
za.za_first_integer, FTAG, &clone) != 0) {
goto fail;
}
za.za_first_integer, FTAG, &clone) != 0)
continue;
dsl_dir_name(clone->ds_dir, buf);
VERIFY(nvlist_add_boolean(val, buf) == 0);
dsl_dataset_rele(clone, FTAG);
@@ -2316,7 +2377,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
*refdbytesp = ds->ds_phys->ds_used_bytes;
*refdbytesp = ds->ds_phys->ds_referenced_bytes;
*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
@@ -2652,7 +2713,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
pa->used = origin_ds->ds_phys->ds_used_bytes;
pa->used = origin_ds->ds_phys->ds_referenced_bytes;
pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
for (snap = list_head(&pa->shared_snaps); snap;
@@ -2686,7 +2747,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
* so we need to subtract out the clone origin's used space.
*/
if (pa->origin_origin) {
pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
}
@@ -3203,8 +3264,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
dsl_deadlist_space(&csa->ohds->ds_deadlist,
&odl_used, &odl_comp, &odl_uncomp);
dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
(csa->ohds->ds_phys->ds_used_bytes + odl_used);
dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
(csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
(csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
@@ -3233,8 +3294,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
}
/* swap ds_*_bytes */
SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
csa->cds->ds_phys->ds_used_bytes);
SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
csa->cds->ds_phys->ds_referenced_bytes);
SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
csa->cds->ds_phys->ds_compressed_bytes);
SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
@@ -3363,8 +3424,9 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
if (inflight > 0 ||
ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
error = ERESTART;
else
error = EDQUOT;
@@ -3393,7 +3455,7 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
if (psa->psa_effective_value == 0)
return (0);
if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
psa->psa_effective_value < ds->ds_reserved)
return (ENOSPC);
@@ -4141,8 +4203,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
dsl_pool_t *dp = new->ds_dir->dd_pool;
*usedp = 0;
*usedp += new->ds_phys->ds_used_bytes;
*usedp -= oldsnap->ds_phys->ds_used_bytes;
*usedp += new->ds_phys->ds_referenced_bytes;
*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
*compp = 0;
*compp += new->ds_phys->ds_compressed_bytes;
@@ -4158,9 +4220,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
dsl_dataset_t *snap;
uint64_t used, comp, uncomp;
err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
if (err != 0)
break;
if (snapobj == new->ds_object) {
snap = new;
} else {
err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
if (err != 0)
break;
}
if (snap->ds_phys->ds_prev_snap_txg ==
oldsnap->ds_phys->ds_creation_txg) {
@@ -4189,7 +4255,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
* was not a snapshot of/before new.
*/
snapobj = snap->ds_phys->ds_prev_snap_obj;
dsl_dataset_rele(snap, FTAG);
if (snap != new)
dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
err = EINVAL;
break;