Illumos 3897 - zfs filesystem and snapshot limits

3897 zfs filesystem and snapshot limits
Author: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Christopher Siden <christopher.siden@delphix.com>

References:
  https://www.illumos.org/issues/3897
  https://github.com/illumos/illumos-gate/commit/a2afb61

Porting Notes:

dsl_dataset_snapshot_check(): reduce stack usage using kmem_alloc().

Ported-by: Chris Dunlop <chris@onthe.net.au>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Jerry Jelinek
2015-04-02 00:07:48 +11:00
committed by Brian Behlendorf
parent 308a451f7f
commit 788eb90c4c
19 changed files with 1021 additions and 27 deletions
+154 -9
View File
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 RackTop Systems.
*/
@@ -318,7 +318,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
}
int
dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
boolean_t adj_cnt)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -335,6 +336,11 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
err = zap_remove_norm(mos, snapobj, name, mt, tx);
if (err == ENOTSUP && mt == MT_FIRST)
err = zap_remove(mos, snapobj, name, tx);
if (err == 0 && adj_cnt)
dsl_fs_ss_count_adjust(ds->ds_dir, -1,
DD_FIELD_SNAPSHOT_COUNT, tx);
return (err);
}
@@ -767,6 +773,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_deleg_set_create_perms(dd, tx, cr);
/*
* Since we're creating a new node we know it's a leaf, so we can
* initialize the counts if the limit feature is active.
*/
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
uint64_t cnt = 0;
objset_t *os = dd->dd_pool->dp_meta_objset;
dsl_dir_zapify(dd, tx);
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
sizeof (cnt), 1, &cnt, tx));
VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
sizeof (cnt), 1, &cnt, tx));
}
dsl_dir_rele(dd, FTAG);
/*
@@ -935,11 +956,12 @@ typedef struct dsl_dataset_snapshot_arg {
nvlist_t *ddsa_snaps;
nvlist_t *ddsa_props;
nvlist_t *ddsa_errors;
cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;
int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx, boolean_t recv)
dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
{
int error;
uint64_t value;
@@ -977,6 +999,18 @@ dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
if (!recv && DS_IS_INCONSISTENT(ds))
return (SET_ERROR(EBUSY));
/*
* Skip the check for temporary snapshots or if we have already checked
* the counts in dsl_dataset_snapshot_check. This means we really only
* check the count here when we're receiving a stream.
*/
if (cnt != 0 && cr != NULL) {
error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
if (error != 0)
return (error);
}
error = dsl_dataset_snapshot_reserve_space(ds, tx);
if (error != 0)
return (error);
@@ -992,6 +1026,103 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
nvpair_t *pair;
int rv = 0;
/*
* Pre-compute how many total new snapshots will be created for each
* level in the tree and below. This is needed for validating the
* snapshot limit when either taking a recursive snapshot or when
* taking multiple snapshots.
*
* The problem is that the counts are not actually adjusted when
* we are checking, only when we finally sync. For a single snapshot,
* this is easy, the count will increase by 1 at each node up the tree,
* but its more complicated for the recursive/multiple snapshot case.
*
* The dsl_fs_ss_limit_check function does recursively check the count
* at each level up the tree but since it is validating each snapshot
* independently we need to be sure that we are validating the complete
* count for the entire set of snapshots. We do this by rolling up the
* counts for each component of the name into an nvlist and then
* checking each of those cases with the aggregated count.
*
* This approach properly handles not only the recursive snapshot
* case (where we get all of those on the ddsa_snaps list) but also
* the sibling case (e.g. snapshot a/b and a/c so that we will also
* validate the limit on 'a' using a count of 2).
*
* We validate the snapshot names in the third loop and only report
* name errors once.
*/
if (dmu_tx_is_syncing(tx)) {
char *nm;
nvlist_t *cnt_track = NULL;
cnt_track = fnvlist_alloc();
nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
/* Rollup aggregated counts into the cnt_track list */
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL;
pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
char *pdelim;
uint64_t val;
(void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
pdelim = strchr(nm, '@');
if (pdelim == NULL)
continue;
*pdelim = '\0';
do {
if (nvlist_lookup_uint64(cnt_track, nm,
&val) == 0) {
/* update existing entry */
fnvlist_add_uint64(cnt_track, nm,
val + 1);
} else {
/* add to list */
fnvlist_add_uint64(cnt_track, nm, 1);
}
pdelim = strrchr(nm, '/');
if (pdelim != NULL)
*pdelim = '\0';
} while (pdelim != NULL);
}
kmem_free(nm, MAXPATHLEN);
/* Check aggregated counts at each level */
for (pair = nvlist_next_nvpair(cnt_track, NULL);
pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
int error = 0;
char *name;
uint64_t cnt = 0;
dsl_dataset_t *ds;
name = nvpair_name(pair);
cnt = fnvpair_value_uint64(pair);
ASSERT(cnt > 0);
error = dsl_dataset_hold(dp, name, FTAG, &ds);
if (error == 0) {
error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
ZFS_PROP_SNAPSHOT_LIMIT, NULL,
ddsa->ddsa_cr);
dsl_dataset_rele(ds, FTAG);
}
if (error != 0) {
if (ddsa->ddsa_errors != NULL)
fnvlist_add_int32(ddsa->ddsa_errors,
name, error);
rv = error;
/* only report one error for this check */
break;
}
}
nvlist_free(cnt_track);
}
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
int error = 0;
@@ -1012,8 +1143,9 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
if (error == 0)
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
if (error == 0) {
/* passing 0/NULL skips dsl_fs_ss_limit_check */
error = dsl_dataset_snapshot_check_impl(ds,
atp + 1, tx, B_FALSE);
atp + 1, tx, B_FALSE, 0, NULL);
dsl_dataset_rele(ds, FTAG);
}
@@ -1025,6 +1157,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
rv = error;
}
}
return (rv);
}
@@ -1051,6 +1184,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
bcmp(&os->os_phys->os_zil_header, &zero_zil,
sizeof (zero_zil)) == 0);
dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
@@ -1227,6 +1361,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
ddsa.ddsa_snaps = snaps;
ddsa.ddsa_props = props;
ddsa.ddsa_errors = errors;
ddsa.ddsa_cr = CRED();
if (error == 0) {
error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
@@ -1275,8 +1410,9 @@ dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
if (error != 0)
return (error);
/* NULL cred means no limit check for tmp snapshot */
error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
tx, B_FALSE);
tx, B_FALSE, 0, NULL);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
@@ -1644,7 +1780,8 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
spa_history_log_internal_ds(ds, "rename", tx,
"-> @%s", ddrsa->ddrsa_newsnapname);
VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
B_FALSE));
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
mutex_exit(&ds->ds_lock);
@@ -1896,6 +2033,7 @@ typedef struct dsl_dataset_promote_arg {
dsl_dataset_t *origin_origin; /* origin of the origin */
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
char *err_ds;
cred_t *cr;
} dsl_dataset_promote_arg_t;
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -1913,6 +2051,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
dsl_dataset_t *origin_ds;
int err;
uint64_t unused;
uint64_t ss_mv_cnt;
err = promote_hold(ddpa, dp, FTAG);
if (err != 0)
@@ -1959,6 +2098,7 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
ss_mv_cnt = 0;
ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
@@ -1967,6 +2107,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
uint64_t val, dlused, dlcomp, dluncomp;
dsl_dataset_t *ds = snap->ds;
ss_mv_cnt++;
/*
* If there are long holds, we won't be able to evict
* the objset.
@@ -2009,9 +2151,9 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
}
/* Check that there is enough space here */
/* Check that there is enough space and limit headroom here */
err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
ddpa->used);
0, ss_mv_cnt, ddpa->used, ddpa->cr);
if (err != 0)
goto out;
@@ -2151,10 +2293,12 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
/* move snap name entry */
VERIFY0(dsl_dataset_get_snapname(ds));
VERIFY0(dsl_dataset_snap_remove(origin_head,
ds->ds_snapname, tx));
ds->ds_snapname, tx, B_TRUE));
VERIFY0(zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
dsl_fs_ss_count_adjust(hds->ds_dir, 1,
DD_FIELD_SNAPSHOT_COUNT, tx);
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -2392,6 +2536,7 @@ dsl_dataset_promote(const char *name, char *conflsnap)
ddpa.ddpa_clonename = name;
ddpa.err_ds = conflsnap;
ddpa.cr = CRED();
return (dsl_sync_task(name, dsl_dataset_promote_check,
dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));