From b0bc7a84d90dcbf5321d48c5b24ed771c5a128b0 Mon Sep 17 00:00:00 2001 From: Max Grossman Date: Mon, 9 Dec 2013 10:37:51 -0800 Subject: [PATCH] Illumos 4370, 4371 4370 avoid transmitting holes during zfs send 4371 DMU code clean up Reviewed by: Matthew Ahrens Reviewed by: George Wilson Reviewed by: Christopher Siden Reviewed by: Josef 'Jeff' Sipek Approved by: Garrett D'Amore a References: https://www.illumos.org/issues/4370 https://www.illumos.org/issues/4371 https://github.com/illumos/illumos-gate/commit/43466aa Ported by: Tim Chase Signed-off-by: Brian Behlendorf Closes #2529 --- cmd/zdb/zdb.c | 83 ++++++++------- cmd/zdb/zdb_il.c | 14 ++- cmd/zhack/zhack.c | 14 ++- include/sys/dbuf.h | 18 +--- include/sys/dmu.h | 1 + include/sys/spa.h | 84 ++++++++++------ include/sys/spa_impl.h | 4 + include/sys/vdev.h | 4 +- include/sys/zfeature.h | 8 ++ include/zfeature_common.h | 7 ++ man/man5/zpool-features.5 | 65 ++++++++++++ module/zfs/arc.c | 12 ++- module/zfs/bptree.c | 2 +- module/zfs/dbuf.c | 135 ++++++++++--------------- module/zfs/ddt.c | 17 ++-- module/zfs/dmu.c | 6 +- module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_send.c | 5 +- module/zfs/dmu_traverse.c | 39 +++++-- module/zfs/dnode.c | 53 ++++------ module/zfs/dnode_sync.c | 190 +++++++++++++++++++---------------- module/zfs/dsl_dataset.c | 12 +-- module/zfs/dsl_destroy.c | 4 +- module/zfs/dsl_scan.c | 7 +- module/zfs/spa.c | 32 +++++- module/zfs/spa_misc.c | 29 +++++- module/zfs/vdev_cache.c | 20 ++-- module/zfs/zfeature.c | 122 +++++++++++++++++++--- module/zfs/zfeature_common.c | 47 +++++++-- module/zfs/zfs_znode.c | 14 +-- module/zfs/zil.c | 6 +- module/zfs/zio.c | 19 +++- 32 files changed, 690 insertions(+), 385 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 60cd81222..b29cddb6e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -767,7 +767,7 @@ dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - sprintf_blkptr(blkbuf, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); @@ -1036,32 +1036,40 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) } static void -sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp) +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, buflen, bp); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) - (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - (void) sprintf(blkbuf + strlen(blkbuf), - "%llxL/%llxP F=%llu B=%llu/%llu", - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth, - (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "B=%llu", + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)bp->blk_fill, + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + } } static void @@ -1086,7 +1094,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, } } - sprintf_blkptr_compact(blkbuf, bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } @@ -1101,7 +1109,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, print_indirect(bp, zb, dnp); - if (BP_GET_LEVEL(bp) > 0) { + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { uint32_t flags = ARC_WAIT; int i; blkptr_t *cbp; @@ -1226,7 +1234,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) zdb_nicenum(ds->ds_compressed_bytes, compressed); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); zdb_nicenum(ds->ds_unique_bytes, unique); - sprintf_blkptr(blkbuf, &ds->ds_bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); @@ -1271,7 +1279,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) char blkbuf[BP_SPRINTF_LEN]; if (bp->blk_birth != 0) { - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); @@ -1309,7 +1317,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - sprintf_blkptr_compact(blkbuf, bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); return (0); } @@ -1865,8 +1873,9 @@ dump_dir(objset_t *os) zdb_nicenum(refdbytes, numbuf); if (verbosity >= 4) { - (void) sprintf(blkbuf, ", rootbp "); - (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp); + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } @@ -1896,7 +1905,7 @@ dump_dir(objset_t *os) if (verbosity < 2) return; - if (os->os_rootbp->blk_birth == 0) + if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header); @@ -1937,7 +1946,7 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, &ub->ub_rootbp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("%s", footer ? footer : ""); @@ -2245,7 +2254,7 @@ zdb_blkptr_done(zio_t *zio) zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; @@ -2267,11 +2276,22 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; - char blkbuf[BP_SPRINTF_LEN]; dmu_object_type_t type; boolean_t is_metadata; - if (bp == NULL) + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp)) return (0); type = BP_GET_TYPE(bp); @@ -2302,17 +2322,6 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_readfails = 0; - if (dump_opt['b'] >= 5) { - sprintf_blkptr(blkbuf, bp); - (void) printf("objset %llu object %llu " - "level %lld offset 0x%llx %s\n", - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (u_longlong_t)blkid2offset(dnp, bp, zb), - blkbuf); - } - if (dump_opt['b'] < 5 && isatty(STDERR_FILENO) && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); @@ -2473,7 +2482,7 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } @@ -2709,7 +2718,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (bp == NULL) + if (BP_IS_HOLE(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { @@ -2879,7 +2888,7 @@ zdb_print_blkptr(blkptr_t *bp, int flags) if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index bc167cee6..371a09bcd 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + /* * Print intent log header and statistics. */ @@ -48,7 +52,7 @@ print_log_bp(const blkptr_t *bp, const char *prefix) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s%s\n", prefix, blkbuf); } @@ -133,6 +137,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, + !BP_IS_HOLE(bp) && bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, prefix); @@ -140,8 +145,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - } - if (bp->blk_birth == 0) { bzero(buf, sizeof (buf)); (void) printf("%s\n", prefix); return; @@ -314,7 +317,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) if (verbose >= 5) { (void) strcpy(blkbuf, ", "); - sprintf_blkptr(blkbuf + strlen(blkbuf), bp); + snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), bp); } else { blkbuf[0] = '\0'; } @@ -362,7 +366,7 @@ dump_intent_log(zilog_t *zilog) int verbose = MAX(dump_opt['d'], dump_opt['i']); int i; - if (zh->zh_log.blk_birth == 0 || verbose < 1) + if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) return; (void) printf("\n ZIL header: claim_txg %llu, " diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 9abfc0ed4..19493bd09 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -277,6 +277,9 @@ zhack_do_feature_stat(int argc, char **argv) dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); + } dump_mos(spa); spa_close(spa, FTAG); @@ -313,7 +316,9 @@ zhack_do_feature_enable(int argc, char **argv) feature.fi_uname = "zhack"; feature.fi_mos = B_FALSE; feature.fi_can_readonly = B_FALSE; + feature.fi_activate_on_enable = B_FALSE; feature.fi_depends = nodeps; + feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "rmd:")) != -1) { @@ -371,7 +376,7 @@ feature_incr_sync(void *arg, dmu_tx_t *tx) zfeature_info_t *feature = arg; uint64_t refcount; - VERIFY0(feature_get_refcount(spa, feature, &refcount)); + VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount + 1, tx); spa_history_log_internal(spa, "zhack feature incr", tx, "name=%s", feature->fi_guid); @@ -384,7 +389,7 @@ feature_decr_sync(void *arg, dmu_tx_t *tx) zfeature_info_t *feature = arg; uint64_t refcount; - VERIFY0(feature_get_refcount(spa, feature, &refcount)); + VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount - 1, tx); spa_history_log_internal(spa, "zhack feature decr", tx, "name=%s", feature->fi_guid); @@ -411,6 +416,7 @@ zhack_do_feature_ref(int argc, char **argv) feature.fi_mos = B_FALSE; feature.fi_desc = NULL; feature.fi_depends = nodeps; + feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "md")) != -1) { @@ -459,8 +465,8 @@ zhack_do_feature_ref(int argc, char **argv) if (decr) { uint64_t count; - if (feature_get_refcount(spa, &feature, &count) == 0 && - count != 0) { + if (feature_get_refcount_from_disk(spa, &feature, + &count) == 0 && count != 0) { fatal(spa, FTAG, "feature refcount already 0: %s", feature.fi_guid); } diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 23b919bf7..9446d9691 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -266,8 +266,6 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); @@ -295,20 +293,6 @@ void dbuf_stats_destroy(void); #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) #define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) #define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) -#define DB_GET_SPA(_spa_p, _db) { \ - dnode_t *__dn; \ - DB_DNODE_ENTER(_db); \ - __dn = DB_DNODE(_db); \ - *(_spa_p) = __dn->dn_objset->os_spa; \ - DB_DNODE_EXIT(_db); \ -} -#define DB_GET_OBJSET(_os_p, _db) { \ - dnode_t *__dn; \ - DB_DNODE_ENTER(_db); \ - __dn = DB_DNODE(_db); \ - *(_os_p) = __dn->dn_objset; \ - DB_DNODE_EXIT(_db); \ -} void dbuf_init(void); void dbuf_fini(void); @@ -358,7 +342,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_PUSHPAGE); \ - sprintf_blkptr(__blkbuf, bp); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1314c1eed..8e25acf85 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -290,6 +290,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" +#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" diff --git a/include/sys/spa.h b/include/sys/spa.h index d5a91c2d3..5c754b0af 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ @@ -67,20 +67,33 @@ struct dsl_dataset; #define BF32_GET(x, low, len) BF32_DECODE(x, low, len) #define BF64_GET(x, low, len) BF64_DECODE(x, low, len) -#define BF32_SET(x, low, len, val) \ - ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) -#define BF64_SET(x, low, len, val) \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) #define BF32_GET_SB(x, low, len, shift, bias) \ ((BF32_GET(x, low, len) + (bias)) << (shift)) #define BF64_GET_SB(x, low, len, shift, bias) \ ((BF64_GET(x, low, len) + (bias)) << (shift)) -#define BF32_SET_SB(x, low, len, shift, bias, val) \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) -#define BF64_SET_SB(x, low, len, shift, bias, val) \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) /* * We currently support nine block sizes, from 512 bytes to 128K. @@ -188,6 +201,15 @@ typedef struct zio_cksum { #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +/* + * A block is a hole when it has either 1) never been written to, or + * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads + * without physically allocating disk space. Holes are represented in the + * blkptr_t structure by zeroed blk_dva. Correct checking for holes is + * done through the BP_IS_HOLE macro. For holes, the logical size, level, + * DMU object type, and birth times are all also stored for holes that + * were written to at some point (i.e. were punched after having been filled). + */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ @@ -202,9 +224,10 @@ typedef struct blkptr { * Macros to get and set fields in a bp or DVA. */ #define DVA_GET_ASIZE(dva) \ - BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) + BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ - BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) + BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ + SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) @@ -221,14 +244,14 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define BP_SET_LSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define BP_GET_PSIZE(bp) \ - BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) + BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define BP_SET_PSIZE(bp, x) \ - BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) #define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) @@ -248,7 +271,7 @@ typedef struct blkptr { #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ @@ -306,7 +329,9 @@ typedef struct blkptr { #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) -#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ + (dva)->dva_word[1] == 0ULL) +#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp)) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ @@ -329,14 +354,10 @@ typedef struct blkptr { ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -/* - * Note: the byteorder is either 0 or -1, both of which are palindromes. - * This simplifies the endianness handling a bit. - */ #ifdef _BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else -#define ZFS_HOST_BYTEORDER (-1ULL) +#define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) @@ -348,19 +369,23 @@ typedef struct blkptr { * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ -#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ +#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ - int size = BP_SPRINTF_LEN; \ int len = 0; \ int copies = 0; \ int d; \ \ if (bp == NULL) { \ - len = func(buf + len, size - len, ""); \ + len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ - len = func(buf + len, size - len, ""); \ + len += func(buf + len, size - len, ""); \ + if (bp->blk_birth > 0) { \ + len += func(buf + len, size - len, \ + " birth=%lluL", \ + (u_longlong_t)bp->blk_birth); \ + } \ } else { \ for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -642,7 +667,8 @@ extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ -extern void spa_activate_mos_feature(spa_t *spa, const char *feature); +extern void spa_activate_mos_feature(spa_t *spa, const char *feature, + dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); @@ -651,7 +677,7 @@ extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); -extern void sprintf_blkptr(char *buf, const blkptr_t *bp); +extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); @@ -721,7 +747,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_PUSHPAGE); \ - sprintf_blkptr(__blkbuf, (bp)); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 90a32d3f0..5e129a937 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -232,6 +233,9 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ + /* cache feature refcounts */ + uint64_t spa_feat_refcount_cache[SPA_FEATURES]; taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index f49086a47..bef6aaf8f 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_H @@ -109,7 +109,7 @@ extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern void vdev_cache_init(vdev_t *vd); extern void vdev_cache_fini(vdev_t *vd); -extern int vdev_cache_read(zio_t *zio); +extern boolean_t vdev_cache_read(zio_t *zio); extern void vdev_cache_write(zio_t *zio); extern void vdev_cache_purge(vdev_t *vd); diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h index c2ca63f7c..5abde149a 100644 --- a/include/sys/zfeature.h +++ b/include/sys/zfeature.h @@ -34,6 +34,10 @@ extern "C" { #endif +#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) +#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ + VALID_FEATURE_FID(fid)) + struct spa; struct dmu_tx; struct objset; @@ -45,6 +49,8 @@ extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); +extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, + uint64_t *txg); extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); @@ -53,6 +59,8 @@ extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); * use the above interfaces. */ extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); +extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res); extern void feature_enable_sync(struct spa *, zfeature_info_t *, struct dmu_tx *); extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 084153cb0..bc774184a 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -43,10 +43,14 @@ typedef enum spa_feature { SPA_FEATURE_EMPTY_BPOBJ, SPA_FEATURE_LZ4_COMPRESS, SPA_FEATURE_SPACEMAP_HISTOGRAM, + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURES } spa_feature_t; +#define SPA_FEATURE_DISABLED (-1ULL) + typedef struct zfeature_info { spa_feature_t fi_feature; const char *fi_uname; /* User-facing feature name */ @@ -54,6 +58,8 @@ typedef struct zfeature_info { const char *fi_desc; /* Feature description */ boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ + /* Activate this feature at the same time it is enabled */ + boolean_t fi_activate_on_enable; /* array of dependencies, terminated by SPA_FEATURE_NONE */ const spa_feature_t *fi_depends; } zfeature_info_t; @@ -68,6 +74,7 @@ extern boolean_t zfeature_is_valid_guid(const char *); extern boolean_t zfeature_is_supported(const char *); extern int zfeature_lookup_name(const char *name, spa_feature_t *res); +extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check); extern void zpool_feature_init(void); diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index b37bf98d8..60437c9bf 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -273,5 +273,70 @@ this feature are destroyed. .RE +.sp +.ne 2 +.na +\fB\fBenabled_txg\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:enabled_txg +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +Once this feature is enabled ZFS records the transaction group number +in which new features are enabled. This has no user-visible impact, +but other features may depend on this feature. + +This feature becomes \fBactive\fR as soon as it is enabled and will +never return to being \fBenabled\fB. + +.RE + +.sp +.ne 2 +.na +\fB\fBhole_birth\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:hole_birth +READ\-ONLY COMPATIBLE no +DEPENDENCIES enabled_txg +.TE + +This feature improves performance of incremental sends ("zfs send -i") +and receives for objects with many holes. The most common case of +hole-filled objects is zvols. + +An incremental send stream from snapshot \fBA\fR to snapshot \fBB\fR +contains information about every block that changed between \fBA\fR and +\fBB\fR. Blocks which did not change between those snapshots can be +identified and omitted from the stream using a piece of metadata called +the 'block birth time', but birth times are not recorded for holes (blocks +filled only with zeroes). Since holes created after \fBA\fR cannot be +distinguished from holes created before \fBA\fR, information about every +hole in the entire filesystem or zvol is included in the send stream. + +For workloads where holes are rare this is not a problem. However, when +incrementally replicating filesystems or zvols with many holes (for +example a zvol formatted with another filesystem) a lot of time will +be spent sending and receiving unnecessary information about holes that +already exist on the receiving side. + +Once the \fBhole_birth\fR feature has been enabled the block birth times +of all new holes will be recorded. Incremental sends between snapshots +created after this feature is enabled will use this new metadata to avoid +sending information about holes that already exist on the receiving side. + +This feature becomes \fBactive\fR as soon as it is enabled and will +never return to being \fBenabled\fB. + +.RE + + .SH "SEE ALSO" \fBzpool\fR(8) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 387faaf8d..7f6df0ae8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -795,7 +795,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_birth == 0) + (buf)->b_cksum0 == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -3854,9 +3854,13 @@ arc_write_done(zio_t *zio) ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + if (BP_IS_HOLE(zio->io_bp)) { + buf_discard_identity(hdr); + } else { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } } else { ASSERT(BUF_EMPTY(hdr)); } diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index c03cb1f84..83f365864 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -141,7 +141,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int err; struct bptree_args *ba = arg; - if (bp == NULL) + if (BP_IS_HOLE(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c70306c9a..f530e055b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -511,10 +511,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); - DB_GET_SPA(&spa, db); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { @@ -575,7 +574,6 @@ static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; - spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; @@ -615,9 +613,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); DB_DNODE_EXIT(db); + dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -625,7 +623,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) return; } - spa = dn->dn_objset->os_spa; DB_DNODE_EXIT(db); db->db_state = DB_READ; @@ -642,7 +639,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); - (void) arc_read(zio, spa, db->db_blkptr, + (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -654,8 +651,8 @@ int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; - int havepzio = (zio != NULL); - int prefetch; + boolean_t havepzio = (zio != NULL); + boolean_t prefetch; dnode_t *dn; /* @@ -750,11 +747,10 @@ dbuf_noread(dmu_buf_impl_t *db) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - DB_GET_SPA(&spa, db); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { @@ -809,9 +805,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; - DB_GET_SPA(&spa, db); dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { @@ -837,12 +832,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { - spa_t *spa; + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) + zio_free(db->db_objset->os_spa, txg, bp); - DB_GET_SPA(&spa, db); - zio_free(spa, txg, bp); - } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; @@ -860,9 +852,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. Also, if we happen across any level-1 dbufs in the - * range that have not already been marked dirty, mark them dirty so - * they stay in memory. + * empty blocks. * * This is a no-op if the dataset is in the middle of an incremental * receive; see comment below for details. @@ -872,14 +862,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t first_l1 = start >> epbs; - uint64_t last_l1 = end >> epbs; - if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) end = dn->dn_maxblkid; - last_l1 = end >> epbs; - } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); @@ -902,23 +887,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - if (db->db_level == 1 && - db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { - mutex_enter(&db->db_mtx); - if (db->db_last_dirty && - db->db_last_dirty->dr_txg < txg) { - dbuf_add_ref(db, FTAG); - mutex_exit(&db->db_mtx); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } else { - mutex_exit(&db->db_mtx); - } - } - if (db->db_level != 0) continue; - dprintf_dbuf(db, "found buf %s\n", ""); if (db->db_blkid < start || db->db_blkid > end) continue; @@ -995,24 +965,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db) * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. + * + * This logic ensures that only block births for + * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty) + if (db->db_last_dirty && (db->db_blkptr == NULL || + !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; - else if (db->db_blkptr) + } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; + } /* - * If we don't exist or are in a snapshot, we can't be freed. + * If this block don't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ - if (birth_txg) + if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else - return (FALSE); + return (B_FALSE); } void @@ -1032,7 +1007,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* - * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. @@ -1041,7 +1016,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); @@ -1071,9 +1046,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) void dbuf_release_bp(dmu_buf_impl_t *db) { - objset_t *os; + ASSERTV(objset_t *os = db->db_objset); - DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); @@ -1448,10 +1422,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); } -#pragma weak dmu_buf_will_dirty = dbuf_will_dirty void -dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); @@ -1574,7 +1548,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dbuf_fill_done(db, tx); + dmu_buf_fill_done(&db->db, tx); } /* @@ -2132,7 +2106,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ -#pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) { @@ -2140,6 +2113,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_rele_and_unlock(db, tag); } +void +dmu_buf_rele(dmu_buf_t *db, void *tag) +{ + dbuf_rele((dmu_buf_impl_t *)db, tag); +} + /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. @@ -2600,18 +2579,14 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (BP_IS_HOLE(bp)) { - ASSERT(bp->blk_fill == 0); - DB_DNODE_EXIT(db); - return; + if (bp->blk_birth != 0) { + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); } - ASSERT((db->db_blkid != DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_type) || - (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype)); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG @@ -2637,7 +2612,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill++; } } else { - fill = 1; + if (BP_IS_HOLE(bp)) { + fill = 0; + } else { + fill = 1; + } } } else { blkptr_t *ibp = db->db.db_data; @@ -2692,9 +2671,10 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - uint64_t txg = zio->io_txg; + blkptr_t *bp = db->db_blkptr; + objset_t *os = db->db_objset; + dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); @@ -2707,14 +2687,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { - objset_t *os; - dsl_dataset_t *ds; - dmu_tx_t *tx; - - DB_GET_OBJSET(&os, db); - ds = os->os_dsl_dataset; - tx = os->os_synctx; - + dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } @@ -2727,7 +2700,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; @@ -2761,14 +2733,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { ASSERTV(int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + ASSERT3U(db->db_blkid, <=, + dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); @@ -2781,8 +2753,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 070c83155..93997588d 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -116,12 +116,12 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - if (error) + if (error != 0) return (error); - error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class]); + &ddt->ddt_histogram[type][class])); /* * Seed the cached statistics. @@ -138,8 +138,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; - ASSERT(error == 0); - return (error); + return (0); } static void @@ -616,7 +615,10 @@ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) bcopy(src, dst, s_len); } - *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + *version = cpfunc; + /* CONSTCOND */ + if (ZFS_HOST_BYTEORDER) + *version |= DDT_COMPRESS_BYTEORDER_MASK; return (c_len + 1); } @@ -633,7 +635,8 @@ ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) else bcopy(src, dst, d_len); - if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) byteswap_uint64_array(dst, d_len); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index edad9b496..103eec1d5 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -684,7 +684,7 @@ dmu_free_long_range(objset_t *os, uint64_t object, * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ - if (offset == 0 && length == DMU_OBJECT_END) + if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); @@ -1314,10 +1314,8 @@ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - spa_t *spa; - DB_GET_SPA(&spa, db); - return (arc_loan_buf(spa, size)); + return (arc_loan_buf(db->db_objset->os_spa, size)); } /* diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index a2cb2fc8d..8d7385539 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -118,7 +118,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (zb->zb_object != DMU_META_DNODE_OBJECT) return (0); - if (bp == NULL) { + if (BP_IS_HOLE(bp)) { uint64_t span = DBP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 9264fbb27..15abdeacb 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -385,11 +385,12 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { + } else if (BP_IS_HOLE(bp) && + zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); - } else if (bp == NULL) { + } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dnp, zb->zb_level); err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 27f9eb332..8c44e1771 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -36,6 +36,7 @@ #include #include #include +#include int zfs_pd_blks_max = 100; @@ -74,7 +75,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) traverse_data_t *td = arg; zbookmark_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) @@ -98,7 +99,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) blkptr_t *bp = &lr->lr_blkptr; zbookmark_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) @@ -225,13 +226,34 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ASSERT(0); } - if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); - return (err); + if (bp->blk_birth == 0) { + if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) { + /* + * Since this block has a birth time of 0 it must be a + * hole created before the SPA_FEATURE_HOLE_BIRTH + * feature was enabled. If SPA_FEATURE_HOLE_BIRTH + * was enabled before the min_txg for this traveral we + * know the hole must have been created before the + * min_txg for this traveral, so we can skip it. If + * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg + * for this traveral we cannot tell if the hole was + * created before or after the min_txg for this + * traversal, so we cannot skip it. + */ + uint64_t hole_birth_enabled_txg; + VERIFY(spa_feature_enabled_txg(td->td_spa, + SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg)); + if (hole_birth_enabled_txg < td->td_min_txg) + return (0); + } + } else if (bp->blk_birth <= td->td_min_txg) { + return (0); } - if (bp->blk_birth <= td->td_min_txg) - return (0); + if (BP_IS_HOLE(bp)) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + return (err); + } if (td->td_pfd && !td->td_pfd->pd_exited && ((td->td_pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || @@ -441,7 +463,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (pfd->pd_cancel) return (SET_ERROR(EINTR)); - if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || + if (BP_IS_HOLE(bp) || + !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (0); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index f95066ddd..44ec80001 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1545,7 +1545,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; fast-track this request */ + /* + * Freeing the whole block; fast-track this request. + * Note that we won't dirty any indirect blocks, + * which is fine because we will be freeing the entire + * file and thus all indirect blocks will be freed + * by free_children(). + */ blkid = 0; nblks = 1; goto done; @@ -1572,7 +1578,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); @@ -1608,7 +1614,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } @@ -1629,18 +1635,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) nblks += 1; /* - * Read in and mark all the level-1 indirects dirty, - * so that they will stay in memory until syncing phase. - * Always dirty the first and last indirect to make sure - * we dirty all the partial indirects. + * Dirty the first and last indirect blocks, as they (and/or their + * parents) will need to be written out if they were only + * partially freed. Interior indirect blocks will be themselves freed, + * by free_children(), so they need not be dirtied. Note that these + * interior blocks have already been prefetched by dmu_tx_hold_free(). */ if (dn->dn_nlevels > 1) { - uint64_t i, first, last; - int shift = epbs + dn->dn_datablkshift; + uint64_t first, last; first = blkid >> epbs; if ((db = dbuf_hold_level(dn, 1, first, FTAG))) { - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } if (trunc) @@ -1648,26 +1654,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) else last = (blkid + nblks - 1) >> epbs; if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } - for (i = first + 1; i < last; i++) { - uint64_t ibyte = i << shift; - int err; - - err = dnode_next_offset(dn, - DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); - i = ibyte >> shift; - if (err == ESRCH || i >= last) - break; - ASSERT(err == 0); - db = dbuf_hold_level(dn, 1, i, FTAG); - if (db) { - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - } } + done: /* * Add this range to the dnode range list. @@ -1695,8 +1686,6 @@ done: dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: - if (trunc && dn->dn_maxblkid >= (off >> blkshift)) - dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); rw_exit(&dn->dn_struct_rwlock); } @@ -1873,8 +1862,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, data = db->db.db_data; } - if (db && txg && - (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + + if (db != NULL && txg != 0 && (db->db_blkptr == NULL || + db->db_blkptr->blk_birth <= txg || + BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 78bccdfc6..885dc2dbc 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -32,6 +32,7 @@ #include #include #include +#include static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) @@ -112,26 +113,48 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } -static int +static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i, blocks_freed = 0; + int i; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (i = 0; i < num; i++, bp++) { + uint64_t lsize, lvl; + dmu_object_type_t type; + if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + + /* + * Save some useful information on the holes being + * punched, including logical size, type, and indirection + * level. Retaining birth time enables detection of when + * holes are punched for reducing the number of free + * records transmitted during a zfs send. + */ + + lsize = BP_GET_LSIZE(bp); + type = BP_GET_TYPE(bp); + lvl = BP_GET_LEVEL(bp); + bzero(bp, sizeof (blkptr_t)); - blocks_freed += 1; + + if (spa_feature_is_active(dn->dn_objset->os_spa, + SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, type); + BP_SET_LEVEL(bp, lvl); + BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); + } } dnode_diduse_space(dn, -bytesfreed); - return (blocks_freed); } #ifdef ZFS_DEBUG @@ -215,30 +238,27 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) #define ALL -1 -static int -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, +static void +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; - int epbs, shift, err; - int all = TRUE; - int blocks_freed = 0; + int epbs, shift; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 - * 2 - if we didn't get a dirty hold (because this block had just - * finished being written -- and so had no holds), and then this - * block got evicted before we got here. + * 2 - if this block was evicted since we read it from + * dmu_tx_hold_free(). */ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); - bp = (blkptr_t *)db->db.db_data; + bp = db->db.db_data; DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -248,7 +268,6 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; - all = FALSE; } else { start = dbstart; } @@ -256,49 +275,46 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; - else if (all) - all = trunc; + ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - blocks_freed = free_blocks(dn, bp, end-start+1, tx); - arc_buf_freeze(db->db_buf); - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); - DB_DNODE_EXIT(db); - return (all ? ALL : blocks_freed); - } + free_blocks(dn, bp, end-start+1, tx); + } else { + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, + i, B_TRUE, FTAG, &subdb)); + rw_exit(&dn->dn_struct_rwlock); + ASSERT3P(bp, ==, subdb->db_blkptr); - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); - ASSERT0(err); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { - ASSERT3P(subdb->db_blkptr, ==, bp); - blocks_freed += free_blocks(dn, bp, 1, tx); - } else { - all = FALSE; + free_children(subdb, blkid, nblks, tx); + dbuf_rele(subdb, FTAG); } - dbuf_rele(subdb, FTAG); } + + /* If this whole block is free, free ourself too. */ + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + if (!BP_IS_HOLE(bp)) + break; + } + if (i == 1 << epbs) { + /* didn't find any non-holes */ + bzero(db->db.db_data, db->db.db_size); + free_blocks(dn, db->db_blkptr, 1, tx); + } else { + /* + * Partial block free; must be marked dirty so that it + * will be written out. + */ + ASSERT(db->db_dirtycnt > 0); + } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); -#ifdef ZFS_DEBUG - bp -= (end-start)+1; - for (i = start; i <= end; i++, bp++) { - if (i == start && blkid != 0) - continue; - else if (i == end && !trunc) - continue; - ASSERT0(bp->blk_birth); - } -#endif - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); - return (all ? ALL : blocks_freed); } /* @@ -306,20 +322,21 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, * and "free" all the blocks contained there. */ static void -dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, + dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - dmu_buf_impl_t *db; - int trunc, start, end, shift, i, err; int dnlevel = dn->dn_phys->dn_nlevels; + boolean_t trunc = B_FALSE; if (blkid > dn->dn_phys->dn_maxblkid) return; ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); - trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; - if (trunc) + if (blkid + nblks > dn->dn_phys->dn_maxblkid) { nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + trunc = B_TRUE; + } /* There are no indirect blocks in the object */ if (dnlevel == 1) { @@ -328,41 +345,36 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - (void) free_blocks(dn, bp + blkid, nblks, tx); - if (trunc) { - ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec<dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); + free_blocks(dn, bp + blkid, nblks, tx); + } else { + int shift = (dnlevel - 1) * + (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + int start = blkid >> shift; + int end = (blkid + nblks - 1) >> shift; + dmu_buf_impl_t *db; + int i; + + ASSERT(start < dn->dn_phys->dn_nblkptr); + bp += start; + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, + TRUE, FTAG, &db)); + rw_exit(&dn->dn_struct_rwlock); + + free_children(db, blkid, nblks, tx); + dbuf_rele(db, FTAG); } - return; } - shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); - start = blkid >> shift; - ASSERT(start < dn->dn_phys->dn_nblkptr); - end = (blkid + nblks - 1) >> shift; - bp += start; - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); - ASSERT0(err); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(db, blkid, nblks, trunc, tx) == ALL) { - ASSERT3P(db->db_blkptr, ==, bp); - (void) free_blocks(dn, bp, 1, tx); - } - dbuf_rele(db, FTAG); - } if (trunc) { - ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * + ASSERTV(uint64_t off); + dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; + + ASSERTV(off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT)); - dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); @@ -504,7 +516,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) - dbuf_will_dirty(dn->dn_dbuf, tx); + dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t)); mutex_enter(&dn->dn_mtx); @@ -535,6 +547,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; boolean_t kill_spill = B_FALSE; + boolean_t freeing_dnode; ASSERTV(static const dnode_phys_t zerodn = { 0 }); ASSERT(dmu_tx_is_syncing(tx)); @@ -611,13 +624,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_bonustype[txgoff] = 0; } + freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg; + /* * We will either remove a spill block when a file is being removed * or we have been asked to remove it. */ if (dn->dn_rm_spillblk[txgoff] || - ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) { + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && freeing_dnode)) { if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) kill_spill = B_TRUE; dn->dn_rm_spillblk[txgoff] = 0; @@ -640,7 +654,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); if (kill_spill) { - (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); @@ -656,7 +670,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) kmem_free(rp, sizeof (free_range_t)); } - if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { + if (freeing_dnode) { dnode_sync_free(dn, tx); return; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index cbdf393c0..cae56534d 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -122,7 +122,9 @@ int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { - int used, compressed, uncompressed; + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); if (BP_IS_HOLE(bp)) return (0); @@ -130,11 +132,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); - used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - compressed = BP_GET_PSIZE(bp); - uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(used > 0); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, @@ -232,7 +229,8 @@ boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, uint64_t blk_birth) { - if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) + if (blk_birth <= dsl_dataset_prev_snap_txg(ds) || + (bp != NULL && BP_IS_HOLE(bp))) return (B_FALSE); ddt_prefetch(dsl_dataset_get_spa(ds), bp); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 11038b1f9..cbe89739f 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -144,6 +144,8 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + ASSERT(!BP_IS_HOLE(bp)); + if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); if (poa->ds_prev && !poa->after_branch_point && @@ -544,7 +546,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (bp == NULL) + if (BP_IS_HOLE(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 22b68d685..bf06b51ee 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -488,7 +488,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) zil_header_t *zh = zsa->zsa_zh; zbookmark_t zb; - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -520,7 +520,8 @@ dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) blkptr_t *bp = &lr->lr_blkptr; zbookmark_t zb; - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || + bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -775,7 +776,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, if (dsl_scan_check_resume(scn, dnp, zb)) goto out; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) goto out; scn->scn_visited_this_txg++; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index fabf16169..5bf59315a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1872,7 +1872,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { - if (bp != NULL) { + if (!BP_IS_HOLE(bp)) { zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); @@ -2328,6 +2328,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; + spa_feature_t i; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { @@ -2398,6 +2399,33 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } + + /* + * Load refcounts for ZFS features from disk into an in-memory + * cache during SPA initialization. + */ + for (i = 0; i < SPA_FEATURES; i++) { + uint64_t refcount; + + error = feature_get_refcount_from_disk(spa, + &spa_feature_table[i], &refcount); + if (error == 0) { + spa->spa_feat_refcount_cache[i] = refcount; + } else if (error == ENOTSUP) { + spa->spa_feat_refcount_cache[i] = + SPA_FEATURE_DISABLED; + } else { + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + } + } + + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, + &spa->spa_feat_enabled_txg_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } } spa->spa_is_initializing = B_TRUE; @@ -5820,7 +5848,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration - * information. This avoids the dbuf_will_dirty() path and + * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 50c3b9afd..88f9e34e3 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -469,6 +469,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_t *spa; spa_config_dirent_t *dp; int t; + int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -548,6 +549,15 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); + /* + * As a pool is being created, treat all features as disabled by + * setting SPA_FEATURE_DISABLED for all entries in the feature + * refcount cache. + */ + for (i = 0; i < SPA_FEATURES; i++) { + spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; + } + return (spa); } @@ -1094,11 +1104,19 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ void -spa_activate_mos_feature(spa_t *spa, const char *feature) +spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); - vdev_config_dirty(spa->spa_root_vdev); + /* + * When we are creating the pool (tx_txg==TXG_INITIAL), we can't + * dirty the vdev config because lock SCL_CONFIG is not held. + * Thankfully, in this case we don't need to dirty the config + * because it will be written out anyway when we finish + * creating the pool. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); } } @@ -1257,7 +1275,7 @@ spa_generate_guid(spa_t *spa) } void -sprintf_blkptr(char *buf, const blkptr_t *bp) +snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; char *checksum = NULL; @@ -1279,7 +1297,8 @@ sprintf_blkptr(char *buf, const blkptr_t *bp) compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } - SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); + SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + compress); } void @@ -1875,7 +1894,7 @@ EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_get_random); EXPORT_SYMBOL(spa_generate_guid); -EXPORT_SYMBOL(sprintf_blkptr); +EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index d1ee9afd9..e4ce3eaef 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -248,9 +248,9 @@ vdev_cache_fill(zio_t *fio) } /* - * Read data from the cache. Returns 0 on cache hit, errno on a miss. + * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. */ -int +boolean_t vdev_cache_read(zio_t *zio) { vdev_cache_t *vc = &zio->io_vd->vdev_cache; @@ -262,16 +262,16 @@ vdev_cache_read(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (SET_ERROR(EINVAL)); + return (B_FALSE); if (zio->io_size > zfs_vdev_cache_max) - return (SET_ERROR(EOVERFLOW)); + return (B_FALSE); /* * If the I/O straddles two or more cache blocks, don't cache it. */ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (SET_ERROR(EXDEV)); + return (B_FALSE); ASSERT(cache_phase + zio->io_size <= VCBS); @@ -285,7 +285,7 @@ vdev_cache_read(zio_t *zio) if (ve != NULL) { if (ve->ve_missed_update) { mutex_exit(&vc->vc_lock); - return (SET_ERROR(ESTALE)); + return (B_FALSE); } if ((fio = ve->ve_fill_io) != NULL) { @@ -293,7 +293,7 @@ vdev_cache_read(zio_t *zio) zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); - return (0); + return (B_TRUE); } vdev_cache_hit(vc, ve, zio); @@ -301,14 +301,14 @@ vdev_cache_read(zio_t *zio) mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_hits); - return (0); + return (B_TRUE); } ve = vdev_cache_allocate(zio); if (ve == NULL) { mutex_exit(&vc->vc_lock); - return (SET_ERROR(ENOMEM)); + return (B_FALSE); } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, @@ -323,7 +323,7 @@ vdev_cache_read(zio_t *zio) zio_nowait(fio); VDCSTAT_BUMP(vdc_stat_misses); - return (0); + return (B_TRUE); } /* diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c index 89d4b43f0..e6cab2c03 100644 --- a/module/zfs/zfeature.c +++ b/module/zfs/zfeature.c @@ -224,12 +224,32 @@ spa_features_check(spa_t *spa, boolean_t for_write, } /* + * Use an in-memory cache of feature refcounts for quick retrieval. + * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. * However, this is non-static for zdb and zhack. */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + ASSERT(VALID_FEATURE_FID(feature->fi_feature)); + if (spa->spa_feat_refcount_cache[feature->fi_feature] == + SPA_FEATURE_DISABLED) { + return (SET_ERROR(ENOTSUP)); + } + *res = spa->spa_feat_refcount_cache[feature->fi_feature]; + return (0); +} + +/* + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb and zhack. + */ +int +feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res) { int err; uint64_t refcount; @@ -255,6 +275,26 @@ feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) return (0); } + +static int +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { + ASSERTV(uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj); + + ASSERT(zfeature_depends_on(feature->fi_feature, + SPA_FEATURE_ENABLED_TXG)); + + if (!spa_feature_is_enabled(spa, feature->fi_feature)) { + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(enabled_txg_obj != 0); + + VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, + feature->fi_guid, sizeof (uint64_t), 1, res)); + + return (0); +} + /* * This function is non-static for zhack; it should otherwise not be used * outside this file. @@ -263,16 +303,32 @@ void feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, dmu_tx_t *tx) { - uint64_t zapobj = feature->fi_can_readonly ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + uint64_t zapobj; + ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); + zapobj = feature->fi_can_readonly ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); + /* + * feature_sync is called directly from zhack, allowing the + * creation of arbitrary features whose fi_feature field may + * be greater than SPA_FEATURES. When called from zhack, the + * zfeature_info_t object's fi_feature field will be set to + * SPA_FEATURE_NONE. + */ + if (feature->fi_feature != SPA_FEATURE_NONE) { + uint64_t *refcount_cache = + &spa->spa_feat_refcount_cache[feature->fi_feature]; + VERIFY3U(*refcount_cache, ==, + atomic_swap_64(refcount_cache, refcount)); + } + if (refcount == 0) spa_deactivate_mos_feature(spa, feature->fi_guid); else if (feature->fi_mos) - spa_activate_mos_feature(spa, feature->fi_guid); + spa_activate_mos_feature(spa, feature->fi_guid, tx); } /* @@ -282,6 +338,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, void feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) { + uint64_t initial_refcount = feature->fi_activate_on_enable ? 1 : 0; uint64_t zapobj = feature->fi_can_readonly ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; int i; @@ -302,27 +359,43 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); - feature_sync(spa, feature, 0, tx); + + feature_sync(spa, feature, initial_refcount, tx); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { + uint64_t enabling_txg = dmu_tx_get_txg(tx); + + if (spa->spa_feat_enabled_txg_obj == 0ULL) { + spa->spa_feat_enabled_txg_obj = + zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_ENABLED_TXG, tx); + } + spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); + + VERIFY0(zap_add(spa->spa_meta_objset, + spa->spa_feat_enabled_txg_obj, feature->fi_guid, + sizeof (uint64_t), 1, &enabling_txg, tx)); + } } static void feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, dmu_tx_t *tx) { - uint64_t refcount; + uint64_t refcount = 0; zfeature_info_t *feature = &spa_feature_table[fid]; - uint64_t zapobj = feature->fi_can_readonly ? - spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERTV(uint64_t zapobj = feature->fi_can_readonly ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj); - ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); ASSERT(0 != zapobj); ASSERT(zfeature_is_valid_guid(feature->fi_guid)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - VERIFY0(zap_lookup(spa->spa_meta_objset, zapobj, feature->fi_guid, - sizeof (uint64_t), 1, &refcount)); + VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { case FEATURE_ACTION_INCR: @@ -369,7 +442,7 @@ void spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); - ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); feature_enable_sync(spa, &spa_feature_table[fid], tx); } @@ -391,7 +464,7 @@ spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) int err; uint64_t refcount = 0; - ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); @@ -406,7 +479,7 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid) int err; uint64_t refcount = 0; - ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); @@ -414,3 +487,26 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid) ASSERT(err == 0 || err == ENOTSUP); return (err == 0 && refcount > 0); } + +/* + * For the feature specified by fid (which must depend on + * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the + * OUT txg argument. + * + * Returns B_TRUE if the feature is enabled, in which case txg will be filled + * with the transaction group in which the specified feature was enabled. + * Returns B_FALSE otherwise (i.e. if the feature is not enabled). + */ +boolean_t +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) { + int err; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); + ASSERT(err == 0 || err == ENOTSUP); + + return (err == 0); +} diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c index 5bcc08ade..35e662ee2 100644 --- a/module/zfs/zfeature_common.c +++ b/module/zfs/zfeature_common.c @@ -119,10 +119,22 @@ zfeature_lookup_name(const char *name, spa_feature_t *res) return (ENOENT); } +boolean_t +zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { + zfeature_info_t *feature = &spa_feature_table[fid]; + int i; + + for (i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { + if (feature->fi_depends[i] == check) + return (B_TRUE); + } + return (B_FALSE); +} + static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, boolean_t readonly, boolean_t mos, - const spa_feature_t *deps) + boolean_t activate_on_enable, const spa_feature_t *deps) { zfeature_info_t *feature = &spa_feature_table[fid]; static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; @@ -142,6 +154,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name, feature->fi_desc = desc; feature->fi_can_readonly = readonly; feature->fi_mos = mos; + feature->fi_activate_on_enable = activate_on_enable; feature->fi_depends = deps; } @@ -150,18 +163,40 @@ zpool_feature_init(void) { zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", - "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", - "Snapshots use less space.", B_TRUE, B_FALSE, NULL); + "Snapshots use less space.", B_TRUE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", - "LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL); + "LZ4 compression algorithm support.", B_FALSE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", - "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL); + "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, + B_FALSE, NULL); + + zfeature_register(SPA_FEATURE_ENABLED_TXG, + "com.delphix:enabled_txg", "enabled_txg", + "Record txg at which a feature is enabled", B_TRUE, B_FALSE, + B_FALSE, NULL); + + { + static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_NONE }; + zfeature_register(SPA_FEATURE_HOLE_BIRTH, + "com.delphix:hole_birth", "hole_birth", + "Retain hole birth txg for more precise zfs send", + B_FALSE, B_TRUE, B_TRUE, hole_birth_deps); + } + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", - B_FALSE, B_FALSE, NULL); + B_FALSE, B_FALSE, B_FALSE, NULL); } diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 2d1ce93a7..4403e3289 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -558,7 +558,6 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; - int err; int bonuslen; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; @@ -591,10 +590,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ if (S_ISDIR(vap->va_mode)) { if (zsb->z_replay) { - err = zap_create_claim_norm(zsb->z_os, obj, + VERIFY0(zap_create_claim_norm(zsb->z_os, obj, zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, - obj_type, bonuslen, tx); - ASSERT0(err); + obj_type, bonuslen, tx)); } else { obj = zap_create_norm(zsb->z_os, zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, @@ -602,10 +600,9 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } } else { if (zsb->z_replay) { - err = dmu_object_claim(zsb->z_os, obj, + VERIFY0(dmu_object_claim(zsb->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, - obj_type, bonuslen, tx); - ASSERT0(err); + obj_type, bonuslen, tx)); } else { obj = dmu_object_alloc(zsb->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, @@ -784,8 +781,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { - err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx); - ASSERT0(err); + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zsb, obj); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index b69a7bf56..b5ee395d1 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -395,7 +395,8 @@ zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) + if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, @@ -445,7 +446,8 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) * If we previously claimed it, we need to free it. */ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) + bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && + !BP_IS_HOLE(bp)) zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 332d50c69..f84e2129b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * ========================================================================== @@ -1096,7 +1097,7 @@ zio_write_bp_init(zio_t *zio) BP_ZERO(bp); } - if (bp->blk_birth == zio->io_txg) { + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -1140,7 +1141,8 @@ zio_write_bp_init(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; ASSERT(psize != 0); @@ -1152,15 +1154,22 @@ zio_write_bp_init(zio_t *zio) } if (psize == 0) { + if (zio->io_bp_orig.blk_birth != 0 && + spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BIRTH(bp, zio->io_txg, 0); + } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); - BP_SET_TYPE(bp, zp->zp_type); - BP_SET_LEVEL(bp, zp->zp_level); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { @@ -2613,7 +2622,7 @@ zio_vdev_io_start(zio_t *zio) if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL)