Illumos 4757, 4913

4757 ZFS embedded-data block pointers ("zero block compression")
4913 zfs release should not be subject to space checks

Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4757
  https://www.illumos.org/issues/4913
  https://github.com/illumos/illumos-gate/commit/5d7b4d4

Porting notes:

For compatibility with the fastpath code, the zio_done() function
needed to be updated.  Because embedded-data block pointers do
not require DVAs to be allocated, the associated vdevs will not
be marked and therefore should not be unmarked.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2544
This commit is contained in:
Matthew Ahrens 2014-06-05 13:19:08 -08:00 committed by Brian Behlendorf
parent faf0f58c69
commit 9b67f60560
46 changed files with 1195 additions and 258 deletions

View File

@ -1047,6 +1047,16 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
return; return;
} }
if (BP_IS_EMBEDDED(bp)) {
(void) sprintf(blkbuf,
"EMBEDDED et=%u %llxL/%llxP B=%llu",
(int)BPE_GET_ETYPE(bp),
(u_longlong_t)BPE_GET_LSIZE(bp),
(u_longlong_t)BPE_GET_PSIZE(bp),
(u_longlong_t)bp->blk_birth);
return;
}
blkbuf[0] = '\0'; blkbuf[0] = '\0';
for (i = 0; i < ndvas; i++) for (i = 0; i < ndvas; i++)
@ -1066,7 +1076,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
"%llxL/%llxP F=%llu B=%llu/%llu", "%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill, (u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth, (u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp)); (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
} }
@ -1079,8 +1089,10 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
char blkbuf[BP_SPRINTF_LEN]; char blkbuf[BP_SPRINTF_LEN];
int l; int l;
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); if (!BP_IS_EMBEDDED(bp)) {
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
}
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
@ -1134,10 +1146,10 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
err = visit_indirect(spa, dnp, cbp, &czb); err = visit_indirect(spa, dnp, cbp, &czb);
if (err) if (err)
break; break;
fill += cbp->blk_fill; fill += BP_GET_FILL(cbp);
} }
if (!err) if (!err)
ASSERT3U(fill, ==, bp->blk_fill); ASSERT3U(fill, ==, BP_GET_FILL(bp));
(void) arc_buf_remove_ref(buf, &buf); (void) arc_buf_remove_ref(buf, &buf);
} }
@ -1861,14 +1873,14 @@ dump_dir(objset_t *os)
if (dds.dds_type == DMU_OST_META) { if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL; dds.dds_creation_txg = TXG_INITIAL;
usedobjs = os->os_rootbp->blk_fill; usedobjs = BP_GET_FILL(os->os_rootbp);
refdbytes = os->os_spa->spa_dsl_pool-> refdbytes = os->os_spa->spa_dsl_pool->
dp_mos_dir->dd_phys->dd_used_bytes; dp_mos_dir->dd_phys->dd_used_bytes;
} else { } else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
} }
ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
zdb_nicenum(refdbytes, numbuf); zdb_nicenum(refdbytes, numbuf);
@ -2171,6 +2183,9 @@ typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_dedup_asize; uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks; uint64_t zcb_dedup_blocks;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
[BPE_PAYLOAD_SIZE];
uint64_t zcb_start; uint64_t zcb_start;
uint64_t zcb_lastprint; uint64_t zcb_lastprint;
uint64_t zcb_totalasize; uint64_t zcb_totalasize;
@ -2204,6 +2219,13 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++; zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
} }
if (BP_IS_EMBEDDED(bp)) {
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
[BPE_GET_PSIZE(bp)]++;
return;
}
if (dump_opt['L']) if (dump_opt['L'])
return; return;
@ -2301,7 +2323,8 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp); size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size); void *data = zio_data_buf_alloc(size);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
@ -2497,8 +2520,9 @@ dump_block_stats(spa_t *spa)
zdb_blkstats_t *zb, *tzb; zdb_blkstats_t *zb, *tzb;
uint64_t norm_alloc, norm_space, total_alloc, total_found; uint64_t norm_alloc, norm_space, total_alloc, total_found;
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
int leaks = 0; boolean_t leaks = B_FALSE;
int e; int e;
bp_embedded_type_t i;
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
@ -2587,7 +2611,7 @@ dump_block_stats(spa_t *spa)
(u_longlong_t)total_alloc, (u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked", (dump_opt['L']) ? "unreachable" : "leaked",
(longlong_t)(total_alloc - total_found)); (longlong_t)(total_alloc - total_found));
leaks = 1; leaks = B_TRUE;
} }
if (tzb->zb_count == 0) if (tzb->zb_count == 0)
@ -2617,6 +2641,23 @@ dump_block_stats(spa_t *spa)
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
if (zcb.zcb_embedded_blocks[i] == 0)
continue;
(void) printf("\n");
(void) printf("\tadditional, non-pointer bps of type %u: "
"%10llu\n",
i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
if (dump_opt['b'] >= 3) {
(void) printf("\t number of (compressed) bytes: "
"number of bps\n");
dump_histogram(zcb.zcb_embedded_histogram[i],
sizeof (zcb.zcb_embedded_histogram[i]) /
sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
}
}
if (dump_opt['b'] >= 2) { if (dump_opt['b'] >= 2) {
int l, t, level; int l, t, level;
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
@ -2718,14 +2759,14 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where; avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search; zdb_ddt_entry_t *zdde, zdde_search;
if (BP_IS_HOLE(bp)) if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
(void) printf("traversing objset %llu, %llu objects, " (void) printf("traversing objset %llu, %llu objects, "
"%lu blocks so far\n", "%lu blocks so far\n",
(u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_objset,
(u_longlong_t)bp->blk_fill, (u_longlong_t)BP_GET_FILL(bp),
avl_numnodes(t)); avl_numnodes(t));
} }

View File

@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK: case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n")); return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND: case HELP_SEND:
return (gettext("\tsend [-DnPpRrv] [-[iI] snapshot] " return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
"<snapshot>\n" "<snapshot>\n"
"\tsend [-i snapshot|bookmark] " "\tsend [-e] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n")); "<filesystem|volume|snapshot>\n"));
case HELP_SET: case HELP_SET:
return (gettext("\tset <property=value> " return (gettext("\tset <property=value> "
@ -3338,6 +3338,8 @@ rollback_check_dependent(zfs_handle_t *zhp, void *data)
zfs_close(zhp); zfs_close(zhp);
return (0); return (0);
} }
/* /*
* Report any snapshots more recent than the one specified. Used when '-r' is * Report any snapshots more recent than the one specified. Used when '-r' is
* not specified. We reuse this same callback for the snapshot dependents - if * not specified. We reuse this same callback for the snapshot dependents - if
@ -3677,7 +3679,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE; boolean_t extraverbose = B_FALSE;
/* check options */ /* check options */
while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) { while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
switch (c) { switch (c) {
case 'i': case 'i':
if (fromname) if (fromname)
@ -3712,6 +3714,9 @@ zfs_do_send(int argc, char **argv)
case 'n': case 'n':
flags.dryrun = B_TRUE; flags.dryrun = B_TRUE;
break; break;
case 'e':
flags.embed_data = B_TRUE;
break;
case ':': case ':':
(void) fprintf(stderr, gettext("missing argument for " (void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt); "'%c' option\n"), optopt);
@ -3750,6 +3755,7 @@ zfs_do_send(int argc, char **argv)
if (strchr(argv[0], '@') == NULL || if (strchr(argv[0], '@') == NULL ||
(fromname && strchr(fromname, '#') != NULL)) { (fromname && strchr(fromname, '#') != NULL)) {
char frombuf[ZFS_MAXNAMELEN]; char frombuf[ZFS_MAXNAMELEN];
enum lzc_send_flags lzc_flags = 0;
if (flags.replicate || flags.doall || flags.props || if (flags.replicate || flags.doall || flags.props ||
flags.dedup || flags.dryrun || flags.verbose || flags.dedup || flags.dryrun || flags.verbose ||
@ -3764,6 +3770,9 @@ zfs_do_send(int argc, char **argv)
if (zhp == NULL) if (zhp == NULL)
return (1); return (1);
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
if (fromname != NULL && if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) { (fromname[0] == '#' || fromname[0] == '@')) {
/* /*
@ -3777,7 +3786,7 @@ zfs_do_send(int argc, char **argv)
(void) strlcat(frombuf, fromname, sizeof (frombuf)); (void) strlcat(frombuf, fromname, sizeof (frombuf));
fromname = frombuf; fromname = frombuf;
} }
err = zfs_send_one(zhp, fromname, STDOUT_FILENO); err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
zfs_close(zhp); zfs_close(zhp);
return (err != 0); return (err != 0);
} }

View File

@ -36,7 +36,6 @@
#include <sys/zfs_ioctl.h> #include <sys/zfs_ioctl.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
uint64_t drr_record_count[DRR_NUMTYPES];
uint64_t total_write_size = 0; uint64_t total_write_size = 0;
uint64_t total_stream_len = 0; uint64_t total_stream_len = 0;
FILE *send_stream = 0; FILE *send_stream = 0;
@ -81,6 +80,8 @@ int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
char *buf = malloc(INITIAL_BUFLEN); char *buf = malloc(INITIAL_BUFLEN);
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
uint64_t total_records = 0;
dmu_replay_record_t thedrr; dmu_replay_record_t thedrr;
dmu_replay_record_t *drr = &thedrr; dmu_replay_record_t *drr = &thedrr;
struct drr_begin *drrb = &thedrr.drr_u.drr_begin; struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@ -91,6 +92,7 @@ main(int argc, char *argv[])
struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
struct drr_free *drrf = &thedrr.drr_u.drr_free; struct drr_free *drrf = &thedrr.drr_u.drr_free;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
char c; char c;
boolean_t verbose = B_FALSE; boolean_t verbose = B_FALSE;
boolean_t first = B_TRUE; boolean_t first = B_TRUE;
@ -170,6 +172,7 @@ main(int argc, char *argv[])
} }
drr_record_count[drr->drr_type]++; drr_record_count[drr->drr_type]++;
total_records++;
switch (drr->drr_type) { switch (drr->drr_type) {
case DRR_BEGIN: case DRR_BEGIN:
@ -286,8 +289,8 @@ main(int argc, char *argv[])
drro->drr_bonuslen); drro->drr_bonuslen);
} }
if (drro->drr_bonuslen > 0) { if (drro->drr_bonuslen > 0) {
(void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen, (void) ssread(buf,
8), &zc); P2ROUNDUP(drro->drr_bonuslen, 8), &zc);
} }
break; break;
@ -397,6 +400,38 @@ main(int argc, char *argv[])
} }
(void) ssread(buf, drrs->drr_length, &zc); (void) ssread(buf, drrs->drr_length, &zc);
break; break;
case DRR_WRITE_EMBEDDED:
if (do_byteswap) {
drrwe->drr_object =
BSWAP_64(drrwe->drr_object);
drrwe->drr_offset =
BSWAP_64(drrwe->drr_offset);
drrwe->drr_length =
BSWAP_64(drrwe->drr_length);
drrwe->drr_toguid =
BSWAP_64(drrwe->drr_toguid);
drrwe->drr_lsize =
BSWAP_32(drrwe->drr_lsize);
drrwe->drr_psize =
BSWAP_32(drrwe->drr_psize);
}
if (verbose) {
(void) printf("WRITE_EMBEDDED object = %llu "
"offset = %llu length = %llu\n"
"toguid = %llx comp = %u etype = %u "
"lsize = %u psize = %u\n",
(u_longlong_t)drrwe->drr_object,
(u_longlong_t)drrwe->drr_offset,
(u_longlong_t)drrwe->drr_length,
(u_longlong_t)drrwe->drr_toguid,
drrwe->drr_compression,
drrwe->drr_etype,
drrwe->drr_lsize,
drrwe->drr_psize);
}
(void) ssread(buf,
P2ROUNDUP(drrwe->drr_psize, 8), &zc);
break;
case DRR_NUMTYPES: case DRR_NUMTYPES:
/* should never be reached */ /* should never be reached */
exit(1); exit(1);
@ -418,18 +453,16 @@ main(int argc, char *argv[])
(u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
(void) printf("\tTotal DRR_WRITE records = %lld\n", (void) printf("\tTotal DRR_WRITE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE]); (u_longlong_t)drr_record_count[DRR_WRITE]);
(void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
(u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
(void) printf("\tTotal DRR_FREE records = %lld\n", (void) printf("\tTotal DRR_FREE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_FREE]); (u_longlong_t)drr_record_count[DRR_FREE]);
(void) printf("\tTotal DRR_SPILL records = %lld\n", (void) printf("\tTotal DRR_SPILL records = %lld\n",
(u_longlong_t)drr_record_count[DRR_SPILL]); (u_longlong_t)drr_record_count[DRR_SPILL]);
(void) printf("\tTotal records = %lld\n", (void) printf("\tTotal records = %lld\n",
(u_longlong_t)(drr_record_count[DRR_BEGIN] + (u_longlong_t)total_records);
drr_record_count[DRR_OBJECT] +
drr_record_count[DRR_FREEOBJECTS] +
drr_record_count[DRR_WRITE] +
drr_record_count[DRR_FREE] +
drr_record_count[DRR_SPILL] +
drr_record_count[DRR_END]));
(void) printf("\tTotal write size = %lld (0x%llx)\n", (void) printf("\tTotal write size = %lld (0x%llx)\n",
(u_longlong_t)total_write_size, (u_longlong_t)total_write_size); (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
(void) printf("\tTotal stream length = %lld (0x%llx)\n", (void) printf("\tTotal stream length = %lld (0x%llx)\n",

View File

@ -52,7 +52,7 @@
* At random times, the child self-immolates with a SIGKILL. * At random times, the child self-immolates with a SIGKILL.
* This is the software equivalent of pulling the power cord. * This is the software equivalent of pulling the power cord.
* The parent then runs the test again, using the existing * The parent then runs the test again, using the existing
* storage pool, as many times as desired. If backwards compatability * storage pool, as many times as desired. If backwards compatibility
* testing is enabled ztest will sometimes run the "older" version * testing is enabled ztest will sometimes run the "older" version
* of ztest after a SIGKILL. * of ztest after a SIGKILL.
* *
@ -1301,13 +1301,13 @@ static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{ {
ASSERT(bt->bt_magic == BT_MAGIC); ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
ASSERT(bt->bt_objset == dmu_objset_id(os)); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
ASSERT(bt->bt_object == object); ASSERT3U(bt->bt_object, ==, object);
ASSERT(bt->bt_offset == offset); ASSERT3U(bt->bt_offset, ==, offset);
ASSERT(bt->bt_gen <= gen); ASSERT3U(bt->bt_gen, <=, gen);
ASSERT(bt->bt_txg <= txg); ASSERT3U(bt->bt_txg, <=, txg);
ASSERT(bt->bt_crtxg == crtxg); ASSERT3U(bt->bt_crtxg, ==, crtxg);
} }
static ztest_block_tag_t * static ztest_block_tag_t *
@ -3557,6 +3557,11 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
if (error) if (error)
fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
error = dsl_dataset_promote(clone2name, NULL); error = dsl_dataset_promote(clone2name, NULL);
if (error == ENOSPC) {
dmu_objset_disown(os, FTAG);
ztest_record_enospc(FTAG);
goto out;
}
if (error != EBUSY) if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error); error);
@ -3739,11 +3744,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
return; return;
} }
dmu_object_set_checksum(os, bigobj, enum zio_checksum cksum;
(enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); do {
cksum = (enum zio_checksum)
ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
dmu_object_set_checksum(os, bigobj, cksum, tx);
dmu_object_set_compress(os, bigobj, enum zio_compress comp;
(enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); do {
comp = (enum zio_compress)
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
dmu_object_set_compress(os, bigobj, comp, tx);
/* /*
* For each index from n to n + s, verify that the existing bufwad * For each index from n to n + s, verify that the existing bufwad
@ -4867,8 +4880,13 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
error = dsl_dataset_user_hold(holds, 0, NULL); error = dsl_dataset_user_hold(holds, 0, NULL);
fnvlist_free(holds); fnvlist_free(holds);
if (error) if (error == ENOSPC) {
fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); ztest_record_enospc("dsl_dataset_user_hold");
goto out;
} else if (error) {
fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
fullname, tag, error);
}
error = dsl_destroy_snapshot(fullname, B_FALSE); error = dsl_destroy_snapshot(fullname, B_FALSE);
if (error != EBUSY) { if (error != EBUSY) {
@ -5336,7 +5354,7 @@ ztest_run_zdb(char *pool)
} }
(void) sprintf(zdb, (void) sprintf(zdb,
"%s -bcc%s%s -U %s %s", "%s -bcc%s%s -d -U %s %s",
bin, bin,
ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "", ztest_opts.zo_verbose >= 4 ? "v" : "",

View File

@ -39,6 +39,7 @@
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/avl.h> #include <sys/avl.h>
#include <ucred.h> #include <ucred.h>
#include <libzfs_core.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -614,13 +615,16 @@ typedef struct sendflags {
/* show progress (ie. -v) */ /* show progress (ie. -v) */
boolean_t progress; boolean_t progress;
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
} sendflags_t; } sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
extern int zfs_send(zfs_handle_t *, const char *, const char *, extern int zfs_send(zfs_handle_t *, const char *, const char *,
sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
extern int zfs_send_one(zfs_handle_t *, const char *, int); extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
extern int zfs_promote(zfs_handle_t *); extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *, extern int zfs_hold(zfs_handle_t *, const char *, const char *,

View File

@ -52,7 +52,11 @@ int lzc_hold(nvlist_t *, int, nvlist_t **);
int lzc_release(nvlist_t *, nvlist_t **); int lzc_release(nvlist_t *, nvlist_t **);
int lzc_get_holds(const char *, nvlist_t **); int lzc_get_holds(const char *, nvlist_t **);
int lzc_send(const char *, const char *, int); enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
int lzc_send_space(const char *, const char *, uint64_t *); int lzc_send_space(const char *, const char *, uint64_t *);

View File

@ -4,6 +4,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc.h \
$(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl.h \
$(top_srcdir)/include/sys/avl_impl.h \ $(top_srcdir)/include/sys/avl_impl.h \
$(top_srcdir)/include/sys/blkptr.h \
$(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bplist.h \
$(top_srcdir)/include/sys/bpobj.h \ $(top_srcdir)/include/sys/bpobj.h \
$(top_srcdir)/include/sys/bptree.h \ $(top_srcdir)/include/sys/bptree.h \

38
include/sys/blkptr.h Normal file
View File

@ -0,0 +1,38 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_BLKPTR_H
#define _SYS_BLKPTR_H
#include <sys/spa.h>
#include <sys/zio.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Helpers for embedded block pointers: judging by the names, one stores an
 * already-compressed payload in a blkptr_t and the other copies it back out.
 *
 * NOTE(review): the parameters are unnamed in these declarations.  By analogy
 * with dmu_buf_write_embedded() the encode arguments are presumably
 * (bp, data, compression function, uncompressed size, compressed size), and
 * the decode arguments (bp, output buffer) -- confirm against the definitions.
 */
void encode_embedded_bp_compressed(blkptr_t *, void *,
enum zio_compress, int, int);
void decode_embedded_bp_compressed(const blkptr_t *, void *);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_BLKPTR_H */

View File

@ -272,6 +272,9 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
/*
 * Declaration only; the definition is not part of this header.  Per the
 * parameter names it takes the target dbuf, the payload, the embedded type,
 * the compression function, the uncompressed and compressed payload sizes,
 * a byteorder and the transaction.
 * NOTE(review): the size units (presumably bytes) and the encoding of
 * 'byteorder' are not visible here -- confirm against the definition.
 */
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
bp_embedded_type_t etype, enum zio_compress comp,
int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db);

View File

@ -116,6 +116,14 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \ ((ot) & DMU_OT_METADATA) : \
dmu_ot[(int)(ot)].ot_metadata) dmu_ot[(int)(ot)].ot_metadata)
/*
 * These object types use blk_fill != 1 for their L0 bp's (see BP_GET_FILL()).
 * Therefore they can't have their data embedded (i.e. use a BP_IS_EMBEDDED()
 * bp), because blk_fill is repurposed for embedded BPs.
 *
 * Evaluates to nonzero only for DMU_OT_DNODE and DMU_OT_OBJSET.
 */
#define DMU_OT_HAS_FILL(ot) \
((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) : \ ((ot) & DMU_OT_BYTESWAP_MASK) : \
dmu_ot[(int)(ot)].ot_byteswap) dmu_ot[(int)(ot)].ot_byteswap)
@ -391,6 +399,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx); dmu_tx_t *tx);
/*
 * Declaration only; the definition is not part of this header.  Per the
 * parameter names it addresses the destination by (objset, object, offset)
 * and takes the payload, its embedded type and compression function (both
 * passed as uint8_t), the uncompressed and compressed sizes, a byteorder and
 * the transaction.
 * NOTE(review): presumably the object-level counterpart of
 * dmu_buf_write_embedded() -- confirm against the definition.
 */
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx);
/* /*
* Decide how to write a block: checksum, compression, number of copies, etc. * Decide how to write a block: checksum, compression, number of copies, etc.
*/ */

View File

@ -269,12 +269,15 @@ typedef struct dmu_sendarg {
int dsa_err; int dsa_err;
dmu_pendop_t dsa_pending_op; dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental; boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object; uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset; uint64_t dsa_last_data_offset;
} dmu_sendarg_t; } dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
void *, dmu_buf_t **);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -37,12 +37,12 @@ struct dsl_dataset;
struct drr_begin; struct drr_begin;
struct avl_tree; struct avl_tree;
int dmu_send(const char *tosnap, const char *fromsnap, int outfd, int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
struct vnode *vp, offset_t *off); int outfd, struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep); uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
int outfd, struct vnode *vp, offset_t *off); boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
typedef struct dmu_recv_cookie { typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds; struct dsl_dataset *drc_ds;

View File

@ -156,7 +156,7 @@ typedef struct zio_cksum {
* +-------+-------+-------+-------+-------+-------+-------+-------+ * +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 | * 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+ * +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+ * +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding | * 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+ * +-------+-------+-------+-------+-------+-------+-------+-------+
@ -190,7 +190,8 @@ typedef struct zio_cksum {
* G gang block indicator * G gang block indicator
* B byteorder (endianness) * B byteorder (endianness)
* D dedup * D dedup
* X unused * X encryption (on version 30, which is not supported)
* E blkptr_t contains embedded data (see below)
* lvl level of indirection * lvl level of indirection
* type DMU object type * type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg * phys birth txg of block allocation; zero if same as logical birth txg
@ -198,6 +199,100 @@ typedef struct zio_cksum {
* fill count number of non-zero blocks under this bp * fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes * checksum[4] 256-bit checksum of the data this bp describes
*/ */
/*
* "Embedded" blkptr_t's don't actually point to a block, instead they
* have a data payload embedded in the blkptr_t itself. See the comment
* in blkptr.c for more details.
*
* The blkptr_t is laid out as follows:
*
*   64      56      48      40      32      24      16      8       0
*   +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 |                            payload                            |
* 1 |                            payload                            |
* 2 |                            payload                            |
* 3 |                            payload                            |
* 4 |                            payload                            |
* 5 |                            payload                            |
*   +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type  | etype |E| comp| PSIZE|         LSIZE          |
*   +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 |                            payload                            |
* 8 |                            payload                            |
* 9 |                            payload                            |
*   +-------+-------+-------+-------+-------+-------+-------+-------+
* a |                       logical birth txg                       |
*   +-------+-------+-------+-------+-------+-------+-------+-------+
* b |                            payload                            |
* c |                            payload                            |
* d |                            payload                            |
* e |                            payload                            |
* f |                            payload                            |
*   +-------+-------+-------+-------+-------+-------+-------+-------+
*
* Legend:
*
* payload contains the embedded data
* B (byteorder) byteorder (endianness)
* D (dedup) padding (set to zero)
* X encryption (set to zero; see above)
* E (embedded) set to one
* lvl indirection level
* type DMU object type
* etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
* comp compression function of payload
* PSIZE size of payload after compression, in bytes
* LSIZE logical size of payload, in bytes
* note that 25 bits is enough to store the largest
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
* log. birth transaction group in which the block was logically born
*
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
* bp's they are stored in units of SPA_MINBLOCKSHIFT.
* Generally, the generic BP_GET_*() macros can be used on embedded BP's.
* The B, D, X, lvl, type, and comp fields are stored the same as with normal
* BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
* be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
* other macros, as they assert that they are only used on BP's of the correct
* "embedded-ness".
*/
/*
 * Accessors for the blk_prop fields that exist only in embedded blkptr_t's.
 * Every accessor asserts BP_IS_EMBEDDED(bp) first.  The getters use the comma
 * operator so that they remain usable as expressions; the setters are
 * statement macros wrapped in do { } while (0).
 */
/* etype: 8-bit field of blk_prop starting at bit 40. */
#define BPE_GET_ETYPE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET((bp)->blk_prop, 40, 8))
#define BPE_SET_ETYPE(bp, t) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, t); \
_NOTE(CONSTCOND) } while (0)
/*
 * LSIZE: 25-bit field of blk_prop starting at bit 0, passed with shift 0
 * (i.e. in bytes, not SPA_MINBLOCKSHIFT units) and bias 1.
 * NOTE(review): the bias presumably means the field stores size - 1 --
 * confirm against the BF64_GET_SB/BF64_SET_SB definitions.
 */
#define BPE_GET_LSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
#define BPE_SET_LSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
/* PSIZE: 7-bit field of blk_prop starting at bit 25; same shift and bias. */
#define BPE_GET_PSIZE(bp) \
(ASSERT(BP_IS_EMBEDDED(bp)), \
BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
#define BPE_SET_PSIZE(bp, x) do { \
ASSERT(BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
_NOTE(CONSTCOND) } while (0)
/*
 * Values of the etype field of an embedded blkptr_t (how to interpret the
 * embedded data).
 *
 * NOTE: NUM_BP_EMBEDDED_TYPES is defined equal to BP_EMBEDDED_TYPE_RESERVED,
 * so it counts only the types that precede the reserved one (currently just
 * BP_EMBEDDED_TYPE_DATA).  An array dimensioned by NUM_BP_EMBEDDED_TYPES has
 * no slot for BP_EMBEDDED_TYPE_RESERVED.
 */
typedef enum bp_embedded_type {
BP_EMBEDDED_TYPE_DATA,
BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
} bp_embedded_type_t;
/*
 * Number of 64-bit words of an embedded blkptr_t that carry payload: per the
 * layout diagram above, all 16 words except word 6 (blk_prop) and word a
 * (logical birth txg).
 */
#define BPE_NUM_WORDS 14
/* Payload capacity in bytes (14 words * 8 bytes = 112). */
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
/* True when wp points at neither bp->blk_prop nor bp->blk_birth. */
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
@ -244,20 +339,37 @@ typedef struct blkptr {
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \ #define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) (BP_IS_EMBEDDED(bp) ? \
#define BP_SET_LSIZE(bp, x) \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define BP_SET_LSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \ #define BP_GET_PSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) (BP_IS_EMBEDDED(bp) ? 0 : \
#define BP_SET_PSIZE(bp, x) \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define BP_SET_PSIZE(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET_SB((bp)->blk_prop, \
16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) #define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7)
#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) #define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x)
#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
#define BP_GET_CHECKSUM(bp) \
(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
BF64_GET((bp)->blk_prop, 40, 8))
#define BP_SET_CHECKSUM(bp, x) do { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop, 40, 8, x); \
_NOTE(CONSTCOND) } while (0)
#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
@ -265,9 +377,6 @@ typedef struct blkptr {
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
@ -275,31 +384,39 @@ typedef struct blkptr {
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \ #define BP_PHYSICAL_BIRTH(bp) \
((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) (BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_SET_BIRTH(bp, logical, physical) \ #define BP_SET_BIRTH(bp, logical, physical) \
{ \ { \
ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \ (bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
} }
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
#define BP_GET_ASIZE(bp) \ #define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[2])) DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \ #define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \ #define BP_GET_NDVAS(bp) \
(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ (BP_IS_EMBEDDED(bp) ? 0 : \
!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2])) !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \ #define BP_COUNT_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
DVA_GET_GANG(&(bp)->blk_dva[2])) DVA_GET_GANG(&(bp)->blk_dva[2])))
#define DVA_EQUAL(dva1, dva2) \ #define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@ -307,6 +424,7 @@ typedef struct blkptr {
#define BP_EQUAL(bp1, bp2) \ #define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
(bp1)->blk_birth == (bp2)->blk_birth && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@ -327,11 +445,13 @@ typedef struct blkptr {
(zcp)->zc_word[3] = w3; \ (zcp)->zc_word[3] = w3; \
} }
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_GANG(bp) \
(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
(dva)->dva_word[1] == 0ULL) (dva)->dva_word[1] == 0ULL)
#define BP_IS_HOLE(bp) DVA_IS_EMPTY(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) \
(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
/* BP_IS_RAIDZ(bp) assumes no block compression */ /* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@ -386,6 +506,17 @@ typedef struct blkptr {
" birth=%lluL", \ " birth=%lluL", \
(u_longlong_t)bp->blk_birth); \ (u_longlong_t)bp->blk_birth); \
} \ } \
} else if (BP_IS_EMBEDDED(bp)) { \
len = func(buf + len, size - len, \
"EMBEDDED [L%llu %s] et=%u %s " \
"size=%llxL/%llxP birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(int)BPE_GET_ETYPE(bp), \
compress, \
(u_longlong_t)BPE_GET_LSIZE(bp), \
(u_longlong_t)BPE_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
} else { \ } else { \
for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ for (d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \ const dva_t *dva = &bp->blk_dva[d]; \
@ -419,7 +550,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \ (u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
(u_longlong_t)bp->blk_fill, \ (u_longlong_t)BP_GET_FILL(bp), \
ws, \ ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[0], \
(u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \

View File

@ -38,6 +38,7 @@
#include <sys/refcount.h> #include <sys/refcount.h>
#include <sys/bplist.h> #include <sys/bplist.h>
#include <sys/bpobj.h> #include <sys/bpobj.h>
#include <sys/zfeature.h>
#include <zfeature_common.h> #include <zfeature_common.h>
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_ZFS_IOCTL_H #ifndef _SYS_ZFS_IOCTL_H
@ -90,15 +90,19 @@ typedef enum drr_headertype {
* Feature flags for zfs send streams (flags in drr_versioninfo) * Feature flags for zfs send streams (flags in drr_versioninfo)
*/ */
#define DMU_BACKUP_FEATURE_DEDUP (0x1) #define DMU_BACKUP_FEATURE_DEDUP (1<<0)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1)
#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) #define DMU_BACKUP_FEATURE_SA_SPILL (1<<2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
/* /*
* Mask of all supported backup features * Mask of all supported backup features
*/ */
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
/* Are all features in the given flag word currently supported? */ /* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@ -140,7 +144,7 @@ typedef struct dmu_replay_record {
enum { enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
DRR_SPILL, DRR_NUMTYPES DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
} drr_type; } drr_type;
uint32_t drr_payloadlen; uint32_t drr_payloadlen;
union { union {
@ -217,6 +221,19 @@ typedef struct dmu_replay_record {
uint64_t drr_pad[4]; /* needed for crypto */ uint64_t drr_pad[4]; /* needed for crypto */
/* spill data follows */ /* spill data follows */
} drr_spill; } drr_spill;
/*
 * Header of a DRR_WRITE_EMBEDDED record. The payload bytes follow the
 * record in the stream; the stream readers in libzfs consume
 * P2ROUNDUP(drr_psize, 8) bytes after the record.
 */
struct drr_write_embedded {
uint64_t drr_object;
uint64_t drr_offset;
/* logical length, should equal blocksize */
uint64_t drr_length;
uint64_t drr_toguid;
uint8_t drr_compression;
/* NOTE(review): presumably a bp_embedded_type_t value -- confirm */
uint8_t drr_etype;
/* with the two uint8_t fields above, fills out 8 bytes */
uint8_t drr_pad[6];
uint32_t drr_lsize; /* uncompressed size of payload */
uint32_t drr_psize; /* compr. (real) size of payload */
/* (possibly compressed) content follows */
} drr_write_embedded;
} drr_u; } drr_u;
} dmu_replay_record_t; } dmu_replay_record_t;
@ -325,8 +342,8 @@ typedef struct zfs_cmd {
dmu_objset_stats_t zc_objset_stats; dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record; struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record; zinject_record_t zc_inject_record;
boolean_t zc_defer_destroy; uint32_t zc_defer_destroy;
boolean_t zc_temphold; uint32_t zc_flags;
uint64_t zc_action_handle; uint64_t zc_action_handle;
int zc_cleanup_fd; int zc_cleanup_fd;
uint8_t zc_simple; uint8_t zc_simple;

View File

@ -82,6 +82,12 @@ enum zio_checksum {
ZIO_CHECKSUM_FUNCTIONS ZIO_CHECKSUM_FUNCTIONS
}; };
/*
* The number of "legacy" checksum functions which can be set on individual
* objects.
*/
#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
@ -111,6 +117,12 @@ enum zio_compress {
ZIO_COMPRESS_FUNCTIONS ZIO_COMPRESS_FUNCTIONS
}; };
/*
* The number of "legacy" compression functions which can be set on individual
* objects.
*/
#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF

View File

@ -46,6 +46,7 @@ typedef enum spa_feature {
SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS, SPA_FEATURE_BOOKMARKS,
SPA_FEATURES SPA_FEATURES
} spa_feature_t; } spa_feature_t;
@ -65,7 +66,7 @@ typedef struct zfeature_info {
const spa_feature_t *fi_depends; const spa_feature_t *fi_depends;
} zfeature_info_t; } zfeature_info_t;
typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg); typedef int (zfeature_func_t)(zfeature_info_t *, void *);
#define ZFS_FEATURE_DEBUG #define ZFS_FEATURE_DEBUG
@ -74,8 +75,8 @@ extern zfeature_info_t spa_feature_table[SPA_FEATURES];
extern boolean_t zfeature_is_valid_guid(const char *); extern boolean_t zfeature_is_valid_guid(const char *);
extern boolean_t zfeature_is_supported(const char *); extern boolean_t zfeature_is_supported(const char *);
extern int zfeature_lookup_name(const char *name, spa_feature_t *res); extern int zfeature_lookup_name(const char *, spa_feature_t *);
extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check); extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
extern void zpool_feature_init(void); extern void zpool_feature_init(void);

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved * All rights reserved
@ -49,6 +49,7 @@
#include <time.h> #include <time.h>
#include <libzfs.h> #include <libzfs.h>
#include <libzfs_core.h>
#include "zfs_namecheck.h" #include "zfs_namecheck.h"
#include "zfs_prop.h" #include "zfs_prop.h"
@ -220,6 +221,7 @@ cksummer(void *arg)
struct drr_object *drro = &thedrr.drr_u.drr_object; struct drr_object *drro = &thedrr.drr_u.drr_object;
struct drr_write *drrw = &thedrr.drr_u.drr_write; struct drr_write *drrw = &thedrr.drr_u.drr_write;
struct drr_spill *drrs = &thedrr.drr_u.drr_spill; struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
FILE *ofp; FILE *ofp;
int outfd; int outfd;
dmu_replay_record_t wbr_drr = {0}; dmu_replay_record_t wbr_drr = {0};
@ -415,6 +417,20 @@ cksummer(void *arg)
break; break;
} }
case DRR_WRITE_EMBEDDED:
{
if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
&stream_cksum, outfd) == -1)
goto out;
(void) ssread(buf,
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
if (cksum_and_write(buf,
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
&stream_cksum, outfd) == -1)
goto out;
break;
}
case DRR_FREE: case DRR_FREE:
{ {
if (cksum_and_write(drr, sizeof (dmu_replay_record_t), if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@ -796,7 +812,7 @@ typedef struct send_dump_data {
char prevsnap[ZFS_MAXNAMELEN]; char prevsnap[ZFS_MAXNAMELEN];
uint64_t prevsnap_obj; uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose, dryrun, parsable, progress; boolean_t verbose, dryrun, parsable, progress, embed_data;
int outfd; int outfd;
boolean_t err; boolean_t err;
nvlist_t *fss; nvlist_t *fss;
@ -876,7 +892,8 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
*/ */
static int static int
dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
boolean_t fromorigin, int outfd, nvlist_t *debugnv) boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
nvlist_t *debugnv)
{ {
zfs_cmd_t zc = {"\0"}; zfs_cmd_t zc = {"\0"};
libzfs_handle_t *hdl = zhp->zfs_hdl; libzfs_handle_t *hdl = zhp->zfs_hdl;
@ -890,6 +907,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
zc.zc_obj = fromorigin; zc.zc_obj = fromorigin;
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj; zc.zc_fromobj = fromsnap_obj;
zc.zc_flags = flags;
VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
if (fromsnap && fromsnap[0] != '\0') { if (fromsnap && fromsnap[0] != '\0') {
@ -1140,8 +1158,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
} }
} }
enum lzc_send_flags flags = 0;
if (sdd->embed_data)
flags |= LZC_SEND_FLAG_EMBED_DATA;
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, sdd->debugnv); fromorigin, sdd->outfd, flags, sdd->debugnv);
if (sdd->progress) { if (sdd->progress) {
(void) pthread_cancel(tid); (void) pthread_cancel(tid);
@ -1485,6 +1507,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
sdd.parsable = flags->parsable; sdd.parsable = flags->parsable;
sdd.progress = flags->progress; sdd.progress = flags->progress;
sdd.dryrun = flags->dryrun; sdd.dryrun = flags->dryrun;
sdd.embed_data = flags->embed_data;
sdd.filter_cb = filter_func; sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg; sdd.filter_cb_arg = cb_arg;
if (debugnvp) if (debugnvp)
@ -1616,7 +1639,8 @@ err_out:
} }
int int
zfs_send_one(zfs_handle_t *zhp, const char *from, int fd) zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
enum lzc_send_flags flags)
{ {
int err; int err;
libzfs_handle_t *hdl = zhp->zfs_hdl; libzfs_handle_t *hdl = zhp->zfs_hdl;
@ -1625,7 +1649,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name); "warning: cannot send '%s'"), zhp->zfs_name);
err = lzc_send(zhp->zfs_name, from, fd); err = lzc_send(zhp->zfs_name, from, fd, flags);
if (err != 0) { if (err != 0) {
switch (errno) { switch (errno) {
case EXDEV: case EXDEV:
@ -2543,6 +2567,16 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
(void) recv_read(hdl, fd, buf, (void) recv_read(hdl, fd, buf,
drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
break; break;
case DRR_WRITE_EMBEDDED:
if (byteswap) {
drr->drr_u.drr_write_embedded.drr_psize =
BSWAP_32(drr->drr_u.drr_write_embedded.
drr_psize);
}
(void) recv_read(hdl, fd, buf,
P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
8), B_FALSE, NULL);
break;
case DRR_WRITE_BYREF: case DRR_WRITE_BYREF:
case DRR_FREEOBJECTS: case DRR_FREEOBJECTS:
case DRR_FREE: case DRR_FREE:

View File

@ -439,6 +439,8 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
} }
/* /*
* Generate a zfs send stream for the specified snapshot and write it to
* the specified file descriptor.
* *
* "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
* *
@ -452,9 +454,15 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
* snapshot in the origin, etc. * snapshot in the origin, etc.
* *
* "fd" is the file descriptor to write the send stream to. * "fd" is the file descriptor to write the send stream to.
*
* If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
* to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
* which the receiving system must support (as indicated by support
* for the "embedded_data" feature).
*/ */
int int
lzc_send(const char *snapname, const char *from, int fd) lzc_send(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags)
{ {
nvlist_t *args; nvlist_t *args;
int err; int err;
@ -463,6 +471,8 @@ lzc_send(const char *snapname, const char *from, int fd)
fnvlist_add_int32(args, "fd", fd); fnvlist_add_int32(args, "fd", fd);
if (from != NULL) if (from != NULL)
fnvlist_add_string(args, "fromsnap", from); fnvlist_add_string(args, "fromsnap", from);
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
nvlist_free(args); nvlist_free(args);
return (err); return (err);

View File

@ -21,6 +21,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zcommon/zpool_prop.c \ $(top_srcdir)/module/zcommon/zpool_prop.c \
$(top_srcdir)/module/zcommon/zprop_common.c \ $(top_srcdir)/module/zcommon/zprop_common.c \
$(top_srcdir)/module/zfs/arc.c \ $(top_srcdir)/module/zfs/arc.c \
$(top_srcdir)/module/zfs/blkptr.c \
$(top_srcdir)/module/zfs/bplist.c \ $(top_srcdir)/module/zfs/bplist.c \
$(top_srcdir)/module/zfs/bpobj.c \ $(top_srcdir)/module/zfs/bpobj.c \
$(top_srcdir)/module/zfs/bptree.c \ $(top_srcdir)/module/zfs/bptree.c \

View File

@ -358,6 +358,33 @@ never return to being \fBenabled\fB.
.RE .RE
.sp
.ne 2
.na
\fB\fBembedded_data\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:embedded_data
READ\-ONLY COMPATIBLE no
DEPENDENCIES none
.TE
This feature improves the performance and compression ratio of
highly-compressible blocks. Blocks whose contents can compress to 112 bytes
or smaller can take advantage of this feature.
When this feature is enabled, the contents of highly-compressible blocks are
stored in the block "pointer" itself (a misnomer in this case, as it contains
the compressed data, rather than a pointer to its location on disk). Thus
the space of the block (one sector, typically 512 bytes or 4KB) is saved,
and no additional i/o is needed to read and write the data block.
This feature becomes \fBactive\fR as soon as it is enabled and will
never return to being \fBenabled\fR.
.RE
.SH "SEE ALSO" .SH "SEE ALSO"
\fBzpool\fR(8) \fBzpool\fR(8)

View File

@ -174,12 +174,12 @@ zfs \- configures ZFS file systems
.LP .LP
.nf .nf
\fBzfs\fR \fBsend\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR \fBzfs\fR \fBsend\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.fi .fi
.LP .LP
.nf .nf
\fBzfs\fR \fBsend\fR [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fBzfs\fR \fBsend\fR [\fB-e\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi .fi
.LP .LP
@ -2600,7 +2600,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the
.sp .sp
.ne 2 .ne 2
.na .na
\fBzfs send\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR \fBzfs send\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
.ad .ad
.sp .6 .sp .6
.RS 4n .RS 4n
@ -2657,6 +2657,23 @@ Generate a deduplicated stream. Blocks which would have been sent multiple times
.ne 2 .ne 2
.mk .mk
.na .na
\fB\fB-e\fR\fR
.ad
.sp .6
.RS 4n
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the \fBembedded_data\fR pool
feature. This flag has no effect if the \fBembedded_data\fR feature is
disabled. The receiving system must have the \fBembedded_data\fR feature
enabled. If the \fBlz4_compress\fR feature is active on the sending system,
then the receiving system must have that feature enabled as well. See
\fBzpool-features\fR(5) for details on ZFS feature flags and the
\fBembedded_data\fR feature.
.RE
.sp
.ne 2
.na
\fB\fB-p\fR\fR \fB\fB-p\fR\fR
.ad .ad
.sp .6 .sp .6
@ -2705,7 +2722,7 @@ The format of the stream is committed. You will be able to receive your streams
.sp .sp
.ne 2 .ne 2
.na .na
\fBzfs send\fR [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fBzfs send\fR [\fB-e\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.ad .ad
.sp .6 .sp .6
.RS 4n .RS 4n
@ -2822,6 +2839,22 @@ Do not actually receive the stream. This can be useful in conjunction with the \
Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side. Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
.RE .RE
.sp
.ne 2
.na
\fB\fB-e\fR\fR
.ad
.sp .6
.RS 4n
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the \fBembedded_data\fR pool
feature. This flag has no effect if the \fBembedded_data\fR feature is
disabled. The receiving system must have the \fBembedded_data\fR feature
enabled. If the \fBlz4_compress\fR feature is active on the sending system,
then the receiving system must have that feature enabled as well. See
\fBzpool-features\fR(5) for details on ZFS feature flags and the
\fBembedded_data\fR feature.
.RE
.RE .RE
.sp .sp

View File

@ -5,6 +5,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
obj-$(CONFIG_ZFS) := $(MODULE).o obj-$(CONFIG_ZFS) := $(MODULE).o
$(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o

View File

@ -812,8 +812,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
} }
static arc_buf_hdr_t * static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{ {
const dva_t *dva = BP_IDENTITY(bp);
uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx); kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *buf; arc_buf_hdr_t *buf;
@ -845,6 +847,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf; arc_buf_hdr_t *fbuf;
uint32_t i; uint32_t i;
ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
ASSERT(buf->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(buf)); ASSERT(!HDR_IN_HASH_TABLE(buf));
*lockp = hash_lock; *lockp = hash_lock;
mutex_enter(hash_lock); mutex_enter(hash_lock);
@ -3034,10 +3038,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
static void static void
arc_read_done(zio_t *zio) arc_read_done(zio_t *zio)
{ {
arc_buf_hdr_t *hdr, *found; arc_buf_hdr_t *hdr;
arc_buf_t *buf; arc_buf_t *buf;
arc_buf_t *abuf; /* buffer we're assigning to callback */ arc_buf_t *abuf; /* buffer we're assigning to callback */
kmutex_t *hash_lock; kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list, *acb; arc_callback_t *callback_list, *acb;
int freeable = FALSE; int freeable = FALSE;
@ -3052,12 +3056,24 @@ arc_read_done(zio_t *zio)
* reason for it not to be found is if we were freed during the * reason for it not to be found is if we were freed during the
* read. * read.
*/ */
found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, if (HDR_IN_HASH_TABLE(hdr)) {
&hash_lock); arc_buf_hdr_t *found;
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
(found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || ASSERT3U(hdr->b_dva.dva_word[0], ==,
(found == hdr && HDR_L2_READING(hdr))); BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
BP_IDENTITY(zio->io_bp)->dva_word[1]);
found = buf_hash_find(hdr->b_spa, zio->io_bp,
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
hash_lock == NULL) ||
(found == hdr &&
DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
(found == hdr && HDR_L2_READING(hdr)));
}
hdr->b_flags &= ~ARC_L2_EVICTED; hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
@ -3181,17 +3197,26 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb) const zbookmark_t *zb)
{ {
arc_buf_hdr_t *hdr; arc_buf_hdr_t *hdr = NULL;
arc_buf_t *buf = NULL; arc_buf_t *buf = NULL;
kmutex_t *hash_lock; kmutex_t *hash_lock = NULL;
zio_t *rzio; zio_t *rzio;
uint64_t guid = spa_load_guid(spa); uint64_t guid = spa_load_guid(spa);
int rc = 0; int rc = 0;
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
top: top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), if (!BP_IS_EMBEDDED(bp)) {
&hash_lock); /*
if (hdr && hdr->b_datacnt > 0) { * Embedded BP's have no DVA and require no I/O to "read".
* Create an anonymous arc buf to back it.
*/
hdr = buf_hash_find(guid, bp, &hash_lock);
}
if (hdr != NULL && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED; *arc_flags |= ARC_CACHED;
@ -3265,7 +3290,7 @@ top:
done(NULL, buf, private); done(NULL, buf, private);
} else { } else {
uint64_t size = BP_GET_LSIZE(bp); uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb; arc_callback_t *acb;
vdev_t *vd = NULL; vdev_t *vd = NULL;
uint64_t addr = 0; uint64_t addr = 0;
boolean_t devw = B_FALSE; boolean_t devw = B_FALSE;
@ -3274,15 +3299,17 @@ top:
if (hdr == NULL) { if (hdr == NULL) {
/* this block is not in the cache */ /* this block is not in the cache */
arc_buf_hdr_t *exists; arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
buf = arc_buf_alloc(spa, size, private, type); buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr; hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp); if (!BP_IS_EMBEDDED(bp)) {
hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
exists = buf_hash_insert(hdr, &hash_lock); hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
if (exists) { exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
/* somebody beat us to the hash insert */ /* somebody beat us to the hash insert */
mutex_exit(hash_lock); mutex_exit(hash_lock);
buf_discard_identity(hdr); buf_discard_identity(hdr);
@ -3354,7 +3381,8 @@ top:
vd = NULL; vd = NULL;
} }
mutex_exit(hash_lock); if (hash_lock != NULL)
mutex_exit(hash_lock);
/* /*
* At this point, we have a level 1 cache miss. Try again in * At this point, we have a level 1 cache miss. Try again in
@ -3526,8 +3554,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
kmutex_t *hash_lock; kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa); uint64_t guid = spa_load_guid(spa);
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), ASSERT(!BP_IS_EMBEDDED(bp));
&hash_lock);
hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL) if (hdr == NULL)
return; return;
if (HDR_BUF_AVAILABLE(hdr)) { if (HDR_BUF_AVAILABLE(hdr)) {
@ -3854,7 +3883,7 @@ arc_write_done(zio_t *zio)
ASSERT(hdr->b_acb == NULL); ASSERT(hdr->b_acb == NULL);
if (zio->io_error == 0) { if (zio->io_error == 0) {
if (BP_IS_HOLE(zio->io_bp)) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
buf_discard_identity(hdr); buf_discard_identity(hdr);
} else { } else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_dva = *BP_IDENTITY(zio->io_bp);
@ -3866,10 +3895,10 @@ arc_write_done(zio_t *zio)
} }
/* /*
* If the block to be written was all-zero, we may have * If the block to be written was all-zero or compressed enough to be
* compressed it away. In this case no write was performed * embedded in the BP, no write was performed so there will be no
* so there will be no dva/birth/checksum. The buffer must * dva/birth/checksum. The buffer must therefore remain anonymous
* therefore remain anonymous (and uncached). * (and uncached).
*/ */
if (!BUF_EMPTY(hdr)) { if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists; arc_buf_hdr_t *exists;
@ -5219,7 +5248,7 @@ static boolean_t
l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
{ {
void *cdata; void *cdata;
size_t csize, len; size_t csize, len, rounded;
ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
ASSERT(l2hdr->b_tmp_cdata != NULL); ASSERT(l2hdr->b_tmp_cdata != NULL);
@ -5229,6 +5258,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
cdata, l2hdr->b_asize); cdata, l2hdr->b_asize);
rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
if (rounded > csize) {
bzero((char *)cdata + csize, rounded - csize);
csize = rounded;
}
if (csize == 0) { if (csize == 0) {
/* zero block, indicate that there's nothing to write */ /* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len); zio_data_buf_free(cdata, len);

121
module/zfs/blkptr.c Normal file
View File

@ -0,0 +1,121 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
/*
* Embedded-data Block Pointers
*
* Normally, block pointers point (via their DVAs) to a block which holds data.
* If the data that we need to store is very small, this is an inefficient
* use of space, because a block must be at minimum 1 sector (typically 512
* bytes or 4KB). Additionally, reading these small blocks tends to generate
* more random reads.
*
* Embedded-data Block Pointers allow small pieces of data (the "payload",
* up to 112 bytes) to be stored in the block pointer itself, instead of
* being pointed to. The "Pointer" part of this name is a bit of a
* misnomer, as nothing is pointed to.
*
* BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
* be embedded in the block pointer. The logic for this is handled in
* the SPA, by the zio pipeline. Therefore most code outside the zio
* pipeline doesn't need special-cases to handle these block pointers.
*
* See spa.h for details on the exact layout of embedded block pointers.
*/
/*
 * Build an embedded-data block pointer in *bp that carries
 * compressed_size bytes taken from data.
 *
 * bp is zeroed first, then marked embedded and stamped with the
 * compression function, the host byte order and the logical/physical
 * sizes.  The payload bytes are then packed, eight at a time, into the
 * 64-bit words of the block pointer, skipping any word for which
 * BPE_IS_PAYLOADWORD() is false.
 *
 * compressed_size must not exceed BPE_PAYLOAD_SIZE; if comp is
 * ZIO_COMPRESS_OFF the two sizes must be equal (both are asserted).
 */
void
encode_embedded_bp_compressed(blkptr_t *bp, void *data,
    enum zio_compress comp, int uncompressed_size, int compressed_size)
{
	uint64_t *bp64 = (uint64_t *)bp;
	uint8_t *src = data;
	uint64_t word = 0;
	int idx;

	ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
	ASSERT(uncompressed_size == compressed_size ||
	    comp != ZIO_COMPRESS_OFF);
	ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);

	/* Start from an all-zero bp so unused payload bits stay clear. */
	bzero(bp, sizeof (*bp));
	BP_SET_EMBEDDED(bp, B_TRUE);
	BP_SET_COMPRESS(bp, comp);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	BPE_SET_LSIZE(bp, uncompressed_size);
	BPE_SET_PSIZE(bp, compressed_size);

	/*
	 * Pack the bytes little-endian: byte 0 lands in the low 8 bits
	 * of the first payload word, byte 1 in the next 8 bits, etc.
	 */
	for (idx = 0; idx < compressed_size; idx++) {
		size_t lane = idx % sizeof (word);

		BF64_SET(word, lane * NBBY, NBBY, src[idx]);

		if (lane != sizeof (word) - 1)
			continue;

		/* The accumulator is full: flush it and move on. */
		ASSERT3P(bp64, <, bp + 1);
		*bp64 = word;
		bp64++;
		/* Step over a word that is not available for payload. */
		if (!BPE_IS_PAYLOADWORD(bp, bp64))
			bp64++;
		word = 0;
	}

	/*
	 * Flush whatever is left in the accumulator (possibly zero),
	 * provided we have not run off the end of the block pointer.
	 */
	if (bp64 < (uint64_t *)(bp + 1))
		*bp64 = word;
}
/*
* buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
* more than BPE_PAYLOAD_SIZE bytes).
*/
void
decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
{
int psize;
uint8_t *buf8 = buf;
uint64_t w = 0;
const uint64_t *bp64 = (const uint64_t *)bp;
int i;
ASSERT(BP_IS_EMBEDDED(bp));
psize = BPE_GET_PSIZE(bp);
/*
* Decode the words of the block pointer into the byte array.
* Low bits of first word are the first byte (little endian).
*/
for (i = 0; i < psize; i++) {
if (i % sizeof (w) == 0) {
/* beginning of a word */
ASSERT3P(bp64, <, bp + 1);
w = *bp64;
bp64++;
if (!BPE_IS_PAYLOADWORD(bp, bp64))
bp64++;
}
buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
}
}

View File

@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo)
mutex_destroy(&bpo->bpo_lock); mutex_destroy(&bpo->bpo_lock);
} }
/*
 * Report whether the bpobj holds anything: either at least one block
 * pointer of its own, or (when it has sub-object support) at least one
 * sub-object.
 */
static boolean_t
bpobj_hasentries(bpobj_t *bpo)
{
	if (bpo->bpo_phys->bpo_num_blkptrs != 0)
		return (B_TRUE);

	/* bpo_num_subobjs is only consulted when sub-objects exist. */
	if (!bpo->bpo_havesubobj)
		return (B_FALSE);

	return (bpo->bpo_phys->bpo_num_subobjs != 0);
}
static int static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free) boolean_t free)
@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
out: out:
/* If there are no entries, there should be no bytes. */ /* If there are no entries, there should be no bytes. */
ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || if (!bpobj_hasentries(bpo)) {
(bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || ASSERT0(bpo->bpo_phys->bpo_bytes);
bpo->bpo_phys->bpo_bytes == 0); ASSERT0(bpo->bpo_phys->bpo_comp);
ASSERT0(bpo->bpo_phys->bpo_uncomp);
}
mutex_exit(&bpo->bpo_lock); mutex_exit(&bpo->bpo_lock);
return (err); return (err);
@ -378,7 +387,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
if (used == 0) { if (!bpobj_hasentries(&subbpo)) {
/* No point in having an empty subobj. */ /* No point in having an empty subobj. */
bpobj_close(&subbpo); bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx); bpobj_free(bpo->bpo_os, subobj, tx);
@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
if (BP_IS_EMBEDDED(bp)) {
/*
* The bpobj will compress better without the payload.
*
* Note that we store EMBEDDED bp's because they have an
* uncompressed size, which must be accounted for. An
* alternative would be to add their size to bpo_uncomp
* without storing the bp, but that would create additional
* complications: bpo_uncomp would be inconsistent with the
* set of BP's stored, and bpobj_iterate() wouldn't visit
* all the space accounted for in the bpobj.
*/
bzero(&stored_bp, sizeof (stored_bp));
stored_bp.blk_prop = bp->blk_prop;
stored_bp.blk_birth = bp->blk_birth;
} else if (!BP_GET_DEDUP(bp)) {
/* The bpobj will compress better without the checksum */
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
/* We never need the fill count. */ /* We never need the fill count. */
stored_bp.blk_fill = 0; stored_bp.blk_fill = 0;
/* The bpobj will compress better if we can leave off the checksum */
if (!BP_GET_DEDUP(bp))
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
mutex_enter(&bpo->bpo_lock); mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);

View File

@ -40,6 +40,8 @@
#include <sys/dmu_zfetch.h> #include <sys/dmu_zfetch.h>
#include <sys/sa.h> #include <sys/sa.h>
#include <sys/sa_impl.h> #include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h> #include <sys/range_tree.h>
struct dbuf_hold_impl_data { struct dbuf_hold_impl_data {
@ -1492,6 +1494,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
} }
/*
 * Arrange for a level-0, non-bonus dbuf to be written out as an
 * embedded block pointer carrying the given (already compressed)
 * payload.
 *
 * The encoded block pointer is stored in the dbuf's most recent dirty
 * record as an override (DR_OVERRIDDEN), with its birth set to that
 * record's txg.  The dirty record is expected to belong to tx's txg
 * once dmu_buf_will_not_fill() has been called (asserted below).
 */
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	dmu_object_type_t objtype;
	struct dirty_leaf *leaf;
	blkptr_t *override_bp;

	/* The object type recorded in the bp comes from the dnode. */
	DB_DNODE_ENTER(db);
	objtype = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	leaf = &db->db_last_dirty->dt.dl;
	override_bp = &leaf->dr_overridden_by;

	/*
	 * encode_embedded_bp_compressed() initializes the whole bp, so
	 * the remaining fields must be filled in after it returns.
	 */
	encode_embedded_bp_compressed(override_bp, data, comp,
	    uncompressed_size, compressed_size);
	BPE_SET_ETYPE(override_bp, etype);
	BP_SET_TYPE(override_bp, objtype);
	BP_SET_LEVEL(override_bp, 0);
	BP_SET_BYTEORDER(override_bp, byteorder);

	leaf->dr_override_state = DR_OVERRIDDEN;
	override_bp->blk_birth = db->db_last_dirty->dr_txg;
}
/* /*
* Directly assign a provided arc buf to a given dbuf if it's not referenced * Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@ -1885,7 +1919,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
} }
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
if (bp && !BP_IS_HOLE(bp)) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb; zbookmark_t zb;
@ -2575,7 +2609,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
uint64_t fill = 0; uint64_t fill = 0;
int i; int i;
ASSERT(db->db_blkptr == bp); ASSERT3P(db->db_blkptr, ==, bp);
DB_DNODE_ENTER(db); DB_DNODE_ENTER(db);
dn = DB_DNODE(db); dn = DB_DNODE(db);
@ -2587,7 +2621,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT((db->db_blkid != DMU_SPILL_BLKID && ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) || BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID && (db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype)); BP_GET_TYPE(bp) == dn->dn_bonustype) ||
BP_IS_EMBEDDED(bp));
ASSERT(BP_GET_LEVEL(bp) == db->db_level); ASSERT(BP_GET_LEVEL(bp) == db->db_level);
} }
@ -2628,12 +2663,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp)) if (BP_IS_HOLE(ibp))
continue; continue;
fill += ibp->blk_fill; fill += BP_GET_FILL(ibp);
} }
} }
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
bp->blk_fill = fill; if (!BP_IS_EMBEDDED(bp))
bp->blk_fill = fill;
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
} }
@ -2745,7 +2781,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size); db->db.db_size);
arc_set_callback(db->db_buf, dbuf_do_evict, db); if (!arc_released(db->db_buf))
arc_set_callback(db->db_buf, dbuf_do_evict, db);
} }
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx); mutex_destroy(&dr->dt.di.dr_mtx);
@ -2871,10 +2908,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { if (db->db_level == 0 &&
ASSERT(db->db_state != DB_NOFILL); dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
void *contents = (data != NULL) ? data->b_data : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg, dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, data->b_data, arc_buf_size(data), &zp, db->db_blkptr, contents, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, dbuf_write_override_done, dbuf_write_override_ready, NULL, dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);

View File

@ -124,17 +124,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
}; };
int int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags) void *tag, dmu_buf_t **dbp)
{ {
dnode_t *dn; dnode_t *dn;
uint64_t blkid; uint64_t blkid;
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
int err; int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dnode_hold(os, object, FTAG, &dn); err = dnode_hold(os, object, FTAG, &dn);
if (err) if (err)
@ -143,18 +139,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag); db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
if (db == NULL) { if (db == NULL) {
err = SET_ERROR(EIO); *dbp = NULL;
} else { return (SET_ERROR(EIO));
}
*dbp = &db->db;
return (err);
}
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
if (flags & DMU_READ_NO_PREFETCH)
db_flags |= DB_RF_NOPREFETCH;
err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
if (err == 0) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags); err = dbuf_read(db, NULL, db_flags);
if (err) { if (err != 0) {
dbuf_rele(db, tag); dbuf_rele(db, tag);
db = NULL; *dbp = NULL;
} }
} }
dnode_rele(dn, FTAG);
*dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err); return (err);
} }
@ -852,6 +867,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG); dmu_buf_rele_array(dbp, numbufs, FTAG);
} }
/*
 * Write an embedded-data block at the given object/offset.
 *
 * The dbuf is held without reading its current contents, handed to
 * dmu_buf_write_embedded() together with the payload description, and
 * then released.  Failure to obtain the hold is fatal (VERIFY0).
 */
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	bp_embedded_type_t embed_type;
	enum zio_compress compress;
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);

	/* Widen the on-the-wire byte values to their enum types. */
	embed_type = (bp_embedded_type_t)etype;
	compress = (enum zio_compress)comp;

	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	    FTAG, &db));

	dmu_buf_write_embedded(db, data, embed_type, compress,
	    uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
}
/* /*
* DMU support for xuio * DMU support for xuio
*/ */
@ -1393,7 +1427,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
* block size still needs to be known for replay. * block size still needs to be known for replay.
*/ */
BP_SET_LSIZE(bp, db->db_size); BP_SET_LSIZE(bp, db->db_size);
} else { } else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1; bp->blk_fill = 1;
} }
@ -1664,9 +1698,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
{ {
dnode_t *dn; dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */ /*
(void) dnode_hold(os, object, FTAG, &dn); * Send streams include each object's checksum function. This
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); * check ensures that the receiving system can understand the
* checksum function transmitted.
*/
ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum; dn->dn_checksum = checksum;
dnode_setdirty(dn, tx); dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
@ -1678,9 +1718,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
{ {
dnode_t *dn; dnode_t *dn;
/* XXX assumes dnode_hold will not get an i/o error */ /*
(void) dnode_hold(os, object, FTAG, &dn); * Send streams include each object's compression function. This
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); * check ensures that the receiving system can understand the
* compression function transmitted.
*/
ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress; dn->dn_compress = compress;
dnode_setdirty(dn, tx); dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
@ -1843,7 +1888,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0; doi->doi_fill_count = 0;
for (i = 0; i < dnp->dn_nblkptr; i++) for (i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
} }
void void

View File

@ -337,7 +337,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* default (fletcher2/off). Snapshots don't need to know about * default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies. * checksum/compression/copies.
*/ */
if (ds) { if (ds != NULL) {
err = dsl_prop_register(ds, err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os); primary_cache_changed_cb, os);
@ -390,7 +390,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
kmem_free(os, sizeof (objset_t)); kmem_free(os, sizeof (objset_t));
return (err); return (err);
} }
} else if (ds == NULL) { } else {
/* It's the meta-objset. */ /* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_LZJB; os->os_compress = ZIO_COMPRESS_LZJB;
@ -434,17 +434,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
&os->os_groupused_dnode); &os->os_groupused_dnode);
} }
/*
* We should be the only thread trying to do this because we
* have ds_opening_lock
*/
if (ds) {
mutex_enter(&ds->ds_lock);
ASSERT(ds->ds_objset == NULL);
ds->ds_objset = os;
mutex_exit(&ds->ds_lock);
}
*osp = os; *osp = os;
return (0); return (0);
} }
@ -455,11 +444,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
int err = 0; int err = 0;
mutex_enter(&ds->ds_opening_lock); mutex_enter(&ds->ds_opening_lock);
*osp = ds->ds_objset; if (ds->ds_objset == NULL) {
if (*osp == NULL) { objset_t *os;
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
ds, dsl_dataset_get_blkptr(ds), osp); ds, dsl_dataset_get_blkptr(ds), &os);
if (err == 0) {
mutex_enter(&ds->ds_lock);
ASSERT(ds->ds_objset == NULL);
ds->ds_objset = os;
mutex_exit(&ds->ds_lock);
}
} }
*osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock); mutex_exit(&ds->ds_opening_lock);
return (err); return (err);
} }
@ -981,6 +978,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
objset_t *os = arg; objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3P(bp, ==, os->os_rootbp); ASSERT3P(bp, ==, os->os_rootbp);
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp)); ASSERT0(BP_GET_LEVEL(bp));
@ -993,7 +991,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
*/ */
bp->blk_fill = 0; bp->blk_fill = 0;
for (i = 0; i < dnp->dn_nblkptr; i++) for (i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill; bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
} }
/* ARGSUSED */ /* ARGSUSED */

View File

@ -50,7 +50,9 @@
#include <sys/zfs_onexit.h> #include <sys/zfs_onexit.h>
#include <sys/dmu_send.h> #include <sys/dmu_send.h>
#include <sys/dsl_destroy.h> #include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h> #include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE; int zfs_send_corrupt_data = B_FALSE;
@ -197,7 +199,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
} }
static int static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{ {
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
@ -232,13 +234,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset; drrw->drr_offset = offset;
drrw->drr_length = blksz; drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_toguid = dsp->dsa_toguid;
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (BP_IS_EMBEDDED(bp)) {
if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) /*
drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; * There's no pre-computed checksum of embedded BP's, so
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); * (like fletcher4-checkummed blocks) userland will have
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); * to compute a dedup-capable checksum itself.
DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); */
drrw->drr_key.ddk_cksum = bp->blk_cksum; drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
drrw->drr_key.ddk_cksum = bp->blk_cksum;
}
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
@ -247,6 +258,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
return (0); return (0);
} }
/*
 * Emit a DRR_WRITE_EMBEDDED record for an embedded block pointer.
 *
 * Any pending aggregated record is flushed first.  The record header
 * describes the object, offset, block size and the embedded payload
 * (compression, embed type, logical and physical size); it is followed
 * in the stream by the payload itself, padded to a multiple of 8 bytes.
 *
 * Returns 0 on success or EINTR if writing to the stream failed.
 */
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	/* Flush whatever record is still waiting to be written. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	/*
	 * Only drr_psize bytes are decoded into buf, but the payload is
	 * written rounded up to 8 bytes.  Zero the buffer first so the
	 * padding is deterministic and no stale stack contents end up
	 * in the send stream.
	 */
	bzero(buf, sizeof (buf));
	decode_embedded_bp_compressed(bp, buf);

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{ {
@ -367,6 +415,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
return (0); return (0);
} }
/*
 * Decide whether a block pointer should be sent as a WRITE_EMBEDDED
 * record.  That is the case only when the bp is embedded, its
 * compression function is permitted by the stream's feature flags,
 * and its embed type has been enabled in those flags.
 */
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * A non-legacy compression function is acceptable only if the
	 * stream explicitly advertises EMBED_DATA_LZ4.
	 */
	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) == 0)
		return (B_FALSE);

	/*
	 * Each embed type needs its own feature flag; types without a
	 * case here are never sent embedded.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		return ((dsp->dsa_featureflags &
		    DMU_BACKUP_FEATURE_EMBED_DATA) ? B_TRUE : B_FALSE);
	default:
		return (B_FALSE);
	}
}
#define BP_SPAN(dnp, level) \ #define BP_SPAN(dnp, level) \
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
@ -435,11 +510,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf); (void) arc_buf_remove_ref(abuf, &abuf);
} else if (backup_do_embed(dsp, bp)) {
/* it's an embedded level-0 block of a regular object */
int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
err = dump_write_embedded(dsp, zb->zb_object,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */ } else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT; uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf; arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp); int blksz = BP_GET_LSIZE(bp);
ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level); ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@ -458,7 +539,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
} }
} }
err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data); blksz, bp, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf); (void) arc_buf_remove_ref(abuf, &abuf);
} }
@ -472,14 +553,15 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/ */
static int static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd, zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
vnode_t *vp, offset_t *off) int outfd, vnode_t *vp, offset_t *off)
{ {
objset_t *os; objset_t *os;
dmu_replay_record_t *drr; dmu_replay_record_t *drr;
dmu_sendarg_t *dsp; dmu_sendarg_t *dsp;
int err; int err;
uint64_t fromtxg = 0; uint64_t fromtxg = 0;
uint64_t featureflags = 0;
err = dmu_objset_from_ds(ds, &os); err = dmu_objset_from_ds(ds, &os);
if (err != 0) { if (err != 0) {
@ -502,13 +584,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
} }
if (version >= ZPL_VERSION_SA) { if (version >= ZPL_VERSION_SA) {
DMU_SET_FEATUREFLAGS( featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
drr->drr_u.drr_begin.drr_versioninfo,
DMU_BACKUP_FEATURE_SA_SPILL);
} }
} }
#endif #endif
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
} else {
embedok = B_FALSE;
}
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
drr->drr_u.drr_begin.drr_creation_time = drr->drr_u.drr_begin.drr_creation_time =
ds->ds_phys->ds_creation_time; ds->ds_phys->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
@ -540,6 +632,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (fromzb != NULL); dsp->dsa_incremental = (fromzb != NULL);
dsp->dsa_featureflags = featureflags;
mutex_enter(&ds->ds_sendstream_lock); mutex_enter(&ds->ds_sendstream_lock);
list_insert_head(&ds->ds_sendstreams, dsp); list_insert_head(&ds->ds_sendstreams, dsp);
@ -591,7 +684,7 @@ out:
int int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
int outfd, vnode_t *vp, offset_t *off) boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
{ {
dsl_pool_t *dp; dsl_pool_t *dp;
dsl_dataset_t *ds; dsl_dataset_t *ds;
@ -625,10 +718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid; zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir); is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG); dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off); outfd, vp, off);
} else { } else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off); outfd, vp, off);
} }
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
@ -636,7 +729,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
} }
int int
dmu_send(const char *tosnap, const char *fromsnap, dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
int outfd, vnode_t *vp, offset_t *off) int outfd, vnode_t *vp, offset_t *off)
{ {
dsl_pool_t *dp; dsl_pool_t *dp;
@ -703,10 +796,10 @@ dmu_send(const char *tosnap, const char *fromsnap,
dsl_pool_rele(dp, FTAG); dsl_pool_rele(dp, FTAG);
return (err); return (err);
} }
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off); outfd, vp, off);
} else { } else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off); outfd, vp, off);
} }
if (owned) if (owned)
@ -861,6 +954,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
uint64_t fromguid = drrb->drr_fromguid; uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags; int flags = drrb->drr_flags;
int error; int error;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds; dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs; const char *tofs = drba->drba_cookie->drc_tofs;
@ -874,11 +968,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */ /* Verify pool version supports SA if SA_SPILL feature set */
if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA)
spa_version(dp->dp_spa) < SPA_VERSION_SA) { return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plain WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
}
error = dsl_dataset_hold(dp, tofs, FTAG, &ds); error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) { if (error == 0) {
@ -1153,7 +1258,6 @@ backup_byteswap(dmu_replay_record_t *drr)
break; break;
case DRR_OBJECT: case DRR_OBJECT:
DO64(drr_object.drr_object); DO64(drr_object.drr_object);
/* DO64(drr_object.drr_allocation_txg); */
DO32(drr_object.drr_type); DO32(drr_object.drr_type);
DO32(drr_object.drr_bonustype); DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz); DO32(drr_object.drr_blksz);
@ -1191,6 +1295,14 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
DO64(drr_write_byref.drr_key.ddk_prop); DO64(drr_write_byref.drr_key.ddk_prop);
break; break;
case DRR_WRITE_EMBEDDED:
DO64(drr_write_embedded.drr_object);
DO64(drr_write_embedded.drr_offset);
DO64(drr_write_embedded.drr_length);
DO64(drr_write_embedded.drr_toguid);
DO32(drr_write_embedded.drr_lsize);
DO32(drr_write_embedded.drr_psize);
break;
case DRR_FREE: case DRR_FREE:
DO64(drr_free.drr_object); DO64(drr_free.drr_object);
DO64(drr_free.drr_offset); DO64(drr_free.drr_offset);
@ -1380,7 +1492,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
int err; int err;
guid_map_entry_t gmesrch; guid_map_entry_t gmesrch;
guid_map_entry_t *gmep; guid_map_entry_t *gmep;
avl_index_t where; avl_index_t where;
objset_t *ref_os = NULL; objset_t *ref_os = NULL;
dmu_buf_t *dbp; dmu_buf_t *dbp;
@ -1405,7 +1517,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
if (err) if (err != 0)
return (err); return (err);
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
@ -1424,6 +1536,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
return (0); return (0);
} }
/*
 * Apply one DRR_WRITE_EMBEDDED record from the receive stream to "os".
 *
 * The record header is validated first, then the payload is pulled from
 * the stream with restore_read() and handed to dmu_write_embedded()
 * inside its own transaction.
 *
 * Returns 0 on success, EINVAL if any header field is out of range,
 * ra->err if the payload cannot be read, or the error from
 * dmu_tx_assign() if the transaction cannot be assigned.
 */
static int
restore_write_embedded(struct restorearg *ra, objset_t *os,
    struct drr_write_embedded *drrwnp)
{
	dmu_tx_t *tx;
	int err;
	void *data;

	/* Reject a write range whose end wraps past the 64-bit limit. */
	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
		return (SET_ERROR(EINVAL));

	/* The physical payload may not exceed BPE_PAYLOAD_SIZE. */
	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
		return (SET_ERROR(EINVAL));

	/* The embedded type and compression function must be known values. */
	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
		return (SET_ERROR(EINVAL));
	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
		return (SET_ERROR(EINVAL));

	/*
	 * The read length is drr_psize rounded up to a multiple of 8
	 * (presumably the stream pads the payload that way -- confirm
	 * against the send side).
	 */
	data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
	if (data == NULL)
		return (ra->err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwnp->drr_object,
	    drrwnp->drr_offset, drrwnp->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		/* The tx was never assigned, so abort it instead of committing. */
		dmu_tx_abort(tx);
		return (err);
	}

	/*
	 * NOTE(review): the byte-order argument is ra->byteswap XORed with
	 * ZFS_HOST_BYTEORDER, presumably yielding the sender's byte order;
	 * confirm against the dmu_write_embedded() prototype.
	 */
	dmu_write_embedded(os, drrwnp->drr_object,
	    drrwnp->drr_offset, data, drrwnp->drr_etype,
	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);

	dmu_tx_commit(tx);
	return (0);
}
static int static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{ {
@ -1618,6 +1772,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.err = restore_write_byref(&ra, os, &drrwbr); ra.err = restore_write_byref(&ra, os, &drrwbr);
break; break;
} }
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded drrwe =
drr->drr_u.drr_write_embedded;
ra.err = restore_write_embedded(&ra, os, &drrwe);
break;
}
case DRR_FREE: case DRR_FREE:
{ {
struct drr_free drrf = drr->drr_u.drr_free; struct drr_free drrf = drr->drr_u.drr_free;

View File

@ -463,7 +463,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (pfd->pd_cancel) if (pfd->pd_cancel)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
if (BP_IS_HOLE(bp) || if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
!((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)

View File

@ -1814,8 +1814,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*offset = *offset >> span; *offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs); for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) { i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill && if (BP_GET_FILL(&bp[i]) >= minfill &&
bp[i].blk_fill <= maxfill && BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg)) (hole || bp[i].blk_birth > txg))
break; break;
if (inc > 0 || *offset > 0) if (inc > 0 || *offset > 0)

View File

@ -237,8 +237,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
} }
#endif #endif
#define ALL -1
static void static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx) dmu_tx_t *tx)
@ -601,11 +599,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen; dnp->dn_bonuslen = dn->dn_bonuslen;
} }
ASSERT(dnp->dn_nlevels > 1 || ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) == BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT(dnp->dn_nlevels < 2 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
if (dn->dn_next_type[txgoff] != 0) { if (dn->dn_next_type[txgoff] != 0) {
dnp->dn_type = dn->dn_type; dnp->dn_type = dn->dn_type;

View File

@ -1525,7 +1525,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
else else
*availbytesp = 0; *availbytesp = 0;
} }
*usedobjsp = ds->ds_phys->ds_bp.blk_fill; *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
*availobjsp = DN_MAX_OBJECT - *usedobjsp; *availobjsp = DN_MAX_OBJECT - *usedobjsp;
} }

View File

@ -546,7 +546,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg; struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx; dmu_tx_t *tx = ka->tx;
if (BP_IS_HOLE(bp)) if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) { if (zb->zb_level == ZB_ZIL_LEVEL) {
@ -596,6 +596,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
uint64_t count; uint64_t count;
objset_t *mos; objset_t *mos;
ASSERT(!dsl_dataset_is_snapshot(ds));
if (dsl_dataset_is_snapshot(ds)) if (dsl_dataset_is_snapshot(ds))
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
@ -708,7 +709,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ds->ds_prev->ds_phys->ds_num_children == 2 && ds->ds_prev->ds_phys->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0); ds->ds_prev->ds_userrefs == 0);
/* Remove our reservation */ /* Remove our reservation. */
if (ds->ds_reserved != 0) { if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds, dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),

View File

@ -1515,6 +1515,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
} }
if (err == ERESTART) if (err == ERESTART)
return; return;
/* finished; verify that space accounting went to zero */
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
} }
if (scn->scn_phys.scn_state != DSS_SCANNING) if (scn->scn_phys.scn_state != DSS_SCANNING)
@ -1700,6 +1704,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
count_block(dp->dp_blkstats, bp); count_block(dp->dp_blkstats, bp);
if (BP_IS_EMBEDDED(bp))
return (0);
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB; zio_flags |= ZIO_FLAG_SCRUB;

View File

@ -610,8 +610,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
KM_PUSHPAGE)); KM_PUSHPAGE));
error = dsl_sync_task(pool, dsl_dataset_user_release_check, error = dsl_sync_task(pool, dsl_dataset_user_release_check,
dsl_dataset_user_release_sync, &ddura, dsl_dataset_user_release_sync, &ddura, 0);
fnvlist_num_pairs(holds));
fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds); fnvlist_free(ddura.ddura_chkholds);

View File

@ -2236,6 +2236,7 @@ metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
vdev_t *vd; vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0); ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@ -2259,6 +2260,7 @@ metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
vdev_t *vd; vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0); ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

View File

@ -1872,7 +1872,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{ {
if (!BP_IS_HOLE(bp)) { if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
zio_t *rio = arg; zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp); size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size); void *data = zio_data_buf_alloc(size);
@ -2423,9 +2423,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
&spa->spa_feat_enabled_txg_obj) != 0) { &spa->spa_feat_enabled_txg_obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
}
} }
spa->spa_is_initializing = B_TRUE; spa->spa_is_initializing = B_TRUE;
@ -5333,11 +5332,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
ASSERT(!locked); ASSERT(!locked);
ASSERT(vd == vd->vdev_top); ASSERT(vd == vd->vdev_top);
/*
* XXX - Once we have bp-rewrite this should
* become the common case.
*/
mg = vd->vdev_mg; mg = vd->vdev_mg;
/* /*
@ -6487,7 +6481,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
* possible. * possible.
*/ */
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
ASSERT(version >= spa->spa_uberblock.ub_version); ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version; spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev); vdev_config_dirty(spa->spa_root_vdev);

View File

@ -1293,7 +1293,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type)); sizeof (type));
} }
checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; if (!BP_IS_EMBEDDED(bp)) {
checksum =
zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
}
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
} }
@ -1588,7 +1591,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
uint64_t dsize = 0; uint64_t dsize = 0;
int d; int d;
for (d = 0; d < SPA_DVAS_PER_BP; d++) for (d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize); return (dsize);
@ -1602,7 +1605,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < SPA_DVAS_PER_BP; d++) for (d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG); spa_config_exit(spa, SCL_VDEV, FTAG);

View File

@ -213,4 +213,9 @@ zpool_feature_init(void)
"\"zfs bookmark\" command", "\"zfs bookmark\" command",
B_TRUE, B_FALSE, B_FALSE, bookmarks_deps); B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
} }
zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
B_FALSE, B_TRUE, B_TRUE, NULL);
} }

View File

@ -4238,6 +4238,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored. * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type. * output size in zc_objset_type.
* zc_flags if =1, WRITE_EMBEDDED records are permitted
* *
* outputs: * outputs:
* zc_objset_type estimated size, if zc_guid is set * zc_objset_type estimated size, if zc_guid is set
@ -4248,6 +4249,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
int error; int error;
offset_t off; offset_t off;
boolean_t estimate = (zc->zc_guid != 0); boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
if (zc->zc_obj != 0) { if (zc->zc_obj != 0) {
dsl_pool_t *dp; dsl_pool_t *dp;
@ -4308,7 +4310,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset; off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off; fp->f_offset = off;
@ -5174,6 +5176,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: { * innvl: {
* "fd" -> file descriptor to write stream to (int32) * "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "fromsnap" -> full snap name to send an incremental from
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* } * }
* *
* outnvl is unused * outnvl is unused
@ -5187,6 +5191,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
char *fromname = NULL; char *fromname = NULL;
int fd; int fd;
file_t *fp; file_t *fp;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd); error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0) if (error != 0)
@ -5194,11 +5199,13 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
embedok = nvlist_exists(innvl, "embedok");
if ((fp = getf(fd)) == NULL) if ((fp = getf(fd)) == NULL)
return (SET_ERROR(EBADF)); return (SET_ERROR(EBADF));
off = fp->f_offset; off = fp->f_offset;
error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off; fp->f_offset = off;

View File

@ -159,10 +159,15 @@ int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{ {
avl_tree_t *t = &zilog->zl_bp_tree; avl_tree_t *t = &zilog->zl_bp_tree;
const dva_t *dva = BP_IDENTITY(bp); const dva_t *dva;
zil_bp_node_t *zn; zil_bp_node_t *zn;
avl_index_t where; avl_index_t where;
if (BP_IS_EMBEDDED(bp))
return (0);
dva = BP_IDENTITY(bp);
if (avl_find(t, dva, &where) != NULL) if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST)); return (SET_ERROR(EEXIST));
@ -863,7 +868,7 @@ zil_lwb_write_done(zio_t *zio)
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp));
ASSERT(zio->io_bp->blk_fill == 0); ASSERT(BP_GET_FILL(zio->io_bp) == 0);
/* /*
* Ensure the lwb buffer pointer is cleared before releasing * Ensure the lwb buffer pointer is cleared before releasing

View File

@ -36,6 +36,7 @@
#include <sys/dmu_objset.h> #include <sys/dmu_objset.h>
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/ddt.h> #include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
/* /*
@ -243,7 +244,7 @@ zio_buf_alloc(size_t size)
{ {
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG)); return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
} }
@ -711,6 +712,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_physdone = physdone; zio->io_physdone = physdone;
zio->io_prop = *zp; zio->io_prop = *zp;
/*
* Data can be NULL if we are going to call zio_write_override() to
* provide the already-allocated BP. But we may need the data to
* verify a dedup hit (if requested). In this case, don't try to
* dedup (just take the already-allocated BP verbatim).
*/
if (data == NULL && zio->io_prop.zp_dedup_verify) {
zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
}
return (zio); return (zio);
} }
@ -750,6 +761,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
void void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{ {
/*
* The check for EMBEDDED is a performance optimization. We
* process the free here (by ignoring it) rather than
* putting it on the list and then processing it in zio_free_sync().
*/
if (BP_IS_EMBEDDED(bp))
return;
metaslab_check_free(spa, bp); metaslab_check_free(spa, bp);
/* /*
@ -774,13 +793,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *zio; zio_t *zio;
enum zio_stage stage = ZIO_FREE_PIPELINE; enum zio_stage stage = ZIO_FREE_PIPELINE;
dprintf_bp(bp, "freeing in txg %llu, pass %u",
(longlong_t)txg, spa->spa_sync_pass);
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
metaslab_check_free(spa, bp); metaslab_check_free(spa, bp);
arc_freed(spa, bp); arc_freed(spa, bp);
@ -805,6 +824,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{ {
zio_t *zio; zio_t *zio;
dprintf_bp(bp, "claiming in txg %llu", txg);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
/* /*
* A claim is an allocation of a specific block. Claims are needed * A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that * to support immediate writes in the intent log. The issue is that
@ -1011,12 +1035,20 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL && zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) { !(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize = BP_GET_PSIZE(bp); uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize); void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress); zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
} }
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
decode_embedded_bp_compressed(bp, zio->io_data);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
}
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE; zio->io_flags |= ZIO_FLAG_DONT_CACHE;
@ -1060,6 +1092,9 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override; *bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
return (ZIO_PIPELINE_CONTINUE);
/* /*
* If we've been overridden and nopwrite is set then * If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite * set the flag accordingly to indicate that a nopwrite
@ -1108,7 +1143,7 @@ zio_write_bp_init(zio_t *zio)
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */ /* Make sure someone doesn't change their mind on overwrites */
ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp)); spa_max_replication(spa)) == BP_GET_NDVAS(bp));
} }
@ -1118,9 +1153,38 @@ zio_write_bp_init(zio_t *zio)
if (psize == 0 || psize == lsize) { if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize); zio_buf_free(cbuf, lsize);
} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
encode_embedded_bp_compressed(bp,
cbuf, compress, lsize, psize);
BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize);
bp->blk_birth = zio->io_txg;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
return (ZIO_PIPELINE_CONTINUE);
} else { } else {
ASSERT(psize < lsize); /*
zio_push_transform(zio, cbuf, psize, lsize, NULL); * Round up compressed size to MINBLOCKSIZE and
* zero the tail.
*/
size_t rounded =
P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
if (rounded > psize) {
bzero((char *)cbuf + psize, rounded - psize);
psize = rounded;
}
if (psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
} else {
zio_push_transform(zio, cbuf,
psize, lsize, NULL);
}
} }
} }
@ -2873,7 +2937,7 @@ zio_checksum_verified(zio_t *zio)
/* /*
* ========================================================================== * ==========================================================================
* Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
* An error of 0 indictes success. ENXIO indicates whole-device failure, * An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent. * indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it. * Any other error is presumed to be worse because we weren't expecting it.
@ -2979,7 +3043,7 @@ zio_done(zio_t *zio)
for (w = 0; w < ZIO_WAIT_TYPES; w++) for (w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0); ASSERT(zio->io_children[c][w] == 0);
if (zio->io_bp != NULL) { if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[0] == 0);
ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0);
ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
@ -3216,7 +3280,8 @@ zio_done(zio_t *zio)
} }
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
!BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
} }

View File

@ -126,7 +126,7 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
static void static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{ {
dva_t *dva = BP_IDENTITY(bp); const dva_t *dva = BP_IDENTITY(bp);
uint64_t txg = BP_PHYSICAL_BIRTH(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp)); ASSERT(BP_IS_GANG(bp));

View File

@ -80,7 +80,7 @@ size_t
zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{ {
uint64_t *word, *word_end; uint64_t *word, *word_end;
size_t c_len, d_len, r_len; size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c]; zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
@ -102,28 +102,13 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
return (s_len); return (s_len);
/* Compress at least 12.5% */ /* Compress at least 12.5% */
d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); d_len = s_len - (s_len >> 3);
if (d_len == 0)
return (s_len);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
if (c_len > d_len) if (c_len > d_len)
return (s_len); return (s_len);
/*
* Cool. We compressed at least as much as we were hoping to.
* For both security and repeatability, pad out the last sector.
*/
r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
if (r_len > c_len) {
bzero((char *)dst + c_len, r_len - c_len);
c_len = r_len;
}
ASSERT3U(c_len, <=, d_len); ASSERT3U(c_len, <=, d_len);
ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
return (c_len); return (c_len);
} }