/* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2020 by Delphix. All rights reserved. */ #include <assert.h> #include <cityhash.h> #include <ctype.h> #include <errno.h> #include <fcntl.h> #include <libzfs.h> #include <libzutil.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <umem.h> #include <unistd.h> #include <sys/debug.h> #include <sys/stat.h> #include <sys/zfs_ioctl.h> #include <sys/zio_checksum.h> #include "zfs_fletcher.h" #include "zstream.h" #define MAX_RDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_RDT_MB 128 typedef struct redup_entry { struct redup_entry *rde_next; uint64_t rde_guid; uint64_t rde_object; uint64_t rde_offset; uint64_t rde_stream_offset; } redup_entry_t; typedef struct redup_table { redup_entry_t **redup_hash_array; umem_cache_t *ddecache; uint64_t ddt_count; int numhashbits; } redup_table_t; int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } void * safe_calloc(size_t n) { void *rv = calloc(1, n); if (rv == NULL) { fprintf(stderr, "Error: could not allocate %u bytes of memory\n", (int)n); exit(1); } return (rv); } /* * Safe version of fread(), exits on error. */ int sfread(void *buf, size_t size, FILE *fp) { int rv = fread(buf, size, 1, fp); if (rv == 0 && ferror(fp)) { (void) fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } return (rv); } /* * Safe version of pread(), exits on error. */ static void spread(int fd, void *buf, size_t count, off_t offset) { ssize_t err = pread(fd, buf, count, offset); if (err == -1) { (void) fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } else if (err != count) { (void) fprintf(stderr, "Error while reading file: short read\n"); exit(1); } } static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } static void rdt_insert(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) { uint64_t ch = cityhash4(guid, object, offset, 0); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); redup_entry_t **rdepp; rdepp = &(rdt->redup_hash_array[hashcode]); redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); rde->rde_next = *rdepp; rde->rde_guid = guid; rde->rde_object = object; rde->rde_offset = offset; rde->rde_stream_offset = stream_offset; *rdepp = rde; rdt->ddt_count++; } static void rdt_lookup(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t *stream_offsetp) { uint64_t ch = cityhash4(guid, object, offset, 0); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; rde != NULL; rde = rde->rde_next) { if (rde->rde_guid == guid && rde->rde_object == object && rde->rde_offset == offset) { *stream_offsetp = rde->rde_stream_offset; return; } } assert(!"could not find expected redup table entry"); } /* * Convert a dedup stream (generated by "zfs send -D") to a * non-deduplicated stream. The entire infd will be converted, including * any substreams in a stream package (generated by "zfs send -RD"). The * infd must be seekable. */ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; dmu_replay_record_t thedrr = { 0 }; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; uint64_t numbuckets; uint64_t num_records = 0; uint64_t num_write_byref_records = 0; #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t max_rde_size = MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_RDT_MB << 20); #endif numbuckets = max_rde_size / (sizeof (redup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. */ if (!ISP2(numbuckets)) numbuckets = 1ULL << highbit64(numbuckets); rdt.redup_hash_array = safe_calloc(numbuckets * sizeof (redup_entry_t *)); rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); rdt.numhashbits = highbit64(numbuckets) - 1; rdt.ddt_count = 0; char *buf = safe_calloc(bufsz); FILE *ofp = fdopen(infd, "r"); long offset = ftell(ofp); int begin = 0; boolean_t seen = B_FALSE; while (sfread(drr, sizeof (*drr), ofp) != 0) { num_records++; /* * We need to regenerate the checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } uint64_t payload_size = 0; switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); VERIFY0(begin++); seen = B_TRUE; assert(drrb->drr_magic == DMU_BACKUP_MAGIC); /* clear the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); /* cppcheck-suppress syntaxError */ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); uint32_t sz = drr->drr_payloadlen; VERIFY3U(sz, <=, 1U << 28); if (sz != 0) { if (sz > bufsz) { free(buf); buf = safe_calloc(sz); bufsz = sz; } (void) sfread(buf, sz, ofp); } payload_size = sz; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* * We would prefer to just check --begin == 0, but * replication streams have an end of stream END * record, so we must avoid tripping it. */ VERIFY3B(seen, ==, B_TRUE); begin--; /* * Use the recalculated checksum, unless this is * the END record of a stream package, which has * no checksum. */ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) drre->drr_checksum = stream_cksum; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; VERIFY3S(begin, ==, 1); if (drro->drr_bonuslen > 0) { payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); (void) sfread(buf, payload_size, ofp); } break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; VERIFY3S(begin, ==, 1); payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); (void) sfread(buf, payload_size, ofp); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwb = drr->drr_u.drr_write_byref; VERIFY3S(begin, ==, 1); num_write_byref_records++; /* * Look up in hash table by drrwb->drr_refguid, * drr_refobject, drr_refoffset. Replace this * record with the found WRITE record, but with * drr_object,drr_offset,drr_toguid replaced with ours. */ uint64_t stream_offset = 0; rdt_lookup(&rdt, drrwb.drr_refguid, drrwb.drr_refobject, drrwb.drr_refoffset, &stream_offset); spread(infd, drr, sizeof (*drr), stream_offset); assert(drr->drr_type == DRR_WRITE); struct drr_write *drrw = &drr->drr_u.drr_write; assert(drrw->drr_toguid == drrwb.drr_refguid); assert(drrw->drr_object == drrwb.drr_refobject); assert(drrw->drr_offset == drrwb.drr_refoffset); payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); spread(infd, buf, payload_size, stream_offset + sizeof (*drr)); drrw->drr_toguid = drrwb.drr_toguid; drrw->drr_object = drrwb.drr_object; drrw->drr_offset = drrwb.drr_offset; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; VERIFY3S(begin, ==, 1); payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) sfread(buf, payload_size, ofp); rdt_insert(&rdt, drrw->drr_toguid, drrw->drr_object, drrw->drr_offset, offset); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; VERIFY3S(begin, ==, 1); payload_size = P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); (void) sfread(buf, payload_size, ofp); break; } case DRR_FREEOBJECTS: case DRR_FREE: case DRR_OBJECT_RANGE: VERIFY3S(begin, ==, 1); break; default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } if (feof(ofp)) { fprintf(stderr, "Error: unexpected end-of-file\n"); exit(1); } if (ferror(ofp)) { fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } /* * We need to recalculate the checksum, and it needs to be * initially zero to do that. BEGIN records don't have * a checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) break; if (drr->drr_type == DRR_END) { /* * Typically the END record is either the last * thing in the stream, or it is followed * by a BEGIN record (which also zeros the checksum). * However, a stream package ends with two END * records. The last END record's checksum starts * from zero. */ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); } offset = ftell(ofp); } if (verbose) { char mem_str[16]; zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), mem_str, sizeof (mem_str)); fprintf(stderr, "converted stream with %llu total records, " "including %llu dedup records, using %sB memory.\n", (long long)num_records, (long long)num_write_byref_records, mem_str); } umem_cache_destroy(rdt.ddecache); free(rdt.redup_hash_array); free(buf); (void) fclose(ofp); } int zstream_do_redup(int argc, char *argv[]) { boolean_t verbose = B_FALSE; int c; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = B_TRUE; break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); zstream_usage(); break; } } argc -= optind; argv += optind; if (argc != 1) zstream_usage(); const char *filename = argv[0]; if (isatty(STDOUT_FILENO)) { (void) fprintf(stderr, "Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n"); return (1); } int fd = open(filename, O_RDONLY); if (fd == -1) { (void) fprintf(stderr, "Error while opening file '%s': %s\n", filename, strerror(errno)); exit(1); } fletcher_4_init(); zfs_redup_stream(fd, STDOUT_FILENO, verbose); fletcher_4_fini(); close(fd); return (0); }