Add zstream redup command to convert deduplicated send streams

Deduplicated send and receive is deprecated.  To ease migration to the
new dedup-send-less world, this commit adds a `zstream redup` utility to
convert deduplicated send streams to normal streams, so that they can
continue to be received indefinitely.

The new `zstream` command also replaces the functionality of
`zstreamdump`, by way of the `zstream dump` subcommand.  The
`zstreamdump` command is replaced by a shell script which invokes
`zstream dump`.

The way that `zstream redup` works under the hood is that as we read the
send stream, we build up a hash table which maps from `<GUID, object,
offset> -> <file_offset>`.

Whenever we see a WRITE record, we add a new entry to the hash table,
which indicates where in the stream file to find the WRITE record for
this block. (The key is `drr_toguid, drr_object, drr_offset`.)

For records other than WRITE_BYREF, we pass them through unchanged
(except for the running checksum, which is recalculated).

For WRITE_BYREF records, we change them to WRITE records.  We find the
referenced WRITE record by looking in the hash table (for the record
with key `drr_refguid, drr_refobject, drr_refoffset`), and then reading
the record header and payload from the specified offset in the stream
file.  This is why the stream can not be a pipe.  The found WRITE record
replaces the WRITE_BYREF record, with its `drr_toguid`, `drr_object`,
and `drr_offset` fields changed to be the same as the WRITE_BYREF's
(i.e. we are writing the same logical block, but with the data supplied
by the previous WRITE record).

This algorithm requires memory proportional to the number of WRITE
records (same as `zfs send -D`), but the size per WRITE record is
relatively low (40 bytes, vs. 72 for `zfs send -D`).  A 1TB send stream
with 8KB blocks (`recordsize=8k`) would use around 5GB of RAM to
"redup".

Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10124 
Closes #10156
This commit is contained in:
Matthew Ahrens
2020-04-10 10:39:55 -07:00
committed by GitHub
parent 77f6826b83
commit c618f87cd2
16 changed files with 728 additions and 36 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest
SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest
SUBDIRS += fsck_zfs vdev_id raidz_test zgenhostid
if USING_PYTHON
+1
View File
@@ -0,0 +1 @@
zstream
+13
View File
@@ -0,0 +1,13 @@
include $(top_srcdir)/config/Rules.am
sbin_PROGRAMS = zstream
zstream_SOURCES = \
zstream.c \
zstream.h \
zstream_dump.c \
zstream_redup.c
zstream_LDADD = \
$(top_builddir)/lib/libnvpair/libnvpair.la \
$(top_builddir)/lib/libzfs/libzfs.la
+61
View File
@@ -0,0 +1,61 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <libintl.h>
#include <stddef.h>
#include <libzfs.h>
#include "zstream.h"
/*
 * Print the list of zstream subcommands to stderr and terminate the
 * process with exit status 1.  This function does not return.
 */
void
zstream_usage(void)
{
	static const char usage_text[] =
	    "usage: zstream command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tzstream dump [-vCd] FILE\n"
	    "\t... | zstream dump [-vCd]\n"
	    "\n"
	    "\tzstream redup [-v] FILE | ...\n";

	/* The text contains no conversion specifiers, so emit it verbatim. */
	(void) fputs(usage_text, stderr);
	exit(1);
}
/*
 * Entry point: dispatch to the subcommand named by argv[1].  The
 * subcommand handler receives the argument vector shifted by one, so its
 * own argv[0] is the subcommand name.  A missing or unrecognized
 * subcommand prints the usage message, which exits.
 */
int
main(int argc, char *argv[])
{
	if (argc < 2)
		zstream_usage();

	int sub_argc = argc - 1;
	char **sub_argv = argv + 1;

	if (strcmp(sub_argv[0], "dump") == 0)
		return (zstream_do_dump(sub_argc, sub_argv));
	if (strcmp(sub_argv[0], "redup") == 0)
		return (zstream_do_redup(sub_argc, sub_argv));

	zstream_usage();
}
+35
View File
@@ -0,0 +1,35 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#ifndef _ZSTREAM_H
#define _ZSTREAM_H
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Subcommand entry points.  Each takes an (argc, argv) pair; main() calls
 * them with the vector shifted by one so that argv[0] is the subcommand
 * name, and uses their return value as the process exit status.
 */
extern int zstream_do_redup(int, char *[]);
extern int zstream_do_dump(int, char *[]);
/* Print the zstream usage message to stderr and exit with status 1. */
extern void zstream_usage(void);
#ifdef __cplusplus
}
#endif
#endif /* _ZSTREAM_H */
@@ -42,6 +42,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zio.h>
#include <zfs_fletcher.h>
#include "zstream.h"
/*
* If dump mode is enabled, the number of bytes to print per line
@@ -58,17 +59,6 @@ FILE *send_stream = 0;
boolean_t do_byteswap = B_FALSE;
boolean_t do_cksum = B_TRUE;
static void
usage(void)
{
(void) fprintf(stderr, "usage: zstreamdump [-v] [-C] [-d] < file\n");
(void) fprintf(stderr, "\t -v -- verbose\n");
(void) fprintf(stderr, "\t -C -- suppress checksum verification\n");
(void) fprintf(stderr, "\t -d -- dump contents of blocks modified, "
"implies verbose\n");
exit(1);
}
static void *
safe_malloc(size_t size)
{
@@ -215,7 +205,7 @@ sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len)
}
int
main(int argc, char *argv[])
zstream_do_dump(int argc, char *argv[])
{
char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
@@ -273,26 +263,39 @@ main(int argc, char *argv[])
case ':':
(void) fprintf(stderr,
"missing argument for '%c' option\n", optopt);
usage();
zstream_usage();
break;
case '?':
(void) fprintf(stderr, "invalid option '%c'\n",
optopt);
usage();
zstream_usage();
break;
}
}
if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr,
"Error: Backup stream can not be read "
"from a terminal.\n"
"You must redirect standard input.\n");
exit(1);
if (argc > optind) {
const char *filename = argv[optind];
send_stream = fopen(filename, "r");
if (send_stream == NULL) {
(void) fprintf(stderr,
"Error while opening file '%s': %s\n",
filename, strerror(errno));
exit(1);
}
} else {
if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr,
"Error: The send stream is a binary format "
"and can not be read from a\n"
"terminal. Standard input must be redirected, "
"or a file must be\n"
"specified as a command-line argument.\n");
exit(1);
}
send_stream = stdin;
}
fletcher_4_init();
send_stream = stdin;
while (read_hdr(drr, &zc)) {
/*
+468
View File
@@ -0,0 +1,468 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020 by Delphix. All rights reserved.
*/
#include <assert.h>
#include <cityhash.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <libzfs_impl.h>
#include <libzfs.h>
#include <libzutil.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <umem.h>
#include <unistd.h>
#include <sys/debug.h>
#include <sys/stat.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include "zfs_fletcher.h"
#include "zstream.h"
/*
 * Sizing inputs for the redup hash table, used by zfs_redup_stream().
 * The bucket count is derived from a byte budget: MAX_RDT_PHYSMEM_PERCENT
 * percent of physical memory, but never less than
 * SMALLEST_POSSIBLE_MAX_RDT_MB megabytes.  On _ILP32 builds the minimum is
 * used directly.
 */
#define MAX_RDT_PHYSMEM_PERCENT 20
#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
/*
 * One hash table entry, created for every WRITE record seen in the input
 * stream.  The key is <rde_guid, rde_object, rde_offset>; the value is the
 * position of that WRITE record in the stream file.
 */
typedef struct redup_entry {
/* next entry in the same hash bucket (singly linked chain) */
struct redup_entry *rde_next;
/* key: drr_toguid, drr_object and drr_offset of the WRITE record */
uint64_t rde_guid;
uint64_t rde_object;
uint64_t rde_offset;
/* file offset of the WRITE record's header within the input stream */
uint64_t rde_stream_offset;
} redup_entry_t;
/*
 * Chained hash table mapping <guid, object, offset> to the stream file
 * offset of the corresponding WRITE record.
 */
typedef struct redup_table {
/* array of bucket heads; indexed by the low numhashbits bits of the hash */
redup_entry_t **redup_hash_array;
/* umem cache from which redup_entry_t's are allocated */
umem_cache_t *ddecache;
/* number of entries inserted so far (incremented by rdt_insert()) */
uint64_t ddt_count;
/* number of hash bits used to select a bucket */
int numhashbits;
} redup_table_t;
/*
 * Return the 1-based position of the most significant set bit of i
 * (1 for the least significant bit, 64 for the most significant), or 0
 * when i is zero.
 */
int
highbit64(uint64_t i)
{
	const int total_bits = NBBY * sizeof (uint64_t);

	/* __builtin_clzll() is undefined for 0, so handle that case here. */
	return (i != 0 ? total_bits - __builtin_clzll(i) : 0);
}
/*
 * Allocate n bytes of zero-filled memory.  Never returns NULL: if the
 * allocation fails, an error is printed to stderr and the process exits
 * with status 1.  The caller releases the memory with free().
 */
static void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);

	if (rv == NULL) {
		/* %zu matches size_t; the size is reported untruncated. */
		(void) fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
/*
 * Checked fread(): read one item of `size` bytes from fp into buf.
 * Returns the fread() item count (1 on success, 0 if the item could not
 * be read without a stream error, e.g. at end-of-file).  A stream error
 * is fatal: a message is printed and the process exits with status 1.
 */
static int
sfread(void *buf, size_t size, FILE *fp)
{
	int nitems = fread(buf, size, 1, fp);

	if (nitems != 0 || !ferror(fp))
		return (nitems);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
/*
 * Checked pread(): read exactly `count` bytes from fd at `offset` into
 * buf.  Both a pread() failure and a short read are fatal: a message is
 * printed to stderr and the process exits with status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	ssize_t nread = pread(fd, buf, count, offset);

	/* Test for failure first, before comparing against count. */
	if (nread == -1) {
		(void) fprintf(stderr,
		    "Error while reading file: %s\n",
		    strerror(errno));
		exit(1);
	}

	if (nread == count)
		return;

	(void) fprintf(stderr,
	    "Error while reading file: short read\n");
	exit(1);
}
/*
 * Write one record (header plus optional payload) to outfd, folding the
 * bytes written into the running stream checksum *zc.
 *
 * For every record type except DRR_BEGIN, the trailing checksum field of
 * the header must be zero on entry; it is filled in with the running
 * checksum computed over the header bytes that precede it.  The checksum
 * field itself and then the payload are folded into *zc afterwards.
 *
 * Returns 0 on success, or errno if write() fails.
 */
static int
dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
    zio_cksum_t *zc, int outfd)
{
	const size_t cksum_off =
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum);
	zio_cksum_t *rec_cksum = &drr->drr_u.drr_checksum.drr_checksum;

	/* The checksum must be the last field of the record header. */
	assert(cksum_off ==
	    sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));

	fletcher_4_incremental_native(drr, cksum_off, zc);
	if (drr->drr_type != DRR_BEGIN) {
		assert(ZIO_CHECKSUM_IS_ZERO(rec_cksum));
		*rec_cksum = *zc;
	}
	fletcher_4_incremental_native(rec_cksum, sizeof (zio_cksum_t), zc);

	if (write(outfd, drr, sizeof (*drr)) == -1)
		return (errno);

	if (payload_len == 0)
		return (0);

	fletcher_4_incremental_native(payload, payload_len, zc);
	if (write(outfd, payload, payload_len) == -1)
		return (errno);

	return (0);
}
/*
 * Record that the WRITE record for <guid, object, offset> is located at
 * stream_offset in the stream file.  The new entry is pushed onto the
 * front of its hash bucket's chain; no check for an existing entry with
 * the same key is made.
 */
static void
rdt_insert(redup_table_t *rdt,
    uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
{
	uint64_t hash = cityhash4(guid, object, offset, 0);
	redup_entry_t **bucket =
	    &rdt->redup_hash_array[BF64_GET(hash, 0, rdt->numhashbits)];
	redup_entry_t *entry = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);

	entry->rde_guid = guid;
	entry->rde_object = object;
	entry->rde_offset = offset;
	entry->rde_stream_offset = stream_offset;

	/* Link at the head of the bucket. */
	entry->rde_next = *bucket;
	*bucket = entry;

	rdt->ddt_count++;
}
/*
 * Find the entry for <guid, object, offset> and store the stream file
 * offset of its WRITE record in *stream_offsetp.  The entry is expected
 * to exist; if it does not, an assertion fails.
 */
static void
rdt_lookup(redup_table_t *rdt,
    uint64_t guid, uint64_t object, uint64_t offset,
    uint64_t *stream_offsetp)
{
	uint64_t hash = cityhash4(guid, object, offset, 0);
	uint64_t bucket = BF64_GET(hash, 0, rdt->numhashbits);
	redup_entry_t *cur = rdt->redup_hash_array[bucket];

	/* Walk the bucket's chain looking for an exact key match. */
	while (cur != NULL) {
		if (cur->rde_guid == guid &&
		    cur->rde_object == object &&
		    cur->rde_offset == offset) {
			*stream_offsetp = cur->rde_stream_offset;
			return;
		}
		cur = cur->rde_next;
	}

	assert(!"could not find expected redup table entry");
}
/*
 * Convert a dedup stream (generated by "zfs send -D") to a
 * non-deduplicated stream.  The entire infd will be converted, including
 * any substreams in a stream package (generated by "zfs send -RD").  The
 * infd must be seekable.
 *
 * Records are read sequentially from infd through a stdio stream and
 * written to outfd with a recalculated checksum.  Every WRITE record's
 * file offset is remembered in a hash table; each WRITE_BYREF record is
 * replaced by the WRITE record it references, which is re-read from infd
 * with pread().
 *
 * infd is closed (via fclose()) before returning.  If verbose is set, a
 * summary is printed to stderr.  Read errors and a truncated input are
 * fatal (exit status 1); a failed write to outfd ends the conversion
 * early.
 */
static void
zfs_redup_stream(int infd, int outfd, boolean_t verbose)
{
	int bufsz = SPA_MAXBLOCKSIZE;
	dmu_replay_record_t thedrr = { 0 };
	dmu_replay_record_t *drr = &thedrr;
	redup_table_t rdt;
	zio_cksum_t stream_cksum;
	uint64_t numbuckets;
	uint64_t num_records = 0;
	uint64_t num_write_byref_records = 0;

#ifdef _ILP32
	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
#else
	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
	uint64_t max_rde_size =
	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
#endif

	numbuckets = max_rde_size / (sizeof (redup_entry_t));

	/*
	 * numbuckets must be a power of 2.  Increase number to
	 * a power of 2 if necessary.
	 */
	if (!ISP2(numbuckets))
		numbuckets = 1ULL << highbit64(numbuckets);

	rdt.redup_hash_array =
	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	rdt.numhashbits = highbit64(numbuckets) - 1;
	/* rdt lives on the stack, so the entry counter must start at zero. */
	rdt.ddt_count = 0;

	/*
	 * Start from a zero checksum so that it is well defined even if
	 * the input does not begin with a BEGIN record.
	 */
	ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);

	char *buf = safe_calloc(bufsz);
	FILE *ofp = fdopen(infd, "r");
	if (ofp == NULL) {
		(void) fprintf(stderr, "Error while opening stream: %s\n",
		    strerror(errno));
		exit(1);
	}

	/* File offset of the header of the record about to be read. */
	long offset = ftell(ofp);
	while (sfread(drr, sizeof (*drr), ofp) != 0) {
		num_records++;

		/*
		 * We need to regenerate the checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			bzero(&drr->drr_u.drr_checksum.drr_checksum,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}

		uint64_t payload_size = 0;
		switch (drr->drr_type) {
		case DRR_BEGIN:
		{
			struct drr_begin *drrb = &drr->drr_u.drr_begin;
			int fflags;
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);

			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);

			/* clear the DEDUP feature flag for this stream */
			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
			    DMU_BACKUP_FEATURE_DEDUPPROPS);
			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);

			int sz = drr->drr_payloadlen;
			if (sz != 0) {
				/* Grow the payload buffer if needed. */
				if (sz > bufsz) {
					free(buf);
					buf = safe_calloc(sz);
					bufsz = sz;
				}
				(void) sfread(buf, sz, ofp);
			}
			payload_size = sz;
			break;
		}

		case DRR_END:
		{
			struct drr_end *drre = &drr->drr_u.drr_end;
			/*
			 * Use the recalculated checksum, unless this is
			 * the END record of a stream package, which has
			 * no checksum.
			 */
			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
				drre->drr_checksum = stream_cksum;
			break;
		}

		case DRR_OBJECT:
		{
			struct drr_object *drro = &drr->drr_u.drr_object;

			if (drro->drr_bonuslen > 0) {
				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
				(void) sfread(buf, payload_size, ofp);
			}
			break;
		}

		case DRR_SPILL:
		{
			struct drr_spill *drrs = &drr->drr_u.drr_spill;
			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
			(void) sfread(buf, payload_size, ofp);
			break;
		}

		case DRR_WRITE_BYREF:
		{
			/* Copy: drr is overwritten by the pread below. */
			struct drr_write_byref drrwb =
			    drr->drr_u.drr_write_byref;

			num_write_byref_records++;

			/*
			 * Look up in hash table by drrwb->drr_refguid,
			 * drr_refobject, drr_refoffset.  Replace this
			 * record with the found WRITE record, but with
			 * drr_object,drr_offset,drr_toguid replaced with ours.
			 */
			uint64_t stream_offset;
			rdt_lookup(&rdt, drrwb.drr_refguid,
			    drrwb.drr_refobject, drrwb.drr_refoffset,
			    &stream_offset);

			spread(infd, drr, sizeof (*drr), stream_offset);

			assert(drr->drr_type == DRR_WRITE);
			struct drr_write *drrw = &drr->drr_u.drr_write;
			assert(drrw->drr_toguid == drrwb.drr_refguid);
			assert(drrw->drr_object == drrwb.drr_refobject);
			assert(drrw->drr_offset == drrwb.drr_refoffset);

			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
			spread(infd, buf, payload_size,
			    stream_offset + sizeof (*drr));

			drrw->drr_toguid = drrwb.drr_toguid;
			drrw->drr_object = drrwb.drr_object;
			drrw->drr_offset = drrwb.drr_offset;
			break;
		}

		case DRR_WRITE:
		{
			struct drr_write *drrw = &drr->drr_u.drr_write;
			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
			(void) sfread(buf, payload_size, ofp);

			/* Remember where this WRITE record's header is. */
			rdt_insert(&rdt, drrw->drr_toguid,
			    drrw->drr_object, drrw->drr_offset, offset);
			break;
		}

		case DRR_WRITE_EMBEDDED:
		{
			struct drr_write_embedded *drrwe =
			    &drr->drr_u.drr_write_embedded;
			payload_size =
			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
			(void) sfread(buf, payload_size, ofp);
			break;
		}

		case DRR_FREEOBJECTS:
		case DRR_FREE:
		case DRR_OBJECT_RANGE:
			break;

		default:
			(void) fprintf(stderr, "INVALID record type 0x%x\n",
			    drr->drr_type);
			/* should never happen, so assert */
			assert(B_FALSE);
		}

		if (feof(ofp)) {
			fprintf(stderr, "Error: unexpected end-of-file\n");
			exit(1);
		}
		if (ferror(ofp)) {
			fprintf(stderr, "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}

		/*
		 * We need to recalculate the checksum, and it needs to be
		 * initially zero to do that.  BEGIN records don't have
		 * a checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			bzero(&drr->drr_u.drr_checksum.drr_checksum,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}
		if (dump_record(drr, buf, payload_size,
		    &stream_cksum, outfd) != 0)
			break;
		if (drr->drr_type == DRR_END) {
			/*
			 * Typically the END record is either the last
			 * thing in the stream, or it is followed
			 * by a BEGIN record (which also zeros the checksum).
			 * However, a stream package ends with two END
			 * records.  The last END record's checksum starts
			 * from zero.
			 */
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
		}
		offset = ftell(ofp);
	}

	if (verbose) {
		char mem_str[16];
		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
		    mem_str, sizeof (mem_str));
		fprintf(stderr, "converted stream with %llu total records, "
		    "including %llu dedup records, using %sB memory.\n",
		    (unsigned long long)num_records,
		    (unsigned long long)num_write_byref_records,
		    mem_str);
	}

	umem_cache_destroy(rdt.ddecache);
	free(rdt.redup_hash_array);
	free(buf);
	/* This also closes infd. */
	(void) fclose(ofp);
}
/*
 * Implementation of "zstream redup [-v] FILE".
 *
 * Reads the deduplicated send stream in FILE and writes the equivalent
 * non-deduplicated stream to standard output, which must not be a
 * terminal.  With -v, a summary of the conversion is printed to stderr.
 *
 * argv[0] is the subcommand name.  Returns 0 on success and 1 if stdout
 * is a terminal; usage errors and a failure to open FILE exit with
 * status 1.
 */
int
zstream_do_redup(int argc, char *argv[])
{
	boolean_t verbose = B_FALSE;
	/*
	 * getopt() returns an int.  Storing it in a plain char breaks the
	 * comparison with -1 on platforms where char is unsigned, so the
	 * loop would never terminate there.
	 */
	int c;

	while ((c = getopt(argc, argv, "v")) != -1) {
		switch (c) {
		case 'v':
			verbose = B_TRUE;
			break;
		case '?':
			(void) fprintf(stderr, "invalid option '%c'\n",
			    optopt);
			zstream_usage();
			break;
		}
	}

	argc -= optind;
	argv += optind;

	if (argc != 1)
		zstream_usage();

	const char *filename = argv[0];

	if (isatty(STDOUT_FILENO)) {
		(void) fprintf(stderr,
		    "Error: Stream can not be written to a terminal.\n"
		    "You must redirect standard output.\n");
		return (1);
	}

	int fd = open(filename, O_RDONLY);
	if (fd == -1) {
		(void) fprintf(stderr,
		    "Error while opening file '%s': %s\n",
		    filename, strerror(errno));
		exit(1);
	}

	fletcher_4_init();
	/*
	 * zfs_redup_stream() wraps fd in a stdio stream and fclose()s it,
	 * which closes fd as well; it must not be closed again here.
	 */
	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
	fletcher_4_fini();

	return (0);
}
+1 -10
View File
@@ -1,10 +1 @@
include $(top_srcdir)/config/Rules.am
sbin_PROGRAMS = zstreamdump
zstreamdump_SOURCES = \
zstreamdump.c
zstreamdump_LDADD = \
$(top_builddir)/lib/libnvpair/libnvpair.la \
$(top_builddir)/lib/libzfs/libzfs.la
dist_sbin_SCRIPTS = zstreamdump
+3
View File
@@ -0,0 +1,3 @@
#!/bin/sh
# Compatibility wrapper for the former zstreamdump binary: forwards all
# command-line arguments unchanged to "zstream dump".
zstream dump "$@"