mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	 9530eb64e0
			
		
	
	
		9530eb64e0
		
	
	
	
	
		
			
			Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris <robn@despairlabs.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
		
			
				
	
	
		
			484 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			484 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: CDDL-1.0
 | |
| /*
 | |
|  * CDDL HEADER START
 | |
|  *
 | |
|  * This file and its contents are supplied under the terms of the
 | |
|  * Common Development and Distribution License ("CDDL"), version 1.0.
 | |
|  * You may only use this file in accordance with the terms of version
 | |
|  * 1.0 of the CDDL.
 | |
|  *
 | |
|  * A full copy of the text of the CDDL should have accompanied this
 | |
|  * source.  A copy of the CDDL is also available via the Internet at
 | |
|  * http://www.illumos.org/license/CDDL.
 | |
|  *
 | |
|  * CDDL HEADER END
 | |
|  */
 | |
| 
 | |
| /*
 | |
|  * Copyright (c) 2020 by Delphix. All rights reserved.
 | |
|  */
 | |
| 
 | |
| #include <assert.h>
 | |
| #include <cityhash.h>
 | |
| #include <ctype.h>
 | |
| #include <errno.h>
 | |
| #include <fcntl.h>
 | |
| #include <libzfs.h>
 | |
| #include <libzutil.h>
 | |
| #include <stddef.h>
 | |
| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include <string.h>
 | |
| #include <umem.h>
 | |
| #include <unistd.h>
 | |
| #include <sys/debug.h>
 | |
| #include <sys/stat.h>
 | |
| #include <sys/zfs_ioctl.h>
 | |
| #include <sys/zio_checksum.h>
 | |
| #include "zfs_fletcher.h"
 | |
| #include "zstream.h"
 | |
| 
 | |
| 
 | |
| #define	MAX_RDT_PHYSMEM_PERCENT		20
 | |
| #define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
 | |
| 
 | |
| typedef struct redup_entry {
 | |
| 	struct redup_entry	*rde_next;
 | |
| 	uint64_t rde_guid;
 | |
| 	uint64_t rde_object;
 | |
| 	uint64_t rde_offset;
 | |
| 	uint64_t rde_stream_offset;
 | |
| } redup_entry_t;
 | |
| 
 | |
| typedef struct redup_table {
 | |
| 	redup_entry_t	**redup_hash_array;
 | |
| 	umem_cache_t	*ddecache;
 | |
| 	uint64_t	ddt_count;
 | |
| 	int		numhashbits;
 | |
| } redup_table_t;
 | |
| 
 | |
| void *
 | |
| safe_calloc(size_t n)
 | |
| {
 | |
| 	void *rv = calloc(1, n);
 | |
| 	if (rv == NULL) {
 | |
| 		fprintf(stderr,
 | |
| 		    "Error: could not allocate %u bytes of memory\n",
 | |
| 		    (int)n);
 | |
| 		exit(1);
 | |
| 	}
 | |
| 	return (rv);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Safe version of fread(), exits on error.
 | |
|  */
 | |
| int
 | |
| sfread(void *buf, size_t size, FILE *fp)
 | |
| {
 | |
| 	int rv = fread(buf, size, 1, fp);
 | |
| 	if (rv == 0 && ferror(fp)) {
 | |
| 		(void) fprintf(stderr, "Error while reading file: %s\n",
 | |
| 		    strerror(errno));
 | |
| 		exit(1);
 | |
| 	}
 | |
| 	return (rv);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Safe version of pread(), exits on error.
 | |
|  */
 | |
| static void
 | |
| spread(int fd, void *buf, size_t count, off_t offset)
 | |
| {
 | |
| 	ssize_t err = pread(fd, buf, count, offset);
 | |
| 	if (err == -1) {
 | |
| 		(void) fprintf(stderr,
 | |
| 		    "Error while reading file: %s\n",
 | |
| 		    strerror(errno));
 | |
| 		exit(1);
 | |
| 	} else if (err != count) {
 | |
| 		(void) fprintf(stderr,
 | |
| 		    "Error while reading file: short read\n");
 | |
| 		exit(1);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static int
 | |
| dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
 | |
|     zio_cksum_t *zc, int outfd)
 | |
| {
 | |
| 	assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
 | |
| 	    == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 | |
| 	fletcher_4_incremental_native(drr,
 | |
| 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
 | |
| 	if (drr->drr_type != DRR_BEGIN) {
 | |
| 		assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
 | |
| 		    drr_checksum.drr_checksum));
 | |
| 		drr->drr_u.drr_checksum.drr_checksum = *zc;
 | |
| 	}
 | |
| 	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
 | |
| 	    sizeof (zio_cksum_t), zc);
 | |
| 	if (write(outfd, drr, sizeof (*drr)) == -1)
 | |
| 		return (errno);
 | |
| 	if (payload_len != 0) {
 | |
| 		fletcher_4_incremental_native(payload, payload_len, zc);
 | |
| 		if (write(outfd, payload, payload_len) == -1)
 | |
| 			return (errno);
 | |
| 	}
 | |
| 	return (0);
 | |
| }
 | |
| 
 | |
| static void
 | |
| rdt_insert(redup_table_t *rdt,
 | |
|     uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
 | |
| {
 | |
| 	uint64_t ch = cityhash3(guid, object, offset);
 | |
| 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
 | |
| 	redup_entry_t **rdepp;
 | |
| 
 | |
| 	rdepp = &(rdt->redup_hash_array[hashcode]);
 | |
| 	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
 | |
| 	rde->rde_next = *rdepp;
 | |
| 	rde->rde_guid = guid;
 | |
| 	rde->rde_object = object;
 | |
| 	rde->rde_offset = offset;
 | |
| 	rde->rde_stream_offset = stream_offset;
 | |
| 	*rdepp = rde;
 | |
| 	rdt->ddt_count++;
 | |
| }
 | |
| 
 | |
| static void
 | |
| rdt_lookup(redup_table_t *rdt,
 | |
|     uint64_t guid, uint64_t object, uint64_t offset,
 | |
|     uint64_t *stream_offsetp)
 | |
| {
 | |
| 	uint64_t ch = cityhash3(guid, object, offset);
 | |
| 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
 | |
| 
 | |
| 	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
 | |
| 	    rde != NULL; rde = rde->rde_next) {
 | |
| 		if (rde->rde_guid == guid &&
 | |
| 		    rde->rde_object == object &&
 | |
| 		    rde->rde_offset == offset) {
 | |
| 			*stream_offsetp = rde->rde_stream_offset;
 | |
| 			return;
 | |
| 		}
 | |
| 	}
 | |
| 	assert(!"could not find expected redup table entry");
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Convert a dedup stream (generated by "zfs send -D") to a
 | |
|  * non-deduplicated stream.  The entire infd will be converted, including
 | |
|  * any substreams in a stream package (generated by "zfs send -RD"). The
 | |
|  * infd must be seekable.
 | |
|  */
 | |
| static void
 | |
| zfs_redup_stream(int infd, int outfd, boolean_t verbose)
 | |
| {
 | |
| 	int bufsz = SPA_MAXBLOCKSIZE;
 | |
| 	dmu_replay_record_t thedrr;
 | |
| 	dmu_replay_record_t *drr = &thedrr;
 | |
| 	redup_table_t rdt;
 | |
| 	zio_cksum_t stream_cksum;
 | |
| 	uint64_t numbuckets;
 | |
| 	uint64_t num_records = 0;
 | |
| 	uint64_t num_write_byref_records = 0;
 | |
| 
 | |
| 	memset(&thedrr, 0, sizeof (dmu_replay_record_t));
 | |
| 
 | |
| #ifdef _ILP32
 | |
| 	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
 | |
| #else
 | |
| 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
 | |
| 	uint64_t max_rde_size =
 | |
| 	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
 | |
| 	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
 | |
| #endif
 | |
| 
 | |
| 	numbuckets = max_rde_size / (sizeof (redup_entry_t));
 | |
| 
 | |
| 	/*
 | |
| 	 * numbuckets must be a power of 2.  Increase number to
 | |
| 	 * a power of 2 if necessary.
 | |
| 	 */
 | |
| 	if (!ISP2(numbuckets))
 | |
| 		numbuckets = 1ULL << highbit64(numbuckets);
 | |
| 
 | |
| 	rdt.redup_hash_array =
 | |
| 	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
 | |
| 	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
 | |
| 	    NULL, NULL, NULL, NULL, NULL, 0);
 | |
| 	rdt.numhashbits = highbit64(numbuckets) - 1;
 | |
| 	rdt.ddt_count = 0;
 | |
| 
 | |
| 	char *buf = safe_calloc(bufsz);
 | |
| 	FILE *ofp = fdopen(infd, "r");
 | |
| 	long offset = ftell(ofp);
 | |
| 	int begin = 0;
 | |
| 	boolean_t seen = B_FALSE;
 | |
| 	while (sfread(drr, sizeof (*drr), ofp) != 0) {
 | |
| 		num_records++;
 | |
| 
 | |
| 		/*
 | |
| 		 * We need to regenerate the checksum.
 | |
| 		 */
 | |
| 		if (drr->drr_type != DRR_BEGIN) {
 | |
| 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
 | |
| 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
 | |
| 		}
 | |
| 
 | |
| 		uint64_t payload_size = 0;
 | |
| 		switch (drr->drr_type) {
 | |
| 		case DRR_BEGIN:
 | |
| 		{
 | |
| 			struct drr_begin *drrb = &drr->drr_u.drr_begin;
 | |
| 			int fflags;
 | |
| 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
 | |
| 			VERIFY0(begin++);
 | |
| 			seen = B_TRUE;
 | |
| 
 | |
| 			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
 | |
| 
 | |
| 			/* clear the DEDUP feature flag for this stream */
 | |
| 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 | |
| 			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
 | |
| 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
 | |
| 			/* cppcheck-suppress syntaxError */
 | |
| 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
 | |
| 
 | |
| 			uint32_t sz = drr->drr_payloadlen;
 | |
| 
 | |
| 			VERIFY3U(sz, <=, 1U << 28);
 | |
| 
 | |
| 			if (sz != 0) {
 | |
| 				if (sz > bufsz) {
 | |
| 					free(buf);
 | |
| 					buf = safe_calloc(sz);
 | |
| 					bufsz = sz;
 | |
| 				}
 | |
| 				(void) sfread(buf, sz, ofp);
 | |
| 			}
 | |
| 			payload_size = sz;
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_END:
 | |
| 		{
 | |
| 			struct drr_end *drre = &drr->drr_u.drr_end;
 | |
| 			/*
 | |
| 			 * We would prefer to just check --begin == 0, but
 | |
| 			 * replication streams have an end of stream END
 | |
| 			 * record, so we must avoid tripping it.
 | |
| 			 */
 | |
| 			VERIFY3B(seen, ==, B_TRUE);
 | |
| 			begin--;
 | |
| 			/*
 | |
| 			 * Use the recalculated checksum, unless this is
 | |
| 			 * the END record of a stream package, which has
 | |
| 			 * no checksum.
 | |
| 			 */
 | |
| 			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
 | |
| 				drre->drr_checksum = stream_cksum;
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_OBJECT:
 | |
| 		{
 | |
| 			struct drr_object *drro = &drr->drr_u.drr_object;
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 
 | |
| 			if (drro->drr_bonuslen > 0) {
 | |
| 				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
 | |
| 				(void) sfread(buf, payload_size, ofp);
 | |
| 			}
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_SPILL:
 | |
| 		{
 | |
| 			struct drr_spill *drrs = &drr->drr_u.drr_spill;
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
 | |
| 			(void) sfread(buf, payload_size, ofp);
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_WRITE_BYREF:
 | |
| 		{
 | |
| 			struct drr_write_byref drrwb =
 | |
| 			    drr->drr_u.drr_write_byref;
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 
 | |
| 			num_write_byref_records++;
 | |
| 
 | |
| 			/*
 | |
| 			 * Look up in hash table by drrwb->drr_refguid,
 | |
| 			 * drr_refobject, drr_refoffset.  Replace this
 | |
| 			 * record with the found WRITE record, but with
 | |
| 			 * drr_object,drr_offset,drr_toguid replaced with ours.
 | |
| 			 */
 | |
| 			uint64_t stream_offset = 0;
 | |
| 			rdt_lookup(&rdt, drrwb.drr_refguid,
 | |
| 			    drrwb.drr_refobject, drrwb.drr_refoffset,
 | |
| 			    &stream_offset);
 | |
| 
 | |
| 			spread(infd, drr, sizeof (*drr), stream_offset);
 | |
| 
 | |
| 			assert(drr->drr_type == DRR_WRITE);
 | |
| 			struct drr_write *drrw = &drr->drr_u.drr_write;
 | |
| 			assert(drrw->drr_toguid == drrwb.drr_refguid);
 | |
| 			assert(drrw->drr_object == drrwb.drr_refobject);
 | |
| 			assert(drrw->drr_offset == drrwb.drr_refoffset);
 | |
| 
 | |
| 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
 | |
| 			spread(infd, buf, payload_size,
 | |
| 			    stream_offset + sizeof (*drr));
 | |
| 
 | |
| 			drrw->drr_toguid = drrwb.drr_toguid;
 | |
| 			drrw->drr_object = drrwb.drr_object;
 | |
| 			drrw->drr_offset = drrwb.drr_offset;
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_WRITE:
 | |
| 		{
 | |
| 			struct drr_write *drrw = &drr->drr_u.drr_write;
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
 | |
| 			(void) sfread(buf, payload_size, ofp);
 | |
| 
 | |
| 			rdt_insert(&rdt, drrw->drr_toguid,
 | |
| 			    drrw->drr_object, drrw->drr_offset, offset);
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_WRITE_EMBEDDED:
 | |
| 		{
 | |
| 			struct drr_write_embedded *drrwe =
 | |
| 			    &drr->drr_u.drr_write_embedded;
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 			payload_size =
 | |
| 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
 | |
| 			(void) sfread(buf, payload_size, ofp);
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		case DRR_FREEOBJECTS:
 | |
| 		case DRR_FREE:
 | |
| 		case DRR_OBJECT_RANGE:
 | |
| 			VERIFY3S(begin, ==, 1);
 | |
| 			break;
 | |
| 
 | |
| 		default:
 | |
| 			(void) fprintf(stderr, "INVALID record type 0x%x\n",
 | |
| 			    drr->drr_type);
 | |
| 			/* should never happen, so assert */
 | |
| 			assert(B_FALSE);
 | |
| 		}
 | |
| 
 | |
| 		if (feof(ofp)) {
 | |
| 			fprintf(stderr, "Error: unexpected end-of-file\n");
 | |
| 			exit(1);
 | |
| 		}
 | |
| 		if (ferror(ofp)) {
 | |
| 			fprintf(stderr, "Error while reading file: %s\n",
 | |
| 			    strerror(errno));
 | |
| 			exit(1);
 | |
| 		}
 | |
| 
 | |
| 		/*
 | |
| 		 * We need to recalculate the checksum, and it needs to be
 | |
| 		 * initially zero to do that.  BEGIN records don't have
 | |
| 		 * a checksum.
 | |
| 		 */
 | |
| 		if (drr->drr_type != DRR_BEGIN) {
 | |
| 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
 | |
| 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
 | |
| 		}
 | |
| 		if (dump_record(drr, buf, payload_size,
 | |
| 		    &stream_cksum, outfd) != 0)
 | |
| 			break;
 | |
| 		if (drr->drr_type == DRR_END) {
 | |
| 			/*
 | |
| 			 * Typically the END record is either the last
 | |
| 			 * thing in the stream, or it is followed
 | |
| 			 * by a BEGIN record (which also zeros the checksum).
 | |
| 			 * However, a stream package ends with two END
 | |
| 			 * records.  The last END record's checksum starts
 | |
| 			 * from zero.
 | |
| 			 */
 | |
| 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
 | |
| 		}
 | |
| 		offset = ftell(ofp);
 | |
| 	}
 | |
| 
 | |
| 	if (verbose) {
 | |
| 		char mem_str[16];
 | |
| 		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
 | |
| 		    mem_str, sizeof (mem_str));
 | |
| 		fprintf(stderr, "converted stream with %llu total records, "
 | |
| 		    "including %llu dedup records, using %sB memory.\n",
 | |
| 		    (long long)num_records,
 | |
| 		    (long long)num_write_byref_records,
 | |
| 		    mem_str);
 | |
| 	}
 | |
| 
 | |
| 	umem_cache_destroy(rdt.ddecache);
 | |
| 	free(rdt.redup_hash_array);
 | |
| 	free(buf);
 | |
| 	(void) fclose(ofp);
 | |
| }
 | |
| 
 | |
| int
 | |
| zstream_do_redup(int argc, char *argv[])
 | |
| {
 | |
| 	boolean_t verbose = B_FALSE;
 | |
| 	int c;
 | |
| 
 | |
| 	while ((c = getopt(argc, argv, "v")) != -1) {
 | |
| 		switch (c) {
 | |
| 		case 'v':
 | |
| 			verbose = B_TRUE;
 | |
| 			break;
 | |
| 		case '?':
 | |
| 			(void) fprintf(stderr, "invalid option '%c'\n",
 | |
| 			    optopt);
 | |
| 			zstream_usage();
 | |
| 			break;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	argc -= optind;
 | |
| 	argv += optind;
 | |
| 
 | |
| 	if (argc != 1)
 | |
| 		zstream_usage();
 | |
| 
 | |
| 	const char *filename = argv[0];
 | |
| 
 | |
| 	if (isatty(STDOUT_FILENO)) {
 | |
| 		(void) fprintf(stderr,
 | |
| 		    "Error: Stream can not be written to a terminal.\n"
 | |
| 		    "You must redirect standard output.\n");
 | |
| 		return (1);
 | |
| 	}
 | |
| 
 | |
| 	int fd = open(filename, O_RDONLY);
 | |
| 	if (fd == -1) {
 | |
| 		(void) fprintf(stderr,
 | |
| 		    "Error while opening file '%s': %s\n",
 | |
| 		    filename, strerror(errno));
 | |
| 		exit(1);
 | |
| 	}
 | |
| 
 | |
| 	fletcher_4_init();
 | |
| 	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
 | |
| 	fletcher_4_fini();
 | |
| 
 | |
| 	close(fd);
 | |
| 
 | |
| 	return (0);
 | |
| }
 |