mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-16 04:57:14 +03:00
73968defdd
In the zstream code, Coverity reported: "The argument could be controlled by an attacker, who could invoke the function with arbitrary values (for example, a very high or negative buffer size)." It did not report this in the kernel. This is likely because the userspace code stored this in an int before passing it into the allocator, while the kernel code stored it in a uint32_t. However, this did reveal a potentially real problem. On 32-bit systems and systems with only 4GB of physical memory or less in general, it is possible to pass a large enough value that the system will hang. Even worse, on Linux systems, the kernel memory allocator is not able to support allocations up to the maximum 4GB allocation size that this allows. This had already been limited in userspace to 64MB by `ZFS_SENDRECV_MAX_NVLIST`, but we need a hard limit in the kernel to protect systems. After some discussion, we settle on 256MB as a hard upper limit. Attempting to receive a stream that requires more memory than that will result in E2BIG being returned to user space. Reported-by: Coverity (CID-1529836) Reported-by: Coverity (CID-1529837) Reported-by: Coverity (CID-1529838) Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14285
490 lines
12 KiB
C
490 lines
12 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* This file and its contents are supplied under the terms of the
|
|
* Common Development and Distribution License ("CDDL"), version 1.0.
|
|
* You may only use this file in accordance with the terms of version
|
|
* 1.0 of the CDDL.
|
|
*
|
|
* A full copy of the text of the CDDL should have accompanied this
|
|
* source. A copy of the CDDL is also available via the Internet at
|
|
* http://www.illumos.org/license/CDDL.
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2020 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <cityhash.h>
|
|
#include <ctype.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <libzfs.h>
|
|
#include <libzutil.h>
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <umem.h>
|
|
#include <unistd.h>
|
|
#include <sys/debug.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/zfs_ioctl.h>
|
|
#include <sys/zio_checksum.h>
|
|
#include "zfs_fletcher.h"
|
|
#include "zstream.h"
|
|
|
|
|
|
#define MAX_RDT_PHYSMEM_PERCENT 20
|
|
#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
|
|
|
|
typedef struct redup_entry {
|
|
struct redup_entry *rde_next;
|
|
uint64_t rde_guid;
|
|
uint64_t rde_object;
|
|
uint64_t rde_offset;
|
|
uint64_t rde_stream_offset;
|
|
} redup_entry_t;
|
|
|
|
typedef struct redup_table {
|
|
redup_entry_t **redup_hash_array;
|
|
umem_cache_t *ddecache;
|
|
uint64_t ddt_count;
|
|
int numhashbits;
|
|
} redup_table_t;
|
|
|
|
int
|
|
highbit64(uint64_t i)
|
|
{
|
|
if (i == 0)
|
|
return (0);
|
|
|
|
return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
|
|
}
|
|
|
|
void *
|
|
safe_calloc(size_t n)
|
|
{
|
|
void *rv = calloc(1, n);
|
|
if (rv == NULL) {
|
|
fprintf(stderr,
|
|
"Error: could not allocate %u bytes of memory\n",
|
|
(int)n);
|
|
exit(1);
|
|
}
|
|
return (rv);
|
|
}
|
|
|
|
/*
|
|
* Safe version of fread(), exits on error.
|
|
*/
|
|
int
|
|
sfread(void *buf, size_t size, FILE *fp)
|
|
{
|
|
int rv = fread(buf, size, 1, fp);
|
|
if (rv == 0 && ferror(fp)) {
|
|
(void) fprintf(stderr, "Error while reading file: %s\n",
|
|
strerror(errno));
|
|
exit(1);
|
|
}
|
|
return (rv);
|
|
}
|
|
|
|
/*
|
|
* Safe version of pread(), exits on error.
|
|
*/
|
|
static void
|
|
spread(int fd, void *buf, size_t count, off_t offset)
|
|
{
|
|
ssize_t err = pread(fd, buf, count, offset);
|
|
if (err == -1) {
|
|
(void) fprintf(stderr,
|
|
"Error while reading file: %s\n",
|
|
strerror(errno));
|
|
exit(1);
|
|
} else if (err != count) {
|
|
(void) fprintf(stderr,
|
|
"Error while reading file: short read\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
static int
|
|
dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
|
|
zio_cksum_t *zc, int outfd)
|
|
{
|
|
assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
|
|
== sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
|
|
fletcher_4_incremental_native(drr,
|
|
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
|
|
if (drr->drr_type != DRR_BEGIN) {
|
|
assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
|
|
drr_checksum.drr_checksum));
|
|
drr->drr_u.drr_checksum.drr_checksum = *zc;
|
|
}
|
|
fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
|
|
sizeof (zio_cksum_t), zc);
|
|
if (write(outfd, drr, sizeof (*drr)) == -1)
|
|
return (errno);
|
|
if (payload_len != 0) {
|
|
fletcher_4_incremental_native(payload, payload_len, zc);
|
|
if (write(outfd, payload, payload_len) == -1)
|
|
return (errno);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
rdt_insert(redup_table_t *rdt,
|
|
uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
|
|
{
|
|
uint64_t ch = cityhash4(guid, object, offset, 0);
|
|
uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
|
|
redup_entry_t **rdepp;
|
|
|
|
rdepp = &(rdt->redup_hash_array[hashcode]);
|
|
redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
|
|
rde->rde_next = *rdepp;
|
|
rde->rde_guid = guid;
|
|
rde->rde_object = object;
|
|
rde->rde_offset = offset;
|
|
rde->rde_stream_offset = stream_offset;
|
|
*rdepp = rde;
|
|
rdt->ddt_count++;
|
|
}
|
|
|
|
static void
|
|
rdt_lookup(redup_table_t *rdt,
|
|
uint64_t guid, uint64_t object, uint64_t offset,
|
|
uint64_t *stream_offsetp)
|
|
{
|
|
uint64_t ch = cityhash4(guid, object, offset, 0);
|
|
uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
|
|
|
|
for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
|
|
rde != NULL; rde = rde->rde_next) {
|
|
if (rde->rde_guid == guid &&
|
|
rde->rde_object == object &&
|
|
rde->rde_offset == offset) {
|
|
*stream_offsetp = rde->rde_stream_offset;
|
|
return;
|
|
}
|
|
}
|
|
assert(!"could not find expected redup table entry");
|
|
}
|
|
|
|
/*
|
|
* Convert a dedup stream (generated by "zfs send -D") to a
|
|
* non-deduplicated stream. The entire infd will be converted, including
|
|
* any substreams in a stream package (generated by "zfs send -RD"). The
|
|
* infd must be seekable.
|
|
*/
|
|
static void
|
|
zfs_redup_stream(int infd, int outfd, boolean_t verbose)
|
|
{
|
|
int bufsz = SPA_MAXBLOCKSIZE;
|
|
dmu_replay_record_t thedrr = { 0 };
|
|
dmu_replay_record_t *drr = &thedrr;
|
|
redup_table_t rdt;
|
|
zio_cksum_t stream_cksum;
|
|
uint64_t numbuckets;
|
|
uint64_t num_records = 0;
|
|
uint64_t num_write_byref_records = 0;
|
|
|
|
#ifdef _ILP32
|
|
uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
|
|
#else
|
|
uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
|
|
uint64_t max_rde_size =
|
|
MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
|
|
SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
|
|
#endif
|
|
|
|
numbuckets = max_rde_size / (sizeof (redup_entry_t));
|
|
|
|
/*
|
|
* numbuckets must be a power of 2. Increase number to
|
|
* a power of 2 if necessary.
|
|
*/
|
|
if (!ISP2(numbuckets))
|
|
numbuckets = 1ULL << highbit64(numbuckets);
|
|
|
|
rdt.redup_hash_array =
|
|
safe_calloc(numbuckets * sizeof (redup_entry_t *));
|
|
rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
|
|
NULL, NULL, NULL, NULL, NULL, 0);
|
|
rdt.numhashbits = highbit64(numbuckets) - 1;
|
|
rdt.ddt_count = 0;
|
|
|
|
char *buf = safe_calloc(bufsz);
|
|
FILE *ofp = fdopen(infd, "r");
|
|
long offset = ftell(ofp);
|
|
int begin = 0;
|
|
boolean_t seen = B_FALSE;
|
|
while (sfread(drr, sizeof (*drr), ofp) != 0) {
|
|
num_records++;
|
|
|
|
/*
|
|
* We need to regenerate the checksum.
|
|
*/
|
|
if (drr->drr_type != DRR_BEGIN) {
|
|
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
|
|
sizeof (drr->drr_u.drr_checksum.drr_checksum));
|
|
}
|
|
|
|
uint64_t payload_size = 0;
|
|
switch (drr->drr_type) {
|
|
case DRR_BEGIN:
|
|
{
|
|
struct drr_begin *drrb = &drr->drr_u.drr_begin;
|
|
int fflags;
|
|
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
|
|
VERIFY0(begin++);
|
|
seen = B_TRUE;
|
|
|
|
assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
|
|
|
|
/* clear the DEDUP feature flag for this stream */
|
|
fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
|
|
fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
|
|
DMU_BACKUP_FEATURE_DEDUPPROPS);
|
|
/* cppcheck-suppress syntaxError */
|
|
DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
|
|
|
|
uint32_t sz = drr->drr_payloadlen;
|
|
|
|
VERIFY3U(sz, <=, 1U << 28);
|
|
|
|
if (sz != 0) {
|
|
if (sz > bufsz) {
|
|
free(buf);
|
|
buf = safe_calloc(sz);
|
|
bufsz = sz;
|
|
}
|
|
(void) sfread(buf, sz, ofp);
|
|
}
|
|
payload_size = sz;
|
|
break;
|
|
}
|
|
|
|
case DRR_END:
|
|
{
|
|
struct drr_end *drre = &drr->drr_u.drr_end;
|
|
/*
|
|
* We would prefer to just check --begin == 0, but
|
|
* replication streams have an end of stream END
|
|
* record, so we must avoid tripping it.
|
|
*/
|
|
VERIFY3B(seen, ==, B_TRUE);
|
|
begin--;
|
|
/*
|
|
* Use the recalculated checksum, unless this is
|
|
* the END record of a stream package, which has
|
|
* no checksum.
|
|
*/
|
|
if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
|
|
drre->drr_checksum = stream_cksum;
|
|
break;
|
|
}
|
|
|
|
case DRR_OBJECT:
|
|
{
|
|
struct drr_object *drro = &drr->drr_u.drr_object;
|
|
VERIFY3S(begin, ==, 1);
|
|
|
|
if (drro->drr_bonuslen > 0) {
|
|
payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
|
|
(void) sfread(buf, payload_size, ofp);
|
|
}
|
|
break;
|
|
}
|
|
|
|
case DRR_SPILL:
|
|
{
|
|
struct drr_spill *drrs = &drr->drr_u.drr_spill;
|
|
VERIFY3S(begin, ==, 1);
|
|
payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
|
|
(void) sfread(buf, payload_size, ofp);
|
|
break;
|
|
}
|
|
|
|
case DRR_WRITE_BYREF:
|
|
{
|
|
struct drr_write_byref drrwb =
|
|
drr->drr_u.drr_write_byref;
|
|
VERIFY3S(begin, ==, 1);
|
|
|
|
num_write_byref_records++;
|
|
|
|
/*
|
|
* Look up in hash table by drrwb->drr_refguid,
|
|
* drr_refobject, drr_refoffset. Replace this
|
|
* record with the found WRITE record, but with
|
|
* drr_object,drr_offset,drr_toguid replaced with ours.
|
|
*/
|
|
uint64_t stream_offset = 0;
|
|
rdt_lookup(&rdt, drrwb.drr_refguid,
|
|
drrwb.drr_refobject, drrwb.drr_refoffset,
|
|
&stream_offset);
|
|
|
|
spread(infd, drr, sizeof (*drr), stream_offset);
|
|
|
|
assert(drr->drr_type == DRR_WRITE);
|
|
struct drr_write *drrw = &drr->drr_u.drr_write;
|
|
assert(drrw->drr_toguid == drrwb.drr_refguid);
|
|
assert(drrw->drr_object == drrwb.drr_refobject);
|
|
assert(drrw->drr_offset == drrwb.drr_refoffset);
|
|
|
|
payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
|
|
spread(infd, buf, payload_size,
|
|
stream_offset + sizeof (*drr));
|
|
|
|
drrw->drr_toguid = drrwb.drr_toguid;
|
|
drrw->drr_object = drrwb.drr_object;
|
|
drrw->drr_offset = drrwb.drr_offset;
|
|
break;
|
|
}
|
|
|
|
case DRR_WRITE:
|
|
{
|
|
struct drr_write *drrw = &drr->drr_u.drr_write;
|
|
VERIFY3S(begin, ==, 1);
|
|
payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
|
|
(void) sfread(buf, payload_size, ofp);
|
|
|
|
rdt_insert(&rdt, drrw->drr_toguid,
|
|
drrw->drr_object, drrw->drr_offset, offset);
|
|
break;
|
|
}
|
|
|
|
case DRR_WRITE_EMBEDDED:
|
|
{
|
|
struct drr_write_embedded *drrwe =
|
|
&drr->drr_u.drr_write_embedded;
|
|
VERIFY3S(begin, ==, 1);
|
|
payload_size =
|
|
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
|
|
(void) sfread(buf, payload_size, ofp);
|
|
break;
|
|
}
|
|
|
|
case DRR_FREEOBJECTS:
|
|
case DRR_FREE:
|
|
case DRR_OBJECT_RANGE:
|
|
VERIFY3S(begin, ==, 1);
|
|
break;
|
|
|
|
default:
|
|
(void) fprintf(stderr, "INVALID record type 0x%x\n",
|
|
drr->drr_type);
|
|
/* should never happen, so assert */
|
|
assert(B_FALSE);
|
|
}
|
|
|
|
if (feof(ofp)) {
|
|
fprintf(stderr, "Error: unexpected end-of-file\n");
|
|
exit(1);
|
|
}
|
|
if (ferror(ofp)) {
|
|
fprintf(stderr, "Error while reading file: %s\n",
|
|
strerror(errno));
|
|
exit(1);
|
|
}
|
|
|
|
/*
|
|
* We need to recalculate the checksum, and it needs to be
|
|
* initially zero to do that. BEGIN records don't have
|
|
* a checksum.
|
|
*/
|
|
if (drr->drr_type != DRR_BEGIN) {
|
|
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
|
|
sizeof (drr->drr_u.drr_checksum.drr_checksum));
|
|
}
|
|
if (dump_record(drr, buf, payload_size,
|
|
&stream_cksum, outfd) != 0)
|
|
break;
|
|
if (drr->drr_type == DRR_END) {
|
|
/*
|
|
* Typically the END record is either the last
|
|
* thing in the stream, or it is followed
|
|
* by a BEGIN record (which also zeros the checksum).
|
|
* However, a stream package ends with two END
|
|
* records. The last END record's checksum starts
|
|
* from zero.
|
|
*/
|
|
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
|
|
}
|
|
offset = ftell(ofp);
|
|
}
|
|
|
|
if (verbose) {
|
|
char mem_str[16];
|
|
zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
|
|
mem_str, sizeof (mem_str));
|
|
fprintf(stderr, "converted stream with %llu total records, "
|
|
"including %llu dedup records, using %sB memory.\n",
|
|
(long long)num_records,
|
|
(long long)num_write_byref_records,
|
|
mem_str);
|
|
}
|
|
|
|
umem_cache_destroy(rdt.ddecache);
|
|
free(rdt.redup_hash_array);
|
|
free(buf);
|
|
(void) fclose(ofp);
|
|
}
|
|
|
|
int
|
|
zstream_do_redup(int argc, char *argv[])
|
|
{
|
|
boolean_t verbose = B_FALSE;
|
|
int c;
|
|
|
|
while ((c = getopt(argc, argv, "v")) != -1) {
|
|
switch (c) {
|
|
case 'v':
|
|
verbose = B_TRUE;
|
|
break;
|
|
case '?':
|
|
(void) fprintf(stderr, "invalid option '%c'\n",
|
|
optopt);
|
|
zstream_usage();
|
|
break;
|
|
}
|
|
}
|
|
|
|
argc -= optind;
|
|
argv += optind;
|
|
|
|
if (argc != 1)
|
|
zstream_usage();
|
|
|
|
const char *filename = argv[0];
|
|
|
|
if (isatty(STDOUT_FILENO)) {
|
|
(void) fprintf(stderr,
|
|
"Error: Stream can not be written to a terminal.\n"
|
|
"You must redirect standard output.\n");
|
|
return (1);
|
|
}
|
|
|
|
int fd = open(filename, O_RDONLY);
|
|
if (fd == -1) {
|
|
(void) fprintf(stderr,
|
|
"Error while opening file '%s': %s\n",
|
|
filename, strerror(errno));
|
|
exit(1);
|
|
}
|
|
|
|
fletcher_4_init();
|
|
zfs_redup_stream(fd, STDOUT_FILENO, verbose);
|
|
fletcher_4_fini();
|
|
|
|
close(fd);
|
|
|
|
return (0);
|
|
}
|