mirror_zfs/lib/libzpool/kernel.c

912 lines
18 KiB
C
Raw Normal View History

// SPDX-License-Identifier: CDDL-1.0
2008-11-20 23:01:55 +03:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
2008-11-20 23:01:55 +03:00
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
Implement Redacted Send/Receive Redacted send/receive allows users to send subsets of their data to a target system. One possible use case for this feature is to not transmit sensitive information to a data warehousing, test/dev, or analytics environment. Another is to save space by not replicating unimportant data within a given dataset, for example in backup tools like zrepl. Redacted send/receive is a three-stage process. First, a clone (or clones) is made of the snapshot to be sent to the target. In this clone (or clones), all unnecessary or unwanted data is removed or modified. This clone is then snapshotted to create the "redaction snapshot" (or snapshots). Second, the new zfs redact command is used to create a redaction bookmark. The redaction bookmark stores the list of blocks in a snapshot that were modified by the redaction snapshot(s). Finally, the redaction bookmark is passed as a parameter to zfs send. When sending to the snapshot that was redacted, the redaction bookmark is used to filter out blocks that contain sensitive or unwanted information, and those blocks are not included in the send stream. When sending from the redaction bookmark, the blocks it contains are considered as candidate blocks in addition to those blocks in the destination snapshot that were modified since the creation_txg of the redaction bookmark. This step is necessary to allow the target to rehydrate data in the case where some blocks are accidentally or unnecessarily modified in the redaction snapshot. The changes to bookmarks to enable fast space estimation involve adding deadlists to bookmarks. There is also logic to manage the life cycles of these deadlists. The new size estimation process operates in cases where previously an accurate estimate could not be provided. In those cases, a send is performed where no data blocks are read, reducing the runtime significantly and providing a byte-accurate size estimate. Reviewed-by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed-by: Prashanth Sreenivasa <pks@delphix.com> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Reviewed-by: Chris Williamson <chris.williamson@delphix.com> Reviewed-by: Pavel Zhakarov <pavel.zakharov@delphix.com> Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed-by: Prakash Surya <prakash.surya@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <pcd@delphix.com> Closes #7958
2019-06-19 19:48:13 +03:00
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2025, Klara, Inc.
2008-11-20 23:01:55 +03:00
*/
#include <assert.h>
#include <fcntl.h>
#include <libgen.h>
2008-11-20 23:01:55 +03:00
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
2008-11-20 23:01:55 +03:00
#include <sys/processor.h>
#include <sys/rrwlock.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/sid.h>
#include <sys/stat.h>
2009-02-18 23:51:31 +03:00
#include <sys/systeminfo.h>
#include <sys/time.h>
#include <sys/tsd.h>
#include <libspl.h>
#include <libzpool.h>
#include <sys/zfs_context.h>
#include <sys/zfs_onexit.h>
#include <sys/zfs_vfsops.h>
Add zstd support to zfs This PR adds two new compression types, based on ZStandard: - zstd: A basic ZStandard compression algorithm Available compression. Levels for zstd are zstd-1 through zstd-19, where the compression increases with every level, but speed decreases. - zstd-fast: A faster version of the ZStandard compression algorithm zstd-fast is basically a "negative" level of zstd. The compression decreases with every level, but speed increases. Available compression levels for zstd-fast: - zstd-fast-1 through zstd-fast-10 - zstd-fast-20 through zstd-fast-100 (in increments of 10) - zstd-fast-500 and zstd-fast-1000 For more information check the man page. Implementation details: Rather than treat each level of zstd as a different algorithm (as was done historically with gzip), the block pointer `enum zio_compress` value is simply zstd for all levels, including zstd-fast, since they all use the same decompression function. The compress= property (a 64bit unsigned integer) uses the lower 7 bits to store the compression algorithm (matching the number of bits used in a block pointer, as the 8th bit was borrowed for embedded block pointers). The upper bits are used to store the compression level. It is necessary to be able to determine what compression level was used when later reading a block back, so the concept used in LZ4, where the first 32bits of the on-disk value are the size of the compressed data (since the allocation is rounded up to the nearest ashift), was extended, and we store the version of ZSTD and the level as well as the compressed size. This value is returned when decompressing a block, so that if the block needs to be recompressed (L2ARC, nop-write, etc), that the same parameters will be used to result in the matching checksum. All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`, `zio_prop_t`, etc.) uses the separated _compress and _complevel variables. Only the properties ZAP contains the combined/bit-shifted value. The combined value is split when the compression_changed_cb() callback is called, and sets both objset members (os_compress and os_complevel). The userspace tools all use the combined/bit-shifted value. Additional notes: zdb can now also decode the ZSTD compression header (flag -Z) and inspect the size, version and compression level saved in that header. For each record, if it is ZSTD compressed, the parameters of the decoded compression header get printed. ZSTD is included with all current tests and new tests are added as-needed. Per-dataset feature flags now get activated when the property is set. If a compression algorithm requires a feature flag, zfs activates the feature when the property is set, rather than waiting for the first block to be born. This is currently only used by zstd but can be extended as needed. Portions-Sponsored-By: The FreeBSD Foundation Co-authored-by: Allan Jude <allanjude@freebsd.org> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Co-authored-by: Michael Niewöhner <foss@mniewoehner.de> Signed-off-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Allan Jude <allanjude@freebsd.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Signed-off-by: Michael Niewöhner <foss@mniewoehner.de> Closes #6247 Closes #9024 Closes #10277 Closes #10278
2020-08-18 20:10:17 +03:00
#include <sys/zstd/zstd.h>
#include <sys/zvol.h>
#include <zfs_fletcher.h>
#include <zlib.h>
2008-11-20 23:01:55 +03:00
/*
* Emulation of kernel services in userland.
*/
uint32_t hostid;
2008-11-20 23:01:55 +03:00
/* If set, all blocks read will be copied to the specified directory. */
char *vn_dumpdir = NULL;
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery Some work has been done lately to improve the debugability of the ZFS pool load (and import) process. This includes: 7638 Refactor spa_load_impl into several functions 8961 SPA load/import should tell us why it failed 7277 zdb should be able to print zfs_dbgmsg's To iterate on top of that, there's a few changes that were made to make the import process more resilient and crash free. One of the first tasks during the pool load process is to parse a config provided from userland that describes what devices the pool is composed of. A vdev tree is generated from that config, and then all the vdevs are opened. The Meta Object Set (MOS) of the pool is accessed, and several metadata objects that are necessary to load the pool are read. The exact configuration of the pool is also stored inside the MOS. Since the configuration provided from userland is external and might not accurately describe the vdev tree of the pool at the txg that is being loaded, it cannot be relied upon to safely operate the pool. For that reason, the configuration in the MOS is read early on. In the past, the two configurations were compared together and if there was a mismatch then the load process was aborted and an error was returned. The latter was a good way to ensure a pool does not get corrupted, however it made the pool load process needlessly fragile in cases where the vdev configuration changed or the userland configuration was outdated. Since the MOS is stored in 3 copies, the configuration provided by userland doesn't have to be perfect in order to read its contents. Hence, a new approach has been adopted: The pool is first opened with the untrusted userland configuration just so that the real configuration can be read from the MOS. The trusted MOS configuration is then used to generate a new vdev tree and the pool is re-opened. When the pool is opened with an untrusted configuration, writes are disabled to avoid accidentally damaging it. During reads, some sanity checks are performed on block pointers to see if each DVA points to a known vdev; when the configuration is untrusted, instead of panicking the system if those checks fail we simply avoid issuing reads to the invalid DVAs. This new two-step pool load process now allows rewinding pools accross vdev tree changes such as device replacement, addition, etc. Loading a pool from an external config file in a clustering environment also becomes much safer now since the pool will import even if the config is outdated and didn't, for instance, register a recent device addition. With this code in place, it became relatively easy to implement a long-sought-after feature: the ability to import a pool with missing top level (i.e. non-redundant) devices. Note that since this almost guarantees some loss of data, this feature is for now restricted to a read-only import. Porting notes (ZTS): * Fix 'make dist' target in zpool_import * The maximum path length allowed by tar is 99 characters. Several of the new test cases exceeded this limit resulting in them not being included in the tarball. Shorten the names slightly. * Set/get tunables using accessor functions. * Get last synced txg via the "zfs_txg_history" mechanism. * Clear zinject handlers in cleanup for import_cache_device_replaced and import_rewind_device_replaced in order that the zpool can be exported if there is an error. * Increase FILESIZE to 8G in zfs-test.sh to allow for a larger ext4 file system to be created on ZFS_DISK2. Also, there's no need to partition ZFS_DISK2 at all. The partitioning had already been disabled for multipath devices. Among other things, the partitioning steals some space from the ext4 file system, makes it difficult to accurately calculate the paramters to parted and can make some of the tests fail. * Increase FS_SIZE and FILE_SIZE in the zpool_import test configuration now that FILESIZE is larger. * Write more data in order that device evacuation take lonnger in a couple tests. * Use mkdir -p to avoid errors when the directory already exists. * Remove use of sudo in import_rewind_config_changed. Authored by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Andrew Stormont <andyjstormont@gmail.com> Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://illumos.org/issues/9075 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123 Closes #7459
2016-07-22 17:39:36 +03:00
uint32_t
zone_get_hostid(void *zonep)
{
/*
* We're emulating the system's hostid in userland.
*/
(void) zonep;
return (hostid);
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery Some work has been done lately to improve the debugability of the ZFS pool load (and import) process. This includes: 7638 Refactor spa_load_impl into several functions 8961 SPA load/import should tell us why it failed 7277 zdb should be able to print zfs_dbgmsg's To iterate on top of that, there's a few changes that were made to make the import process more resilient and crash free. One of the first tasks during the pool load process is to parse a config provided from userland that describes what devices the pool is composed of. A vdev tree is generated from that config, and then all the vdevs are opened. The Meta Object Set (MOS) of the pool is accessed, and several metadata objects that are necessary to load the pool are read. The exact configuration of the pool is also stored inside the MOS. Since the configuration provided from userland is external and might not accurately describe the vdev tree of the pool at the txg that is being loaded, it cannot be relied upon to safely operate the pool. For that reason, the configuration in the MOS is read early on. In the past, the two configurations were compared together and if there was a mismatch then the load process was aborted and an error was returned. The latter was a good way to ensure a pool does not get corrupted, however it made the pool load process needlessly fragile in cases where the vdev configuration changed or the userland configuration was outdated. Since the MOS is stored in 3 copies, the configuration provided by userland doesn't have to be perfect in order to read its contents. Hence, a new approach has been adopted: The pool is first opened with the untrusted userland configuration just so that the real configuration can be read from the MOS. The trusted MOS configuration is then used to generate a new vdev tree and the pool is re-opened. When the pool is opened with an untrusted configuration, writes are disabled to avoid accidentally damaging it. During reads, some sanity checks are performed on block pointers to see if each DVA points to a known vdev; when the configuration is untrusted, instead of panicking the system if those checks fail we simply avoid issuing reads to the invalid DVAs. This new two-step pool load process now allows rewinding pools accross vdev tree changes such as device replacement, addition, etc. Loading a pool from an external config file in a clustering environment also becomes much safer now since the pool will import even if the config is outdated and didn't, for instance, register a recent device addition. With this code in place, it became relatively easy to implement a long-sought-after feature: the ability to import a pool with missing top level (i.e. non-redundant) devices. Note that since this almost guarantees some loss of data, this feature is for now restricted to a read-only import. Porting notes (ZTS): * Fix 'make dist' target in zpool_import * The maximum path length allowed by tar is 99 characters. Several of the new test cases exceeded this limit resulting in them not being included in the tarball. Shorten the names slightly. * Set/get tunables using accessor functions. * Get last synced txg via the "zfs_txg_history" mechanism. * Clear zinject handlers in cleanup for import_cache_device_replaced and import_rewind_device_replaced in order that the zpool can be exported if there is an error. * Increase FILESIZE to 8G in zfs-test.sh to allow for a larger ext4 file system to be created on ZFS_DISK2. Also, there's no need to partition ZFS_DISK2 at all. The partitioning had already been disabled for multipath devices. Among other things, the partitioning steals some space from the ext4 file system, makes it difficult to accurately calculate the paramters to parted and can make some of the tests fail. * Increase FS_SIZE and FILE_SIZE in the zpool_import test configuration now that FILESIZE is larger. * Write more data in order that device evacuation take lonnger in a couple tests. * Use mkdir -p to avoid errors when the directory already exists. * Remove use of sudo in import_rewind_config_changed. Authored by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Andrew Stormont <andyjstormont@gmail.com> Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://illumos.org/issues/9075 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123 Closes #7459
2016-07-22 17:39:36 +03:00
}
2008-11-20 23:01:55 +03:00
/*
* =========================================================================
* vnode operations
* =========================================================================
*/
2008-11-20 23:01:55 +03:00
/*
* =========================================================================
* Figure out which debugging statements to print
* =========================================================================
*/
static char *dprintf_string;
static int dprintf_print_all;
int
dprintf_find_string(const char *string)
{
char *tmp_str = dprintf_string;
int len = strlen(string);
/*
* Find out if this is a string we want to print.
* String format: file1.c,function_name1,file2.c,file3.c
*/
while (tmp_str != NULL) {
if (strncmp(tmp_str, string, len) == 0 &&
(tmp_str[len] == ',' || tmp_str[len] == '\0'))
return (1);
tmp_str = strchr(tmp_str, ',');
if (tmp_str != NULL)
tmp_str++; /* Get rid of , */
}
return (0);
}
void
dprintf_setup(int *argc, char **argv)
{
int i, j;
/*
* Debugging can be specified two ways: by setting the
* environment variable ZFS_DEBUG, or by including a
* "debug=..." argument on the command line. The command
* line setting overrides the environment variable.
*/
for (i = 1; i < *argc; i++) {
int len = strlen("debug=");
/* First look for a command line argument */
if (strncmp("debug=", argv[i], len) == 0) {
dprintf_string = argv[i] + len;
/* Remove from args */
for (j = i; j < *argc; j++)
argv[j] = argv[j+1];
argv[j] = NULL;
(*argc)--;
}
}
if (dprintf_string == NULL) {
/* Look for ZFS_DEBUG environment variable */
dprintf_string = getenv("ZFS_DEBUG");
}
/*
* Are we just turning on all debugging?
*/
if (dprintf_find_string("on"))
dprintf_print_all = 1;
if (dprintf_string != NULL)
zfs_flags |= ZFS_DEBUG_DPRINTF;
2008-11-20 23:01:55 +03:00
}
/*
* =========================================================================
* debug printfs
* =========================================================================
*/
void
__dprintf(boolean_t dprint, const char *file, const char *func,
int line, const char *fmt, ...)
2008-11-20 23:01:55 +03:00
{
/* Get rid of annoying "../common/" prefix to filename. */
const char *newfile = zfs_basename(file);
2008-11-20 23:01:55 +03:00
va_list adx;
if (dprint) {
/* dprintf messages are printed immediately */
if (!dprintf_print_all &&
!dprintf_find_string(newfile) &&
!dprintf_find_string(func))
return;
2008-11-20 23:01:55 +03:00
/* Print out just the function name if requested */
flockfile(stdout);
if (dprintf_find_string("pid"))
(void) printf("%d ", getpid());
if (dprintf_find_string("tid"))
(void) printf("%ju ",
(uintmax_t)(uintptr_t)pthread_self());
2008-11-20 23:01:55 +03:00
if (dprintf_find_string("cpu"))
(void) printf("%u ", getcpuid());
if (dprintf_find_string("time"))
(void) printf("%llu ", gethrtime());
if (dprintf_find_string("long"))
(void) printf("%s, line %d: ", newfile, line);
(void) printf("dprintf: %s: ", func);
2008-11-20 23:01:55 +03:00
va_start(adx, fmt);
(void) vprintf(fmt, adx);
va_end(adx);
funlockfile(stdout);
} else {
/* zfs_dbgmsg is logged for dumping later */
size_t size;
char *buf;
int i;
size = 1024;
buf = umem_alloc(size, UMEM_NOFAIL);
i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
if (i < size) {
va_start(adx, fmt);
(void) vsnprintf(buf + i, size - i, fmt, adx);
va_end(adx);
}
__zfs_dbgmsg(buf);
umem_free(buf, size);
2008-11-20 23:01:55 +03:00
}
}
/*
* =========================================================================
* cmn_err() and panic()
* =========================================================================
*/
static __attribute__((noreturn)) void
panic_stop_or_abort(void)
2008-11-20 23:01:55 +03:00
{
const char *stopenv = getenv("LIBZPOOL_PANIC_STOP");
if (stopenv != NULL && atoi(stopenv)) {
fputs("libzpool: LIBZPOOL_PANIC_STOP is set, sending "
"SIGSTOP to process group\n", stderr);
fflush(stderr);
kill(0, SIGSTOP);
fputs("libzpool: continued after panic stop, "
"aborting\n", stderr);
}
2008-11-20 23:01:55 +03:00
abort(); /* think of it as a "user-level crash dump" */
}
static void
vcmn_msg(int ce, const char *fmt, va_list adx)
{
switch (ce) {
case CE_IGNORE:
return;
case CE_CONT:
break;
case CE_NOTE:
fputs("libzpool: NOTICE: ", stderr);
break;
case CE_WARN:
fputs("libzpool: WARNING: ", stderr);
break;
case CE_PANIC:
fputs("libzpool: PANIC: ", stderr);
break;
default:
fputs("libzpool: [unknown severity %d]: ", stderr);
break;
}
2008-11-20 23:01:55 +03:00
vfprintf(stderr, fmt, adx);
if (ce != CE_CONT)
fputc('\n', stderr);
fflush(stderr);
2008-11-20 23:01:55 +03:00
}
void
vcmn_err(int ce, const char *fmt, va_list adx)
{
vcmn_msg(ce, fmt, adx);
2008-11-20 23:01:55 +03:00
if (ce == CE_PANIC)
panic_stop_or_abort();
2008-11-20 23:01:55 +03:00
}
void
cmn_err(int ce, const char *fmt, ...)
{
va_list adx;
va_start(adx, fmt);
vcmn_err(ce, fmt, adx);
va_end(adx);
}
__attribute__((noreturn)) void
panic(const char *fmt, ...)
{
va_list adx;
va_start(adx, fmt);
vcmn_msg(CE_PANIC, fmt, adx);
va_end(adx);
panic_stop_or_abort();
}
__attribute__((noreturn)) void
vpanic(const char *fmt, va_list adx)
{
vcmn_msg(CE_PANIC, fmt, adx);
panic_stop_or_abort();
}
2008-11-20 23:01:55 +03:00
/*
* =========================================================================
* misc routines
* =========================================================================
*/
void
delay(clock_t ticks)
{
(void) poll(0, 0, ticks * (1000 / hz));
2008-11-20 23:01:55 +03:00
}
int
ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
{
errno = 0;
*result = strtoull(str, nptr, base);
if (*result == 0)
return (errno);
return (0);
}
2008-11-20 23:01:55 +03:00
/*
* =========================================================================
* kernel emulation setup & teardown
* =========================================================================
*/
static int
umem_out_of_memory(void)
{
char errmsg[] = "out of memory -- generating core dump\n";
(void) fprintf(stderr, "%s", errmsg);
2008-11-20 23:01:55 +03:00
abort();
return (0);
}
static void
spa_config_load(void)
{
void *buf = NULL;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
char *pathname;
zfs_file_t *fp;
zfs_file_attr_t zfa;
uint64_t fsize;
int err;
/*
* Open the configuration file.
*/
pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
if (err)
err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
kmem_free(pathname, MAXPATHLEN);
if (err)
return;
if (zfs_file_getattr(fp, &zfa))
goto out;
fsize = zfa.zfa_size;
buf = kmem_alloc(fsize, KM_SLEEP);
/*
* Read the nvlist from the file.
*/
if (zfs_file_read(fp, buf, fsize, NULL) < 0)
goto out;
/*
* Unpack the nvlist.
*/
if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
goto out;
/*
* Iterate over all elements in the nvlist, creating a new spa_t for
* each one with the specified configuration.
*/
spa_namespace_enter(FTAG);
nvpair = NULL;
while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
continue;
child = fnvpair_value_nvlist(nvpair);
if (spa_lookup(nvpair_name(nvpair)) != NULL)
continue;
(void) spa_add(nvpair_name(nvpair), child, NULL);
}
spa_namespace_exit(FTAG);
nvlist_free(nvlist);
out:
if (buf != NULL)
kmem_free(buf, fsize);
zfs_file_close(fp);
}
2008-11-20 23:01:55 +03:00
void
kernel_init(int mode)
{
extern uint_t rrw_tsd_key;
libspl_init();
2008-11-20 23:01:55 +03:00
umem_nofail_callback(umem_out_of_memory);
dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem,
2008-11-20 23:01:55 +03:00
(double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0;
2008-11-20 23:01:55 +03:00
system_taskq_init();
icp_init();
Add zstd support to zfs This PR adds two new compression types, based on ZStandard: - zstd: A basic ZStandard compression algorithm Available compression. Levels for zstd are zstd-1 through zstd-19, where the compression increases with every level, but speed decreases. - zstd-fast: A faster version of the ZStandard compression algorithm zstd-fast is basically a "negative" level of zstd. The compression decreases with every level, but speed increases. Available compression levels for zstd-fast: - zstd-fast-1 through zstd-fast-10 - zstd-fast-20 through zstd-fast-100 (in increments of 10) - zstd-fast-500 and zstd-fast-1000 For more information check the man page. Implementation details: Rather than treat each level of zstd as a different algorithm (as was done historically with gzip), the block pointer `enum zio_compress` value is simply zstd for all levels, including zstd-fast, since they all use the same decompression function. The compress= property (a 64bit unsigned integer) uses the lower 7 bits to store the compression algorithm (matching the number of bits used in a block pointer, as the 8th bit was borrowed for embedded block pointers). The upper bits are used to store the compression level. It is necessary to be able to determine what compression level was used when later reading a block back, so the concept used in LZ4, where the first 32bits of the on-disk value are the size of the compressed data (since the allocation is rounded up to the nearest ashift), was extended, and we store the version of ZSTD and the level as well as the compressed size. This value is returned when decompressing a block, so that if the block needs to be recompressed (L2ARC, nop-write, etc), that the same parameters will be used to result in the matching checksum. All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`, `zio_prop_t`, etc.) uses the separated _compress and _complevel variables. Only the properties ZAP contains the combined/bit-shifted value. The combined value is split when the compression_changed_cb() callback is called, and sets both objset members (os_compress and os_complevel). The userspace tools all use the combined/bit-shifted value. Additional notes: zdb can now also decode the ZSTD compression header (flag -Z) and inspect the size, version and compression level saved in that header. For each record, if it is ZSTD compressed, the parameters of the decoded compression header get printed. ZSTD is included with all current tests and new tests are added as-needed. Per-dataset feature flags now get activated when the property is set. If a compression algorithm requires a feature flag, zfs activates the feature when the property is set, rather than waiting for the first block to be born. This is currently only used by zstd but can be extended as needed. Portions-Sponsored-By: The FreeBSD Foundation Co-authored-by: Allan Jude <allanjude@freebsd.org> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Co-authored-by: Michael Niewöhner <foss@mniewoehner.de> Signed-off-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Allan Jude <allanjude@freebsd.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Signed-off-by: Michael Niewöhner <foss@mniewoehner.de> Closes #6247 Closes #9024 Closes #10277 Closes #10278
2020-08-18 20:10:17 +03:00
zstd_init();
spa_init((spa_mode_t)mode);
spa_config_load();
fletcher_4_init();
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
2008-11-20 23:01:55 +03:00
}
void
kernel_fini(void)
{
fletcher_4_fini();
2008-11-20 23:01:55 +03:00
spa_fini();
Add zstd support to zfs This PR adds two new compression types, based on ZStandard: - zstd: A basic ZStandard compression algorithm Available compression. Levels for zstd are zstd-1 through zstd-19, where the compression increases with every level, but speed decreases. - zstd-fast: A faster version of the ZStandard compression algorithm zstd-fast is basically a "negative" level of zstd. The compression decreases with every level, but speed increases. Available compression levels for zstd-fast: - zstd-fast-1 through zstd-fast-10 - zstd-fast-20 through zstd-fast-100 (in increments of 10) - zstd-fast-500 and zstd-fast-1000 For more information check the man page. Implementation details: Rather than treat each level of zstd as a different algorithm (as was done historically with gzip), the block pointer `enum zio_compress` value is simply zstd for all levels, including zstd-fast, since they all use the same decompression function. The compress= property (a 64bit unsigned integer) uses the lower 7 bits to store the compression algorithm (matching the number of bits used in a block pointer, as the 8th bit was borrowed for embedded block pointers). The upper bits are used to store the compression level. It is necessary to be able to determine what compression level was used when later reading a block back, so the concept used in LZ4, where the first 32bits of the on-disk value are the size of the compressed data (since the allocation is rounded up to the nearest ashift), was extended, and we store the version of ZSTD and the level as well as the compressed size. This value is returned when decompressing a block, so that if the block needs to be recompressed (L2ARC, nop-write, etc), that the same parameters will be used to result in the matching checksum. All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`, `zio_prop_t`, etc.) uses the separated _compress and _complevel variables. Only the properties ZAP contains the combined/bit-shifted value. The combined value is split when the compression_changed_cb() callback is called, and sets both objset members (os_compress and os_complevel). The userspace tools all use the combined/bit-shifted value. Additional notes: zdb can now also decode the ZSTD compression header (flag -Z) and inspect the size, version and compression level saved in that header. For each record, if it is ZSTD compressed, the parameters of the decoded compression header get printed. ZSTD is included with all current tests and new tests are added as-needed. Per-dataset feature flags now get activated when the property is set. If a compression algorithm requires a feature flag, zfs activates the feature when the property is set, rather than waiting for the first block to be born. This is currently only used by zstd but can be extended as needed. Portions-Sponsored-By: The FreeBSD Foundation Co-authored-by: Allan Jude <allanjude@freebsd.org> Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov> Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Co-authored-by: Michael Niewöhner <foss@mniewoehner.de> Signed-off-by: Allan Jude <allan@klarasystems.com> Signed-off-by: Allan Jude <allanjude@freebsd.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com> Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Signed-off-by: Michael Niewöhner <foss@mniewoehner.de> Closes #6247 Closes #9024 Closes #10277 Closes #10278
2020-08-18 20:10:17 +03:00
zstd_fini();
icp_fini();
system_taskq_fini();
libspl_fini();
2008-11-20 23:01:55 +03:00
}
zfs_file_t *
zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
(void) fd;
*minorp = 0;
return (NULL);
}
void
zfs_onexit_fd_rele(zfs_file_t *fp)
{
(void) fp;
}
int
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
uintptr_t *action_handle)
{
(void) minor, (void) func, (void) data, (void) action_handle;
return (0);
}
void
zvol_create_minors(const char *name)
{
(void) name;
}
void
zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
{
(void) spa, (void) name, (void) async;
}
void
zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
boolean_t async)
{
(void) spa, (void) oldname, (void) newname, (void) async;
}
/*
* Open file
*
* path - fully qualified path to file
* flags - file attributes O_READ / O_WRITE / O_EXCL
* fpp - pointer to return file pointer
*
* Returns 0 on success underlying error on failure.
*/
int
zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
{
int fd;
int dump_fd;
int err;
int old_umask = 0;
zfs_file_t *fp;
struct stat64 st;
if (!(flags & O_CREAT) && stat64(path, &st) == -1)
return (errno);
if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))
flags |= O_DIRECT;
if (flags & O_CREAT)
old_umask = umask(0);
fd = open64(path, flags, mode);
if (fd == -1)
return (errno);
if (flags & O_CREAT)
(void) umask(old_umask);
if (vn_dumpdir != NULL) {
char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
const char *inpath = zfs_basename(path);
(void) snprintf(dumppath, MAXPATHLEN,
"%s/%s", vn_dumpdir, inpath);
dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
umem_free(dumppath, MAXPATHLEN);
if (dump_fd == -1) {
err = errno;
close(fd);
return (err);
}
} else {
dump_fd = -1;
}
(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
fp->f_fd = fd;
fp->f_dump_fd = dump_fd;
*fpp = fp;
return (0);
}
void
zfs_file_close(zfs_file_t *fp)
{
close(fp->f_fd);
if (fp->f_dump_fd != -1)
close(fp->f_dump_fd);
umem_free(fp, sizeof (zfs_file_t));
}
/*
* Stateful write - use os internal file pointer to determine where to
* write and update on successful completion.
*
* fp - pointer to file (pipe, socket, etc) to write to
* buf - buffer to write
* count - # of bytes to write
* resid - pointer to count of unwritten bytes (if short write)
*
* Returns 0 on success errno on failure.
*/
int
zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
{
ssize_t rc;
rc = write(fp->f_fd, buf, count);
if (rc < 0)
return (errno);
if (resid) {
*resid = count - rc;
} else if (rc != count) {
return (EIO);
}
return (0);
}
/*
* Stateless write - os internal file pointer is not updated.
*
* fp - pointer to file (pipe, socket, etc) to write to
* buf - buffer to write
* count - # of bytes to write
* off - file offset to write to (only valid for seekable types)
* resid - pointer to count of unwritten bytes
*
* Returns 0 on success errno on failure.
*/
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf,
size_t count, loff_t pos, uint8_t ashift, ssize_t *resid)
{
ssize_t rc, split, done;
int sectors;
/*
* To simulate partial disk writes, we split writes into two
* system calls so that the process can be killed in between.
* This is used by ztest to simulate realistic failure modes.
*/
sectors = count >> ashift;
split = (sectors > 0 ? rand() % sectors : 0) << ashift;
rc = pwrite64(fp->f_fd, buf, split, pos);
if (rc != -1) {
done = rc;
rc = pwrite64(fp->f_fd, (char *)buf + split,
count - split, pos + split);
}
#ifdef __linux__
if (rc == -1 && errno == EINVAL) {
/*
* Under Linux, this most likely means an alignment issue
* (memory or disk) due to O_DIRECT, so we abort() in order
* to catch the offender.
*/
abort();
}
#endif
if (rc < 0)
return (errno);
done += rc;
if (resid) {
*resid = count - done;
} else if (done != count) {
return (EIO);
}
return (0);
}
/*
* Stateful read - use os internal file pointer to determine where to
* read and update on successful completion.
*
* fp - pointer to file (pipe, socket, etc) to read from
* buf - buffer to write
* count - # of bytes to read
* resid - pointer to count of unread bytes (if short read)
*
* Returns 0 on success errno on failure.
*/
int
zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
{
int rc;
rc = read(fp->f_fd, buf, count);
if (rc < 0)
return (errno);
if (resid) {
*resid = count - rc;
} else if (rc != count) {
return (EIO);
}
return (0);
}
/*
* Stateless read - os internal file pointer is not updated.
*
* fp - pointer to file (pipe, socket, etc) to read from
* buf - buffer to write
* count - # of bytes to write
* off - file offset to read from (only valid for seekable types)
* resid - pointer to count of unwritten bytes (if short write)
*
* Returns 0 on success errno on failure.
*/
int
zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
ssize_t *resid)
{
ssize_t rc;
rc = pread64(fp->f_fd, buf, count, off);
if (rc < 0) {
#ifdef __linux__
/*
* Under Linux, this most likely means an alignment issue
* (memory or disk) due to O_DIRECT, so we abort() in order to
* catch the offender.
*/
if (errno == EINVAL)
abort();
#endif
return (errno);
}
if (fp->f_dump_fd != -1) {
int status;
status = pwrite64(fp->f_dump_fd, buf, rc, off);
ASSERT(status != -1);
}
if (resid) {
*resid = count - rc;
} else if (rc != count) {
return (EIO);
}
return (0);
}
/*
* lseek - set / get file pointer
*
* fp - pointer to file (pipe, socket, etc) to read from
* offp - value to seek to, returns current value plus passed offset
* whence - see man pages for standard lseek whence values
*
* Returns 0 on success errno on failure (ESPIPE for non seekable types)
*/
int
zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
{
loff_t rc;
rc = lseek(fp->f_fd, *offp, whence);
if (rc < 0)
return (errno);
*offp = rc;
return (0);
}
/*
* Get file attributes
*
* filp - file pointer
* zfattr - pointer to file attr structure
*
* Currently only used for fetching size and file mode
*
* Returns 0 on success or error code of underlying getattr call on failure.
*/
int
zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
{
struct stat64 st;
if (fstat64_blk(fp->f_fd, &st) == -1)
return (errno);
zfattr->zfa_size = st.st_size;
zfattr->zfa_mode = st.st_mode;
return (0);
}
/*
* Sync file to disk
*
* filp - file pointer
* flags - O_SYNC and or O_DSYNC
*
* Returns 0 on success or error code of underlying sync call on failure.
*/
int
zfs_file_fsync(zfs_file_t *fp, int flags)
{
(void) flags;
if (fsync(fp->f_fd) < 0)
return (errno);
return (0);
}
/*
* deallocate - zero and/or deallocate file storage
*
* fp - file pointer
* offset - offset to start zeroing or deallocating
* len - length to zero or deallocate
*/
int
zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len)
{
int rc;
#if defined(__linux__)
rc = fallocate(fp->f_fd,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
#elif defined(__FreeBSD__) && (__FreeBSD_version >= 1400029)
struct spacectl_range rqsr = {
.r_offset = offset,
.r_len = len,
};
rc = fspacectl(fp->f_fd, SPACECTL_DEALLOC, &rqsr, 0, &rqsr);
#else
(void) fp, (void) offset, (void) len;
rc = EOPNOTSUPP;
#endif
if (rc)
return (SET_ERROR(rc));
return (0);
}
/*
* Request current file pointer offset
*
* fp - pointer to file
*
* Returns current file offset.
*/
loff_t
zfs_file_off(zfs_file_t *fp)
{
return (lseek(fp->f_fd, SEEK_CUR, 0));
}
/*
* unlink file
*
* path - fully qualified file path
*
* Returns 0 on success.
*
* OPTIONAL
*/
int
zfs_file_unlink(const char *path)
{
return (remove(path));
}
/*
* Get reference to file pointer
*
* fd - input file descriptor
*
* Returns pointer to file struct or NULL.
* Unsupported in user space.
*/
zfs_file_t *
zfs_file_get(int fd)
{
(void) fd;
abort();
return (NULL);
}
/*
* Drop reference to file pointer
*
* fp - pointer to file struct
*
* Unsupported in user space.
*/
void
zfs_file_put(zfs_file_t *fp)
{
abort();
(void) fp;
}
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
(void) oldname, (void) newname;
}
void
spa_import_os(spa_t *spa)
{
(void) spa;
}
void
spa_export_os(spa_t *spa)
{
(void) spa;
}
void
spa_activate_os(spa_t *spa)
{
(void) spa;
}
void
spa_deactivate_os(spa_t *spa)
{
(void) spa;
}