2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2012-12-14 03:24:15 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2015-07-24 19:53:55 +03:00
|
|
|
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
2015-06-25 07:05:32 +03:00
|
|
|
* Copyright (c) 2015, Intel Corporation.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
2013-03-25 01:24:51 +04:00
|
|
|
#include <unistd.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <stdio_ext.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/spa_impl.h>
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
#include <sys/zap.h>
|
|
|
|
#include <sys/fs/zfs.h>
|
|
|
|
#include <sys/zfs_znode.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/zfs_sa.h>
|
|
|
|
#include <sys/sa.h>
|
|
|
|
#include <sys/sa_impl.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/vdev.h>
|
|
|
|
#include <sys/vdev_impl.h>
|
|
|
|
#include <sys/metaslab_impl.h>
|
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/dsl_dir.h>
|
|
|
|
#include <sys/dsl_dataset.h>
|
|
|
|
#include <sys/dsl_pool.h>
|
|
|
|
#include <sys/dbuf.h>
|
|
|
|
#include <sys/zil.h>
|
|
|
|
#include <sys/zil_impl.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/resource.h>
|
|
|
|
#include <sys/dmu_traverse.h>
|
|
|
|
#include <sys/zio_checksum.h>
|
|
|
|
#include <sys/zio_compress.h>
|
|
|
|
#include <sys/zfs_fuid.h>
|
2008-12-03 23:09:06 +03:00
|
|
|
#include <sys/arc.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/ddt.h>
|
2012-12-14 03:24:15 +04:00
|
|
|
#include <sys/zfeature.h>
|
2013-08-28 15:45:09 +04:00
|
|
|
#include <zfs_comutil.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <libzfs.h>
|
|
|
|
|
2013-01-12 04:42:50 +04:00
|
|
|
/*
 * Bounds-checked lookups used when printing on-disk identifiers.  Each
 * name macro yields the table entry's name when idx is below the table
 * bound and the literal "UNKNOWN" otherwise.  Note that idx is evaluated
 * more than once, so it must not have side effects.
 */
#define	ZDB_COMPRESS_NAME(idx)						\
	(((idx) < ZIO_COMPRESS_FUNCTIONS) ?				\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")

#define	ZDB_CHECKSUM_NAME(idx)						\
	(((idx) < ZIO_CHECKSUM_FUNCTIONS) ?				\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")

/*
 * Object type name: use dmu_ot[] for indices below DMU_OT_NUMTYPES,
 * otherwise the dmu_ot_byteswap[] name if DMU_OT_IS_VALID(idx) holds.
 */
#define	ZDB_OT_NAME(idx)						\
	(((idx) < DMU_OT_NUMTYPES) ?					\
	dmu_ot[(idx)].ot_name :						\
	(DMU_OT_IS_VALID(idx) ?						\
	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN"))

/*
 * Map an object type onto an index no larger than DMU_OT_NUMTYPES:
 * indices below DMU_OT_NUMTYPES map to themselves, the two DMU_OTN_ZAP_*
 * types map to DMU_OT_ZAP_OTHER, and anything else maps to
 * DMU_OT_NUMTYPES.
 */
#define	ZDB_OT_TYPE(idx)						\
	(((idx) < DMU_OT_NUMTYPES) ? (idx) :				\
	((((idx) == DMU_OTN_ZAP_DATA) ||				\
	((idx) == DMU_OTN_ZAP_METADATA)) ?				\
	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
 * Tunables referenced by zdb.  Normally these are external symbols; when
 * the file is processed with "lint" defined, plain definitions are
 * supplied here instead.
 */
#ifndef lint
extern int zfs_recover;
extern uint64_t zfs_arc_max;
extern uint64_t zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
#else
int zfs_recover;
uint64_t zfs_arc_max;
uint64_t zfs_arc_meta_limit;
int zfs_vdev_async_read_max_active;
#endif
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
const char cmdname[] = "zdb";
|
|
|
|
uint8_t dump_opt[256];
|
|
|
|
|
|
|
|
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
|
|
|
|
|
|
|
|
extern void dump_intent_log(zilog_t *);
|
|
|
|
uint64_t *zopt_object = NULL;
|
|
|
|
int zopt_objects = 0;
|
|
|
|
libzfs_handle_t *g_zfs;
|
2014-09-17 00:24:48 +04:00
|
|
|
uint64_t max_inflight = 1000;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2015-04-27 01:27:36 +03:00
|
|
|
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* These libumem hooks provide a reasonable set of defaults for the allocator's
|
|
|
|
* debugging facilities.
|
|
|
|
*/
|
|
|
|
/*
 * Returns the string to be used as the $UMEM_DEBUG setting.
 */
const char *
_umem_debug_init(void)
{
	static const char umem_debug_setting[] = "default,verbose";

	return (umem_debug_setting);
}
|
|
|
|
|
|
|
|
/*
 * Returns the string to be used as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging_setting[] = "fail,contents";

	return (umem_logging_setting);
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
usage(void)
|
|
|
|
{
|
|
|
|
(void) fprintf(stderr,
|
2014-07-20 00:19:24 +04:00
|
|
|
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
|
2016-01-01 16:42:58 +03:00
|
|
|
"[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
|
2013-05-03 03:36:32 +04:00
|
|
|
" %s [-divPA] [-e -p path...] [-U config] dataset "
|
|
|
|
"[object...]\n"
|
2014-07-20 00:19:24 +04:00
|
|
|
" %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
|
2012-02-04 09:44:53 +04:00
|
|
|
"poolname [vdev [metaslab...]]\n"
|
|
|
|
" %s -R [-A] [-e [-p path...]] poolname "
|
|
|
|
"vdev:offset:size[:flags]\n"
|
2013-05-03 03:36:32 +04:00
|
|
|
" %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
|
2012-02-04 09:44:53 +04:00
|
|
|
" %s -l [-uA] device\n"
|
|
|
|
" %s -C [-A] [-U config]\n\n",
|
2010-05-29 00:45:14 +04:00
|
|
|
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
|
|
|
|
|
|
|
|
(void) fprintf(stderr, " Dataset name must include at least one "
|
|
|
|
"separator character '/' or '@'\n");
|
|
|
|
(void) fprintf(stderr, " If dataset name is specified, only that "
|
|
|
|
"dataset is dumped\n");
|
|
|
|
(void) fprintf(stderr, " If object numbers are specified, only "
|
|
|
|
"those objects are dumped\n\n");
|
|
|
|
(void) fprintf(stderr, " Options to control amount of output:\n");
|
|
|
|
(void) fprintf(stderr, " -u uberblock\n");
|
|
|
|
(void) fprintf(stderr, " -d dataset(s)\n");
|
|
|
|
(void) fprintf(stderr, " -i intent logs\n");
|
|
|
|
(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
|
|
|
|
(void) fprintf(stderr, " -h pool history\n");
|
|
|
|
(void) fprintf(stderr, " -b block statistics\n");
|
|
|
|
(void) fprintf(stderr, " -m metaslabs\n");
|
2014-07-20 00:19:24 +04:00
|
|
|
(void) fprintf(stderr, " -M metaslab groups\n");
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, " -c checksum all metadata (twice for "
|
2009-07-03 02:44:48 +04:00
|
|
|
"all data) blocks\n");
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
|
|
|
|
(void) fprintf(stderr, " -D dedup statistics\n");
|
|
|
|
(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
|
|
|
|
(void) fprintf(stderr, " -v verbose (applies to all others)\n");
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) fprintf(stderr, " -l dump label contents\n");
|
2009-01-16 00:59:39 +03:00
|
|
|
(void) fprintf(stderr, " -L disable leak tracking (do not "
|
|
|
|
"load spacemaps)\n");
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) fprintf(stderr, " -R read and display block from a "
|
2010-05-29 00:45:14 +04:00
|
|
|
"device\n\n");
|
|
|
|
(void) fprintf(stderr, " Below options are intended for use "
|
2016-01-01 16:42:58 +03:00
|
|
|
"with other options:\n");
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, " -A ignore assertions (-A), enable "
|
|
|
|
"panic recovery (-AA) or both (-AAA)\n");
|
|
|
|
(void) fprintf(stderr, " -F attempt automatic rewind within "
|
|
|
|
"safe range of transaction groups\n");
|
|
|
|
(void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
|
|
|
|
"cachefile\n");
|
|
|
|
(void) fprintf(stderr, " -X attempt extreme rewind (does not "
|
|
|
|
"work with dataset)\n");
|
|
|
|
(void) fprintf(stderr, " -e pool is exported/destroyed/"
|
|
|
|
"has altroot/not in a cachefile\n");
|
|
|
|
(void) fprintf(stderr, " -p <path> -- use one or more with "
|
|
|
|
"-e to specify path to vdev dir\n");
|
2016-01-01 16:42:58 +03:00
|
|
|
(void) fprintf(stderr, " -x <dumpdir> -- "
|
|
|
|
"dump all read blocks into specified directory\n");
|
2016-05-23 20:20:42 +03:00
|
|
|
(void) fprintf(stderr, " -P print numbers in parsable form\n");
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
|
2009-01-16 00:59:39 +03:00
|
|
|
"searching for uberblocks\n");
|
2014-07-20 00:19:24 +04:00
|
|
|
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
|
2016-01-01 16:42:58 +03:00
|
|
|
"specify the maximum number of "
|
|
|
|
"checksumming I/Os [default is 200]\n");
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
|
|
|
|
"to make only that option verbose\n");
|
|
|
|
(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* Called for usage errors that are discovered after a call to spa_open(),
|
|
|
|
* dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
|
|
|
|
*/
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
fatal(const char *fmt, ...)
|
|
|
|
{
|
|
|
|
va_list ap;
|
|
|
|
|
|
|
|
va_start(ap, fmt);
|
|
|
|
(void) fprintf(stderr, "%s: ", cmdname);
|
|
|
|
(void) vfprintf(stderr, fmt, ap);
|
|
|
|
va_end(ap);
|
|
|
|
(void) fprintf(stderr, "\n");
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
exit(1);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
nvlist_t *nv;
|
|
|
|
size_t nvsize = *(uint64_t *)data;
|
|
|
|
char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
|
|
|
|
|
|
|
|
umem_free(packed, nvsize);
|
|
|
|
|
|
|
|
dump_nvlist(nv, 8);
|
|
|
|
|
|
|
|
nvlist_free(nv);
|
|
|
|
}
|
|
|
|
|
2013-08-28 15:45:09 +04:00
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
spa_history_phys_t *shp = data;
|
|
|
|
|
|
|
|
if (shp == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
(void) printf("\t\tpool_create_len = %llu\n",
|
|
|
|
(u_longlong_t)shp->sh_pool_create_len);
|
|
|
|
(void) printf("\t\tphys_max_off = %llu\n",
|
|
|
|
(u_longlong_t)shp->sh_phys_max_off);
|
|
|
|
(void) printf("\t\tbof = %llu\n",
|
|
|
|
(u_longlong_t)shp->sh_bof);
|
|
|
|
(void) printf("\t\teof = %llu\n",
|
|
|
|
(u_longlong_t)shp->sh_eof);
|
|
|
|
(void) printf("\t\trecords_lost = %llu\n",
|
|
|
|
(u_longlong_t)shp->sh_records_lost);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
zdb_nicenum(uint64_t num, char *buf)
|
|
|
|
{
|
|
|
|
if (dump_opt['P'])
|
|
|
|
(void) sprintf(buf, "%llu", (longlong_t)num);
|
|
|
|
else
|
|
|
|
nicenum(num, buf);
|
|
|
|
}
|
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
/*
 * Bar used by dump_histogram(): a pointer into this string at offset k
 * prints (histo_width - k) stars.  histo_width is the number of stars,
 * i.e. the string length without its terminating NUL.
 */
const char histo_stars[] = "****************************************";
const int histo_width = (int)(sizeof (histo_stars) - 1);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
static void
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(const uint64_t *histo, int size, int offset)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
int i;
|
2013-03-25 01:24:51 +04:00
|
|
|
int minidx = size - 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
int maxidx = 0;
|
|
|
|
uint64_t max = 0;
|
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
for (i = 0; i < size; i++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (histo[i] > max)
|
|
|
|
max = histo[i];
|
|
|
|
if (histo[i] > 0 && i > maxidx)
|
|
|
|
maxidx = i;
|
|
|
|
if (histo[i] > 0 && i < minidx)
|
|
|
|
minidx = i;
|
|
|
|
}
|
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
if (max < histo_width)
|
|
|
|
max = histo_width;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
for (i = minidx; i <= maxidx; i++) {
|
|
|
|
(void) printf("\t\t\t%3u: %6llu %s\n",
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
i + offset, (u_longlong_t)histo[i],
|
2013-03-25 01:24:51 +04:00
|
|
|
&histo_stars[(max - histo[i]) * histo_width / max]);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_zap_stats(objset_t *os, uint64_t object)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
zap_stats_t zs;
|
|
|
|
|
|
|
|
error = zap_get_stats(os, object, &zs);
|
|
|
|
if (error)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (zs.zs_ptrtbl_len == 0) {
|
|
|
|
ASSERT(zs.zs_num_blocks == 1);
|
|
|
|
(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
|
|
|
|
(u_longlong_t)zs.zs_blocksize,
|
|
|
|
(u_longlong_t)zs.zs_num_entries);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) printf("\tFat ZAP stats:\n");
|
|
|
|
|
|
|
|
(void) printf("\t\tPointer table:\n");
|
|
|
|
(void) printf("\t\t\t%llu elements\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_len);
|
|
|
|
(void) printf("\t\t\tzt_blk: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_zt_blk);
|
|
|
|
(void) printf("\t\t\tzt_numblks: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_zt_numblks);
|
|
|
|
(void) printf("\t\t\tzt_shift: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_zt_shift);
|
|
|
|
(void) printf("\t\t\tzt_blks_copied: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_blks_copied);
|
|
|
|
(void) printf("\t\t\tzt_nextblk: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_ptrtbl_nextblk);
|
|
|
|
|
|
|
|
(void) printf("\t\tZAP entries: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_num_entries);
|
|
|
|
(void) printf("\t\tLeaf blocks: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_num_leafs);
|
|
|
|
(void) printf("\t\tTotal blocks: %llu\n",
|
|
|
|
(u_longlong_t)zs.zs_num_blocks);
|
|
|
|
(void) printf("\t\tzap_block_type: 0x%llx\n",
|
|
|
|
(u_longlong_t)zs.zs_block_type);
|
|
|
|
(void) printf("\t\tzap_magic: 0x%llx\n",
|
|
|
|
(u_longlong_t)zs.zs_magic);
|
|
|
|
(void) printf("\t\tzap_salt: 0x%llx\n",
|
|
|
|
(u_longlong_t)zs.zs_salt);
|
|
|
|
|
|
|
|
(void) printf("\t\tLeafs with 2^n pointers:\n");
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\t\tBlocks with n*5 entries:\n");
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\t\tBlocks n/10 full:\n");
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\t\tEntries with n chunks:\n");
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\t\tBuckets with n entries:\n");
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
(void) printf("\tUNKNOWN OBJECT TYPE\n");
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
void
|
|
|
|
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t attr;
|
|
|
|
void *prop;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
dump_zap_stats(os, object);
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
for (zap_cursor_init(&zc, os, object);
|
|
|
|
zap_cursor_retrieve(&zc, &attr) == 0;
|
|
|
|
zap_cursor_advance(&zc)) {
|
|
|
|
(void) printf("\t\t%s = ", attr.za_name);
|
|
|
|
if (attr.za_num_integers == 0) {
|
|
|
|
(void) printf("\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
prop = umem_zalloc(attr.za_num_integers *
|
|
|
|
attr.za_integer_length, UMEM_NOFAIL);
|
|
|
|
(void) zap_lookup(os, object, attr.za_name,
|
|
|
|
attr.za_integer_length, attr.za_num_integers, prop);
|
|
|
|
if (attr.za_integer_length == 1) {
|
|
|
|
(void) printf("%s", (char *)prop);
|
|
|
|
} else {
|
|
|
|
for (i = 0; i < attr.za_num_integers; i++) {
|
|
|
|
switch (attr.za_integer_length) {
|
|
|
|
case 2:
|
|
|
|
(void) printf("%u ",
|
|
|
|
((uint16_t *)prop)[i]);
|
|
|
|
break;
|
|
|
|
case 4:
|
|
|
|
(void) printf("%u ",
|
|
|
|
((uint32_t *)prop)[i]);
|
|
|
|
break;
|
|
|
|
case 8:
|
|
|
|
(void) printf("%lld ",
|
|
|
|
(u_longlong_t)((int64_t *)prop)[i]);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
(void) printf("\n");
|
|
|
|
umem_free(prop, attr.za_num_integers * attr.za_integer_length);
|
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
}
|
|
|
|
|
2015-04-27 01:27:36 +03:00
|
|
|
static void
|
|
|
|
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
bpobj_phys_t *bpop = data;
|
|
|
|
uint64_t i;
|
|
|
|
char bytes[32], comp[32], uncomp[32];
|
|
|
|
|
|
|
|
if (bpop == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
zdb_nicenum(bpop->bpo_bytes, bytes);
|
|
|
|
zdb_nicenum(bpop->bpo_comp, comp);
|
|
|
|
zdb_nicenum(bpop->bpo_uncomp, uncomp);
|
|
|
|
|
|
|
|
(void) printf("\t\tnum_blkptrs = %llu\n",
|
|
|
|
(u_longlong_t)bpop->bpo_num_blkptrs);
|
|
|
|
(void) printf("\t\tbytes = %s\n", bytes);
|
|
|
|
if (size >= BPOBJ_SIZE_V1) {
|
|
|
|
(void) printf("\t\tcomp = %s\n", comp);
|
|
|
|
(void) printf("\t\tuncomp = %s\n", uncomp);
|
|
|
|
}
|
|
|
|
if (size >= sizeof (*bpop)) {
|
|
|
|
(void) printf("\t\tsubobjs = %llu\n",
|
|
|
|
(u_longlong_t)bpop->bpo_subobjs);
|
|
|
|
(void) printf("\t\tnum_subobjs = %llu\n",
|
|
|
|
(u_longlong_t)bpop->bpo_num_subobjs);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_opt['d'] < 5)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
blkptr_t bp;
|
|
|
|
|
|
|
|
int err = dmu_read(os, object,
|
|
|
|
i * sizeof (bp), sizeof (bp), &bp, 0);
|
|
|
|
if (err != 0) {
|
|
|
|
(void) printf("got error %u from dmu_read\n", err);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
|
|
|
|
(void) printf("\t%s\n", blkbuf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
dmu_object_info_t doi;
|
2015-10-09 21:28:12 +03:00
|
|
|
int64_t i;
|
2015-04-27 01:27:36 +03:00
|
|
|
|
|
|
|
VERIFY0(dmu_object_info(os, object, &doi));
|
|
|
|
uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
|
|
|
|
|
|
|
|
int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
|
|
|
|
if (err != 0) {
|
|
|
|
(void) printf("got error %u from dmu_read\n", err);
|
|
|
|
kmem_free(subobjs, doi.doi_max_offset);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t last_nonzero = -1;
|
|
|
|
for (i = 0; i < doi.doi_max_offset / 8; i++) {
|
|
|
|
if (subobjs[i] != 0)
|
|
|
|
last_nonzero = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i <= last_nonzero; i++) {
|
2015-10-09 21:28:12 +03:00
|
|
|
(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
|
2015-04-27 01:27:36 +03:00
|
|
|
}
|
|
|
|
kmem_free(subobjs, doi.doi_max_offset);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
dump_zap_stats(os, object);
|
|
|
|
/* contents are printed elsewhere, properly decoded */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t attr;
|
|
|
|
|
|
|
|
dump_zap_stats(os, object);
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
for (zap_cursor_init(&zc, os, object);
|
|
|
|
zap_cursor_retrieve(&zc, &attr) == 0;
|
|
|
|
zap_cursor_advance(&zc)) {
|
|
|
|
(void) printf("\t\t%s = ", attr.za_name);
|
|
|
|
if (attr.za_num_integers == 0) {
|
|
|
|
(void) printf("\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
(void) printf(" %llx : [%d:%d:%d]\n",
|
|
|
|
(u_longlong_t)attr.za_first_integer,
|
|
|
|
(int)ATTR_LENGTH(attr.za_first_integer),
|
|
|
|
(int)ATTR_BSWAP(attr.za_first_integer),
|
|
|
|
(int)ATTR_NUM(attr.za_first_integer));
|
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t attr;
|
|
|
|
uint16_t *layout_attrs;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
dump_zap_stats(os, object);
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
for (zap_cursor_init(&zc, os, object);
|
|
|
|
zap_cursor_retrieve(&zc, &attr) == 0;
|
|
|
|
zap_cursor_advance(&zc)) {
|
|
|
|
(void) printf("\t\t%s = [", attr.za_name);
|
|
|
|
if (attr.za_num_integers == 0) {
|
|
|
|
(void) printf("\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
VERIFY(attr.za_integer_length == 2);
|
|
|
|
layout_attrs = umem_zalloc(attr.za_num_integers *
|
|
|
|
attr.za_integer_length, UMEM_NOFAIL);
|
|
|
|
|
|
|
|
VERIFY(zap_lookup(os, object, attr.za_name,
|
|
|
|
attr.za_integer_length,
|
|
|
|
attr.za_num_integers, layout_attrs) == 0);
|
|
|
|
|
|
|
|
for (i = 0; i != attr.za_num_integers; i++)
|
|
|
|
(void) printf(" %d ", (int)layout_attrs[i]);
|
|
|
|
(void) printf("]\n");
|
|
|
|
umem_free(layout_attrs,
|
|
|
|
attr.za_num_integers * attr.za_integer_length);
|
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t attr;
|
|
|
|
const char *typenames[] = {
|
|
|
|
/* 0 */ "not specified",
|
|
|
|
/* 1 */ "FIFO",
|
|
|
|
/* 2 */ "Character Device",
|
|
|
|
/* 3 */ "3 (invalid)",
|
|
|
|
/* 4 */ "Directory",
|
|
|
|
/* 5 */ "5 (invalid)",
|
|
|
|
/* 6 */ "Block Device",
|
|
|
|
/* 7 */ "7 (invalid)",
|
|
|
|
/* 8 */ "Regular File",
|
|
|
|
/* 9 */ "9 (invalid)",
|
|
|
|
/* 10 */ "Symbolic Link",
|
|
|
|
/* 11 */ "11 (invalid)",
|
|
|
|
/* 12 */ "Socket",
|
|
|
|
/* 13 */ "Door",
|
|
|
|
/* 14 */ "Event Port",
|
|
|
|
/* 15 */ "15 (invalid)",
|
|
|
|
};
|
|
|
|
|
|
|
|
dump_zap_stats(os, object);
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
for (zap_cursor_init(&zc, os, object);
|
|
|
|
zap_cursor_retrieve(&zc, &attr) == 0;
|
|
|
|
zap_cursor_advance(&zc)) {
|
|
|
|
(void) printf("\t\t%s = %lld (type: %s)\n",
|
|
|
|
attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
|
|
|
|
typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
|
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
}
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to 4K, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
int
|
|
|
|
get_dtl_refcount(vdev_t *vd)
|
|
|
|
{
|
|
|
|
int refcount = 0;
|
|
|
|
int c;
|
|
|
|
|
|
|
|
if (vd->vdev_ops->vdev_op_leaf) {
|
|
|
|
space_map_t *sm = vd->vdev_dtl_sm;
|
|
|
|
|
|
|
|
if (sm != NULL &&
|
|
|
|
sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
|
|
|
|
return (1);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
|
|
|
refcount += get_dtl_refcount(vd->vdev_child[c]);
|
|
|
|
return (refcount);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
get_metaslab_refcount(vdev_t *vd)
|
|
|
|
{
|
|
|
|
int refcount = 0;
|
|
|
|
int c, m;
|
|
|
|
|
2014-07-20 00:19:24 +04:00
|
|
|
if (vd->vdev_top == vd && !vd->vdev_removing) {
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
for (m = 0; m < vd->vdev_ms_count; m++) {
|
|
|
|
space_map_t *sm = vd->vdev_ms[m]->ms_sm;
|
|
|
|
|
|
|
|
if (sm != NULL &&
|
|
|
|
sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
|
|
|
|
refcount++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
|
|
|
refcount += get_metaslab_refcount(vd->vdev_child[c]);
|
|
|
|
|
|
|
|
return (refcount);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
verify_spacemap_refcounts(spa_t *spa)
|
|
|
|
{
|
2013-10-08 21:13:05 +04:00
|
|
|
uint64_t expected_refcount = 0;
|
|
|
|
uint64_t actual_refcount;
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
|
2013-10-08 21:13:05 +04:00
|
|
|
(void) feature_get_refcount(spa,
|
|
|
|
&spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
|
|
|
|
&expected_refcount);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
|
|
|
|
actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
|
|
|
|
|
|
|
|
if (expected_refcount != actual_refcount) {
|
2013-10-08 21:13:05 +04:00
|
|
|
(void) printf("space map refcount mismatch: expected %lld != "
|
|
|
|
"actual %lld\n",
|
|
|
|
(longlong_t)expected_refcount,
|
|
|
|
(longlong_t)actual_refcount);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
return (2);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to 4K, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_spacemap(objset_t *os, space_map_t *sm)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
uint64_t alloc, offset, entry;
|
|
|
|
char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
|
|
|
|
"INVALID", "INVALID", "INVALID", "INVALID" };
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
if (sm == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Print out the freelist entries in both encoded and decoded form.
|
|
|
|
*/
|
|
|
|
alloc = 0;
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
for (offset = 0; offset < space_map_length(sm);
|
|
|
|
offset += sizeof (entry)) {
|
|
|
|
uint8_t mapshift = sm->sm_shift;
|
|
|
|
|
|
|
|
VERIFY0(dmu_read(os, space_map_object(sm), offset,
|
2009-07-03 02:44:48 +04:00
|
|
|
sizeof (entry), &entry, DMU_READ_PREFETCH));
|
2008-11-20 23:01:55 +03:00
|
|
|
if (SM_DEBUG_DECODE(entry)) {
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)(offset / sizeof (entry)),
|
|
|
|
ddata[SM_DEBUG_ACTION_DECODE(entry)],
|
|
|
|
(u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
|
|
|
|
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
|
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\t [%6llu] %c range:"
|
|
|
|
" %010llx-%010llx size: %06llx\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)(offset / sizeof (entry)),
|
|
|
|
SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
|
|
|
|
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
mapshift) + sm->sm_start),
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)((SM_OFFSET_DECODE(entry) <<
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
mapshift) + sm->sm_start +
|
|
|
|
(SM_RUN_DECODE(entry) << mapshift)),
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
|
|
|
|
if (SM_TYPE_DECODE(entry) == SM_ALLOC)
|
|
|
|
alloc += SM_RUN_DECODE(entry) << mapshift;
|
|
|
|
else
|
|
|
|
alloc -= SM_RUN_DECODE(entry) << mapshift;
|
|
|
|
}
|
|
|
|
}
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
if (alloc != space_map_allocated(sm)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("space_map_object alloc (%llu) INCONSISTENT "
|
|
|
|
"with space map summary (%llu)\n",
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increased. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
(u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
 * Print a one-line summary of a metaslab followed by its in-memory
 * histogram.  The summary line reports the node count of the metaslab
 * size tree (labelled "segments"), the value of metaslab_block_maxsize()
 * as formatted by zdb_nicenum() (labelled "maxsize"), and the share of
 * the metaslab covered by its range tree, as a percentage (labelled
 * "freepct").
 *
 * msp: the metaslab to report on; its ms_tree, ms_size_tree and ms_size
 *      fields are read.
 */
static void
|
|
|
|
dump_metaslab_stats(metaslab_t *msp)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
char maxbuf[32];
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
range_tree_t *rt = msp->ms_tree;
|
|
|
|
avl_tree_t *t = &msp->ms_size_tree;
|
|
|
|
/*
 * Percentage of the metaslab accounted for by the range tree.
 * NOTE(review): divides by msp->ms_size; assumes it is nonzero - confirm.
 */
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
|
2009-07-03 02:44:48 +04:00
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
/* Format the metaslab_block_maxsize() result into maxbuf for printing. */
zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
|
2009-07-03 02:44:48 +04:00
|
|
|
"segments", avl_numnodes(t), "maxsize", maxbuf,
|
|
|
|
"freepct", free_pct);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
(void) printf("\tIn-memory histogram:\n");
|
|
|
|
/* Dump the histogram array kept on the range tree itself. */
dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dump_metaslab(metaslab_t *msp)
|
|
|
|
{
|
|
|
|
vdev_t *vd = msp->ms_group->mg_vd;
|
|
|
|
spa_t *spa = vd->vdev_spa;
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
space_map_t *sm = msp->ms_sm;
|
2010-05-29 00:45:14 +04:00
|
|
|
char freebuf[32];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf(
|
2010-05-29 00:45:14 +04:00
|
|
|
"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
|
|
|
|
(u_longlong_t)space_map_object(sm), freebuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
if (dump_opt['m'] > 2 && !dump_opt['L']) {
|
2009-07-03 02:44:48 +04:00
|
|
|
mutex_enter(&msp->ms_lock);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
metaslab_load_wait(msp);
|
|
|
|
if (!msp->ms_loaded) {
|
|
|
|
VERIFY0(metaslab_load(msp));
|
|
|
|
range_tree_stat_verify(msp->ms_tree);
|
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
dump_metaslab_stats(msp);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
metaslab_unload(msp);
|
2009-07-03 02:44:48 +04:00
|
|
|
mutex_exit(&msp->ms_lock);
|
|
|
|
}
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
if (dump_opt['m'] > 1 && sm != NULL &&
|
2013-10-08 21:13:05 +04:00
|
|
|
spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
/*
|
|
|
|
* The space map histogram represents free space in chunks
|
|
|
|
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
|
|
|
|
*/
|
2014-07-20 00:19:24 +04:00
|
|
|
(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
|
|
|
|
(u_longlong_t)msp->ms_fragmentation);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_histogram(sm->sm_phys->smp_histogram,
|
2014-07-20 00:19:24 +04:00
|
|
|
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
|
|
|
|
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
mutex_enter(&msp->ms_lock);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
|
2009-07-03 02:44:48 +04:00
|
|
|
mutex_exit(&msp->ms_lock);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
print_vdev_metaslab_header(vdev_t *vd)
|
|
|
|
{
|
|
|
|
(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
|
|
|
|
(u_longlong_t)vd->vdev_id,
|
|
|
|
"metaslabs", (u_longlong_t)vd->vdev_ms_count,
|
|
|
|
"offset", "spacemap", "free");
|
|
|
|
(void) printf("\t%15s %19s %15s %10s\n",
|
|
|
|
"---------------", "-------------------",
|
|
|
|
"---------------", "-------------");
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-07-20 00:19:24 +04:00
|
|
|
static void
|
|
|
|
dump_metaslab_groups(spa_t *spa)
|
|
|
|
{
|
|
|
|
vdev_t *rvd = spa->spa_root_vdev;
|
|
|
|
metaslab_class_t *mc = spa_normal_class(spa);
|
|
|
|
uint64_t fragmentation;
|
|
|
|
int c;
|
|
|
|
|
|
|
|
metaslab_class_histogram_verify(mc);
|
|
|
|
|
|
|
|
for (c = 0; c < rvd->vdev_children; c++) {
|
|
|
|
vdev_t *tvd = rvd->vdev_child[c];
|
|
|
|
metaslab_group_t *mg = tvd->vdev_mg;
|
|
|
|
|
|
|
|
if (mg->mg_class != mc)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
metaslab_group_histogram_verify(mg);
|
|
|
|
mg->mg_fragmentation = metaslab_group_fragmentation(mg);
|
|
|
|
|
|
|
|
(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
|
|
|
|
"fragmentation",
|
|
|
|
(u_longlong_t)tvd->vdev_id,
|
|
|
|
(u_longlong_t)tvd->vdev_ms_count);
|
|
|
|
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
|
|
|
|
(void) printf("%3s\n", "-");
|
|
|
|
} else {
|
|
|
|
(void) printf("%3llu%%\n",
|
|
|
|
(u_longlong_t)mg->mg_fragmentation);
|
|
|
|
}
|
|
|
|
dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) printf("\tpool %s\tfragmentation", spa_name(spa));
|
|
|
|
fragmentation = metaslab_class_fragmentation(mc);
|
|
|
|
if (fragmentation == ZFS_FRAG_INVALID)
|
|
|
|
(void) printf("\t%3s\n", "-");
|
|
|
|
else
|
|
|
|
(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
|
|
|
|
dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dump_metaslabs(spa_t *spa)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
vdev_t *vd, *rvd = spa->spa_root_vdev;
|
|
|
|
uint64_t m, c = 0, children = rvd->vdev_children;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\nMetaslabs:\n");
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!dump_opt['d'] && zopt_objects > 0) {
|
|
|
|
c = zopt_object[0];
|
|
|
|
|
|
|
|
if (c >= children)
|
|
|
|
(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zopt_objects > 1) {
|
|
|
|
vd = rvd->vdev_child[c];
|
|
|
|
print_vdev_metaslab_header(vd);
|
|
|
|
|
|
|
|
for (m = 1; m < zopt_objects; m++) {
|
|
|
|
if (zopt_object[m] < vd->vdev_ms_count)
|
|
|
|
dump_metaslab(
|
|
|
|
vd->vdev_ms[zopt_object[m]]);
|
|
|
|
else
|
|
|
|
(void) fprintf(stderr, "bad metaslab "
|
|
|
|
"number %llu\n",
|
|
|
|
(u_longlong_t)zopt_object[m]);
|
|
|
|
}
|
|
|
|
(void) printf("\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
children = c + 1;
|
|
|
|
}
|
|
|
|
for (; c < children; c++) {
|
|
|
|
vd = rvd->vdev_child[c];
|
|
|
|
print_vdev_metaslab_header(vd);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
for (m = 0; m < vd->vdev_ms_count; m++)
|
|
|
|
dump_metaslab(vd->vdev_ms[m]);
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
|
|
|
|
{
|
|
|
|
const ddt_phys_t *ddp = dde->dde_phys;
|
|
|
|
const ddt_key_t *ddk = &dde->dde_key;
|
|
|
|
char *types[4] = { "ditto", "single", "double", "triple" };
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
blkptr_t blk;
|
2010-08-26 20:52:39 +04:00
|
|
|
int p;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (ddp->ddp_phys_birth == 0)
|
|
|
|
continue;
|
|
|
|
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("index %llx refcnt %llu %s %s\n",
|
|
|
|
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
|
|
|
|
types[p], blkbuf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_dedup_ratio(const ddt_stat_t *dds)
|
|
|
|
{
|
|
|
|
double rL, rP, rD, D, dedup, compress, copies;
|
|
|
|
|
|
|
|
if (dds->dds_blocks == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
rL = (double)dds->dds_ref_lsize;
|
|
|
|
rP = (double)dds->dds_ref_psize;
|
|
|
|
rD = (double)dds->dds_ref_dsize;
|
|
|
|
D = (double)dds->dds_dsize;
|
|
|
|
|
|
|
|
dedup = rD / D;
|
|
|
|
compress = rL / rP;
|
|
|
|
copies = rD / rP;
|
|
|
|
|
|
|
|
(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
|
|
|
|
"dedup * compress / copies = %.2f\n\n",
|
|
|
|
dedup, compress, copies, dedup * compress / copies);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
|
|
|
|
{
|
|
|
|
char name[DDT_NAMELEN];
|
|
|
|
ddt_entry_t dde;
|
|
|
|
uint64_t walk = 0;
|
|
|
|
dmu_object_info_t doi;
|
|
|
|
uint64_t count, dspace, mspace;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = ddt_object_info(ddt, type, class, &doi);
|
|
|
|
|
|
|
|
if (error == ENOENT)
|
|
|
|
return;
|
|
|
|
ASSERT(error == 0);
|
|
|
|
|
2012-10-26 21:01:49 +04:00
|
|
|
error = ddt_object_count(ddt, type, class, &count);
|
|
|
|
ASSERT(error == 0);
|
|
|
|
if (count == 0)
|
2010-08-27 01:24:34 +04:00
|
|
|
return;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
dspace = doi.doi_physical_blocks_512 << 9;
|
|
|
|
mspace = doi.doi_fill_count * doi.doi_data_block_size;
|
|
|
|
|
|
|
|
ddt_object_name(ddt, type, class, name);
|
|
|
|
|
|
|
|
(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
|
|
|
|
name,
|
|
|
|
(u_longlong_t)count,
|
|
|
|
(u_longlong_t)(dspace / count),
|
|
|
|
(u_longlong_t)(mspace / count));
|
|
|
|
|
|
|
|
if (dump_opt['D'] < 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
|
|
|
|
|
|
|
|
if (dump_opt['D'] < 4)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
(void) printf("%s contents:\n\n", name);
|
|
|
|
|
|
|
|
while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
|
|
|
|
dump_dde(ddt, &dde, walk);
|
|
|
|
|
|
|
|
ASSERT(error == ENOENT);
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_all_ddts(spa_t *spa)
|
|
|
|
{
|
2010-08-26 20:52:39 +04:00
|
|
|
ddt_histogram_t ddh_total;
|
|
|
|
ddt_stat_t dds_total;
|
|
|
|
enum zio_checksum c;
|
|
|
|
enum ddt_type type;
|
|
|
|
enum ddt_class class;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
bzero(&ddh_total, sizeof (ddt_histogram_t));
|
|
|
|
bzero(&dds_total, sizeof (ddt_stat_t));
|
|
|
|
|
|
|
|
for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_t *ddt = spa->spa_ddt[c];
|
2010-08-26 20:52:39 +04:00
|
|
|
for (type = 0; type < DDT_TYPES; type++) {
|
|
|
|
for (class = 0; class < DDT_CLASSES;
|
2010-05-29 00:45:14 +04:00
|
|
|
class++) {
|
|
|
|
dump_ddt(ddt, type, class);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ddt_get_dedup_stats(spa, &dds_total);
|
|
|
|
|
|
|
|
if (dds_total.dds_blocks == 0) {
|
|
|
|
(void) printf("All DDTs are empty\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
if (dump_opt['D'] > 1) {
|
|
|
|
(void) printf("DDT histogram (aggregated over all DDTs):\n");
|
|
|
|
ddt_get_dedup_histogram(spa, &ddh_total);
|
|
|
|
zpool_dump_ddt(&dds_total, &ddh_total);
|
|
|
|
}
|
|
|
|
|
|
|
|
dump_dedup_ratio(&dds_total);
|
|
|
|
}
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
static void
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
|
2009-01-16 00:59:39 +03:00
|
|
|
{
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
char *prefix = arg;
|
2009-01-16 00:59:39 +03:00
|
|
|
|
|
|
|
(void) printf("%s [%llu,%llu) length %llu\n",
|
|
|
|
prefix,
|
|
|
|
(u_longlong_t)start,
|
|
|
|
(u_longlong_t)(start + size),
|
|
|
|
(u_longlong_t)(size));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dump_dtl(vdev_t *vd, int indent)
|
|
|
|
{
|
2009-01-16 00:59:39 +03:00
|
|
|
spa_t *spa = vd->vdev_spa;
|
|
|
|
boolean_t required;
|
|
|
|
char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
|
|
|
|
char prefix[256];
|
2010-08-26 20:52:39 +04:00
|
|
|
int c, t;
|
2009-01-16 00:59:39 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_vdev_state_enter(spa, SCL_NONE);
|
2009-01-16 00:59:39 +03:00
|
|
|
required = vdev_dtl_required(vd);
|
|
|
|
(void) spa_vdev_state_exit(spa, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (indent == 0)
|
|
|
|
(void) printf("\nDirty time logs:\n\n");
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
(void) printf("\t%*s%s [%s]\n", indent, "",
|
2008-12-03 23:09:06 +03:00
|
|
|
vd->vdev_path ? vd->vdev_path :
|
2009-01-16 00:59:39 +03:00
|
|
|
vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
|
|
|
|
required ? "DTL-required" : "DTL-expendable");
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (t = 0; t < DTL_TYPES; t++) {
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
range_tree_t *rt = vd->vdev_dtl[t];
|
|
|
|
if (range_tree_space(rt) == 0)
|
2009-01-16 00:59:39 +03:00
|
|
|
continue;
|
|
|
|
(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
|
|
|
|
indent + 2, "", name[t]);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
mutex_enter(rt->rt_lock);
|
|
|
|
range_tree_walk(rt, dump_dtl_seg, prefix);
|
|
|
|
mutex_exit(rt->rt_lock);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (dump_opt['d'] > 5 && vd->vdev_children == 0)
|
|
|
|
dump_spacemap(spa->spa_meta_objset,
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
vd->vdev_dtl_sm);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (c = 0; c < vd->vdev_children; c++)
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_dtl(vd->vdev_child[c], indent + 4);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
dump_history(spa_t *spa)
|
|
|
|
{
|
|
|
|
nvlist_t **events = NULL;
|
2015-06-24 21:17:36 +03:00
|
|
|
char *buf;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t resid, len, off = 0;
|
|
|
|
uint_t num = 0;
|
|
|
|
int error;
|
|
|
|
time_t tsec;
|
|
|
|
struct tm t;
|
|
|
|
char tbuf[30];
|
|
|
|
char internalstr[MAXPATHLEN];
|
2010-08-26 20:52:39 +04:00
|
|
|
int i;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2015-06-24 21:17:36 +03:00
|
|
|
if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
|
|
|
|
(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
|
|
|
|
__func__);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
do {
|
2015-06-24 21:17:36 +03:00
|
|
|
len = SPA_OLD_MAXBLOCKSIZE;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
|
|
|
|
(void) fprintf(stderr, "Unable to read history: "
|
|
|
|
"error %d\n", error);
|
2015-06-24 21:17:36 +03:00
|
|
|
free(buf);
|
2010-05-29 00:45:14 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
off -= resid;
|
|
|
|
} while (len != 0);
|
|
|
|
|
|
|
|
(void) printf("\nHistory:\n");
|
2010-08-26 20:52:39 +04:00
|
|
|
for (i = 0; i < num; i++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t time, txg, ievent;
|
|
|
|
char *cmd, *intstr;
|
2013-08-28 15:45:09 +04:00
|
|
|
boolean_t printed = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
|
|
|
|
&time) != 0)
|
2013-08-28 15:45:09 +04:00
|
|
|
goto next;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
|
|
|
|
&cmd) != 0) {
|
|
|
|
if (nvlist_lookup_uint64(events[i],
|
|
|
|
ZPOOL_HIST_INT_EVENT, &ievent) != 0)
|
2013-08-28 15:45:09 +04:00
|
|
|
goto next;
|
2010-05-29 00:45:14 +04:00
|
|
|
verify(nvlist_lookup_uint64(events[i],
|
|
|
|
ZPOOL_HIST_TXG, &txg) == 0);
|
|
|
|
verify(nvlist_lookup_string(events[i],
|
|
|
|
ZPOOL_HIST_INT_STR, &intstr) == 0);
|
2013-08-28 15:45:09 +04:00
|
|
|
if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
|
|
|
|
goto next;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
(void) snprintf(internalstr,
|
|
|
|
sizeof (internalstr),
|
|
|
|
"[internal %s txg:%lld] %s",
|
2010-08-26 20:52:39 +04:00
|
|
|
zfs_history_event_names[ievent],
|
|
|
|
(longlong_t)txg, intstr);
|
2010-05-29 00:45:14 +04:00
|
|
|
cmd = internalstr;
|
|
|
|
}
|
|
|
|
tsec = time;
|
|
|
|
(void) localtime_r(&tsec, &t);
|
|
|
|
(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
|
|
|
|
(void) printf("%s %s\n", tbuf, cmd);
|
2013-08-28 15:45:09 +04:00
|
|
|
printed = B_TRUE;
|
|
|
|
|
|
|
|
next:
|
|
|
|
if (dump_opt['h'] > 1) {
|
|
|
|
if (!printed)
|
|
|
|
(void) printf("unrecognized record:\n");
|
|
|
|
dump_nvlist(events[i], 2);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2015-06-24 21:17:36 +03:00
|
|
|
free(buf);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t
|
2014-06-25 22:37:59 +04:00
|
|
|
blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
|
|
|
|
const zbookmark_phys_t *zb)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dnp == NULL) {
|
|
|
|
ASSERT(zb->zb_level < 0);
|
|
|
|
if (zb->zb_object == 0)
|
|
|
|
return (zb->zb_blkid);
|
|
|
|
return (zb->zb_blkid * BP_GET_LSIZE(bp));
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(zb->zb_level >= 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
return ((zb->zb_blkid <<
|
|
|
|
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
|
2008-11-20 23:01:55 +03:00
|
|
|
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
const dva_t *dva = bp->blk_dva;
|
|
|
|
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
|
2010-08-26 20:52:39 +04:00
|
|
|
int i;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
if (dump_opt['b'] >= 6) {
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, buflen, bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
return;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp)) {
|
|
|
|
(void) sprintf(blkbuf,
|
|
|
|
"EMBEDDED et=%u %llxL/%llxP B=%llu",
|
|
|
|
(int)BPE_GET_ETYPE(bp),
|
|
|
|
(u_longlong_t)BPE_GET_LSIZE(bp),
|
|
|
|
(u_longlong_t)BPE_GET_PSIZE(bp),
|
|
|
|
(u_longlong_t)bp->blk_birth);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
blkbuf[0] = '\0';
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (i = 0; i < ndvas; i++)
|
2013-12-09 22:37:51 +04:00
|
|
|
(void) snprintf(blkbuf + strlen(blkbuf),
|
|
|
|
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
|
|
|
|
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
|
|
|
|
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (BP_IS_HOLE(bp)) {
|
|
|
|
(void) snprintf(blkbuf + strlen(blkbuf),
|
2015-03-27 05:03:22 +03:00
|
|
|
buflen - strlen(blkbuf),
|
|
|
|
"%llxL B=%llu",
|
|
|
|
(u_longlong_t)BP_GET_LSIZE(bp),
|
2013-12-09 22:37:51 +04:00
|
|
|
(u_longlong_t)bp->blk_birth);
|
|
|
|
} else {
|
|
|
|
(void) snprintf(blkbuf + strlen(blkbuf),
|
|
|
|
buflen - strlen(blkbuf),
|
|
|
|
"%llxL/%llxP F=%llu B=%llu/%llu",
|
|
|
|
(u_longlong_t)BP_GET_LSIZE(bp),
|
|
|
|
(u_longlong_t)BP_GET_PSIZE(bp),
|
2014-06-06 01:19:08 +04:00
|
|
|
(u_longlong_t)BP_GET_FILL(bp),
|
2013-12-09 22:37:51 +04:00
|
|
|
(u_longlong_t)bp->blk_birth,
|
|
|
|
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
static void
|
2014-06-25 22:37:59 +04:00
|
|
|
print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
|
2008-12-03 23:09:06 +03:00
|
|
|
const dnode_phys_t *dnp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
int l;
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (!BP_IS_EMBEDDED(bp)) {
|
|
|
|
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
|
|
|
|
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT(zb->zb_level >= 0);
|
|
|
|
|
|
|
|
for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
|
|
|
|
if (l == zb->zb_level) {
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("L%llx", (u_longlong_t)zb->zb_level);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf(" ");
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("%s\n", blkbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
|
2014-06-25 22:37:59 +04:00
|
|
|
blkptr_t *bp, const zbookmark_phys_t *zb)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
int err = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (bp->blk_birth == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
print_indirect(bp, zb, dnp);
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
|
2014-12-06 20:24:32 +03:00
|
|
|
arc_flags_t flags = ARC_FLAG_WAIT;
|
2008-12-03 23:09:06 +03:00
|
|
|
int i;
|
|
|
|
blkptr_t *cbp;
|
|
|
|
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
|
|
|
|
arc_buf_t *buf;
|
|
|
|
uint64_t fill = 0;
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
|
2008-12-03 23:09:06 +03:00
|
|
|
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
|
|
|
|
if (err)
|
|
|
|
return (err);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(buf->b_data);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
/* recursively visit blocks below this */
|
|
|
|
cbp = buf->b_data;
|
|
|
|
for (i = 0; i < epb; i++, cbp++) {
|
2014-06-25 22:37:59 +04:00
|
|
|
zbookmark_phys_t czb;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
|
|
|
|
zb->zb_level - 1,
|
|
|
|
zb->zb_blkid * epb + i);
|
|
|
|
err = visit_indirect(spa, dnp, cbp, &czb);
|
|
|
|
if (err)
|
|
|
|
break;
|
2014-06-06 01:19:08 +04:00
|
|
|
fill += BP_GET_FILL(cbp);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2009-01-16 00:59:39 +03:00
|
|
|
if (!err)
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT3U(fill, ==, BP_GET_FILL(bp));
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) arc_buf_remove_ref(buf, &buf);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
return (err);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
dump_indirect(dnode_t *dn)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2008-12-03 23:09:06 +03:00
|
|
|
dnode_phys_t *dnp = dn->dn_phys;
|
|
|
|
int j;
|
2014-06-25 22:37:59 +04:00
|
|
|
zbookmark_phys_t czb;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("Indirect blocks:\n");
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
|
2008-12-03 23:09:06 +03:00
|
|
|
dn->dn_object, dnp->dn_nlevels - 1, 0);
|
|
|
|
for (j = 0; j < dnp->dn_nblkptr; j++) {
|
|
|
|
czb.zb_blkid = j;
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
|
2008-12-03 23:09:06 +03:00
|
|
|
&dnp->dn_blkptr[j], &czb);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
dsl_dir_phys_t *dd = data;
|
|
|
|
time_t crtime;
|
2010-05-29 00:45:14 +04:00
|
|
|
char nice[32];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (dd == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
|
|
|
|
|
|
|
|
crtime = dd->dd_creation_time;
|
|
|
|
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
|
|
|
|
(void) printf("\t\thead_dataset_obj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_head_dataset_obj);
|
|
|
|
(void) printf("\t\tparent_dir_obj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_parent_obj);
|
|
|
|
(void) printf("\t\torigin_obj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_origin_obj);
|
|
|
|
(void) printf("\t\tchild_dir_zapobj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_child_dir_zapobj);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_used_bytes, nice);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tused_bytes = %s\n", nice);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_compressed_bytes, nice);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tcompressed_bytes = %s\n", nice);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_uncompressed_bytes, nice);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tuncompressed_bytes = %s\n", nice);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_quota, nice);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tquota = %s\n", nice);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_reserved, nice);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\treserved = %s\n", nice);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\t\tprops_zapobj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_props_zapobj);
|
|
|
|
(void) printf("\t\tdeleg_zapobj = %llu\n",
|
|
|
|
(u_longlong_t)dd->dd_deleg_zapobj);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tflags = %llx\n",
|
|
|
|
(u_longlong_t)dd->dd_flags);
|
|
|
|
|
|
|
|
#define DO(which) \
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
|
|
|
|
DO(HEAD);
|
|
|
|
DO(SNAP);
|
|
|
|
DO(CHILD);
|
|
|
|
DO(CHILD_RSRV);
|
|
|
|
DO(REFRSRV);
|
|
|
|
#undef DO
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
dsl_dataset_phys_t *ds = data;
|
|
|
|
time_t crtime;
|
2010-05-29 00:45:14 +04:00
|
|
|
char used[32], compressed[32], uncompressed[32], unique[32];
|
2008-11-20 23:01:55 +03:00
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
|
|
|
|
if (ds == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(size == sizeof (*ds));
|
|
|
|
crtime = ds->ds_creation_time;
|
2012-12-14 03:24:15 +04:00
|
|
|
zdb_nicenum(ds->ds_referenced_bytes, used);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(ds->ds_compressed_bytes, compressed);
|
|
|
|
zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
|
|
|
|
zdb_nicenum(ds->ds_unique_bytes, unique);
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tdir_obj = %llu\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)ds->ds_dir_obj);
|
|
|
|
(void) printf("\t\tprev_snap_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_prev_snap_obj);
|
|
|
|
(void) printf("\t\tprev_snap_txg = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_prev_snap_txg);
|
|
|
|
(void) printf("\t\tnext_snap_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_next_snap_obj);
|
|
|
|
(void) printf("\t\tsnapnames_zapobj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_snapnames_zapobj);
|
|
|
|
(void) printf("\t\tnum_children = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_num_children);
|
2009-08-18 22:43:27 +04:00
|
|
|
(void) printf("\t\tuserrefs_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_userrefs_obj);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\t\tcreation_time = %s", ctime(&crtime));
|
|
|
|
(void) printf("\t\tcreation_txg = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_creation_txg);
|
|
|
|
(void) printf("\t\tdeadlist_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_deadlist_obj);
|
|
|
|
(void) printf("\t\tused_bytes = %s\n", used);
|
|
|
|
(void) printf("\t\tcompressed_bytes = %s\n", compressed);
|
|
|
|
(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
|
|
|
|
(void) printf("\t\tunique = %s\n", unique);
|
|
|
|
(void) printf("\t\tfsid_guid = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_fsid_guid);
|
|
|
|
(void) printf("\t\tguid = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_guid);
|
|
|
|
(void) printf("\t\tflags = %llx\n",
|
|
|
|
(u_longlong_t)ds->ds_flags);
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) printf("\t\tnext_clones_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_next_clones_obj);
|
|
|
|
(void) printf("\t\tprops_obj = %llu\n",
|
|
|
|
(u_longlong_t)ds->ds_props_obj);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\t\tbp = %s\n", blkbuf);
|
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
|
|
|
|
if (bp->blk_birth != 0) {
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
2012-12-14 03:24:15 +04:00
|
|
|
(void) printf("\t%s\n", blkbuf);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_bptree(objset_t *os, uint64_t obj, char *name)
|
|
|
|
{
|
|
|
|
char bytes[32];
|
|
|
|
bptree_phys_t *bt;
|
|
|
|
dmu_buf_t *db;
|
|
|
|
|
|
|
|
if (dump_opt['d'] < 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
|
|
|
|
bt = db->db_data;
|
|
|
|
zdb_nicenum(bt->bt_bytes, bytes);
|
|
|
|
(void) printf("\n %s: %llu datasets, %s\n",
|
|
|
|
name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
|
|
|
|
dmu_buf_rele(db, FTAG);
|
|
|
|
|
|
|
|
if (dump_opt['d'] < 5)
|
|
|
|
return;
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
|
|
|
|
ASSERT(bp->blk_birth != 0);
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\t%s\n", blkbuf);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
char bytes[32];
|
|
|
|
char comp[32];
|
|
|
|
char uncomp[32];
|
2013-07-05 23:37:16 +04:00
|
|
|
uint64_t i;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (dump_opt['d'] < 3)
|
|
|
|
return;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
|
2013-07-05 23:37:16 +04:00
|
|
|
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
|
|
|
|
zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
|
2013-07-05 23:37:16 +04:00
|
|
|
(void) printf(" %*s: object %llu, %llu local blkptrs, "
|
2015-04-27 01:27:36 +03:00
|
|
|
"%llu subobjs in object, %llu, %s (%s/%s comp)\n",
|
2013-07-05 23:37:16 +04:00
|
|
|
indent * 8, name,
|
|
|
|
(u_longlong_t)bpo->bpo_object,
|
|
|
|
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
2010-05-29 00:45:14 +04:00
|
|
|
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
|
2015-04-27 01:27:36 +03:00
|
|
|
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
|
2008-11-20 23:01:55 +03:00
|
|
|
bytes, comp, uncomp);
|
2013-07-05 23:37:16 +04:00
|
|
|
|
|
|
|
for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
|
|
|
|
uint64_t subobj;
|
|
|
|
bpobj_t subbpo;
|
|
|
|
int error;
|
|
|
|
VERIFY0(dmu_read(bpo->bpo_os,
|
|
|
|
bpo->bpo_phys->bpo_subobjs,
|
|
|
|
i * sizeof (subobj), sizeof (subobj), &subobj, 0));
|
|
|
|
error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
|
|
|
|
if (error != 0) {
|
|
|
|
(void) printf("ERROR %u while trying to open "
|
|
|
|
"subobj id %llu\n",
|
|
|
|
error, (u_longlong_t)subobj);
|
|
|
|
continue;
|
|
|
|
}
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(&subbpo, "subobj", indent + 1);
|
2015-12-31 18:57:11 +03:00
|
|
|
bpobj_close(&subbpo);
|
2013-07-05 23:37:16 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2013-07-05 23:37:16 +04:00
|
|
|
(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
|
|
|
|
indent * 8, name,
|
|
|
|
(u_longlong_t)bpo->bpo_object,
|
|
|
|
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
|
|
|
|
bytes);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['d'] < 5)
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
2013-07-05 23:37:16 +04:00
|
|
|
if (indent == 0) {
|
|
|
|
(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
dump_deadlist(dsl_deadlist_t *dl)
|
|
|
|
{
|
|
|
|
dsl_deadlist_entry_t *dle;
|
2013-07-05 23:37:16 +04:00
|
|
|
uint64_t unused;
|
2010-05-29 00:45:14 +04:00
|
|
|
char bytes[32];
|
|
|
|
char comp[32];
|
|
|
|
char uncomp[32];
|
|
|
|
|
|
|
|
if (dump_opt['d'] < 3)
|
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-09-17 11:14:39 +04:00
|
|
|
if (dl->dl_oldfmt) {
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
|
2014-09-17 11:14:39 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(dl->dl_phys->dl_used, bytes);
|
|
|
|
zdb_nicenum(dl->dl_phys->dl_comp, comp);
|
|
|
|
zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
|
|
|
|
(void) printf("\n Deadlist: %s (%s/%s comp)\n",
|
|
|
|
bytes, comp, uncomp);
|
|
|
|
|
|
|
|
if (dump_opt['d'] < 4)
|
|
|
|
return;
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
|
2013-07-05 23:37:16 +04:00
|
|
|
/* force the tree to be loaded */
|
|
|
|
dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
for (dle = avl_first(&dl->dl_tree); dle;
|
|
|
|
dle = AVL_NEXT(&dl->dl_tree, dle)) {
|
2013-07-05 23:37:16 +04:00
|
|
|
if (dump_opt['d'] >= 5) {
|
|
|
|
char buf[128];
|
2013-11-01 23:26:11 +04:00
|
|
|
(void) snprintf(buf, sizeof (buf),
|
|
|
|
"mintxg %llu -> obj %llu",
|
2013-07-05 23:37:16 +04:00
|
|
|
(longlong_t)dle->dle_mintxg,
|
|
|
|
(longlong_t)dle->dle_bpobj.bpo_object);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
|
2013-07-05 23:37:16 +04:00
|
|
|
} else {
|
|
|
|
(void) printf("mintxg %llu -> obj %llu\n",
|
|
|
|
(longlong_t)dle->dle_mintxg,
|
|
|
|
(longlong_t)dle->dle_bpobj.bpo_object);
|
|
|
|
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static avl_tree_t idx_tree;
|
|
|
|
static avl_tree_t domain_tree;
|
|
|
|
static boolean_t fuid_table_loaded;
|
2010-05-29 00:45:14 +04:00
|
|
|
static boolean_t sa_loaded;
|
|
|
|
sa_attr_type_t *sa_attr_table;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
static void
|
2010-08-26 20:52:41 +04:00
|
|
|
fuid_table_destroy(void)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
if (fuid_table_loaded) {
|
|
|
|
zfs_fuid_table_destroy(&idx_tree, &domain_tree);
|
|
|
|
fuid_table_loaded = B_FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* print uid or gid information.
|
|
|
|
* For normal POSIX id just the id is printed in decimal format.
|
|
|
|
* For CIFS files with FUID the fuid is printed in hex followed by
|
2013-07-05 23:37:16 +04:00
|
|
|
* the domain-rid string.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
print_idstr(uint64_t id, const char *id_type)
|
|
|
|
{
|
|
|
|
if (FUID_INDEX(id)) {
|
|
|
|
char *domain;
|
|
|
|
|
|
|
|
domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
|
|
|
|
(void) printf("\t%s %llx [%s-%d]\n", id_type,
|
|
|
|
(u_longlong_t)id, domain, (int)FUID_RID(id));
|
|
|
|
} else {
|
|
|
|
(void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
uint32_t uid_idx, gid_idx;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
uid_idx = FUID_INDEX(uid);
|
|
|
|
gid_idx = FUID_INDEX(gid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/* Load domain table, if not already loaded */
|
|
|
|
if (!fuid_table_loaded && (uid_idx || gid_idx)) {
|
|
|
|
uint64_t fuid_obj;
|
|
|
|
|
|
|
|
/* first find the fuid object. It lives in the master node */
|
|
|
|
VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
|
|
|
|
8, 1, &fuid_obj) == 0);
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) zfs_fuid_table_load(os, fuid_obj,
|
|
|
|
&idx_tree, &domain_tree);
|
|
|
|
fuid_table_loaded = B_TRUE;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
print_idstr(uid, "uid");
|
|
|
|
print_idstr(gid, "gid");
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-07-09 16:15:26 +04:00
|
|
|
static void
|
|
|
|
dump_znode_sa_xattr(sa_handle_t *hdl)
|
|
|
|
{
|
|
|
|
nvlist_t *sa_xattr;
|
|
|
|
nvpair_t *elem = NULL;
|
|
|
|
int sa_xattr_size = 0;
|
|
|
|
int sa_xattr_entries = 0;
|
|
|
|
int error;
|
|
|
|
char *sa_xattr_packed;
|
|
|
|
|
|
|
|
error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
|
|
|
|
if (error || sa_xattr_size == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sa_xattr_packed = malloc(sa_xattr_size);
|
|
|
|
if (sa_xattr_packed == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
|
|
|
|
sa_xattr_packed, sa_xattr_size);
|
|
|
|
if (error) {
|
|
|
|
free(sa_xattr_packed);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
|
|
|
|
if (error) {
|
|
|
|
free(sa_xattr_packed);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
|
|
|
|
sa_xattr_entries++;
|
|
|
|
|
|
|
|
(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
|
|
|
|
sa_xattr_size, sa_xattr_entries);
|
|
|
|
while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
|
|
|
|
uchar_t *value;
|
|
|
|
uint_t cnt, idx;
|
|
|
|
|
|
|
|
(void) printf("\t\t%s = ", nvpair_name(elem));
|
|
|
|
nvpair_value_byte_array(elem, &value, &cnt);
|
2013-11-01 23:26:11 +04:00
|
|
|
for (idx = 0; idx < cnt; ++idx) {
|
2013-07-09 16:15:26 +04:00
|
|
|
if (isprint(value[idx]))
|
|
|
|
(void) putchar(value[idx]);
|
|
|
|
else
|
|
|
|
(void) printf("\\%3.3o", value[idx]);
|
|
|
|
}
|
|
|
|
(void) putchar('\n');
|
|
|
|
}
|
|
|
|
|
|
|
|
nvlist_free(sa_xattr);
|
|
|
|
free(sa_xattr_packed);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_handle_t *hdl;
|
|
|
|
uint64_t xattr, rdev, gen;
|
|
|
|
uint64_t uid, gid, mode, fsize, parent, links;
|
|
|
|
uint64_t pflags;
|
|
|
|
uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
|
|
|
|
time_t z_crtime, z_atime, z_mtime, z_ctime;
|
|
|
|
sa_bulk_attr_t bulk[12];
|
|
|
|
int idx = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!sa_loaded) {
|
|
|
|
uint64_t sa_attrs = 0;
|
|
|
|
uint64_t version;
|
|
|
|
|
|
|
|
VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
|
|
|
|
8, 1, &version) == 0);
|
|
|
|
if (version >= ZPL_VERSION_SA) {
|
|
|
|
VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
|
|
|
|
8, 1, &sa_attrs) == 0);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
|
|
|
|
ZPL_END, &sa_attr_table)) != 0) {
|
|
|
|
(void) printf("sa_setup failed errno %d, can't "
|
|
|
|
"display znode contents\n", error);
|
|
|
|
return;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_loaded = B_TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
|
|
|
|
(void) printf("Failed to get handle for SA znode\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
|
|
|
|
&links, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
|
|
|
|
&mode, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
|
|
|
|
NULL, &parent, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
|
|
|
|
&fsize, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
|
|
|
|
acctm, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
|
|
|
|
modtm, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
|
|
|
|
crtm, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
|
|
|
|
chgtm, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
|
|
|
|
&pflags, 8);
|
|
|
|
|
|
|
|
if (sa_bulk_lookup(hdl, bulk, idx)) {
|
|
|
|
(void) sa_handle_destroy(hdl);
|
|
|
|
return;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
error = zfs_obj_to_path(os, object, path, sizeof (path));
|
|
|
|
if (error != 0) {
|
|
|
|
(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
|
|
|
|
(u_longlong_t)object);
|
|
|
|
}
|
|
|
|
if (dump_opt['d'] < 3) {
|
|
|
|
(void) printf("\t%s\n", path);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) sa_handle_destroy(hdl);
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
z_crtime = (time_t)crtm[0];
|
|
|
|
z_atime = (time_t)acctm[0];
|
|
|
|
z_mtime = (time_t)modtm[0];
|
|
|
|
z_ctime = (time_t)chgtm[0];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("\tpath %s\n", path);
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_uidgid(os, uid, gid);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\tatime %s", ctime(&z_atime));
|
|
|
|
(void) printf("\tmtime %s", ctime(&z_mtime));
|
|
|
|
(void) printf("\tctime %s", ctime(&z_ctime));
|
|
|
|
(void) printf("\tcrtime %s", ctime(&z_crtime));
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tgen %llu\n", (u_longlong_t)gen);
|
|
|
|
(void) printf("\tmode %llo\n", (u_longlong_t)mode);
|
|
|
|
(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
|
|
|
|
(void) printf("\tparent %llu\n", (u_longlong_t)parent);
|
|
|
|
(void) printf("\tlinks %llu\n", (u_longlong_t)links);
|
|
|
|
(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
|
|
|
|
if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
|
|
|
|
sizeof (uint64_t)) == 0)
|
|
|
|
(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
|
|
|
|
if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
|
|
|
|
sizeof (uint64_t)) == 0)
|
|
|
|
(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
|
2013-07-09 16:15:26 +04:00
|
|
|
dump_znode_sa_xattr(hdl);
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_handle_destroy(hdl);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static void
|
|
|
|
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_none, /* unallocated */
|
|
|
|
dump_zap, /* object directory */
|
|
|
|
dump_uint64, /* object array */
|
|
|
|
dump_none, /* packed nvlist */
|
|
|
|
dump_packed_nvlist, /* packed nvlist size */
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_none, /* bpobj */
|
|
|
|
dump_bpobj, /* bpobj header */
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_none, /* SPA space map header */
|
|
|
|
dump_none, /* SPA space map */
|
|
|
|
dump_none, /* ZIL intent log */
|
|
|
|
dump_dnode, /* DMU dnode */
|
|
|
|
dump_dmu_objset, /* DMU objset */
|
|
|
|
dump_dsl_dir, /* DSL directory */
|
|
|
|
dump_zap, /* DSL directory child map */
|
|
|
|
dump_zap, /* DSL dataset snap map */
|
|
|
|
dump_zap, /* DSL props */
|
|
|
|
dump_dsl_dataset, /* DSL dataset */
|
|
|
|
dump_znode, /* ZFS znode */
|
|
|
|
dump_acl, /* ZFS V0 ACL */
|
|
|
|
dump_uint8, /* ZFS plain file */
|
|
|
|
dump_zpldir, /* ZFS directory */
|
|
|
|
dump_zap, /* ZFS master node */
|
|
|
|
dump_zap, /* ZFS delete queue */
|
|
|
|
dump_uint8, /* zvol object */
|
|
|
|
dump_zap, /* zvol prop */
|
|
|
|
dump_uint8, /* other uint8[] */
|
|
|
|
dump_uint64, /* other uint64[] */
|
|
|
|
dump_zap, /* other ZAP */
|
|
|
|
dump_zap, /* persistent error log */
|
|
|
|
dump_uint8, /* SPA history */
|
2013-08-28 15:45:09 +04:00
|
|
|
dump_history_offsets, /* SPA history offsets */
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_zap, /* Pool properties */
|
|
|
|
dump_zap, /* DSL permissions */
|
|
|
|
dump_acl, /* ZFS ACL */
|
|
|
|
dump_uint8, /* ZFS SYSACL */
|
|
|
|
dump_none, /* FUID nvlist */
|
|
|
|
dump_packed_nvlist, /* FUID nvlist size */
|
2008-12-03 23:09:06 +03:00
|
|
|
dump_zap, /* DSL dataset next clones */
|
|
|
|
dump_zap, /* DSL scrub queue */
|
2009-07-03 02:44:48 +04:00
|
|
|
dump_zap, /* ZFS user/group used */
|
|
|
|
dump_zap, /* ZFS user/group quota */
|
2009-08-18 22:43:27 +04:00
|
|
|
dump_zap, /* snapshot refcount tags */
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_ddt_zap, /* DDT ZAP object */
|
|
|
|
dump_zap, /* DDT statistics */
|
|
|
|
dump_znode, /* SA object */
|
|
|
|
dump_zap, /* SA Master Node */
|
|
|
|
dump_sa_attrs, /* SA attribute registration */
|
|
|
|
dump_sa_layouts, /* SA attribute layouts */
|
|
|
|
dump_zap, /* DSL scrub translations */
|
|
|
|
dump_none, /* fake dedup BP */
|
|
|
|
dump_zap, /* deadlist */
|
|
|
|
dump_none, /* deadlist hdr */
|
|
|
|
dump_zap, /* dsl clones */
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_bpobj_subobjs, /* bpobj subobjs */
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_unknown, /* Unknown type, must be last */
|
2008-11-20 23:01:55 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
|
|
|
|
{
|
|
|
|
dmu_buf_t *db = NULL;
|
|
|
|
dmu_object_info_t doi;
|
|
|
|
dnode_t *dn;
|
|
|
|
void *bonus = NULL;
|
|
|
|
size_t bsize = 0;
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interfaces names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
|
2010-05-29 00:45:14 +04:00
|
|
|
char bonus_size[32];
|
2008-11-20 23:01:55 +03:00
|
|
|
char aux[50];
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (*print_header) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
|
|
|
|
"Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
|
|
|
|
"lsize", "%full", "type");
|
2008-11-20 23:01:55 +03:00
|
|
|
*print_header = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (object == 0) {
|
2010-08-27 01:24:34 +04:00
|
|
|
dn = DMU_META_DNODE(os);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
error = dmu_bonus_hold(os, object, FTAG, &db);
|
|
|
|
if (error)
|
|
|
|
fatal("dmu_bonus_hold(%llu) failed, errno %u",
|
|
|
|
object, error);
|
|
|
|
bonus = db->db_data;
|
|
|
|
bsize = db->db_size;
|
2010-08-27 01:24:34 +04:00
|
|
|
dn = DB_DNODE((dmu_buf_impl_t *)db);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
dmu_object_info_from_dnode(dn, &doi);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(doi.doi_metadata_block_size, iblk);
|
|
|
|
zdb_nicenum(doi.doi_data_block_size, dblk);
|
|
|
|
zdb_nicenum(doi.doi_max_offset, lsize);
|
|
|
|
zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
|
|
|
|
zdb_nicenum(doi.doi_bonus_size, bonus_size);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
zdb_nicenum(doi.doi_dnodesize, dnsize);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
|
|
|
|
doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
|
|
|
|
doi.doi_max_offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
aux[0] = '\0';
|
|
|
|
|
|
|
|
if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
|
|
|
|
(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
|
2010-05-29 00:45:14 +04:00
|
|
|
ZDB_CHECKSUM_NAME(doi.doi_checksum));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
|
|
|
|
(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
|
2010-05-29 00:45:14 +04:00
|
|
|
ZDB_COMPRESS_NAME(doi.doi_compress));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
(void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
|
2010-05-29 00:45:14 +04:00
|
|
|
(u_longlong_t)object, doi.doi_indirection, iblk, dblk,
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-17 04:25:34 +03:00
|
|
|
(void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
|
|
|
|
"", "", "", "", "", "", bonus_size, "bonus",
|
2010-05-29 00:45:14 +04:00
|
|
|
ZDB_OT_NAME(doi.doi_bonus_type));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (verbosity >= 4) {
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tdnode flags: %s%s%s\n",
|
2009-07-03 02:44:48 +04:00
|
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
|
|
|
|
"USED_BYTES " : "",
|
|
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
|
2010-05-29 00:45:14 +04:00
|
|
|
"USERUSED_ACCOUNTED " : "",
|
|
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
|
|
|
|
"SPILL_BLKPTR" : "");
|
2009-07-03 02:44:48 +04:00
|
|
|
(void) printf("\tdnode maxblkid: %llu\n",
|
|
|
|
(longlong_t)dn->dn_phys->dn_maxblkid);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
|
|
|
|
bonus, bsize);
|
|
|
|
object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
*print_header = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (verbosity >= 5)
|
2008-12-03 23:09:06 +03:00
|
|
|
dump_indirect(dn);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (verbosity >= 5) {
|
|
|
|
/*
|
|
|
|
* Report the list of segments that comprise the object.
|
|
|
|
*/
|
|
|
|
uint64_t start = 0;
|
|
|
|
uint64_t end;
|
|
|
|
uint64_t blkfill = 1;
|
|
|
|
int minlvl = 1;
|
|
|
|
|
|
|
|
if (dn->dn_type == DMU_OT_DNODE) {
|
|
|
|
minlvl = 0;
|
|
|
|
blkfill = DNODES_PER_BLOCK;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
2010-05-29 00:45:14 +04:00
|
|
|
char segsize[32];
|
2008-12-03 23:09:06 +03:00
|
|
|
error = dnode_next_offset(dn,
|
|
|
|
0, &start, minlvl, blkfill, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
end = start;
|
2008-12-03 23:09:06 +03:00
|
|
|
error = dnode_next_offset(dn,
|
|
|
|
DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(end - start, segsize);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\t\tsegment [%016llx, %016llx)"
|
|
|
|
" size %5s\n", (u_longlong_t)start,
|
|
|
|
(u_longlong_t)end, segsize);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
start = end;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (db != NULL)
|
|
|
|
dmu_buf_rele(db, FTAG);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Printable names for objset types.  dump_dir() indexes this table with
 * the dds_type it reads back from dmu_objset_fast_stat().
 */
static char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE",
	"META",
	"ZPL",
	"ZVOL",
	"OTHER",
	"ANY"
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_dir(objset_t *os)
|
|
|
|
{
|
|
|
|
dmu_objset_stats_t dds;
|
|
|
|
uint64_t object, object_count;
|
|
|
|
uint64_t refdbytes, usedobjs, scratch;
|
2010-05-29 00:45:14 +04:00
|
|
|
char numbuf[32];
|
2009-07-03 02:44:48 +04:00
|
|
|
char blkbuf[BP_SPRINTF_LEN + 20];
|
2016-06-16 00:28:36 +03:00
|
|
|
char osname[ZFS_MAX_DATASET_NAME_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
char *type = "UNKNOWN";
|
|
|
|
int verbosity = dump_opt['d'];
|
|
|
|
int print_header = 1;
|
|
|
|
int i, error;
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_objset_fast_stat(os, &dds);
|
2013-09-04 16:00:57 +04:00
|
|
|
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (dds.dds_type < DMU_OST_NUMTYPES)
|
|
|
|
type = objset_types[dds.dds_type];
|
|
|
|
|
|
|
|
if (dds.dds_type == DMU_OST_META) {
|
|
|
|
dds.dds_creation_txg = TXG_INITIAL;
|
2014-06-06 01:19:08 +04:00
|
|
|
usedobjs = BP_GET_FILL(os->os_rootbp);
|
2015-04-01 18:14:34 +03:00
|
|
|
refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
|
|
|
|
dd_used_bytes;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
|
|
|
|
}
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(refdbytes, numbuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (verbosity >= 4) {
|
2013-12-09 22:37:51 +04:00
|
|
|
(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
|
|
|
|
(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
|
|
|
|
sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
blkbuf[0] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_objset_name(os, osname);
|
|
|
|
|
|
|
|
(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
|
|
|
|
"%s, %llu objects%s\n",
|
|
|
|
osname, type, (u_longlong_t)dmu_objset_id(os),
|
|
|
|
(u_longlong_t)dds.dds_creation_txg,
|
|
|
|
numbuf, (u_longlong_t)usedobjs, blkbuf);
|
|
|
|
|
|
|
|
if (zopt_objects != 0) {
|
|
|
|
for (i = 0; i < zopt_objects; i++)
|
|
|
|
dump_object(os, zopt_object[i], verbosity,
|
|
|
|
&print_header);
|
|
|
|
(void) printf("\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['i'] != 0 || verbosity >= 2)
|
|
|
|
dump_intent_log(dmu_objset_zil(os));
|
|
|
|
|
|
|
|
if (dmu_objset_ds(os) != NULL)
|
|
|
|
dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
|
|
|
|
|
|
|
|
if (verbosity < 2)
|
|
|
|
return;
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (BP_IS_HOLE(os->os_rootbp))
|
2010-05-29 00:45:14 +04:00
|
|
|
return;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_object(os, 0, verbosity, &print_header);
|
2009-07-03 02:44:48 +04:00
|
|
|
object_count = 0;
|
2010-08-27 01:24:34 +04:00
|
|
|
if (DMU_USERUSED_DNODE(os) != NULL &&
|
|
|
|
DMU_USERUSED_DNODE(os)->dn_type != 0) {
|
2009-07-03 02:44:48 +04:00
|
|
|
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
|
|
|
|
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
object = 0;
|
|
|
|
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
|
|
|
|
dump_object(os, object, verbosity, &print_header);
|
|
|
|
object_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT3U(object_count, ==, usedobjs);
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (error != ESRCH) {
|
|
|
|
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
|
|
|
|
abort();
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
time_t timestamp = ub->ub_timestamp;
|
|
|
|
|
2010-08-26 20:52:40 +04:00
|
|
|
(void) printf("%s", header ? header : "");
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
|
|
|
|
(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
|
|
|
|
(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
|
|
|
|
(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
|
|
|
|
(void) printf("\ttimestamp = %llu UTC = %s",
|
|
|
|
(u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp)));
|
|
|
|
if (dump_opt['u'] >= 3) {
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\trootbp = %s\n", blkbuf);
|
|
|
|
}
|
2010-08-26 20:52:40 +04:00
|
|
|
(void) printf("%s", footer ? footer : "");
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_config(spa_t *spa)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_buf_t *db;
|
|
|
|
size_t nvsize = 0;
|
|
|
|
int error = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = dmu_bonus_hold(spa->spa_meta_objset,
|
|
|
|
spa->spa_config_object, FTAG, &db);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
nvsize = *(uint64_t *)db->db_data;
|
|
|
|
dmu_buf_rele(db, FTAG);
|
|
|
|
|
|
|
|
(void) printf("\nMOS Configuration:\n");
|
|
|
|
dump_packed_nvlist(spa->spa_meta_objset,
|
|
|
|
spa->spa_config_object, (void *)&nvsize, 1);
|
|
|
|
} else {
|
|
|
|
(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
|
|
|
|
(u_longlong_t)spa->spa_config_object, error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2008-12-03 23:09:06 +03:00
|
|
|
dump_cachefile(const char *cachefile)
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
struct stat64 statbuf;
|
|
|
|
char *buf;
|
|
|
|
nvlist_t *config;
|
|
|
|
|
|
|
|
if ((fd = open64(cachefile, O_RDONLY)) < 0) {
|
|
|
|
(void) printf("cannot open '%s': %s\n", cachefile,
|
|
|
|
strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fstat64(fd, &statbuf) != 0) {
|
|
|
|
(void) printf("failed to stat '%s': %s\n", cachefile,
|
|
|
|
strerror(errno));
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((buf = malloc(statbuf.st_size)) == NULL) {
|
|
|
|
(void) fprintf(stderr, "failed to allocate %llu bytes\n",
|
|
|
|
(u_longlong_t)statbuf.st_size);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
|
|
|
|
(void) fprintf(stderr, "failed to read %llu bytes\n",
|
|
|
|
(u_longlong_t)statbuf.st_size);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) close(fd);
|
|
|
|
|
|
|
|
if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
|
|
|
|
(void) fprintf(stderr, "failed to unpack nvlist\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
free(buf);
|
|
|
|
|
|
|
|
dump_nvlist(config, 0);
|
|
|
|
|
|
|
|
nvlist_free(config);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * Size of the buffer into which dump_label_uberblocks() formats each
 * "Uberblock[N]\n" header line.
 */
#define ZDB_MAX_UB_HEADER_SIZE 32
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
|
|
|
|
{
|
|
|
|
vdev_t vd;
|
|
|
|
vdev_t *vdp = &vd;
|
|
|
|
char header[ZDB_MAX_UB_HEADER_SIZE];
|
2010-08-26 20:52:39 +04:00
|
|
|
int i;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
vd.vdev_ashift = ashift;
|
|
|
|
vdp->vdev_top = vdp;
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
|
|
|
|
uberblock_t *ub = (void *)((char *)lbl + uoff);
|
|
|
|
|
|
|
|
if (uberblock_verify(ub))
|
|
|
|
continue;
|
|
|
|
(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
|
|
|
|
"Uberblock[%d]\n", i);
|
|
|
|
dump_uberblock(ub, header, "");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dump_label(const char *dev)
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
vdev_label_t label;
|
2010-05-29 00:45:14 +04:00
|
|
|
char *path, *buf = label.vl_vdev_phys.vp_nvlist;
|
2008-11-20 23:01:55 +03:00
|
|
|
size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
|
|
|
|
struct stat64 statbuf;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t psize, ashift;
|
|
|
|
int len = strlen(dev) + 1;
|
2010-08-26 20:52:39 +04:00
|
|
|
int l;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (strncmp(dev, "/dev/dsk/", 9) == 0) {
|
|
|
|
len++;
|
|
|
|
path = malloc(len);
|
|
|
|
(void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
|
|
|
|
} else {
|
|
|
|
path = strdup(dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((fd = open64(path, O_RDONLY)) < 0) {
|
|
|
|
(void) printf("cannot open '%s': %s\n", path, strerror(errno));
|
|
|
|
free(path);
|
2008-11-20 23:01:55 +03:00
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2010-12-14 20:50:37 +03:00
|
|
|
if (fstat64_blk(fd, &statbuf) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("failed to stat '%s': %s\n", path,
|
2008-11-20 23:01:55 +03:00
|
|
|
strerror(errno));
|
2010-05-29 00:45:14 +04:00
|
|
|
free(path);
|
|
|
|
(void) close(fd);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
psize = statbuf.st_size;
|
|
|
|
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (l = 0; l < VDEV_LABELS; l++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_t *config = NULL;
|
|
|
|
|
|
|
|
(void) printf("--------------------------------------------\n");
|
|
|
|
(void) printf("LABEL %d\n", l);
|
|
|
|
(void) printf("--------------------------------------------\n");
|
|
|
|
|
|
|
|
if (pread64(fd, &label, sizeof (label),
|
|
|
|
vdev_label_offset(psize, l, 0)) != sizeof (label)) {
|
|
|
|
(void) printf("failed to read label %d\n", l);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
|
|
|
|
(void) printf("failed to unpack label %d\n", l);
|
2010-05-29 00:45:14 +04:00
|
|
|
ashift = SPA_MINBLOCKSHIFT;
|
|
|
|
} else {
|
|
|
|
nvlist_t *vdev_tree = NULL;
|
|
|
|
|
|
|
|
dump_nvlist(config, 4);
|
|
|
|
if ((nvlist_lookup_nvlist(config,
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
|
|
|
|
(nvlist_lookup_uint64(vdev_tree,
|
|
|
|
ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
|
|
|
|
ashift = SPA_MINBLOCKSHIFT;
|
|
|
|
nvlist_free(config);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['u'])
|
|
|
|
dump_label_uberblocks(&label, ashift);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
free(path);
|
|
|
|
(void) close(fd);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2015-07-24 19:53:55 +03:00
|
|
|
/* Per-feature count of datasets that dump_one_dir() found using the feature. */
static uint64_t dataset_feature_count[SPA_FEATURES];
|
2014-11-03 23:15:08 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_one_dir(const char *dsname, void *arg)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
objset_t *os;
|
2015-07-24 19:53:55 +03:00
|
|
|
spa_feature_t f;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("Could not open %s, error %d\n", dsname, error);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
2015-07-24 19:53:55 +03:00
|
|
|
|
|
|
|
for (f = 0; f < SPA_FEATURES; f++) {
|
|
|
|
if (!dmu_objset_ds(os)->ds_feature_inuse[f])
|
|
|
|
continue;
|
|
|
|
ASSERT(spa_feature_table[f].fi_flags &
|
|
|
|
ZFEATURE_FLAG_PER_DATASET);
|
|
|
|
dataset_feature_count[f]++;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_dir(os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_objset_disown(os, FTAG);
|
2008-11-20 23:01:55 +03:00
|
|
|
fuid_table_destroy();
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_loaded = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Block statistics.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2014-11-03 23:15:08 +03:00
|
|
|
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef struct zdb_blkstats {
|
2013-03-25 01:24:51 +04:00
|
|
|
uint64_t zb_asize;
|
|
|
|
uint64_t zb_lsize;
|
|
|
|
uint64_t zb_psize;
|
|
|
|
uint64_t zb_count;
|
2014-11-03 22:12:40 +03:00
|
|
|
uint64_t zb_gangs;
|
|
|
|
uint64_t zb_ditto_samevdev;
|
2013-03-25 01:24:51 +04:00
|
|
|
uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
|
2008-11-20 23:01:55 +03:00
|
|
|
} zdb_blkstats_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Extended object types to report deferred frees and dedup auto-ditto blocks.
|
|
|
|
*/
|
|
|
|
#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
|
|
|
|
#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
|
2012-12-14 03:24:15 +04:00
|
|
|
#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
|
|
|
|
#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
static char *zdb_ot_extname[] = {
|
|
|
|
"deferred free",
|
|
|
|
"dedup ditto",
|
2012-12-14 03:24:15 +04:00
|
|
|
"other",
|
2010-05-29 00:45:14 +04:00
|
|
|
"Total",
|
|
|
|
};
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/* Level index of the zdb_cb_t zcb_type[][] row that aggregates all levels. */
#define ZB_TOTAL DN_MAX_LEVELS
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
typedef struct zdb_cb {
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
|
|
|
|
uint64_t zcb_dedup_asize;
|
|
|
|
uint64_t zcb_dedup_blocks;
|
2014-06-06 01:19:08 +04:00
|
|
|
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
|
|
|
|
uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
|
|
|
|
[BPE_PAYLOAD_SIZE];
|
2013-03-25 01:24:51 +04:00
|
|
|
uint64_t zcb_start;
|
|
|
|
uint64_t zcb_lastprint;
|
|
|
|
uint64_t zcb_totalasize;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t zcb_errors[256];
|
|
|
|
int zcb_readfails;
|
|
|
|
int zcb_haderrors;
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *zcb_spa;
|
2008-11-20 23:01:55 +03:00
|
|
|
} zdb_cb_t;
|
|
|
|
|
|
|
|
static void
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
|
|
|
dmu_object_type_t type)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t refcnt = 0;
|
2010-08-26 20:52:39 +04:00
|
|
|
int i;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
ASSERT(type < ZDB_OT_TOTAL);
|
|
|
|
|
|
|
|
if (zilog && zil_bp_tree_add(zilog, bp) != 0)
|
|
|
|
return;
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (i = 0; i < 4; i++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
|
2010-05-29 00:45:14 +04:00
|
|
|
int t = (i & 1) ? type : ZDB_OT_TOTAL;
|
2014-11-03 22:12:40 +03:00
|
|
|
int equal;
|
2008-11-20 23:01:55 +03:00
|
|
|
zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
|
|
|
|
|
|
|
|
zb->zb_asize += BP_GET_ASIZE(bp);
|
|
|
|
zb->zb_lsize += BP_GET_LSIZE(bp);
|
|
|
|
zb->zb_psize += BP_GET_PSIZE(bp);
|
|
|
|
zb->zb_count++;
|
2014-11-03 23:15:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The histogram is only big enough to record blocks up to
|
|
|
|
* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
|
|
|
|
* "other", bucket.
|
|
|
|
*/
|
|
|
|
int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
|
|
|
|
idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
|
|
|
|
zb->zb_psize_histogram[idx]++;
|
2014-11-03 22:12:40 +03:00
|
|
|
|
|
|
|
zb->zb_gangs += BP_COUNT_GANG(bp);
|
|
|
|
|
|
|
|
switch (BP_GET_NDVAS(bp)) {
|
|
|
|
case 2:
|
|
|
|
if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
|
|
|
DVA_GET_VDEV(&bp->blk_dva[1]))
|
|
|
|
zb->zb_ditto_samevdev++;
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
|
|
|
DVA_GET_VDEV(&bp->blk_dva[1])) +
|
|
|
|
(DVA_GET_VDEV(&bp->blk_dva[0]) ==
|
|
|
|
DVA_GET_VDEV(&bp->blk_dva[2])) +
|
|
|
|
(DVA_GET_VDEV(&bp->blk_dva[1]) ==
|
|
|
|
DVA_GET_VDEV(&bp->blk_dva[2]));
|
|
|
|
if (equal != 0)
|
|
|
|
zb->zb_ditto_samevdev++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (BP_IS_EMBEDDED(bp)) {
|
|
|
|
zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
|
|
|
|
zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
|
|
|
|
[BPE_GET_PSIZE(bp)]++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['L'])
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (BP_GET_DEDUP(bp)) {
|
|
|
|
ddt_t *ddt;
|
|
|
|
ddt_entry_t *dde;
|
|
|
|
|
|
|
|
ddt = ddt_select(zcb->zcb_spa, bp);
|
|
|
|
ddt_enter(ddt);
|
|
|
|
dde = ddt_lookup(ddt, bp, B_FALSE);
|
|
|
|
|
|
|
|
if (dde == NULL) {
|
|
|
|
refcnt = 0;
|
|
|
|
} else {
|
|
|
|
ddt_phys_t *ddp = ddt_phys_select(dde, bp);
|
|
|
|
ddt_phys_decref(ddp);
|
|
|
|
refcnt = ddp->ddp_refcnt;
|
|
|
|
if (ddt_phys_total_refcnt(dde) == 0)
|
|
|
|
ddt_remove(ddt, dde);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
ddt_exit(ddt);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
|
|
|
|
refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
|
|
|
|
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-05-03 03:36:32 +04:00
|
|
|
static void
|
|
|
|
zdb_blkptr_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
int ioerr = zio->io_error;
|
|
|
|
zdb_cb_t *zcb = zio->io_private;
|
2014-06-25 22:37:59 +04:00
|
|
|
zbookmark_phys_t *zb = &zio->io_bookmark;
|
2013-05-03 03:36:32 +04:00
|
|
|
|
|
|
|
zio_data_buf_free(zio->io_data, zio->io_size);
|
|
|
|
|
|
|
|
mutex_enter(&spa->spa_scrub_lock);
|
|
|
|
spa->spa_scrub_inflight--;
|
|
|
|
cv_broadcast(&spa->spa_scrub_io_cv);
|
|
|
|
|
|
|
|
if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
|
|
|
|
zcb->zcb_haderrors = 1;
|
|
|
|
zcb->zcb_errors[ioerr]++;
|
|
|
|
|
|
|
|
if (dump_opt['b'] >= 2)
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
2013-05-03 03:36:32 +04:00
|
|
|
else
|
|
|
|
blkbuf[0] = '\0';
|
|
|
|
|
|
|
|
(void) printf("zdb_blkptr_cb: "
|
|
|
|
"Got error %d reading "
|
|
|
|
"<%llu, %llu, %lld, %llx> %s -- skipping\n",
|
|
|
|
ioerr,
|
|
|
|
(u_longlong_t)zb->zb_objset,
|
|
|
|
(u_longlong_t)zb->zb_object,
|
|
|
|
(u_longlong_t)zb->zb_level,
|
|
|
|
(u_longlong_t)zb->zb_blkid,
|
|
|
|
blkbuf);
|
|
|
|
}
|
|
|
|
mutex_exit(&spa->spa_scrub_lock);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
2013-07-03 00:26:24 +04:00
|
|
|
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
2014-06-25 22:37:59 +04:00
|
|
|
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
zdb_cb_t *zcb = arg;
|
2009-07-03 02:44:48 +04:00
|
|
|
dmu_object_type_t type;
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t is_metadata;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
if (bp == NULL)
|
|
|
|
return (0);
|
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
|
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
|
|
|
(void) printf("objset %llu object %llu "
|
|
|
|
"level %lld offset 0x%llx %s\n",
|
|
|
|
(u_longlong_t)zb->zb_objset,
|
|
|
|
(u_longlong_t)zb->zb_object,
|
|
|
|
(longlong_t)zb->zb_level,
|
|
|
|
(u_longlong_t)blkid2offset(dnp, bp, zb),
|
|
|
|
blkbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (BP_IS_HOLE(bp))
|
2008-12-03 23:09:06 +03:00
|
|
|
return (0);
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
type = BP_GET_TYPE(bp);
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
zdb_count_block(zcb, zilog, bp,
|
|
|
|
(type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
if (!BP_IS_EMBEDDED(bp) &&
|
|
|
|
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
|
2010-05-29 00:45:14 +04:00
|
|
|
size_t size = BP_GET_PSIZE(bp);
|
2013-05-03 03:36:32 +04:00
|
|
|
void *data = zio_data_buf_alloc(size);
|
2010-05-29 00:45:14 +04:00
|
|
|
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* If it's an intent log block, failure is expected. */
|
|
|
|
if (zb->zb_level == ZB_ZIL_LEVEL)
|
|
|
|
flags |= ZIO_FLAG_SPECULATIVE;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2013-05-03 03:36:32 +04:00
|
|
|
mutex_enter(&spa->spa_scrub_lock);
|
|
|
|
while (spa->spa_scrub_inflight > max_inflight)
|
|
|
|
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
|
|
|
|
spa->spa_scrub_inflight++;
|
|
|
|
mutex_exit(&spa->spa_scrub_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2013-05-03 03:36:32 +04:00
|
|
|
zio_nowait(zio_read(NULL, spa, bp, data, size,
|
|
|
|
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
zcb->zcb_readfails = 0;
|
|
|
|
|
2015-05-15 02:41:29 +03:00
|
|
|
/* only call gethrtime() every 100 blocks */
|
|
|
|
static int iters;
|
|
|
|
if (++iters > 100)
|
|
|
|
iters = 0;
|
|
|
|
else
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
|
2013-03-25 01:24:51 +04:00
|
|
|
uint64_t now = gethrtime();
|
|
|
|
char buf[10];
|
|
|
|
uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
|
|
|
|
int kb_per_sec =
|
|
|
|
1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
|
|
|
|
int sec_remaining =
|
|
|
|
(zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
|
|
|
|
|
|
|
|
zfs_nicenum(bytes, buf, sizeof (buf));
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"\r%5s completed (%4dMB/s) "
|
|
|
|
"estimated time remaining: %uhr %02umin %02usec ",
|
|
|
|
buf, kb_per_sec / 1024,
|
|
|
|
sec_remaining / 60 / 60,
|
|
|
|
sec_remaining / 60 % 60,
|
|
|
|
sec_remaining % 60);
|
|
|
|
|
|
|
|
zcb->zcb_lastprint = now;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
zdb_leak(void *arg, uint64_t start, uint64_t size)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
vdev_t *vd = arg;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
|
|
|
|
(u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
|
|
|
|
}
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
/*
 * Metaslab ops installed by zdb_leak_init() on every metaslab it loads for
 * leak detection.  The only member initialised is the allocator, and it is
 * NULL.
 */
static metaslab_ops_t zdb_metaslab_ops = {
	NULL	/* alloc */
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
|
|
|
|
{
|
|
|
|
ddt_bookmark_t ddb = { 0 };
|
|
|
|
ddt_entry_t dde;
|
|
|
|
int error;
|
2010-08-26 20:52:39 +04:00
|
|
|
int p;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
|
|
|
|
blkptr_t blk;
|
|
|
|
ddt_phys_t *ddp = dde.dde_phys;
|
|
|
|
|
|
|
|
if (ddb.ddb_class == DDT_CLASS_UNIQUE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (ddp->ddp_phys_birth == 0)
|
|
|
|
continue;
|
|
|
|
ddt_bp_create(ddb.ddb_checksum,
|
|
|
|
&dde.dde_key, ddp, &blk);
|
|
|
|
if (p == DDT_PHYS_DITTO) {
|
|
|
|
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
|
|
|
|
} else {
|
|
|
|
zcb->zcb_dedup_asize +=
|
|
|
|
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
|
|
|
|
zcb->zcb_dedup_blocks++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!dump_opt['L']) {
|
|
|
|
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
|
|
|
|
ddt_enter(ddt);
|
|
|
|
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
|
|
|
|
ddt_exit(ddt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(error == ENOENT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
|
|
|
|
{
|
|
|
|
zcb->zcb_spa = spa;
|
2014-09-17 00:24:48 +04:00
|
|
|
uint64_t c, m;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (!dump_opt['L']) {
|
|
|
|
vdev_t *rvd = spa->spa_root_vdev;
|
2010-08-26 20:52:39 +04:00
|
|
|
for (c = 0; c < rvd->vdev_children; c++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
vdev_t *vd = rvd->vdev_child[c];
|
2010-08-26 20:52:39 +04:00
|
|
|
for (m = 0; m < vd->vdev_ms_count; m++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_t *msp = vd->vdev_ms[m];
|
|
|
|
mutex_enter(&msp->ms_lock);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
metaslab_unload(msp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For leak detection, we overload the metaslab
|
|
|
|
* ms_tree to contain allocated segments
|
|
|
|
* instead of free segments. As a result,
|
|
|
|
* we can't use the normal metaslab_load/unload
|
|
|
|
* interfaces.
|
|
|
|
*/
|
|
|
|
if (msp->ms_sm != NULL) {
|
2014-09-17 00:24:48 +04:00
|
|
|
(void) fprintf(stderr,
|
|
|
|
"\rloading space map for "
|
|
|
|
"vdev %llu of %llu, "
|
|
|
|
"metaslab %llu of %llu ...",
|
|
|
|
(longlong_t)c,
|
|
|
|
(longlong_t)rvd->vdev_children,
|
|
|
|
(longlong_t)m,
|
|
|
|
(longlong_t)vd->vdev_ms_count);
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
msp->ms_ops = &zdb_metaslab_ops;
|
2015-05-15 02:41:29 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to spend the CPU
|
|
|
|
* manipulating the size-ordered
|
|
|
|
* tree, so clear the range_tree
|
|
|
|
* ops.
|
|
|
|
*/
|
|
|
|
msp->ms_tree->rt_ops = NULL;
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
VERIFY0(space_map_load(msp->ms_sm,
|
|
|
|
msp->ms_tree, SM_ALLOC));
|
|
|
|
msp->ms_loaded = B_TRUE;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&msp->ms_lock);
|
|
|
|
}
|
|
|
|
}
|
2014-09-17 00:24:48 +04:00
|
|
|
(void) fprintf(stderr, "\n");
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
|
|
|
|
|
|
|
zdb_ddt_leak_init(spa, zcb);
|
|
|
|
|
|
|
|
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_leak_fini(spa_t *spa)
|
|
|
|
{
|
2010-08-26 20:52:39 +04:00
|
|
|
int c, m;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!dump_opt['L']) {
|
|
|
|
vdev_t *rvd = spa->spa_root_vdev;
|
2010-08-26 20:52:39 +04:00
|
|
|
for (c = 0; c < rvd->vdev_children; c++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
vdev_t *vd = rvd->vdev_child[c];
|
2010-08-26 20:52:39 +04:00
|
|
|
for (m = 0; m < vd->vdev_ms_count; m++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
metaslab_t *msp = vd->vdev_ms[m];
|
|
|
|
mutex_enter(&msp->ms_lock);
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The ms_tree has been overloaded to
|
|
|
|
* contain allocated segments. Now that we
|
|
|
|
* finished traversing all blocks, any
|
|
|
|
* block that remains in the ms_tree
|
|
|
|
* represents an allocated block that we
|
|
|
|
* did not claim during the traversal.
|
|
|
|
* Claimed blocks would have been removed
|
|
|
|
* from the ms_tree.
|
|
|
|
*/
|
|
|
|
range_tree_vacate(msp->ms_tree, zdb_leak, vd);
|
|
|
|
msp->ms_loaded = B_FALSE;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&msp->ms_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
zdb_cb_t *zcb = arg;
|
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
if (dump_opt['b'] >= 5) {
|
2010-05-29 00:45:14 +04:00
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("[%s] %s\n",
|
|
|
|
"deferred free", blkbuf);
|
|
|
|
}
|
|
|
|
zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static int
|
|
|
|
dump_block_stats(spa_t *spa)
|
|
|
|
{
|
2010-08-26 20:52:41 +04:00
|
|
|
zdb_cb_t zcb;
|
2008-11-20 23:01:55 +03:00
|
|
|
zdb_blkstats_t *zb, *tzb;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t norm_alloc, norm_space, total_alloc, total_found;
|
|
|
|
int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
|
2014-06-06 01:19:08 +04:00
|
|
|
boolean_t leaks = B_FALSE;
|
2014-09-17 10:59:43 +04:00
|
|
|
int e, c;
|
2014-06-06 01:19:08 +04:00
|
|
|
bp_embedded_type_t i;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
|
2010-05-29 00:45:14 +04:00
|
|
|
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
|
|
|
|
(dump_opt['c'] == 1) ? "metadata " : "",
|
|
|
|
dump_opt['c'] ? "checksums " : "",
|
|
|
|
(dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
|
|
|
|
!dump_opt['L'] ? "nothing leaked " : "");
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2008-12-03 23:09:06 +03:00
|
|
|
* Load all space maps as SM_ALLOC maps, then traverse the pool
|
|
|
|
* claiming each block we discover. If the pool is perfectly
|
|
|
|
* consistent, the space maps will be empty when we're done.
|
|
|
|
* Anything left over is a leak; any block we can't claim (because
|
|
|
|
* it's not part of any space map) is a double allocation,
|
|
|
|
* reference to a freed block, or an unclaimed log block.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2013-11-01 23:26:11 +04:00
|
|
|
bzero(&zcb, sizeof (zdb_cb_t));
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_leak_init(spa, &zcb);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's a deferred-free bplist, process that first.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
|
|
|
|
count_block_cb, &zcb, NULL);
|
2013-09-04 16:00:57 +04:00
|
|
|
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
|
|
|
|
(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
|
|
|
|
count_block_cb, &zcb, NULL);
|
|
|
|
}
|
2013-10-08 21:13:05 +04:00
|
|
|
if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
|
2012-12-14 03:24:15 +04:00
|
|
|
VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
|
|
|
|
spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
|
|
|
|
&zcb, NULL));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['c'] > 1)
|
|
|
|
flags |= TRAVERSE_PREFETCH_DATA;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
|
|
|
|
zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
|
2010-05-29 00:45:14 +04:00
|
|
|
zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-05-03 03:36:32 +04:00
|
|
|
/*
|
|
|
|
* If we've traversed the data blocks then we need to wait for those
|
|
|
|
* I/Os to complete. We leverage "The Godfather" zio to wait on
|
|
|
|
* all async I/Os to complete.
|
|
|
|
*/
|
|
|
|
if (dump_opt['c']) {
|
2014-09-17 10:59:43 +04:00
|
|
|
for (c = 0; c < max_ncpus; c++) {
|
|
|
|
(void) zio_wait(spa->spa_async_zio_root[c]);
|
|
|
|
spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
|
|
|
|
ZIO_FLAG_GODFATHER);
|
|
|
|
}
|
2013-05-03 03:36:32 +04:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zcb.zcb_haderrors) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) printf("\nError counts:\n\n");
|
|
|
|
(void) printf("\t%5s %s\n", "errno", "count");
|
2010-08-26 20:52:39 +04:00
|
|
|
for (e = 0; e < 256; e++) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zcb.zcb_errors[e] != 0) {
|
|
|
|
(void) printf("\t%5d %llu\n",
|
|
|
|
e, (u_longlong_t)zcb.zcb_errors[e]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Report any leaked segments.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_leak_fini(spa);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
|
|
|
|
norm_space = metaslab_class_get_space(spa_normal_class(spa));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
|
|
|
|
total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (total_found == total_alloc) {
|
2009-01-16 00:59:39 +03:00
|
|
|
if (!dump_opt['L'])
|
|
|
|
(void) printf("\n\tNo leaks (block sum matches space"
|
|
|
|
" maps exactly)\n");
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
(void) printf("block traversal size %llu != alloc %llu "
|
2009-01-16 00:59:39 +03:00
|
|
|
"(%s %lld)\n",
|
2010-05-29 00:45:14 +04:00
|
|
|
(u_longlong_t)total_found,
|
|
|
|
(u_longlong_t)total_alloc,
|
2009-01-16 00:59:39 +03:00
|
|
|
(dump_opt['L']) ? "unreachable" : "leaked",
|
2010-05-29 00:45:14 +04:00
|
|
|
(longlong_t)(total_alloc - total_found));
|
2014-06-06 01:19:08 +04:00
|
|
|
leaks = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (tzb->zb_count == 0)
|
|
|
|
return (2);
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
(void) printf("\tbp count: %10llu\n",
|
|
|
|
(u_longlong_t)tzb->zb_count);
|
2014-11-03 22:12:40 +03:00
|
|
|
(void) printf("\tganged count: %10llu\n",
|
|
|
|
(longlong_t)tzb->zb_gangs);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tbp logical: %10llu avg: %6llu\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)tzb->zb_lsize,
|
|
|
|
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tbp physical: %10llu avg:"
|
|
|
|
" %6llu compression: %6.2f\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)tzb->zb_psize,
|
|
|
|
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
|
|
|
|
(double)tzb->zb_lsize / tzb->zb_psize);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tbp allocated: %10llu avg:"
|
|
|
|
" %6llu compression: %6.2f\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)tzb->zb_asize,
|
|
|
|
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
|
|
|
|
(double)tzb->zb_lsize / tzb->zb_asize);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("\tbp deduped: %10llu ref>1:"
|
|
|
|
" %6llu deduplication: %6.2f\n",
|
|
|
|
(u_longlong_t)zcb.zcb_dedup_asize,
|
|
|
|
(u_longlong_t)zcb.zcb_dedup_blocks,
|
|
|
|
(double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
|
|
|
|
(void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
|
|
|
|
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-06-06 01:19:08 +04:00
|
|
|
for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
|
|
|
|
if (zcb.zcb_embedded_blocks[i] == 0)
|
|
|
|
continue;
|
|
|
|
(void) printf("\n");
|
|
|
|
(void) printf("\tadditional, non-pointer bps of type %u: "
|
|
|
|
"%10llu\n",
|
|
|
|
i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
|
|
|
|
|
|
|
|
if (dump_opt['b'] >= 3) {
|
|
|
|
(void) printf("\t number of (compressed) bytes: "
|
|
|
|
"number of bps\n");
|
|
|
|
dump_histogram(zcb.zcb_embedded_histogram[i],
|
|
|
|
sizeof (zcb.zcb_embedded_histogram[i]) /
|
|
|
|
sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-03 22:12:40 +03:00
|
|
|
if (tzb->zb_ditto_samevdev != 0) {
|
|
|
|
(void) printf("\tDittoed blocks on same vdev: %llu\n",
|
|
|
|
(longlong_t)tzb->zb_ditto_samevdev);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dump_opt['b'] >= 2) {
|
|
|
|
int l, t, level;
|
|
|
|
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
|
|
|
|
"\t avg\t comp\t%%Total\tType\n");
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
for (t = 0; t <= ZDB_OT_TOTAL; t++) {
|
|
|
|
char csize[32], lsize[32], psize[32], asize[32];
|
2014-11-03 22:12:40 +03:00
|
|
|
char avg[32], gang[32];
|
2008-11-20 23:01:55 +03:00
|
|
|
char *typename;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (t < DMU_OT_NUMTYPES)
|
|
|
|
typename = dmu_ot[t].ot_name;
|
|
|
|
else
|
|
|
|
typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
|
|
|
|
(void) printf("%6s\t%5s\t%5s\t%5s"
|
|
|
|
"\t%5s\t%5s\t%6s\t%s\n",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
"-",
|
|
|
|
typename);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (l = ZB_TOTAL - 1; l >= -1; l--) {
|
|
|
|
level = (l == -1 ? ZB_TOTAL : l);
|
|
|
|
zb = &zcb.zcb_type[level][t];
|
|
|
|
|
|
|
|
if (zb->zb_asize == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (dump_opt['b'] < 3 && level != ZB_TOTAL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (level == 0 && zb->zb_asize ==
|
|
|
|
zcb.zcb_type[ZB_TOTAL][t].zb_asize)
|
|
|
|
continue;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_nicenum(zb->zb_count, csize);
|
|
|
|
zdb_nicenum(zb->zb_lsize, lsize);
|
|
|
|
zdb_nicenum(zb->zb_psize, psize);
|
|
|
|
zdb_nicenum(zb->zb_asize, asize);
|
|
|
|
zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
|
2014-11-03 22:12:40 +03:00
|
|
|
zdb_nicenum(zb->zb_gangs, gang);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
|
|
|
|
"\t%5.2f\t%6.2f\t",
|
|
|
|
csize, lsize, psize, asize, avg,
|
|
|
|
(double)zb->zb_lsize / zb->zb_psize,
|
|
|
|
100.0 * zb->zb_asize / tzb->zb_asize);
|
|
|
|
|
|
|
|
if (level == ZB_TOTAL)
|
|
|
|
(void) printf("%s\n", typename);
|
|
|
|
else
|
|
|
|
(void) printf(" L%d %s\n",
|
|
|
|
level, typename);
|
2013-03-25 01:24:51 +04:00
|
|
|
|
2014-11-03 22:12:40 +03:00
|
|
|
if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
|
|
|
|
(void) printf("\t number of ganged "
|
|
|
|
"blocks: %s\n", gang);
|
|
|
|
}
|
|
|
|
|
2013-03-25 01:24:51 +04:00
|
|
|
if (dump_opt['b'] >= 4) {
|
|
|
|
(void) printf("psize "
|
|
|
|
"(in 512-byte sectors): "
|
|
|
|
"number of blocks\n");
|
|
|
|
dump_histogram(zb->zb_psize_histogram,
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
PSIZE_HISTO_SIZE, 0);
|
2013-03-25 01:24:51 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) printf("\n");
|
|
|
|
|
|
|
|
if (leaks)
|
|
|
|
return (2);
|
|
|
|
|
|
|
|
if (zcb.zcb_haderrors)
|
|
|
|
return (3);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
typedef struct zdb_ddt_entry {
|
|
|
|
ddt_key_t zdde_key;
|
|
|
|
uint64_t zdde_ref_blocks;
|
|
|
|
uint64_t zdde_ref_lsize;
|
|
|
|
uint64_t zdde_ref_psize;
|
|
|
|
uint64_t zdde_ref_dsize;
|
|
|
|
avl_node_t zdde_node;
|
|
|
|
} zdb_ddt_entry_t;
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
2014-06-25 22:37:59 +04:00
|
|
|
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
avl_tree_t *t = arg;
|
|
|
|
avl_index_t where;
|
|
|
|
zdb_ddt_entry_t *zdde, zdde_search;
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
|
2010-05-29 00:45:14 +04:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
|
|
|
|
(void) printf("traversing objset %llu, %llu objects, "
|
|
|
|
"%lu blocks so far\n",
|
|
|
|
(u_longlong_t)zb->zb_objset,
|
2014-06-06 01:19:08 +04:00
|
|
|
(u_longlong_t)BP_GET_FILL(bp),
|
2010-05-29 00:45:14 +04:00
|
|
|
avl_numnodes(t));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
|
2012-12-14 03:24:15 +04:00
|
|
|
BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
|
2010-05-29 00:45:14 +04:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
ddt_key_fill(&zdde_search.zdde_key, bp);
|
|
|
|
|
|
|
|
zdde = avl_find(t, &zdde_search, &where);
|
|
|
|
|
|
|
|
if (zdde == NULL) {
|
|
|
|
zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
|
|
|
|
zdde->zdde_key = zdde_search.zdde_key;
|
|
|
|
avl_insert(t, zdde, where);
|
|
|
|
}
|
|
|
|
|
|
|
|
zdde->zdde_ref_blocks += 1;
|
|
|
|
zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
|
|
|
|
zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
|
|
|
|
zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_simulated_ddt(spa_t *spa)
|
|
|
|
{
|
|
|
|
avl_tree_t t;
|
|
|
|
void *cookie = NULL;
|
|
|
|
zdb_ddt_entry_t *zdde;
|
2010-08-26 20:52:41 +04:00
|
|
|
ddt_histogram_t ddh_total;
|
|
|
|
ddt_stat_t dds_total;
|
|
|
|
|
|
|
|
bzero(&ddh_total, sizeof (ddt_histogram_t));
|
|
|
|
bzero(&dds_total, sizeof (ddt_stat_t));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
avl_create(&t, ddt_entry_compare,
|
|
|
|
sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
|
|
|
|
|
|
|
|
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
|
|
|
|
|
|
|
(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
|
|
|
|
zdb_ddt_add_cb, &t);
|
|
|
|
|
|
|
|
spa_config_exit(spa, SCL_CONFIG, FTAG);
|
|
|
|
|
|
|
|
while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
|
|
|
|
ddt_stat_t dds;
|
|
|
|
uint64_t refcnt = zdde->zdde_ref_blocks;
|
|
|
|
ASSERT(refcnt != 0);
|
|
|
|
|
|
|
|
dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
|
|
|
|
dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
|
|
|
|
dds.dds_psize = zdde->zdde_ref_psize / refcnt;
|
|
|
|
dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
|
|
|
|
|
|
|
|
dds.dds_ref_blocks = zdde->zdde_ref_blocks;
|
|
|
|
dds.dds_ref_lsize = zdde->zdde_ref_lsize;
|
|
|
|
dds.dds_ref_psize = zdde->zdde_ref_psize;
|
|
|
|
dds.dds_ref_dsize = zdde->zdde_ref_dsize;
|
|
|
|
|
2014-04-16 07:40:22 +04:00
|
|
|
ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
|
|
|
|
&dds, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
umem_free(zdde, sizeof (*zdde));
|
|
|
|
}
|
|
|
|
|
|
|
|
avl_destroy(&t);
|
|
|
|
|
|
|
|
ddt_histogram_stat(&dds_total, &ddh_total);
|
|
|
|
|
|
|
|
(void) printf("Simulated DDT histogram:\n");
|
|
|
|
|
|
|
|
zpool_dump_ddt(&dds_total, &ddh_total);
|
|
|
|
|
|
|
|
dump_dedup_ratio(&dds_total);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dump_zpool(spa_t *spa)
|
|
|
|
{
|
|
|
|
dsl_pool_t *dp = spa_get_dsl(spa);
|
|
|
|
int rc = 0;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['S']) {
|
|
|
|
dump_simulated_ddt(spa);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!dump_opt['e'] && dump_opt['C'] > 1) {
|
|
|
|
(void) printf("\nCached configuration:\n");
|
|
|
|
dump_nvlist(spa->spa_config, 8);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_opt['C'])
|
|
|
|
dump_config(spa);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dump_opt['u'])
|
2010-05-29 00:45:14 +04:00
|
|
|
dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['D'])
|
|
|
|
dump_all_ddts(spa);
|
|
|
|
|
|
|
|
if (dump_opt['d'] > 2 || dump_opt['m'])
|
|
|
|
dump_metaslabs(spa);
|
2014-07-20 00:19:24 +04:00
|
|
|
if (dump_opt['M'])
|
|
|
|
dump_metaslab_groups(spa);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (dump_opt['d'] || dump_opt['i']) {
|
2015-07-24 19:53:55 +03:00
|
|
|
spa_feature_t f;
|
2015-06-25 07:05:32 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_dir(dp->dp_meta_objset);
|
|
|
|
if (dump_opt['d'] >= 3) {
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(&spa->spa_deferred_bpobj,
|
2013-07-05 23:37:16 +04:00
|
|
|
"Deferred frees", 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
|
2015-04-27 01:27:36 +03:00
|
|
|
dump_full_bpobj(
|
|
|
|
&spa->spa_dsl_pool->dp_free_bpobj,
|
2013-07-05 23:37:16 +04:00
|
|
|
"Pool snapshot frees", 0);
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (spa_feature_is_active(spa,
|
2013-10-08 21:13:05 +04:00
|
|
|
SPA_FEATURE_ASYNC_DESTROY)) {
|
2012-12-14 03:24:15 +04:00
|
|
|
dump_bptree(spa->spa_meta_objset,
|
|
|
|
spa->spa_dsl_pool->dp_bptree_obj,
|
|
|
|
"Pool dataset frees");
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_dtl(spa->spa_root_vdev, 0);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
|
|
|
|
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
|
2014-11-03 23:15:08 +03:00
|
|
|
|
2015-07-24 19:53:55 +03:00
|
|
|
for (f = 0; f < SPA_FEATURES; f++) {
|
|
|
|
uint64_t refcount;
|
|
|
|
|
|
|
|
if (!(spa_feature_table[f].fi_flags &
|
|
|
|
ZFEATURE_FLAG_PER_DATASET)) {
|
|
|
|
ASSERT0(dataset_feature_count[f]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (feature_get_refcount(spa, &spa_feature_table[f],
|
|
|
|
&refcount) == ENOTSUP)
|
|
|
|
continue;
|
|
|
|
if (dataset_feature_count[f] != refcount) {
|
|
|
|
(void) printf("%s feature refcount mismatch: "
|
|
|
|
"%lld datasets != %lld refcount\n",
|
|
|
|
spa_feature_table[f].fi_uname,
|
|
|
|
(longlong_t)dataset_feature_count[f],
|
2015-06-25 07:05:32 +03:00
|
|
|
(longlong_t)refcount);
|
|
|
|
rc = 2;
|
|
|
|
} else {
|
2015-07-24 19:53:55 +03:00
|
|
|
(void) printf("Verified %s feature refcount "
|
|
|
|
"of %llu is correct\n",
|
|
|
|
spa_feature_table[f].fi_uname,
|
2015-06-25 07:05:32 +03:00
|
|
|
(longlong_t)refcount);
|
|
|
|
}
|
2014-11-03 23:15:08 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2014-11-03 23:15:08 +03:00
|
|
|
if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
|
2008-11-20 23:01:55 +03:00
|
|
|
rc = dump_block_stats(spa);
|
|
|
|
|
Illumos #4101, #4102, #4103, #4105, #4106
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Prior to this patch, space_maps were preferred solely based on the
amount of free space left in each. Unfortunately, this heuristic didn't
contain any information about the make-up of that free space, which
meant we could keep preferring and loading a highly fragmented space map
that wouldn't actually have enough contiguous space to satisfy the
allocation; then unloading that space_map and repeating the process.
This change modifies the space_map's to store additional information
about the contiguous space in the space_map, so that we can use this
information to make a better decision about which space_map to load.
This requires reallocating all space_map objects to increase their
bonus buffer size sizes enough to fit the new metadata.
The above feature can be enabled via a new feature flag introduced by
this change: com.delphix:spacemap_histogram
In addition to the above, this patch allows the space_map block size to
be increase. Currently the block size is set to be 4K in size, which has
certain implications including the following:
* 4K sector devices will not see any compression benefit
* large space_maps require more metadata on-disk
* large space_maps require more time to load (typically random reads)
Now the space_map block size can adjust as needed up to the maximum size
set via the space_map_max_blksz variable.
A bug was fixed which resulted in potentially leaking an object when
removing a mirrored log device. The previous logic for vdev_remove() did
not deal with removing top-level vdevs that are interior vdevs (i.e.
mirror) correctly. The problem would occur when removing a mirrored log
device, and result in the DTL space map object being leaked; because
top-level vdevs don't have DTL space map objects associated with them.
References:
https://www.illumos.org/issues/4101
https://www.illumos.org/issues/4102
https://www.illumos.org/issues/4103
https://www.illumos.org/issues/4105
https://www.illumos.org/issues/4106
https://github.com/illumos/illumos-gate/commit/0713e23
Porting notes:
A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also,
the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary.
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2488
2013-10-02 01:25:53 +04:00
|
|
|
if (rc == 0)
|
|
|
|
rc = verify_spacemap_refcounts(spa);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dump_opt['s'])
|
|
|
|
show_pool_stats(spa);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['h'])
|
|
|
|
dump_history(spa);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (rc != 0)
|
|
|
|
exit(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Option bits passed as 'flags' to the block-dump helpers below.
 * ZDB_FLAG_BSWAP is the only bit those helpers test themselves (it
 * requests a byteswap before printing); the other bits are consumed
 * elsewhere in the file.
 */
#define	ZDB_FLAG_CHECKSUM	0x0001
#define	ZDB_FLAG_DECOMPRESS	0x0002
#define	ZDB_FLAG_BSWAP		0x0004
#define	ZDB_FLAG_GBH		0x0008
#define	ZDB_FLAG_INDIRECT	0x0010
#define	ZDB_FLAG_PHYS		0x0020
#define	ZDB_FLAG_RAW		0x0040
#define	ZDB_FLAG_PRINT_BLKPTR	0x0080

/*
 * NOTE(review): presumably indexed by flag character and holding the
 * matching ZDB_FLAG_* bit -- confirm where the table is populated.
 */
int flagbits[256];
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_print_blkptr(blkptr_t *bp, int flags)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
char blkbuf[BP_SPRINTF_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (flags & ZDB_FLAG_BSWAP)
|
|
|
|
byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2013-12-09 22:37:51 +04:00
|
|
|
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf("%s\n", blkbuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nbps; i++)
|
|
|
|
zdb_print_blkptr(&bp[i], flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_dump_gbh(void *buf, int flags)
|
|
|
|
{
|
|
|
|
zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_dump_block_raw(void *buf, uint64_t size, int flags)
|
|
|
|
{
|
|
|
|
if (flags & ZDB_FLAG_BSWAP)
|
|
|
|
byteswap_uint64_array(buf, size);
|
2010-08-26 20:52:40 +04:00
|
|
|
VERIFY(write(fileno(stdout), buf, size) == size);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
|
|
|
|
{
|
|
|
|
uint64_t *d = (uint64_t *)buf;
|
|
|
|
int nwords = size / sizeof (uint64_t);
|
|
|
|
int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
|
|
|
|
int i, j;
|
|
|
|
char *hdr, *c;
|
|
|
|
|
|
|
|
|
|
|
|
if (do_bswap)
|
|
|
|
hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
|
|
|
|
else
|
|
|
|
hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
|
|
|
|
|
|
|
|
(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
|
|
|
|
|
2015-11-21 02:47:37 +03:00
|
|
|
#ifdef _LITTLE_ENDIAN
|
|
|
|
/* correct the endianess */
|
|
|
|
do_bswap = !do_bswap;
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
for (i = 0; i < nwords; i += 2) {
|
|
|
|
(void) printf("%06llx: %016llx %016llx ",
|
|
|
|
(u_longlong_t)(i * sizeof (uint64_t)),
|
|
|
|
(u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
|
|
|
|
(u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
|
|
|
|
|
|
|
|
c = (char *)&d[i];
|
|
|
|
for (j = 0; j < 2 * sizeof (uint64_t); j++)
|
|
|
|
(void) printf("%c", isprint(c[j]) ? c[j] : '.');
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are two acceptable formats:
|
|
|
|
* leaf_name - For example: c1t0d0 or /tmp/ztest.0a
|
|
|
|
* child[.child]* - For example: 0.1.1
|
|
|
|
*
|
|
|
|
* The second form can be used to specify arbitrary vdevs anywhere
|
|
|
|
* in the heirarchy. For example, in a pool with a mirror of
|
|
|
|
* RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
|
|
|
|
*/
|
|
|
|
static vdev_t *
|
|
|
|
zdb_vdev_lookup(vdev_t *vdev, char *path)
|
|
|
|
{
|
|
|
|
char *s, *p, *q;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (vdev == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
/* First, assume the x.x.x.x format */
|
|
|
|
i = (int)strtoul(path, &s, 10);
|
|
|
|
if (s == path || (s && *s != '.' && *s != '\0'))
|
|
|
|
goto name;
|
|
|
|
if (i < 0 || i >= vdev->vdev_children)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
vdev = vdev->vdev_child[i];
|
|
|
|
if (*s == '\0')
|
|
|
|
return (vdev);
|
|
|
|
return (zdb_vdev_lookup(vdev, s+1));
|
|
|
|
|
|
|
|
name:
|
|
|
|
for (i = 0; i < vdev->vdev_children; i++) {
|
|
|
|
vdev_t *vc = vdev->vdev_child[i];
|
|
|
|
|
|
|
|
if (vc->vdev_path == NULL) {
|
|
|
|
vc = zdb_vdev_lookup(vc, path);
|
|
|
|
if (vc == NULL)
|
|
|
|
continue;
|
|
|
|
else
|
|
|
|
return (vc);
|
|
|
|
}
|
|
|
|
|
|
|
|
p = strrchr(vc->vdev_path, '/');
|
|
|
|
p = p ? p + 1 : vc->vdev_path;
|
|
|
|
q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
|
|
|
|
|
|
|
|
if (strcmp(vc->vdev_path, path) == 0)
|
|
|
|
return (vc);
|
|
|
|
if (strcmp(p, path) == 0)
|
|
|
|
return (vc);
|
|
|
|
if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
|
|
|
|
return (vc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read a block from a pool and print it out. The syntax of the
|
|
|
|
* block descriptor is:
|
|
|
|
*
|
|
|
|
* pool:vdev_specifier:offset:size[:flags]
|
|
|
|
*
|
|
|
|
* pool - The name of the pool you wish to read from
|
|
|
|
* vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
|
|
|
|
* offset - offset, in hex, in bytes
|
|
|
|
* size - Amount of data to read, in hex, in bytes
|
|
|
|
* flags - A string of characters specifying options
|
|
|
|
* b: Decode a blkptr at given offset within block
|
|
|
|
* *c: Calculate and display checksums
|
2010-05-29 00:45:14 +04:00
|
|
|
* d: Decompress data before dumping
|
2008-11-20 23:01:55 +03:00
|
|
|
* e: Byteswap data before dumping
|
2010-05-29 00:45:14 +04:00
|
|
|
* g: Display data as a gang block header
|
|
|
|
* i: Display as an indirect block
|
2008-11-20 23:01:55 +03:00
|
|
|
* p: Do I/O to physical offset
|
|
|
|
* r: Dump raw data to stdout
|
|
|
|
*
|
|
|
|
* * = not yet implemented
|
|
|
|
*/
|
|
|
|
static void
|
2010-05-29 00:45:14 +04:00
|
|
|
zdb_read_block(char *thing, spa_t *spa)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t blk, *bp = &blk;
|
|
|
|
dva_t *dva = bp->blk_dva;
|
2008-11-20 23:01:55 +03:00
|
|
|
int flags = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *zio;
|
|
|
|
vdev_t *vd;
|
2010-05-29 00:45:14 +04:00
|
|
|
void *pbuf, *lbuf, *buf;
|
|
|
|
char *s, *p, *dup, *vdev, *flagstr;
|
|
|
|
int i, error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dup = strdup(thing);
|
|
|
|
s = strtok(dup, ":");
|
|
|
|
vdev = s ? s : "";
|
|
|
|
s = strtok(NULL, ":");
|
|
|
|
offset = strtoull(s ? s : "", NULL, 16);
|
|
|
|
s = strtok(NULL, ":");
|
|
|
|
size = strtoull(s ? s : "", NULL, 16);
|
|
|
|
s = strtok(NULL, ":");
|
|
|
|
flagstr = s ? s : "";
|
|
|
|
|
|
|
|
s = NULL;
|
|
|
|
if (size == 0)
|
|
|
|
s = "size must not be zero";
|
|
|
|
if (!IS_P2ALIGNED(size, DEV_BSIZE))
|
|
|
|
s = "size must be a multiple of sector size";
|
|
|
|
if (!IS_P2ALIGNED(offset, DEV_BSIZE))
|
|
|
|
s = "offset must be a multiple of sector size";
|
|
|
|
if (s) {
|
|
|
|
(void) printf("Invalid block specifier: %s - %s\n", thing, s);
|
|
|
|
free(dup);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
|
|
|
|
for (i = 0; flagstr[i]; i++) {
|
|
|
|
int bit = flagbits[(uchar_t)flagstr[i]];
|
|
|
|
|
|
|
|
if (bit == 0) {
|
|
|
|
(void) printf("***Invalid flag: %c\n",
|
|
|
|
flagstr[i]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
flags |= bit;
|
|
|
|
|
|
|
|
/* If it's not something with an argument, keep going */
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((bit & (ZDB_FLAG_CHECKSUM |
|
2008-11-20 23:01:55 +03:00
|
|
|
ZDB_FLAG_PRINT_BLKPTR)) == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
p = &flagstr[i + 1];
|
2016-02-03 19:07:34 +03:00
|
|
|
if (bit == ZDB_FLAG_PRINT_BLKPTR) {
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_offset = strtoull(p, &p, 16);
|
2016-02-03 19:07:34 +03:00
|
|
|
i = p - &flagstr[i + 1];
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
if (*p != ':' && *p != '\0') {
|
|
|
|
(void) printf("***Invalid flag arg: '%s'\n", s);
|
|
|
|
free(dup);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
|
|
|
|
if (vd == NULL) {
|
|
|
|
(void) printf("***Invalid vdev: %s\n", vdev);
|
|
|
|
free(dup);
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
if (vd->vdev_path)
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, "Found vdev: %s\n",
|
|
|
|
vd->vdev_path);
|
2008-11-20 23:01:55 +03:00
|
|
|
else
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) fprintf(stderr, "Found vdev type: %s\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
vd->vdev_ops->vdev_op_type);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
psize = size;
|
|
|
|
lsize = size;
|
|
|
|
|
2013-07-04 08:02:05 +04:00
|
|
|
pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL);
|
2010-05-29 00:45:14 +04:00
|
|
|
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
|
|
|
|
|
|
|
|
BP_ZERO(bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
DVA_SET_VDEV(&dva[0], vd->vdev_id);
|
|
|
|
DVA_SET_OFFSET(&dva[0], offset);
|
|
|
|
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
|
|
|
|
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
|
|
|
|
|
|
|
|
BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
|
|
|
|
|
|
|
|
BP_SET_LSIZE(bp, lsize);
|
|
|
|
BP_SET_PSIZE(bp, psize);
|
|
|
|
BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
|
|
|
|
BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
|
|
|
|
BP_SET_TYPE(bp, DMU_OT_NONE);
|
|
|
|
BP_SET_LEVEL(bp, 0);
|
|
|
|
BP_SET_DEDUP(bp, 0);
|
|
|
|
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
zio = zio_root(spa, NULL, NULL, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (vd == vd->vdev_top) {
|
|
|
|
/*
|
|
|
|
* Treat this as a normal block read.
|
|
|
|
*/
|
|
|
|
zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
|
|
|
|
ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Treat this as a vdev child I/O.
|
|
|
|
*/
|
|
|
|
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
|
|
|
|
ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
|
|
|
|
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
|
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
error = zio_wait(zio);
|
2008-12-03 23:09:06 +03:00
|
|
|
spa_config_exit(spa, SCL_STATE, FTAG);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (error) {
|
|
|
|
(void) printf("Read of %s failed, error: %d\n", thing, error);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (flags & ZDB_FLAG_DECOMPRESS) {
|
|
|
|
/*
|
|
|
|
* We don't know how the data was compressed, so just try
|
|
|
|
* every decompress function at every inflated blocksize.
|
|
|
|
*/
|
|
|
|
enum zio_compress c;
|
|
|
|
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
|
|
|
|
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
|
|
|
|
|
|
|
|
bcopy(pbuf, pbuf2, psize);
|
|
|
|
|
|
|
|
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
|
|
|
|
SPA_MAXBLOCKSIZE - psize) == 0);
|
|
|
|
|
|
|
|
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
|
|
|
|
SPA_MAXBLOCKSIZE - psize) == 0);
|
|
|
|
|
|
|
|
for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
|
|
|
|
lsize -= SPA_MINBLOCKSIZE) {
|
|
|
|
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
|
|
|
|
if (zio_decompress_data(c, pbuf, lbuf,
|
|
|
|
psize, lsize) == 0 &&
|
|
|
|
zio_decompress_data(c, pbuf2, lbuf2,
|
|
|
|
psize, lsize) == 0 &&
|
|
|
|
bcmp(lbuf, lbuf2, lsize) == 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (c != ZIO_COMPRESS_FUNCTIONS)
|
|
|
|
break;
|
|
|
|
lsize -= SPA_MINBLOCKSIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
umem_free(pbuf2, SPA_MAXBLOCKSIZE);
|
|
|
|
umem_free(lbuf2, SPA_MAXBLOCKSIZE);
|
|
|
|
|
|
|
|
if (lsize <= psize) {
|
|
|
|
(void) printf("Decompress of %s failed\n", thing);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
buf = lbuf;
|
|
|
|
size = lsize;
|
|
|
|
} else {
|
|
|
|
buf = pbuf;
|
|
|
|
size = psize;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (flags & ZDB_FLAG_PRINT_BLKPTR)
|
|
|
|
zdb_print_blkptr((blkptr_t *)(void *)
|
|
|
|
((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
|
|
|
|
else if (flags & ZDB_FLAG_RAW)
|
|
|
|
zdb_dump_block_raw(buf, size, flags);
|
|
|
|
else if (flags & ZDB_FLAG_INDIRECT)
|
|
|
|
zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
|
|
|
|
flags);
|
|
|
|
else if (flags & ZDB_FLAG_GBH)
|
|
|
|
zdb_dump_gbh(buf, flags);
|
|
|
|
else
|
|
|
|
zdb_dump_block(thing, buf, size, flags);
|
|
|
|
|
|
|
|
out:
|
2010-05-29 00:45:14 +04:00
|
|
|
umem_free(pbuf, SPA_MAXBLOCKSIZE);
|
|
|
|
umem_free(lbuf, SPA_MAXBLOCKSIZE);
|
2008-11-20 23:01:55 +03:00
|
|
|
free(dup);
|
|
|
|
}
|
|
|
|
|
|
|
|
static boolean_t
|
2010-05-29 00:45:14 +04:00
|
|
|
pool_match(nvlist_t *cfg, char *tgt)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t v, guid = strtoull(tgt, NULL, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
char *s;
|
|
|
|
|
|
|
|
if (guid != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
|
|
|
|
return (v == guid);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
|
|
|
|
return (strcmp(s, tgt) == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
return (B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static char *
|
|
|
|
find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
nvlist_t *pools;
|
|
|
|
nvlist_t *match = NULL;
|
2010-05-29 00:45:14 +04:00
|
|
|
char *name = NULL;
|
|
|
|
char *sepp = NULL;
|
2016-06-16 00:28:36 +03:00
|
|
|
char sep = '\0';
|
2010-05-29 00:45:14 +04:00
|
|
|
int count = 0;
|
|
|
|
importargs_t args = { 0 };
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
args.paths = dirc;
|
|
|
|
args.path = dirv;
|
|
|
|
args.can_be_active = B_TRUE;
|
|
|
|
|
|
|
|
if ((sepp = strpbrk(*target, "/@")) != NULL) {
|
|
|
|
sep = *sepp;
|
|
|
|
*sepp = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
pools = zpool_search_import(g_zfs, &args);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (pools != NULL) {
|
|
|
|
nvpair_t *elem = NULL;
|
|
|
|
while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
|
|
|
|
verify(nvpair_value_nvlist(elem, configp) == 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (pool_match(*configp, *target)) {
|
|
|
|
count++;
|
2008-11-20 23:01:55 +03:00
|
|
|
if (match != NULL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
/* print previously found config */
|
|
|
|
if (name != NULL) {
|
|
|
|
(void) printf("%s\n", name);
|
|
|
|
dump_nvlist(match, 8);
|
|
|
|
name = NULL;
|
|
|
|
}
|
|
|
|
(void) printf("%s\n",
|
|
|
|
nvpair_name(elem));
|
|
|
|
dump_nvlist(*configp, 8);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
match = *configp;
|
2010-05-29 00:45:14 +04:00
|
|
|
name = nvpair_name(elem);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
if (count > 1)
|
|
|
|
(void) fatal("\tMatched %d pools - use pool GUID "
|
|
|
|
"instead of pool name or \n"
|
|
|
|
"\tpool name part of a dataset name to select pool", count);
|
|
|
|
|
|
|
|
if (sepp)
|
|
|
|
*sepp = sep;
|
|
|
|
/*
|
|
|
|
* If pool GUID was specified for pool id, replace it with pool name
|
|
|
|
*/
|
|
|
|
if (name && (strstr(*target, name) != *target)) {
|
|
|
|
int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
|
|
|
|
|
|
|
|
*target = umem_alloc(sz, UMEM_NOFAIL);
|
|
|
|
(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
*configp = name ? match : NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
return (name);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
main(int argc, char **argv)
|
|
|
|
{
|
|
|
|
int i, c;
|
|
|
|
struct rlimit rl = { 1024, 1024 };
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
objset_t *os = NULL;
|
|
|
|
int dump_all = 1;
|
|
|
|
int verbose = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
int error = 0;
|
|
|
|
char **searchdirs = NULL;
|
|
|
|
int nsearch = 0;
|
|
|
|
char *target;
|
|
|
|
nvlist_t *policy = NULL;
|
|
|
|
uint64_t max_txg = UINT64_MAX;
|
2014-06-08 22:10:14 +04:00
|
|
|
int flags = ZFS_IMPORT_MISSING_LOG;
|
2010-05-29 00:45:14 +04:00
|
|
|
int rewind = ZPOOL_NEVER_REWIND;
|
2013-06-24 10:45:20 +04:00
|
|
|
char *spa_config_path_env;
|
2015-05-14 20:45:56 +03:00
|
|
|
boolean_t target_is_spa = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) setrlimit(RLIMIT_NOFILE, &rl);
|
|
|
|
(void) enable_extended_FILE_stdio(-1, -1);
|
|
|
|
|
|
|
|
dprintf_setup(&argc, argv);
|
|
|
|
|
2013-06-24 10:45:20 +04:00
|
|
|
/*
|
|
|
|
* If there is an environment variable SPA_CONFIG_PATH it overrides
|
|
|
|
* default spa_config_path setting. If -U flag is specified it will
|
|
|
|
* override this environment variable settings once again.
|
|
|
|
*/
|
|
|
|
spa_config_path_env = getenv("SPA_CONFIG_PATH");
|
|
|
|
if (spa_config_path_env != NULL)
|
|
|
|
spa_config_path = spa_config_path_env;
|
|
|
|
|
2016-01-01 16:42:58 +03:00
|
|
|
while ((c = getopt(argc, argv,
|
|
|
|
"bcdhilmMI:suCDRSAFLXx:evp:t:U:PV")) != -1) {
|
2008-11-20 23:01:55 +03:00
|
|
|
switch (c) {
|
|
|
|
case 'b':
|
|
|
|
case 'c':
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'd':
|
|
|
|
case 'h':
|
|
|
|
case 'i':
|
|
|
|
case 'l':
|
2009-07-03 02:44:48 +04:00
|
|
|
case 'm':
|
2008-11-20 23:01:55 +03:00
|
|
|
case 's':
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'u':
|
2008-11-20 23:01:55 +03:00
|
|
|
case 'C':
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'D':
|
2014-07-20 00:19:24 +04:00
|
|
|
case 'M':
|
2008-11-20 23:01:55 +03:00
|
|
|
case 'R':
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'S':
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_opt[c]++;
|
|
|
|
dump_all = 0;
|
|
|
|
break;
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'A':
|
|
|
|
case 'F':
|
2009-01-16 00:59:39 +03:00
|
|
|
case 'L':
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'X':
|
|
|
|
case 'e':
|
|
|
|
case 'P':
|
2009-01-16 00:59:39 +03:00
|
|
|
dump_opt[c]++;
|
|
|
|
break;
|
2014-06-08 22:10:14 +04:00
|
|
|
case 'V':
|
2016-01-07 00:32:32 +03:00
|
|
|
flags |= ZFS_IMPORT_VERBATIM;
|
2014-06-08 22:10:14 +04:00
|
|
|
break;
|
2014-07-20 00:19:24 +04:00
|
|
|
case 'I':
|
2013-05-03 03:36:32 +04:00
|
|
|
max_inflight = strtoull(optarg, NULL, 0);
|
|
|
|
if (max_inflight == 0) {
|
|
|
|
(void) fprintf(stderr, "maximum number "
|
|
|
|
"of inflight I/Os must be greater "
|
|
|
|
"than 0\n");
|
|
|
|
usage();
|
|
|
|
}
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
case 'p':
|
2010-05-29 00:45:14 +04:00
|
|
|
if (searchdirs == NULL) {
|
|
|
|
searchdirs = umem_alloc(sizeof (char *),
|
|
|
|
UMEM_NOFAIL);
|
|
|
|
} else {
|
|
|
|
char **tmp = umem_alloc((nsearch + 1) *
|
|
|
|
sizeof (char *), UMEM_NOFAIL);
|
|
|
|
bcopy(searchdirs, tmp, nsearch *
|
|
|
|
sizeof (char *));
|
|
|
|
umem_free(searchdirs,
|
|
|
|
nsearch * sizeof (char *));
|
|
|
|
searchdirs = tmp;
|
|
|
|
}
|
|
|
|
searchdirs[nsearch++] = optarg;
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2016-01-01 16:42:58 +03:00
|
|
|
case 'x':
|
|
|
|
vn_dumpdir = optarg;
|
|
|
|
break;
|
2009-01-16 00:59:39 +03:00
|
|
|
case 't':
|
2010-05-29 00:45:14 +04:00
|
|
|
max_txg = strtoull(optarg, NULL, 0);
|
|
|
|
if (max_txg < TXG_INITIAL) {
|
2009-01-16 00:59:39 +03:00
|
|
|
(void) fprintf(stderr, "incorrect txg "
|
|
|
|
"specified: %s\n", optarg);
|
|
|
|
usage();
|
|
|
|
}
|
|
|
|
break;
|
2010-05-29 00:45:14 +04:00
|
|
|
case 'U':
|
|
|
|
spa_config_path = optarg;
|
|
|
|
break;
|
2014-07-20 00:19:24 +04:00
|
|
|
case 'v':
|
|
|
|
verbose++;
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
default:
|
|
|
|
usage();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!dump_opt['e'] && searchdirs != NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
(void) fprintf(stderr, "-p option requires use of -e\n");
|
|
|
|
usage();
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-10-24 02:26:49 +04:00
|
|
|
#if defined(_LP64)
|
2014-09-17 00:24:48 +04:00
|
|
|
/*
|
|
|
|
* ZDB does not typically re-read blocks; therefore limit the ARC
|
|
|
|
* to 256 MB, which can be used entirely for metadata.
|
|
|
|
*/
|
|
|
|
zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
|
2014-10-24 02:26:49 +04:00
|
|
|
#endif
|
2014-09-17 00:24:48 +04:00
|
|
|
|
2015-05-15 02:41:29 +03:00
|
|
|
/*
|
|
|
|
* "zdb -c" uses checksum-verifying scrub i/os which are async reads.
|
|
|
|
* "zdb -b" uses traversal prefetch which uses async reads.
|
|
|
|
* For good performance, let several of them be active at once.
|
|
|
|
*/
|
|
|
|
zfs_vdev_async_read_max_active = 10;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
kernel_init(FREAD);
|
2015-05-21 00:39:52 +03:00
|
|
|
if ((g_zfs = libzfs_init()) == NULL) {
|
|
|
|
(void) fprintf(stderr, "%s", libzfs_error_init(errno));
|
2010-08-26 22:57:29 +04:00
|
|
|
return (1);
|
2015-05-21 00:39:52 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_all)
|
|
|
|
verbose = MAX(verbose, 1);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
for (c = 0; c < 256; c++) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_all && !strchr("elAFLRSXP", c))
|
2008-11-20 23:01:55 +03:00
|
|
|
dump_opt[c] = 1;
|
|
|
|
if (dump_opt[c])
|
|
|
|
dump_opt[c] += verbose;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
|
|
|
|
zfs_recover = (dump_opt['A'] > 1);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
argc -= optind;
|
|
|
|
argv += optind;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (argc < 2 && dump_opt['R'])
|
|
|
|
usage();
|
2008-11-20 23:01:55 +03:00
|
|
|
if (argc < 1) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!dump_opt['e'] && dump_opt['C']) {
|
2008-12-03 23:09:06 +03:00
|
|
|
dump_cachefile(spa_config_path);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
usage();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dump_opt['l']) {
|
|
|
|
dump_label(argv[0]);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['X'] || dump_opt['F'])
|
|
|
|
rewind = ZPOOL_DO_REWIND |
|
|
|
|
(dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
|
|
|
|
nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
|
|
|
|
nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
|
|
|
|
fatal("internal error: %s", strerror(ENOMEM));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
target = argv[0];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dump_opt['e']) {
|
|
|
|
nvlist_t *cfg = NULL;
|
|
|
|
char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = ENOENT;
|
|
|
|
if (name) {
|
|
|
|
if (dump_opt['C'] > 1) {
|
|
|
|
(void) printf("\nConfiguration for import:\n");
|
|
|
|
dump_nvlist(cfg, 8);
|
|
|
|
}
|
|
|
|
if (nvlist_add_nvlist(cfg,
|
|
|
|
ZPOOL_REWIND_POLICY, policy) != 0) {
|
|
|
|
fatal("can't open '%s': %s",
|
|
|
|
target, strerror(ENOMEM));
|
|
|
|
}
|
2014-06-08 22:10:14 +04:00
|
|
|
error = spa_import(name, cfg, NULL, flags);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2015-05-14 20:45:56 +03:00
|
|
|
if (strpbrk(target, "/@") != NULL) {
|
|
|
|
size_t targetlen;
|
|
|
|
|
|
|
|
target_is_spa = B_FALSE;
|
|
|
|
targetlen = strlen(target);
|
|
|
|
if (targetlen && target[targetlen - 1] == '/')
|
|
|
|
target[targetlen - 1] = '\0';
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (error == 0) {
|
2015-05-14 20:45:56 +03:00
|
|
|
if (target_is_spa || dump_opt['R']) {
|
2010-05-29 00:45:14 +04:00
|
|
|
error = spa_open_rewind(target, &spa, FTAG, policy,
|
|
|
|
NULL);
|
|
|
|
if (error) {
|
|
|
|
/*
|
|
|
|
* If we're missing the log device then
|
|
|
|
* try opening the pool after clearing the
|
|
|
|
* log state.
|
|
|
|
*/
|
|
|
|
mutex_enter(&spa_namespace_lock);
|
|
|
|
if ((spa = spa_lookup(target)) != NULL &&
|
|
|
|
spa->spa_log_state == SPA_LOG_MISSING) {
|
|
|
|
spa->spa_log_state = SPA_LOG_CLEAR;
|
|
|
|
error = 0;
|
|
|
|
}
|
|
|
|
mutex_exit(&spa_namespace_lock);
|
|
|
|
|
|
|
|
if (!error) {
|
|
|
|
error = spa_open_rewind(target, &spa,
|
|
|
|
FTAG, policy, NULL);
|
|
|
|
}
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
error = dmu_objset_own(target, DMU_OST_ANY,
|
|
|
|
B_TRUE, FTAG, &os);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
nvlist_free(policy);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (error)
|
2010-05-29 00:45:14 +04:00
|
|
|
fatal("can't open '%s': %s", target, strerror(error));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
argv++;
|
2010-05-29 00:45:14 +04:00
|
|
|
argc--;
|
|
|
|
if (!dump_opt['R']) {
|
|
|
|
if (argc > 0) {
|
|
|
|
zopt_objects = argc;
|
|
|
|
zopt_object = calloc(zopt_objects, sizeof (uint64_t));
|
|
|
|
for (i = 0; i < zopt_objects; i++) {
|
|
|
|
errno = 0;
|
|
|
|
zopt_object[i] = strtoull(argv[i], NULL, 0);
|
|
|
|
if (zopt_object[i] == 0 && errno != 0)
|
|
|
|
fatal("bad number %s: %s",
|
|
|
|
argv[i], strerror(errno));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2013-01-12 04:42:50 +04:00
|
|
|
if (os != NULL) {
|
|
|
|
dump_dir(os);
|
|
|
|
} else if (zopt_objects > 0 && !dump_opt['m']) {
|
|
|
|
dump_dir(spa->spa_meta_objset);
|
|
|
|
} else {
|
|
|
|
dump_zpool(spa);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
|
|
|
|
flagbits['c'] = ZDB_FLAG_CHECKSUM;
|
|
|
|
flagbits['d'] = ZDB_FLAG_DECOMPRESS;
|
|
|
|
flagbits['e'] = ZDB_FLAG_BSWAP;
|
|
|
|
flagbits['g'] = ZDB_FLAG_GBH;
|
|
|
|
flagbits['i'] = ZDB_FLAG_INDIRECT;
|
|
|
|
flagbits['p'] = ZDB_FLAG_PHYS;
|
|
|
|
flagbits['r'] = ZDB_FLAG_RAW;
|
|
|
|
|
|
|
|
for (i = 0; i < argc; i++)
|
|
|
|
zdb_read_block(argv[i], spa);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
fuid_table_destroy();
|
2010-05-29 00:45:14 +04:00
|
|
|
sa_loaded = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
libzfs_fini(g_zfs);
|
|
|
|
kernel_fini();
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|