Illumos 4976-4984 - metaslab improvements

4976 zfs should only avoid writing to a failing non-redundant top-level vdev
4978 ztest fails in get_metaslab_refcount()
4979 extend free space histogram to device and pool
4980 metaslabs should have a fragmentation metric
4981 remove fragmented ops vector from block allocator
4982 space_map object should proactively upgrade when feature is enabled
4983 need to collect metaslab information via mdb
4984 device selection should use fragmentation metric
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <adam.leventhal@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

References:
  https://www.illumos.org/issues/4976
  https://www.illumos.org/issues/4978
  https://www.illumos.org/issues/4979
  https://www.illumos.org/issues/4980
  https://www.illumos.org/issues/4981
  https://www.illumos.org/issues/4982
  https://www.illumos.org/issues/4983
  https://www.illumos.org/issues/4984
  https://github.com/illumos/illumos-gate/commit/2e4c998

Notes:
    The "zdb -M" option has been re-tasked to display the new metaslab
    fragmentation metric and the new "zdb -I" option is used to control
    the maximum number of in-flight I/Os.

    The new fragmentation metric is derived from the space map histogram
    which has been rolled up to the vdev and pool level and is presented
    to the user via "zpool list".

    Add a number of module parameters related to the new metaslab weighting
    logic.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2595
This commit is contained in:
George Wilson 2014-07-19 12:19:24 -08:00 committed by Brian Behlendorf
parent f67d709080
commit f3a7f6610f
18 changed files with 839 additions and 246 deletions

View File

@ -110,11 +110,11 @@ static void
usage(void) usage(void)
{ {
(void) fprintf(stderr, (void) fprintf(stderr,
"Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-M inflight I/Os] poolname [object...]\n" "[-U config] [-I inflight I/Os] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset " " %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n" "[object...]\n"
" %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " " %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
"poolname [vdev [metaslab...]]\n" "poolname [vdev [metaslab...]]\n"
" %s -R [-A] [-e [-p path...]] poolname " " %s -R [-A] [-e [-p path...]] poolname "
"vdev:offset:size[:flags]\n" "vdev:offset:size[:flags]\n"
@ -137,6 +137,7 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -m metaslabs\n"); (void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -M metaslab groups\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for " (void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n"); "all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
@ -165,7 +166,7 @@ usage(void)
(void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -P print numbers in parseable form\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when " (void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n"); "searching for uberblocks\n");
(void) fprintf(stderr, " -M <number of inflight I/Os> -- " (void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of checksumming I/Os " "specify the maximum number of checksumming I/Os "
"[default is 200]\n"); "[default is 200]\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
@ -547,7 +548,7 @@ get_metaslab_refcount(vdev_t *vd)
int refcount = 0; int refcount = 0;
int c, m; int c, m;
if (vd->vdev_top == vd) { if (vd->vdev_top == vd && !vd->vdev_removing) {
for (m = 0; m < vd->vdev_ms_count; m++) { for (m = 0; m < vd->vdev_ms_count; m++) {
space_map_t *sm = vd->vdev_ms[m]->ms_sm; space_map_t *sm = vd->vdev_ms[m]->ms_sm;
@ -685,9 +686,10 @@ dump_metaslab(metaslab_t *msp)
* The space map histogram represents free space in chunks * The space map histogram represents free space in chunks
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift). * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
*/ */
(void) printf("\tOn-disk histogram:\n"); (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
(u_longlong_t)msp->ms_fragmentation);
dump_histogram(sm->sm_phys->smp_histogram, dump_histogram(sm->sm_phys->smp_histogram,
SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift); SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
} }
if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
@ -711,6 +713,48 @@ print_vdev_metaslab_header(vdev_t *vd)
"---------------", "-------------"); "---------------", "-------------");
} }
/*
 * Print, for each top-level vdev whose metaslab group belongs to the class
 * returned by spa_normal_class(), the vdev id, its metaslab count, the
 * group's fragmentation and the group's histogram.  Afterwards print the
 * fragmentation and histogram of the class as a whole.
 */
static void
dump_metaslab_groups(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
metaslab_class_t *mc = spa_normal_class(spa);
uint64_t fragmentation;
int c;
metaslab_class_histogram_verify(mc);
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
/* Only report groups that are members of the class being dumped. */
if (mg->mg_class != mc)
continue;
metaslab_group_histogram_verify(mg);
/* Recompute the group's fragmentation and store it before printing. */
mg->mg_fragmentation = metaslab_group_fragmentation(mg);
(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
"fragmentation",
(u_longlong_t)tvd->vdev_id,
(u_longlong_t)tvd->vdev_ms_count);
/* ZFS_FRAG_INVALID is shown as "-" instead of a percentage. */
if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
(void) printf("%3s\n", "-");
} else {
(void) printf("%3llu%%\n",
(u_longlong_t)mg->mg_fragmentation);
}
dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
/* Class-wide summary, using the same "-" convention for an invalid value. */
(void) printf("\tpool %s\tfragmentation", spa_name(spa));
fragmentation = metaslab_class_fragmentation(mc);
if (fragmentation == ZFS_FRAG_INVALID)
(void) printf("\t%3s\n", "-");
else
(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
static void static void
dump_metaslabs(spa_t *spa) dump_metaslabs(spa_t *spa)
{ {
@ -2381,8 +2425,7 @@ zdb_leak(void *arg, uint64_t start, uint64_t size)
} }
static metaslab_ops_t zdb_metaslab_ops = { static metaslab_ops_t zdb_metaslab_ops = {
NULL, /* alloc */ NULL /* alloc */
NULL /* fragmented */
}; };
static void static void
@ -2874,6 +2917,8 @@ dump_zpool(spa_t *spa)
if (dump_opt['d'] > 2 || dump_opt['m']) if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa); dump_metaslabs(spa);
if (dump_opt['M'])
dump_metaslab_groups(spa);
if (dump_opt['d'] || dump_opt['i']) { if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset); dump_dir(dp->dp_meta_objset);
@ -3363,7 +3408,7 @@ main(int argc, char **argv)
int flags = ZFS_IMPORT_MISSING_LOG; int flags = ZFS_IMPORT_MISSING_LOG;
int rewind = ZPOOL_NEVER_REWIND; int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env; char *spa_config_path_env;
const char *opts = "bcdhilmM:suCDRSAFLVXevp:t:U:P"; const char *opts = "bcdhilmMI:suCDRSAFLXevp:t:U:P";
(void) setrlimit(RLIMIT_NOFILE, &rl); (void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1); (void) enable_extended_FILE_stdio(-1, -1);
@ -3392,6 +3437,7 @@ main(int argc, char **argv)
case 'u': case 'u':
case 'C': case 'C':
case 'D': case 'D':
case 'M':
case 'R': case 'R':
case 'S': case 'S':
dump_opt[c]++; dump_opt[c]++;
@ -3408,10 +3454,7 @@ main(int argc, char **argv)
case 'V': case 'V':
flags = ZFS_IMPORT_VERBATIM; flags = ZFS_IMPORT_VERBATIM;
break; break;
case 'v': case 'I':
verbose++;
break;
case 'M':
max_inflight = strtoull(optarg, NULL, 0); max_inflight = strtoull(optarg, NULL, 0);
if (max_inflight == 0) { if (max_inflight == 0) {
(void) fprintf(stderr, "maximum number " (void) fprintf(stderr, "maximum number "
@ -3446,6 +3489,9 @@ main(int argc, char **argv)
case 'U': case 'U':
spa_config_path = optarg; spa_config_path = optarg;
break; break;
case 'v':
verbose++;
break;
default: default:
usage(); usage();
break; break;

View File

@ -2998,10 +2998,16 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted)
boolean_t fixed; boolean_t fixed;
size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);
zfs_nicenum(value, propval, sizeof (propval));
if (prop == ZPOOL_PROP_EXPANDSZ && value == 0) if (prop == ZPOOL_PROP_EXPANDSZ && value == 0)
(void) strlcpy(propval, "-", sizeof (propval)); (void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION && value == ZFS_FRAG_INVALID)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION)
(void) snprintf(propval, sizeof (propval), "%llu%%",
(unsigned long long)value);
else
zfs_nicenum(value, propval, sizeof (propval));
if (scripted) if (scripted)
(void) printf("\t%s", propval); (void) printf("\t%s", propval);
@ -3034,9 +3040,9 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
/* only toplevel vdevs have capacity stats */ /* only toplevel vdevs have capacity stats */
if (vs->vs_space == 0) { if (vs->vs_space == 0) {
if (scripted) if (scripted)
(void) printf("\t-\t-\t-"); (void) printf("\t-\t-\t-\t-");
else else
(void) printf(" - - -"); (void) printf(" - - - -");
} else { } else {
print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, print_one_column(ZPOOL_PROP_SIZE, vs->vs_space,
scripted); scripted);
@ -3044,6 +3050,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
scripted); scripted);
print_one_column(ZPOOL_PROP_FREE, print_one_column(ZPOOL_PROP_FREE,
vs->vs_space - vs->vs_alloc, scripted); vs->vs_space - vs->vs_alloc, scripted);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
vs->vs_fragmentation, scripted);
} }
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize,
scripted); scripted);
@ -3128,8 +3136,8 @@ zpool_do_list(int argc, char **argv)
int ret = 0; int ret = 0;
list_cbdata_t cb = { 0 }; list_cbdata_t cb = { 0 };
static char default_props[] = static char default_props[] =
"name,size,allocated,free,capacity,dedupratio," "name,size,allocated,free,fragmentation,capacity,"
"health,altroot"; "dedupratio,health,altroot";
char *props = default_props; char *props = default_props;
unsigned long interval = 0, count = 0; unsigned long interval = 0, count = 0;
zpool_list_t *list; zpool_list_t *list;

View File

@ -194,6 +194,7 @@ typedef enum {
ZPOOL_PROP_COMMENT, ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING, ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED, ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS ZPOOL_NUM_PROPS
} zpool_prop_t; } zpool_prop_t;
@ -599,6 +600,13 @@ typedef struct zpool_rewind_policy {
*/ */
#define SPA_MINDEVSIZE (64ULL << 20) #define SPA_MINDEVSIZE (64ULL << 20)
/*
* Set if the fragmentation has not yet been calculated. This can happen
* because the space maps have not been upgraded or the histogram feature
* is not enabled.
*/
#define ZFS_FRAG_INVALID UINT64_MAX
/* /*
* The location of the pool configuration repository, shared between kernel and * The location of the pool configuration repository, shared between kernel and
* userland. * userland.
@ -747,6 +755,7 @@ typedef struct vdev_stat {
uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t; } vdev_stat_t;
/* /*

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_METASLAB_H #ifndef _SYS_METASLAB_H
@ -38,23 +38,22 @@ extern "C" {
typedef struct metaslab_ops { typedef struct metaslab_ops {
uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
boolean_t (*msop_fragmented)(metaslab_t *msp);
} metaslab_ops_t; } metaslab_ops_t;
extern metaslab_ops_t *zfs_metaslab_ops; extern metaslab_ops_t *zfs_metaslab_ops;
metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id, metaslab_t *metaslab_init(metaslab_group_t *, uint64_t,
uint64_t object, uint64_t txg); uint64_t, uint64_t);
void metaslab_fini(metaslab_t *msp); void metaslab_fini(metaslab_t *);
void metaslab_load_wait(metaslab_t *msp); void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *msp); int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *msp); void metaslab_unload(metaslab_t *);
void metaslab_sync(metaslab_t *msp, uint64_t txg); void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *msp, uint64_t txg); void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *mg); void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *msp); uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_HINTBP_AVOID 0x1
@ -63,30 +62,36 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp);
#define METASLAB_GANG_AVOID 0x8 #define METASLAB_GANG_AVOID 0x8
#define METASLAB_FASTWRITE 0x10 #define METASLAB_FASTWRITE 0x10
int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); blkptr_t *, int, uint64_t, blkptr_t *, int);
void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *spa, const blkptr_t *bp); void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops); metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
void metaslab_class_destroy(metaslab_class_t *mc); void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *mc); int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
void metaslab_class_space_update(metaslab_class_t *mc, void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t alloc_delta, int64_t defer_delta, int64_t, int64_t);
int64_t space_delta, int64_t dspace_delta); uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *mc); uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); uint64_t metaslab_class_get_deferred(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *mg); void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *mg); void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *mg); void metaslab_group_passivate(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -41,6 +41,23 @@
extern "C" { extern "C" {
#endif #endif
/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
* Each top-level vdev is associated with a metaslab group which defines
* the allocatable region for that vdev. Examples of these categories include
* "normal" for data block allocations (i.e. main pool allocations) or "log"
* for allocations designated for intent log devices (i.e. slog devices).
* When a block allocation is requested from the SPA it is associated with a
* metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
* to the class can be used to satisfy that request. Allocations are done
* by traversing the metaslab groups that are linked off of the mc_rotor field.
* This rotor points to the next metaslab group where allocations will be
* attempted. Allocating a block is a 3 step process -- select the metaslab
* group, select the metaslab, and then allocate the block. The metaslab
* class defines the low-level block allocator that will be used as the
* final step in allocation. These allocators are pluggable allowing each class
* to use a block allocator that best suits that class.
*/
struct metaslab_class { struct metaslab_class {
spa_t *mc_spa; spa_t *mc_spa;
metaslab_group_t *mc_rotor; metaslab_group_t *mc_rotor;
@ -51,9 +68,19 @@ struct metaslab_class {
uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */ uint64_t mc_dspace; /* total deflated space */
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
kmutex_t mc_fastwrite_lock; kmutex_t mc_fastwrite_lock;
}; };
/*
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
* of a top-level vdev. They are linked together to form a circular linked
* list and can belong to only one metaslab class. Metaslab groups may become
* ineligible for allocations for a number of reasons such as limited free
* space, fragmentation, or going offline. When this happens the allocator will
* simply find the next metaslab group in the linked list and attempt
* to allocate from that group instead.
*/
struct metaslab_group { struct metaslab_group {
kmutex_t mg_lock; kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree; avl_tree_t mg_metaslab_tree;
@ -67,12 +94,14 @@ struct metaslab_group {
taskq_t *mg_taskq; taskq_t *mg_taskq;
metaslab_group_t *mg_prev; metaslab_group_t *mg_prev;
metaslab_group_t *mg_next; metaslab_group_t *mg_next;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
}; };
/* /*
* This value defines the number of elements in the ms_lbas array. The value * This value defines the number of elements in the ms_lbas array. The value
* of 64 was chosen as it covers to cover all power of 2 buckets up to * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
* UINT64_MAX. This is the equivalent of highbit(UINT64_MAX). * This is the equivalent of highbit(UINT64_MAX).
*/ */
#define MAX_LBAS 64 #define MAX_LBAS 64
@ -135,6 +164,7 @@ struct metaslab {
uint64_t ms_id; uint64_t ms_id;
uint64_t ms_start; uint64_t ms_start;
uint64_t ms_size; uint64_t ms_size;
uint64_t ms_fragmentation;
range_tree_t *ms_alloctree[TXG_SIZE]; range_tree_t *ms_alloctree[TXG_SIZE];
range_tree_t *ms_freetree[TXG_SIZE]; range_tree_t *ms_freetree[TXG_SIZE];
@ -142,12 +172,12 @@ struct metaslab {
range_tree_t *ms_tree; range_tree_t *ms_tree;
boolean_t ms_condensing; /* condensing? */ boolean_t ms_condensing; /* condensing? */
boolean_t ms_condense_wanted;
boolean_t ms_loaded; boolean_t ms_loaded;
boolean_t ms_loading; boolean_t ms_loading;
int64_t ms_deferspace; /* sum of ms_defermap[] space */ int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_factor;
uint64_t ms_access_txg; uint64_t ms_access_txg;
/* /*

View File

@ -24,7 +24,7 @@
*/ */
/* /*
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_SPACE_MAP_H #ifndef _SYS_SPACE_MAP_H
@ -44,9 +44,7 @@ extern "C" {
* maintain backward compatibility. * maintain backward compatibility.
*/ */
#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) #define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
#define SPACE_MAP_HISTOGRAM_SIZE(sm) \ #define SPACE_MAP_HISTOGRAM_SIZE 32
(sizeof ((sm)->sm_phys->smp_histogram) / \
sizeof ((sm)->sm_phys->smp_histogram[0]))
/* /*
* The space_map_phys is the on-disk representation of the space map. * The space_map_phys is the on-disk representation of the space map.
@ -68,7 +66,7 @@ typedef struct space_map_phys {
* whose size is: * whose size is:
* 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
*/ */
uint64_t smp_histogram[32]; /* histogram of free space */ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
} space_map_phys_t; } space_map_phys_t;
/* /*

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_ZFS_DEBUG_H #ifndef _SYS_ZFS_DEBUG_H
@ -57,6 +57,7 @@ extern int zfs_free_leak_on_eio;
#define ZFS_DEBUG_MODIFY (1<<4) #define ZFS_DEBUG_MODIFY (1<<4)
#define ZFS_DEBUG_SPA (1<<5) #define ZFS_DEBUG_SPA (1<<5)
#define ZFS_DEBUG_ZIO_FREE (1<<6) #define ZFS_DEBUG_ZIO_FREE (1<<6)
#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7)
/* /*
* Always log zfs debug messages to the spl debug subsystem as SS_USER1. * Always log zfs debug messages to the spl debug subsystem as SS_USER1.

View File

@ -274,6 +274,15 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
intval = zpool_get_prop_int(zhp, prop, &src); intval = zpool_get_prop_int(zhp, prop, &src);
(void) snprintf(buf, len, "%llu", (u_longlong_t)intval); (void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
break; break;
case ZPOOL_PROP_FRAGMENTATION:
intval = zpool_get_prop_int(zhp, prop, &src);
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
}
break;
case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_ALTROOT:
case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_CACHEFILE:

View File

@ -134,6 +134,18 @@ Max write bytes per interval
Default value: \fB8,388,608\fR. Default value: \fB8,388,608\fR.
.RE .RE
.sp
.ne 2
.na
\fBmetaslab_bias_enabled\fR (int)
.ad
.RS 12n
Enable metaslab group biasing based on its vdev's over- or under-utilization
relative to the pool.
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE
.sp .sp
.ne 2 .ne 2
.na .na
@ -156,6 +168,41 @@ Prevent metaslabs from being unloaded.
Use \fB1\fR for yes and \fB0\fR for no (default). Use \fB1\fR for yes and \fB0\fR for no (default).
.RE .RE
.sp
.ne 2
.na
\fBmetaslab_fragmentation_factor_enabled\fR (int)
.ad
.RS 12n
Enable use of the fragmentation metric in computing metaslab weights.
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE
.sp
.ne 2
.na
\fBmetaslab_preload_enabled\fR (int)
.ad
.RS 12n
Enable metaslab group preloading.
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE
.sp
.ne 2
.na
\fBmetaslab_lba_weighting_enabled\fR (int)
.ad
.RS 12n
Give more weight to metaslabs with lower LBAs, assuming they have
greater bandwidth as is typically the case on a modern constant
angular velocity disk drive.
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE
.sp .sp
.ne 2 .ne 2
.na .na
@ -766,6 +813,35 @@ Disable meta data compression
Use \fB1\fR for yes and \fB0\fR for no (default). Use \fB1\fR for yes and \fB0\fR for no (default).
.RE .RE
.sp
.ne 2
.na
\fBzfs_metaslab_fragmentation_threshold\fR (int)
.ad
.RS 12n
Allow metaslabs to keep their active state as long as their fragmentation
percentage is less than or equal to this value. An active metaslab that
exceeds this threshold will no longer keep its active status allowing
better metaslabs to be selected.
.sp
Default value: \fB70\fR.
.RE
.sp
.ne 2
.na
\fBzfs_mg_fragmentation_threshold\fR (int)
.ad
.RS 12n
Metaslab groups are considered eligible for allocations if their
fragmentation metric (measured as a percentage) is less than or equal to
this value. If a metaslab group exceeds this threshold then it will be
skipped unless all metaslab groups within the metaslab class have also
crossed this threshold.
.sp
Default value: \fB85\fR.
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -19,16 +19,16 @@
\fBzdb\fR - Display zpool debugging and consistency information \fBzdb\fR - Display zpool debugging and consistency information
.SH "SYNOPSIS" .SH "SYNOPSIS"
\fBzdb\fR [-CumdibcsDvhLVXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] \fBzdb\fR [-CumdibcsDvhLMXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR]
[-U \fIcache\fR] [-M \fIinflight I/Os\fR] [\fIpoolname\fR [-U \fIcache\fR] [-I \fIinflight I/Os\fR]
[\fIobject\fR ...]] [\fIpoolname\fR [\fIobject\fR ...]]
.P .P
\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fIdataset\fR [\fIobject\fR ...] \fIdataset\fR [\fIobject\fR ...]
.P .P
\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fBzdb\fR -m [-MLXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]] \fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]]
.P .P
@ -194,6 +194,21 @@ verifies that all non-free blocks are referenced, which can be very expensive.
.sp .6 .sp .6
.RS 4n .RS 4n
Display the offset, spacemap, and free space of each metaslab. Display the offset, spacemap, and free space of each metaslab.
When specified twice, also display information about the on-disk free
space histogram associated with each metaslab. When specified three times,
display the maximum contiguous free space, the in-core free space histogram,
and the percentage of free space in each space map. When specified
four times display every spacemap record.
.RE
.sp
.ne 2
.na
\fB-M\fR
.ad
.sp .6
.RS 4n
Display the offset, spacemap, and free space of each metaslab.
When specified twice, also display information about the maximum contiguous When specified twice, also display information about the maximum contiguous
free space and the percentage of free space in each space map. When specified free space and the percentage of free space in each space map. When specified
three times display every spacemap record. three times display every spacemap record.
@ -366,7 +381,7 @@ transactions.
.sp .sp
.ne 2 .ne 2
.na .na
\fB-M \fIinflight I/Os\fR \fR \fB-I \fIinflight I/Os\fR \fR
.ad .ad
.sp .6 .sp .6
.RS 4n .RS 4n

View File

@ -1,7 +1,7 @@
'\" te '\" te
.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011 Nexenta Systems, Inc. All rights reserved. .\" Copyright 2011 Nexenta Systems, Inc. All rights reserved.
.\" Copyright (c) 2012 by Delphix. All Rights Reserved. .\" Copyright (c) 2013 by Delphix. All rights reserved.
.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
.\" The contents of this file are subject to the terms of the Common Development .\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except .\" and Distribution License (the "License"). You may not use this file except
@ -502,6 +502,17 @@ any space on an EFI labeled vdev which has not been brought online
(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded. (i.e. zpool online -e). This space occurs when a LUN is dynamically expanded.
.RE .RE
.sp
.ne 2
.mk
.na
\fB\fBfragmentation\fR\fR
.ad
.RS 20n
.rt
The amount of fragmentation in the pool.
.RE
.sp .sp
.ne 2 .ne 2
.mk .mk
@ -1587,7 +1598,7 @@ Specify \fBu\fR for a printed representation of the internal representation of t
.ad .ad
.RS 12n .RS 12n
.rt .rt
Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, expandsize, capacity, dedupratio, health, altroot" Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, fragmentation, expandsize, capacity, dedupratio, health, altroot"
.RE .RE
.sp .sp
@ -2002,10 +2013,10 @@ The results from this command are similar to the following:
.in +2 .in +2
.nf .nf
# \fBzpool list\fR # \fBzpool list\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
rpool 19.9G 8.43G 11.4G - 42% 1.00x ONLINE - rpool 19.9G 8.43G 11.4G 33% - 42% 1.00x ONLINE -
tank 61.5G 20.0G 41.5G - 32% 1.00x ONLINE - tank 61.5G 20.0G 41.5G 48% - 32% 1.00x ONLINE -
zion - - - - - - FAULTED - zion - - - - - - - FAULTED -
.fi .fi
.in -2 .in -2
.sp .sp
@ -2212,9 +2223,9 @@ The command to remove the mirrored log \fBmirror-2\fR is:
\fBExample 15 \fRDisplaying expanded space on a device \fBExample 15 \fRDisplaying expanded space on a device
.sp .sp
.LP .LP
The following command dipslays the detailed information for the \fIdata\fR The following command displays the detailed information for the \fIdata\fR
pool. This pool is comprised of a single \fIraidz\fR vdev where one of its pool. This pool is comprised of a single \fIraidz\fR vdev where one of its
devices increased its capacity by 1GB. In this example, the pool will not devices increased its capacity by 10GB. In this example, the pool will not
be able to utilize this extra capacity until all the devices under the be able to utilize this extra capacity until all the devices under the
\fIraidz\fR vdev have been expanded. \fIraidz\fR vdev have been expanded.
@ -2222,12 +2233,12 @@ be able to utilized this extra capacity until all the devices under the
.in +2 .in +2
.nf .nf
# \fBzpool list -v data\fR # \fBzpool list -v data\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
data 17.9G 174K 17.9G - 0% 1.00x ONLINE - data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE -
raidz1 17.9G 174K 17.9G - raidz1 23.9G 14.6G 9.30G 48% -
c4t2d0 - - - 1G c1t1d0 - - - - -
c4t3d0 - - - - c1t2d0 - - - - 10G
c4t4d0 - - - - c1t3d0 - - - - -
.fi .fi
.in -2 .in -2

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
#include <sys/zio.h> #include <sys/zio.h>
@ -87,6 +87,8 @@ zpool_prop_init(void)
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ"); PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "CAP"); ZFS_TYPE_POOL, "<size>", "CAP");
zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,

View File

@ -32,6 +32,7 @@
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
#include <sys/zfeature.h>
#define WITH_DF_BLOCK_ALLOCATOR #define WITH_DF_BLOCK_ALLOCATOR
@ -66,7 +67,7 @@ int zfs_condense_pct = 200;
/* /*
* The zfs_mg_noalloc_threshold defines which metaslab groups should * The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of * be eligible for allocation. The value is defined as a percentage of
* a free space. Metaslab groups that have more free space than * free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once * zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the * a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
@ -78,6 +79,23 @@ int zfs_condense_pct = 200;
*/ */
int zfs_mg_noalloc_threshold = 0; int zfs_mg_noalloc_threshold = 0;
/*
* Metaslab groups are considered eligible for allocations if their
* fragmentation metric (measured as a percentage) is less than or equal to
* zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
* then it will be skipped unless all metaslab groups within the metaslab
* class have also crossed this threshold.
*/
int zfs_mg_fragmentation_threshold = 85;
/*
* Allow metaslabs to keep their active state as long as their fragmentation
* percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
* active metaslab that exceeds this threshold will no longer keep its active
* status allowing better metaslabs to be selected.
*/
int zfs_metaslab_fragmentation_threshold = 70;
/* /*
* When set will load all metaslabs when pool is first opened. * When set will load all metaslabs when pool is first opened.
*/ */
@ -122,11 +140,6 @@ int metaslab_load_pct = 50;
*/ */
int metaslab_unload_delay = TXG_SIZE * 2; int metaslab_unload_delay = TXG_SIZE * 2;
/*
* Should we be willing to write data to degraded vdevs?
*/
boolean_t zfs_write_to_degraded = B_FALSE;
/* /*
* Max number of metaslabs per group to preload. * Max number of metaslabs per group to preload.
*/ */
@ -135,13 +148,24 @@ int metaslab_preload_limit = SPA_DVAS_PER_BP;
/* /*
* Enable/disable preloading of metaslab. * Enable/disable preloading of metaslab.
*/ */
boolean_t metaslab_preload_enabled = B_TRUE; int metaslab_preload_enabled = B_TRUE;
/* /*
* Enable/disable additional weight factor for each metaslab. * Enable/disable fragmentation weighting on metaslabs.
*/ */
boolean_t metaslab_weight_factor_enable = B_FALSE; int metaslab_fragmentation_factor_enabled = B_TRUE;
/*
* Enable/disable lba weighting (i.e. outer tracks are given preference).
*/
int metaslab_lba_weighting_enabled = B_TRUE;
/*
* Enable/disable metaslab group biasing.
*/
int metaslab_bias_enabled = B_TRUE;
static uint64_t metaslab_fragmentation(metaslab_t *);
/* /*
* ========================================================================== * ==========================================================================
@ -236,6 +260,123 @@ metaslab_class_get_dspace(metaslab_class_t *mc)
return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
} }
/*
 * Debug-only consistency check for the class-wide histogram.
 *
 * Rebuilds a histogram by summing mg_histogram over every top-level vdev
 * whose metaslab group belongs to this class, then verifies bucket by
 * bucket that the result equals mc->mc_histogram. Does nothing unless
 * ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags.
 */
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *root = mc->mc_spa->spa_root_vdev;
	size_t hist_bytes = sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE;
	uint64_t *mc_hist;
	int i, child;

	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
		return;

	mc_hist = kmem_zalloc(hist_bytes, KM_PUSHPAGE);

	for (child = 0; child < root->vdev_children; child++) {
		vdev_t *top = root->vdev_child[child];
		metaslab_group_t *group = top->vdev_mg;

		/* Holes and uninitialized top-levels contribute nothing. */
		if (top->vdev_ishole || top->vdev_ms_shift == 0)
			continue;

		/* Neither do vdevs that belong to another metaslab class. */
		if (group->mg_class != mc)
			continue;

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += group->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, hist_bytes);
}
/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if any contributing metaslab group has no metric yet
 * or if the class currently reports no space to weight the metric by.
 * See comment above the zfs_frag_table for more information about the
 * metric.
 *
 * The SCL_VDEV config lock is held as reader while the top-level vdevs
 * are walked and is dropped on every return path.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;
	uint64_t class_space;
	int c;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}

	/*
	 * The weighted sum is normalized by the class's total space. If
	 * the class has no space there is nothing to weight by, and
	 * dividing would be a divide-by-zero; report the metric as unset.
	 */
	class_space = metaslab_class_get_space(mc);
	if (class_space == 0) {
		spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
		return (ZFS_FRAG_INVALID);
	}
	fragmentation /= class_space;

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}
/*
 * Report how much expandable space this metaslab class has.
 *
 * For every top-level vdev that is not a hole, has been initialized
 * (non-zero vdev_ms_shift) and whose metaslab group is in this class,
 * add the difference between vdev_max_asize and vdev_asize. The walk is
 * done with the SCL_VDEV config lock held as reader.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *root = spa->spa_root_vdev;
	uint64_t expandable = 0;
	int child;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	for (child = 0; child < root->vdev_children; child++) {
		vdev_t *top = root->vdev_child[child];

		if (!top->vdev_ishole && top->vdev_ms_shift != 0 &&
		    top->vdev_mg->mg_class == mc) {
			expandable += top->vdev_max_asize - top->vdev_asize;
		}
	}

	spa_config_exit(spa, SCL_VDEV, FTAG);
	return (expandable);
}
/* /*
* ========================================================================== * ==========================================================================
* Metaslab groups * Metaslab groups
@ -288,7 +429,15 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1); (vs->vs_space + 1);
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); /*
* A metaslab group is considered allocatable if it has plenty
* of free space or is not heavily fragmented. We only take
* fragmentation into account if the metaslab group has a valid
* fragmentation metric (i.e. a value between 0 and 100).
*/
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
/* /*
* The mc_alloc_groups maintains a count of the number of * The mc_alloc_groups maintains a count of the number of
@ -309,6 +458,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
mc->mc_alloc_groups--; mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable) else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++; mc->mc_alloc_groups++;
mutex_exit(&mg->mg_lock); mutex_exit(&mg->mg_lock);
} }
@ -398,6 +548,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
} }
taskq_wait(mg->mg_taskq); taskq_wait(mg->mg_taskq);
metaslab_group_alloc_update(mg);
mgprev = mg->mg_prev; mgprev = mg->mg_prev;
mgnext = mg->mg_next; mgnext = mg->mg_next;
@ -414,20 +565,115 @@ metaslab_group_passivate(metaslab_group_t *mg)
mg->mg_next = NULL; mg->mg_next = NULL;
} }
/*
 * Return the space covered by this group's metaslabs: the size of one
 * metaslab (2^vdev_ms_shift) multiplied by the vdev's metaslab count.
 */
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t ms_size = 1ULL << vd->vdev_ms_shift;

	return (ms_size * vd->vdev_ms_count);
}
/*
 * Debug-only consistency check for the group-wide histogram.
 *
 * Rebuilds a histogram from the space map histograms of every metaslab
 * on the group's vdev (metaslabs without a space map are skipped) and
 * verifies that it equals mg->mg_histogram. Space map bucket i is
 * accumulated into bucket i + vdev_ashift. Does nothing unless
 * ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags.
 */
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	size_t hist_bytes = sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE;
	uint64_t *mg_hist;
	int i, ms_idx;

	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
		return;

	mg_hist = kmem_zalloc(hist_bytes, KM_PUSHPAGE);

	/* The shifted space map buckets must fit in the larger table. */
	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (ms_idx = 0; ms_idx < vd->vdev_ms_count; ms_idx++) {
		metaslab_t *msp = vd->vdev_ms[ms_idx];
		uint64_t *sm_hist;

		if (msp->ms_sm == NULL)
			continue;

		sm_hist = msp->ms_sm->sm_phys->smp_histogram;
		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] += sm_hist[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, hist_bytes);
}
/*
 * Add a metaslab's space map histogram to the histograms kept by its
 * metaslab group and by the group's metaslab class.
 *
 * Space map bucket i is added to bucket i + vdev_ashift of both
 * histograms. The caller must hold msp->ms_lock; mg->mg_lock is taken
 * here for the duration of the update. A metaslab with no space map
 * contributes nothing.
 */
static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;
	uint64_t *sm_hist;
	int i;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	sm_hist = msp->ms_sm->sm_phys->smp_histogram;
	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t count = sm_hist[i];

		mg->mg_histogram[i + ashift] += count;
		mc->mc_histogram[i + ashift] += count;
	}
	mutex_exit(&mg->mg_lock);
}
/*
 * Subtract a metaslab's space map histogram from the histograms kept by
 * its metaslab group and by the group's metaslab class.
 *
 * This mirrors metaslab_group_histogram_add(): space map bucket i is
 * removed from bucket i + vdev_ashift of both histograms. Debug builds
 * assert that neither histogram would underflow. The caller must hold
 * msp->ms_lock; mg->mg_lock is taken here for the duration of the update.
 * A metaslab with no space map contributes nothing.
 */
void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;
	int i;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t count;

		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		count = msp->ms_sm->sm_phys->smp_histogram[i];
		mg->mg_histogram[i + ashift] -= count;
		mc->mc_histogram[i + ashift] -= count;
	}
	mutex_exit(&mg->mg_lock);
}
static void static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{ {
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == NULL); ASSERT(msp->ms_group == NULL);
mutex_enter(&mg->mg_lock);
msp->ms_group = mg; msp->ms_group = mg;
msp->ms_weight = 0; msp->ms_weight = 0;
avl_add(&mg->mg_metaslab_tree, msp); avl_add(&mg->mg_metaslab_tree, msp);
mutex_exit(&mg->mg_lock); mutex_exit(&mg->mg_lock);
mutex_enter(&msp->ms_lock);
metaslab_group_histogram_add(mg, msp);
mutex_exit(&msp->ms_lock);
} }
static void static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{ {
mutex_enter(&msp->ms_lock);
metaslab_group_histogram_remove(mg, msp);
mutex_exit(&msp->ms_lock);
mutex_enter(&mg->mg_lock); mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg); ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp); avl_remove(&mg->mg_metaslab_tree, msp);
@ -440,9 +686,9 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{ {
/* /*
* Although in principle the weight can be any value, in * Although in principle the weight can be any value, in
* practice we do not use values in the range [1, 510]. * practice we do not use values in the range [1, 511].
*/ */
ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock); mutex_enter(&mg->mg_lock);
@ -453,10 +699,44 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
mutex_exit(&mg->mg_lock); mutex_exit(&mg->mg_lock);
} }
/*
 * Compute the fragmentation metric for a metaslab group as the plain
 * average of ms_fragmentation over the metaslabs that have a valid
 * (non-ZFS_FRAG_INVALID) value.
 *
 * Returns a value in the range [0, 100], or ZFS_FRAG_INVALID unless more
 * than half of the vdev's metaslabs have a valid fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t nvalid = 0;
	int ms_idx;

	for (ms_idx = 0; ms_idx < vd->vdev_ms_count; ms_idx++) {
		metaslab_t *msp = vd->vdev_ms[ms_idx];

		if (msp->ms_fragmentation != ZFS_FRAG_INVALID) {
			nvalid++;
			fragmentation += msp->ms_fragmentation;
		}
	}

	/* Too few samples to be meaningful (also covers nvalid == 0). */
	if (nvalid <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= nvalid;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}
/* /*
* Determine if a given metaslab group should skip allocations. A metaslab * Determine if a given metaslab group should skip allocations. A metaslab
* group should avoid allocations if its used capacity has crossed the * group should avoid allocations if its free capacity is less than the
* zfs_mg_noalloc_threshold and there is at least one metaslab group * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
* zfs_mg_fragmentation_threshold and there is at least one metaslab group
* that can still handle allocations. * that can still handle allocations.
*/ */
static boolean_t static boolean_t
@ -467,12 +747,19 @@ metaslab_group_allocatable(metaslab_group_t *mg)
metaslab_class_t *mc = mg->mg_class; metaslab_class_t *mc = mg->mg_class;
/* /*
* A metaslab group is considered allocatable if its free capacity * We use two key metrics to determine if a metaslab group is
* is greater than the set value of zfs_mg_noalloc_threshold, it's * considered allocatable -- free space and fragmentation. If
* associated with a slog, or there are no other metaslab groups * the free space is greater than the free space threshold and
* with free capacity greater than zfs_mg_noalloc_threshold. * the fragmentation is less than the fragmentation threshold then
* consider the group allocatable. There are two cases when we will
* not consider these key metrics. The first is if the group is
* associated with a slog device and the second is if all groups
* in this metaslab class have already been considered ineligible
* for allocations.
*/ */
return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
(mg->mg_fragmentation == ZFS_FRAG_INVALID ||
mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) ||
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
} }
@ -701,16 +988,8 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
return (metaslab_block_picker(t, cursor, size, align)); return (metaslab_block_picker(t, cursor, size, align));
} }
/* ARGSUSED */
static boolean_t
metaslab_ff_fragmented(metaslab_t *msp)
{
return (B_TRUE);
}
static metaslab_ops_t metaslab_ff_ops = { static metaslab_ops_t metaslab_ff_ops = {
metaslab_ff_alloc, metaslab_ff_alloc
metaslab_ff_fragmented
}; };
metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; metaslab_ops_t *zfs_metaslab_ops = &metaslab_ff_ops;
@ -761,24 +1040,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
return (metaslab_block_picker(t, cursor, size, 1ULL)); return (metaslab_block_picker(t, cursor, size, 1ULL));
} }
static boolean_t
metaslab_df_fragmented(metaslab_t *msp)
{
range_tree_t *rt = msp->ms_tree;
uint64_t max_size = metaslab_block_maxsize(msp);
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
if (max_size >= metaslab_df_alloc_threshold &&
free_pct >= metaslab_df_free_pct)
return (B_FALSE);
return (B_TRUE);
}
static metaslab_ops_t metaslab_df_ops = { static metaslab_ops_t metaslab_df_ops = {
metaslab_df_alloc, metaslab_df_alloc
metaslab_df_fragmented
}; };
metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
@ -825,15 +1088,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
return (offset); return (offset);
} }
static boolean_t
metaslab_cf_fragmented(metaslab_t *msp)
{
return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
}
static metaslab_ops_t metaslab_cf_ops = { static metaslab_ops_t metaslab_cf_ops = {
metaslab_cf_alloc, metaslab_cf_alloc
metaslab_cf_fragmented
}; };
metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
@ -894,16 +1150,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
return (-1ULL); return (-1ULL);
} }
static boolean_t
metaslab_ndf_fragmented(metaslab_t *msp)
{
return (metaslab_block_maxsize(msp) <=
(metaslab_min_alloc_size << metaslab_ndf_clump_shift));
}
static metaslab_ops_t metaslab_ndf_ops = { static metaslab_ops_t metaslab_ndf_ops = {
metaslab_ndf_alloc, metaslab_ndf_alloc
metaslab_ndf_fragmented
}; };
metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
@ -1008,6 +1256,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
metaslab_group_add(mg, msp); metaslab_group_add(mg, msp);
msp->ms_fragmentation = metaslab_fragmentation(msp);
msp->ms_ops = mg->mg_class->mc_ops; msp->ms_ops = mg->mg_class->mc_ops;
/* /*
@ -1075,69 +1324,114 @@ metaslab_fini(metaslab_t *msp)
kmem_free(msp, sizeof (metaslab_t)); kmem_free(msp, sizeof (metaslab_t));
} }
#define FRAGMENTATION_TABLE_SIZE 17
/* /*
* Apply a weighting factor based on the histogram information for this * This table defines a segment size based fragmentation metric that will
* metaslab. The current weighting factor is somewhat arbitrary and requires * allow each metaslab to derive its own fragmentation value. This is done
* additional investigation. The implementation provides a measure of * by calculating the space in each bucket of the spacemap histogram and
* "weighted" free space and gives a higher weighting for larger contiguous * multiplying that by the fragmentation metric in this table. Doing
* regions. The weighting factor is determined by counting the number of * this for all buckets and dividing it by the total amount of free
* sm_shift sectors that exist in each region represented by the histogram. * space in this metaslab (i.e. the total free space in all buckets) gives
* That value is then multiplied by the power of 2 exponent and the sm_shift * us the fragmentation metric. This means that a high fragmentation metric
* value. * equates to most of the free space being comprised of small segments.
* Conversely, if the metric is low, then most of the free space is in
* large segments. A 10% change in fragmentation equates to approximately
* double the number of segments.
* *
* For example, assume the 2^21 histogram bucket has 4 2MB regions and the * This table defines 0% fragmented space using 16MB segments. Testing has
* metaslab has an sm_shift value of 9 (512B): * shown that segments that are greater than or equal to 16MB do not suffer
* * from drastic performance problems. Using this value, we derive the rest
* 1) calculate the number of sm_shift sectors in the region: * of the table. Since the fragmentation value is never stored on disk, it
* 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384 * is possible to change these calculations in the future.
* 2) multiply by the power of 2 exponent and the sm_shift value: */
* 16384 * 21 * 9 = 3096576 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
* This value will be added to the weighting of the metaslab. 100, /* 512B */
100, /* 1K */
98, /* 2K */
95, /* 4K */
90, /* 8K */
80, /* 16K */
70, /* 32K */
60, /* 64K */
50, /* 128K */
40, /* 256K */
30, /* 512K */
20, /* 1M */
15, /* 2M */
10, /* 4M */
5, /* 8M */
0 /* 16M */
};
/*
* Calculate the metaslab's fragmentation metric. A return value
* of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
* not support this metric. Otherwise, the return value should be in the
* range [0, 100].
*/ */
static uint64_t static uint64_t
metaslab_weight_factor(metaslab_t *msp) metaslab_fragmentation(metaslab_t *msp)
{ {
uint64_t factor = 0; spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
uint64_t sectors; uint64_t fragmentation = 0;
uint64_t total = 0;
boolean_t feature_enabled = spa_feature_is_enabled(spa,
SPA_FEATURE_SPACEMAP_HISTOGRAM);
int i; int i;
if (!feature_enabled)
return (ZFS_FRAG_INVALID);
/* /*
* A null space map means that the entire metaslab is free, * A null space map means that the entire metaslab is free
* calculate a weight factor that spans the entire size of the * and thus is not fragmented.
* metaslab.
*/ */
if (msp->ms_sm == NULL) { if (msp->ms_sm == NULL)
vdev_t *vd = msp->ms_group->mg_vd;
i = highbit64(msp->ms_size) - 1;
sectors = msp->ms_size >> vd->vdev_ashift;
return (sectors * i * vd->vdev_ashift);
}
if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
return (0); return (0);
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) { /*
* If this metaslab's space_map has not been upgraded, flag it
* so that we upgrade next time we encounter it.
*/
if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
uint64_t txg = spa_syncing_txg(spa);
vdev_t *vd = msp->ms_group->mg_vd;
msp->ms_condense_wanted = B_TRUE;
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
spa_dbgmsg(spa, "txg %llu, requesting force condense: "
"msp %p, vd %p", txg, msp, vd);
return (ZFS_FRAG_INVALID);
}
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
uint64_t space = 0;
uint8_t shift = msp->ms_sm->sm_shift;
int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
FRAGMENTATION_TABLE_SIZE - 1);
if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
continue; continue;
/* space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
* Determine the number of sm_shift sectors in the region total += space;
* indicated by the histogram. For example, given an
* sm_shift value of 9 (512 bytes) and i = 4 then we know ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
* that we're looking at an 8K region in the histogram fragmentation += space * zfs_frag_table[idx];
* (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
* number of sm_shift sectors (512 bytes in this example),
* we would take 8192 / 512 = 16. Since the histogram
* is offset by sm_shift we can simply use the value of
* of i to calculate this (i.e. 2^i = 16 where i = 4).
*/
sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
factor += (i + msp->ms_sm->sm_shift) * sectors;
}
return (factor * msp->ms_sm->sm_shift);
} }
if (total > 0)
fragmentation /= total;
ASSERT3U(fragmentation, <=, 100);
return (fragmentation);
}
/*
* Compute a weight -- a selection preference value -- for the given metaslab.
* This is based on the amount of free space, the level of fragmentation,
* the LBA range, and whether the metaslab is loaded.
*/
static uint64_t static uint64_t
metaslab_weight(metaslab_t *msp) metaslab_weight(metaslab_t *msp)
{ {
@ -1161,6 +1455,29 @@ metaslab_weight(metaslab_t *msp)
* The baseline weight is the metaslab's free space. * The baseline weight is the metaslab's free space.
*/ */
space = msp->ms_size - space_map_allocated(msp->ms_sm); space = msp->ms_size - space_map_allocated(msp->ms_sm);
msp->ms_fragmentation = metaslab_fragmentation(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
/*
* Use the fragmentation information to inversely scale
* down the baseline weight. We need to ensure that we
* don't exclude this metaslab completely when it's 100%
* fragmented. To avoid this we reduce the fragmented value
* by 1.
*/
space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
/*
* If space < SPA_MINBLOCKSIZE, then we will not allocate from
* this metaslab again. The fragmentation metric may have
* decreased the space to something smaller than
* SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
* so that we can consume any remaining space.
*/
if (space > 0 && space < SPA_MINBLOCKSIZE)
space = SPA_MINBLOCKSIZE;
}
weight = space; weight = space;
/* /*
@ -1172,19 +1489,19 @@ metaslab_weight(metaslab_t *msp)
* In effect, this means that we'll select the metaslab with the most * In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space. * free bandwidth rather than simply the one with the most free space.
*/ */
if (metaslab_lba_weighting_enabled) {
weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
ASSERT(weight >= space && weight <= 2 * space); ASSERT(weight >= space && weight <= 2 * space);
}
msp->ms_factor = metaslab_weight_factor(msp);
if (metaslab_weight_factor_enable)
weight += msp->ms_factor;
if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
/* /*
* If this metaslab is one we're actively using, adjust its * If this metaslab is one we're actively using, adjust its
* weight to make it preferable to any inactive metaslab so * weight to make it preferable to any inactive metaslab so
* we'll polish it off. * we'll polish it off. If the fragmentation on this metaslab
* has exceeded our threshold, then don't mark it active.
*/ */
if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
} }
@ -1269,9 +1586,16 @@ metaslab_group_preload(metaslab_group_t *mg)
while (msp != NULL) { while (msp != NULL) {
metaslab_t *msp_next = AVL_NEXT(t, msp); metaslab_t *msp_next = AVL_NEXT(t, msp);
/* If we have reached our preload limit then we're done */ /*
if (++m > metaslab_preload_limit) * We preload only the maximum number of metaslabs specified
break; * by metaslab_preload_limit. If a metaslab is being forced
* to condense then we preload it too. This will ensure
* that force condensing happens in the next txg.
*/
if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
msp = msp_next;
continue;
}
/* /*
* We must drop the metaslab group lock here to preserve * We must drop the metaslab group lock here to preserve
@ -1329,11 +1653,12 @@ metaslab_should_condense(metaslab_t *msp)
/* /*
* Use the ms_size_tree range tree, which is ordered by size, to * Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. If the tree is empty * obtain the largest segment in the free tree. We always condense
* then we should condense the map. * metaslabs that are empty and metaslabs for which a condense
* request has been made.
*/ */
rs = avl_last(&msp->ms_size_tree); rs = avl_last(&msp->ms_size_tree);
if (rs == NULL) if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE); return (B_TRUE);
/* /*
@ -1369,9 +1694,14 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT3U(spa_sync_pass(spa), ==, 1);
ASSERT(msp->ms_loaded); ASSERT(msp->ms_loaded);
spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
"smp size %llu, segments %lu", txg, msp->ms_id, msp, "smp size %llu, segments %lu, forcing condense=%s", txg,
space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root)); msp->ms_id, msp, space_map_length(msp->ms_sm),
avl_numnodes(&msp->ms_tree->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
/* /*
* Create an range tree that is 100% allocated. We remove segments * Create an range tree that is 100% allocated. We remove segments
@ -1464,8 +1794,14 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT3P(*freetree, !=, NULL); ASSERT3P(*freetree, !=, NULL);
ASSERT3P(*freed_tree, !=, NULL); ASSERT3P(*freed_tree, !=, NULL);
/*
* Normally, we don't want to process a metaslab if there
* are no allocations or frees to perform. However, if the metaslab
* is being forced to condense we need to let it through.
*/
if (range_tree_space(alloctree) == 0 && if (range_tree_space(alloctree) == 0 &&
range_tree_space(*freetree) == 0) range_tree_space(*freetree) == 0 &&
!msp->ms_condense_wanted)
return; return;
/* /*
@ -1502,8 +1838,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
} }
range_tree_vacate(alloctree, NULL, NULL); metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
metaslab_group_histogram_remove(mg, msp);
if (msp->ms_loaded) { if (msp->ms_loaded) {
/* /*
* When the space map is loaded, we have an accruate * When the space map is loaded, we have an accruate
@ -1523,6 +1860,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/ */
space_map_histogram_add(msp->ms_sm, *freetree, tx); space_map_histogram_add(msp->ms_sm, *freetree, tx);
} }
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
/* /*
* For sync pass 1, we avoid traversing this txg's free range tree * For sync pass 1, we avoid traversing this txg's free range tree
@ -1535,6 +1875,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
} else { } else {
range_tree_vacate(*freetree, range_tree_add, *freed_tree); range_tree_vacate(*freetree, range_tree_add, *freed_tree);
} }
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
@ -1646,13 +1987,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
metaslab_group_sort(mg, msp, metaslab_weight(msp)); metaslab_group_sort(mg, msp, metaslab_weight(msp));
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
} }
void void
metaslab_sync_reassess(metaslab_group_t *mg) metaslab_sync_reassess(metaslab_group_t *mg)
{ {
metaslab_group_alloc_update(mg); metaslab_group_alloc_update(mg);
mg->mg_fragmentation = metaslab_group_fragmentation(mg);
/* /*
* Preload the next potential metaslabs * Preload the next potential metaslabs
@ -1926,9 +2267,7 @@ top:
*/ */
if ((vd->vdev_stat.vs_write_errors > 0 || if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) && vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && dshift == 3 && d == 0 && dshift == 3 && vd->vdev_children == 0) {
!(zfs_write_to_degraded && vd->vdev_state ==
VDEV_STATE_DEGRADED)) {
all_zero = B_FALSE; all_zero = B_FALSE;
goto next; goto next;
} }
@ -1953,7 +2292,7 @@ top:
* over- or under-used relative to the pool, * over- or under-used relative to the pool,
* and set an allocation bias to even it out. * and set an allocation bias to even it out.
*/ */
if (mc->mc_aliquot == 0) { if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu; int64_t vu, cu;
@ -1975,6 +2314,8 @@ top:
*/ */
mg->mg_bias = ((cu - vu) * mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / 100; (int64_t)mg->mg_aliquot) / 100;
} else if (!metaslab_bias_enabled) {
mg->mg_bias = 0;
} }
if ((flags & METASLAB_FASTWRITE) || if ((flags & METASLAB_FASTWRITE) ||
@ -2305,12 +2646,32 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)
module_param(metaslab_debug_load, int, 0644); module_param(metaslab_debug_load, int, 0644);
module_param(metaslab_debug_unload, int, 0644); module_param(metaslab_debug_unload, int, 0644);
module_param(metaslab_preload_enabled, int, 0644);
module_param(zfs_mg_noalloc_threshold, int, 0644);
module_param(zfs_mg_fragmentation_threshold, int, 0644);
module_param(zfs_metaslab_fragmentation_threshold, int, 0644);
module_param(metaslab_fragmentation_factor_enabled, int, 0644);
module_param(metaslab_lba_weighting_enabled, int, 0644);
module_param(metaslab_bias_enabled, int, 0644);
MODULE_PARM_DESC(metaslab_debug_load, MODULE_PARM_DESC(metaslab_debug_load,
"load all metaslabs when pool is first opened"); "load all metaslabs when pool is first opened");
MODULE_PARM_DESC(metaslab_debug_unload, MODULE_PARM_DESC(metaslab_debug_unload,
"prevent metaslabs from being unloaded"); "prevent metaslabs from being unloaded");
MODULE_PARM_DESC(metaslab_preload_enabled,
"preload potential metaslabs during reassessment");
module_param(zfs_mg_noalloc_threshold, int, 0644);
MODULE_PARM_DESC(zfs_mg_noalloc_threshold, MODULE_PARM_DESC(zfs_mg_noalloc_threshold,
"percentage of free space for metaslab group to allow allocation"); "percentage of free space for metaslab group to allow allocation");
MODULE_PARM_DESC(zfs_mg_fragmentation_threshold,
"fragmentation for metaslab group to allow allocation");
MODULE_PARM_DESC(zfs_metaslab_fragmentation_threshold,
"fragmentation for metaslab to allow allocation");
MODULE_PARM_DESC(metaslab_fragmentation_factor_enabled,
"use the fragmentation metric to prefer less fragmented metaslabs");
MODULE_PARM_DESC(metaslab_lba_weighting_enabled,
"prefer metaslabs with lower LBAs");
MODULE_PARM_DESC(metaslab_bias_enabled,
"enable metaslab group biasing");
#endif /* _KERNEL && HAVE_SPL */ #endif /* _KERNEL && HAVE_SPL */

View File

@ -81,6 +81,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
uint64_t size = rs->rs_end - rs->rs_start; uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1; int idx = highbit64(size) - 1;
ASSERT(size != 0);
ASSERT3U(idx, <, ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
@ -95,6 +96,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
uint64_t size = rs->rs_end - rs->rs_start; uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1; int idx = highbit64(size) - 1;
ASSERT(size != 0);
ASSERT3U(idx, <, ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

View File

@ -190,13 +190,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{ {
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
dsl_pool_t *pool = spa->spa_dsl_pool; dsl_pool_t *pool = spa->spa_dsl_pool;
uint64_t size; uint64_t size, alloc, cap, version;
uint64_t alloc;
uint64_t space;
uint64_t cap, version;
zprop_source_t src = ZPROP_SRC_NONE; zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp; spa_config_dirent_t *dp;
int c; metaslab_class_t *mc = spa_normal_class(spa);
ASSERT(MUTEX_HELD(&spa->spa_props_lock)); ASSERT(MUTEX_HELD(&spa->spa_props_lock));
@ -209,14 +206,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src); size - alloc, src);
space = 0; spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
for (c = 0; c < rvd->vdev_children; c++) { metaslab_class_fragmentation(mc), src);
vdev_t *tvd = rvd->vdev_child[c]; spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
space += tvd->vdev_max_asize - tvd->vdev_asize; metaslab_class_expandable_space(mc), src);
}
spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
src);
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
(spa_mode(spa) == FREAD), src); (spa_mode(spa) == FREAD), src);

View File

@ -205,10 +205,10 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
* reached the maximum bucket size. Accumulate all ranges * reached the maximum bucket size. Accumulate all ranges
* larger than the max bucket size into the last bucket. * larger than the max bucket size into the last bucket.
*/ */
if (idx < SPACE_MAP_HISTOGRAM_SIZE(sm) - 1) { if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
ASSERT3U(idx + sm->sm_shift, ==, i); ASSERT3U(idx + sm->sm_shift, ==, i);
idx++; idx++;
ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE(sm)); ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
} }
} }
} }

View File

@ -539,7 +539,9 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
} }
spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
vdev_get_stats(spa->spa_root_vdev, vs1); vdev_get_stats(spa->spa_root_vdev, vs1);
spa_config_exit(spa, SCL_ALL, FTAG);
/* /*
* Consume the quiesced txg which has been handed off to * Consume the quiesced txg which has been handed off to
@ -575,7 +577,9 @@ txg_sync_thread(dsl_pool_t *dp)
*/ */
txg_dispatch_callbacks(dp, txg); txg_dispatch_callbacks(dp, txg);
spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
vdev_get_stats(spa->spa_root_vdev, vs2); vdev_get_stats(spa->spa_root_vdev, vs2);
spa_config_exit(spa, SCL_ALL, FTAG);
spa_txg_history_set_io(spa, txg, spa_txg_history_set_io(spa, txg,
vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],

View File

@ -2151,11 +2151,16 @@ vdev_remove(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset; objset_t *mos = spa->spa_meta_objset;
dmu_tx_t *tx; dmu_tx_t *tx;
int m; int m, i;
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
if (vd->vdev_ms != NULL) { if (vd->vdev_ms != NULL) {
metaslab_group_t *mg = vd->vdev_mg;
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (m = 0; m < vd->vdev_ms_count; m++) { for (m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m]; metaslab_t *msp = vd->vdev_ms[m];
@ -2163,12 +2168,27 @@ vdev_remove(vdev_t *vd, uint64_t txg)
continue; continue;
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
/*
* If the metaslab was not loaded when the vdev
* was removed then the histogram accounting may
* not be accurate. Update the histogram information
* here so that we ensure that the metaslab group
* and metaslab class are up-to-date.
*/
metaslab_group_histogram_remove(mg, msp);
VERIFY0(space_map_allocated(msp->ms_sm)); VERIFY0(space_map_allocated(msp->ms_sm));
space_map_free(msp->ms_sm, tx); space_map_free(msp->ms_sm, tx);
space_map_close(msp->ms_sm); space_map_close(msp->ms_sm);
msp->ms_sm = NULL; msp->ms_sm = NULL;
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
} }
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
} }
if (vd->vdev_ms_array) { if (vd->vdev_ms_array) {
@ -2621,9 +2641,12 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
void void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{ {
vdev_t *rvd = vd->vdev_spa->spa_root_vdev; spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
int c, t; int c, t;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
mutex_enter(&vd->vdev_stat_lock); mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs)); bcopy(&vd->vdev_stat, vs, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
@ -2632,7 +2655,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
if (vd->vdev_ops->vdev_op_leaf) if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
mutex_exit(&vd->vdev_stat_lock); if (vd->vdev_aux == NULL && vd == vd->vdev_top)
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
/* /*
* If we're getting stats on the root vdev, aggregate the I/O counts * If we're getting stats on the root vdev, aggregate the I/O counts
@ -2643,16 +2667,15 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vdev_t *cvd = rvd->vdev_child[c]; vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_t *cvs = &cvd->vdev_stat;
mutex_enter(&vd->vdev_stat_lock);
for (t = 0; t < ZIO_TYPES; t++) { for (t = 0; t < ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t]; vs->vs_bytes[t] += cvs->vs_bytes[t];
} }
cvs->vs_scan_removing = cvd->vdev_removing; cvs->vs_scan_removing = cvd->vdev_removing;
}
}
mutex_exit(&vd->vdev_stat_lock); mutex_exit(&vd->vdev_stat_lock);
} }
}
}
void void
vdev_clear_stats(vdev_t *vd) vdev_clear_stats(vdev_t *vd)