Pool allocation classes

Allocation Classes add the ability to have allocation classes in a
pool that are dedicated to serving specific block categories, such
as DDT data, metadata, and small file blocks. A pool can opt-in to
this feature by adding a 'special' or 'dedup' top-level VDEV.

Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Alek Pinchuk <apinchuk@datto.com>
Reviewed-by: Håkan Johansson <f96hajo@chalmers.se>
Reviewed-by: Andreas Dilger <andreas.dilger@chamcloud.com>
Reviewed-by: DHE <git@dehacked.net>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Gregor Kopka <gregor@kopka.net>
Reviewed-by: Kash Pande <kash@tripleback.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Don Brady <don.brady@delphix.com>
Closes #5182
This commit is contained in:
Don Brady
2018-09-05 19:33:36 -06:00
committed by Brian Behlendorf
parent cfa37548eb
commit cc99f275a2
57 changed files with 2326 additions and 282 deletions
+9 -1
View File
@@ -20,10 +20,11 @@
*/
/*
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
#ifndef _KERNEL
@@ -408,6 +409,13 @@ zpool_feature_init(void)
ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
project_quota_deps);
}
{
zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
"org.zfsonlinux:allocation_classes", "allocation_classes",
"Support for separate allocation classes.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
}
}
#if defined(_KERNEL)
+3
View File
@@ -538,6 +538,9 @@ zfs_prop_init(void)
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
"special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"zero or 512 to 128K, power of 2", "SPECIAL_SMALL_BLOCKS");
/* hidden properties */
zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+2
View File
@@ -2281,6 +2281,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
os->os_zpl_special_smallblock : 0;
ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
}
+20
View File
@@ -314,6 +314,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval)
}
}
static void
smallblk_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
/*
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
ASSERT(ISP2(newval));
os->os_zpl_special_smallblock = newval;
}
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
@@ -556,6 +570,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_DNODESIZE),
dnodesize_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
smallblk_changed_cb, os);
}
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+84 -66
View File
@@ -20,8 +20,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -300,7 +301,7 @@ metaslab_class_validate(metaslab_class_t *mc)
return (0);
}
void
static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
@@ -337,7 +338,8 @@ metaslab_class_get_dspace(metaslab_class_t *mc)
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
vdev_t *rvd = mc->mc_spa->spa_root_vdev;
spa_t *spa = mc->mc_spa;
vdev_t *rvd = spa->spa_root_vdev;
uint64_t *mc_hist;
int i;
@@ -834,7 +836,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
if (msp->ms_sm == NULL)
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
@@ -967,12 +970,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
if (msp->ms_group != mg)
continue;
valid_ms++;
fragmentation += msp->ms_fragmentation;
}
if (valid_ms <= vd->vdev_ms_count / 2)
if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms;
@@ -1003,7 +1008,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
if ((mc != spa_normal_class(spa) &&
mc != spa_special_class(spa) &&
mc != spa_dedup_class(spa)) ||
mc->mc_groups <= 1)
return (B_TRUE);
/*
@@ -1466,12 +1474,26 @@ metaslab_unload(metaslab_t *msp)
msp->ms_max_size = 0;
}
static void
metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta)
{
vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
ASSERT(vd->vdev_ms_count != 0);
metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
vdev_deflated_space(vd, space_delta));
}
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
objset_t *mos = vd->vdev_spa->spa_meta_objset;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
metaslab_t *ms;
int error;
@@ -1528,8 +1550,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* If metaslab_debug_load is set and we're initializing a metaslab
* that has an allocated space map object then load the its space
* map so that can verify frees.
* that has an allocated space map object then load the space map
* so that we can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
@@ -1551,16 +1573,19 @@ void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
0, -msp->ms_size);
metaslab_space_update(vd, mg->mg_class,
-space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
metaslab_unload(msp);
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
@@ -2583,7 +2608,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
vdev_space_update(vd, 0, 0, msp->ms_size);
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
@@ -2605,7 +2630,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta -= range_tree_space(*defer_tree);
}
vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
defer_delta, 0);
/*
* If there's a metaslab_load() in progress, wait for it to complete
@@ -2704,21 +2730,25 @@ metaslab_sync_reassess(metaslab_group_t *mg)
spa_config_exit(spa, SCL_ALLOC, FTAG);
}
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
/*
* When writing a ditto block (i.e. more than one DVA for a given BP) on
* the same vdev as an existing DVA of this BP, then try to allocate it
* on a different metaslab than existing DVAs (i.e. a unique metaslab).
*/
static boolean_t
metaslab_is_unique(metaslab_t *msp, dva_t *dva)
{
uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
uint64_t start = msp->ms_id;
uint64_t dva_ms_id;
if (DVA_GET_ASIZE(dva) == 0)
return (B_TRUE);
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
return (1ULL << 63);
return (B_TRUE);
if (offset < start)
return ((start - offset) << ms_shift);
if (offset > start)
return ((offset - start) << ms_shift);
return (0);
dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
return (msp->ms_id != dva_ms_id);
}
/*
@@ -2978,7 +3008,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
*/
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
{
avl_index_t idx;
@@ -3012,13 +3042,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
break;
uint64_t target_distance = min_distance
+ (space_map_allocated(msp->ms_sm) != 0 ? 0 :
min_distance >> 1);
for (i = 0; i < d; i++) {
if (metaslab_distance(msp, &dva[i]) < target_distance)
break;
if (want_unique &&
!metaslab_is_unique(msp, &dva[i]))
break; /* try another metaslab */
}
if (i == d)
break;
@@ -3036,8 +3063,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
int allocator)
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
int d, int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -3091,7 +3118,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
min_distance, asize, allocator, zal, search,
want_unique, asize, allocator, zal, search,
&was_active);
}
@@ -3221,6 +3248,7 @@ next:
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
mutex_exit(&msp->ms_lock);
}
mutex_exit(&msp->ms_lock);
@@ -3230,14 +3258,14 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
int allocator)
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
int d, int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
min_distance, dva, d, allocator);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -3264,14 +3292,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
return (offset);
}
/*
* If we have to write a ditto block (i.e. more than one DVA for a given BP)
* on the same vdev as an existing DVA of this BP, then try to allocate it
* at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
* existing DVAs.
*/
int ditto_same_vdev_distance_shift = 3;
/*
* Allocate a block for the specified i/o.
*/
@@ -3288,6 +3308,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
* This will also test spilling from special to normal.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
@@ -3348,6 +3369,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
} else {
ASSERT(mc->mc_rotor != NULL);
mg = mc->mc_rotor;
}
@@ -3412,25 +3434,17 @@ top:
ASSERT(mg->mg_class == mc);
/*
* If we don't need to try hard, then require that the
* block be 1/8th of the device away from any other DVAs
* in this BP. If we are trying hard, allow any offset
* to be used (distance=0).
*/
uint64_t distance = 0;
if (!try_hard) {
distance = vd->vdev_asize >>
ditto_same_vdev_distance_shift;
if (distance <= (1ULL << vd->vdev_ms_shift))
distance = 0;
}
uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
/*
* If we don't need to try hard, then require that the
* block be on an different metaslab from any other DVAs
* in this BP (unique=true). If we are trying hard, then
* allow any metaslab to be used (unique=false).
*/
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
distance, dva, d, allocator);
!try_hard, dva, d, allocator);
if (offset != -1ULL) {
/*
@@ -3830,7 +3844,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
if (reserved_slots < max)
available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
if (slots <= available_slots || GANG_ALLOCATION(flags) ||
flags & METASLAB_MUST_RESERVE) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
@@ -4107,9 +4122,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
for (int d = 0; d < ndvas; d++)
if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
for (int d = 0; d < ndvas; d++) {
error = metaslab_claim_dva(spa, &dva[d], txg);
if (error != 0)
break;
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -4235,7 +4252,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
}
#if defined(_KERNEL)
/* CSTYLED */
/* BEGIN CSTYLED */
module_param(metaslab_aliquot, ulong, 0644);
MODULE_PARM_DESC(metaslab_aliquot,
"allocation granularity (a.k.a. stripe size)");
@@ -4284,8 +4301,9 @@ module_param(zfs_metaslab_switch_threshold, int, 0644);
MODULE_PARM_DESC(zfs_metaslab_switch_threshold,
"segment-based metaslab selection maximum buckets before switching");
/* CSTYLED */
module_param(metaslab_force_ganging, ulong, 0644);
MODULE_PARM_DESC(metaslab_force_ganging,
"blocks larger than this size are forced to be gang blocks");
/* END CSTYLED */
#endif
+53 -12
View File
@@ -31,6 +31,7 @@
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
/*
@@ -272,8 +273,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (rvd != NULL) {
alloc = metaslab_class_get_alloc(spa_normal_class(spa));
size = metaslab_class_get_space(spa_normal_class(spa));
alloc = metaslab_class_get_alloc(mc);
alloc += metaslab_class_get_alloc(spa_special_class(spa));
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
size = metaslab_class_get_space(mc);
size += metaslab_class_get_space(spa_special_class(spa));
size += metaslab_class_get_space(spa_dedup_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
@@ -1173,6 +1180,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
@@ -1320,6 +1329,12 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
metaslab_class_destroy(spa->spa_special_class);
spa->spa_special_class = NULL;
metaslab_class_destroy(spa->spa_dedup_class);
spa->spa_dedup_class = NULL;
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
@@ -4988,7 +5003,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
char *poolname;
nvlist_t *nvl;
if (nvlist_lookup_string(props, "tname", &poolname) != 0)
if (props == NULL ||
nvlist_lookup_string(props, "tname", &poolname) != 0)
poolname = (char *)pool;
/*
@@ -5092,9 +5108,15 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_metaslab_set_size(rvd->vdev_child[c]);
vdev_expand(rvd->vdev_child[c], txg);
/*
* instantiate the metaslab groups (this will dirty the vdevs)
* we can no longer error exit past this point
*/
for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_metaslab_set_size(vd);
vdev_expand(vd, txg);
}
}
@@ -6940,8 +6962,14 @@ spa_async_thread(void *arg)
mutex_enter(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
old_space += metaslab_class_get_space(spa_special_class(spa));
old_space += metaslab_class_get_space(spa_dedup_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
new_space = metaslab_class_get_space(spa_normal_class(spa));
new_space += metaslab_class_get_space(spa_special_class(spa));
new_space += metaslab_class_get_space(spa_dedup_class(spa));
mutex_exit(&spa_namespace_lock);
/*
@@ -7630,6 +7658,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
metaslab_class_t *normal = spa_normal_class(spa);
metaslab_class_t *special = spa_special_class(spa);
metaslab_class_t *dedup = spa_dedup_class(spa);
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
@@ -7723,9 +7754,13 @@ spa_sync(spa_t *spa, uint64_t txg)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
metaslab_class_t *mc;
if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
if (mg == NULL || !metaslab_group_initialized(mg))
continue;
mc = mg->mg_class;
if (mc != normal && mc != special && mc != dedup)
continue;
/*
@@ -7743,12 +7778,18 @@ spa_sync(spa_t *spa, uint64_t txg)
}
slots_per_allocator += zfs_vdev_def_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
for (int i = 0; i < spa->spa_alloc_count; i++) {
ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
mc->mc_alloc_max_slots[i] = slots_per_allocator;
ASSERT0(refcount_count(&normal->mc_alloc_slots[i]));
ASSERT0(refcount_count(&special->mc_alloc_slots[i]));
ASSERT0(refcount_count(&dedup->mc_alloc_slots[i]));
normal->mc_alloc_max_slots[i] = slots_per_allocator;
special->mc_alloc_max_slots[i] = slots_per_allocator;
dedup->mc_alloc_max_slots[i] = slots_per_allocator;
}
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
+109
View File
@@ -25,6 +25,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -408,6 +409,19 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
spa->spa_trust_config ? "trusted" : "untrusted", buf);
}
/*
* By default dedup and user data indirects land in the special class
*/
int zfs_ddt_data_is_special = B_TRUE;
int zfs_user_indirect_is_special = B_TRUE;
/*
* The percentage of special class final space reserved for metadata only.
* Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
* let metadata into the class.
*/
int zfs_special_class_metadata_reserve_pct = 25;
/*
* ==========================================================================
* SPA config locking
@@ -1159,6 +1173,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1554,6 +1570,16 @@ zfs_strtonum(const char *str, char **nptr)
return (val);
}
void
spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
{
/*
* We bump the feature refcount for each special vdev added to the pool
*/
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
}
/*
* ==========================================================================
* Accessor functions
@@ -1811,6 +1837,79 @@ spa_log_class(spa_t *spa)
return (spa->spa_log_class);
}
metaslab_class_t *
spa_special_class(spa_t *spa)
{
return (spa->spa_special_class);
}
metaslab_class_t *
spa_dedup_class(spa_t *spa)
{
return (spa->spa_dedup_class);
}
/*
* Locate an appropriate allocation class
*/
metaslab_class_t *
spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
uint_t level, uint_t special_smallblk)
{
if (DMU_OT_IS_ZIL(objtype)) {
if (spa->spa_log_class->mc_groups != 0)
return (spa_log_class(spa));
else
return (spa_normal_class(spa));
}
boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
if (DMU_OT_IS_DDT(objtype)) {
if (spa->spa_dedup_class->mc_groups != 0)
return (spa_dedup_class(spa));
else if (has_special_class && zfs_ddt_data_is_special)
return (spa_special_class(spa));
else
return (spa_normal_class(spa));
}
/* Indirect blocks for user data can land in special if allowed */
if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
if (has_special_class && zfs_user_indirect_is_special)
return (spa_special_class(spa));
else
return (spa_normal_class(spa));
}
if (DMU_OT_IS_METADATA(objtype) || level > 0) {
if (has_special_class)
return (spa_special_class(spa));
else
return (spa_normal_class(spa));
}
/*
* Allow small file blocks in special class in some cases (like
* for the dRAID vdev feature). But always leave a reserve of
* zfs_special_class_metadata_reserve_pct exclusively for metadata.
*/
if (DMU_OT_IS_FILE(objtype) &&
has_special_class && size < special_smallblk) {
metaslab_class_t *special = spa_special_class(spa);
uint64_t alloc = metaslab_class_get_alloc(special);
uint64_t space = metaslab_class_get_space(special);
uint64_t limit =
(space * (100 - zfs_special_class_metadata_reserve_pct))
/ 100;
if (alloc < limit)
return (special);
}
return (spa_normal_class(spa));
}
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
@@ -2500,6 +2599,8 @@ EXPORT_SYMBOL(spa_update_dspace);
EXPORT_SYMBOL(spa_deflate);
EXPORT_SYMBOL(spa_normal_class);
EXPORT_SYMBOL(spa_log_class);
EXPORT_SYMBOL(spa_special_class);
EXPORT_SYMBOL(spa_preferred_class);
EXPORT_SYMBOL(spa_max_replication);
EXPORT_SYMBOL(spa_prev_software_version);
EXPORT_SYMBOL(spa_get_failmode);
@@ -2579,5 +2680,13 @@ MODULE_PARM_DESC(spa_asize_inflation,
module_param(spa_slop_shift, int, 0644);
MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool");
module_param(zfs_ddt_data_is_special, int, 0644);
MODULE_PARM_DESC(zfs_ddt_data_is_special,
"Place DDT data into the special class");
module_param(zfs_user_indirect_is_special, int, 0644);
MODULE_PARM_DESC(zfs_user_indirect_is_special,
"Place user data indirect blocks into the special class");
/* END CSTYLED */
#endif
+183 -28
View File
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -204,6 +205,25 @@ vdev_getops(const char *type)
return (ops);
}
/*
* Derive the enumerated alloction bias from string input.
* String origin is either the per-vdev zap or zpool(1M).
*/
static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char *bias)
{
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
alloc_bias = VDEV_BIAS_LOG;
else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
alloc_bias = VDEV_BIAS_SPECIAL;
else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
alloc_bias = VDEV_BIAS_DEDUP;
return (alloc_bias);
}
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
@@ -528,6 +548,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vdev_indirect_config_t *vic;
char *tmp = NULL;
int rc;
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
boolean_t top_level = (parent && !parent->vdev_parent);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@@ -614,11 +636,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
ASSERT(nparity != -1ULL);
/*
* If creating a top-level vdev, check for allocation classes input
*/
if (top_level && alloctype == VDEV_ALLOC_ADD) {
char *bias;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
/* spa_vdev_add() expects feature to be enabled */
if (spa->spa_load_state != SPA_LOAD_CREATE &&
!spa_feature_is_enabled(spa,
SPA_FEATURE_ALLOCATION_CLASSES)) {
return (SET_ERROR(ENOTSUP));
}
}
}
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
@@ -687,7 +730,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent &&
if (top_level &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
@@ -703,14 +746,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
ASSERT0(vd->vdev_top_zap);
}
if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
spa_log_class(spa) : spa_normal_class(spa), vd,
spa->spa_alloc_count);
/* Note: metaslab_group_create() is now deferred */
}
if (vd->vdev_ops->vdev_op_leaf &&
@@ -952,6 +993,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
svd->vdev_alloc_bias = VDEV_BIAS_NONE;
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@@ -1114,6 +1158,55 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
static void
vdev_metaslab_group_create(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
/*
* metaslab_group_create was delayed until allocation bias was available
*/
if (vd->vdev_mg == NULL) {
metaslab_class_t *mc;
if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
vd->vdev_alloc_bias = VDEV_BIAS_LOG;
ASSERT3U(vd->vdev_islog, ==,
(vd->vdev_alloc_bias == VDEV_BIAS_LOG));
switch (vd->vdev_alloc_bias) {
case VDEV_BIAS_LOG:
mc = spa_log_class(spa);
break;
case VDEV_BIAS_SPECIAL:
mc = spa_special_class(spa);
break;
case VDEV_BIAS_DEDUP:
mc = spa_dedup_class(spa);
break;
default:
mc = spa_normal_class(spa);
}
vd->vdev_mg = metaslab_group_create(mc, vd,
spa->spa_alloc_count);
/*
* The spa ashift values currently only reflect the
* general vdev classes. Class destination is late
* binding so ashift checking had to wait until now
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
}
}
}
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
@@ -1124,6 +1217,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
boolean_t expanding = (oldc != 0);
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
@@ -1139,7 +1233,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (oldc != 0) {
if (expanding) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
@@ -1165,6 +1259,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}
#ifndef _KERNEL
/*
* To accomodate zdb_leak_init() fake indirect
* metaslabs, we allocate a metaslab group for
* indirect vdevs which normally don't have one.
*/
if (vd->vdev_mg == NULL) {
ASSERT0(vdev_is_concrete(vd));
vdev_metaslab_group_create(vd);
}
#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
@@ -1182,8 +1287,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
if (oldc == 0 && !vd->vdev_removing)
if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
}
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -1673,9 +1779,13 @@ vdev_open(vdev_t *vd)
/*
* Track the min and max ashift values for normal data devices.
*
* DJB - TBD these should perhaps be tracked per allocation class
* (e.g. spa_min_ashift is used to round up post compression buffers)
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
!vd->vdev_islog && vd->vdev_aux == NULL) {
vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
@@ -2561,6 +2671,30 @@ vdev_dtl_load(vdev_t *vd)
return (error);
}
static void
vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
const char *string;
ASSERT(alloc_bias != VDEV_BIAS_NONE);
string =
(alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
(alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
(alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
ASSERT(string != NULL);
VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
1, strlen(string) + 1, string, tx));
if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
spa_activate_allocation_classes(spa, tx);
}
}
void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
@@ -2597,8 +2731,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
}
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
vdev_zap_allocation_data(vd, tx);
}
}
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
}
@@ -2801,10 +2938,27 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
/*
* On spa_load path, grab the allocation bias from our zap
*/
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
spa_t *spa = vd->vdev_spa;
char bias_str[64];
if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
bias_str) == 0) {
ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
}
}
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -2999,6 +3153,7 @@ vdev_remove_empty(vdev_t *vd, uint64_t txg)
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
@@ -3673,7 +3828,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
}
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
}
@@ -3878,19 +4034,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
int64_t
vdev_deflated_space(vdev_t *vd, int64_t space)
{
ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
}
/*
* Update the in-core space usage stats for this vdev, its metaslab class,
* and the root vdev.
* Update the in-core space usage stats for this vdev and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
int64_t dspace_delta = space_delta;
int64_t dspace_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
metaslab_group_t *mg = vd->vdev_mg;
metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -3900,10 +4062,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio;
dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
@@ -3911,21 +4070,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
if (mc == spa_normal_class(spa)) {
/* every class but log contributes to root space stats */
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
if (mc != NULL) {
ASSERT(rvd == vd->vdev_parent);
ASSERT(vd->vdev_ms_count != 0);
metaslab_class_space_update(mc,
alloc_delta, defer_delta, space_delta, dspace_delta);
}
/* Note: metaslab_class_space_update moved to metaslab_space_update */
}
/*
@@ -4354,7 +4507,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
+24
View File
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
/*
@@ -463,6 +465,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
}
/* zpool command expects alloc class data */
if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
const char *bias = NULL;
switch (vd->vdev_alloc_bias) {
case VDEV_BIAS_LOG:
bias = VDEV_ALLOC_BIAS_LOG;
break;
case VDEV_BIAS_SPECIAL:
bias = VDEV_ALLOC_BIAS_SPECIAL;
break;
case VDEV_BIAS_DEDUP:
bias = VDEV_ALLOC_BIAS_DEDUP;
break;
default:
ASSERT3U(vd->vdev_alloc_bias, ==,
VDEV_BIAS_NONE);
}
fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
bias);
}
}
if (vd->vdev_dtl_sm != NULL) {
+32 -6
View File
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -944,8 +944,18 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
}
ASSERT3U(size, <=, maxalloc);
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
&dst, 0, NULL, txg, 0, zal, 0);
/*
* An allocation class might not have any remaining vdevs or space
*/
metaslab_class_t *mc = mg->mg_class;
if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
mc = spa_normal_class(spa);
int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
zal, 0);
if (error == ENOSPC && mc != spa_normal_class(spa)) {
error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
&dst, 0, NULL, txg, 0, zal, 0);
}
if (error != 0)
return (error);
@@ -1853,15 +1863,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
/* available space in the pool's normal class */
uint64_t available = dsl_dir_space_available(
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
metaslab_class_t *mc = vd->vdev_mg->mg_class;
/*
* When removing a vdev from an allocation class that has
* remaining vdevs, include available space from the class.
*/
if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
uint64_t class_avail = metaslab_class_get_space(mc) -
metaslab_class_get_alloc(mc);
/* add class space, adjusted for overhead */
available += (class_avail * 94) / 100;
}
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
NULL, 0, B_TRUE) <
vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
+5 -1
View File
@@ -133,11 +133,15 @@ zfs_dbgmsg_fini(void)
{
if (zfs_dbgmsg_kstat)
kstat_delete(zfs_dbgmsg_kstat);
/*
* TODO - decide how to make this permanent
*/
#ifdef _KERNEL
mutex_enter(&zfs_dbgmsgs_lock);
zfs_dbgmsg_purge(0);
mutex_exit(&zfs_dbgmsgs_lock);
mutex_destroy(&zfs_dbgmsgs_lock);
#endif
}
void
+9
View File
@@ -4191,6 +4191,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
/*
* This property could require the allocation classes
* feature to be active for setting, however we allow
* it so that tests of settable properties succeed.
* The CLI will issue a warning in this case.
*/
break;
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
+79 -19
View File
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
*/
#include <sys/sysmacros.h>
@@ -825,6 +826,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
if (zio->io_metaslab_class == NULL)
zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -1315,9 +1318,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa));
ASSERT(mc->mc_alloc_throttle_enabled);
ASSERT(pio->io_metaslab_class != NULL);
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
@@ -1644,8 +1646,9 @@ zio_write_compress(zio_t *zio)
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
ASSERT(psize != 0);
VERIFY3U(psize, !=, 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -3266,7 +3269,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
* reserve then we throttle.
*/
ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
@@ -3282,9 +3285,14 @@ zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
metaslab_class_t *mc;
/* locate an appropriate allocation class */
mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
!mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (zio);
@@ -3306,17 +3314,15 @@ zio_dva_throttle(zio_t *zio)
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
return (nio);
}
void
static void
zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
@@ -3336,7 +3342,7 @@ static zio_t *
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
metaslab_class_t *mc = spa_normal_class(spa);
metaslab_class_t *mc;
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;
@@ -3360,10 +3366,50 @@ zio_dva_allocate(zio_t *zio)
if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
flags |= METASLAB_ASYNC_ALLOC;
/*
* if not already chosen, locate an appropriate allocation class
*/
mc = zio->io_metaslab_class;
if (mc == NULL) {
mc = spa_preferred_class(spa, zio->io_size,
zio->io_prop.zp_type, zio->io_prop.zp_level,
zio->io_prop.zp_zpl_smallblk);
zio->io_metaslab_class = mc;
}
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
/*
* Fallback to normal class when an alloc class is full
*/
if (error == ENOSPC && mc != spa_normal_class(spa)) {
/*
* If throttling, transfer reservation over to normal class.
* The io_allocator slot can remain the same even though we
* are switching classes.
*/
if (mc->mc_alloc_throttle_enabled &&
(zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
metaslab_class_throttle_unreserve(mc,
zio->io_prop.zp_copies, zio->io_allocator, zio);
zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
mc = spa_normal_class(spa);
VERIFY(metaslab_class_throttle_reserve(mc,
zio->io_prop.zp_copies, zio->io_allocator, zio,
flags | METASLAB_MUST_RESERVE));
} else {
mc = spa_normal_class(spa);
}
zio->io_metaslab_class = mc;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
}
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
@@ -3431,6 +3477,15 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
/*
* Block pointer fields are useful to metaslabs for stats and debugging.
* Fill in the obvious ones before calling into metaslab_alloc().
*/
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_PSIZE(new_bp, size);
BP_SET_LEVEL(new_bp, 0);
/*
* When allocating a zil block, we don't have information about
* the final destination of the block except the objset it's part
@@ -4144,13 +4199,15 @@ zio_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_metaslab_class != NULL);
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
zio->io_prop.zp_copies, zio->io_allocator, zio);
zio->io_metaslab_class, zio->io_prop.zp_copies,
zio->io_allocator, zio);
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -4233,14 +4290,15 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
ASSERT(zio->io_metaslab_class != NULL);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
1, pio->io_allocator, pio);
metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
@@ -4259,7 +4317,6 @@ zio_done(zio_t *zio)
*/
const uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
ASSERTV(metaslab_class_t *mc = spa_normal_class(zio->io_spa));
zio_link_t *zl = NULL;
/*
@@ -4278,7 +4335,8 @@ zio_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
ASSERT(mc->mc_alloc_throttle_enabled);
ASSERT(zio->io_metaslab_class != NULL);
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
@@ -4290,9 +4348,11 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
VERIFY(refcount_not_held(
&zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
zio));
}