diff --git a/include/sys/mmp.h b/include/sys/mmp.h index c99c124e6..edb0d4347 100644 --- a/include/sys/mmp.h +++ b/include/sys/mmp.h @@ -44,6 +44,8 @@ typedef struct mmp_thread { zio_t *mmp_zio_root; /* root of mmp write zios */ uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */ int mmp_skip_error; /* reason for last skipped write */ + vdev_t *mmp_last_leaf; /* last mmp write sent here */ + uint64_t mmp_leaf_last_gen; /* spa_leaf_list_gen last seen */ } mmp_thread_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 404aaa9ee..c3aaad611 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -391,6 +391,8 @@ struct spa { taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ + list_t spa_leaf_list; /* list of leaf vdevs */ + uint64_t spa_leaf_list_gen; /* track leaf_list changes */ /* * spa_refcount & spa_config_lock must be the last elements diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 8f8a8ccf6..c115a5e10 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -364,6 +364,7 @@ struct vdev { hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ + list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 746ee0f77..16975dd98 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -205,80 +205,57 @@ typedef enum mmp_vdev_state_flag { MMP_FAIL_WRITE_PENDING = (1 << 1), } mmp_vdev_state_flag_t; -static vdev_t * -mmp_random_leaf_impl(vdev_t *vd, int *fail_mask) -{ - int child_idx; - - if (vd->vdev_ops->vdev_op_leaf) { - vdev_t *ret; - - if (!vdev_writeable(vd)) { - *fail_mask |= MMP_FAIL_NOT_WRITABLE; - ret = NULL; - } else if (vd->vdev_mmp_pending != 0) { 
- *fail_mask |= MMP_FAIL_WRITE_PENDING; - ret = NULL; - } else { - ret = vd; - } - - return (ret); - } - - if (vd->vdev_children == 0) - return (NULL); - - child_idx = spa_get_random(vd->vdev_children); - for (int offset = vd->vdev_children; offset > 0; offset--) { - vdev_t *leaf; - vdev_t *child = vd->vdev_child[(child_idx + offset) % - vd->vdev_children]; - - leaf = mmp_random_leaf_impl(child, fail_mask); - if (leaf) - return (leaf); - } - - return (NULL); -} - /* * Find a leaf vdev to write an MMP block to. It must not have an outstanding * mmp write (if so a new write will also likely block). If there is no usable - * leaf in the tree rooted at in_vd, a nonzero error value is returned, and - * *out_vd is unchanged. + * leaf, a nonzero bit-field error value is returned. Otherwise 0 is returned + * and the chosen leaf is stored in spa->spa_mmp.mmp_last_leaf. * - * The error value returned is a bit field. - * - * MMP_FAIL_WRITE_PENDING - * If set, one or more leaf vdevs are writeable, but have an MMP write which has - * not yet completed. - * - * MMP_FAIL_NOT_WRITABLE - * If set, one or more vdevs are not writeable. The children of those vdevs - * were not examined. - * - * Assuming in_vd points to a tree, a random subtree will be chosen to start. - * That subtree, and successive ones, will be walked until a usable leaf has - * been found, or all subtrees have been examined (except that the children of - * un-writeable vdevs are not examined). - * - * If the leaf vdevs in the tree are healthy, the distribution of returned leaf - * vdevs will be even. If there are unhealthy leaves, the following leaves - * (child_index % index_children) will be chosen more often. + * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an + * outstanding MMP write. + * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. 
 */ static int -mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd) +mmp_next_leaf(spa_t *spa) { - int error_mask = 0; - vdev_t *vd = mmp_random_leaf_impl(in_vd, &error_mask); + vdev_t *leaf; + vdev_t *starting_leaf; + int fail_mask = 0; - if (error_mask == 0) - *out_vd = vd; + ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); + ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); + ASSERT(!list_is_empty(&spa->spa_leaf_list)); - return (error_mask); + if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { + spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); + spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; + } + + leaf = spa->spa_mmp.mmp_last_leaf; + if (leaf == NULL) + leaf = list_head(&spa->spa_leaf_list); + starting_leaf = leaf; + + do { + leaf = list_next(&spa->spa_leaf_list, leaf); + if (leaf == NULL) + leaf = list_head(&spa->spa_leaf_list); + + if (!vdev_writeable(leaf)) { + fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_mmp_pending != 0) { + fail_mask |= MMP_FAIL_WRITE_PENDING; + } else { + spa->spa_mmp.mmp_last_leaf = leaf; + return (0); + } + } while (leaf != starting_leaf); + + ASSERT(fail_mask); + + return (fail_mask); } /* @@ -398,10 +375,10 @@ mmp_write_uberblock(spa_t *spa) zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n", (u_longlong_t)lock_acquire_time); - error = mmp_random_leaf(spa->spa_root_vdev, &vd); - mutex_enter(&mmp->mmp_io_lock); + error = mmp_next_leaf(spa); + /* * spa_mmp_history has two types of entries: * Issued MMP write: records time issued, error status, etc. 
@@ -425,6 +402,7 @@ mmp_write_uberblock(spa_t *spa) return; } + vd = spa->spa_mmp.mmp_last_leaf; mmp->mmp_skip_error = 0; if (mmp->mmp_zio_root == NULL) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 0976cc49c..71221b21b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,6 +730,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } + list_create(&spa->spa_leaf_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_leaf_node)); + return (spa); } @@ -772,6 +775,7 @@ spa_remove(spa_t *spa) sizeof (avl_tree_t)); list_destroy(&spa->spa_config_list); + list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1332c720f..890bb1135 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -398,6 +398,11 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) { + list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); + cvd->vdev_spa->spa_leaf_list_gen++; + } } void @@ -427,6 +432,12 @@ vdev_remove_child(vdev_t *pvd, vdev_t *cvd) pvd->vdev_children = 0; } + if (cvd->vdev_ops->vdev_op_leaf) { + spa_t *spa = cvd->vdev_spa; + list_remove(&spa->spa_leaf_list, cvd); + spa->spa_leaf_list_gen++; + } + /* * Walk up all ancestors to update guid sum. 
 */ @@ -531,6 +542,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); + list_link_init(&vd->vdev_leaf_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); @@ -914,6 +926,7 @@ vdev_free(vdev_t *vd) vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); + ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5caa63bdc..917bf24f9 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -640,7 +640,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb'] + 'mmp_on_zdb', 'mmp_write_distribution'] tags = ['functional', 'mmp'] [tests/functional/mount] diff --git a/tests/zfs-tests/tests/functional/mmp/Makefile.am b/tests/zfs-tests/tests/functional/mmp/Makefile.am index f2d0ad0ea..e39a0a5aa 100644 --- a/tests/zfs-tests/tests/functional/mmp/Makefile.am +++ b/tests/zfs-tests/tests/functional/mmp/Makefile.am @@ -11,6 +11,7 @@ dist_pkgdata_SCRIPTS = \ mmp_write_uberblocks.ksh \ mmp_reset_interval.ksh \ mmp_on_zdb.ksh \ + mmp_write_distribution.ksh \ setup.ksh \ cleanup.ksh diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh new file mode 100755 index 000000000..7504caa4d --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of 
 the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# + +# DESCRIPTION: +# Verify MMP writes are distributed evenly among leaves +# +# STRATEGY: +# 1. Create an asymmetric mirrored pool +# 2. Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify the MMP writes are distributed evenly across leaf vdevs +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must zpool destroy $MMP_POOL + log_must rm $MMP_DIR/file.{0,1,2,3,4,5,6,7} + log_must rm $MMP_HISTORY_TMP + log_must rmdir $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "mmp writes are evenly distributed across leaf vdevs" +log_onexit cleanup + +MMP_HISTORY_TMP=$MMP_DIR/history +MMP_HISTORY=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Step 1 +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5,6,7} +log_must zpool create -f $MMP_POOL mirror $MMP_DIR/file.{0,1} mirror $MMP_DIR/file.{2,3,4,5,6,7} + +# Step 2 +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +set_tunable64 zfs_multihost_history 0 +set_tunable64 zfs_multihost_history 40 + +# Step 3 +# with default settings, each leaf should be written about once per second +sleep 4 + +# Step 4 +typeset -i min_writes=999 +typeset -i max_writes=0 +typeset -i write_count +# copy the kstat once so all per-leaf counts come from the same snapshot +cat $MMP_HISTORY > $MMP_HISTORY_TMP +for x in $(seq 0 7); do + write_count=$(grep -c file.${x} $MMP_HISTORY_TMP) + if [ $write_count -lt $min_writes ]; then + 
 min_writes=$write_count + fi + if [ $write_count -gt $max_writes ]; then + max_writes=$write_count + fi +done +log_note "mmp min_writes $min_writes max_writes $max_writes" + +if [ $min_writes -lt 1 ]; then + log_fail "mmp writes were not counted correctly" +fi + +if [ $((max_writes - min_writes)) -gt 1 ]; then + log_fail "mmp writes were not evenly distributed across leaf vdevs" +fi + +log_pass "mmp writes were evenly distributed across leaf vdevs"