mirror_zfs/include/sys/metaslab_impl.h

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#ifndef _SYS_METASLAB_IMPL_H
#define	_SYS_METASLAB_IMPL_H

#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>

#ifdef	__cplusplus
extern "C" {
#endif

/*
 * Class-wide allocator state referenced by metaslab groups (each group
 * points back at its class through mg_class).
 *
 * The class points at a spa_t (mc_spa), at one metaslab group used as the
 * rotor (mc_rotor), and at a space map ops vector (mc_ops); the remaining
 * members are 64-bit counters plus one mutex.
 *
 * NOTE(review): mc_rotor presumably marks the group at which the next
 * allocation starts within the mg_prev/mg_next list -- this header does
 * not establish that; confirm against the allocator code.
 */
struct metaslab_class {
	spa_t			*mc_spa;
	metaslab_group_t	*mc_rotor;
	space_map_ops_t		*mc_ops;
	/* NOTE(review): units of mc_aliquot are not visible here -- confirm */
	uint64_t		mc_aliquot;
	uint64_t		mc_alloc_groups; /* # of allocatable groups */
	uint64_t		mc_alloc;	/* total allocated space */
	uint64_t		mc_deferred;	/* total deferred frees */
	uint64_t		mc_space;	/* total space (alloc + free) */
	uint64_t		mc_dspace;	/* total deflated space */
	/*
	 * NOTE(review): lock associated with FASTWRITE allocations; what it
	 * protects is not visible in this header -- TODO confirm.
	 */
	kmutex_t		mc_fastwrite_lock;
};

/*
 * Per-group allocator state.
 *
 * A group keeps an AVL tree of metaslabs (mg_metaslab_tree; a metaslab
 * links in through its ms_group_node), back-pointers to its metaslab
 * class (mg_class) and vdev (mg_vd), and mg_prev/mg_next links to other
 * groups.
 *
 * NOTE(review): mg_lock presumably protects the tree and the counters
 * below, and mg_prev/mg_next presumably form the list that mc_rotor
 * walks -- neither is established by this header; confirm against the
 * allocator code.
 */
struct metaslab_group {
	kmutex_t		mg_lock;
	avl_tree_t		mg_metaslab_tree;
	/* NOTE(review): units of the next two members not visible -- confirm */
	uint64_t		mg_aliquot;
	uint64_t		mg_bonus_area;
	/* presumably a count of failed allocations in this group -- confirm */
	uint64_t		mg_alloc_failures;
	boolean_t		mg_allocatable;		/* can we allocate? */
	uint64_t		mg_free_capacity;	/* percentage free */
	int64_t			mg_bias;
	int64_t			mg_activation_count;
	metaslab_class_t	*mg_class;
	vdev_t			*mg_vd;
	metaslab_group_t	*mg_prev;
	metaslab_group_t	*mg_next;
};

/*
 * Each metaslab maintains an in-core free map (ms_map) that contains the
 * current list of free segments. As blocks are allocated, the allocated
 * segment is removed from the ms_map and added to a per txg allocation map.
 * As blocks are freed, they are added to the per txg free map. These per
 * txg maps allow us to process all allocations and frees in syncing context
 * where it is safe to update the on-disk space maps.
 *
 * Each metaslab's free space is tracked in a space map object in the MOS,
 * which is only updated in syncing context. Each time we sync a txg,
 * we append the allocs and frees from that txg to the space map object.
 * When the txg is done syncing, metaslab_sync_done() updates ms_smo
 * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
 *
 * To load the in-core free map we read the space map object from disk.
 * This object contains a series of alloc and free records that are
 * combined to make up the list of all free segments in this metaslab. These
 * segments are represented in-core by the ms_map and are stored in an
 * AVL tree.
 *
 * As the space map object grows (as a result of the appends) it will
 * eventually become space-inefficient. When the space map object is
 * zfs_condense_pct/100 times the size of the minimal on-disk representation,
 * we rewrite it in its minimized form.
 */
struct metaslab {
	kmutex_t	ms_lock;	/* metaslab lock		*/
	/*
	 * Space map objects: metaslab_sync_done() updates ms_smo to
	 * ms_smo_syncing once a txg has finished syncing.
	 */
	space_map_obj_t	ms_smo;		/* synced space map object	*/
	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
	/*
	 * Per-txg maps: TXG_SIZE allocation maps and TXG_SIZE free maps,
	 * plus TXG_DEFER_SIZE maps of deferred frees.
	 * NOTE(review): how a txg is mapped to an array slot is not shown
	 * in this header -- confirm against the users of these arrays.
	 */
	space_map_t	*ms_allocmap[TXG_SIZE];	/* allocated this txg	*/
	space_map_t	*ms_freemap[TXG_SIZE];	/* freed this txg	*/
	space_map_t	*ms_defermap[TXG_DEFER_SIZE];	/* deferred frees */
	space_map_t	*ms_map;	/* in-core free space map	*/
	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
	uint64_t	ms_weight;	/* weight vs. others in group	*/
	/* Linkage: owning group, its AVL tree, and the per-txg dirty list. */
	metaslab_group_t *ms_group;	/* metaslab group		*/
	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
};

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_METASLAB_IMPL_H */
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`/*`
			`* CDDL HEADER START`
			`*`
			`* The contents of this file are subject to the terms of the`
			`* Common Development and Distribution License (the "License").`
			`* You may not use this file except in compliance with the License.`
			`*`
			`* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE`
			`* or http://www.opensolaris.org/os/licensing.`
			`* See the License for the specific language governing permissions`
			`* and limitations under the License.`
			`*`
			`* When distributing Covered Code, include this CDDL HEADER in each`
			`* file and include the License file at usr/src/OPENSOLARIS.LICENSE.`
			`* If applicable, add the following below this CDDL HEADER, with the`
			`* fields enclosed by brackets "[]" replaced with your own identifying`
			`* information: Portions Copyright [yyyy] [name of copyright owner]`
			`*`
			`* CDDL HEADER END`
			`*/`
			`/*`
Rebase master to b117 2009-07-03 02:44:48 +04:00			`* Copyright 2009 Sun Microsystems, Inc. All rights reserved.`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`* Use is subject to license terms.`
Illumos #3329, #3330, #3331, #3335 3329 spa_sync() spends 10-20% of its time in spa_free_sync_cb() 3330 space_seg_t should have its own kmem_cache 3331 deferred frees should happen after sync_pass 1 3335 make SYNC_PASS_* constants tunable Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Reviewed by: Eric Schrock <eric.schrock@delphix.com> Reviewed by: Richard Lowe <richlowe@richlowe.net> Reviewed by: Dan McDonald <danmcd@nexenta.com> Approved by: Eric Schrock <eric.schrock@delphix.com> References: illumos/illumos-gate@01f55e48fb4d524eaf70687728aa51b7762e2e97 https://www.illumos.org/issues/3329 https://www.illumos.org/issues/3330 https://www.illumos.org/issues/3331 https://www.illumos.org/issues/3335 Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> 2013-05-06 21:14:52 +04:00			`*/`

			`/*`
Illumos #3954, #4080, #4081 3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit 4080 zpool clear fails to clear pool 4081 need zfs_mg_noalloc_threshold Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: https://www.illumos.org/issues/3954 https://www.illumos.org/issues/4080 https://www.illumos.org/issues/4081 illumos/illumos-gate@22e30981d82a0b6dc89253596ededafae8655e00 Ported-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775 2013-08-29 22:56:49 +04:00			`* Copyright (c) 2013 by Delphix. All rights reserved.`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`*/`

			`#ifndef _SYS_METASLAB_IMPL_H`
			`#define _SYS_METASLAB_IMPL_H`

			`#include <sys/metaslab.h>`
			`#include <sys/space_map.h>`
			`#include <sys/vdev.h>`
			`#include <sys/txg.h>`
			`#include <sys/avl.h>`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`struct metaslab_class {`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`spa_t *mc_spa;`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`metaslab_group_t *mc_rotor;`
Rebase master to b117 2009-07-03 02:44:48 +04:00			`space_map_ops_t *mc_ops;`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`uint64_t mc_aliquot;`
Illumos #3954, #4080, #4081 3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit 4080 zpool clear fails to clear pool 4081 need zfs_mg_noalloc_threshold Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: https://www.illumos.org/issues/3954 https://www.illumos.org/issues/4080 https://www.illumos.org/issues/4081 illumos/illumos-gate@22e30981d82a0b6dc89253596ededafae8655e00 Ported-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775 2013-08-29 22:56:49 +04:00			`uint64_t mc_alloc_groups; /* # of allocatable groups */`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`uint64_t mc_alloc; /* total allocated space */`
			`uint64_t mc_deferred; /* total deferred frees */`
			`uint64_t mc_space; /* total space (alloc + free) */`
			`uint64_t mc_dspace; /* total deflated space */`
Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks accross vdevs is important for performance: indeed, using mutliple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. 
When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implictly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that linguering fastwrites will get pruned at each sync event. 
The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1013 2012-06-27 17:20:20 +04:00			`kmutex_t mc_fastwrite_lock;`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`};`

			`struct metaslab_group {`
			`kmutex_t mg_lock;`
			`avl_tree_t mg_metaslab_tree;`
			`uint64_t mg_aliquot;`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`uint64_t mg_bonus_area;`
Illumos #1051: zfs should handle imbalanced luns Today zfs tries to allocate blocks evenly across all devices. This means when devices are imbalanced zfs will use lots of CPU searching for space on devices which tend to be pretty full. It should instead fail quickly on the full LUNs and move onto devices which have more availability. Reviewed by: Eric Schrock <Eric.Schrock@delphix.com> Reviewed by: Matt Ahrens <Matt.Ahrens@delphix.com> Reviewed by: Adam Leventhal <Adam.Leventhal@delphix.com> Reviewed by: Albert Lee <trisk@nexenta.com> Reviewed by: Gordon Ross <gwr@nexenta.com> Approved by: Garrett D'Amore <garrett@nexenta.com> References to Illumos issue and patch: - https://www.illumos.org/issues/510 - https://github.com/illumos/illumos-gate/commit/5ead3ed965 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #340 2011-07-26 23:08:52 +04:00			`uint64_t mg_alloc_failures;`
Illumos #3954, #4080, #4081 3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit 4080 zpool clear fails to clear pool 4081 need zfs_mg_noalloc_threshold Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: https://www.illumos.org/issues/3954 https://www.illumos.org/issues/4080 https://www.illumos.org/issues/4081 illumos/illumos-gate@22e30981d82a0b6dc89253596ededafae8655e00 Ported-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775 2013-08-29 22:56:49 +04:00			`boolean_t mg_allocatable; /* can we allocate? */`
			`uint64_t mg_free_capacity; /* percentage free */`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`int64_t mg_bias;`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`int64_t mg_activation_count;`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`metaslab_class_t *mg_class;`
			`vdev_t *mg_vd;`
			`metaslab_group_t *mg_prev;`
			`metaslab_group_t *mg_next;`
			`};`

			`/*`
Illumos #3552, #3564 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread 3564 spa_sync() spends 5-10% of its time in metaslab_sync() (when not condensing) Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: illumos/illumos-gate@16a4a8074274d2d7cc408589cf6359f4a378c861 https://www.illumos.org/issues/3552 https://www.illumos.org/issues/3564 Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1513 2013-06-15 07:30:35 +04:00			`* Each metaslab maintains an in-core free map (ms_map) that contains the`
			`* current list of free segments. As blocks are allocated, the allocated`
			`* segment is removed from the ms_map and added to a per txg allocation map.`
			`* As blocks are freed, they are added to the per txg free map. These per`
			`* txg maps allow us to process all allocations and frees in syncing context`
			`* where it is safe to update the on-disk space maps.`
			`*`
			`* Each metaslab's free space is tracked in a space map object in the MOS,`
			`* which is only updated in syncing context. Each time we sync a txg,`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`* we append the allocs and frees from that txg to the space map object.`
			`* When the txg is done syncing, metaslab_sync_done() updates ms_smo`
Illumos #3552, #3564 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread 3564 spa_sync() spends 5-10% of its time in metaslab_sync() (when not condensing) Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: illumos/illumos-gate@16a4a8074274d2d7cc408589cf6359f4a378c861 https://www.illumos.org/issues/3552 https://www.illumos.org/issues/3564 Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1513 2013-06-15 07:30:35 +04:00			`* to ms_smo_syncing. Everything in ms_smo is always safe to allocate.`
			`*`
			`* To load the in-core free map we read the space map object from disk.`
			`* This object contains a series of alloc and free records that are`
			`* combined to make up the list of all free segments in this metaslab. These`
			`* segments are represented in-core by the ms_map and are stored in an`
			`* AVL tree.`
			`*`
			`* As the space map object grows (as a result of the appends) it will`
			`* eventually become space-inefficient. When the space map object is`
			`* zfs_condense_pct/100 times the size of the minimal on-disk representation,`
			`* we rewrite it in its minimized form.`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`*/`
			`struct metaslab {`
			`kmutex_t ms_lock; /* metaslab lock */`
			`space_map_obj_t ms_smo; /* synced space map object */`
			`space_map_obj_t ms_smo_syncing; /* syncing space map object */`
Illumos #3552, #3564 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread 3564 spa_sync() spends 5-10% of its time in metaslab_sync() (when not condensing) Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: illumos/illumos-gate@16a4a8074274d2d7cc408589cf6359f4a378c861 https://www.illumos.org/issues/3552 https://www.illumos.org/issues/3564 Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1513 2013-06-15 07:30:35 +04:00			`space_map_t *ms_allocmap[TXG_SIZE]; /* allocated this txg */`
			`space_map_t *ms_freemap[TXG_SIZE]; /* freed this txg */`
			`space_map_t *ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */`
			`space_map_t *ms_map; /* in-core free space map */`
Update core ZFS code from build 121 to build 141. 2010-05-29 00:45:14 +04:00			`int64_t ms_deferspace; /* sum of ms_defermap[] space */`
Initial Linux ZFS GIT Repo 2008-11-20 23:01:55 +03:00			`uint64_t ms_weight; /* weight vs. others in group */`
			`metaslab_group_t *ms_group; /* metaslab group */`
			`avl_node_t ms_group_node; /* node in metaslab group tree */`
			`txg_node_t ms_txg_node; /* per-txg dirty metaslab links */`
			`};`

			`#ifdef __cplusplus`
			`}`
			`#endif`

			`#endif /* _SYS_METASLAB_IMPL_H */`