mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-28 19:04:23 +03:00
492f64e941
Overview ======== We parallelize the allocation process by creating the concept of "allocators". There are a certain number of allocators per metaslab group, defined by the value of a tunable at pool open time. Each allocator for a given metaslab group has up to 2 active metaslabs; one "primary", and one "secondary". The primary and secondary weight mean the same thing they did in in the pre-allocator world; primary metaslabs are used for most allocations, secondary metaslabs are used for ditto blocks being allocated in the same metaslab group. There is also the CLAIM weight, which has been separated out from the other weights, but that is less important to understanding the patch. The active metaslabs for each allocator are moved from their normal place in the metaslab tree for the group to the back of the tree. This way, they will not be selected for use by other allocators searching for new metaslabs unless all the passive metaslabs are unsuitable for allocations. If that does happen, the allocators will "steal" from each other to ensure that IOs don't fail until there is truly no space left to perform allocations. In addition, the alloc queue for each metaslab group has been broken into a separate queue for each allocator. We don't want to dramatically increase the number of inflight IOs on low-end systems, because it can significantly increase txg times. On the other hand, we want to ensure that there are enough IOs for each allocator to allow for good coalescing before sending the IOs to the disk. As a result, we take a compromise path; each allocator's alloc queue max depth starts at a certain value for every txg. Every time an IO completes, we increase the max depth. This should hopefully provide a good balance between the two failure modes, while not dramatically increasing complexity. We also parallelize the spa_alloc_tree and spa_alloc_lock, which cause very similar contention when selecting IOs to allocate. This parallelization uses the same allocator scheme as metaslab selection. Performance Results =================== Performance improvements from this change can vary significantly based on the number of CPUs in the system, whether or not the system has a NUMA architecture, the speed of the drives, the values for the various tunables, and the workload being performed. For an fio async sequential write workload on a 24 core NUMA system with 256 GB of RAM and 8 128 GB SSDs, there is a roughly 25% performance improvement. Future Work =========== Analysis of the performance of the system with this patch applied shows that a significant new bottleneck is the vdev disk queues, which also need to be parallelized. Prototyping of this change has occurred, and there was a performance improvement, but more work needs to be done before its stability has been verified and it is ready to be upstreamed. Authored by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Reviewed by: Alexander Motin <mav@FreeBSD.org> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Gordon Ross <gwr@nexenta.com> Ported-by: Paul Dagnelie <pcd@delphix.com> Signed-off-by: Paul Dagnelie <pcd@delphix.com> Porting Notes: * Fix reservation test failures by increasing tolerance. OpenZFS-issue: https://illumos.org/issues/9112 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3f3cc3c3 Closes #7682
124 lines
4.5 KiB
C
124 lines
4.5 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_METASLAB_H
|
|
#define _SYS_METASLAB_H
|
|
|
|
#include <sys/spa.h>
|
|
#include <sys/space_map.h>
|
|
#include <sys/txg.h>
|
|
#include <sys/zio.h>
|
|
#include <sys/avl.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
|
|
typedef struct metaslab_ops {
|
|
uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
|
|
} metaslab_ops_t;
|
|
|
|
|
|
extern metaslab_ops_t *zfs_metaslab_ops;
|
|
|
|
int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
|
|
metaslab_t **);
|
|
void metaslab_fini(metaslab_t *);
|
|
|
|
void metaslab_load_wait(metaslab_t *);
|
|
int metaslab_load(metaslab_t *);
|
|
void metaslab_unload(metaslab_t *);
|
|
|
|
void metaslab_sync(metaslab_t *, uint64_t);
|
|
void metaslab_sync_done(metaslab_t *, uint64_t);
|
|
void metaslab_sync_reassess(metaslab_group_t *);
|
|
uint64_t metaslab_block_maxsize(metaslab_t *);
|
|
|
|
#define METASLAB_HINTBP_FAVOR 0x0
|
|
#define METASLAB_HINTBP_AVOID 0x1
|
|
#define METASLAB_GANG_HEADER 0x2
|
|
#define METASLAB_GANG_CHILD 0x4
|
|
#define METASLAB_ASYNC_ALLOC 0x8
|
|
#define METASLAB_DONT_THROTTLE 0x10
|
|
#define METASLAB_FASTWRITE 0x20
|
|
|
|
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
|
|
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
|
|
int);
|
|
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
|
|
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
|
|
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
|
|
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
|
|
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
|
|
void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
|
|
void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
|
|
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
|
|
int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
|
|
void metaslab_check_free(spa_t *, const blkptr_t *);
|
|
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
|
|
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
|
|
|
|
void metaslab_alloc_trace_init(void);
|
|
void metaslab_alloc_trace_fini(void);
|
|
void metaslab_trace_init(zio_alloc_list_t *);
|
|
void metaslab_trace_fini(zio_alloc_list_t *);
|
|
|
|
metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
|
|
void metaslab_class_destroy(metaslab_class_t *);
|
|
int metaslab_class_validate(metaslab_class_t *);
|
|
void metaslab_class_histogram_verify(metaslab_class_t *);
|
|
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
|
|
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
|
|
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
|
|
zio_t *, int);
|
|
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
|
|
|
|
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
|
|
int64_t, int64_t);
|
|
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
|
|
uint64_t metaslab_class_get_space(metaslab_class_t *);
|
|
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
|
|
uint64_t metaslab_class_get_deferred(metaslab_class_t *);
|
|
|
|
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
|
|
void metaslab_group_destroy(metaslab_group_t *);
|
|
void metaslab_group_activate(metaslab_group_t *);
|
|
void metaslab_group_passivate(metaslab_group_t *);
|
|
boolean_t metaslab_group_initialized(metaslab_group_t *);
|
|
uint64_t metaslab_group_get_space(metaslab_group_t *);
|
|
void metaslab_group_histogram_verify(metaslab_group_t *);
|
|
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
|
|
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
|
|
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
|
|
boolean_t);
|
|
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_METASLAB_H */
|