mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-13 03:30:34 +03:00
93cf20764a
4101 metaslab_debug should allow for fine-grained control 4102 space_maps should store more information about themselves 4103 space map object blocksize should be increased 4105 removing a mirrored log device results in a leaked object 4106 asynchronously load metaslab Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Sebastien Roy <seb@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org> Prior to this patch, space_maps were preferred solely based on the amount of free space left in each. Unfortunately, this heuristic didn't contain any information about the make-up of that free space, which meant we could keep preferring and loading a highly fragmented space map that wouldn't actually have enough contiguous space to satisfy the allocation; then unloading that space_map and repeating the process. This change modifies space_maps to store additional information about the contiguous space in the space_map, so that we can use this information to make a better decision about which space_map to load. This requires reallocating all space_map objects to increase their bonus buffer sizes enough to fit the new metadata. The above feature can be enabled via a new feature flag introduced by this change: com.delphix:spacemap_histogram In addition to the above, this patch allows the space_map block size to be increased. Currently the block size is set to be 4K in size, which has certain implications including the following: * 4K sector devices will not see any compression benefit * large space_maps require more metadata on-disk * large space_maps require more time to load (typically random reads) Now the space_map block size can adjust as needed up to the maximum size set via the space_map_max_blksz variable. A bug was fixed which resulted in potentially leaking an object when removing a mirrored log device. 
The previous logic for vdev_remove() did not deal with removing top-level vdevs that are interior vdevs (i.e. mirror) correctly. The problem would occur when removing a mirrored log device, and result in the DTL space map object being leaked; because top-level vdevs don't have DTL space map objects associated with them. References: https://www.illumos.org/issues/4101 https://www.illumos.org/issues/4102 https://www.illumos.org/issues/4103 https://www.illumos.org/issues/4105 https://www.illumos.org/issues/4106 https://github.com/illumos/illumos-gate/commit/0713e23 Porting notes: A handful of kmem_alloc() calls were converted to kmem_zalloc(). Also, the KM_PUSHPAGE and TQ_PUSHPAGE flags were used as necessary. Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Prakash Surya <surya1@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2488
178 lines
6.0 KiB
C
178 lines
6.0 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
|
* Use is subject to license terms.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
|
*/
|
|
|
|
#ifndef _SYS_SPACE_MAP_H
|
|
#define _SYS_SPACE_MAP_H
|
|
|
|
#include <sys/avl.h>
|
|
#include <sys/range_tree.h>
|
|
#include <sys/dmu.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/*
 * The size of the space map object has increased to include a histogram.
 * SPACE_MAP_SIZE_V0 designates the original size (three 64-bit words) and
 * is used to maintain backward compatibility.
 */
#define	SPACE_MAP_SIZE_V0	(3 * sizeof (uint64_t))

/*
 * Number of buckets in the histogram reachable through (sm)->sm_phys.
 * The expression is built purely from sizeof operands on a fixed-size
 * array, so `sm` itself is not dereferenced when the macro is evaluated.
 */
#define	SPACE_MAP_HISTOGRAM_SIZE(sm)				\
	(sizeof ((sm)->sm_phys->smp_histogram) /		\
	sizeof ((sm)->sm_phys->smp_histogram[0]))
|
|
|
|
/*
 * The space_map_phys is the on-disk representation of the space map.
 * Consumers of space maps should never reference any of the members of this
 * structure directly. These members may only be updated in syncing context.
 *
 * Note the smp_object is no longer used but remains in the structure
 * for backward compatibility.
 *
 * Every member is a uint64_t, so the member order below fixes the layout;
 * do not reorder or resize members.
 * NOTE(review): the first three members presumably correspond to the
 * SPACE_MAP_SIZE_V0 header -- confirm before relying on it.
 */
typedef struct space_map_phys {
	uint64_t	smp_object;	/* on-disk space map object */
	uint64_t	smp_objsize;	/* size of the object */
	uint64_t	smp_alloc;	/* space allocated from the map */
	uint64_t	smp_pad[5];	/* reserved */

	/*
	 * The smp_histogram maintains a histogram of free regions. Each
	 * bucket, smp_histogram[i], contains the number of free regions
	 * whose size is:
	 * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
	 */
	uint64_t	smp_histogram[32]; /* histogram of free space */
} space_map_phys_t;
|
|
|
|
/*
|
|
* The space map object defines a region of space, its size, how much is
|
|
* allocated, and the on-disk object that stores this information.
|
|
* Consumers of space maps may only access the members of this structure.
|
|
*/
|
|
typedef struct space_map {
|
|
uint64_t sm_start; /* start of map */
|
|
uint64_t sm_size; /* size of map */
|
|
uint8_t sm_shift; /* unit shift */
|
|
uint64_t sm_length; /* synced length */
|
|
uint64_t sm_alloc; /* synced space allocated */
|
|
objset_t *sm_os; /* objset for this map */
|
|
uint64_t sm_object; /* object id for this map */
|
|
uint32_t sm_blksz; /* block size for space map */
|
|
dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
|
|
space_map_phys_t *sm_phys; /* on-disk space map */
|
|
kmutex_t *sm_lock; /* pointer to lock that protects map */
|
|
} space_map_t;
|
|
|
|
/*
 * Space map entries are 64-bit words in one of two formats, selected by
 * the top bit (1 = debug entry, 0 = non-debug entry).  Bit numbers below
 * are listed under each field as "high low" and match the (low, len)
 * arguments given to BF64_DECODE/BF64_ENCODE in the macros that follow.
 *
 * debug entry
 *
 *    1      3         10                     50
 *  ,---+--------+------------+---------------------------------.
 *  | 1 | action |  syncpass  |        txg (lower bits)         |
 *  `---+--------+------------+---------------------------------'
 *   63  62    60 59        50 49                               0
 *
 *
 * non-debug entry
 *
 *    1               47                   1           15
 *  ,---+------------------------------+------+------------------.
 *  | 0 |   offset (sm_shift units)    | type |       run        |
 *  `---+------------------------------+------+------------------'
 *   63  62                          16   15   14               0
 */

/*
 * All this stuff takes and returns bytes
 *
 * The run field is stored biased by one: SM_RUN_ENCODE() subtracts 1 before
 * packing and SM_RUN_DECODE() adds it back, so a run of 0 is not
 * representable and the largest run is SM_RUN_MAX.
 * NOTE(review): the diagram labels the offset field in sm_shift units;
 * confirm which unit callers actually pass to these macros.
 */
#define	SM_RUN_DECODE(x)	(BF64_DECODE(x, 0, 15) + 1)
#define	SM_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, 0, 15)
#define	SM_TYPE_DECODE(x)	BF64_DECODE(x, 15, 1)
#define	SM_TYPE_ENCODE(x)	BF64_ENCODE(x, 15, 1)
#define	SM_OFFSET_DECODE(x)	BF64_DECODE(x, 16, 47)
#define	SM_OFFSET_ENCODE(x)	BF64_ENCODE(x, 16, 47)
#define	SM_DEBUG_DECODE(x)	BF64_DECODE(x, 63, 1)
#define	SM_DEBUG_ENCODE(x)	BF64_ENCODE(x, 63, 1)

#define	SM_DEBUG_ACTION_DECODE(x)	BF64_DECODE(x, 60, 3)
#define	SM_DEBUG_ACTION_ENCODE(x)	BF64_ENCODE(x, 60, 3)

#define	SM_DEBUG_SYNCPASS_DECODE(x)	BF64_DECODE(x, 50, 10)
#define	SM_DEBUG_SYNCPASS_ENCODE(x)	BF64_ENCODE(x, 50, 10)

#define	SM_DEBUG_TXG_DECODE(x)		BF64_DECODE(x, 0, 50)
#define	SM_DEBUG_TXG_ENCODE(x)		BF64_ENCODE(x, 0, 50)

/* Largest run a single entry can express: all run bits set, plus the bias. */
#define	SM_RUN_MAX			SM_RUN_DECODE(~0ULL)
|
|
|
|
/*
 * Kind of space map being handled.  The enumerators carry no explicit
 * initialisers, so SM_ALLOC is 0 and SM_FREE is 1.
 * NOTE(review): these values presumably feed the 1-bit "type" field of a
 * non-debug entry -- confirm against the code that encodes entries.
 */
typedef enum {
	SM_ALLOC,
	SM_FREE
} maptype_t;
|
|
|
|
/*
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer i/o operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more i/o bandwidth
 * when only a few blocks have changed since the last transaction group.
 * Rather than having a fixed block size for all space maps the block size
 * can adjust as needed (see space_map_max_blksz). Set the initial block
 * size for the space map to 4k.
 */
#define	SPACE_MAP_INITIAL_BLOCKSIZE	(1ULL << 12)
|
|
|
|
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
|
|
|
|
void space_map_histogram_clear(space_map_t *sm);
|
|
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
|
|
dmu_tx_t *tx);
|
|
|
|
void space_map_update(space_map_t *sm);
|
|
|
|
uint64_t space_map_object(space_map_t *sm);
|
|
uint64_t space_map_allocated(space_map_t *sm);
|
|
uint64_t space_map_length(space_map_t *sm);
|
|
|
|
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
|
|
dmu_tx_t *tx);
|
|
void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
|
|
uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
|
|
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
|
|
|
|
int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
|
|
uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
|
|
void space_map_close(space_map_t *sm);
|
|
|
|
int64_t space_map_alloc_delta(space_map_t *sm);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_SPACE_MAP_H */
|