Mirror of https://git.proxmox.com/git/mirror_zfs.git, synced 2025-05-23 15:04:59 +03:00
Implement allocation size ranges and use for gang leaves (#17111)
When forced to resort to ganging, ZFS currently allocates three child blocks,
each one third of the size of the original, regardless of whether larger
allocations could be made that would allow fewer gang leaves. Allocating
larger leaves improves performance when fragmentation is high enough to
require ganging, but not so high that every free range is only just big
enough to hold a third of the recordsize. It also improves the behavior of a
future change to allow larger gang headers.

This change adds the ability for the allocation codepath to allocate a range
of sizes instead of a single fixed size, and uses it to pre-allocate the DVAs
for the gang children. If those allocations fail, we fall back to the normal
write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
parent a7de203c86
commit 246e5883bb
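For a sense of why a size range helps, here is a rough standalone sketch (toy code, not ZFS source; the segment sizes are invented) of a gang-style write against a fragmented free list: asking for "at least a third of what is left, up to all of it" can finish in two leaves where fixed one-third children would need three.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical free segments on a fragmented metaslab, in bytes. */
    static uint64_t free_segs[] = { 96 * 1024, 40 * 1024, 64 * 1024 };
    #define	NSEGS	(sizeof (free_segs) / sizeof (free_segs[0]))

    /*
     * Toy range allocator: find a segment of at least min_size and take
     * up to max_size from it.  Returns the size taken, or 0 on failure.
     */
    static uint64_t
    alloc_range(uint64_t min_size, uint64_t max_size)
    {
    	for (unsigned i = 0; i < NSEGS; i++) {
    		if (free_segs[i] >= min_size) {
    			uint64_t got = free_segs[i] < max_size ?
    			    free_segs[i] : max_size;
    			free_segs[i] -= got;
    			return (got);
    		}
    	}
    	return (0);
    }

    int
    main(void)
    {
    	uint64_t resid = 128 * 1024;	/* a 128K write forced to gang */
    	int leaves = 0;

    	while (resid > 0) {
    		/* At least a third of what's left, at most all of it. */
    		uint64_t got = alloc_range(resid / 3, resid);
    		if (got == 0)
    			break;
    		resid -= got;
    		leaves++;
    	}
    	printf("gang leaves used: %d\n", leaves);	/* prints 2, not 3 */
    	return (0);
    }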
@@ -41,7 +41,7 @@ extern "C" {
 
 typedef struct metaslab_ops {
 	const char *msop_name;
-	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
+	uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *);
 } metaslab_ops_t;
 
 
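Restating the new msop_alloc contract from this hunk: the allocator receives a minimum and a maximum size and, on success, returns the chosen offset while storing the size actually carved out through the final out-parameter; -1ULL still signals failure, and passing max_size equal to size reproduces the old fixed-size behavior. A minimal standalone stub honoring that contract (hypothetical code, single free segment):

    #include <stdint.h>

    /* One free segment: [seg_start, seg_start + seg_len).  Hypothetical. */
    static uint64_t seg_start = 0x10000, seg_len = 0x6000;

    static uint64_t
    stub_alloc(uint64_t size, uint64_t max_size, uint64_t *found_size)
    {
    	if (seg_len < size)
    		return (-1ULL);		/* cannot satisfy the minimum */
    	*found_size = seg_len < max_size ? seg_len : max_size;
    	uint64_t off = seg_start;
    	seg_start += *found_size;
    	seg_len -= *found_size;
    	/* With max_size == size this degenerates to the old API. */
    	return (off);
    }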
@@ -82,6 +82,9 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int,
     uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *);
+int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t,
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *,
+    int, const void *, uint64_t *);
 int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
     dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
@@ -95,6 +98,7 @@ void metaslab_check_free(spa_t *, const blkptr_t *);
 
 void metaslab_stat_init(void);
 void metaslab_stat_fini(void);
+void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *);
 void metaslab_trace_init(zio_alloc_list_t *);
 void metaslab_trace_fini(zio_alloc_list_t *);
 
@@ -127,6 +131,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
 void metaslab_group_histogram_verify(metaslab_group_t *);
 uint64_t metaslab_group_fragmentation(metaslab_group_t *);
 void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int,
+    uint64_t, const void *);
 void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t,
     const void *);
 void metaslab_recalculate_weight_and_sort(metaslab_t *);
@@ -134,6 +134,8 @@ extern void vdev_space_update(vdev_t *vd,
 
 extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
 
+extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
+    uint64_t txg);
 extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
     uint64_t txg);
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
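vdev_asize_to_psize_txg() is the inverse of vdev_psize_to_asize_txg(), dispatched through the new vdev_op_asize_to_psize operation added further below. Since the inverse answers "the largest psize that safely fits in this asize", the round trip lands on or above the original psize (strictly above is possible for parity layouts, because psize_to_asize pads the allocation). A minimal sketch of that property, with identity translations standing in for a plain disk or mirror:

    #include <assert.h>
    #include <stdint.h>

    /* Identity translations, as for a plain disk or mirror vdev. */
    static uint64_t psize_to_asize(uint64_t p) { return (p); }
    static uint64_t asize_to_psize(uint64_t a) { return (a); }

    int
    main(void)
    {
    	uint64_t psize = 131072;
    	/*
    	 * asize_to_psize() answers "the largest psize that fits", so
    	 * the round trip can only land on or above the original psize;
    	 * raidz/dRAID may land strictly above because psize_to_asize()
    	 * pads the allocation to parity-group boundaries.
    	 */
    	assert(asize_to_psize(psize_to_asize(psize)) >= psize);
    	return (0);
    }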
@@ -95,7 +95,7 @@ extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **);
  */
 extern boolean_t vdev_draid_readable(vdev_t *, uint64_t);
 extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t);
-extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t);
+extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t);
 extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *);
 extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *);
 extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);
@@ -103,7 +103,8 @@ typedef const struct vdev_ops {
 	vdev_fini_func_t *vdev_op_fini;
 	vdev_open_func_t *vdev_op_open;
 	vdev_close_func_t *vdev_op_close;
-	vdev_asize_func_t *vdev_op_asize;
+	vdev_asize_func_t *vdev_op_psize_to_asize;
+	vdev_asize_func_t *vdev_op_asize_to_psize;
 	vdev_min_asize_func_t *vdev_op_min_asize;
 	vdev_min_alloc_func_t *vdev_op_min_alloc;
 	vdev_io_start_func_t *vdev_op_io_start;
@@ -615,6 +616,7 @@ extern vdev_ops_t vdev_indirect_ops;
  */
 extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
+extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg);
 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
 extern uint64_t vdev_default_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
@@ -227,6 +227,7 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 30)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 31)
 #define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 32)
+#define	ZIO_FLAG_PREALLOCATED	(1ULL << 33)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
@@ -5436,12 +5436,12 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
  * +-------+-------+-------+-------+-------+
  *
  * Above, notice that the 4k block required one sector for parity and another
- * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
- * and free properties will be adjusted by 8k. The dataset will not be charged
- * 8k. Rather, it will be charged a value that is scaled according to the
- * overhead of the 128k block on the same vdev. This 8k allocation will be
- * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
- * calculated in the 128k block example above.
+ * for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's
+ * allocated and free properties will be adjusted by 8k. The dataset will not
+ * be charged 8k. Rather, it will be charged a value that is scaled according
+ * to the overhead of the 128k block on the same vdev. This 8k allocation will
+ * be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is
+ * as calculated in the 128k block example above.
  *
  * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
  * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
@@ -5488,7 +5488,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
  * not necessarily equal to "blksize", due to RAIDZ deflation.
  */
 static uint64_t
-vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
     uint64_t blksize)
 {
 	uint64_t asize, ndata;
@@ -5508,7 +5508,7 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
  * size.
 */
 static uint64_t
-vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
     uint64_t blksize)
 {
 	ASSERT3U(ndisks, >, nparity);
@@ -5568,12 +5568,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 			continue;
 
 		/* allocation size for the "typical" 128k block */
-		tsize = vdev_raidz_asize(ndisks, nparity, ashift,
-		    SPA_OLD_MAXBLOCKSIZE);
+		tsize = vdev_raidz_psize_to_asize(ndisks, nparity,
+		    ashift, SPA_OLD_MAXBLOCKSIZE);
 
 		/* allocation size for the blksize block */
-		asize = vdev_raidz_asize(ndisks, nparity, ashift,
-		    blksize);
+		asize = vdev_raidz_psize_to_asize(ndisks, nparity,
+		    ashift, blksize);
 	} else {
 		uint64_t ndata;
 
@@ -5582,12 +5582,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 			continue;
 
 		/* allocation size for the "typical" 128k block */
-		tsize = vdev_draid_asize(ndata + nparity, nparity,
-		    ashift, SPA_OLD_MAXBLOCKSIZE);
+		tsize = vdev_draid_psize_to_asize(ndata + nparity,
+		    nparity, ashift, SPA_OLD_MAXBLOCKSIZE);
 
 		/* allocation size for the blksize block */
-		asize = vdev_draid_asize(ndata + nparity, nparity,
-		    ashift, blksize);
+		asize = vdev_draid_psize_to_asize(ndata + nparity,
+		    nparity, ashift, blksize);
 	}
 
 	/*
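The 160k figure the comment block above refers to can be recomputed from the renamed helper's arithmetic. A standalone recomputation for raidz1 across 5 disks with 512-byte sectors; the formula is reconstructed here for illustration from the documented behavior (data sectors, plus one parity sector per stripe of ndata, padded to a multiple of nparity+1 sectors):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
        uint64_t blksize)
    {
    	uint64_t ndata = ndisks - nparity;
    	uint64_t asize = ((blksize - 1) >> ashift) + 1;	/* data sectors */
    	asize += nparity * ((asize + ndata - 1) / ndata);	/* + parity */
    	/* Pad to a multiple of nparity+1 sectors, then back to bytes. */
    	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
    	return (asize << ashift);
    }

    int
    main(void)
    {
    	/* raidz1, 5 disks, 512-byte sectors, 128k block -> 163840 (160k). */
    	printf("%llu\n", (unsigned long long)
    	    raidz_psize_to_asize(5, 1, 9, 128 * 1024));
    	return (0);
    }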
@@ -1276,7 +1276,8 @@ vdev_ops_t vdev_disk_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_geom_open,
 	.vdev_op_close = vdev_geom_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_geom_io_start,
@@ -1554,7 +1554,8 @@ vdev_ops_t vdev_disk_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_disk_io_start,
@@ -1692,17 +1692,30 @@ metaslab_largest_unflushed_free(metaslab_t *msp)
 
 static zfs_range_seg_t *
 metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
-    uint64_t size, zfs_btree_index_t *where)
+    uint64_t size, uint64_t max_size, zfs_btree_index_t *where)
 {
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch;
 
 	zfs_rs_set_start(&rsearch, rt, start);
-	zfs_rs_set_end(&rsearch, rt, start + size);
+	zfs_rs_set_end(&rsearch, rt, start + max_size);
 
 	rs = zfs_btree_find(t, &rsearch, where);
 	if (rs == NULL) {
-		rs = zfs_btree_next(t, where, where);
+		if (size == max_size) {
+			rs = zfs_btree_next(t, where, where);
+		} else {
+			/*
+			 * If we're searching for a range, get the largest
+			 * segment in that range, or the smallest one bigger
+			 * than it.
+			 */
+			rs = zfs_btree_prev(t, where, where);
+			if (rs == NULL || zfs_rs_get_end(rs, rt) -
+			    zfs_rs_get_start(rs, rt) < size) {
+				rs = zfs_btree_next(t, where, where);
+			}
+		}
 	}
 
 	return (rs);
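The fallback added here distinguishes two cases when the exact lookup misses: a fixed-size search just steps forward to the next segment, while a range search first looks back at the predecessor (the largest candidate still inside the searched range) and only steps forward if that predecessor cannot hold the minimum. A loose standalone analogy of that selection rule over a sorted array (a toy stand-in for the btree, not ZFS code):

    #include <stdint.h>
    #include <stdio.h>

    /* Sorted candidate segment sizes (toy stand-in for the btree walk). */
    static const uint64_t segs[] = { 8192, 16384, 24576, 65536 };
    #define	NSEGS	(sizeof (segs) / sizeof (segs[0]))

    static uint64_t
    pick(uint64_t size, uint64_t max_size)
    {
    	unsigned i;

    	for (i = 0; i < NSEGS && segs[i] < max_size; i++)
    		;
    	/* i now points at the first segment >= max_size, or past the end. */
    	if (i < NSEGS)
    		return (segs[i]);	/* a full-sized match exists */
    	if (i > 0 && segs[i - 1] >= size)
    		return (segs[i - 1]);	/* largest segment still in range */
    	return (0);			/* nothing satisfies the minimum */
    }

    int
    main(void)
    {
    	/* min 16K, max 128K: settles for the largest in-range segment. */
    	printf("%llu\n", (unsigned long long)pick(16384, 131072)); /* 65536 */
    	return (0);
    }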
@@ -1715,14 +1728,14 @@ metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
  */
 static uint64_t
 metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
-    uint64_t max_search)
+    uint64_t max_size, uint64_t max_search, uint64_t *found_size)
 {
 	if (*cursor == 0)
 		*cursor = rt->rt_start;
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size,
-	    &where);
+	    max_size, &where);
 	uint64_t first_found;
 	int count_searched = 0;
 
@@ -1733,7 +1746,9 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
 	    max_search || count_searched < metaslab_min_search_count)) {
 		uint64_t offset = zfs_rs_get_start(rs, rt);
 		if (offset + size <= zfs_rs_get_end(rs, rt)) {
-			*cursor = offset + size;
+			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
+			    max_size);
+			*cursor = offset + *found_size;
 			return (offset);
 		}
 		rs = zfs_btree_next(bt, &where, &where);
@@ -1741,12 +1756,16 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
 	}
 
 	*cursor = 0;
+	*found_size = 0;
 	return (-1ULL);
 }
 
-static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
-static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
-static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
+static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
+static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
 metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 static metaslab_ops_t metaslab_allocators[] = {
@@ -1832,7 +1851,8 @@ metaslab_allocator(spa_t *spa)
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	/*
 	 * Find the largest power of 2 block size that evenly divides the
@@ -1841,7 +1861,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 	 * bucket) but it does not guarantee that other allocations sizes
 	 * may exist in the same region.
 	 */
-	uint64_t align = size & -size;
+	uint64_t align = max_size & -max_size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
@@ -1855,10 +1875,18 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 	 */
 	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
+		align = size & -size;
+		cursor = &msp->ms_lbas[highbit64(align) - 1];
 		offset = -1;
 	} else {
-		offset = metaslab_block_picker(rt,
-		    cursor, size, metaslab_df_max_search);
+		offset = metaslab_block_picker(rt, cursor, size, max_size,
+		    metaslab_df_max_search, found_size);
+		if (max_size != size && offset == -1) {
+			align = size & -size;
+			cursor = &msp->ms_lbas[highbit64(align) - 1];
+			offset = metaslab_block_picker(rt, cursor, size,
+			    max_size, metaslab_df_max_search, found_size);
+		}
 	}
 
 	if (offset == -1) {
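size & -size isolates the lowest set bit of size, i.e. the largest power of two that evenly divides it; metaslab_df_alloc() keys its per-alignment cursor off that value, and the fallback added above recomputes it from the minimum size when the max-size search fails. For example:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	uint64_t sizes[] = { 0x6000, 0x20000, 0x5000 };

    	for (int i = 0; i < 3; i++) {
    		/* Lowest set bit: the largest power of 2 dividing size. */
    		uint64_t align = sizes[i] & -sizes[i];
    		printf("size 0x%llx -> align 0x%llx\n",
    		    (unsigned long long)sizes[i],
    		    (unsigned long long)align);	/* 0x2000, 0x20000, 0x1000 */
    	}
    	return (0);
    }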
@@ -1873,12 +1901,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 			zfs_btree_index_t where;
 			/* use segment of this size, or next largest */
 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
-			    rt, msp->ms_start, size, &where);
+			    rt, msp->ms_start, size, max_size, &where);
 		}
 		if (rs != NULL && zfs_rs_get_start(rs, rt) + size <=
 		    zfs_rs_get_end(rs, rt)) {
 			offset = zfs_rs_get_start(rs, rt);
-			*cursor = offset + size;
+			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
+			    max_size);
+			*cursor = offset + *found_size;
 		}
 	}
 
@@ -1895,7 +1925,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
  * ==========================================================================
  */
 static uint64_t
-metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
@@ -1922,7 +1953,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 	}
 
 	offset = *cursor;
-	*cursor += size;
+	*found_size = MIN(*cursor_end - offset, max_size);
+	*cursor = offset + *found_size;
 
 	return (offset);
 }
@@ -1943,33 +1975,43 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
-metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch;
-	uint64_t hbit = highbit64(size);
+	uint64_t hbit = highbit64(max_size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
-	uint64_t max_size = metaslab_largest_allocatable(msp);
+	uint64_t max_possible_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
-	if (max_size < size)
+	if (max_possible_size < size)
 		return (-1ULL);
 
 	zfs_rs_set_start(&rsearch, rt, *cursor);
-	zfs_rs_set_end(&rsearch, rt, *cursor + size);
+	zfs_rs_set_end(&rsearch, rt, *cursor + max_size);
 
 	rs = zfs_btree_find(t, &rsearch, &where);
+	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
+	    max_size) {
+		hbit = highbit64(size);
+		cursor = &msp->ms_lbas[hbit - 1];
+		zfs_rs_set_start(&rsearch, rt, *cursor);
+		zfs_rs_set_end(&rsearch, rt, *cursor + size);
+
+		rs = zfs_btree_find(t, &rsearch, &where);
+	}
 	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
 	    size) {
 		t = &msp->ms_allocatable_by_size;
 
 		zfs_rs_set_start(&rsearch, rt, 0);
-		zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
-		    metaslab_ndf_clump_shift)));
+		zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size,
+		    1ULL << (hbit + metaslab_ndf_clump_shift)));
 
 		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
@@ -1978,7 +2020,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 	}
 
 	if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) {
-		*cursor = zfs_rs_get_start(rs, rt) + size;
+		*found_size = MIN(zfs_rs_get_end(rs, rt) -
+		    zfs_rs_get_start(rs, rt), max_size);
+		*cursor = zfs_rs_get_start(rs, rt) + *found_size;
 		return (zfs_rs_get_start(rs, rt));
 	}
 	return (-1ULL);
@@ -4668,6 +4712,15 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
+void
+metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new)
+{
+	ASSERT0(new->zal_size);
+	list_move_tail(&new->zal_list, &old->zal_list);
+	new->zal_size = old->zal_size;
+	list_destroy(&old->zal_list);
+}
+
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
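metaslab_trace_move() transplants the accumulated trace entries from one zio_alloc_list_t to another (the destination must start empty) and destroys the source; later in this commit the gang path uses it to hand a child's pre-allocation trace to the child zio once the allocation sticks. A toy of the same move-and-destroy pattern (standalone, simplified list type; not ZFS code):

    #include <assert.h>
    #include <stddef.h>

    struct node { struct node *next; };
    struct list { struct node *head; int size; };

    /* Move every entry from old to new; old is left empty ("destroyed"). */
    static void
    list_move(struct list *old, struct list *new)
    {
    	assert(new->size == 0);		/* destination must start empty */
    	new->head = old->head;
    	new->size = old->size;
    	old->head = NULL;
    	old->size = 0;
    }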
@@ -4697,7 +4750,7 @@ static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
     int flags, uint64_t psize, const void *tag)
 {
-	if (!(flags & METASLAB_ASYNC_ALLOC))
+	if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
@@ -4708,11 +4761,22 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
 	(void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag);
 }
 
+void
+metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator,
+    int flags, uint64_t psize, const void *tag)
+{
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
+		metaslab_group_alloc_increment(spa, vdev, allocator, flags,
+		    psize, tag);
+	}
+}
+
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator,
     int flags, uint64_t psize, const void *tag)
 {
-	if (!(flags & METASLAB_ASYNC_ALLOC))
+	if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
@@ -4724,7 +4788,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator,
 }
 
 static uint64_t
-metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t txg, uint64_t *actual_size)
 {
 	uint64_t start;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
@@ -4735,8 +4800,9 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 	VERIFY0(msp->ms_disabled);
 	VERIFY0(msp->ms_new);
 
-	start = mc->mc_ops->msop_alloc(msp, size);
+	start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size);
 	if (start != -1ULL) {
+		size = *actual_size;
 		metaslab_group_t *mg = msp->ms_group;
 		vdev_t *vd = mg->mg_vd;
 
@@ -4879,8 +4945,9 @@ metaslab_active_mask_verify(metaslab_t *msp)
 
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator,
-    boolean_t try_hard)
+    uint64_t asize, uint64_t max_asize, uint64_t txg,
+    dva_t *dva, int d, int allocator, boolean_t try_hard,
+    uint64_t *actual_asize)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
@@ -5095,16 +5162,19 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
 			continue;
 		}
 
-		offset = metaslab_block_alloc(msp, asize, txg);
-		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
+		offset = metaslab_block_alloc(msp, asize, max_asize, txg,
+		    actual_asize);
 
 		if (offset != -1ULL) {
+			metaslab_trace_add(zal, mg, msp, *actual_asize, d,
+			    offset, allocator);
 			/* Proactively passivate the metaslab, if needed */
 			if (activated)
 				metaslab_segment_may_passivate(msp);
 			mutex_exit(&msp->ms_lock);
 			break;
 		}
+		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 next:
 		ASSERT(msp->ms_loaded);
 
@@ -5243,13 +5313,10 @@ metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize,
 	return (B_TRUE);
 }
 
-/*
- * Allocate a block for the specified i/o.
- */
-int
-metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
-    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
-    zio_alloc_list_t *zal, int allocator)
+static int
+metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg,
+    int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_t *mg = NULL, *rotor;
@@ -5272,6 +5339,13 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
+	if (max_psize > psize && max_psize >= metaslab_force_ganging &&
+	    metaslab_force_ganging_pct > 0 &&
+	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
+		max_psize = MAX((psize + max_psize) / 2,
+		    metaslab_force_ganging);
+	}
+	ASSERT3U(psize, <=, max_psize);
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
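The new branch only fires for range requests (max_psize > psize), and then only with probability metaslab_force_ganging_pct once the upper bound reaches metaslab_force_ganging; when it fires, the upper bound is squeezed toward the midpoint but never below the threshold, so forced-ganging testing still produces multi-leaf gangs. Worked numbers (the tunable values here are invented for illustration; the shipped defaults differ):

    #include <stdint.h>
    #include <stdio.h>

    #define	MAX(a, b)	((a) > (b) ? (a) : (b))

    int
    main(void)
    {
    	/* Hypothetical tunable setting, lowered for illustration. */
    	uint64_t force_ganging = 64 * 1024;
    	uint64_t psize = 32 * 1024, max_psize = 128 * 1024;

    	if (max_psize > psize && max_psize >= force_ganging) {
    		/* ... and the pct dice roll came up true ... */
    		max_psize = MAX((psize + max_psize) / 2, force_ganging);
    	}
    	printf("clamped max_psize: %llu\n",	/* 81920 (80K) */
    	    (unsigned long long)max_psize);
    	return (0);
    }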
@@ -5319,11 +5393,18 @@ top:
 
 		vd = mg->mg_vd;
 		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
-		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
-		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
-		    dva, d, allocator, try_hard);
+		ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+		uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize,
+		    txg);
+		ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift));
+		uint64_t offset = metaslab_group_alloc(mg, zal, asize,
+		    max_asize, txg, dva, d, allocator, try_hard,
+		    &asize);
 
 		if (offset != -1ULL) {
+			if (actual_psize)
+				*actual_psize = vdev_asize_to_psize_txg(vd,
+				    asize, txg);
 			metaslab_class_rotate(mg, allocator, psize, B_TRUE);
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -5354,6 +5435,18 @@ next:
 	return (SET_ERROR(ENOSPC));
 }
 
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+    zio_alloc_list_t *zal, int allocator)
+{
+	return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva,
+	    txg, flags, zal, allocator, NULL));
+}
+
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
@@ -5841,6 +5934,16 @@ int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, int allocator, const void *tag)
+{
+	return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg,
+	    hintbp, flags, zal, allocator, tag, NULL));
+}
+
+int
+metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg,
+    blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator,
+    const void *tag, uint64_t *actual_psize)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
@@ -5862,9 +5965,12 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
+	uint64_t cur_psize = 0;
+
 	for (int d = 0; d < ndvas; d++) {
-		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
-		    txg, flags, zal, allocator);
+		error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
+		    dva, d, hintdva, txg, flags, zal, allocator,
+		    actual_psize ? &cur_psize : NULL);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -5883,10 +5989,14 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
 			    tag);
+			if (actual_psize)
+				max_psize = MIN(cur_psize, max_psize);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+	if (actual_psize)
+		*actual_psize = max_psize;
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
@@ -185,7 +185,8 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
 	ASSERT3U(rc->rc_count, >=, number);
 	ref = avl_find(&rc->rc_tree, &s, NULL);
 	if (unlikely(ref == NULL)) {
-		panic("No such hold %p on refcount %llx", holder,
+		PANIC("No such hold %llx on refcount %llx",
+		    (u_longlong_t)(uintptr_t)holder,
 		    (u_longlong_t)(uintptr_t)rc);
 		return (-1);
 	}
@@ -323,6 +323,19 @@ vdev_derive_alloc_bias(const char *bias)
 	return (alloc_bias);
 }
 
+uint64_t
+vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift));
+	uint64_t csize, psize = asize;
+	for (int c = 0; c < vd->vdev_children; c++) {
+		csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg);
+		psize = MIN(psize, csize);
+	}
+
+	return (psize);
+}
+
 /*
  * Default asize function: return the MAX of psize with the asize of
  * all children. This is what's used by anything other than RAID-Z.
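vdev_default_psize() added above mirrors vdev_default_asize(): the asize direction takes the MAX over the children (the most pessimistic inflation), so the psize direction must take the MIN (the largest size that every child can still deflate safely). A tiny standalone illustration with invented per-child values:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	/* Per-child deflated sizes for one asize (hypothetical). */
    	uint64_t child_psize[] = { 131072, 98304, 131072 };
    	uint64_t psize = child_psize[0];

    	for (int c = 1; c < 3; c++)
    		if (child_psize[c] < psize)
    			psize = child_psize[c];
    	/* Only 98304 bytes are safe on every child. */
    	printf("safe psize: %llu\n", (unsigned long long)psize);
    	return (0);
    }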
@@ -4135,17 +4148,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
+uint64_t
+vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg));
+}
 
 /*
  * Return the amount of space that should be (or was) allocated for the given
  * psize (compressed block size) in the given TXG. Note that for expanded
  * RAIDZ vdevs, the size allocated for older BP's may be larger. See
- * vdev_raidz_asize().
+ * vdev_raidz_psize_to_asize().
  */
 uint64_t
 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
-	return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+	return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg));
 }
 
 uint64_t
@@ -578,7 +578,7 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc,
 * i.e. vdev_draid_psize_to_asize().
 */
 static uint64_t
-vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+vdev_draid_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	(void) txg;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
@@ -599,8 +599,9 @@ vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 * Deflate the asize to the psize, this includes stripping parity.
 */
 uint64_t
-vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
+vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
 {
+	(void) txg;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT0(asize % vdc->vdc_groupwidth);
@@ -962,7 +963,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t io_size = abd_size;
-	uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
+	uint64_t io_asize = vdev_draid_psize_to_asize(vd, io_size, 0);
 	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
 	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
 
@@ -972,7 +973,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
 	 */
 	if (io_offset + io_asize > start_offset) {
 		io_size = vdev_draid_asize_to_psize(vd,
-		    start_offset - io_offset);
+		    start_offset - io_offset, 0);
 	}
 
 	/*
@@ -1117,7 +1118,7 @@ vdev_draid_map_alloc(zio_t *zio)
 		if (size < abd_size) {
 			vdev_t *vd = zio->io_vd;
 
-			io_offset += vdev_draid_asize(vd, size, 0);
+			io_offset += vdev_draid_psize_to_asize(vd, size, 0);
 			abd_offset += size;
 			abd_size -= size;
 			nrows++;
@@ -1770,7 +1771,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t asize = vdev_draid_asize(vd, psize, 0);
+	uint64_t asize = vdev_draid_psize_to_asize(vd, psize, 0);
 
 	if (phys_birth == TXG_UNKNOWN) {
 		/*
@@ -1827,7 +1828,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
-	    vdev_draid_asize(vd, rr->rr_size, 0);
+	    vdev_draid_psize_to_asize(vd, rr->rr_size, 0);
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
@@ -2311,7 +2312,8 @@ vdev_ops_t vdev_draid_ops = {
 	.vdev_op_fini = vdev_draid_fini,
 	.vdev_op_open = vdev_draid_open,
 	.vdev_op_close = vdev_draid_close,
-	.vdev_op_asize = vdev_draid_asize,
+	.vdev_op_psize_to_asize = vdev_draid_psize_to_asize,
+	.vdev_op_asize_to_psize = vdev_draid_asize_to_psize,
 	.vdev_op_min_asize = vdev_draid_min_asize,
 	.vdev_op_min_alloc = vdev_draid_min_alloc,
 	.vdev_op_io_start = vdev_draid_io_start,
@@ -2801,7 +2803,8 @@ vdev_ops_t vdev_draid_spare_ops = {
 	.vdev_op_fini = vdev_draid_spare_fini,
 	.vdev_op_open = vdev_draid_spare_open,
 	.vdev_op_close = vdev_draid_spare_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_draid_spare_io_start,
@@ -313,7 +313,8 @@ vdev_ops_t vdev_file_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
@@ -343,7 +344,7 @@ vdev_ops_t vdev_disk_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
@@ -1867,7 +1867,8 @@ vdev_ops_t vdev_indirect_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_indirect_open,
 	.vdev_op_close = vdev_indirect_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_indirect_io_start,
@@ -972,7 +972,8 @@ vdev_ops_t vdev_mirror_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
@@ -997,7 +998,8 @@ vdev_ops_t vdev_replacing_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
@@ -1022,7 +1024,8 @@ vdev_ops_t vdev_spare_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
@@ -85,7 +85,8 @@ vdev_ops_t vdev_missing_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
@@ -110,7 +111,8 @@ vdev_ops_t vdev_hole_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
@@ -2235,6 +2235,33 @@ vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
 	mutex_exit(&vdrz->vd_expand_lock);
 	return (width);
 }
+/*
+ * This code converts an asize into the largest psize that can safely be written
+ * to an allocation of that size for this vdev.
+ *
+ * Note that this function will not take into account the effect of gang
+ * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
+ * the psize_to_asize function.
+ */
+static uint64_t
+vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	vdev_raidz_t *vdrz = vd->vdev_tsd;
+	uint64_t psize;
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t cols = vdrz->vd_original_width;
+	uint64_t nparity = vdrz->vd_nparity;
+
+	cols = vdev_raidz_get_logical_width(vdrz, txg);
+
+	ASSERT0(asize % (1 << ashift));
+
+	psize = (asize >> ashift);
+	psize -= nparity * DIV_ROUND_UP(psize, cols);
+	psize <<= ashift;
+
+	return (asize);
+}
 
 /*
 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
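Plugging the earlier 128k example into the new deflation arithmetic: raidz1 (nparity = 1) on 5 disks with 512-byte sectors allocates 320 sectors for a 128k block, and the inverse computation recovers the 256 data sectors, i.e. 128k. A standalone recomputation with DIV_ROUND_UP written out:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
    	uint64_t ashift = 9, cols = 5, nparity = 1;
    	uint64_t asize = 320 << ashift;		/* the 160k from above */

    	uint64_t psize = asize >> ashift;
    	/* Subtract one parity sector per (up to) cols-sector stripe. */
    	psize -= nparity * ((psize + cols - 1) / cols);	/* DIV_ROUND_UP */
    	psize <<= ashift;

    	printf("psize: %llu\n", (unsigned long long)psize); /* 131072 */
    	return (0);
    }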
@ -2245,7 +2272,7 @@ vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
|
|||||||
* allocate P+1 sectors regardless of width ("cols", which is at least P+1).
|
* allocate P+1 sectors regardless of width ("cols", which is at least P+1).
|
||||||
*/
|
*/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
|
vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
|
||||||
{
|
{
|
||||||
vdev_raidz_t *vdrz = vd->vdev_tsd;
|
vdev_raidz_t *vdrz = vd->vdev_tsd;
|
||||||
uint64_t asize;
|
uint64_t asize;
|
||||||
@ -2309,7 +2336,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
|
|||||||
zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
|
zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
|
||||||
logical_rs.rs_start = rr->rr_offset;
|
logical_rs.rs_start = rr->rr_offset;
|
||||||
logical_rs.rs_end = logical_rs.rs_start +
|
logical_rs.rs_end = logical_rs.rs_start +
|
||||||
vdev_raidz_asize(zio->io_vd, rr->rr_size,
|
vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
|
||||||
BP_GET_BIRTH(zio->io_bp));
|
BP_GET_BIRTH(zio->io_bp));
|
||||||
|
|
||||||
raidz_col_t *rc = &rr->rr_col[col];
|
raidz_col_t *rc = &rr->rr_col[col];
|
||||||
@ -5093,7 +5120,8 @@ vdev_ops_t vdev_raidz_ops = {
|
|||||||
.vdev_op_fini = vdev_raidz_fini,
|
.vdev_op_fini = vdev_raidz_fini,
|
||||||
.vdev_op_open = vdev_raidz_open,
|
.vdev_op_open = vdev_raidz_open,
|
||||||
.vdev_op_close = vdev_raidz_close,
|
.vdev_op_close = vdev_raidz_close,
|
||||||
.vdev_op_asize = vdev_raidz_asize,
|
.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
|
||||||
|
.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
|
||||||
.vdev_op_min_asize = vdev_raidz_min_asize,
|
.vdev_op_min_asize = vdev_raidz_min_asize,
|
||||||
.vdev_op_min_alloc = NULL,
|
.vdev_op_min_alloc = NULL,
|
||||||
.vdev_op_io_start = vdev_raidz_io_start,
|
.vdev_op_io_start = vdev_raidz_io_start,
|
||||||
|
@ -529,7 +529,7 @@ vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
|
|||||||
vd->vdev_ops == &vdev_spare_ops);
|
vd->vdev_ops == &vdev_spare_ops);
|
||||||
|
|
||||||
uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
|
uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
|
||||||
vdev_draid_asize_to_psize(vd, asize) : asize;
|
vdev_draid_asize_to_psize(vd, asize, 0) : asize;
|
||||||
|
|
||||||
BP_ZERO(bp);
|
BP_ZERO(bp);
|
||||||
|
|
||||||
|
@ -147,7 +147,8 @@ vdev_ops_t vdev_root_ops = {
|
|||||||
.vdev_op_fini = NULL,
|
.vdev_op_fini = NULL,
|
||||||
.vdev_op_open = vdev_root_open,
|
.vdev_op_open = vdev_root_open,
|
||||||
.vdev_op_close = vdev_root_close,
|
.vdev_op_close = vdev_root_close,
|
||||||
.vdev_op_asize = vdev_default_asize,
|
.vdev_op_psize_to_asize = vdev_default_asize,
|
||||||
|
.vdev_op_asize_to_psize = vdev_default_psize,
|
||||||
.vdev_op_min_asize = vdev_default_min_asize,
|
.vdev_op_min_asize = vdev_default_min_asize,
|
||||||
.vdev_op_min_alloc = NULL,
|
.vdev_op_min_alloc = NULL,
|
||||||
.vdev_op_io_start = NULL, /* not applicable to the root */
|
.vdev_op_io_start = NULL, /* not applicable to the root */
|
||||||
|
@ -1022,6 +1022,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
|||||||
zio->io_logical = zio;
|
zio->io_logical = zio;
|
||||||
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
|
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
|
||||||
pipeline |= ZIO_GANG_STAGES;
|
pipeline |= ZIO_GANG_STAGES;
|
||||||
|
if (flags & ZIO_FLAG_PREALLOCATED) {
|
||||||
|
BP_ZERO_DVAS(zio->io_bp);
|
||||||
|
BP_SET_BIRTH(zio->io_bp, 0, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
zio->io_spa = spa;
|
zio->io_spa = spa;
|
||||||
@@ -3092,7 +3096,12 @@ zio_write_gang_member_ready(zio_t *zio)
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
-	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+	/*
+	 * If we're getting direct-invoked from zio_write_gang_block(),
+	 * the bp_orig will be set.
+	 */
+	ASSERT(BP_IS_HOLE(&zio->io_bp_orig) ||
+	    zio->io_flags & ZIO_FLAG_PREALLOCATED);
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
@@ -3134,7 +3143,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
-	uint64_t psize;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
@@ -3203,14 +3211,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	}
 
 	/*
-	 * Create and nowait the gang children.
+	 * Create and nowait the gang children. First, we try to do
+	 * opportunistic allocations. If that fails to generate enough
+	 * space, we fall back to normal zio_write calls for nested gang.
 	 */
-	for (int g = 0; resid != 0; resid -= psize, g++) {
-		psize = zio_roundup_alloc_size(spa,
-		    resid / (SPA_GBH_NBLKPTRS - g));
-		psize = MIN(resid, psize);
-		ASSERT3U(psize, >=, SPA_MINBLOCKSIZE);
+	for (int g = 0; resid != 0; g++) {
+		flags &= METASLAB_ASYNC_ALLOC;
+		flags |= METASLAB_GANG_CHILD;
 
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
@@ -3228,14 +3235,38 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
-		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
-		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
-		    resid) : NULL, psize, psize, &zp,
-		    zio_write_gang_member_ready, NULL,
-		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
-		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+		uint64_t min_size = zio_roundup_alloc_size(spa,
+		    resid / (SPA_GBH_NBLKPTRS - g));
+		min_size = MIN(min_size, resid);
+		bp = &gbh->zg_blkptr[g];
+
+		zio_alloc_list_t cio_list;
+		metaslab_trace_init(&cio_list);
+		uint64_t allocated_size = UINT64_MAX;
+		error = metaslab_alloc_range(spa, mc, min_size, resid,
+		    bp, gio->io_prop.zp_copies, txg, NULL,
+		    flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
+
+		boolean_t allocated = error == 0;
+
+		uint64_t psize = allocated ? MIN(resid, allocated_size) :
+		    min_size;
+
+		zio_t *cio = zio_write(zio, spa, txg, bp, has_data ?
+		    abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL,
+		    psize, psize, &zp, zio_write_gang_member_ready, NULL,
+		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio) |
+		    (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark);
+
+		resid -= psize;
 		zio_gang_inherit_allocator(zio, cio);
+		if (allocated) {
+			metaslab_trace_move(&cio_list, &cio->io_alloc_list);
+			metaslab_group_alloc_increment_all(spa,
+			    &cio->io_bp_orig, zio->io_allocator, flags, psize,
+			    cio);
+		}
 		/*
 		 * We do not reserve for the child writes, since we already
 		 * reserved for the parent. Unreserve though will be called
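The `min_size` request asks for at least an even share of the remaining data across the gang header's unused slots, while `metaslab_alloc_range()` may return anything up to `resid`; a larger return means the record is covered in fewer leaves. A standalone arithmetic sketch of the shrinking minimum, under two stated assumptions: `SPA_GBH_NBLKPTRS` is 3 (a classic 512-byte gang header) and `zio_roundup_alloc_size()` is modeled as rounding up to a 512-byte multiple:

	#include <stdint.h>
	#include <stdio.h>

	#define	SPA_GBH_NBLKPTRS	3	/* leaves per classic gang header */

	/* stand-in for zio_roundup_alloc_size(): next 512-byte multiple */
	static uint64_t
	roundup512(uint64_t size)
	{
		return ((size + 511) & ~511ULL);
	}

	int
	main(void)
	{
		uint64_t resid = 131072;	/* a 128K record being ganged */

		for (int g = 0; resid != 0; g++) {
			uint64_t min_size = roundup512(resid /
			    (SPA_GBH_NBLKPTRS - g));
			if (min_size > resid)
				min_size = resid;
			/* pretend the allocator returned exactly min_size */
			uint64_t psize = min_size;
			resid -= psize;
			printf("leaf %d: %llu bytes, %llu left\n", g,
			    (unsigned long long)psize,
			    (unsigned long long)resid);
		}
		return (0);
	}

Under these assumptions the record gangs into three leaves of 44032, 43520, and 43520 bytes; when the allocator finds a larger contiguous run, fewer leaves are needed, which is the point of ranged allocation.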
@@ -4140,6 +4171,14 @@ zio_dva_allocate(zio_t *zio)
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
+	if (zio->io_flags & ZIO_FLAG_PREALLOCATED) {
+		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
+		memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
+		    3 * sizeof (dva_t));
+		BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
+		    BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+		return (zio);
+	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
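This hunk closes the preallocated round trip: `metaslab_alloc_range()` in the parent wrote real DVAs into the gang header blkptr, `zio_create()` stashed them in `io_bp_orig`, and here they are copied back instead of being allocated a second time. Copying all three `dva_t` slots is safe even when `zp_copies` is smaller, because unused slots in `io_bp_orig` remain zeroed. The flow, summarized as a comment sketch rather than new code:

	/*
	 * 1. zio_write_gang_block(): metaslab_alloc_range() fills
	 *    gbh->zg_blkptr[g]; child gets ZIO_FLAG_PREALLOCATED.
	 * 2. zio_create(): saves that bp in io_bp_orig, zeroes the working bp.
	 * 3. zio_dva_allocate(): restores DVAs and birth from io_bp_orig and
	 *    returns early, skipping a second allocation.
	 */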
@@ -726,7 +726,8 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
 tags = ['functional', 'features', 'large_dnode']
 
 [tests/functional/gang_blocks]
-tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
+tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
+    'gang_blocks_ddt_copies']
 tags = ['functional', 'gang_blocks']
 
 [tests/functional/grow]
@@ -428,6 +428,10 @@ tests = ['large_dnode_003_pos', 'large_dnode_004_neg',
 'large_dnode_005_pos', 'large_dnode_007_neg']
 tags = ['functional', 'features', 'large_dnode']
 
+[tests/functional/gang_blocks]
+tests = ['gang_blocks_001_pos']
+tags = ['functional', 'gang_blocks']
+
 [tests/functional/grow]
 pre =
 post =
@@ -1562,6 +1562,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
 	functional/gang_blocks/cleanup.ksh \
+	functional/gang_blocks/gang_blocks_001_pos.ksh \
 	functional/gang_blocks/gang_blocks_ddt_copies.ksh \
 	functional/gang_blocks/gang_blocks_redundant.ksh \
 	functional/gang_blocks/setup.ksh \
tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh (new executable file, 59 lines)
@@ -0,0 +1,59 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that gang block functionality behaves correctly.
+#
+# Strategy:
+# 1. Create a pool without dynamic gang headers.
+# 2. Set metaslab_force_ganging to force gang blocks to be created.
+# 3. Verify that gang blocks can be read, written, and freed.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Gang blocks behave correctly."
+
+preamble
+log_onexit cleanup
+
+log_must zpool create -f $TESTPOOL $DISKS
+log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 100000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=128k count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -v hole | wc -l)
+[[ "$leaves" -gt 1 ]] || log_fail "Only one leaf in gang block, should not be possible"
+
+orig_checksum="$(cat $path | xxh128digest)"
+
+log_must verify_pool $TESTPOOL
+log_must zinject -a
+new_checksum="$(cat $path | xxh128digest)"
+[[ "$orig_checksum" == "$new_checksum" ]] || log_fail "Checksum mismatch"
+
+log_must rm $path
+log_must verify_pool $TESTPOOL
+
+log_pass "Gang blocks behave correctly."