diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 766d582c0..1bfa44a38 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -41,7 +41,7 @@ extern "C" { typedef struct metaslab_ops { const char *msop_name; - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; @@ -82,6 +82,9 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *); +int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, + int, const void *, uint64_t *); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); @@ -95,6 +98,7 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); +void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); @@ -127,6 +131,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int, + uint64_t, const void *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 5aad22dba..7f457c3a0 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -134,6 +134,8 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, + uint64_t txg); extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index d44ab6681..e923092a3 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -95,7 +95,7 @@ extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); */ extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); -extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t); extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *); extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a2a3e25d1..385d7224f 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -103,7 +103,8 @@ typedef const struct vdev_ops { vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; + vdev_asize_func_t *vdev_op_psize_to_asize; + vdev_asize_func_t *vdev_op_asize_to_psize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; @@ -615,6 +616,7 @@ 
extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); +extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); diff --git a/include/sys/zio.h b/include/sys/zio.h index 78adca4d7..ea3809ce0 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -227,6 +227,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) #define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 3da8976ca..68aa2f2cb 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -5436,12 +5436,12 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another - * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated - * and free properties will be adjusted by 8k. The dataset will not be charged - * 8k. Rather, it will be charged a value that is scaled according to the - * overhead of the 128k block on the same vdev. This 8k allocation will be - * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as - * calculated in the 128k block example above. + * for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's + * allocated and free properties will be adjusted by 8k. The dataset will not + * be charged 8k. Rather, it will be charged a value that is scaled according + * to the overhead of the 128k block on the same vdev. This 8k allocation will + * be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is + * as calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 @@ -5488,7 +5488,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t -vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, +vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; @@ -5508,7 +5508,7 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, * size. 
*/ static uint64_t -vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, +vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { ASSERT3U(ndisks, >, nparity); @@ -5568,12 +5568,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) continue; /* allocation size for the "typical" 128k block */ - tsize = vdev_raidz_asize(ndisks, nparity, ashift, - SPA_OLD_MAXBLOCKSIZE); + tsize = vdev_raidz_psize_to_asize(ndisks, nparity, + ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ - asize = vdev_raidz_asize(ndisks, nparity, ashift, - blksize); + asize = vdev_raidz_psize_to_asize(ndisks, nparity, + ashift, blksize); } else { uint64_t ndata; @@ -5582,12 +5582,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) continue; /* allocation size for the "typical" 128k block */ - tsize = vdev_draid_asize(ndata + nparity, nparity, - ashift, SPA_OLD_MAXBLOCKSIZE); + tsize = vdev_draid_psize_to_asize(ndata + nparity, + nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ - asize = vdev_draid_asize(ndata + nparity, nparity, - ashift, blksize); + asize = vdev_draid_psize_to_asize(ndata + nparity, + nparity, ashift, blksize); } /* diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 7acf37ba9..b75d1ccea 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1276,7 +1276,8 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_geom_open, .vdev_op_close = vdev_geom_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_geom_io_start, diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 29e54b39a..face4611d 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1554,7 +1554,8 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 8e7138984..58f0975e1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1692,17 +1692,30 @@ metaslab_largest_unflushed_free(metaslab_t *msp) static zfs_range_seg_t * metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, - uint64_t size, zfs_btree_index_t *where) + uint64_t size, uint64_t max_size, zfs_btree_index_t *where) { zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); - zfs_rs_set_end(&rsearch, rt, start + size); + zfs_rs_set_end(&rsearch, rt, start + max_size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = zfs_btree_next(t, where, where); + if (size == max_size) { + rs = zfs_btree_next(t, where, where); + } else { + /* + * If we're searching for a range, get the largest + * segment in that range, or the smallest one bigger + * than it. 
+ */ + rs = zfs_btree_prev(t, where, where); + if (rs == NULL || zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt) < size) { + rs = zfs_btree_next(t, where, where); + } + } } return (rs); @@ -1715,14 +1728,14 @@ metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, */ static uint64_t metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, - uint64_t max_search) + uint64_t max_size, uint64_t max_search, uint64_t *found_size) { if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, - &where); + max_size, &where); uint64_t first_found; int count_searched = 0; @@ -1733,7 +1746,9 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = zfs_rs_get_start(rs, rt); if (offset + size <= zfs_rs_get_end(rs, rt)) { - *cursor = offset + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; return (offset); } rs = zfs_btree_next(bt, &where, &where); @@ -1741,12 +1756,16 @@ metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, } *cursor = 0; + *found_size = 0; return (-1ULL); } -static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); -static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); +static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, + uint64_t max_size, uint64_t *found_size); metaslab_ops_t *metaslab_allocator(spa_t *spa); static metaslab_ops_t metaslab_allocators[] = { @@ -1832,7 +1851,8 @@ metaslab_allocator(spa_t *spa) * ========================================================================== */ static uint64_t -metaslab_df_alloc(metaslab_t *msp, uint64_t size) +metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { /* * Find the largest power of 2 block size that evenly divides the @@ -1841,7 +1861,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. 
*/ - uint64_t align = size & -size; + uint64_t align = max_size & -max_size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; zfs_range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; @@ -1855,10 +1875,18 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; offset = -1; } else { - offset = metaslab_block_picker(rt, - cursor, size, metaslab_df_max_search); + offset = metaslab_block_picker(rt, cursor, size, max_size, + metaslab_df_max_search, found_size); + if (max_size != size && offset == -1) { + align = size & -size; + cursor = &msp->ms_lbas[highbit64(align) - 1]; + offset = metaslab_block_picker(rt, cursor, size, + max_size, metaslab_df_max_search, found_size); + } } if (offset == -1) { @@ -1873,12 +1901,14 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) zfs_btree_index_t where; /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, - rt, msp->ms_start, size, &where); + rt, msp->ms_start, size, max_size, &where); } if (rs != NULL && zfs_rs_get_start(rs, rt) + size <= zfs_rs_get_end(rs, rt)) { offset = zfs_rs_get_start(rs, rt); - *cursor = offset + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, + max_size); + *cursor = offset + *found_size; } } @@ -1895,7 +1925,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) * ========================================================================== */ static uint64_t -metaslab_cf_alloc(metaslab_t *msp, uint64_t size) +metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; @@ -1922,7 +1953,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) } offset = *cursor; - *cursor += size; + *found_size = MIN(*cursor_end - offset, max_size); + *cursor = offset + *found_size; return (offset); } @@ -1943,33 +1975,43 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) uint64_t metaslab_ndf_clump_shift = 4; static uint64_t -metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t *found_size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch; - uint64_t hbit = highbit64(size); + uint64_t hbit = highbit64(max_size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; - uint64_t max_size = metaslab_largest_allocatable(msp); + uint64_t max_possible_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - if (max_size < size) + if (max_possible_size < size) return (-1ULL); zfs_rs_set_start(&rsearch, rt, *cursor); - zfs_rs_set_end(&rsearch, rt, *cursor + size); + zfs_rs_set_end(&rsearch, rt, *cursor + max_size); rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < + max_size) { + hbit = highbit64(size); + cursor = &msp->ms_lbas[hbit - 1]; + zfs_rs_set_start(&rsearch, rt, *cursor); + zfs_rs_set_end(&rsearch, rt, *cursor + size); + + rs = zfs_btree_find(t, &rsearch, &where); + } if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; zfs_rs_set_start(&rsearch, rt, 0); - zfs_rs_set_end(&rsearch, rt, MIN(max_size, 
1ULL << (hbit + - metaslab_ndf_clump_shift))); + zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size, + 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) @@ -1978,7 +2020,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) } if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { - *cursor = zfs_rs_get_start(rs, rt) + size; + *found_size = MIN(zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt), max_size); + *cursor = zfs_rs_get_start(rs, rt) + *found_size; return (zfs_rs_get_start(rs, rt)); } return (-1ULL); @@ -4668,6 +4712,15 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } +void +metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new) +{ + ASSERT0(new->zal_size); + list_move_tail(&new->zal_list, &old->zal_list); + new->zal_size = old->zal_size; + list_destroy(&old->zal_list); +} + void metaslab_trace_init(zio_alloc_list_t *zal) { @@ -4697,7 +4750,7 @@ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC)) + if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4708,11 +4761,22 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator, (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); } +void +metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator, + int flags, uint64_t psize, const void *tag) +{ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); + metaslab_group_alloc_increment(spa, vdev, allocator, flags, + psize, tag); + } +} + void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { - if (!(flags & METASLAB_ASYNC_ALLOC)) + if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; @@ -4724,7 +4788,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator, } static uint64_t -metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size, + uint64_t txg, uint64_t *actual_size) { uint64_t start; zfs_range_tree_t *rt = msp->ms_allocatable; @@ -4735,8 +4800,9 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY0(msp->ms_disabled); VERIFY0(msp->ms_new); - start = mc->mc_ops->msop_alloc(msp, size); + start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size); if (start != -1ULL) { + size = *actual_size; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; @@ -4879,8 +4945,9 @@ metaslab_active_mask_verify(metaslab_t *msp) static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator, - boolean_t try_hard) + uint64_t asize, uint64_t max_asize, uint64_t txg, + dva_t *dva, int d, int allocator, boolean_t try_hard, + uint64_t *actual_asize) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; @@ -5095,16 +5162,19 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); + offset = metaslab_block_alloc(msp, asize, max_asize, txg, + actual_asize); if (offset != -1ULL) { + metaslab_trace_add(zal, 
mg, msp, *actual_asize, d, + offset, allocator); /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); mutex_exit(&msp->ms_lock); break; } + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); next: ASSERT(msp->ms_loaded); @@ -5243,13 +5313,10 @@ metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize, return (B_TRUE); } -/* - * Allocate a block for the specified i/o. - */ -int -metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal, int allocator) +static int +metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, + int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg = NULL, *rotor; @@ -5272,6 +5339,13 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, allocator); return (SET_ERROR(ENOSPC)); } + if (max_psize > psize && max_psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { + max_psize = MAX((psize + max_psize) / 2, + metaslab_force_ganging); + } + ASSERT3U(psize, <=, max_psize); /* * Start at the rotor and loop through all mgs until we find something. @@ -5319,11 +5393,18 @@ top: vd = mg->mg_vd; uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); - ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - dva, d, allocator, try_hard); + ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, + txg); + ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); + uint64_t offset = metaslab_group_alloc(mg, zal, asize, + max_asize, txg, dva, d, allocator, try_hard, + &asize); if (offset != -1ULL) { + if (actual_psize) + *actual_psize = vdev_asize_to_psize_txg(vd, + asize, txg); metaslab_class_rotate(mg, allocator, psize, B_TRUE); DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -5354,6 +5435,18 @@ next: return (SET_ERROR(ENOSPC)); } +/* + * Allocate a block for the specified i/o. + */ +int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal, int allocator) +{ + return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva, + txg, flags, zal, allocator, NULL)); +} + void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) @@ -5841,6 +5934,16 @@ int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, const void *tag) +{ + return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg, + hintbp, flags, zal, allocator, tag, NULL)); +} + +int +metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg, + blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, + const void *tag, uint64_t *actual_psize) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; @@ -5862,9 +5965,12 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); + uint64_t cur_psize = 0; + for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal, allocator); + error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, + dva, d, hintdva, txg, flags, zal, allocator, + actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); @@ -5883,10 +5989,14 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); + if (actual_psize) + max_psize = MIN(cur_psize, max_psize); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); + if (actual_psize) + *actual_psize = max_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 9cd0cfbcf..aa2902d0b 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -185,7 +185,8 @@ zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, ASSERT3U(rc->rc_count, >=, number); ref = avl_find(&rc->rc_tree, &s, NULL); if (unlikely(ref == NULL)) { - panic("No such hold %p on refcount %llx", holder, + PANIC("No such hold %llx on refcount %llx", + (u_longlong_t)(uintptr_t)holder, (u_longlong_t)(uintptr_t)rc); return (-1); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9ac9a9fe6..4fab60336 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -323,6 +323,19 @@ vdev_derive_alloc_bias(const char *bias) return (alloc_bias); } +uint64_t +vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) +{ + ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift)); + uint64_t csize, psize = asize; + for (int c = 0; c < vd->vdev_children; c++) { + csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg); + psize = MIN(psize, csize); + } + + return (psize); +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. @@ -4135,17 +4148,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } +uint64_t +vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg)); +} /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See - * vdev_raidz_asize(). + * vdev_raidz_psize_to_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { - return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); + return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg)); } uint64_t diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index d39a05458..e0fafd0da 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -578,7 +578,7 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) +vdev_draid_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; @@ -599,8 +599,9 @@ vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) * Deflate the asize to the psize, this includes stripping parity. */ uint64_t -vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; ASSERT0(asize % vdc->vdc_groupwidth); @@ -962,7 +963,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); + uint64_t io_asize = vdev_draid_psize_to_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -972,7 +973,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, */ if (io_offset + io_asize > start_offset) { io_size = vdev_draid_asize_to_psize(vd, - start_offset - io_offset); + start_offset - io_offset, 0); } /* @@ -1117,7 +1118,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size, 0); + io_offset += vdev_draid_psize_to_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1770,7 +1771,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize, 0); + uint64_t asize = vdev_draid_psize_to_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1827,7 +1828,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size, 0); + vdev_draid_psize_to_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; @@ -2311,7 +2312,8 @@ vdev_ops_t vdev_draid_ops = { .vdev_op_fini = vdev_draid_fini, .vdev_op_open = vdev_draid_open, .vdev_op_close = vdev_draid_close, - .vdev_op_asize = vdev_draid_asize, + .vdev_op_psize_to_asize = vdev_draid_psize_to_asize, + .vdev_op_asize_to_psize = vdev_draid_asize_to_psize, .vdev_op_min_asize = vdev_draid_min_asize, .vdev_op_min_alloc = vdev_draid_min_alloc, .vdev_op_io_start = vdev_draid_io_start, @@ -2801,7 +2803,8 @@ vdev_ops_t vdev_draid_spare_ops = { .vdev_op_fini = vdev_draid_spare_fini, .vdev_op_open = vdev_draid_spare_open, .vdev_op_close = vdev_draid_spare_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_draid_spare_io_start, diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index a2cb6f9b9..f457669bc 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -313,7 +313,8 @@ vdev_ops_t vdev_file_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, 
.vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, @@ -343,7 +344,7 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 30d7340f7..b58b87d1f 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1867,7 +1867,8 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 9f3dfce01..a6aee9437 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -972,7 +972,8 @@ vdev_ops_t vdev_mirror_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, @@ -997,7 +998,8 @@ vdev_ops_t vdev_replacing_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, @@ -1022,7 +1024,8 @@ vdev_ops_t vdev_spare_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index 89786a1df..c62faef2d 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -85,7 +85,8 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, @@ -110,7 +111,8 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, - .vdev_op_asize = vdev_default_asize, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b62dc6b0b..62d9c9909 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2235,6 +2235,33 @@ vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) mutex_exit(&vdrz->vd_expand_lock); return 
(width); }

+/*
+ * This code converts an asize into the largest psize that can safely be
+ * written to an allocation of that size for this vdev.
+ *
+ * Note that this function will not take into account the effect of gang
+ * headers, which also modify the ASIZE of the DVAs.  It is purely the
+ * inverse of the psize_to_asize function.
+ */
+static uint64_t
+vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	vdev_raidz_t *vdrz = vd->vdev_tsd;
+	uint64_t psize;
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t cols = vdrz->vd_original_width;
+	uint64_t nparity = vdrz->vd_nparity;
+
+	cols = vdev_raidz_get_logical_width(vdrz, txg);
+
+	ASSERT0(asize % (1 << ashift));
+
+	psize = (asize >> ashift);
+	psize -= nparity * DIV_ROUND_UP(psize, cols);
+	psize <<= ashift;
+
+	return (psize);
+}

 /*
  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
@@ -2245,7 +2272,7 @@ vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
  */
 static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t asize;
@@ -2309,7 +2336,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
-	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
+	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
 	    BP_GET_BIRTH(zio->io_bp));

 	raidz_col_t *rc = &rr->rr_col[col];
@@ -5093,7 +5120,8 @@ vdev_ops_t vdev_raidz_ops = {
 	.vdev_op_fini = vdev_raidz_fini,
 	.vdev_op_open = vdev_raidz_open,
 	.vdev_op_close = vdev_raidz_close,
-	.vdev_op_asize = vdev_raidz_asize,
+	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
+	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
 	.vdev_op_min_asize = vdev_raidz_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_raidz_io_start,
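The conversion above is plain integer arithmetic over sector counts, and it pairs with vdev_raidz_psize_to_asize() in the same hunk. The following standalone sketch of the round trip uses hypothetical helper names, assumes a fixed (non-expanded) width and a sector-aligned psize, and ignores gang headers:

#include <assert.h>
#include <stdint.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define	ROUNDUP(n, m)		(DIV_ROUND_UP(n, m) * (m))

/* Simplified model of vdev_raidz_psize_to_asize(): add parity, then pad. */
static uint64_t
raidz_psize_to_asize(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t asize = psize >> ashift;			/* data sectors */
	asize += nparity * DIV_ROUND_UP(asize, cols - nparity);	/* parity rows */
	return (ROUNDUP(asize, nparity + 1) << ashift);		/* skip padding */
}

/* Simplified model of vdev_raidz_asize_to_psize(): strip parity/padding. */
static uint64_t
raidz_asize_to_psize(uint64_t asize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t psize = asize >> ashift;		/* total sectors */
	psize -= nparity * DIV_ROUND_UP(psize, cols);	/* parity per row */
	return (psize << ashift);
}

int
main(void)
{
	/*
	 * raidz1, 5 children, ashift=12 (4k sectors): a 16k block is 4
	 * data sectors plus 1 parity sector, padded to a multiple of
	 * nparity + 1 = 2, i.e. 6 sectors = 24k.  Converting 24k back
	 * strips ceil(6 / 5) = 2 parity/pad sectors, returning 16k.
	 */
	assert(raidz_psize_to_asize(16 << 10, 12, 5, 1) == (24 << 10));
	assert(raidz_asize_to_psize(24 << 10, 12, 5, 1) == (16 << 10));
	return (0);
}

As the function comment says, asize_to_psize is only an inverse up to padding: it yields the largest psize whose forward mapping still fits within the given asize.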
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index ea6f86993..21cb57e38 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -529,7 +529,7 @@ vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
 	    vd->vdev_ops == &vdev_spare_ops);

 	uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
-	    vdev_draid_asize_to_psize(vd, asize) : asize;
+	    vdev_draid_asize_to_psize(vd, asize, 0) : asize;

 	BP_ZERO(bp);
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index 8f6e49f25..21a81d6d2 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -147,7 +147,8 @@ vdev_ops_t vdev_root_ops = {
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_root_open,
 	.vdev_op_close = vdev_root_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = NULL,	/* not applicable to the root */
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index eb08a6eac..1769606eb 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1022,6 +1022,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 		zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
+		if (flags & ZIO_FLAG_PREALLOCATED) {
+			BP_ZERO_DVAS(zio->io_bp);
+			BP_SET_BIRTH(zio->io_bp, 0, 0);
+		}
 	}

 	zio->io_spa = spa;
@@ -3092,7 +3096,12 @@ zio_write_gang_member_ready(zio_t *zio)
 	if (BP_IS_HOLE(zio->io_bp))
 		return;

-	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+	/*
+	 * If we were invoked directly from zio_write_gang_block(), the
+	 * bp_orig will already be set.
+	 */
+	ASSERT(BP_IS_HOLE(&zio->io_bp_orig) ||
+	    zio->io_flags & ZIO_FLAG_PREALLOCATED);

 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
@@ -3134,7 +3143,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
-	uint64_t psize;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
@@ -3203,14 +3211,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	}

 	/*
-	 * Create and nowait the gang children.
+	 * Create and nowait the gang children. First, we try to do
+	 * opportunistic allocations. If that fails to generate enough
+	 * space, we fall back to normal zio_write calls for nested gang
+	 * blocks.
 	 */
-	for (int g = 0; resid != 0; resid -= psize, g++) {
-		psize = zio_roundup_alloc_size(spa,
-		    resid / (SPA_GBH_NBLKPTRS - g));
-		psize = MIN(resid, psize);
-		ASSERT3U(psize, >=, SPA_MINBLOCKSIZE);
-
+	for (int g = 0; resid != 0; g++) {
+		flags &= METASLAB_ASYNC_ALLOC;
+		flags |= METASLAB_GANG_CHILD;
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
@@ -3228,14 +3235,38 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);

-		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
-		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
-		    resid) : NULL, psize, psize, &zp,
-		    zio_write_gang_member_ready, NULL,
-		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
-		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+		uint64_t min_size = zio_roundup_alloc_size(spa,
+		    resid / (SPA_GBH_NBLKPTRS - g));
+		min_size = MIN(min_size, resid);
+		bp = &gbh->zg_blkptr[g];
+
+		zio_alloc_list_t cio_list;
+		metaslab_trace_init(&cio_list);
+		uint64_t allocated_size = UINT64_MAX;
+		error = metaslab_alloc_range(spa, mc, min_size, resid,
+		    bp, gio->io_prop.zp_copies, txg, NULL,
+		    flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
+
+		boolean_t allocated = error == 0;
+
+		uint64_t psize = allocated ?
+		    MIN(resid, allocated_size) : min_size;
+
+		zio_t *cio = zio_write(zio, spa, txg, bp, has_data ?
+		    abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL,
+		    psize, psize, &zp, zio_write_gang_member_ready, NULL,
+		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio) |
+		    (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark);
+
+		resid -= psize;

 		zio_gang_inherit_allocator(zio, cio);
+		if (allocated) {
+			metaslab_trace_move(&cio_list, &cio->io_alloc_list);
+			metaslab_group_alloc_increment_all(spa,
+			    &cio->io_bp_orig, zio->io_allocator, flags, psize,
+			    cio);
+		}

 		/*
 		 * We do not reserve for the child writes, since we already
 		 * reserved for the parent. Unreserve though will be called
@@ -4140,6 +4171,14 @@ zio_dva_allocate(zio_t *zio)
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
+	if (zio->io_flags & ZIO_FLAG_PREALLOCATED) {
+		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
+		memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
+		    3 * sizeof (dva_t));
+		BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
+		    BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+		return (zio);
+	}

 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
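The rewritten loop above sizes each gang child by whatever the allocator actually returns, between an even-split floor (min_size) and everything still unwritten (resid), rather than pre-dividing resid into equal children. Below is a toy model of just that sizing policy; toy_alloc_range() is a hypothetical stand-in for metaslab_alloc_range(), and the real code additionally falls back to a nested gang child when the range allocation fails.

#include <stdint.h>
#include <stdio.h>

#define	GBH_NBLKPTRS	3	/* blkptrs in a classic 512-byte gang header */

/*
 * Pretend allocator: grants some length in [min, max].  Here it offers
 * 40% of the maximum request, clamped up to the minimum.
 */
static uint64_t
toy_alloc_range(uint64_t min, uint64_t max)
{
	uint64_t got = max * 2 / 5;
	return (got < min ? min : got);
}

int
main(void)
{
	uint64_t resid = 128 << 10;	/* 128k logical write to gang up */

	for (int g = 0; resid != 0; g++) {
		/* Floor: an even split of what remains over the free slots. */
		uint64_t min_size = resid / (GBH_NBLKPTRS - g);
		uint64_t psize = toy_alloc_range(min_size, resid);
		if (psize > resid)
			psize = resid;
		printf("child %d: %llu bytes of %llu remaining\n", g,
		    (unsigned long long)psize, (unsigned long long)resid);
		resid -= psize;
	}
	return (0);
}

Because the floor at slot g is resid / (GBH_NBLKPTRS - g), the final slot always absorbs whatever remains, so a classic three-pointer header cannot run out of slots.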
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 9373b39a1..d7f3c75c7 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -726,7 +726,8 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
 tags = ['functional', 'features', 'large_dnode']

 [tests/functional/gang_blocks]
-tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
+tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
+    'gang_blocks_ddt_copies']
 tags = ['functional', 'gang_blocks']

 [tests/functional/grow]
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index ddd2d431a..6362a2606 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -428,6 +428,10 @@ tests = ['large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos',
     'large_dnode_007_neg']
 tags = ['functional', 'features', 'large_dnode']

+[tests/functional/gang_blocks]
+tests = ['gang_blocks_001_pos']
+tags = ['functional', 'gang_blocks']
+
 [tests/functional/grow]
 pre =
 post =
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index db1ef0d03..4c102b3aa 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1562,6 +1562,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
 	functional/gang_blocks/cleanup.ksh \
+	functional/gang_blocks/gang_blocks_001_pos.ksh \
 	functional/gang_blocks/gang_blocks_ddt_copies.ksh \
 	functional/gang_blocks/gang_blocks_redundant.ksh \
 	functional/gang_blocks/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh
new file mode 100755
index 000000000..3601f5422
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh
@@ -0,0 +1,59 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#

+#
+# Copyright (c) 2025 by Klara Inc.
+#

+#
+# Description:
+# Verify that gang block functionality behaves correctly.
+#
+# Strategy:
+# 1. Create a pool without dynamic gang headers.
+# 2. Set metaslab_force_ganging to force gang blocks to be created.
+# 3. Verify that gang blocks can be read, written, and freed.
+#

+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib

+log_assert "Gang blocks behave correctly."

+preamble
+log_onexit cleanup

+log_must zpool create -f $TESTPOOL $DISKS
+log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 100000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100

+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=128k count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -v hole | wc -l)
+[[ "$leaves" -gt 1 ]] || log_fail "Only one leaf in gang block, should not be possible"

+orig_checksum="$(cat $path | xxh128digest)"

+log_must verify_pool $TESTPOOL
+log_must zinject -a
+new_checksum="$(cat $path | xxh128digest)"
+[[ "$orig_checksum" == "$new_checksum" ]] || log_fail "Checksum mismatch"

+log_must rm $path
+log_must verify_pool $TESTPOOL

+log_pass "Gang blocks behave correctly."
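Taken together, the allocator-facing contract introduced at the top of this patch (the extra uint64_t and uint64_t * parameters on msop_alloc, mirrored by metaslab_alloc_range()) is: satisfy at least size, opportunistically extend the allocation up to max_size, and report the length actually taken through the out parameter, returning -1ULL on failure. Here is a minimal sketch of a conforming backend with hypothetical, simplified types (a single free segment standing in for the real range trees):

#include <stdint.h>

/* Hypothetical stand-in for metaslab_t: a single free segment. */
typedef struct toy_metaslab {
	uint64_t	tm_start;	/* first free byte */
	uint64_t	tm_end;		/* end of the free segment */
} toy_metaslab_t;

/*
 * Same shape as the new msop_alloc hook: return the offset of an
 * allocation of at least 'size' bytes, opportunistically extended up
 * to 'max_size', with the actual length stored in *found_size.
 * Returns -1ULL (and a zero *found_size) when nothing fits.
 */
static uint64_t
toy_msop_alloc(toy_metaslab_t *msp, uint64_t size, uint64_t max_size,
    uint64_t *found_size)
{
	uint64_t avail = msp->tm_end - msp->tm_start;

	if (avail < size) {
		*found_size = 0;
		return (-1ULL);
	}
	*found_size = (avail < max_size) ? avail : max_size;

	uint64_t offset = msp->tm_start;
	msp->tm_start += *found_size;
	return (offset);
}

Callers such as zio_write_gang_block() then treat the reported length as the child's psize; passing max_size equal to size degenerates to the old single-size behavior, which is exactly how metaslab_alloc() and metaslab_alloc_dva() are implemented as wrappers in this patch.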