mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
Implement allocation size ranges and use for gang leaves (#17111)
When forced to resort to ganging, ZFS currently allocates three child blocks, each one third of the size of the original. This is true regardless of whether larger allocations could be made, which would allow us to have fewer gang leaves. Making larger allocations where possible improves performance when fragmentation is high enough to require ganging, but not so high that all the free ranges are only just big enough to hold a third of the recordsize. It is also useful for improving the behavior of a future change to allow larger gang headers.

We add the ability for the allocation codepath to allocate a range of sizes instead of a single fixed size. We then use this to pre-allocate the DVAs for the gang children. If those allocations fail, we fall back to the normal write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
+16
-16
@@ -5436,12 +5436,12 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
|
||||
* +-------+-------+-------+-------+-------+
|
||||
*
|
||||
* Above, notice that the 4k block required one sector for parity and another
|
||||
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
|
||||
* and free properties will be adjusted by 8k. The dataset will not be charged
|
||||
* 8k. Rather, it will be charged a value that is scaled according to the
|
||||
* overhead of the 128k block on the same vdev. This 8k allocation will be
|
||||
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
|
||||
* calculated in the 128k block example above.
|
||||
* for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's
|
||||
* allocated and free properties will be adjusted by 8k. The dataset will not
|
||||
* be charged 8k. Rather, it will be charged a value that is scaled according
|
||||
* to the overhead of the 128k block on the same vdev. This 8k allocation will
|
||||
* be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is
|
||||
* as calculated in the 128k block example above.
|
||||
*
|
||||
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
|
||||
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
|
||||
@@ -5488,7 +5488,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
|
||||
* not necessarily equal to "blksize", due to RAIDZ deflation.
|
||||
*/
|
||||
static uint64_t
|
||||
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||
vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||
uint64_t blksize)
|
||||
{
|
||||
uint64_t asize, ndata;
|
||||
@@ -5508,7 +5508,7 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||
* size.
|
||||
*/
|
||||
static uint64_t
|
||||
vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||
vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
|
||||
uint64_t blksize)
|
||||
{
|
||||
ASSERT3U(ndisks, >, nparity);
|
||||
@@ -5568,12 +5568,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
|
||||
continue;
|
||||
|
||||
/* allocation size for the "typical" 128k block */
|
||||
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
|
||||
SPA_OLD_MAXBLOCKSIZE);
|
||||
tsize = vdev_raidz_psize_to_asize(ndisks, nparity,
|
||||
ashift, SPA_OLD_MAXBLOCKSIZE);
|
||||
|
||||
/* allocation size for the blksize block */
|
||||
asize = vdev_raidz_asize(ndisks, nparity, ashift,
|
||||
blksize);
|
||||
asize = vdev_raidz_psize_to_asize(ndisks, nparity,
|
||||
ashift, blksize);
|
||||
} else {
|
||||
uint64_t ndata;
|
||||
|
||||
@@ -5582,12 +5582,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
|
||||
continue;
|
||||
|
||||
/* allocation size for the "typical" 128k block */
|
||||
tsize = vdev_draid_asize(ndata + nparity, nparity,
|
||||
ashift, SPA_OLD_MAXBLOCKSIZE);
|
||||
tsize = vdev_draid_psize_to_asize(ndata + nparity,
|
||||
nparity, ashift, SPA_OLD_MAXBLOCKSIZE);
|
||||
|
||||
/* allocation size for the blksize block */
|
||||
asize = vdev_draid_asize(ndata + nparity, nparity,
|
||||
ashift, blksize);
|
||||
asize = vdev_draid_psize_to_asize(ndata + nparity,
|
||||
nparity, ashift, blksize);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user