OpenZFS 9318 - vol_volsize_to_reservation does not account for raidz skip blocks
When a volume is created in a pool with raidz vdevs and
volblocksize != 128k, the volume can reference more space than is
reserved with the automatically calculated refreservation.  There
are two deficiencies in vol_volsize_to_reservation that contribute
to this:
  1) Skip blocks may be added to keep each allocation a multiple
     of parity + 1. This is the dominating factor when volblocksize
     is close to 2^ashift.
  2) raidz deflation for 128 KB blocks differs from that used for
     most other block sizes.
See "The theory of raidz space accounting" comment in
libzfs_dataset.c for a full explanation.
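As a rough worked example (drawn from the theory comment added below,
assuming a 5-disk raidz1 vdev with ashift=12): an 8k block occupies two
4k data sectors plus one parity sector, plus one skip sector to round the
allocation up to a multiple of parity + 1.  The pool therefore allocates
16k, and the volume is charged 16k * 128k / 160k = 12.8k of referenced
space, 60% more than the 8k per block that the old calculation reserved.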
Authored by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Mike Gerdts <mike.gerdts@joyent.com>
Porting Notes:
* ZTS: wait for zvols to exist before writing
* ZTS: use log_must_busy with {zpool|zfs} destroy
OpenZFS-issue: https://www.illumos.org/issues/9318
OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/b73ccab0
Closes #8973
			
			
This commit is contained in:
		
							parent
							
								
									350646563f
								
							
						
					
					
						commit
						f3f46b0e45
					
cmd/zfs/zfs_main.c
@@ -29,6 +29,7 @@
  * Copyright 2016 Nexenta Systems, Inc.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ * Copyright 2019 Joyent, Inc.
  */
 
 #include <assert.h>
@@ -992,10 +993,11 @@ zfs_do_create(int argc, char **argv)
 			zpool_close(zpool_handle);
 			goto error;
 		}
-		zpool_close(zpool_handle);
 
-		volsize = zvol_volsize_to_reservation(volsize, real_props);
+		volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
+		    real_props);
 		nvlist_free(real_props);
+		zpool_close(zpool_handle);
 
 		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
 		    &strval) != 0) {
include/libzfs.h
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright Joyent, Inc.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2016, Intel Corporation.
  * Copyright 2016 Nexenta Systems, Inc.
@@ -687,7 +687,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
 extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
 extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
-extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
+extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
+    nvlist_t *);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
     uid_t rid, uint64_t space);
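Because the prototype above now takes a zpool_handle_t, a libzfs consumer
must supply an open pool handle when computing a reservation.  A minimal
sketch of a hypothetical caller (proposed_refreservation is not a libzfs
function; error handling is elided):

#include <libzfs.h>

/* Compute the auto refreservation for a proposed zvol on an existing pool. */
static uint64_t
proposed_refreservation(libzfs_handle_t *hdl, const char *pool,
    uint64_t volsize, uint64_t volblocksize)
{
	zpool_handle_t *zph = zpool_open(hdl, pool);
	nvlist_t *props = fnvlist_alloc();
	uint64_t resv;

	fnvlist_add_uint64(props,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), volblocksize);
	resv = zvol_volsize_to_reservation(zph, volsize, props);
	fnvlist_free(props);
	zpool_close(zph);

	return (resv);
}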
lib/libzfs/libzfs_dataset.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2012 DEY Storage Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
@@ -1618,6 +1618,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 	uint64_t new_reservation;
 	zfs_prop_t resv_prop;
 	nvlist_t *props;
+	zpool_handle_t *zph = zpool_handle(zhp);
 
 	/*
 	 * If this is an existing volume, and someone is setting the volsize,
@@ -1632,7 +1633,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 	fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
 
-	if ((zvol_volsize_to_reservation(old_volsize, props) !=
+	if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
 	    old_reservation) || nvlist_exists(nvl,
 	    zfs_prop_to_name(resv_prop))) {
 		fnvlist_free(props);
@@ -1643,7 +1644,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 		fnvlist_free(props);
 		return (-1);
 	}
-	new_reservation = zvol_volsize_to_reservation(new_volsize, props);
+	new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
 	fnvlist_free(props);
 
 	if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
@@ -1698,7 +1699,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 		volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 	}
 
-	resvsize = zvol_volsize_to_reservation(volsize, props);
+	resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
+	    props);
 	fnvlist_free(props);
 
 	(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
@@ -5363,12 +5365,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 }
 
 /*
- * Convert the zvol's volume size to an appropriate reservation.
+ * The theory of raidz space accounting
+ *
+ * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
+ * will "reference" 128KB, even though it allocates more than that, to store the
+ * parity information (and perhaps skip sectors). This concept of the
+ * "referenced" (and other DMU space accounting) being lower than the allocated
+ * space by a constant factor is called "raidz deflation."
+ *
+ * As mentioned above, the constant factor for raidz deflation assumes a 128KB
+ * block size. However, zvols typically have a much smaller block size (default
+ * 8KB). These smaller blocks may require proportionally much more parity
+ * information (and perhaps skip sectors). In this case, the change to the
+ * "referenced" property may be much more than the logical block size.
+ *
+ * Suppose a raidz vdev has 5 disks with ashift=12.  A 128k block may be written
+ * as follows.
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * |  P0   |  D0   |  D8   |  D16  |  D24  |
+ * |  P1   |  D1   |  D9   |  D17  |  D25  |
+ * |  P2   |  D2   |  D10  |  D18  |  D26  |
+ * |  P3   |  D3   |  D11  |  D19  |  D27  |
+ * |  P4   |  D4   |  D12  |  D20  |  D28  |
+ * |  P5   |  D5   |  D13  |  D21  |  D29  |
+ * |  P6   |  D6   |  D14  |  D22  |  D30  |
+ * |  P7   |  D7   |  D15  |  D23  |  D31  |
+ * +-------+-------+-------+-------+-------+
+ *
+ * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
+ * sectors.  The dataset's referenced will increase by 128k and the pool's
+ * allocated and free properties will be adjusted by 160k.
+ *
+ * A 4k block written to the same raidz vdev will require two 4k sectors.  The
+ * blank cells represent unallocated space.
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * |  P0   |  D0   |       |       |       |
+ * +-------+-------+-------+-------+-------+
+ *
+ * Above, notice that the 4k block required one sector for parity and another
+ * for data.  vdev_raidz_asize() will return 8k and as such the pool's allocated
+ * and free properties will be adjusted by 8k.  The dataset will not be charged
+ * 8k.  Rather, it will be charged a value that is scaled according to the
+ * overhead of the 128k block on the same vdev.  This 8k allocation will be
+ * charged 8k * 128k / 160k.  128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
+ * calculated in the 128k block example above.
+ *
+ * Every raidz allocation is sized to be a multiple of nparity+1 sectors.  That
+ * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
+ * allocations are a multiple of 3 sectors, and raidz3 allocations are a
+ * multiple of 4 sectors.  When a block does not fill the required number of
+ * sectors, skip blocks (sectors) are used.
+ *
+ * An 8k block being written to a raidz vdev may be written as follows:
+ *
+ * +-------+-------+-------+-------+-------+
+ * | disk1 | disk2 | disk3 | disk4 | disk5 |
+ * +-------+-------+-------+-------+-------+
+ * |  P0   |  D0   |  D1   |  S0   |       |
+ * +-------+-------+-------+-------+-------+
+ *
+ * In order to maintain the nparity+1 allocation size, a skip block (S0) was
+ * added.  For this 8k block, the pool's allocated and free properties are
+ * adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
+ * 160k.  Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
+ * the 128k block example above.
+ *
+ * Compression may lead to a variety of block sizes being written for the same
+ * volume or file.  There is no clear way to reserve just the amount of space
+ * that will be required, so the worst case (no compression) is assumed.
+ * Note that metadata blocks will typically be compressed, so the reservation
+ * size returned by zvol_volsize_to_reservation() will generally be slightly
+ * larger than the maximum that the volume can reference.
+ */
+
+/*
+ * Derived from function of same name in module/zfs/vdev_raidz.c.  Returns the
+ * amount of space (in bytes) that will be allocated for the specified block
+ * size. Note that the "referenced" space accounted will be less than this, but
+ * not necessarily equal to "blksize", due to RAIDZ deflation.
+ */
+static uint64_t
+vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+    uint64_t blksize)
+{
+	uint64_t asize, ndata;
+
+	ASSERT3U(ndisks, >, nparity);
+	ndata = ndisks - nparity;
+	asize = ((blksize - 1) >> ashift) + 1;
+	asize += nparity * ((asize + ndata - 1) / ndata);
+	asize = roundup(asize, nparity + 1) << ashift;
+
+	return (asize);
+}
+
+/*
+ * Determine how much space will be allocated if it lands on the most space-
+ * inefficient top-level vdev.  Returns the size in bytes required to store one
+ * copy of the volume data.  See theory comment above.
+ */
+static uint64_t
+volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
+{
+	nvlist_t *config, *tree, **vdevs;
+	uint_t nvdevs, v;
+	uint64_t ret = 0;
+
+	config = zpool_get_config(zhp, NULL);
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
+	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
+	    &vdevs, &nvdevs) != 0) {
+		return (nblocks * blksize);
+	}
+
+	for (v = 0; v < nvdevs; v++) {
+		char *type;
+		uint64_t nparity, ashift, asize, tsize;
+		nvlist_t **disks;
+		uint_t ndisks;
+		uint64_t volsize;
+
+		if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
+		    &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
+		    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
+		    &nparity) != 0 ||
+		    nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
+		    &ashift) != 0 ||
+		    nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
+		    &disks, &ndisks) != 0) {
+			continue;
+		}
+
+		/* allocation size for the "typical" 128k block */
+		tsize = vdev_raidz_asize(ndisks, nparity, ashift,
+		    SPA_OLD_MAXBLOCKSIZE);
+		/* allocation size for the blksize block */
+		asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
+
+		/*
+		 * Scale this size down as a ratio of 128k / tsize.  See theory
+		 * statement above.
+		 */
+		volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
+		if (volsize > ret) {
+			ret = volsize;
+		}
+	}
+
+	if (ret == 0) {
+		ret = nblocks * blksize;
+	}
+
+	return (ret);
+}
+
+/*
+ * Convert the zvol's volume size to an appropriate reservation.  See theory
+ * comment above.
+ *
  * Note: If this routine is updated, it is necessary to update the ZFS test
- * suite's shell version in reservation.kshlib.
+ * suite's shell version in reservation.shlib.
  */
 uint64_t
-zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
+zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
+    nvlist_t *props)
 {
 	uint64_t numdb;
 	uint64_t nblocks, volblocksize;
@@ -5384,7 +5550,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    &volblocksize) != 0)
 		volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
-	nblocks = volsize/volblocksize;
+
+	nblocks = volsize / volblocksize;
+	/*
+	 * Metadata defaults to using 128k blocks, not volblocksize blocks.  For
+	 * this reason, only the data blocks are scaled based on vdev config.
+	 */
+	volsize = volsize_from_vdevs(zph, nblocks, volblocksize);
+
 	/* start with metadnode L0-L6 */
 	numdb = 7;
 	/* calculate number of indirects */
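To make the scaling above concrete, the following standalone sketch (not
part of the commit) reimplements the vdev_raidz_asize() arithmetic with a
local roundup helper and reproduces the numbers from the 5-disk, ashift=12
example in the theory comment: 160k allocated for a 128k block, 16k
allocated for an 8k block, and a referenced charge of 16k * 128k / 160k =
12.8k.

#include <stdio.h>
#include <stdint.h>

/* Round x up to a multiple of m; stands in for the roundup() macro above. */
static uint64_t
roundup_u64(uint64_t x, uint64_t m)
{
	return (((x + m - 1) / m) * m);
}

/* Same arithmetic as vdev_raidz_asize() in the diff above. */
static uint64_t
raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
    uint64_t blksize)
{
	uint64_t ndata = ndisks - nparity;
	uint64_t asize = ((blksize - 1) >> ashift) + 1;

	asize += nparity * ((asize + ndata - 1) / ndata);
	return (roundup_u64(asize, nparity + 1) << ashift);
}

int
main(void)
{
	/* 5-disk raidz1 with 4k sectors, as in the theory comment. */
	uint64_t tsize = raidz_asize(5, 1, 12, 128 << 10);
	uint64_t asize = raidz_asize(5, 1, 12, 8 << 10);

	/* prints 160 */
	printf("128k block allocates %llu KiB\n",
	    (unsigned long long)(tsize >> 10));
	/* prints 16 */
	printf("8k block allocates %llu KiB\n",
	    (unsigned long long)(asize >> 10));
	/* prints 12.8 */
	printf("8k block charged %.1f KiB referenced\n",
	    (double)asize * (128 << 10) / tsize / 1024.0);

	return (0);
}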
tests/runfiles/linux.run
@@ -755,7 +755,8 @@ tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
-    'refreserv_004_pos', 'refreserv_005_pos']
+    'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
+    'refreserv_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
tests/zfs-tests/tests/functional/refreserv/Makefile.am
@@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \
 	refreserv_002_pos.ksh \
 	refreserv_003_pos.ksh \
 	refreserv_004_pos.ksh \
-	refreserv_005_pos.ksh
+	refreserv_005_pos.ksh \
+	refreserv_multi_raidz.ksh \
+	refreserv_raidz.ksh
 
 dist_pkgdata_DATA = \
 	refreserv.cfg
tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh (new executable file, 197 lines)
@@ -0,0 +1,197 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright 2019 Joyent, Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/refreserv/refreserv.cfg

#
# DESCRIPTION:
#	raidz refreservation=auto picks worst raidz vdev
#
# STRATEGY:
#	1. Create a pool with a single raidz vdev
#	2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
#	    - create a volume
#	    - remember its refreservation
#	    - destroy the volume
#	3. Destroy the pool
#	4. Recreate the pool with one more disk in the vdev, then repeat steps
#	   2 and 3.
#
# NOTES:
#	1. This test will use up to 14 disks but can cover the key concepts with
#	   5 disks.
#	2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
#

verify_runnable "global"

typeset -a alldisks=($DISKS)

# The larger the volsize, the better zvol_volsize_to_reservation() is at
# guessing the right number - though it is horrible with tiny blocks.  At 10M on
# ashift=12, the estimate may be over 26% too high.
volsize=100

function cleanup
{
	default_cleanup_noexit
	default_setup_noexit "${alldisks[0]}"
}

log_assert "raidz refreservation=auto picks worst raidz vdev"
log_onexit cleanup

poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"

# Testing tiny block sizes on ashift=12 pools causes so much size inflation
# that small test disks may fill before creating small volumes.  However,
# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
case "$bps" in
512)
	allshifts=(9 10 17)
	;;
4096)
	allshifts=(12 13 17)
	;;
*)
	log_fail "bytes/sector: $bps != (512|4096)"
	;;
esac
log_note "Testing in ashift=${allshifts[0]} mode"

typeset -A sizes=

#
# Determine the refreservation for a $volsize MiB volume on each raidz type at
# various block sizes.
#
for parity in 1 2 3; do
	raid=raidz$parity
	typeset -A sizes["$raid"]

	# Ensure we hit scenarios with and without skip blocks
	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})

		if (( ${#disks[@]} < ndisks )); then
			log_note "Too few disks to test $raid-$ndisks"
			continue
		fi

		typeset -A sizes["$raid"]["$ndisks"]

		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"

		for bits in "${allshifts[@]}"; do
			vbs=$((1 << bits))
			log_note "Gathering refreservation for $raid-$ndisks" \
			    "volblocksize=$vbs"

			vol=$TESTPOOL/$TESTVOL
			log_must zfs create -V ${volsize}m \
			    -o volblocksize=$vbs "$vol"

			refres=$(zfs get -Hpo value refreservation "$vol")
			log_must test -n "$refres"
			sizes["$raid"]["$ndisks"]["$vbs"]=$refres

			log_must_busy zfs destroy "$vol"
		done

		log_must_busy zpool destroy "$TESTPOOL"
	done
done

# A little extra info is always helpful when diagnosing problems.  To
# pretty-print what you find in the log, do this in ksh:
#   typeset -A sizes=(...)
#   print -v sizes
log_note "sizes=$(print -C sizes)"

#
# Helper function for checking that refreservation is calculated properly in
# multi-vdev pools.  "Properly" is defined as assuming that all vdevs are as
# space inefficient as the worst one.
#
function check_vdevs {
	typeset raid=$1
	typeset nd1=$2
	typeset nd2=$3
	typeset -a disks1 disks2
	typeset vbs vol refres refres1 refres2 expect

	disks1=(${alldisks[0..$((nd1 - 1))]})
	disks2=(${alldisks[$nd1..$((nd1 + nd2 - 1))]})
	if (( ${#disks2[@]} < nd2 )); then
		log_note "Too few disks to test $raid-$nd1 + $raid-$nd2"
		return
	fi

	log_must zpool create -f "$TESTPOOL" \
	    "$raid" "${disks1[@]}" "$raid" "${disks2[@]}"

	for bits in "${allshifts[@]}"; do
		vbs=$((1 << bits))
		log_note "Verifying $raid-$nd1 $raid-$nd2 volblocksize=$vbs"

		vol=$TESTPOOL/$TESTVOL
		log_must zfs create -V ${volsize}m -o volblocksize=$vbs "$vol"
		refres=$(zfs get -Hpo value refreservation "$vol")
		log_must test -n "$refres"

		refres1=${sizes["$raid"]["$nd1"]["$vbs"]}
		refres2=${sizes["$raid"]["$nd2"]["$vbs"]}

		if (( refres1 > refres2 )); then
			log_note "Expecting refres ($refres) to match refres" \
			   "from $raid-$nd1 ($refres1)"
			log_must test "$refres" -eq "$refres1"
		else
			log_note "Expecting refres ($refres) to match refres" \
			   "from $raid-$nd2 ($refres2)"
			log_must test "$refres" -eq "$refres2"
		fi

		log_must zfs destroy "$vol"
	done

	log_must zpool destroy "$TESTPOOL"
}

#
# Verify that multi-vdev pools use the least optimistic size for all the
# permutations within a particular raidz variant.
#
for raid in "${!sizes[@]}"; do
	# ksh likes to create a [0] item for us.  Thanks, ksh!
	[[ $raid == "0" ]] && continue

	for nd1 in "${!sizes["$raid"][@]}"; do
		# And with an empty array we get one key, ''.  Thanks, ksh!
		[[ $nd1 == "0" || -z "$nd1" ]] && continue

		for nd2 in "${!sizes["$raid"][@]}"; do
			[[ $nd2 == "0" || -z "$nd2" ]] && continue

			check_vdevs "$raid" "$nd1" "$nd2"
		done
	done
done

log_pass "raidz refreservation=auto picks worst raidz vdev"
tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh (new executable file, 130 lines)
@@ -0,0 +1,130 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright 2019 Joyent, Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/refreserv/refreserv.cfg

#
# DESCRIPTION:
#	raidz refreservation=auto accounts for extra parity and skip blocks
#
# STRATEGY:
#	1. Create a pool with a single raidz vdev
#	2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k]
#	    - create a volume
#	    - fully overwrite it
#	    - verify that referenced is less than or equal to reservation
#	    - destroy the volume
#	3. Destroy the pool
#	4. Recreate the pool with one more disk in the vdev, then repeat steps
#	   2 and 3.
#	5. Repeat all steps above for raidz2 and raidz3.
#
# NOTES:
#	1. This test will use up to 14 disks but can cover the key concepts with
#	   5 disks.
#	2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely.
#

verify_runnable "global"

typeset -a alldisks=($DISKS)

# The larger the volsize, the better zvol_volsize_to_reservation() is at
# guessing the right number.  At 10M on ashift=12, the estimate may be over 26%
# too high.
volsize=100

function cleanup
{
	default_cleanup_noexit
	default_setup_noexit "${alldisks[0]}"
}

log_assert "raidz refreservation=auto accounts for extra parity and skip blocks"
log_onexit cleanup

poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"

# Testing tiny block sizes on ashift=12 pools causes so much size inflation
# that small test disks may fill before creating small volumes.  However,
# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for
# testing the problems that arise from 4K and 8K blocks on ashift=12 pools.
bps=$(lsblk -nrdo min-io /dev/${alldisks[0]})
log_must test "$bps" -eq 512 -o "$bps" -eq 4096
case "$bps" in
512)
	allshifts=(9 10 17)
	maxpct=151
	;;
4096)
	allshifts=(12 13 17)
	maxpct=110
	;;
*)
	log_fail "bytes/sector: $bps != (512|4096)"
	;;
esac
log_note "Testing in ashift=${allshifts[0]} mode"

# This loop handles all iterations of steps 1 through 4 described in the
# strategy comment above.
for parity in 1 2 3; do
	raid=raidz$parity

	# Ensure we hit scenarios with and without skip blocks
	for ndisks in $((parity * 2)) $((parity * 2 + 1)); do
		typeset -a disks=(${alldisks[0..$((ndisks - 1))]})

		if (( ${#disks[@]} < ndisks )); then
			log_note "Too few disks to test $raid-$ndisks"
			continue
		fi

		log_must zpool create "$TESTPOOL" "$raid" "${disks[@]}"

		for bits in "${allshifts[@]}"; do
			vbs=$((1 << bits))
			log_note "Testing $raid-$ndisks volblocksize=$vbs"

			vol=$TESTPOOL/$TESTVOL
			log_must zfs create -V ${volsize}m \
			    -o volblocksize=$vbs "$vol"
			block_device_wait "/dev/zvol/$vol"
			log_must dd if=/dev/zero of=/dev/zvol/$vol \
			    bs=1024k count=$volsize
			sync

			ref=$(zfs get -Hpo value referenced "$vol")
			refres=$(zfs get -Hpo value refreservation "$vol")
			log_must test -n "$ref"
			log_must test -n "$refres"

			typeset -F2 deltapct=$((refres * 100.0 / ref))
			log_note "$raid-$ndisks refreservation $refres" \
			    "is $deltapct% of referenced $ref"

			log_must test "$ref" -le "$refres"
			log_must test "$deltapct" -le $maxpct

			log_must_busy zfs destroy "$vol"
		done

		log_must_busy zpool destroy "$TESTPOOL"
	done
done

log_pass "raidz refreservation=auto accounts for extra parity and skip blocks"