OpenZFS 9426 - metaslab size can exceed offset addressable by spacemap

Authored by: Don Brady <don.brady@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
OpenZFS-issue: https://www.illumos.org/issues/9426
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f1c88afb1
Closes #7700
2025-09-18 07:06:22 +03:00 · 2017-08-11 15:28:17 -06:00 · 2017-08-11 15:28:17 -06:00 · e4e94ca315
commit e4e94ca315
parent e902ddb0f8
2 changed files with 67 additions and 28 deletions
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -296,7 +296,7 @@ Use \fB1\fR for yes (default) and \fB0\fR for no.
 \fBvdev_max_ms_count\fR (int)
 .ad
 .RS 12n
-When a vdev is added, it will be divided into approximately (but no more than) this number of metaslabs.
+When a vdev is added, target this number of metaslabs per top-level vdev.
 .sp
 Default value: \fB200\fR.
 .RE
@@ -312,6 +312,17 @@ Minimum number of metaslabs to create in a top-level vdev.
 Default value: \fB16\fR.
 .RE
 .sp
 .ne 2
 .na
 \fBvdev_ms_count_limit\fR (int)
 .ad
 .RS 12n
 Practical upper limit of total metaslabs per top-level vdev.
 .sp
 Default value: \fB131,072\fR.
 .RE
 .sp
 .ne 2
 .na
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -52,15 +52,21 @@
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
-/* maximum number of metaslabs per top-level vdev */
+/* target number of metaslabs per top-level vdev */
 int vdev_max_ms_count = 200;
-/* minimum amount of metaslabs per top-level vdev */
+/* minimum number of metaslabs per top-level vdev */
 int vdev_min_ms_count = 16;
-/* see comment in vdev_metaslab_set_size() */
+/* practical upper limit of total metaslabs per top-level vdev */
 int vdev_ms_count_limit = 1ULL << 17;
 /* lower limit for metaslab size (512M) */
 int vdev_default_ms_shift = 29;
 /* upper limit for metaslab size (256G) */
 int vdev_max_ms_shift = 38;
 int vdev_validate_skip = B_FALSE;
 /*
@@ -2130,34 +2136,53 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_shift = 0;
+	uint64_t ms_count = asize >> vdev_default_ms_shift;
 	uint64_t ms_shift;
 	/*
-	 * For vdevs that are bigger than 8G the metaslab size varies in
+	 * There are two dimensions to the metaslab sizing calculation:
-	 * a way that the number of metaslabs increases in powers of two,
+	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * linearly in terms of vdev_asize, starting from 16 metaslabs.
+	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32,
+	 * ranges of the dimensions are as follows:
-	 * and so on, until we hit the maximum metaslab count limit
+	 *
-	 * [vdev_max_ms_count] from which point the metaslab count stays
+	 *	2^29 <= ms_size  <= 2^38
-	 * the same.
+	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslab sizes of
 	 * at least 512MB (2^29) to minimize fragmentation effects when
 	 * testing with smaller devices.  However, the count constraint
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
 	 * size of 256GB.  However, we will cap the total count to 2^17
 	 * metaslabs to keep our memory footprint in check.
 	 *
 	 * The net effect of applying the above constraints is summarized below.
 	 *
 	 *	vdev size	metaslab count
 	 *	-------------|-----------------
 	 *	< 8GB		~16
 	 *	8GB - 100GB	one per 512MB
 	 *	100GB - 50TB	~200
 	 *	50TB - 32PB	one per 256GB
 	 *	> 32PB		~131,072
 	 *	-------------------------------
 	 */
 	if (ms_count < vdev_min_ms_count)
 		ms_shift = highbit64(asize / vdev_min_ms_count);
 	else if (ms_count > vdev_max_ms_count)
 		ms_shift = highbit64(asize / vdev_max_ms_count);
 	else
 		ms_shift = vdev_default_ms_shift;
-	if ((asize >> ms_shift) < vdev_min_ms_count) {
+	if (ms_shift < SPA_MAXBLOCKSHIFT) {
-		/*
+		ms_shift = SPA_MAXBLOCKSHIFT;
-		 * For devices that are less than 8G we want to have
+	} else if (ms_shift > vdev_max_ms_shift) {
-		 * exactly 16 metaslabs. We don't want less as integer
+		ms_shift = vdev_max_ms_shift;
-		 * division rounds down, so less metaslabs mean more
+		/* cap the total count to constrain memory footprint */
-		 * wasted space. We don't want more as these vdevs are
+		if ((asize >> ms_shift) > vdev_ms_count_limit)
-		 * small and in the likely event that we are running
+			ms_shift = highbit64(asize / vdev_ms_count_limit);
-		 * out of space, the SPA will have a hard time finding
-		 * space due to fragmentation.
-		 */
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-		ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
-	} else if ((asize >> ms_shift) > vdev_max_ms_count) {
-		ms_shift = highbit64(asize / vdev_max_ms_count);
 	}
 	vd->vdev_ms_shift = ms_shift;
@@ -4392,13 +4417,16 @@ EXPORT_SYMBOL(vdev_clear);
 /* BEGIN CSTYLED */
 module_param(vdev_max_ms_count, int, 0644);
 MODULE_PARM_DESC(vdev_max_ms_count,
-	"Divide added vdev into approximately (but no more than) this number "
+	"Target number of metaslabs per top-level vdev");
-	"of metaslabs");
 module_param(vdev_min_ms_count, int, 0644);
 MODULE_PARM_DESC(vdev_min_ms_count,
 	"Minimum number of metaslabs per top-level vdev");
 module_param(vdev_ms_count_limit, int, 0644);
 MODULE_PARM_DESC(vdev_ms_count_limit,
 	"Practical upper limit of total metaslabs per top-level vdev");
 module_param(zfs_delays_per_second, uint, 0644);
 MODULE_PARM_DESC(zfs_delays_per_second, "Rate limit delay events to this many "
 	"IO delays per second");