mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-26 18:04:22 +03:00
panic in removal_remap test on 4K devices
If the zfs_remove_max_segment tunable is changed to be not a multiple of the sector size, then the device removal code will malfunction and try to create mappings that are smaller than one sector, leading to a panic. On debug bits this assertion will fail in spa_vdev_copy_segment(): ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); On nondebug, the system panics with a stack like: metaslab_free_concrete() metaslab_free_impl() metaslab_free_impl_cb() vdev_indirect_remap() free_from_removing_vdev() metaslab_free_impl() metaslab_free_dva() metaslab_free() Fortunately, the default for zfs_remove_max_segment is 1MB, so this can't occur by default. We hit it during this test because removal_remap.ksh changes zfs_remove_max_segment to 1KB. When testing on 4KB-sector disks, we hit the bug. This change makes the zfs_remove_max_segment tunable more robust, automatically rounding it up to a multiple of the sector size. We also turn some key assertions into VERIFY's so that similar bugs would be caught before they are encoded on disk (and thus avoid a panic-reboot-loop). Reviewed-by: Sean Eric Fagan <sef@ixsystems.com> Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> External-issue: DLPX-61342 Closes #8893
This commit is contained in:
parent
be89734a29
commit
53dce5acc6
@ -14,7 +14,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
|
* Copyright (c) 2014, 2019 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_VDEV_REMOVAL_H
|
#ifndef _SYS_VDEV_REMOVAL_H
|
||||||
@ -81,13 +81,13 @@ extern void spa_vdev_condense_suspend(spa_t *);
|
|||||||
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
|
extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
|
||||||
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
|
extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t);
|
||||||
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
|
extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *);
|
||||||
extern void svr_sync(spa_t *spa, dmu_tx_t *tx);
|
extern void svr_sync(spa_t *, dmu_tx_t *);
|
||||||
extern void spa_vdev_remove_suspend(spa_t *);
|
extern void spa_vdev_remove_suspend(spa_t *);
|
||||||
extern int spa_vdev_remove_cancel(spa_t *);
|
extern int spa_vdev_remove_cancel(spa_t *);
|
||||||
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
|
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *);
|
||||||
|
extern uint64_t spa_remove_max_segment(spa_t *);
|
||||||
|
|
||||||
extern int vdev_removal_max_span;
|
extern int vdev_removal_max_span;
|
||||||
extern int zfs_remove_max_segment;
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -2228,6 +2228,33 @@ pool cannot be returned to a healthy state prior to removing the device.
|
|||||||
Default value: \fB0\fR.
|
Default value: \fB0\fR.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_removal_suspend_progress\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
.sp
|
||||||
|
This is used by the test suite so that it can ensure that certain actions
|
||||||
|
happen while in the middle of a removal.
|
||||||
|
.sp
|
||||||
|
Default value: \fB0\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_remove_max_segment\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
.sp
|
||||||
|
The largest contiguous segment that we will attempt to allocate when removing
|
||||||
|
a device. This can be no larger than 16MB. If there is a performance
|
||||||
|
problem with attempting to allocate large blocks, consider decreasing this.
|
||||||
|
.sp
|
||||||
|
Default value: \fB16,777,216\fR (16MB).
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
|
@ -21,8 +21,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
|
||||||
* Copyright (c) 2017, Intel Corporation.
|
* Copyright (c) 2017, Intel Corporation.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -613,7 +612,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
|||||||
* zfs_remove_max_segment, so we need at least one entry
|
* zfs_remove_max_segment, so we need at least one entry
|
||||||
* per zfs_remove_max_segment of allocated data.
|
* per zfs_remove_max_segment of allocated data.
|
||||||
*/
|
*/
|
||||||
seg_count += to_alloc / zfs_remove_max_segment;
|
seg_count += to_alloc / spa_remove_max_segment(spa);
|
||||||
|
|
||||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
|
fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
|
||||||
seg_count *
|
seg_count *
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/zfs_context.h>
|
#include <sys/zfs_context.h>
|
||||||
@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
|
|||||||
* removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
|
* removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
|
||||||
* there is a performance problem with attempting to allocate large blocks,
|
* there is a performance problem with attempting to allocate large blocks,
|
||||||
* consider decreasing this.
|
* consider decreasing this.
|
||||||
|
*
|
||||||
|
* See also the accessor function spa_remove_max_segment().
|
||||||
*/
|
*/
|
||||||
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
|
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
|
||||||
|
|
||||||
@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
|
|||||||
vdev_indirect_mapping_entry_t *entry;
|
vdev_indirect_mapping_entry_t *entry;
|
||||||
dva_t dst = {{ 0 }};
|
dva_t dst = {{ 0 }};
|
||||||
uint64_t start = range_tree_min(segs);
|
uint64_t start = range_tree_min(segs);
|
||||||
|
ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
|
||||||
|
|
||||||
ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
|
ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
|
||||||
|
ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
|
||||||
|
|
||||||
uint64_t size = range_tree_span(segs);
|
uint64_t size = range_tree_span(segs);
|
||||||
if (range_tree_span(segs) > maxalloc) {
|
if (range_tree_span(segs) > maxalloc) {
|
||||||
@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ASSERT3U(size, <=, maxalloc);
|
ASSERT3U(size, <=, maxalloc);
|
||||||
|
ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* An allocation class might not have any remaining vdevs or space
|
* An allocation class might not have any remaining vdevs or space
|
||||||
@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* We can't have any padding of the allocated size, otherwise we will
|
* We can't have any padding of the allocated size, otherwise we will
|
||||||
* misunderstand what's allocated, and the size of the mapping.
|
* misunderstand what's allocated, and the size of the mapping. We
|
||||||
* The caller ensures this will be true by passing in a size that is
|
* prevent padding by ensuring that all devices in the pool have the
|
||||||
* aligned to the worst (highest) ashift in the pool.
|
* same ashift, and the allocation size is a multiple of the ashift.
|
||||||
*/
|
*/
|
||||||
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
|
VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
|
||||||
|
|
||||||
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
|
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
|
||||||
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
|
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
|
||||||
@ -1363,6 +1368,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
|
|||||||
range_tree_destroy(segs);
|
range_tree_destroy(segs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The size of each removal mapping is limited by the tunable
|
||||||
|
* zfs_remove_max_segment, but we must adjust this to be a multiple of the
|
||||||
|
* pool's ashift, so that we don't try to split individual sectors regardless
|
||||||
|
* of the tunable value. (Note that device removal requires that all devices
|
||||||
|
* have the same ashift, so there's no difference between spa_min_ashift and
|
||||||
|
* spa_max_ashift.) The raw tunable should not be used elsewhere.
|
||||||
|
*/
|
||||||
|
uint64_t
|
||||||
|
spa_remove_max_segment(spa_t *spa)
|
||||||
|
{
|
||||||
|
return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The removal thread operates in open context. It iterates over all
|
* The removal thread operates in open context. It iterates over all
|
||||||
* allocated space in the vdev, by loading each metaslab's spacemap.
|
* allocated space in the vdev, by loading each metaslab's spacemap.
|
||||||
@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg)
|
|||||||
spa_t *spa = arg;
|
spa_t *spa = arg;
|
||||||
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
|
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
|
||||||
vdev_copy_arg_t vca;
|
vdev_copy_arg_t vca;
|
||||||
uint64_t max_alloc = zfs_remove_max_segment;
|
uint64_t max_alloc = spa_remove_max_segment(spa);
|
||||||
uint64_t last_txg = 0;
|
uint64_t last_txg = 0;
|
||||||
|
|
||||||
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
|
||||||
@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg)
|
|||||||
vd = vdev_lookup_top(spa, svr->svr_vdev_id);
|
vd = vdev_lookup_top(spa, svr->svr_vdev_id);
|
||||||
|
|
||||||
if (txg != last_txg)
|
if (txg != last_txg)
|
||||||
max_alloc = zfs_remove_max_segment;
|
max_alloc = spa_remove_max_segment(spa);
|
||||||
last_txg = txg;
|
last_txg = txg;
|
||||||
|
|
||||||
spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
|
spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
|
||||||
|
Loading…
Reference in New Issue
Block a user