mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Illumos 4958 zdb trips assert on pools with ashift >= 0xe
4958 zdb trips assert on pools with ashift >= 0xe
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
References:
https://www.illumos.org/issues/4958
https://github.com/illumos/illumos-gate/commit/2a104a5
Porting notes: Keep the ZIO_FLAG_FASTWRITE define. This is for a feature present in Linux but not yet in *BSD.
Ported by: Turbo Fredriksson <turbo@bayour.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2697
This commit is contained in:
committed by
Brian Behlendorf
parent
adc90e9d94
commit
b02fe35d37
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
||||
*/
|
||||
|
||||
|
||||
+35
-4
@@ -64,6 +64,21 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
|
||||
*/
|
||||
int zfs_condense_pct = 200;
|
||||
|
||||
/*
|
||||
* Condensing a metaslab is not guaranteed to actually reduce the amount of
|
||||
* space used on disk. In particular, a space map uses data in increments of
|
||||
* MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
|
||||
* same number of blocks after condensing. Since the goal of condensing is to
|
||||
* reduce the number of IOPs required to read the space map, we only want to
|
||||
* condense when we can be sure we will reduce the number of blocks used by the
|
||||
* space map. Unfortunately, we cannot precisely compute whether or not this is
|
||||
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
|
||||
* we apply the following heuristic: do not condense a spacemap unless the
|
||||
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
|
||||
* blocks.
|
||||
*/
|
||||
int zfs_metaslab_condense_block_threshold = 4;
|
||||
|
||||
/*
|
||||
* The zfs_mg_noalloc_threshold defines which metaslab groups should
|
||||
* be eligible for allocation. The value is defined as a percentage of
|
||||
@@ -1633,6 +1648,8 @@ metaslab_group_preload(metaslab_group_t *mg)
|
||||
* times the size than the free space range tree representation
|
||||
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
|
||||
*
|
||||
* 3. The on-disk size of the space map should actually decrease.
|
||||
*
|
||||
* Checking the first condition is tricky since we don't want to walk
|
||||
* the entire AVL tree calculating the estimated on-disk size. Instead we
|
||||
* use the size-ordered range tree in the metaslab and calculate the
|
||||
@@ -1643,13 +1660,21 @@ metaslab_group_preload(metaslab_group_t *mg)
|
||||
* To determine the second criterion we use a best-case estimate and assume
|
||||
* each segment can be represented on-disk as a single 64-bit entry. We refer
|
||||
* to this best-case estimate as the space map's minimal form.
|
||||
*
|
||||
* Unfortunately, we cannot compute the on-disk size of the space map in this
|
||||
* context because we cannot accurately compute the effects of compression, etc.
|
||||
* Instead, we apply the heuristic described in the block comment for
|
||||
* zfs_metaslab_condense_block_threshold - we only condense if the space used
|
||||
* is greater than a threshold number of blocks.
|
||||
*/
|
||||
static boolean_t
|
||||
metaslab_should_condense(metaslab_t *msp)
|
||||
{
|
||||
space_map_t *sm = msp->ms_sm;
|
||||
range_seg_t *rs;
|
||||
uint64_t size, entries, segsz;
|
||||
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
|
||||
dmu_object_info_t doi;
|
||||
uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
|
||||
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
ASSERT(msp->ms_loaded);
|
||||
@@ -1674,9 +1699,15 @@ metaslab_should_condense(metaslab_t *msp)
|
||||
entries = size / (MIN(size, SM_RUN_MAX));
|
||||
segsz = entries * sizeof (uint64_t);
|
||||
|
||||
return (segsz <= space_map_length(msp->ms_sm) &&
|
||||
space_map_length(msp->ms_sm) >= (zfs_condense_pct *
|
||||
sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
|
||||
optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
|
||||
object_size = space_map_length(msp->ms_sm);
|
||||
|
||||
dmu_object_info_from_db(sm->sm_dbuf, &doi);
|
||||
record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
|
||||
|
||||
return (segsz <= object_size &&
|
||||
object_size >= (optimal_size * zfs_condense_pct / 100) &&
|
||||
object_size > zfs_metaslab_condense_block_threshold * record_size);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@@ -136,7 +136,10 @@ zfs_dbgmsg_fini(void)
|
||||
* echo ::zfs_dbgmsg | mdb -k
|
||||
*
|
||||
* Monitor these messages by running:
|
||||
* dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
|
||||
* dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
|
||||
*
|
||||
* When used with libzpool, monitor with:
|
||||
* dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
|
||||
*/
|
||||
void
|
||||
zfs_dbgmsg(const char *fmt, ...)
|
||||
|
||||
+23
-7
@@ -889,8 +889,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
||||
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
||||
|
||||
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
|
||||
ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
|
||||
ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
|
||||
ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
|
||||
NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
|
||||
|
||||
zio->io_prop.zp_checksum = checksum;
|
||||
|
||||
@@ -910,8 +910,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
|
||||
ASSERT3U(offset + size, <=, vd->vdev_psize);
|
||||
|
||||
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
|
||||
ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
|
||||
ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
|
||||
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
|
||||
NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
|
||||
|
||||
zio->io_prop.zp_checksum = checksum;
|
||||
|
||||
@@ -2642,7 +2642,9 @@ zio_vdev_io_start(zio_t *zio)
|
||||
|
||||
align = 1ULL << vd->vdev_top->vdev_ashift;
|
||||
|
||||
if (P2PHASE(zio->io_size, align) != 0) {
|
||||
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
|
||||
P2PHASE(zio->io_size, align) != 0) {
|
||||
/* Transform logical writes to be a full physical block size. */
|
||||
uint64_t asize = P2ROUNDUP(zio->io_size, align);
|
||||
char *abuf = zio_buf_alloc(asize);
|
||||
ASSERT(vd == vd->vdev_top);
|
||||
@@ -2653,8 +2655,22 @@ zio_vdev_io_start(zio_t *zio)
|
||||
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
|
||||
}
|
||||
|
||||
ASSERT(P2PHASE(zio->io_offset, align) == 0);
|
||||
ASSERT(P2PHASE(zio->io_size, align) == 0);
|
||||
/*
|
||||
* If this is not a physical io, make sure that it is properly aligned
|
||||
* before proceeding.
|
||||
*/
|
||||
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
|
||||
ASSERT0(P2PHASE(zio->io_offset, align));
|
||||
ASSERT0(P2PHASE(zio->io_size, align));
|
||||
} else {
|
||||
/*
|
||||
* For physical writes, we allow 512-byte aligned writes and assume
|
||||
* the device will perform a read-modify-write as necessary.
|
||||
*/
|
||||
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
|
||||
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
|
||||
}
|
||||
|
||||
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user