Illumos 4958 zdb trips assert on pools with ashift >= 0xe

4958 zdb trips assert on pools with ashift >= 0xe
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

References:
  https://www.illumos.org/issues/4958
  https://github.com/illumos/illumos-gate/commit/2a104a5

Porting notes:

Keep the ZIO_FLAG_FASTWRITE define.  This is for a feature present
in Linux but not yet in *BSD.

Ported by: Turbo Fredriksson <turbo@bayour.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2697
This commit is contained in:
Alex Reece 2014-09-23 01:42:03 +02:00 committed by Brian Behlendorf
parent adc90e9d94
commit b02fe35d37
8 changed files with 131 additions and 46 deletions

View File

@ -859,7 +859,7 @@ static uint64_t
ztest_get_ashift(void) ztest_get_ashift(void)
{ {
if (ztest_opts.zo_ashift == 0) if (ztest_opts.zo_ashift == 0)
return (SPA_MINBLOCKSHIFT + ztest_random(3)); return (SPA_MINBLOCKSHIFT + ztest_random(5));
return (ztest_opts.zo_ashift); return (ztest_opts.zo_ashift);
} }
@ -1021,11 +1021,28 @@ ztest_random_spa_version(uint64_t initial_version)
return (version); return (version);
} }
/*
 * Find the largest ashift used by the ztest pool.
 *
 * Walks the direct children of ztest_spa's root vdev and returns the
 * maximum vdev_ashift among them.  The result is never smaller than
 * SPA_MINBLOCKSHIFT, which is the starting value and therefore also what
 * is returned when the root vdev has no children.
 */
static uint64_t
ztest_spa_get_ashift(void)
{
	uint64_t i;
	uint64_t ashift = SPA_MINBLOCKSHIFT;
	vdev_t *rvd = ztest_spa->spa_root_vdev;

	for (i = 0; i < rvd->vdev_children; i++) {
		ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift);
	}

	return (ashift);
}
static int static int
ztest_random_blocksize(void) ztest_random_blocksize(void)
{ {
return (1 << (SPA_MINBLOCKSHIFT + // Choose a block size >= the ashift.
ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); uint64_t block_shift =
ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
} }
static int static int
@ -5963,17 +5980,31 @@ ztest_freeze(void)
*/ */
spa_freeze(spa); spa_freeze(spa);
/*
* Because it is hard to predict how much space a write will actually
* require beforehand, we leave ourselves some fudge space to write over
* capacity.
*/
uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
/* /*
* Run tests that generate log records but don't alter the pool config * Run tests that generate log records but don't alter the pool config
* or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
* We do a txg_wait_synced() after each iteration to force the txg * We do a txg_wait_synced() after each iteration to force the txg
* to increase well beyond the last synced value in the uberblock. * to increase well beyond the last synced value in the uberblock.
* The ZIL should be OK with that. * The ZIL should be OK with that.
*
* Run a random number of times less than zo_maxloops and ensure we do
* not run out of space on the pool.
*/ */
while (ztest_random(10) != 0 && while (ztest_random(10) != 0 &&
numloops++ < ztest_opts.zo_maxloops) { numloops++ < ztest_opts.zo_maxloops &&
ztest_dmu_write_parallel(zd, 0); metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
ztest_dmu_object_alloc_free(zd, 0); ztest_od_t od;
ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
ztest_io(zd, od.od_object,
ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
txg_wait_synced(spa_get_dsl(spa), 0); txg_wait_synced(spa_get_dsl(spa), 0);
} }

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_VDEV_IMPL_H #ifndef _SYS_VDEV_IMPL_H
@ -239,8 +239,11 @@ struct vdev {
#define VDEV_PHYS_SIZE (112 << 10) #define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10) #define VDEV_UBERBLOCK_RING (128 << 10)
/* The largest uberblock we support is 8k. */
#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \ #define VDEV_UBERBLOCK_SHIFT(vd) \
MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \ #define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \ #define VDEV_UBERBLOCK_OFFSET(vd, n) \

View File

@ -167,19 +167,20 @@ enum zio_flag {
ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCRUB = 1 << 4,
ZIO_FLAG_SCAN_THREAD = 1 << 5, ZIO_FLAG_SCAN_THREAD = 1 << 5,
ZIO_FLAG_PHYSICAL = 1 << 6,
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
/* /*
* Flags inherited by ddt, gang, and vdev children. * Flags inherited by ddt, gang, and vdev children.
*/ */
ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
ZIO_FLAG_SPECULATIVE = 1 << 7, ZIO_FLAG_SPECULATIVE = 1 << 8,
ZIO_FLAG_CONFIG_WRITER = 1 << 8, ZIO_FLAG_CONFIG_WRITER = 1 << 9,
ZIO_FLAG_DONT_RETRY = 1 << 9, ZIO_FLAG_DONT_RETRY = 1 << 10,
ZIO_FLAG_DONT_CACHE = 1 << 10, ZIO_FLAG_DONT_CACHE = 1 << 11,
ZIO_FLAG_NODATA = 1 << 11, ZIO_FLAG_NODATA = 1 << 12,
ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@ -187,28 +188,28 @@ enum zio_flag {
/* /*
* Flags inherited by vdev children. * Flags inherited by vdev children.
*/ */
ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
ZIO_FLAG_PROBE = 1 << 14, ZIO_FLAG_PROBE = 1 << 15,
ZIO_FLAG_TRYHARD = 1 << 15, ZIO_FLAG_TRYHARD = 1 << 16,
ZIO_FLAG_OPTIONAL = 1 << 16, ZIO_FLAG_OPTIONAL = 1 << 17,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/* /*
* Flags not inherited by any children. * Flags not inherited by any children.
*/ */
ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
ZIO_FLAG_DONT_PROPAGATE = 1 << 18, ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
ZIO_FLAG_IO_BYPASS = 1 << 19, ZIO_FLAG_IO_BYPASS = 1 << 20,
ZIO_FLAG_IO_REWRITE = 1 << 20, ZIO_FLAG_IO_REWRITE = 1 << 21,
ZIO_FLAG_RAW = 1 << 21, ZIO_FLAG_RAW = 1 << 22,
ZIO_FLAG_GANG_CHILD = 1 << 22, ZIO_FLAG_GANG_CHILD = 1 << 23,
ZIO_FLAG_DDT_CHILD = 1 << 23, ZIO_FLAG_DDT_CHILD = 1 << 24,
ZIO_FLAG_GODFATHER = 1 << 24, ZIO_FLAG_GODFATHER = 1 << 25,
ZIO_FLAG_NOPWRITE = 1 << 25, ZIO_FLAG_NOPWRITE = 1 << 26,
ZIO_FLAG_REEXECUTED = 1 << 26, ZIO_FLAG_REEXECUTED = 1 << 27,
ZIO_FLAG_DELEGATED = 1 << 27, ZIO_FLAG_DELEGATED = 1 << 28,
ZIO_FLAG_FASTWRITE = 1 << 28 ZIO_FLAG_FASTWRITE = 1 << 29,
}; };
#define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_MUSTSUCCEED 0

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved.
*/ */

View File

@ -64,6 +64,21 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
*/ */
int zfs_condense_pct = 200; int zfs_condense_pct = 200;
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
* same number of blocks after condensing. Since the goal of condensing is to
* reduce the number of IOPs required to read the space map, we only want to
* condense when we can be sure we will reduce the number of blocks used by the
* space map. Unfortunately, we cannot precisely compute whether or not this is
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
* we apply the following heuristic: do not condense a spacemap unless the
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
* blocks.
*/
int zfs_metaslab_condense_block_threshold = 4;
/* /*
* The zfs_mg_noalloc_threshold defines which metaslab groups should * The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of * be eligible for allocation. The value is defined as a percentage of
@ -1633,6 +1648,8 @@ metaslab_group_preload(metaslab_group_t *mg)
* times the size than the free space range tree representation * times the size than the free space range tree representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
* *
* 3. The on-disk size of the space map should actually decrease.
*
* Checking the first condition is tricky since we don't want to walk * Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we * the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the * use the size-ordered range tree in the metaslab and calculate the
@ -1643,13 +1660,21 @@ metaslab_group_preload(metaslab_group_t *mg)
* To determine the second criterion we use a best-case estimate and assume * To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer * each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form. * to this best-case estimate as the space map's minimal form.
*
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
* Instead, we apply the heuristic described in the block comment for
* zfs_metaslab_condense_block_threshold - we only condense if the space used
* is greater than a threshold number of blocks.
*/ */
static boolean_t static boolean_t
metaslab_should_condense(metaslab_t *msp) metaslab_should_condense(metaslab_t *msp)
{ {
space_map_t *sm = msp->ms_sm; space_map_t *sm = msp->ms_sm;
range_seg_t *rs; range_seg_t *rs;
uint64_t size, entries, segsz; uint64_t size, entries, segsz, object_size, optimal_size, record_size;
dmu_object_info_t doi;
uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded); ASSERT(msp->ms_loaded);
@ -1674,9 +1699,15 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX)); entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t); segsz = entries * sizeof (uint64_t);
return (segsz <= space_map_length(msp->ms_sm) && optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
space_map_length(msp->ms_sm) >= (zfs_condense_pct * object_size = space_map_length(msp->ms_sm);
sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
dmu_object_info_from_db(sm->sm_dbuf, &doi);
record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
return (segsz <= object_size &&
object_size >= (optimal_size * zfs_condense_pct / 100) &&
object_size > zfs_metaslab_condense_block_threshold * record_size);
} }
/* /*

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/ */

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -136,7 +136,10 @@ zfs_dbgmsg_fini(void)
* echo ::zfs_dbgmsg | mdb -k * echo ::zfs_dbgmsg | mdb -k
* *
* Monitor these messages by running: * Monitor these messages by running:
* dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
*
* When used with libzpool, monitor with:
* dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
*/ */
void void
zfs_dbgmsg(const char *fmt, ...) zfs_dbgmsg(const char *fmt, ...)

View File

@ -889,8 +889,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
ASSERT3U(offset + size, <=, vd->vdev_psize); ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
ZIO_TYPE_READ, priority, flags, vd, offset, NULL, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum; zio->io_prop.zp_checksum = checksum;
@ -910,8 +910,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
ASSERT3U(offset + size, <=, vd->vdev_psize); ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum; zio->io_prop.zp_checksum = checksum;
@ -2642,7 +2642,9 @@ zio_vdev_io_start(zio_t *zio)
align = 1ULL << vd->vdev_top->vdev_ashift; align = 1ULL << vd->vdev_top->vdev_ashift;
if (P2PHASE(zio->io_size, align) != 0) { if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align); uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = zio_buf_alloc(asize); char *abuf = zio_buf_alloc(asize);
ASSERT(vd == vd->vdev_top); ASSERT(vd == vd->vdev_top);
@ -2653,8 +2655,22 @@ zio_vdev_io_start(zio_t *zio)
zio_push_transform(zio, abuf, asize, asize, zio_subblock); zio_push_transform(zio, abuf, asize, asize, zio_subblock);
} }
ASSERT(P2PHASE(zio->io_offset, align) == 0); /*
ASSERT(P2PHASE(zio->io_size, align) == 0); * If this is not a physical io, make sure that it is properly aligned
* before proceeding.
*/
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
ASSERT0(P2PHASE(zio->io_offset, align));
ASSERT0(P2PHASE(zio->io_size, align));
} else {
/*
* For physical I/O, we only require 512-byte alignment; for writes
* we assume the device will perform a read-modify-write as necessary.
*/
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
}
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
/* /*