OpenZFS 9403 - assertion failed in arc_buf_destroy()

Assertion failed in arc_buf_destroy() when concurrently reading
block with checksum error.

Porting notes:
* The ability to zinject decompression errors has been added, but
  this only works at the zio_decompress() level, where we have all
  of the info we need to match against the user's zinject options.
* The decompress_fault test has been added to test the new zinject
  functionality
* We attempted to set zio_decompress_fail_fraction to (1 << 18) in
  ztest for further test coverage. Although this did uncover a few
  low priority issues, this unfortuantely also causes ztest to
  ASSERT in many locations where the code is working correctly since
  it is designed to fail on IO errors. Developers can manually set
  this variable with the '-o' option to find and debug issues.

Authored by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Matt Ahrens <mahrens@delphix.com>
Ported-by: Tom Caputi <tcaputi@datto.com>

OpenZFS-issue: https://illumos.org/issues/9403
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/fa98e487a9
Closes #7822
This commit is contained in:
Tom Caputi 2018-08-29 14:33:33 -04:00 committed by Brian Behlendorf
parent 47ab01a18f
commit c3bd3fb4ac
12 changed files with 162 additions and 39 deletions

View File

@ -116,9 +116,9 @@
* specified. * specified.
* *
* The '-e' option takes a string describing the errno to simulate. This must * The '-e' option takes a string describing the errno to simulate. This must
* be one of 'io', 'checksum', or 'decrypt'. In most cases this will result * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this
* in the same behavior, but RAID-Z will produce a different set of ereports * will result in the same behavior, but RAID-Z will produce a different set of
* for this situation. * ereports for this situation.
* *
* The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
* specified, then the ARC cache is flushed appropriately. If '-u' is * specified, then the ARC cache is flushed appropriately. If '-u' is
@ -300,7 +300,7 @@ usage(void)
"\n" "\n"
"\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n"
"\t\t-e\tInject a specific error. Must be one of 'io',\n" "\t\t-e\tInject a specific error. Must be one of 'io',\n"
"\t\t\t'checksum', or decrypt. Default is 'io'.\n" "\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n"
"\t\t-l\tInject error at a particular block level. Default is " "\t\t-l\tInject error at a particular block level. Default is "
"0.\n" "0.\n"
"\t\t-m\tAutomatically remount underlying filesystem.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n"
@ -774,6 +774,8 @@ main(int argc, char **argv)
error = EIO; error = EIO;
} else if (strcasecmp(optarg, "checksum") == 0) { } else if (strcasecmp(optarg, "checksum") == 0) {
error = ECKSUM; error = ECKSUM;
} else if (strcasecmp(optarg, "decompress") == 0) {
error = EINVAL;
} else if (strcasecmp(optarg, "decrypt") == 0) { } else if (strcasecmp(optarg, "decrypt") == 0) {
error = EACCES; error = EACCES;
} else if (strcasecmp(optarg, "nxio") == 0) { } else if (strcasecmp(optarg, "nxio") == 0) {

View File

@ -193,7 +193,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_init = 1, .zo_init = 1,
.zo_time = 300, /* 5 minutes */ .zo_time = 300, /* 5 minutes */
.zo_maxloops = 50, /* max loops during spa_freeze() */ .zo_maxloops = 50, /* max loops during spa_freeze() */
.zo_metaslab_force_ganging = 32 << 10 .zo_metaslab_force_ganging = 32 << 10,
}; };
extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_force_ganging;
@ -204,6 +204,7 @@ extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled; extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift; extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries; extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts; static ztest_shared_opts_t ztest_opts;
@ -483,9 +484,6 @@ static int zc_cb_counter = 0;
*/ */
#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000)
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
enum ztest_object { enum ztest_object {
ZTEST_META_DNODE = 0, ZTEST_META_DNODE = 0,
ZTEST_DIROBJ, ZTEST_DIROBJ,

View File

@ -2511,6 +2511,19 @@ to limit potential SLOG device abuse by single active ZIL writer.
Default value: \fB786,432\fR. Default value: \fB786,432\fR.
.RE .RE
.sp
.ne 2
.na
\fBzio_decompress_fail_fraction\fR (int)
.ad
.RS 12n
If non-zero, this value represents the denominator of the probability that zfs
should induce a decompression failure. For instance, for a 5% decompression
failure rate, this value should be set to 20.
.sp
Default value: \fB0\fR.
.RE
.sp .sp
.ne 2 .ne 2
.na .na

View File

@ -108,6 +108,7 @@ A vdev specified by path or GUID.
.BI "\-e" " device_error" .BI "\-e" " device_error"
Specify Specify
.BR "checksum" " for an ECKSUM error," .BR "checksum" " for an ECKSUM error,"
.BR "decompress" " for a data decompression error,"
.BR "decrypt" " for a data decryption error," .BR "decrypt" " for a data decryption error,"
.BR "corrupt" " to flip a bit in the data after a read," .BR "corrupt" " to flip a bit in the data after a read,"
.BR "dtl" " for an ECHILD error," .BR "dtl" " for an ECHILD error,"

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/ */
@ -5784,10 +5784,12 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t **bufp = arg; arc_buf_t **bufp = arg;
if (buf == NULL) { if (buf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
*bufp = NULL; *bufp = NULL;
} else { } else {
ASSERT(zio == NULL || zio->io_error == 0);
*bufp = buf; *bufp = buf;
ASSERT(buf->b_data); ASSERT(buf->b_data != NULL);
} }
} }
@ -5915,12 +5917,6 @@ arc_read_done(zio_t *zio)
&acb->acb_zb, acb->acb_private, acb->acb_encrypted, &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
acb->acb_compressed, acb->acb_noauth, B_TRUE, acb->acb_compressed, acb->acb_noauth, B_TRUE,
&acb->acb_buf); &acb->acb_buf);
if (error != 0) {
(void) remove_reference(hdr, hash_lock,
acb->acb_private);
arc_buf_destroy_impl(acb->acb_buf);
acb->acb_buf = NULL;
}
/* /*
* Assert non-speculative zios didn't fail because an * Assert non-speculative zios didn't fail because an
@ -5943,9 +5939,34 @@ arc_read_done(zio_t *zio)
} }
} }
if (zio->io_error == 0) if (error != 0) {
/*
* Decompression or decryption failed. Set
* io_error so that when we call acb_done
* (below), we will indicate that the read
* failed. Note that in the unusual case
* where one callback is compressed and another
* uncompressed, we will mark all of them
* as failed, even though the uncompressed
* one can't actually fail. In this case,
* the hdr will not be anonymous, because
* if there are multiple callbacks, it's
* because multiple threads found the same
* arc buf in the hash table.
*/
zio->io_error = error; zio->io_error = error;
}
} }
/*
* If there are multiple callbacks, we must have the hash lock,
* because the only way for multiple threads to find this hdr is
* in the hash table. This ensures that if there are multiple
* callbacks, the hdr is not anonymous. If it were anonymous,
* we couldn't use arc_buf_destroy() in the error case below.
*/
ASSERT(callback_cnt < 2 || hash_lock != NULL);
hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (callback_cnt == 0) if (callback_cnt == 0)
@ -5987,7 +6008,17 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */ /* execute each callback and free its structure */
while ((acb = callback_list) != NULL) { while ((acb = callback_list) != NULL) {
if (acb->acb_done) { if (acb->acb_done != NULL) {
if (zio->io_error != 0 && acb->acb_buf != NULL) {
/*
* If arc_buf_alloc_impl() fails during
* decompression, the buf will still be
* allocated, and needs to be freed here.
*/
arc_buf_destroy(acb->acb_buf,
acb->acb_private);
acb->acb_buf = NULL;
}
acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
acb->acb_buf, acb->acb_private); acb->acb_buf, acb->acb_private);
} }

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */
@ -1191,25 +1191,26 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(refcount_count(&db->db_holds) > 0);
ASSERT(db->db_buf == NULL); ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL); ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) { if (buf == NULL) {
/* we were freed in flight; disregard any error */ /* i/o error */
if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0);
buf = arc_alloc_buf(db->db_objset->os_spa, ASSERT(db->db_blkid != DMU_BONUS_BLKID);
db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); ASSERT3P(db->db_buf, ==, NULL);
} db->db_state = DB_UNCACHED;
} else if (db->db_level == 0 && db->db_freed_in_flight) {
/* freed in flight */
ASSERT(zio == NULL || zio->io_error == 0);
arc_release(buf, db); arc_release(buf, db);
bzero(buf->b_data, db->db.db_size); bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf); arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE; db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf); dbuf_set_data(db, buf);
db->db_state = DB_CACHED; db->db_state = DB_CACHED;
} else if (buf != NULL) { } else {
/* success */
ASSERT(zio == NULL || zio->io_error == 0);
dbuf_set_data(db, buf); dbuf_set_data(db, buf);
db->db_state = DB_CACHED; db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
db->db_state = DB_UNCACHED;
} }
cv_broadcast(&db->db_changed); cv_broadcast(&db->db_changed);
dbuf_rele_and_unlock(db, NULL, B_FALSE); dbuf_rele_and_unlock(db, NULL, B_FALSE);
@ -2847,6 +2848,13 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0); ASSERT3S(dpa->dpa_curlevel, >, 0);
if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
kmem_free(dpa, sizeof (*dpa));
return;
}
ASSERT(zio == NULL || zio->io_error == 0);
/* /*
* The dpa_dnode is only valid if we are called with a NULL * The dpa_dnode is only valid if we are called with a NULL
* zio. This indicates that the arc_read() returned without * zio. This indicates that the arc_read() returned without
@ -2879,11 +2887,6 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dbuf_rele(db, FTAG); dbuf_rele(db, FTAG);
} }
if (abuf == NULL) {
kmem_free(dpa, sizeof (*dpa));
return;
}
dpa->dpa_curlevel--; dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));

View File

@ -389,6 +389,9 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
zio->io_abd, tmp, zio->io_size, size); zio->io_abd, tmp, zio->io_size, size);
abd_return_buf_copy(data, tmp, size); abd_return_buf_copy(data, tmp, size);
if (zio_injection_enabled && ret == 0)
ret = zio_handle_fault_injection(zio, EINVAL);
if (ret != 0) if (ret != 0)
zio->io_error = SET_ERROR(EIO); zio->io_error = SET_ERROR(EIO);
} }

View File

@ -28,7 +28,7 @@
*/ */
/* /*
* Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -37,6 +37,12 @@
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zio_compress.h> #include <sys/zio_compress.h>
/*
* If nonzero, every 1/X decompression attempts will fail, simulating
* an undetected memory error.
*/
unsigned long zio_decompress_fail_fraction = 0;
/* /*
* Compression vectors. * Compression vectors.
*/ */
@ -148,5 +154,15 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
abd_return_buf(src, tmp, s_len); abd_return_buf(src, tmp, s_len);
/*
* Decompression shouldn't fail, because we've already verifyied
* the checksum. However, for extra protection (e.g. against bitflips
* in non-ECC RAM), we handle this error (and test it).
*/
ASSERT0(ret);
if (zio_decompress_fail_fraction != 0 &&
spa_get_random(zio_decompress_fail_fraction) == 0)
ret = SET_ERROR(EINVAL);
return (ret); return (ret);
} }

View File

@ -523,7 +523,7 @@ tags = ['functional', 'exec']
[tests/functional/fault] [tests/functional/fault]
tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple', 'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple',
'scrub_after_resilver', 'decrypt_fault'] 'scrub_after_resilver', 'decrypt_fault', 'decompress_fault']
tags = ['functional', 'fault'] tags = ['functional', 'fault']
[tests/functional/features/async_destroy] [tests/functional/features/async_destroy]

View File

@ -9,6 +9,7 @@ dist_pkgdata_SCRIPTS = \
auto_spare_ashift.ksh \ auto_spare_ashift.ksh \
auto_spare_multiple.ksh \ auto_spare_multiple.ksh \
decrypt_fault.ksh \ decrypt_fault.ksh \
decompress_fault.ksh \
scrub_after_resilver.ksh scrub_after_resilver.ksh
dist_pkgdata_DATA = \ dist_pkgdata_DATA = \

View File

@ -0,0 +1,55 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Datto Inc.
# All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg
#
# DESCRIPTION:
# Test that injected decompression errors are handled correctly.
#
# STRATEGY:
# 1. Create an compressed dataset with a test file
# 2. Inject decompression errors on the file 20% of the time
# 3. Read the file to confirm that errors are handled correctly
# 4. Confirm that the decompression injection was added to the ZED logs
#
log_assert "Testing that injected decompression errors are handled correctly"
function cleanup
{
log_must set_tunable64 zfs_compressed_arc_enabled 1
log_must zinject -c all
default_cleanup_noexit
}
log_onexit cleanup
default_mirror_setup_noexit $DISK1 $DISK2
log_must set_tunable64 zfs_compressed_arc_enabled 0
log_must zfs create -o compression=on $TESTPOOL/fs
mntpt=$(get_prop mountpoint $TESTPOOL/fs)
write_compressible $mntpt 32m 1 0 "testfile"
log_must sync
log_must zfs umount $TESTPOOL/fs
log_must zfs mount $TESTPOOL/fs
log_must zinject -a -t data -e decompress -f 20 $mntpt/testfile.0
log_mustnot eval "cat $mntpt/testfile.0 > /dev/null"
log_must eval "zpool events $TESTPOOL | grep -q 'data'"
log_pass "Injected decompression errors are handled correctly"

View File

@ -11,7 +11,7 @@
# #
# #
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # Copyright (c) 2018 by Datto Inc.
# All rights reserved. # All rights reserved.
# #
@ -23,7 +23,7 @@
# Test that injected decryption errors are handled correctly. # Test that injected decryption errors are handled correctly.
# #
# STRATEGY: # STRATEGY:
# 1. Create an encrypted dataset with an test file # 1. Create an encrypted dataset with a test file
# 2. Inject decryption errors on the file 20% of the time # 2. Inject decryption errors on the file 20% of the time
# 3. Read the file to confirm that errors are handled correctly # 3. Read the file to confirm that errors are handled correctly
# 4. Confirm that the decryption injection was added to the ZED logs # 4. Confirm that the decryption injection was added to the ZED logs