Multiple DVA Scrubbing Fix

Currently, there is an issue in the sequential scrub code which
prevents self healing from working in some cases. The scrub code
will split up all DVA copies of a bp and issue each of them
separately. The problem is that, since each of the DVAs is no
longer associated with the others, the self healing code doesn't
have the opportunity to repair problems that show up in one of the
DVAs with the data from the others.

This patch fixes this issue by ensuring that all IOs issued by the
sequential scrub code include all DVAs. Initially, only the first
DVA of each is attempted. If an issue arises, the IO is retried
with all available copies, giving the self healing code a chance
to correct the issue.

To test this change, this patch also adds the ability for zinject
to specify individual DVAs to inject read errors into. We then
add a new test case that utilizes this functionality to ensure
scrubs and self-healing reads can handle and transparently fix
issues with individual copies of blocks.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #8453
This commit is contained in:
Tom Caputi 2019-03-15 17:14:31 -04:00 committed by Brian Behlendorf
parent 2bbec1c910
commit ab7615d92c
10 changed files with 415 additions and 95 deletions

View File

@ -291,8 +291,8 @@ usage(void)
"\t\tspecified by the remaining tuple. Each number is in\n"
"\t\thexadecimal, and only one block can be specified.\n"
"\n"
"\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
"\t [-a] [-m] [-u] [-f freq] <object>\n"
"\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
"\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
"\n"
"\t\tInject an error into the object specified by the '-t' option\n"
"\t\tand the object descriptor. The 'object' parameter is\n"
@ -300,7 +300,10 @@ usage(void)
"\n"
"\t\t-q\tQuiet mode. Only print out the handler number added.\n"
"\t\t-e\tInject a specific error. Must be one of 'io',\n"
"\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n"
"\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n"
"\t\t-C\tInject the given error only into specific DVAs. The\n"
"\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
"\t\t\tseparated by commas (ex. '0,2').\n"
"\t\t-l\tInject error at a particular block level. Default is "
"0.\n"
"\t\t-m\tAutomatically remount underlying filesystem.\n"
@ -361,17 +364,20 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
return (0);
if (*count == 0) {
(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
"ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
"%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
"LVL", "DVAs", "RANGE");
(void) printf("--- --------------- ------ "
"------ -------- --- ---------------\n");
"------ -------- --- ---- ---------------\n");
}
*count += 1;
(void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
(u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
type_to_name(record->zi_type), record->zi_level);
(void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ",
id, pool, (u_longlong_t)record->zi_objset,
(u_longlong_t)record->zi_object, type_to_name(record->zi_type),
record->zi_level, record->zi_dvas);
if (record->zi_start == 0 &&
record->zi_end == -1ULL)
@ -602,6 +608,7 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
(void) printf(" range: [%llu, %llu)\n",
(u_longlong_t)record->zi_start,
(u_longlong_t)record->zi_end);
(void) printf(" dvas: 0x%x\n", record->zi_dvas);
}
}
@ -674,6 +681,59 @@ parse_frequency(const char *str, uint32_t *percent)
return (0);
}
/*
* This function converts a string specifier for DVAs into a bit mask.
* The dva's provided by the user should be 0 indexed and separated by
* a comma. For example:
* "1" -> 0b0010 (0x2)
* "0,1" -> 0b0011 (0x3)
* "0,1,2" -> 0b0111 (0x7)
*/
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
const char *c = str;
uint32_t mask = 0;
boolean_t need_delim = B_FALSE;
/* max string length is 5 ("0,1,2") */
if (strlen(str) > 5 || strlen(str) == 0)
return (EINVAL);
while (*c != '\0') {
switch (*c) {
case '0':
case '1':
case '2':
/* check for pipe between DVAs */
if (need_delim)
return (EINVAL);
/* check if this DVA has been set already */
if (mask & (1 << ((*c) - '0')))
return (EINVAL);
mask |= (1 << ((*c) - '0'));
need_delim = B_TRUE;
break;
case ',':
need_delim = B_FALSE;
break;
default:
/* check for invalid character */
return (EINVAL);
}
c++;
}
/* check for dangling delimiter */
if (!need_delim)
return (EINVAL);
*dvas_out = mask;
return (0);
}
int
main(int argc, char **argv)
{
@ -700,6 +760,7 @@ main(int argc, char **argv)
int dur_secs = 0;
int ret;
int flags = 0;
uint32_t dvas = 0;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "%s", libzfs_error_init(errno));
@ -730,7 +791,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@ -754,6 +815,17 @@ main(int argc, char **argv)
case 'c':
cancel = optarg;
break;
case 'C':
ret = parse_dvas(optarg, &dvas);
if (ret != 0) {
(void) fprintf(stderr, "invalid DVA list '%s': "
"DVAs should be 0 indexed and separated by "
"commas.\n", optarg);
usage();
libzfs_fini(g_zfs);
return (1);
}
break;
case 'd':
device = optarg;
break;
@ -937,7 +1009,7 @@ main(int argc, char **argv)
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
record.zi_freq > 0) {
record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@ -972,7 +1044,8 @@ main(int argc, char **argv)
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
dvas != 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@ -1020,7 +1093,7 @@ main(int argc, char **argv)
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
record.zi_cmd != ZINJECT_UNINITIALIZED ||
record.zi_freq > 0) {
record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
@ -1055,7 +1128,8 @@ main(int argc, char **argv)
error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0) {
level != 0 || device != NULL || record.zi_freq > 0 ||
dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
usage();
@ -1076,6 +1150,15 @@ main(int argc, char **argv)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "hardware failure (-I) "
"incompatible with other options\n");
usage();
libzfs_fini(g_zfs);
return (2);
}
if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n");
@ -1136,6 +1219,18 @@ main(int argc, char **argv)
return (1);
}
if (dvas != 0) {
if (error == EACCES || error == EINVAL) {
(void) fprintf(stderr, "the '-c' option may "
"not be used with logical data errors "
"'decrypt' and 'decompress'\n");
libzfs_fini(g_zfs);
return (1);
}
record.zi_dvas = dvas;
}
if (error == EACCES) {
if (type != TYPE_DATA) {
(void) fprintf(stderr, "decryption errors "

View File

@ -367,7 +367,7 @@ typedef struct zinject_record {
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
uint32_t zi_pad;
uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
@ -386,6 +386,8 @@ typedef struct zinject_record {
#define ZI_PERCENTAGE_MIN 4294UL
#define ZI_PERCENTAGE_MAX UINT32_MAX
#define ZI_NO_DVA (-1)
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,

View File

@ -85,13 +85,13 @@ Simulate a hardware failure that fails to honor a cache flush.
.B "zinject \-p \fIfunction\fB \fIpool\fB
Panic inside the specified function.
.TP
.B "zinject \-t data [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB"
.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB"
Force an error into the contents of a file.
.TP
.B "zinject \-t dnode [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB"
.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB"
Force an error into the metadnode for a file or directory.
.TP
.B "zinject \-t \fImos_type\fB [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB"
.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB"
Force an error into the MOS of a pool.
.SH OPTIONS
.TP
@ -102,6 +102,14 @@ Flush the ARC before injection.
Force an error into the pool at this bookmark tuple. Each number is
in hexadecimal, and only one block can be specified.
.TP
.BI "\-C" " dvas"
Inject the given error only into specific DVAs. The mask should be
specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This
option is not applicable to logical data errors such as
.BR "decompress"
and
.BR "decrypt" .
.TP
.BI "\-d" " vdev"
A vdev specified by path or GUID.
.TP

View File

@ -236,24 +236,43 @@ typedef enum {
*/
typedef struct scan_io {
/* fields from blkptr_t */
uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
uint32_t sio_asize;
uint32_t sio_nr_dvas;
/* fields from zio_t */
int sio_flags;
uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
avl_node_t sio_addr_node; /* link into issueing queue */
avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
/*
* There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
* depending on how many were in the original bp. Only the
* first DVA is really used for sorting and issuing purposes.
* The other DVAs (if provided) simply exist so that the zio
* layer can find additional copies to repair from in the
* event of an error. This array must go at the end of the
* struct to allow this for the variable number of elements.
*/
dva_t sio_dva[0];
} scan_io_t;
#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
#define SIO_GET_END_OFFSET(sio) \
(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
#define SIO_GET_MUSED(sio) \
(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
@ -262,6 +281,7 @@ struct dsl_scan_io_queue {
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@ -300,7 +320,27 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
static kmem_cache_t *sio_cache;
static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
/* sio->sio_nr_dvas must be set so we know which cache to free from */
static void
sio_free(scan_io_t *sio)
{
ASSERT3U(sio->sio_nr_dvas, >, 0);
ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
}
/* It is up to the caller to set sio->sio_nr_dvas for freeing */
static scan_io_t *
sio_alloc(unsigned short nr_dvas)
{
ASSERT3U(nr_dvas, >, 0);
ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
}
void
scan_init(void)
@ -315,14 +355,22 @@ scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
sio_cache = kmem_cache_create("sio_cache",
sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
char name[36];
(void) sprintf(name, "sio_cache_%d", i);
sio_cache[i] = kmem_cache_create(name,
(sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
0, NULL, NULL, NULL, NULL, NULL, 0);
}
}
void
scan_fini(void)
{
kmem_cache_destroy(sio_cache);
for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
kmem_cache_destroy(sio_cache[i]);
}
}
static inline boolean_t
@ -339,29 +387,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
}
static inline void
sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
ASSERT3U(sio->sio_nr_dvas, >, 0);
ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
/* we discard the vdev id, since we can deduce it from the queue */
sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
sio->sio_nr_dvas = BP_GET_NDVAS(bp);
/*
* Copy the DVAs to the sio. We need all copies of the block so
* that the self healing code can use the alternate copies if the
* first is corrupted. We want the DVA at index dva_i to be first
* in the sio since this is the primary one that we want to issue.
*/
for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
}
}
int
@ -1176,11 +1234,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
/* #extents in exts_by_size = # in exts_by_addr */
/* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
sizeof (range_seg_t) +
avl_numnodes(&queue->q_sios_by_addr) *
sizeof (scan_io_t);
sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@ -2693,13 +2749,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
break;
}
sio2bp(sio, &bp, queue->q_vd->vdev_id);
bytes_issued += sio->sio_asize;
sio2bp(sio, &bp);
bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
kmem_cache_free(sio_cache, sio);
sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@ -2717,7 +2773,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
scan_io_t srch_sio, *sio, *next_sio;
scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
@ -2725,24 +2781,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
srch_sio.sio_offset = rs->rs_start;
srch_sio = sio_alloc(1);
srch_sio->sio_nr_dvas = 1;
SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
sio_free(srch_sio);
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
ASSERT3U(sio->sio_offset, >=, rs->rs_start);
ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
while (sio != NULL &&
SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
bytes_issued += sio->sio_asize;
bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
@ -2754,11 +2816,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
if (sio != NULL && sio->sio_offset < rs->rs_end) {
if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
sio->sio_offset, rs->rs_end - sio->sio_offset);
SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
@ -2862,9 +2924,9 @@ scan_io_queues_run_one(void *arg)
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
seg_end = last_sio->sio_offset + last_sio->sio_asize;
seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
seg_start = first_sio->sio_offset;
seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
@ -3567,10 +3629,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
/* update the spa's stats on how many bytes we have issued */
for (i = 0; i < BP_GET_NDVAS(bp); i++) {
/*
* Update the spa's stats on how many bytes we have issued.
* Sequential scrubs create a zio for each DVA of the bp. Each
* of these will include all DVAs for repair purposes, but the
* zio code will only try the first one unless there is an issue.
* Therefore, we should only count the first DVA for these IOs.
*/
if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
DVA_GET_ASIZE(&bp->blk_dva[i]));
DVA_GET_ASIZE(&bp->blk_dva[0]));
} else {
spa_t *spa = scn->scn_dp->dp_spa;
for (i = 0; i < BP_GET_NDVAS(bp); i++) {
atomic_add_64(&spa->spa_scan_pass_issued,
DVA_GET_ASIZE(&bp->blk_dva[i]));
}
}
/*
@ -3625,7 +3700,7 @@ static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
int64_t asize = sio->sio_asize;
int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@ -3633,11 +3708,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
kmem_cache_free(sio_cache, sio);
sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
queue->q_sio_memused += SIO_GET_MUSED(sio);
range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
@ -3651,7 +3727,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP);
scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@ -3665,7 +3741,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
@ -3905,11 +3981,7 @@ sio_addr_compare(const void *x, const void *y)
{
const scan_io_t *a = x, *b = y;
if (a->sio_offset < b->sio_offset)
return (-1);
if (a->sio_offset == b->sio_offset)
return (0);
return (1);
return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
@ -3921,6 +3993,7 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
@ -3948,11 +4021,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
sio->sio_offset, sio->sio_asize));
bytes_dequeued += sio->sio_asize;
kmem_cache_free(sio_cache, sio);
SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
bytes_dequeued += SIO_GET_ASIZE(sio);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
sio_free(sio);
}
ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
@ -4007,7 +4082,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
scan_io_t srch, *sio;
scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
@ -4022,9 +4097,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
return;
}
bp2sio(bp, &srch, dva_i);
start = srch.sio_offset;
size = srch.sio_asize;
srch_sio = sio_alloc(BP_GET_NDVAS(bp));
bp2sio(bp, srch_sio, dva_i);
start = SIO_GET_OFFSET(srch_sio);
size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
@ -4044,15 +4120,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
* be done with issuing the zio's it gathered and will
* signal us.
*/
sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
sio_free(srch_sio);
if (sio != NULL) {
int64_t asize = sio->sio_asize;
int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
ASSERT3U(start, ==, sio->sio_offset);
ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@ -4065,10 +4144,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
sio2bp(sio, &tmpbp, dva_i);
sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
kmem_cache_free(sio_cache, sio);
sio_free(sio);
}
mutex_exit(q_lock);
}

View File

@ -254,10 +254,35 @@ vdev_mirror_map_init(zio_t *zio)
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dva_t dva_copy[SPA_DVAS_PER_BP];
c = BP_GET_NDVAS(zio->io_bp);
/*
* The sequential scrub code sorts and issues all DVAs
* of a bp separately. Each of these IOs includes all
* original DVA copies so that repairs can be performed
* in the event of an error, but we only actually want
* to check the first DVA since the others will be
* checked by their respective sorted IOs. Only if we
* hit an error will we try all DVAs upon retrying.
*
* Note: This check is safe even if the user switches
* from a legacy scrub to a sequential one in the middle
* of processing, since scn_is_sorted isn't updated until
* all outstanding IOs from the previous scrub pass
* complete.
*/
if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
dsl_scan_scrubbing(spa->spa_dsl_pool) &&
scn->scn_is_sorted) {
c = 1;
} else {
c = BP_GET_NDVAS(zio->io_bp);
}
/*
* If we do not trust the pool config, some DVAs might be
* invalid or point to vdevs that do not exist. We skip them.

View File

@ -124,7 +124,7 @@ freq_triggered(uint32_t frequency)
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type,
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
@ -148,8 +148,10 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
error == record->zi_error)
(record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
error == record->zi_error) {
return (freq_triggered(record->zi_freq));
}
return (B_FALSE);
}
@ -199,7 +201,8 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
continue;
if (zio_match_handler(zb, type, &handler->zi_record, error)) {
if (zio_match_handler(zb, type, ZI_NO_DVA,
&handler->zi_record, error)) {
ret = error;
break;
}
@ -209,6 +212,37 @@ zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
return (ret);
}
/*
* If this is a physical I/O for a vdev child determine which DVA it is
* for. We iterate backwards through the DVAs matching on the offset so
* that we end up with ZI_NO_DVA (-1) if we don't find a match.
*/
static int
zio_match_dva(zio_t *zio)
{
int i = ZI_NO_DVA;
if (zio->io_bp != NULL && zio->io_vd != NULL &&
zio->io_child_type == ZIO_CHILD_VDEV) {
for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
dva_t *dva = &zio->io_bp->blk_dva[i];
uint64_t off = DVA_GET_OFFSET(dva);
vdev_t *vd = vdev_lookup_top(zio->io_spa,
DVA_GET_VDEV(dva));
/* Compensate for vdev label added to leaves */
if (zio->io_vd->vdev_ops->vdev_op_leaf)
off += VDEV_LABEL_START_SIZE;
if (zio->io_vd == vd && zio->io_offset == off)
break;
}
}
return (i);
}
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
@ -235,15 +269,14 @@ zio_handle_fault_injection(zio_t *zio, int error)
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
if (zio->io_spa != handler->zi_spa ||
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
/* If this handler matches, return EIO */
/* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
&handler->zi_record, error)) {
zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}

View File

@ -444,7 +444,7 @@ tags = ['functional', 'cli_root', 'zpool_resilver']
tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
'zpool_scrub_offline_device']
'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies']
tags = ['functional', 'cli_root', 'zpool_scrub']
[tests/functional/cli_root/zpool_set]

View File

@ -9,7 +9,8 @@ dist_pkgdata_SCRIPTS = \
zpool_scrub_005_pos.ksh \
zpool_scrub_encrypted_unloaded.ksh \
zpool_scrub_offline_device.ksh \
zpool_scrub_print_repairing.ksh
zpool_scrub_print_repairing.ksh \
zpool_scrub_multiple_copies.ksh
dist_pkgdata_DATA = \
zpool_scrub.cfg

View File

@ -0,0 +1,77 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2019 Datto, Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Scrubs and self-healing should be able to repair data from additional
# copies that may be stored.
#
#
# STRATEGY:
# 1. Create a dataset with copies=3
# 2. Write a file to the dataset
# 3. zinject errors into the first and second DVAs of that file
# 4. Scrub and verify the scrub repaired all errors
# 7. Read the file normally to check that self healing also works
# 8. Remove the zinject handler
# 9. Scrub again and confirm 0 bytes were scrubbed
#
verify_runnable "global"
function cleanup
{
destroy_dataset $TESTPOOL/$TESTFS2
log_must zinject -c all
}
log_onexit cleanup
log_assert "Scrubs and self healing must work with additional copies"
log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2
typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2)
log_must mkfile 10m $mntpnt/file
log_must zpool sync $TESTPOOL
log_must zinject -a -t data -C 0,1 -e io $mntpnt/file
log_must zpool scrub $TESTPOOL
log_must wait_scrubbed $TESTPOOL
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
log_must zinject -c all
log_must zpool scrub $TESTPOOL
log_must wait_scrubbed $TESTPOOL
zpool status
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
log_pass "Scrubs and self healing work with additional copies"

View File

@ -49,7 +49,7 @@ verify_runnable "global"
function cleanup
{
poolexists $TESTPOOL && destroy_pool $TESTPOOL
poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4
}
@ -103,31 +103,31 @@ log_must truncate -s $DEVSIZE $DISK1
log_must truncate -s $DEVSIZE $DISK2
log_must truncate -s $DEVSIZE $DISK3
log_must truncate -s $DEVSIZE $DISK4
poolexists $TESTPOOL && destroy_pool $TESTPOOL
log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \
poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \
raidz2 $DISK1 $DISK2 $DISK3 $DISK4
# 2. Offline the first device
zpool_do_sync 'offline' $TESTPOOL $DISK1
zpool_do_sync 'offline' $TESTPOOL2 $DISK1
# 3. Write to the pool
log_must mkfile $FILESIZE "$TESTDIR/data.bin"
# 4. Scrub the pool
zpool_scrub_sync $TESTPOOL
zpool_scrub_sync $TESTPOOL2
# 5. Online the first device and offline the second device
zpool_do_sync 'online' $TESTPOOL $DISK1
zpool_do_sync 'offline' $TESTPOOL $DISK2
log_must wait_for_resilver_end $TESTPOOL $RESILVER_TIMEOUT
zpool_do_sync 'online' $TESTPOOL2 $DISK1
zpool_do_sync 'offline' $TESTPOOL2 $DISK2
log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT
# 6. Scrub the pool again
zpool_scrub_sync $TESTPOOL
zpool_scrub_sync $TESTPOOL2
# 7. Verify data integrity
cksum=$(zpool status $TESTPOOL | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
if [[ $cksum != 0 ]]; then
log_fail "Unexpected CKSUM errors found on $TESTPOOL ($cksum)"
log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)"
fi
log_pass "Scrubbing a pool with offline devices correctly preserves DTLs"