Optimize possible split block search space

Remove duplicate segment copies to minimize the possible search
space for reconstruction.  Once reduced an accurate assessment can
be made regarding the difficulty in reconstructing the block.

Also, ztest will now run zdb with
zfs_reconstruct_indirect_combinations_max set to 1000000 in an attempt
to avoid checksum errors.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6900
This commit is contained in:
Brian Behlendorf 2018-03-29 14:50:40 -07:00
parent 9e052db462
commit 4589f3ae4c
3 changed files with 101 additions and 46 deletions

View File

@ -6221,7 +6221,8 @@ ztest_run_zdb(char *pool)
ztest_get_zdb_bin(bin, len); ztest_get_zdb_bin(bin, len);
(void) sprintf(zdb, (void) sprintf(zdb,
"%s -bcc%s%s -G -d -U %s %s", "%s -bcc%s%s -G -d -U %s "
"-o zfs_reconstruct_indirect_combinations_max=1000000 %s",
bin, bin,
ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 3 ? "s" : "",
ztest_opts.zo_verbose >= 4 ? "v" : "", ztest_opts.zo_verbose >= 4 ? "v" : "",

View File

@ -1742,18 +1742,18 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp .sp
.ne 2 .ne 2
.na .na
\fBzfs_reconstruct_indirect_segments_max\fR (int) \fBzfs_reconstruct_indirect_combinations_max\fR (int)
.ad .ad
.RS 12n .RS 12na
When a split block which is part of a indirect vdev contains more than this If an indirect split block contains more than this many possible unique
many segments, consider it too computationally expensive to check all possible combinations when being reconstructed, consider it too computationally
combinations. Instead, operate under the assumption that only a few segment expensive to check them all. Instead, try at most
copies are damaged and the majority of segment copies are good, in which case \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
it is reasonable to randomly select sample combinations. This allows all the combinations each time the block is accessed. This allows all segment
segment copies to participate fairly in the reconstruction and prevents the copies to participate fairly in the reconstruction when all combinations
repeated use of one bad copy. cannot be checked and prevents repeated use of one bad copy.
.sp .sp
Default value: \fB10\fR. Default value: \fB100\fR.
.RE .RE
.sp .sp

View File

@ -204,17 +204,14 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ms = 0; int zfs_condense_indirect_commit_entry_delay_ms = 0;
/* /*
* If a split block contains more than this many segments, consider it too * If an indirect split block contains more than this many possible unique
* computationally expensive to check all (2^num_segments) possible * combinations when being reconstructed, consider it too computationally
* combinations. Instead, try at most 2^_segments_max randomly-selected * expensive to check them all. Instead, try at most 100 randomly-selected
* combinations. * combinations each time the block is accessed. This allows all segment
* * copies to participate fairly in the reconstruction when all combinations
* This is reasonable if only a few segment copies are damaged and the * cannot be checked and prevents repeated use of one bad copy.
* majority of segment copies are good. It allows all segment copies to
* participate fairly in the reconstruction and prevents repeated use of
* one bad copy.
*/ */
int zfs_reconstruct_indirect_segments_max = 10; int zfs_reconstruct_indirect_combinations_max = 100;
/* /*
* The indirect_child_t represents the vdev that we will read from, when we * The indirect_child_t represents the vdev that we will read from, when we
@ -226,6 +223,12 @@ int zfs_reconstruct_indirect_segments_max = 10;
typedef struct indirect_child { typedef struct indirect_child {
abd_t *ic_data; abd_t *ic_data;
vdev_t *ic_vdev; vdev_t *ic_vdev;
/*
* ic_duplicate is -1 when the ic_data contents are unique, when it
* is determined to be a duplicate it refers to the primary child.
*/
int ic_duplicate;
} indirect_child_t; } indirect_child_t;
/* /*
@ -1165,6 +1168,7 @@ vdev_indirect_read_all(zio_t *zio)
ic->ic_data = abd_alloc_sametype(zio->io_abd, ic->ic_data = abd_alloc_sametype(zio->io_abd,
is->is_size); is->is_size);
ic->ic_duplicate = -1;
zio_nowait(zio_vdev_child_io(zio, NULL, zio_nowait(zio_vdev_child_io(zio, NULL,
ic->ic_vdev, is->is_target_offset, ic->ic_data, ic->ic_vdev, is->is_target_offset, ic->ic_data,
@ -1314,7 +1318,7 @@ vdev_indirect_repair(zio_t *zio)
continue; continue;
if (ic->ic_data == NULL) if (ic->ic_data == NULL)
continue; continue;
if (abd_cmp(good_child->ic_data, ic->ic_data) == 0) if (ic->ic_duplicate == is->is_good_child)
continue; continue;
zio_nowait(zio_vdev_child_io(zio, NULL, zio_nowait(zio_vdev_child_io(zio, NULL,
@ -1367,15 +1371,16 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
* *
* If we pointed to any mirror vdevs, this effectively does the job of the * If we pointed to any mirror vdevs, this effectively does the job of the
* mirror. The mirror vdev code can't do its own job because we don't know * mirror. The mirror vdev code can't do its own job because we don't know
* the checksum of each split segment individually. We have to try every * the checksum of each split segment individually.
* combination of copies of split segments, until we find one that checksums
* correctly. (Or until we have tried all combinations, or have tried
* 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
* set io_error to ECKSUM to propagate the error up to the user.)
* *
* For example, if we have 3 segments in the split, * We have to try every unique combination of copies of split segments, until
* and each points to a 2-way mirror, we will have the following pieces of * we find one that checksums correctly. Duplicate segment copies are first
* data: * discarded as an optimization to reduce the search space. After pruning
* there will exist at most one valid combination.
*
* When the total number of combinations is small they can all be checked.
* For example, if we have 3 segments in the split, and each points to a
* 2-way mirror with unique copies, we will have the following pieces of data:
* *
* | mirror child * | mirror child
* split | [0] [1] * split | [0] [1]
@ -1414,12 +1419,48 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
{ {
indirect_vsd_t *iv = zio->io_vsd; indirect_vsd_t *iv = zio->io_vsd;
uint64_t attempts = 0; uint64_t attempts = 0;
uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; uint64_t attempts_max = UINT64_MAX;
int segments = 0; uint64_t combinations = 1;
if (zfs_reconstruct_indirect_combinations_max > 0)
attempts_max = zfs_reconstruct_indirect_combinations_max;
/*
* Discard duplicate copies of split segments to minimize the
* number of unique combinations when attempting reconstruction.
*/
for (indirect_split_t *is = list_head(&iv->iv_splits); for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) is != NULL; is = list_next(&iv->iv_splits, is)) {
segments++; uint64_t is_copies = 0;
for (int i = 0; i < is->is_children; i++) {
if (is->is_child[i].ic_data == NULL)
continue;
for (int j = i + 1; j < is->is_children; j++) {
if (is->is_child[j].ic_data == NULL)
continue;
if (is->is_child[j].ic_duplicate == -1 &&
abd_cmp(is->is_child[i].ic_data,
is->is_child[j].ic_data) == 0) {
is->is_child[j].ic_duplicate = i;
}
}
is_copies++;
}
/* Reconstruction is impossible, no valid is->is_child[] */
if (is_copies == 0) {
zio->io_error = EIO;
vdev_indirect_all_checksum_errors(zio);
zio_checksum_verified(zio);
return;
}
combinations *= is_copies;
}
for (;;) { for (;;) {
/* copy data from splits to main zio */ /* copy data from splits to main zio */
@ -1436,6 +1477,15 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
goto next; goto next;
} }
/*
* If this child is a duplicate, its is_duplicate will
* refer to the primary copy. Skip this combination.
*/
if (is->is_child[is->is_good_child].ic_duplicate >= 0) {
ret = ECKSUM;
goto next;
}
abd_copy_off(zio->io_abd, abd_copy_off(zio->io_abd,
is->is_child[is->is_good_child].ic_data, is->is_child[is->is_good_child].ic_data,
is->is_split_offset, 0, is->is_size); is->is_split_offset, 0, is->is_size);
@ -1458,14 +1508,14 @@ vdev_indirect_reconstruct_io_done(zio_t *zio)
boolean_t more; boolean_t more;
next: next:
more = B_FALSE; more = B_FALSE;
if (segments <= zfs_reconstruct_indirect_segments_max) { if (combinations <= attempts_max) {
/* /*
* There are relatively few segments, so * There are relatively few possible combinations, so
* deterministically check all combinations. We do * deterministically check them all. We do this by
* this by by adding one to the first split's * adding one to the first split's good_child. If it
* good_child. If it overflows, then "carry over" to * overflows, then "carry over" to the next split
* the next split (like counting in base is_children, * (like counting in base is_children, but each
* but each digit can have a different base). * digit can have a different base).
*/ */
for (indirect_split_t *is = list_head(&iv->iv_splits); for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) { is != NULL; is = list_next(&iv->iv_splits, is)) {
@ -1485,8 +1535,12 @@ next:
*/ */
for (indirect_split_t *is = list_head(&iv->iv_splits); for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) { is != NULL; is = list_next(&iv->iv_splits, is)) {
is->is_good_child = int c = spa_get_random(is->is_children);
spa_get_random(is->is_children);
while (is->is_child[c].ic_duplicate >= 0)
c = (c + 1) % is->is_children;
is->is_good_child = c;
} }
more = B_TRUE; more = B_TRUE;
} }
@ -1577,7 +1631,7 @@ module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms, MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
"Delay while condensing vdev mapping"); "Delay while condensing vdev mapping");
module_param(zfs_reconstruct_indirect_segments_max, int, 0644); module_param(zfs_reconstruct_indirect_combinations_max, int, 0644);
MODULE_PARM_DESC(zfs_reconstruct_indirect_segments_max, MODULE_PARM_DESC(zfs_reconstruct_indirect_combinations_max,
"Maximum number of split segments check all combinations"); "Maximum number of combinations when reconstructing split segments");
#endif #endif