Refine split block reconstruction

Due to a flaw in 4589f3ae the number of unique combinations
could be calculated incorrectly.  This could result in the
random combinations reconstruction being used when it would
have been possible to check all combinations.

This change fixes the unique combinations calculation and
simplifies the reconstruction logic by maintaining a per-
segment list of unique copies.

The vdev_indirect_splits_damage() function was introduced
to validate both the enumeration and random reconstruction
logic with ztest.  It is implemented such that it will never
make a known recoverable block unrecoverable.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #6900 
Closes #7934
Brian Behlendorf, 2018-10-01 10:36:34 -07:00 (committed by GitHub)
commit 1258bd778e, parent d12614521a
3 changed files with 274 additions and 123 deletions
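
The fix is easier to see in isolation. Below is a minimal, hypothetical C sketch (not part of the commit) of the two ideas the message describes: the number of unique combinations is the product of each segment's count of unique copies, and the exhaustive pass walks the per-segment lists like a mixed-radix counter, visiting each combination exactly once. The per-segment counts are made-up example data standing in for the is_unique_child lists kept by each indirect_split_t.

/*
 * Illustrative sketch only, not from the commit: enumerate every
 * combination of per-segment unique copies and count the attempts.
 */
#include <stdio.h>

int
main(void)
{
	int unique[] = { 2, 3, 2 };	/* unique copies per split segment */
	int nsegs = sizeof (unique) / sizeof (unique[0]);
	int pick[3] = { 0 };		/* current copy chosen per segment */
	long combinations = 1;
	long attempts = 0;
	int more = 1;

	/* Unique combinations are the product of the per-segment counts. */
	for (int s = 0; s < nsegs; s++)
		combinations *= unique[s];

	/*
	 * Mixed-radix walk: advance the first segment's pick; on overflow
	 * reset it and carry into the next segment.  A real implementation
	 * would checksum the reassembled block on every pass.
	 */
	while (more) {
		attempts++;
		more = 0;
		for (int s = 0; s < nsegs; s++) {
			if (++pick[s] < unique[s]) {
				more = 1;
				break;
			}
			pick[s] = 0;
		}
	}

	printf("unique combinations: %ld\n", combinations);	/* 12 */
	printf("attempts: %ld\n", attempts);			/* 12 */
	return (0);
}

With the example counts {2, 3, 2} the sketch reports 12 unique combinations and exactly 12 attempts, mirroring the ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations) check added to vdev_indirect_splits_enumerate_all() below.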

cmd/ztest/ztest.c

@@ -214,6 +214,7 @@ extern int zfs_abd_scatter_enabled;
 extern int dmu_object_alloc_chunk_shift;
 extern boolean_t zfs_force_some_double_word_sm_entries;
 extern unsigned long zio_decompress_fail_fraction;
+extern unsigned long zfs_reconstruct_indirect_damage_fraction;

 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -6479,7 +6480,7 @@ ztest_run_zdb(char *pool)
 	(void) sprintf(zdb,
 	    "%s -bcc%s%s -G -d -U %s "
-	    "-o zfs_reconstruct_indirect_combinations_max=1000000 %s",
+	    "-o zfs_reconstruct_indirect_combinations_max=65536 %s",
 	    bin,
 	    ztest_opts.zo_verbose >= 3 ? "s" : "",
 	    ztest_opts.zo_verbose >= 4 ? "v" : "",
@@ -7518,6 +7519,13 @@ main(int argc, char **argv)
 	 */
 	zfs_force_some_double_word_sm_entries = B_TRUE;

+	/*
+	 * Verify that even extensively damaged split blocks with many
+	 * segments can be reconstructed in a reasonable amount of time
+	 * when reconstruction is known to be possible.
+	 */
+	zfs_reconstruct_indirect_damage_fraction = 4;
+
 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;

man/man5/zfs-module-parameters.5

@@ -1965,7 +1965,7 @@ combinations each time the block is accessed. This allows all segment
 copies to participate fairly in the reconstruction when all combinations
 cannot be checked and prevents repeated use of one bad copy.
 .sp
-Default value: \fB100\fR.
+Default value: \fB256\fR.
 .RE
 .sp

module/zfs/vdev_indirect.c

@@ -213,7 +213,14 @@ int zfs_condense_indirect_commit_entry_delay_ms = 0;
 * copies to participate fairly in the reconstruction when all combinations
 * cannot be checked and prevents repeated use of one bad copy.
 */
-int zfs_reconstruct_indirect_combinations_max = 100;
+int zfs_reconstruct_indirect_combinations_max = 256;
+
+/*
+ * Enable to simulate damaged segments and validate reconstruction.  This
+ * is intentionally not exposed as a module parameter.
+ */
+unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

 /*
 * The indirect_child_t represents the vdev that we will read from, when we
@@ -227,10 +234,11 @@ typedef struct indirect_child {
 	vdev_t *ic_vdev;

 	/*
-	 * ic_duplicate is -1 when the ic_data contents are unique, when it
-	 * is determined to be a duplicate it refers to the primary child.
+	 * ic_duplicate is NULL when the ic_data contents are unique, when it
+	 * is determined to be a duplicate it references the primary child.
 	 */
-	int ic_duplicate;
+	struct indirect_child *ic_duplicate;
+	list_node_t ic_node;	/* node on is_unique_child */
 } indirect_child_t;

 /*
@@ -252,12 +260,14 @@ typedef struct indirect_split {
 	uint64_t is_target_offset; /* offset on is_vdev */
 	uint64_t is_size;
 	int is_children;	/* number of entries in is_child[] */
+	int is_unique_children;	/* number of entries in is_unique_child */
+	list_t is_unique_child;

 	/*
 	 * is_good_child is the child that we are currently using to
 	 * attempt reconstruction.
 	 */
-	int is_good_child;
+	indirect_child_t *is_good_child;

 	indirect_child_t is_child[1]; /* variable-length */
 } indirect_split_t;
@@ -269,6 +279,9 @@ typedef struct indirect_split {
 typedef struct indirect_vsd {
 	boolean_t iv_split_block;
 	boolean_t iv_reconstruct;
+	uint64_t iv_unique_combinations;
+	uint64_t iv_attempts;
+	uint64_t iv_attempts_max;

 	list_t iv_splits; /* list of indirect_split_t's */
 } indirect_vsd_t;
@@ -286,6 +299,13 @@ vdev_indirect_map_free(zio_t *zio)
 				abd_free(ic->ic_data);
 		}
 		list_remove(&iv->iv_splits, is);
+
+		indirect_child_t *ic;
+		while ((ic = list_head(&is->is_unique_child)) != NULL)
+			list_remove(&is->is_unique_child, ic);
+
+		list_destroy(&is->is_unique_child);
+
 		kmem_free(is,
 		    offsetof(indirect_split_t, is_child[is->is_children]));
 	}
@@ -1185,6 +1205,8 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 	is->is_split_offset = split_offset;
 	is->is_target_offset = offset;
 	is->is_vdev = vd;
+	list_create(&is->is_unique_child, sizeof (indirect_child_t),
+	    offsetof(indirect_child_t, ic_node));

 	/*
 	 * Note that we only consider multiple copies of the data for
@@ -1195,6 +1217,7 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 	if (vd->vdev_ops == &vdev_mirror_ops) {
 		for (int i = 0; i < n; i++) {
 			is->is_child[i].ic_vdev = vd->vdev_child[i];
+			list_link_init(&is->is_child[i].ic_node);
 		}
 	} else {
 		is->is_child[0].ic_vdev = vd;
@@ -1247,7 +1270,7 @@ vdev_indirect_read_all(zio_t *zio)
 			ic->ic_data = abd_alloc_sametype(zio->io_abd,
 			    is->is_size);
-			ic->ic_duplicate = -1;
+			ic->ic_duplicate = NULL;

 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    ic->ic_vdev, is->is_target_offset, ic->ic_data,
@@ -1359,7 +1382,7 @@ vdev_indirect_checksum_error(zio_t *zio,
 	zio_bad_cksum_t zbc = {{{ 0 }}};
 	abd_t *bad_abd = ic->ic_data;
-	abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+	abd_t *good_abd = is->is_good_child->ic_data;

 	zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
 	    is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
 }
@@ -1389,11 +1412,9 @@ vdev_indirect_repair(zio_t *zio)
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
-		indirect_child_t *good_child = &is->is_child[is->is_good_child];
-
 		for (int c = 0; c < is->is_children; c++) {
 			indirect_child_t *ic = &is->is_child[c];
-			if (ic == good_child)
+			if (ic == is->is_good_child)
 				continue;
 			if (ic->ic_data == NULL)
 				continue;
@@ -1402,7 +1423,7 @@ vdev_indirect_repair(zio_t *zio)
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    ic->ic_vdev, is->is_target_offset,
-			    good_child->ic_data, is->is_size,
+			    is->is_good_child->ic_data, is->is_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL));
@@ -1444,6 +1465,177 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
 	}
 }

+/*
+ * Copy data from all the splits to a main zio then validate the checksum.
+ * If the checksum is successfully validated return success.
+ */
+static int
+vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
+{
+	zio_bad_cksum_t zbc;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+		ASSERT3P(is->is_good_child->ic_data, !=, NULL);
+		ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
+
+		abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
+		    is->is_split_offset, 0, is->is_size);
+	}
+
+	return (zio_checksum_error(zio, &zbc));
+}
+/*
+ * There are relatively few possible combinations making it feasible to
+ * deterministically check them all.  We do this by setting the good_child
+ * to the next unique split version.  If we reach the end of the list then
+ * "carry over" to the next unique split version (like counting in base
+ * is_unique_children, but each digit can have a different base).
+ */
+static int
+vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
+{
+	boolean_t more = B_TRUE;
+
+	iv->iv_attempts = 0;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is))
+		is->is_good_child = list_head(&is->is_unique_child);
+
+	while (more == B_TRUE) {
+		iv->iv_attempts++;
+		more = B_FALSE;
+
+		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+			return (0);
+
+		for (indirect_split_t *is = list_head(&iv->iv_splits);
+		    is != NULL; is = list_next(&iv->iv_splits, is)) {
+			is->is_good_child = list_next(&is->is_unique_child,
+			    is->is_good_child);
+			if (is->is_good_child != NULL) {
+				more = B_TRUE;
+				break;
+			}
+
+			is->is_good_child = list_head(&is->is_unique_child);
+		}
+	}
+
+	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
+
+	return (SET_ERROR(ECKSUM));
+}
+/*
+ * There are too many combinations to try all of them in a reasonable amount
+ * of time.  So try a fixed number of random combinations from the unique
+ * split versions, after which we'll consider the block unrecoverable.
+ */
+static int
+vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
+{
+	iv->iv_attempts = 0;
+
+	while (iv->iv_attempts < iv->iv_attempts_max) {
+		iv->iv_attempts++;
+
+		for (indirect_split_t *is = list_head(&iv->iv_splits);
+		    is != NULL; is = list_next(&iv->iv_splits, is)) {
+			indirect_child_t *ic = list_head(&is->is_unique_child);
+			int children = is->is_unique_children;
+
+			for (int i = spa_get_random(children); i > 0; i--)
+				ic = list_next(&is->is_unique_child, ic);
+
+			ASSERT3P(ic, !=, NULL);
+			is->is_good_child = ic;
+		}
+
+		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+			return (0);
+	}
+
+	return (SET_ERROR(ECKSUM));
+}
+/*
+ * This is a validation function for reconstruction.  It randomly selects
+ * a good combination, if one can be found, and then it intentionally
+ * damages all other segment copies by zeroing them.  This forces the
+ * reconstruction algorithm to locate the one remaining known good copy.
+ */
+static int
+vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
+{
+	/* Presume all the copies are unique for initial selection. */
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+		is->is_unique_children = 0;
+
+		for (int i = 0; i < is->is_children; i++) {
+			indirect_child_t *ic = &is->is_child[i];
+			if (ic->ic_data != NULL) {
+				is->is_unique_children++;
+				list_insert_tail(&is->is_unique_child, ic);
+			}
+		}
+	}
+
+	/*
+	 * Set each is_good_child to a randomly-selected child which
+	 * is known to contain validated data.
+	 */
+	int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+	if (error)
+		goto out;
+
+	/*
+	 * Damage all but the known good copy by zeroing it.  This will
+	 * result in two or fewer unique copies per indirect_child_t.
+	 * Both may need to be checked in order to reconstruct the block.
+	 * Set iv->iv_attempts_max such that all unique combinations will
+	 * be enumerated, but limit the damage to at most 16 indirect splits.
+	 */
+	iv->iv_attempts_max = 1;
+
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+		for (int c = 0; c < is->is_children; c++) {
+			indirect_child_t *ic = &is->is_child[c];
+
+			if (ic == is->is_good_child)
+				continue;
+			if (ic->ic_data == NULL)
+				continue;
+
+			abd_zero(ic->ic_data, ic->ic_data->abd_size);
+		}
+
+		iv->iv_attempts_max *= 2;
+		if (iv->iv_attempts_max > (1ULL << 16)) {
+			iv->iv_attempts_max = UINT64_MAX;
+			break;
+		}
+	}
+
+out:
+	/* Empty the unique children lists so they can be reconstructed. */
+	for (indirect_split_t *is = list_head(&iv->iv_splits);
+	    is != NULL; is = list_next(&iv->iv_splits, is)) {
+		indirect_child_t *ic;
+		while ((ic = list_head(&is->is_unique_child)) != NULL)
+			list_remove(&is->is_unique_child, ic);
+
+		is->is_unique_children = 0;
+	}
+
+	return (error);
+}
+
 /*
 * This function is called when we have read all copies of the data and need
 * to try to find a combination of copies that gives us the right checksum.
@@ -1454,8 +1646,9 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
 *
 * We have to try every unique combination of copies of split segments, until
 * we find one that checksums correctly.  Duplicate segment copies are first
- * discarded as an optimization to reduce the search space.  After pruning
- * there will exist at most one valid combination.
+ * identified and later skipped during reconstruction.  This optimization
+ * reduces the search space and ensures that of the remaining combinations
+ * at most one is correct.
 *
 * When the total number of combinations is small they can all be checked.
 * For example, if we have 3 segments in the split, and each points to a
@@ -1486,10 +1679,10 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
 *  data_A_1  data_B_1  data_C_1
 *
 * Note that the split segments may be on the same or different top-level
- * vdevs.  In either case, we try lots of combinations (see
- * zfs_reconstruct_indirect_segments_max).  This ensures that if a mirror has
- * small silent errors on all of its children, we can still reconstruct the
- * correct data, as long as those errors are at sufficiently-separated
+ * vdevs.  In either case, we may need to try lots of combinations (see
+ * zfs_reconstruct_indirect_combinations_max).  This ensures that if a mirror
+ * has small silent errors on all of its children, we can still reconstruct
+ * the correct data, as long as those errors are at sufficiently-separated
 * offsets (specifically, separated by the largest block size - default of
 * 128KB, but up to 16MB).
 */
@@ -1497,139 +1690,89 @@ static void
 vdev_indirect_reconstruct_io_done(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
-	uint64_t attempts = 0;
-	uint64_t attempts_max = UINT64_MAX;
-	uint64_t combinations = 1;
+	boolean_t known_good = B_FALSE;
+	int error;
+
+	iv->iv_unique_combinations = 1;
+	iv->iv_attempts_max = UINT64_MAX;

 	if (zfs_reconstruct_indirect_combinations_max > 0)
-		attempts_max = zfs_reconstruct_indirect_combinations_max;
+		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;

 	/*
-	 * Discard duplicate copies of split segments to minimize the
-	 * number of unique combinations when attempting reconstruction.
+	 * If nonzero, every 1/x blocks will be damaged, in order to validate
+	 * reconstruction when there are split segments with damaged copies.
+	 * Known_good will be TRUE when reconstruction is known to be possible.
+	 */
+	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
+	    spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
+		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
+
+	/*
+	 * Determine the unique children for a split segment and add them
+	 * to the is_unique_child list.  By restricting reconstruction
+	 * to these children, only unique combinations will be considered.
+	 * This can vastly reduce the search space when there are a large
+	 * number of indirect splits.
 	 */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
-		uint64_t is_copies = 0;
+		is->is_unique_children = 0;

 		for (int i = 0; i < is->is_children; i++) {
-			if (is->is_child[i].ic_data == NULL)
+			indirect_child_t *ic_i = &is->is_child[i];
+
+			if (ic_i->ic_data == NULL ||
+			    ic_i->ic_duplicate != NULL)
 				continue;

 			for (int j = i + 1; j < is->is_children; j++) {
-				if (is->is_child[j].ic_data == NULL)
+				indirect_child_t *ic_j = &is->is_child[j];
+
+				if (ic_j->ic_data == NULL ||
+				    ic_j->ic_duplicate != NULL)
 					continue;

-				if (is->is_child[j].ic_duplicate == -1 &&
-				    abd_cmp(is->is_child[i].ic_data,
-				    is->is_child[j].ic_data) == 0) {
-					is->is_child[j].ic_duplicate = i;
-				}
+				if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
+					ic_j->ic_duplicate = ic_i;
 			}

-			is_copies++;
+			is->is_unique_children++;
+			list_insert_tail(&is->is_unique_child, ic_i);
 		}

-		/* Reconstruction is impossible, no valid is->is_child[] */
-		if (is_copies == 0) {
+		/* Reconstruction is impossible, no valid children */
+		EQUIV(list_is_empty(&is->is_unique_child),
+		    is->is_unique_children == 0);
+		if (list_is_empty(&is->is_unique_child)) {
 			zio->io_error = EIO;
 			vdev_indirect_all_checksum_errors(zio);
 			zio_checksum_verified(zio);
 			return;
 		}

-		combinations *= is_copies;
+		iv->iv_unique_combinations *= is->is_unique_children;
 	}
-	for (;;) {
-		/* copy data from splits to main zio */
-		int ret;
-		for (indirect_split_t *is = list_head(&iv->iv_splits);
-		    is != NULL; is = list_next(&iv->iv_splits, is)) {
-
-			/*
-			 * If this child failed, its ic_data will be NULL.
-			 * Skip this combination.
-			 */
-			if (is->is_child[is->is_good_child].ic_data == NULL) {
-				ret = EIO;
-				goto next;
-			}
-
-			/*
-			 * If this child is a duplicate, its is_duplicate will
-			 * refer to the primary copy.  Skip this combination.
-			 */
-			if (is->is_child[is->is_good_child].ic_duplicate >= 0) {
-				ret = ECKSUM;
-				goto next;
-			}
-
-			abd_copy_off(zio->io_abd,
-			    is->is_child[is->is_good_child].ic_data,
-			    is->is_split_offset, 0, is->is_size);
-		}
-
-		/* See if this checksum matches. */
-		zio_bad_cksum_t zbc;
-		ret = zio_checksum_error(zio, &zbc);
-		if (ret == 0) {
-			/* Found a matching checksum.  Issue repair i/os. */
-			vdev_indirect_repair(zio);
-			zio_checksum_verified(zio);
-			return;
-		}
-
+	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
+		error = vdev_indirect_splits_enumerate_all(iv, zio);
+	else
+		error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+
+	if (error != 0) {
+		/* All attempted combinations failed. */
+		ASSERT3B(known_good, ==, B_FALSE);
+		zio->io_error = error;
+		vdev_indirect_all_checksum_errors(zio);
+	} else {
 		/*
-		 * Checksum failed; try a different combination of split
-		 * children.
+		 * The checksum has been successfully validated.  Issue
+		 * repair I/Os to any copies of splits which don't match
+		 * the validated version.
 		 */
-		boolean_t more;
-next:
-		more = B_FALSE;
-		if (combinations <= attempts_max) {
-			/*
-			 * There are relatively few possible combinations, so
-			 * deterministically check them all.  We do this by
-			 * adding one to the first split's good_child.  If it
-			 * overflows, then "carry over" to the next split
-			 * (like counting in base is_children, but each
-			 * digit can have a different base).
-			 */
-			for (indirect_split_t *is = list_head(&iv->iv_splits);
-			    is != NULL; is = list_next(&iv->iv_splits, is)) {
-				is->is_good_child++;
-				if (is->is_good_child < is->is_children) {
-					more = B_TRUE;
-					break;
-				}
-				is->is_good_child = 0;
-			}
-		} else if (++attempts < attempts_max) {
-			/*
-			 * There are too many combinations to try all of them
-			 * in a reasonable amount of time, so try a fixed
-			 * number of random combinations, after which we'll
-			 * consider the block unrecoverable.
-			 */
-			for (indirect_split_t *is = list_head(&iv->iv_splits);
-			    is != NULL; is = list_next(&iv->iv_splits, is)) {
-				int c = spa_get_random(is->is_children);
-				while (is->is_child[c].ic_duplicate >= 0)
-					c = (c + 1) % is->is_children;
-				is->is_good_child = c;
-			}
-			more = B_TRUE;
-		}
-		if (!more) {
-			/* All combinations failed. */
-			zio->io_error = ret;
-			vdev_indirect_all_checksum_errors(zio);
-			zio_checksum_verified(zio);
-			return;
-		}
+		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
+		vdev_indirect_repair(zio);
+		zio_checksum_verified(zio);
 	}
 }