mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 03:09:34 +03:00
Fixed data integrity issue when underlying disk returns error
Errors in zil_lwb_write_done() are not propagated to zil_lwb_flush_vdevs_done() which can result in zil_commit_impl() not returning an error to applications even when zfs was not able to write data to the disk. Remove the ZIO_FLAG_DONT_PROPAGATE flag from zio_rewrite() to allow errors to propagate and consolidate the error handling for flush and write errors to a single location (rather than having error handling split between the "write done" and "flush done" handlers). Reviewed-by: George Wilson <gwilson@delphix.com> Reviewed-by: Prakash Surya <prakash.surya@delphix.com> Signed-off-by: Arun KV <arun.kv@datacore.com> Closes #12391 Closes #12443
This commit is contained in:
parent
695d4ae815
commit
f82f0279ed
@ -1178,6 +1178,20 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
|
||||
|
||||
ASSERT3P(zcw->zcw_lwb, ==, lwb);
|
||||
zcw->zcw_lwb = NULL;
|
||||
/*
|
||||
* We expect any ZIO errors from child ZIOs to have been
|
||||
* propagated "up" to this specific LWB's root ZIO, in
|
||||
* order for this error handling to work correctly. This
|
||||
* includes ZIO errors from either this LWB's write or
|
||||
* flush, as well as any errors from other dependent LWBs
|
||||
* (e.g. a root LWB ZIO that might be a child of this LWB).
|
||||
*
|
||||
* With that said, it's important to note that LWB flush
|
||||
* errors are not propagated up to the LWB root ZIO.
|
||||
* This is incorrect behavior, and results in VDEV flush
|
||||
* errors not being handled correctly here. See the
|
||||
* comment above the call to "zio_flush" for details.
|
||||
*/
|
||||
|
||||
zcw->zcw_zio_error = zio->io_error;
|
||||
|
||||
@ -1251,6 +1265,12 @@ zil_lwb_write_done(zio_t *zio)
|
||||
* nodes. We avoid calling zio_flush() since there isn't any
|
||||
* good reason for doing so, after the lwb block failed to be
|
||||
* written out.
|
||||
*
|
||||
* Additionally, we don't perform any further error handling at
|
||||
* this point (e.g. setting "zcw_zio_error" appropriately), as
|
||||
* we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
|
||||
* we expect any error seen here, to have been propagated to
|
||||
* that function).
|
||||
*/
|
||||
if (zio->io_error != 0) {
|
||||
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
|
||||
@ -1281,8 +1301,17 @@ zil_lwb_write_done(zio_t *zio)
|
||||
|
||||
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
|
||||
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
|
||||
if (vd != NULL)
|
||||
if (vd != NULL) {
|
||||
/*
|
||||
* The "ZIO_FLAG_DONT_PROPAGATE" is currently
|
||||
* always used within "zio_flush". This means,
|
||||
* any errors when flushing the vdev(s), will
|
||||
* (unfortunately) not be handled correctly,
|
||||
* since these "zio_flush" errors will not be
|
||||
* propagated up to "zil_lwb_flush_vdevs_done".
|
||||
*/
|
||||
zio_flush(lwb->lwb_root_zio, vd);
|
||||
}
|
||||
kmem_free(zv, sizeof (*zv));
|
||||
}
|
||||
}
|
||||
@ -1399,8 +1428,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
|
||||
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
|
||||
zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
|
||||
BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
|
||||
prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
|
||||
ZIO_FLAG_FASTWRITE, &zb);
|
||||
prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
|
||||
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
|
||||
|
||||
lwb->lwb_state = LWB_STATE_OPENED;
|
||||
|
Loading…
Reference in New Issue
Block a user