248 lines
8.9 KiB
Diff
248 lines
8.9 KiB
Diff
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||
|
From: George Wilson <george.wilson@delphix.com>
|
||
|
Date: Thu, 8 Feb 2018 15:04:14 -0500
|
||
|
Subject: [PATCH] OpenZFS 8857 - zio_remove_child() panic due to already
|
||
|
destroyed parent zio
|
||
|
MIME-Version: 1.0
|
||
|
Content-Type: text/plain; charset=UTF-8
|
||
|
Content-Transfer-Encoding: 8bit
|
||
|
|
||
|
PROBLEM
|
||
|
=======
|
||
|
It's possible for a parent zio to complete even though it has children
|
||
|
which have not completed. This can result in the following panic:
|
||
|
> $C
|
||
|
ffffff01809128c0 vpanic()
|
||
|
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
|
||
|
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
|
||
|
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
|
||
|
ffffff3373370908)
|
||
|
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
|
||
|
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
|
||
|
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
|
||
|
ffffff0180912b40 thread_start+8()
|
||
|
> ::status
|
||
|
debugging crash dump vmcore.2 (64-bit) from batfs0390
|
||
|
operating system: 5.11 joyent_20170911T171900Z (i86pc)
|
||
|
image uuid: (not set)
|
||
|
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
|
||
|
owner=ffffff3c59b39480 thread=ffffff0180912c40
|
||
|
dump content: kernel pages only
|
||
|
The problem is that dbuf_prefetch along with l2arc can create a zio tree
|
||
|
which confuses the parent zio and allows it to complete with while children
|
||
|
still exist. Here's the scenario:
|
||
|
zio tree:
|
||
|
pio
|
||
|
|--- lio
|
||
|
The parent zio, pio, has entered the zio_done stage and begins to check its
|
||
|
children to see there are still some that have not completed. In zio_done(),
|
||
|
the children are checked in the following order:
|
||
|
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
|
||
|
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
|
||
|
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
|
||
|
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
|
||
|
If pio, finds any child which has not completed then it stops executing and
|
||
|
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
|
||
|
while checking the particular child.
|
||
|
In this scenario, the pio has completed the first call to
|
||
|
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
|
||
|
the only zio in the zio tree right now is the logical zio, lio, then it
|
||
|
completes that call and prepares to check the next child type.
|
||
|
In the meantime, the lio completes and in its callback creates a child vdev
|
||
|
zio, cio. The zio tree looks like this:
|
||
|
zio tree:
|
||
|
pio
|
||
|
|--- lio
|
||
|
|--- cio
|
||
|
The lio then grabs the parent's io_lock and removes itself.
|
||
|
zio tree:
|
||
|
pio
|
||
|
|--- cio
|
||
|
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
|
||
|
and will erroneously complete. When the child zio, cio, completes it will panic
|
||
|
the system trying to reference the parent zio which has been destroyed.
|
||
|
SOLUTION
|
||
|
========
|
||
|
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
|
||
|
for all the children types that it's interested in checking. The
|
||
|
io_lock will is held the entire time we check all the children types. Since
|
||
|
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
|
||
|
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
|
||
|
the zio_wiat_for_children logic.
|
||
|
|
||
|
Authored by: George Wilson <george.wilson@delphix.com>
|
||
|
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
|
||
|
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
|
||
|
Reviewed by: Youzhong Yang <youzhong@gmail.com>
|
||
|
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||
|
Approved by: Dan McDonald <danmcd@omniti.com>
|
||
|
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
|
||
|
|
||
|
OpenZFS-issue: https://www.illumos.org/issues/8857
|
||
|
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
|
||
|
Issue #5918
|
||
|
Closes #7168
|
||
|
|
||
|
(cherry picked from commit 07ce5d739024cf9c638716c09f9934b4e629678c)
|
||
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
||
|
---
|
||
|
include/sys/zio.h | 11 +++++++++++
|
||
|
module/zfs/zio.c | 49 +++++++++++++++++++++++++++++--------------------
|
||
|
2 files changed, 40 insertions(+), 20 deletions(-)
|
||
|
|
||
|
diff --git a/include/sys/zio.h b/include/sys/zio.h
|
||
|
index 4eaabc38c..0d741f8e2 100644
|
||
|
--- a/include/sys/zio.h
|
||
|
+++ b/include/sys/zio.h
|
||
|
@@ -215,6 +215,9 @@ enum zio_flag {
|
||
|
(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
|
||
|
ZIO_FLAG_CANFAIL)
|
||
|
|
||
|
+#define ZIO_CHILD_BIT(x) (1 << (x))
|
||
|
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
|
||
|
+
|
||
|
enum zio_child {
|
||
|
ZIO_CHILD_VDEV = 0,
|
||
|
ZIO_CHILD_GANG,
|
||
|
@@ -223,6 +226,14 @@ enum zio_child {
|
||
|
ZIO_CHILD_TYPES
|
||
|
};
|
||
|
|
||
|
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
|
||
|
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
|
||
|
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
|
||
|
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
|
||
|
+#define ZIO_CHILD_ALL_BITS \
|
||
|
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
|
||
|
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
|
||
|
+
|
||
|
enum zio_wait_type {
|
||
|
ZIO_WAIT_READY = 0,
|
||
|
ZIO_WAIT_DONE,
|
||
|
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
|
||
|
index 1d69d8d8d..cd0a473e0 100644
|
||
|
--- a/module/zfs/zio.c
|
||
|
+++ b/module/zfs/zio.c
|
||
|
@@ -491,21 +491,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
|
||
|
}
|
||
|
|
||
|
static boolean_t
|
||
|
-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
|
||
|
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
|
||
|
{
|
||
|
- uint64_t *countp = &zio->io_children[child][wait];
|
||
|
boolean_t waiting = B_FALSE;
|
||
|
|
||
|
mutex_enter(&zio->io_lock);
|
||
|
ASSERT(zio->io_stall == NULL);
|
||
|
- if (*countp != 0) {
|
||
|
- zio->io_stage >>= 1;
|
||
|
- ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
||
|
- zio->io_stall = countp;
|
||
|
- waiting = B_TRUE;
|
||
|
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
|
||
|
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
|
||
|
+ continue;
|
||
|
+
|
||
|
+ uint64_t *countp = &zio->io_children[c][wait];
|
||
|
+ if (*countp != 0) {
|
||
|
+ zio->io_stage >>= 1;
|
||
|
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
||
|
+ zio->io_stall = countp;
|
||
|
+ waiting = B_TRUE;
|
||
|
+ break;
|
||
|
+ }
|
||
|
}
|
||
|
mutex_exit(&zio->io_lock);
|
||
|
-
|
||
|
return (waiting);
|
||
|
}
|
||
|
|
||
|
@@ -1296,9 +1301,10 @@ zio_write_compress(zio_t *zio)
|
||
|
* If our children haven't all reached the ready stage,
|
||
|
* wait for them and then repeat this pipeline stage.
|
||
|
*/
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
||
|
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
|
||
|
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
if (!IO_IS_ALLOCATING(zio))
|
||
|
return (ZIO_PIPELINE_CONTINUE);
|
||
|
@@ -2229,8 +2235,9 @@ zio_gang_issue(zio_t *zio)
|
||
|
{
|
||
|
blkptr_t *bp = zio->io_bp;
|
||
|
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
|
||
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
||
|
@@ -2561,8 +2568,9 @@ zio_ddt_read_done(zio_t *zio)
|
||
|
{
|
||
|
blkptr_t *bp = zio->io_bp;
|
||
|
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
ASSERT(BP_GET_DEDUP(bp));
|
||
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
||
|
@@ -3292,8 +3300,9 @@ zio_vdev_io_done(zio_t *zio)
|
||
|
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
|
||
|
boolean_t unexpected_error = B_FALSE;
|
||
|
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
|
||
|
|
||
|
@@ -3362,8 +3371,9 @@ zio_vdev_io_assess(zio_t *zio)
|
||
|
{
|
||
|
vdev_t *vd = zio->io_vd;
|
||
|
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
||
|
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
|
||
|
@@ -3578,9 +3588,10 @@ zio_ready(zio_t *zio)
|
||
|
zio_t *pio, *pio_next;
|
||
|
zio_link_t *zl = NULL;
|
||
|
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
||
|
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
|
||
|
+ ZIO_WAIT_READY)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
if (zio->io_ready) {
|
||
|
ASSERT(IO_IS_ALLOCATING(zio));
|
||
|
@@ -3721,11 +3732,9 @@ zio_done(zio_t *zio)
|
||
|
* If our children haven't all completed,
|
||
|
* wait for them and then repeat this pipeline stage.
|
||
|
*/
|
||
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
|
||
|
- zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
|
||
|
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
|
||
|
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
|
||
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
|
||
|
return (ZIO_PIPELINE_STOP);
|
||
|
+ }
|
||
|
|
||
|
/*
|
||
|
* If the allocation throttle is enabled, then update the accounting.
|
||
|
--
|
||
|
2.14.2
|
||
|
|