75b07eca3e
by importing the upstream release as patches. replace user namespace patch with version which has been applied usptream.
248 lines
8.9 KiB
Diff
248 lines
8.9 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: George Wilson <george.wilson@delphix.com>
|
|
Date: Thu, 8 Feb 2018 15:04:14 -0500
|
|
Subject: [PATCH] OpenZFS 8857 - zio_remove_child() panic due to already
|
|
destroyed parent zio
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
PROBLEM
|
|
=======
|
|
It's possible for a parent zio to complete even though it has children
|
|
which have not completed. This can result in the following panic:
|
|
> $C
|
|
ffffff01809128c0 vpanic()
|
|
ffffff01809128e0 mutex_panic+0x58(fffffffffb94c904, ffffff597dde7f80)
|
|
ffffff0180912950 mutex_vector_enter+0x347(ffffff597dde7f80)
|
|
ffffff01809129b0 zio_remove_child+0x50(ffffff597dde7c58, ffffff32bd901ac0,
|
|
ffffff3373370908)
|
|
ffffff0180912a40 zio_done+0x390(ffffff32bd901ac0)
|
|
ffffff0180912a70 zio_execute+0x78(ffffff32bd901ac0)
|
|
ffffff0180912b30 taskq_thread+0x2d0(ffffff33bae44140)
|
|
ffffff0180912b40 thread_start+8()
|
|
> ::status
|
|
debugging crash dump vmcore.2 (64-bit) from batfs0390
|
|
operating system: 5.11 joyent_20170911T171900Z (i86pc)
|
|
image uuid: (not set)
|
|
panic message: mutex_enter: bad mutex, lp=ffffff597dde7f80
|
|
owner=ffffff3c59b39480 thread=ffffff0180912c40
|
|
dump content: kernel pages only
|
|
The problem is that dbuf_prefetch along with l2arc can create a zio tree
|
|
which confuses the parent zio and allows it to complete with while children
|
|
still exist. Here's the scenario:
|
|
zio tree:
|
|
pio
|
|
|--- lio
|
|
The parent zio, pio, has entered the zio_done stage and begins to check its
|
|
children to see there are still some that have not completed. In zio_done(),
|
|
the children are checked in the following order:
|
|
zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)
|
|
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)
|
|
zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)
|
|
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)
|
|
If pio, finds any child which has not completed then it stops executing and
|
|
goes to sleep. Each call to zio_wait_for_children() will grab the io_lock
|
|
while checking the particular child.
|
|
In this scenario, the pio has completed the first call to
|
|
zio_wait_for_children() to check for any ZIO_CHILD_VDEV children. Since
|
|
the only zio in the zio tree right now is the logical zio, lio, then it
|
|
completes that call and prepares to check the next child type.
|
|
In the meantime, the lio completes and in its callback creates a child vdev
|
|
zio, cio. The zio tree looks like this:
|
|
zio tree:
|
|
pio
|
|
|--- lio
|
|
|--- cio
|
|
The lio then grabs the parent's io_lock and removes itself.
|
|
zio tree:
|
|
pio
|
|
|--- cio
|
|
The pio continues to run but has already completed its check for ZIO_CHILD_VDEV
|
|
and will erroneously complete. When the child zio, cio, completes it will panic
|
|
the system trying to reference the parent zio which has been destroyed.
|
|
SOLUTION
|
|
========
|
|
The fix is to rework the zio_wait_for_children() logic to accept a bitfield
|
|
for all the children types that it's interested in checking. The
|
|
io_lock will is held the entire time we check all the children types. Since
|
|
the function now accepts a bitfield, a simple ZIO_CHILD_BIT() macro is provided
|
|
to allow for the conversion between a ZIO_CHILD type and the bitfield used by
|
|
the zio_wiat_for_children logic.
|
|
|
|
Authored by: George Wilson <george.wilson@delphix.com>
|
|
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
|
|
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
|
|
Reviewed by: Youzhong Yang <youzhong@gmail.com>
|
|
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
Approved by: Dan McDonald <danmcd@omniti.com>
|
|
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>
|
|
|
|
OpenZFS-issue: https://www.illumos.org/issues/8857
|
|
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/862ff6d99c
|
|
Issue #5918
|
|
Closes #7168
|
|
|
|
(cherry picked from commit 07ce5d739024cf9c638716c09f9934b4e629678c)
|
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
|
---
|
|
include/sys/zio.h | 11 +++++++++++
|
|
module/zfs/zio.c | 49 +++++++++++++++++++++++++++++--------------------
|
|
2 files changed, 40 insertions(+), 20 deletions(-)
|
|
|
|
diff --git a/include/sys/zio.h b/include/sys/zio.h
|
|
index 4eaabc38c..0d741f8e2 100644
|
|
--- a/include/sys/zio.h
|
|
+++ b/include/sys/zio.h
|
|
@@ -215,6 +215,9 @@ enum zio_flag {
|
|
(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
|
|
ZIO_FLAG_CANFAIL)
|
|
|
|
+#define ZIO_CHILD_BIT(x) (1 << (x))
|
|
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
|
|
+
|
|
enum zio_child {
|
|
ZIO_CHILD_VDEV = 0,
|
|
ZIO_CHILD_GANG,
|
|
@@ -223,6 +226,14 @@ enum zio_child {
|
|
ZIO_CHILD_TYPES
|
|
};
|
|
|
|
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
|
|
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
|
|
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
|
|
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
|
|
+#define ZIO_CHILD_ALL_BITS \
|
|
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
|
|
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
|
|
+
|
|
enum zio_wait_type {
|
|
ZIO_WAIT_READY = 0,
|
|
ZIO_WAIT_DONE,
|
|
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
|
|
index 1d69d8d8d..cd0a473e0 100644
|
|
--- a/module/zfs/zio.c
|
|
+++ b/module/zfs/zio.c
|
|
@@ -491,21 +491,26 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
|
|
}
|
|
|
|
static boolean_t
|
|
-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
|
|
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
|
|
{
|
|
- uint64_t *countp = &zio->io_children[child][wait];
|
|
boolean_t waiting = B_FALSE;
|
|
|
|
mutex_enter(&zio->io_lock);
|
|
ASSERT(zio->io_stall == NULL);
|
|
- if (*countp != 0) {
|
|
- zio->io_stage >>= 1;
|
|
- ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
|
- zio->io_stall = countp;
|
|
- waiting = B_TRUE;
|
|
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
|
|
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
|
|
+ continue;
|
|
+
|
|
+ uint64_t *countp = &zio->io_children[c][wait];
|
|
+ if (*countp != 0) {
|
|
+ zio->io_stage >>= 1;
|
|
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
|
|
+ zio->io_stall = countp;
|
|
+ waiting = B_TRUE;
|
|
+ break;
|
|
+ }
|
|
}
|
|
mutex_exit(&zio->io_lock);
|
|
-
|
|
return (waiting);
|
|
}
|
|
|
|
@@ -1296,9 +1301,10 @@ zio_write_compress(zio_t *zio)
|
|
* If our children haven't all reached the ready stage,
|
|
* wait for them and then repeat this pipeline stage.
|
|
*/
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
|
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
|
|
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
if (!IO_IS_ALLOCATING(zio))
|
|
return (ZIO_PIPELINE_CONTINUE);
|
|
@@ -2229,8 +2235,9 @@ zio_gang_issue(zio_t *zio)
|
|
{
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
|
|
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
|
|
@@ -2561,8 +2568,9 @@ zio_ddt_read_done(zio_t *zio)
|
|
{
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
ASSERT(BP_GET_DEDUP(bp));
|
|
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
|
|
@@ -3292,8 +3300,9 @@ zio_vdev_io_done(zio_t *zio)
|
|
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
|
|
boolean_t unexpected_error = B_FALSE;
|
|
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
|
|
|
|
@@ -3362,8 +3371,9 @@ zio_vdev_io_assess(zio_t *zio)
|
|
{
|
|
vdev_t *vd = zio->io_vd;
|
|
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
|
|
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
|
|
@@ -3578,9 +3588,10 @@ zio_ready(zio_t *zio)
|
|
zio_t *pio, *pio_next;
|
|
zio_link_t *zl = NULL;
|
|
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
|
|
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
|
|
+ ZIO_WAIT_READY)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
if (zio->io_ready) {
|
|
ASSERT(IO_IS_ALLOCATING(zio));
|
|
@@ -3721,11 +3732,9 @@ zio_done(zio_t *zio)
|
|
* If our children haven't all completed,
|
|
* wait for them and then repeat this pipeline stage.
|
|
*/
|
|
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
|
|
- zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
|
|
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
|
|
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
|
|
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
|
|
return (ZIO_PIPELINE_STOP);
|
|
+ }
|
|
|
|
/*
|
|
* If the allocation throttle is enabled, then update the accounting.
|
|
--
|
|
2.14.2
|
|
|