Tag 2.1.0

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Tag 2.1.0-rc8
2026-05-23 19:04:45 +03:00 · 2021-07-02 11:04:34 -07:00 · 2021-06-29 13:19:19 -07:00 · 2021-06-29 13:18:58 -07:00 · 2021-06-29 13:15:21 -07:00 · 2021-06-29 13:15:04 -07:00
713 changed files with 39576 additions and 25645 deletions
@@ -2,7 +2,7 @@
 name: Bug report
 about: Create a report to help us improve OpenZFS
 title: ''
-labels: 'Type: Defect'
+labels: 'Type: Defect, Status: Triage Needed'
 assignees: ''

 ---
@@ -1,5 +1,8 @@
 blank_issues_enabled: false
 contact_links:
+  - name: OpenZFS Questions
+    url: https://github.com/openzfs/zfs/discussions/new
+    about: Ask the community for help
  - name: OpenZFS Community Support Mailing list (Linux)
    url: https://zfsonlinux.topicbox.com/groups/zfs-discuss
    about: Get community support for OpenZFS on Linux
@@ -7,5 +10,5 @@ contact_links:
    url: https://lists.freebsd.org/mailman/listinfo/freebsd-fs
    about: Get community support for OpenZFS on FreeBSD
  - name: OpenZFS on IRC
-    url: https://webchat.freenode.net/#openzfs
+    url: https://web.libera.chat/#openzfs
    about: Use IRC to get community support for OpenZFS
@@ -1,37 +0,0 @@
---
-name: Code Question
-about: Ask a question about the code
-title: ''
-labels: 'Type: Question'
-assignees: ''
-
---
-
-<!--
-Thank you for taking an interest in the OpenZFS codebase.
-
-Please be aware that most questions are preferably asked in the mailing list first.
-This form is primarily meant for asking questions about the code itself.
-
-Please also check our issue tracker before opening a new question.
-Filling out the following template will help other contributors better understand your question.
-->
-
-### Ask your question!
-
-<!--
-Please provide a clear and concise question.
-->
-
-### Which portion of the codebase does your question involve?
-
-<!--
-Optional: Please describe what portion of the codebase your issue involved.
-Example: "Testsuite", "Buildbots", "CLI", a code snippet etc.
-->
-
-### Additional context
-
-<!--
-Any additional information you want to add?
-->
@@ -28,14 +28,15 @@ https://openzfs.github.io/openzfs-docs/Developer%20Resources/Buildbot%20Options.
 - [ ] Performance enhancement (non-breaking change which improves efficiency)
 - [ ] Code cleanup (non-breaking change which makes code smaller or more readable)
 - [ ] Breaking change (fix or feature that would cause existing functionality to change)
+- [ ] Library ABI change (libzfs, libzfs\_core, libnvpair, libuutil and libzfsbootenv)
 - [ ] Documentation (a change to man pages or other documentation)

 ### Checklist:
 <!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
 <!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->
- [ ] My code follows the ZFS on Linux [code style requirements](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md#coding-conventions).
+- [ ] My code follows the OpenZFS [code style requirements](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md#coding-conventions).
 - [ ] I have updated the documentation accordingly.
- [ ] I have read the [**contributing** document](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md).
- [ ] I have added [tests](https://github.com/zfsonlinux/zfs/tree/master/tests) to cover my changes.
+- [ ] I have read the [**contributing** document](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md).
+- [ ] I have added [tests](https://github.com/openzfs/zfs/tree/master/tests) to cover my changes.
 - [ ] I have run the ZFS Test Suite with this change applied.
- [ ] All commit messages are properly formatted and contain [`Signed-off-by`](https://github.com/zfsonlinux/zfs/blob/master/.github/CONTRIBUTING.md#signed-off-by).
+- [ ] All commit messages are properly formatted and contain [`Signed-off-by`](https://github.com/openzfs/zfs/blob/master/.github/CONTRIBUTING.md#signed-off-by).
@@ -0,0 +1,13 @@
+# Configuration for probot-no-response - https://github.com/probot/no-response
+
+# Number of days of inactivity before an Issue is closed for lack of response
+daysUntilClose: 31
+# Label requiring a response
+responseRequiredLabel: "Status: Feedback requested"
+# Comment to post when closing an Issue for lack of response. Set to `false` to disable
+closeComment: >
+  This issue has been automatically closed because there has been no response
+  to our request for more information from the original author. With only the
+  information that is currently in the issue, we don't have enough information
+  to take action. Please reach out if you have or find the answers we need so
+  that we can investigate further.
@@ -7,7 +7,14 @@ only: issues
 # Issues with these labels will never be considered stale
 exemptLabels:
  - "Type: Feature"
-  - "Type: Understood"
+  - "Bot: Not Stale"
+  - "Status: Work in Progress"
+# Set to true to ignore issues in a project (defaults to false)
+exemptProjects: true
+# Set to true to ignore issues in a milestone (defaults to false)
+exemptMilestones: true
+# Set to true to ignore issues with an assignee (defaults to false)
+exemptAssignees: true
 # Label to use when marking an issue as stale
 staleLabel: "Status: Stale"
 # Comment to post when marking an issue as stale. Set to `false` to disable
@@ -15,3 +22,5 @@ markComment: >
  This issue has been automatically marked as "stale" because it has not had
  any activity for a while. It will be closed in 90 days if no further activity occurs. 
  Thank you for your contributions.
+# Limit the number of actions per hour, from 1-30. Default is 30
+limitPerRun: 6
@@ -18,12 +18,13 @@ jobs:
        sudo apt-get install --yes -qq zlib1g-dev uuid-dev libattr1-dev libblkid-dev libselinux-dev libudev-dev libssl-dev python-dev python-setuptools python-cffi python3 python3-dev python3-setuptools python3-cffi
        # packages for tests
        sudo apt-get install --yes -qq parted lsscsi ksh attr acl nfs-kernel-server fio
-        sudo apt-get install --yes -qq mandoc cppcheck pax-utils abigail-tools # devscripts - enable then bashisms fixed
+        sudo apt-get install --yes -qq mandoc cppcheck pax-utils devscripts abigail-tools
        sudo -E pip --quiet install flake8
    - name: Prepare
      run: |
        sh ./autogen.sh
        ./configure
+        make -j$(nproc)
    - name: Checkstyle
      run: |
        make checkstyle
@@ -32,5 +33,4 @@ jobs:
        make lint
    - name: CheckABI
      run: |
-        make -j$(nproc)
        make checkabi
@@ -1,3 +1,3 @@
 [submodule "scripts/zfs-images"]
 	path = scripts/zfs-images
-	url = https://github.com/zfsonlinux/zfs-images
+	url = https://github.com/openzfs/zfs-images
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.0.6
+Version:       2.1.0
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 5.14
+Linux-Maximum: 5.13
 Linux-Minimum: 3.10
@@ -1,3 +1,5 @@
+include $(top_srcdir)/config/Shellcheck.am
+
 ACLOCAL_AMFLAGS = -I config

 SUBDIRS = include
@@ -6,7 +8,7 @@ SUBDIRS += rpm
 endif

 if CONFIG_USER
-SUBDIRS += etc man scripts lib tests cmd contrib
+SUBDIRS += man scripts lib tests cmd etc contrib
 if BUILD_LINUX
 SUBDIRS += udev
 endif
@@ -26,8 +28,8 @@ endif
 AUTOMAKE_OPTIONS = foreign
 EXTRA_DIST  = autogen.sh copy-builtin
 EXTRA_DIST += config/config.awk config/rpm.am config/deb.am config/tgz.am
-EXTRA_DIST += META AUTHORS COPYRIGHT LICENSE NEWS NOTICE README.md
-EXTRA_DIST += CODE_OF_CONDUCT.md
+EXTRA_DIST += AUTHORS CODE_OF_CONDUCT.md COPYRIGHT LICENSE META NEWS NOTICE
+EXTRA_DIST += README.md RELEASES.md
 EXTRA_DIST += module/lua/README.zfs module/os/linux/spl/README.md

 # Include all the extra licensing information for modules
@@ -123,17 +125,8 @@ cstyle:

 filter_executable = -exec test -x '{}' \; -print

-PHONY += shellcheck
-shellcheck:
-	@if type shellcheck > /dev/null 2>&1; then \
-		shellcheck --exclude=SC1090 --exclude=SC1117 --format=gcc \
-			$$(find ${top_srcdir}/scripts/*.sh -type f) \
-			$$(find ${top_srcdir}/cmd/zed/zed.d/*.sh -type f) \
-			$$(find ${top_srcdir}/cmd/zpool/zpool.d/* \
-			-type f ${filter_executable}); \
-	else \
-		echo "skipping shellcheck because shellcheck is not installed"; \
-	fi
+SHELLCHECKDIRS = cmd contrib etc scripts tests
+SHELLCHECKSCRIPTS = autogen.sh

 PHONY += checkabi storeabi
 checkabi: lib
@@ -142,40 +135,9 @@ checkabi: lib
 storeabi: lib
 	$(MAKE) -C lib storeabi

-PHONY += checkbashisms
-checkbashisms:
-	@if type checkbashisms > /dev/null 2>&1; then \
-		checkbashisms -n -p -x \
-			$$(find ${top_srcdir} \
-				-name '.git' -prune \
-				-o -name 'build' -prune \
-				-o -name 'tests' -prune \
-				-o -name 'config' -prune \
-				-o -name 'zed-functions.sh*' -prune \
-				-o -name 'zfs-import*' -prune \
-				-o -name 'zfs-mount*' -prune \
-				-o -name 'zfs-zed*' -prune \
-				-o -name 'smart' -prune \
-				-o -name 'paxcheck.sh' -prune \
-				-o -name 'make_gitrev.sh' -prune \
-				-o -name '90zfs' -prune \
-				-o -type f ! -name 'config*' \
-				! -name 'libtool' \
-			-exec sh -c 'awk "NR==1 && /#!.*bin\/sh.*/ {print FILENAME;}" "{}"' \;); \
-	else \
-		echo "skipping checkbashisms because checkbashisms is not installed"; \
-	fi
-
 PHONY += mancheck
 mancheck:
-	@if type mandoc > /dev/null 2>&1; then \
-		find ${top_srcdir}/man/man8 -type f -name 'zfs.8' \
-			-o -name 'zpool.8' -o -name 'zdb.8' \
-			-o -name 'zgenhostid.8' | \
-			xargs mandoc -Tlint -Werror; \
-	else \
-		echo "skipping mancheck because mandoc is not installed"; \
-	fi
+	${top_srcdir}/scripts/mancheck.sh ${top_srcdir}/man ${top_srcdir}/tests/test-runner/man

 if BUILD_LINUX
 stat_fmt = -c '%A %n'
@@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197`

 # Supported Kernels
  * The `META` file contains the officially recognized supported Linux kernel versions.
-  * Supported FreeBSD versions are 12-STABLE and 13-CURRENT.
+  * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE.
@@ -0,0 +1,37 @@
+OpenZFS uses the MAJOR.MINOR.PATCH versioning scheme described here:
+
+  * MAJOR - Incremented at the discretion of the OpenZFS developers to indicate
+    a particularly noteworthy feature or change. An increase in MAJOR number
+    does not indicate any incompatible on-disk format change. The ability
+    to import a ZFS pool is controlled by the feature flags enabled on the
+    pool and the feature flags supported by the installed OpenZFS version.
+    Increasing the MAJOR version is expected to be an infrequent occurrence.
+
+  * MINOR - Incremented to indicate new functionality such as a new feature
+    flag, pool/dataset property, zfs/zpool sub-command, new user/kernel
+    interface, etc. MINOR releases may introduce incompatible changes to the
+    user space library APIs (libzfs.so). Existing user/kernel interfaces are
+    considered to be stable to maximize compatibility between OpenZFS releases.
+    Additions to the user/kernel interface are backwards compatible.
+
+  * PATCH - Incremented when applying documentation updates, important bug
+    fixes, minor performance improvements, and kernel compatibility patches.
+    The user space library APIs and user/kernel interface are considered to
+    be stable. PATCH releases for a MAJOR.MINOR are published as needed.
+
+Two release branches are maintained for OpenZFS:
+
+  * OpenZFS LTS - A designated MAJOR.MINOR release with periodic PATCH
+    releases that incorporate important changes backported from newer OpenZFS
+    releases. This branch is intended for use in environments using an
+    LTS, enterprise, or similarly managed kernel (RHEL, Ubuntu LTS, Debian).
+    Minor changes to support these distribution kernels will be applied as
+    needed. New kernel versions released after the OpenZFS LTS release are
+    not supported. LTS releases will receive patches for at least 2 years.
+    The current LTS release is OpenZFS 2.1.
+
+  * OpenZFS current - Tracks the newest MAJOR.MINOR release. This branch
+    includes support for the latest OpenZFS features and recently released
+    kernels.  When a new MINOR release is tagged the previous MINOR release
+    will no longer be maintained (unless it is an LTS release). New MINOR
+    releases are planned to occur roughly annually.
@@ -1,8 +1,14 @@
-SUBDIRS  = zfs zpool zdb zhack zinject zstream zstreamdump ztest
+include $(top_srcdir)/config/Shellcheck.am
+
+SUBDIRS  = zfs zpool zdb zhack zinject zstream ztest
 SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path
+SUBDIRS += zpool_influxdb

 CPPCHECKDIRS  = zfs zpool zdb zhack zinject zstream ztest
-CPPCHECKDIRS += raidz_test zfs_ids_to_path
+CPPCHECKDIRS += raidz_test zfs_ids_to_path zpool_influxdb
+
+# TODO: #12084: SHELLCHECKDIRS = fsck_zfs vdev_id zpool
+SHELLCHECKDIRS = fsck_zfs zpool

 if USING_PYTHON
 SUBDIRS += arcstat arc_summary dbufstat
@@ -11,6 +17,7 @@ endif
 if BUILD_LINUX
 SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait
 CPPCHECKDIRS += mount_zfs zed zgenhostid zvol_id
+SHELLCHECKDIRS += zed
 endif

 PHONY = cppcheck
@@ -102,18 +102,6 @@ show_tunable_descriptions = False
 alternate_tunable_layout = False


-def handle_Exception(ex_cls, ex, tb):
-    if ex is IOError:
-        if ex.errno == errno.EPIPE:
-            sys.exit()
-
-    if ex is KeyboardInterrupt:
-        sys.exit()
-
-
-sys.excepthook = handle_Exception
-
-
 def get_Kstat():
    """Collect information on the ZFS subsystem from the /proc virtual
    file system. The name "kstat" is a holdover from the Solaris utility
@@ -225,12 +213,30 @@ def get_arc_summary(Kstat):
    deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
    mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
    evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
+    evict_l2_cached = Kstat["kstat.zfs.misc.arcstats.evict_l2_cached"]
+    evict_l2_eligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible"]
+    evict_l2_eligible_mfu = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mfu"]
+    evict_l2_eligible_mru = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mru"]
+    evict_l2_ineligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_ineligible"]
+    evict_l2_skip = Kstat["kstat.zfs.misc.arcstats.evict_l2_skip"]

    # ARC Misc.
    output["arc_misc"] = {}
    output["arc_misc"]["deleted"] = fHits(deleted)
-    output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
-    output["arc_misc"]['evict_skips'] = fHits(evict_skip)
+    output["arc_misc"]["mutex_miss"] = fHits(mutex_miss)
+    output["arc_misc"]["evict_skips"] = fHits(evict_skip)
+    output["arc_misc"]["evict_l2_skip"] = fHits(evict_l2_skip)
+    output["arc_misc"]["evict_l2_cached"] = fBytes(evict_l2_cached)
+    output["arc_misc"]["evict_l2_eligible"] = fBytes(evict_l2_eligible)
+    output["arc_misc"]["evict_l2_eligible_mfu"] = {
+            'per': fPerc(evict_l2_eligible_mfu, evict_l2_eligible),
+            'num': fBytes(evict_l2_eligible_mfu),
+    }
+    output["arc_misc"]["evict_l2_eligible_mru"] = {
+            'per': fPerc(evict_l2_eligible_mru, evict_l2_eligible),
+            'num': fBytes(evict_l2_eligible_mru),
+    }
+    output["arc_misc"]["evict_l2_ineligible"] = fBytes(evict_l2_ineligible)

    # ARC Sizing
    arc_size = Kstat["kstat.zfs.misc.arcstats.size"]
@@ -346,8 +352,26 @@ def _arc_summary(Kstat):
    sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
    sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
                     arc['arc_misc']['mutex_miss'])
-    sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
+    sys.stdout.write("\tEviction Skips:\t\t\t\t%s\n" %
                     arc['arc_misc']['evict_skips'])
+    sys.stdout.write("\tEviction Skips Due to L2 Writes:\t%s\n" %
+                     arc['arc_misc']['evict_l2_skip'])
+    sys.stdout.write("\tL2 Cached Evictions:\t\t\t%s\n" %
+                     arc['arc_misc']['evict_l2_cached'])
+    sys.stdout.write("\tL2 Eligible Evictions:\t\t\t%s\n" %
+                     arc['arc_misc']['evict_l2_eligible'])
+    sys.stdout.write("\tL2 Eligible MFU Evictions:\t%s\t%s\n" % (
+                     arc['arc_misc']['evict_l2_eligible_mfu']['per'],
+                     arc['arc_misc']['evict_l2_eligible_mfu']['num'],
+                     )
+    )
+    sys.stdout.write("\tL2 Eligible MRU Evictions:\t%s\t%s\n" % (
+                     arc['arc_misc']['evict_l2_eligible_mru']['per'],
+                     arc['arc_misc']['evict_l2_eligible_mru']['num'],
+                     )
+    )
+    sys.stdout.write("\tL2 Ineligible Evictions:\t\t%s\n" %
+                     arc['arc_misc']['evict_l2_ineligible'])
    sys.stdout.write("\n")

    # ARC Sizing
@@ -683,6 +707,11 @@ def get_l2arc_summary(Kstat):
    l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"]
    l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"]
    l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"]
+    l2_mfu_asize = Kstat["kstat.zfs.misc.arcstats.l2_mfu_asize"]
+    l2_mru_asize = Kstat["kstat.zfs.misc.arcstats.l2_mru_asize"]
+    l2_prefetch_asize = Kstat["kstat.zfs.misc.arcstats.l2_prefetch_asize"]
+    l2_bufc_data_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_data_asize"]
+    l2_bufc_metadata_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_metadata_asize"]

    l2_access_total = (l2_hits + l2_misses)
    output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error)
@@ -705,7 +734,7 @@ def get_l2arc_summary(Kstat):
        output["io_errors"] = fHits(l2_io_error)

        output["l2_arc_size"] = {}
-        output["l2_arc_size"]["adative"] = fBytes(l2_size)
+        output["l2_arc_size"]["adaptive"] = fBytes(l2_size)
        output["l2_arc_size"]["actual"] = {
            'per': fPerc(l2_asize, l2_size),
            'num': fBytes(l2_asize)
@@ -714,6 +743,26 @@ def get_l2arc_summary(Kstat):
            'per': fPerc(l2_hdr_size, l2_size),
            'num': fBytes(l2_hdr_size),
        }
+        output["l2_arc_size"]["mfu_asize"] = {
+            'per': fPerc(l2_mfu_asize, l2_asize),
+            'num': fBytes(l2_mfu_asize),
+        }
+        output["l2_arc_size"]["mru_asize"] = {
+            'per': fPerc(l2_mru_asize, l2_asize),
+            'num': fBytes(l2_mru_asize),
+        }
+        output["l2_arc_size"]["prefetch_asize"] = {
+            'per': fPerc(l2_prefetch_asize, l2_asize),
+            'num': fBytes(l2_prefetch_asize),
+        }
+        output["l2_arc_size"]["bufc_data_asize"] = {
+            'per': fPerc(l2_bufc_data_asize, l2_asize),
+            'num': fBytes(l2_bufc_data_asize),
+        }
+        output["l2_arc_size"]["bufc_metadata_asize"] = {
+            'per': fPerc(l2_bufc_metadata_asize, l2_asize),
+            'num': fBytes(l2_bufc_metadata_asize),
+        }

        output["l2_arc_evicts"] = {}
        output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry)
@@ -778,7 +827,7 @@ def _l2arc_summary(Kstat):
        sys.stdout.write("\n")

        sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" %
-                         arc["l2_arc_size"]["adative"])
+                         arc["l2_arc_size"]["adaptive"])
        sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % (
            arc["l2_arc_size"]["actual"]["per"],
            arc["l2_arc_size"]["actual"]["num"],
@@ -789,11 +838,36 @@ def _l2arc_summary(Kstat):
            arc["l2_arc_size"]["head_size"]["num"],
            )
        )
+        sys.stdout.write("\tMFU Alloc. Size:\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["mfu_asize"]["per"],
+            arc["l2_arc_size"]["mfu_asize"]["num"],
+            )
+        )
+        sys.stdout.write("\tMRU Alloc. Size:\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["mru_asize"]["per"],
+            arc["l2_arc_size"]["mru_asize"]["num"],
+            )
+        )
+        sys.stdout.write("\tPrefetch Alloc. Size:\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["prefetch_asize"]["per"],
+            arc["l2_arc_size"]["prefetch_asize"]["num"],
+            )
+        )
+        sys.stdout.write("\tData (buf content) Alloc. Size:\t%s\t%s\n" % (
+            arc["l2_arc_size"]["bufc_data_asize"]["per"],
+            arc["l2_arc_size"]["bufc_data_asize"]["num"],
+            )
+        )
+        sys.stdout.write("\tMetadata (buf content) Size:\t%s\t%s\n" % (
+            arc["l2_arc_size"]["bufc_metadata_asize"]["per"],
+            arc["l2_arc_size"]["bufc_metadata_asize"]["num"],
+            )
+        )
        sys.stdout.write("\n")

        if arc["l2_arc_evicts"]['lock_retries'] != '0' or \
           arc["l2_arc_evicts"]["reading"] != '0':
-            sys.stdout.write("L2 ARC Evicts:\n")
+            sys.stdout.write("L2 ARC Evictions:\n")
            sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" %
                             arc["l2_arc_evicts"]['lock_retries'])
            sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" %
@@ -1051,48 +1125,55 @@ def main():
    global alternate_tunable_layout

    try:
-        opts, args = getopt.getopt(
-            sys.argv[1:],
-            "adp:h", ["alternate", "description", "page=", "help"]
-        )
-    except getopt.error as e:
-        sys.stderr.write("Error: %s\n" % e.msg)
-        usage()
-        sys.exit(1)
-
-    args = {}
-    for opt, arg in opts:
-        if opt in ('-a', '--alternate'):
-            args['a'] = True
-        if opt in ('-d', '--description'):
-            args['d'] = True
-        if opt in ('-p', '--page'):
-            args['p'] = arg
-        if opt in ('-h', '--help'):
-            usage()
-            sys.exit(0)
-
-    Kstat = get_Kstat()
-
-    alternate_tunable_layout = 'a' in args
-    show_tunable_descriptions = 'd' in args
-
-    pages = []
-
-    if 'p' in args:
        try:
-            pages.append(unSub[int(args['p']) - 1])
-        except IndexError:
-            sys.stderr.write('the argument to -p must be between 1 and ' +
-                             str(len(unSub)) + '\n')
+            opts, args = getopt.getopt(
+                sys.argv[1:],
+                "adp:h", ["alternate", "description", "page=", "help"]
+            )
+        except getopt.error as e:
+            sys.stderr.write("Error: %s\n" % e.msg)
+            usage()
            sys.exit(1)
-    else:
-        pages = unSub

-    zfs_header()
-    for page in pages:
-        page(Kstat)
-        sys.stdout.write("\n")
+        args = {}
+        for opt, arg in opts:
+            if opt in ('-a', '--alternate'):
+                args['a'] = True
+            if opt in ('-d', '--description'):
+                args['d'] = True
+            if opt in ('-p', '--page'):
+                args['p'] = arg
+            if opt in ('-h', '--help'):
+                usage()
+                sys.exit(0)
+
+        Kstat = get_Kstat()
+
+        alternate_tunable_layout = 'a' in args
+        show_tunable_descriptions = 'd' in args
+
+        pages = []
+
+        if 'p' in args:
+            try:
+                pages.append(unSub[int(args['p']) - 1])
+            except IndexError:
+                sys.stderr.write('the argument to -p must be between 1 and ' +
+                                 str(len(unSub)) + '\n')
+                sys.exit(1)
+        else:
+            pages = unSub
+
+        zfs_header()
+        for page in pages:
+            page(Kstat)
+            sys.stdout.write("\n")
+    except IOError as ex:
+        if (ex.errno == errno.EPIPE):
+            sys.exit(0)
+        raise
+    except KeyboardInterrupt:
+        sys.exit(0)


 if __name__ == '__main__':
@@ -42,6 +42,13 @@ import os
 import subprocess
 import sys
 import time
+import errno
+
+# We can't use env -S portably, and we need python3 -u to handle pipes in
+# the shell abruptly closing the way we want to, so...
+import io
+if isinstance(sys.__stderr__.buffer, io.BufferedWriter):
+    os.execv(sys.executable, [sys.executable, "-u"] + sys.argv)

 DESCRIPTION = 'Print ARC and other statistics for OpenZFS'
 INDENT = ' '*8
@@ -58,7 +65,6 @@ SECTION_PATHS = {'arc': 'arcstats',
                 'dmu': 'dmu_tx',
                 'l2arc': 'arcstats',  # L2ARC stuff lives in arcstats
                 'vdev': 'vdev_cache_stats',
-                 'xuio': 'xuio_stats',
                 'zfetch': 'zfetchstats',
                 'zil': 'zil'}

@@ -162,21 +168,11 @@ elif sys.platform.startswith('linux'):
        # The original arc_summary called /sbin/modinfo/{spl,zfs} to get
        # the version information. We switch to /sys/module/{spl,zfs}/version
        # to make sure we get what is really loaded in the kernel
-        command = ["cat", "/sys/module/{0}/version".format(request)]
-        req = request.upper()
-
-        # The recommended way to do this is with subprocess.run(). However,
-        # some installed versions of Python are < 3.5, so we offer them
-        # the option of doing it the old way (for now)
-        if 'run' in dir(subprocess):
-            info = subprocess.run(command, stdout=subprocess.PIPE,
-                                  universal_newlines=True)
-            version = info.stdout.strip()
-        else:
-            info = subprocess.check_output(command, universal_newlines=True)
-            version = info.strip()
-
-        return version
+        try:
+            with open("/sys/module/{}/version".format(request)) as f:
+                return f.read().strip()
+        except:
+            return "(unknown)"

    def get_descriptions(request):
        """Get the descriptions of the Solaris Porting Layer (SPL) or the
@@ -232,6 +228,29 @@ elif sys.platform.startswith('linux'):

        return descs

+def handle_unraisableException(exc_type, exc_value=None, exc_traceback=None,
+                               err_msg=None, object=None):
+   handle_Exception(exc_type, object, exc_traceback)
+
+def handle_Exception(ex_cls, ex, tb):
+    if ex_cls is KeyboardInterrupt:
+        sys.exit()
+
+    if ex_cls is BrokenPipeError:
+        # It turns out that while sys.exit() triggers an exception
+        # not handled message on Python 3.8+, os._exit() does not.
+        os._exit(0)
+
+    if ex_cls is OSError:
+      if ex.errno == errno.ENOTCONN:
+        sys.exit()
+
+    raise ex
+
+if hasattr(sys,'unraisablehook'): # Python 3.8+
+    sys.unraisablehook = handle_unraisableException
+sys.excepthook = handle_Exception
+

 def cleanup_line(single_line):
    """Format a raw line of data from /proc and isolate the name value
@@ -593,6 +612,20 @@ def section_arc(kstats_dict):
    prt_i1('Deleted:', f_hits(arc_stats['deleted']))
    prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss']))
    prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip']))
+    prt_i1('Eviction skips due to L2 writes:',
+           f_hits(arc_stats['evict_l2_skip']))
+    prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached']))
+    prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible']))
+    prt_i2('L2 eligible MFU evictions:',
+           f_perc(arc_stats['evict_l2_eligible_mfu'],
+           arc_stats['evict_l2_eligible']),
+           f_bytes(arc_stats['evict_l2_eligible_mfu']))
+    prt_i2('L2 eligible MRU evictions:',
+           f_perc(arc_stats['evict_l2_eligible_mru'],
+           arc_stats['evict_l2_eligible']),
+           f_bytes(arc_stats['evict_l2_eligible_mru']))
+    prt_i1('L2 ineligible evictions:',
+           f_bytes(arc_stats['evict_l2_ineligible']))
    print()


@@ -731,6 +764,21 @@ def section_l2arc(kstats_dict):
    prt_i2('Header size:',
           f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
           f_bytes(arc_stats['l2_hdr_size']))
+    prt_i2('MFU allocated size:',
+           f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']),
+           f_bytes(arc_stats['l2_mfu_asize']))
+    prt_i2('MRU allocated size:',
+           f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']),
+           f_bytes(arc_stats['l2_mru_asize']))
+    prt_i2('Prefetch allocated size:',
+           f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']),
+           f_bytes(arc_stats['l2_prefetch_asize']))
+    prt_i2('Data (buffer content) allocated size:',
+           f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']),
+           f_bytes(arc_stats['l2_bufc_data_asize']))
+    prt_i2('Metadata (buffer content) allocated size:',
+           f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']),
+           f_bytes(arc_stats['l2_bufc_metadata_asize']))

    print()
    prt_1('L2ARC breakdown:', f_hits(l2_access_total))
@@ -88,6 +88,12 @@ cols = {
    "mfug":       [4, 1000, "MFU ghost list hits per second"],
    "mrug":       [4, 1000, "MRU ghost list hits per second"],
    "eskip":      [5, 1000, "evict_skip per second"],
+    "el2skip":    [7, 1000, "evict skip, due to l2 writes, per second"],
+    "el2cach":    [7, 1024, "Size of L2 cached evictions per second"],
+    "el2el":      [5, 1024, "Size of L2 eligible evictions per second"],
+    "el2mfu":     [6, 1024, "Size of L2 eligible MFU evictions per second"],
+    "el2mru":     [6, 1024, "Size of L2 eligible MRU evictions per second"],
+    "el2inel":    [7, 1024, "Size of L2 ineligible evictions per second"],
    "mtxmis":     [6, 1000, "mutex_miss per second"],
    "dread":      [5, 1000, "Demand accesses per second"],
    "pread":      [5, 1000, "Prefetch accesses per second"],
@@ -96,6 +102,16 @@ cols = {
    "l2read":     [6, 1000, "Total L2ARC accesses per second"],
    "l2hit%":     [6, 100, "L2ARC access hit percentage"],
    "l2miss%":    [7, 100, "L2ARC access miss percentage"],
+    "l2pref":     [6, 1024, "L2ARC prefetch allocated size"],
+    "l2mfu":      [5, 1024, "L2ARC MFU allocated size"],
+    "l2mru":      [5, 1024, "L2ARC MRU allocated size"],
+    "l2data":     [6, 1024, "L2ARC data allocated size"],
+    "l2meta":     [6, 1024, "L2ARC metadata allocated size"],
+    "l2pref%":    [7, 100, "L2ARC prefetch percentage"],
+    "l2mfu%":     [6, 100, "L2ARC MFU percentage"],
+    "l2mru%":     [6, 100, "L2ARC MRU percentage"],
+    "l2data%":    [7, 100, "L2ARC data percentage"],
+    "l2meta%":    [7, 100, "L2ARC metadata percentage"],
    "l2asize":    [7, 1024, "Actual (compressed) size of the L2ARC"],
    "l2size":     [6, 1024, "Size of the L2ARC"],
    "l2bytes":    [7, 1024, "Bytes read per second from the L2ARC"],
@@ -463,6 +479,12 @@ def calculate():
    v["mrug"] = d["mru_ghost_hits"] / sint
    v["mfug"] = d["mfu_ghost_hits"] / sint
    v["eskip"] = d["evict_skip"] / sint
+    v["el2skip"] = d["evict_l2_skip"] / sint
+    v["el2cach"] = d["evict_l2_cached"] / sint
+    v["el2el"] = d["evict_l2_eligible"] / sint
+    v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint
+    v["el2mru"] = d["evict_l2_eligible_mru"] / sint
+    v["el2inel"] = d["evict_l2_ineligible"] / sint
    v["mtxmis"] = d["mutex_miss"] / sint

    if l2exist:
@@ -476,6 +498,17 @@ def calculate():
        v["l2size"] = cur["l2_size"]
        v["l2bytes"] = d["l2_read_bytes"] / sint

+        v["l2pref"] = cur["l2_prefetch_asize"]
+        v["l2mfu"] = cur["l2_mfu_asize"]
+        v["l2mru"] = cur["l2_mru_asize"]
+        v["l2data"] = cur["l2_bufc_data_asize"]
+        v["l2meta"] = cur["l2_bufc_metadata_asize"]
+        v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"]
+        v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"]
+        v["l2mru%"] = 100 * v["l2mru"] / v["l2asize"]
+        v["l2data%"] = 100 * v["l2data"] / v["l2asize"]
+        v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"]
+
    v["grow"] = 0 if cur["arc_no_grow"] else 1
    v["need"] = cur["arc_need_free"]
    v["free"] = cur["memory_free_bytes"]
@@ -0,0 +1 @@
+/fsck.zfs
@@ -1 +1,6 @@
+include $(top_srcdir)/config/Substfiles.am
+include $(top_srcdir)/config/Shellcheck.am
+
 dist_sbin_SCRIPTS = fsck.zfs
+
+SUBSTFILES += $(dist_sbin_SCRIPTS)
@@ -1,9 +0,0 @@
-#!/bin/sh
-#
-# fsck.zfs: A fsck helper to accommodate distributions that expect
-# to be able to execute a fsck on all filesystem types.  Currently
-# this script does nothing but it could be extended to act as a
-# compatibility wrapper for 'zpool scrub'.
-#
-
-exit 0
@@ -0,0 +1,44 @@
+#!/bin/sh
+#
+# fsck.zfs: A fsck helper to accommodate distributions that expect
+# to be able to execute a fsck on all filesystem types.
+#
+# This script simply bubbles up some already-known-about errors,
+# see fsck.zfs(8)
+#
+# Exit status is a bitmask accumulated over every dataset argument:
+#    4 - the dataset's pool reports DEGRADED health
+#    8 - the pool reports FAULTED and the dataset is listed in /etc/fstab
+#        with type "zfs", or the pool's health could not be obtained
+#   16 - no arguments were given
+#
+
+# At least one argument is required.
+if [ "$#" = "0" ]; then
+	echo "Usage: $0 [options] dataset…" >&2
+	exit 16
+fi
+
+ret=0
+for dataset in "$@"; do
+	# Arguments beginning with '-' are treated as options and ignored.
+	case "$dataset" in
+		-*)
+			continue
+			;;
+		*)
+			;;
+	esac
+
+	# The pool name is everything before the first '/'.
+	pool="${dataset%%/*}"
+
+	# Translate the pool's reported health into exit-status bits.
+	case "$(@sbindir@/zpool list -Ho health "$pool")" in
+		DEGRADED)
+			ret=$(( ret | 4 ))
+			;;
+		FAULTED)
+			# awk exits 1 as soon as a non-empty, non-comment fstab
+			# line names this dataset with filesystem type "zfs";
+			# only in that case is the failure reported.
+			awk '!/^([[:space:]]*#.*)?$/ && $1 == "'"$dataset"'" && $3 == "zfs" {exit 1}' /etc/fstab || \
+				ret=$(( ret | 8 ))
+			;;
+		"")
+			# Pool not found, error printed by zpool(8)
+			ret=$(( ret | 8 ))
+			;;
+		*)
+			# Any other health value leaves the status untouched.
+			;;
+	esac
+done
+
+exit "$ret"
@@ -185,10 +185,11 @@ main(int argc, char **argv)
 			break;
 		case 'h':
 		case '?':
-			(void) fprintf(stderr, gettext("Invalid option '%c'\n"),
-			    optopt);
+			if (optopt)
+				(void) fprintf(stderr,
+				    gettext("Invalid option '%c'\n"), optopt);
 			(void) fprintf(stderr, gettext("Usage: mount.zfs "
-			    "[-sfnv] [-o options] <dataset> <mountpoint>\n"));
+			    "[-sfnvh] [-o options] <dataset> <mountpoint>\n"));
 			return (MOUNT_USAGE);
 		}
 	}
@@ -31,8 +31,6 @@
 #include <sys/vdev_raidz_impl.h>
 #include <stdio.h>

-#include <sys/time.h>
-
 #include "raidz_test.h"

 #define	GEN_BENCH_MEMORY	(((uint64_t)1ULL)<<32)
@@ -83,8 +81,17 @@ run_gen_bench_impl(const char *impl)
 			/* create suitable raidz_map */
 			ncols = rto_opts.rto_dcols + fn + 1;
 			zio_bench.io_size = 1ULL << ds;
-			rm_bench = vdev_raidz_map_alloc(&zio_bench,
-			    BENCH_ASHIFT, ncols, fn+1);
+
+			if (rto_opts.rto_expand) {
+				rm_bench = vdev_raidz_map_alloc_expanded(
+				    zio_bench.io_abd,
+				    zio_bench.io_size, zio_bench.io_offset,
+				    rto_opts.rto_ashift, ncols+1, ncols,
+				    fn+1, rto_opts.rto_expand_offset);
+			} else {
+				rm_bench = vdev_raidz_map_alloc(&zio_bench,
+				    BENCH_ASHIFT, ncols, fn+1);
+			}

 			/* estimate iteration count */
 			iter_cnt = GEN_BENCH_MEMORY;
@@ -163,8 +170,16 @@ run_rec_bench_impl(const char *impl)
 			    (1ULL << BENCH_ASHIFT))
 				continue;

-			rm_bench = vdev_raidz_map_alloc(&zio_bench,
-			    BENCH_ASHIFT, ncols, PARITY_PQR);
+			if (rto_opts.rto_expand) {
+				rm_bench = vdev_raidz_map_alloc_expanded(
+				    zio_bench.io_abd,
+				    zio_bench.io_size, zio_bench.io_offset,
+				    BENCH_ASHIFT, ncols+1, ncols,
+				    PARITY_PQR, rto_opts.rto_expand_offset);
+			} else {
+				rm_bench = vdev_raidz_map_alloc(&zio_bench,
+				    BENCH_ASHIFT, ncols, PARITY_PQR);
+			}

 			/* estimate iteration count */
 			iter_cnt = (REC_BENCH_MEMORY);
@@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force)
 		(void) fprintf(stdout, DBLSEP "Running with options:\n"
 		    "  (-a) zio ashift                   : %zu\n"
 		    "  (-o) zio offset                   : 1 << %zu\n"
+		    "  (-e) expanded map                 : %s\n"
+		    "  (-r) reflow offset                : %llx\n"
 		    "  (-d) number of raidz data columns : %zu\n"
 		    "  (-s) size of DATA                 : 1 << %zu\n"
 		    "  (-S) sweep parameters             : %s \n"
 		    "  (-v) verbose                      : %s \n\n",
-		    opts->rto_ashift,			/* -a */
-		    ilog2(opts->rto_offset),		/* -o */
-		    opts->rto_dcols,			/* -d */
-		    ilog2(opts->rto_dsize),		/* -s */
-		    opts->rto_sweep ? "yes" : "no",	/* -S */
-		    verbose);				/* -v */
+		    opts->rto_ashift,				/* -a */
+		    ilog2(opts->rto_offset),			/* -o */
+		    opts->rto_expand ? "yes" : "no",		/* -e */
+		    (u_longlong_t)opts->rto_expand_offset,	/* -r */
+		    opts->rto_dcols,				/* -d */
+		    ilog2(opts->rto_dsize),			/* -s */
+		    opts->rto_sweep ? "yes" : "no",		/* -S */
+		    verbose);					/* -v */
 	}
 }

@@ -104,6 +108,8 @@ static void usage(boolean_t requested)
 	    "\t[-S parameter sweep (default: %s)]\n"
 	    "\t[-t timeout for parameter sweep test]\n"
 	    "\t[-B benchmark all raidz implementations]\n"
+	    "\t[-e use expanded raidz map (default: %s)]\n"
+	    "\t[-r expanded raidz map reflow offset (default: %llx)]\n"
 	    "\t[-v increase verbosity (default: %zu)]\n"
 	    "\t[-h (print help)]\n"
 	    "\t[-T test the test, see if failure would be detected]\n"
@@ -114,6 +120,8 @@ static void usage(boolean_t requested)
 	    o->rto_dcols,				/* -d */
 	    ilog2(o->rto_dsize),			/* -s */
 	    rto_opts.rto_sweep ? "yes" : "no",		/* -S */
+	    rto_opts.rto_expand ? "yes" : "no",		/* -e */
+	    (u_longlong_t)o->rto_expand_offset,		/* -r */
 	    o->rto_v);					/* -d */

 	exit(requested ? 0 : 1);
@@ -128,7 +136,7 @@ static void process_options(int argc, char **argv)

 	bcopy(&rto_opts_defaults, o, sizeof (*o));

-	while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
+	while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
 		value = 0;

 		switch (opt) {
@@ -136,6 +144,12 @@ static void process_options(int argc, char **argv)
 			value = strtoull(optarg, NULL, 0);
 			o->rto_ashift = MIN(13, MAX(9, value));
 			break;
+		case 'e':
+			o->rto_expand = 1;
+			break;
+		case 'r':
+			o->rto_expand_offset = strtoull(optarg, NULL, 0);
+			break;
 		case 'o':
 			value = strtoull(optarg, NULL, 0);
 			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
@@ -179,25 +193,34 @@ static void process_options(int argc, char **argv)
 	}
 }

-#define	DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
-#define	DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
+#define	DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
+#define	DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)

-#define	CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
-#define	CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
+#define	CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
+#define	CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)

 static int
 cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
 {
-	int i, ret = 0;
+	int r, i, ret = 0;

 	VERIFY(parity >= 1 && parity <= 3);

-	for (i = 0; i < parity; i++) {
-		if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
-		    != 0) {
-			ret++;
-			LOG_OPT(D_DEBUG, opts,
-			    "\nParity block [%d] different!\n", i);
+	for (r = 0; r < rm->rm_nrows; r++) {
+		raidz_row_t * const rr = rm->rm_row[r];
+		raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
+		for (i = 0; i < parity; i++) {
+			if (CODE_COL_SIZE(rrg, i) == 0) {
+				VERIFY0(CODE_COL_SIZE(rr, i));
+				continue;
+			}
+
+			if (abd_cmp(CODE_COL(rr, i),
+			    CODE_COL(rrg, i)) != 0) {
+				ret++;
+				LOG_OPT(D_DEBUG, opts,
+				    "\nParity block [%d] different!\n", i);
+			}
 		}
 	}
 	return (ret);
@@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
 static int
 cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
 {
-	int i, ret = 0;
-	int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
+	int r, i, dcols, ret = 0;

-	for (i = 0; i < dcols; i++) {
-		if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
-		    != 0) {
-			ret++;
+	for (r = 0; r < rm->rm_nrows; r++) {
+		raidz_row_t *rr = rm->rm_row[r];
+		raidz_row_t *rrg = opts->rm_golden->rm_row[r];
+		dcols = opts->rm_golden->rm_row[0]->rr_cols -
+		    raidz_parity(opts->rm_golden);
+		for (i = 0; i < dcols; i++) {
+			if (DATA_COL_SIZE(rrg, i) == 0) {
+				VERIFY0(DATA_COL_SIZE(rr, i));
+				continue;
+			}

-			LOG_OPT(D_DEBUG, opts,
-			    "\nData block [%d] different!\n", i);
+			if (abd_cmp(DATA_COL(rrg, i),
+			    DATA_COL(rr, i)) != 0) {
+				ret++;
+
+				LOG_OPT(D_DEBUG, opts,
+				    "\nData block [%d] different!\n", i);
+			}
 		}
 	}
 	return (ret);
@@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private)
 static void
 corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
 {
-	int i;
-	raidz_col_t *col;
-
-	for (i = 0; i < cnt; i++) {
-		col = &rm->rm_col[tgts[i]];
-		abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
+	for (int r = 0; r < rm->rm_nrows; r++) {
+		raidz_row_t *rr = rm->rm_row[r];
+		for (int i = 0; i < cnt; i++) {
+			raidz_col_t *col = &rr->rr_col[tgts[i]];
+			abd_iterate_func(col->rc_abd, 0, col->rc_size,
+			    init_rand, NULL);
+		}
 	}
 }

@@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)

 	VERIFY0(vdev_raidz_impl_set("original"));

-	opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
-	    opts->rto_ashift, total_ncols, parity);
-	rm_test = vdev_raidz_map_alloc(zio_test,
-	    opts->rto_ashift, total_ncols, parity);
+	if (opts->rto_expand) {
+		opts->rm_golden =
+		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
+		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
+		    opts->rto_ashift, total_ncols+1, total_ncols,
+		    parity, opts->rto_expand_offset);
+		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
+		    zio_test->io_size, zio_test->io_offset,
+		    opts->rto_ashift, total_ncols+1, total_ncols,
+		    parity, opts->rto_expand_offset);
+	} else {
+		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
+		    opts->rto_ashift, total_ncols, parity);
+		rm_test = vdev_raidz_map_alloc(zio_test,
+		    opts->rto_ashift, total_ncols, parity);
+	}

 	VERIFY(opts->zio_golden);
 	VERIFY(opts->rm_golden);
@@ -312,6 +358,187 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
 	return (err);
 }

+/*
+ * Allocate a multi-row raidz map for an I/O of "size" bytes at "offset".
+ *
+ * If reflow is not in progress, reflow_offset should be UINT64_MAX.
+ * For each row, if the row is entirely before reflow_offset, it will
+ * come from the new location.  Otherwise this row will come from the
+ * old location.  Therefore, rows that straddle the reflow_offset will
+ * come from the old location.
+ *
+ * Arguments:
+ *	abd		buffer the data columns are carved out of
+ *	size, offset	size and offset of the I/O, in bytes
+ *	ashift		log2 of the sector size
+ *	physical_cols	number of children used by rows at the new location;
+ *			rows at the old location use physical_cols - 1
+ *	logical_cols	number of columns (parity + data) in a full row
+ *	nparity		number of parity columns per row
+ *	reflow_offset	see above
+ *
+ * Each parity column gets its own newly allocated linear abd; each data
+ * column references one sector of "abd".  Columns past the end of the
+ * data in the last row have rc_size 0 and a NULL rc_abd.
+ *
+ * NOTE: Until raidz expansion is implemented this function is only
+ * needed by raidz_test.c to test the multi-row raidz_map_t functionality.
+ */
+raidz_map_t *
+vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
+    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+    uint64_t nparity, uint64_t reflow_offset)
+{
+	/* The zio's size in units of the vdev's minimum sector size. */
+	uint64_t s = size >> ashift;
+	uint64_t q, r, bc, devidx, asize = 0, tot;
+
+	/*
+	 * "Quotient": The number of data sectors for this stripe on all but
+	 * the "big column" child vdevs that also contain "remainder" data.
+	 * AKA "full rows"
+	 */
+	q = s / (logical_cols - nparity);
+
+	/*
+	 * "Remainder": The number of partial stripe data sectors in this I/O.
+	 * This will add a sector to some, but not all, child vdevs.
+	 */
+	r = s - q * (logical_cols - nparity);
+
+	/* The number of "big columns" - those which contain remainder data. */
+	bc = (r == 0 ? 0 : r + nparity);
+
+	/*
+	 * The total number of data and parity sectors associated with
+	 * this I/O.
+	 */
+	tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+	/* How many rows contain data (not skip) */
+	uint64_t rows = howmany(tot, logical_cols);
+	int cols = MIN(tot, logical_cols);
+
+	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+	    KM_SLEEP);
+	rm->rm_nrows = rows;
+
+	for (uint64_t row = 0; row < rows; row++) {
+		/*
+		 * Zero the row on allocation so that anything not assigned
+		 * below (e.g. rc_abdstruct of the parity and empty columns)
+		 * starts out in a known state rather than as heap garbage.
+		 */
+		raidz_row_t *rr = kmem_zalloc(offsetof(raidz_row_t,
+		    rr_col[cols]), KM_SLEEP);
+		rm->rm_row[row] = rr;
+
+		/* The starting RAIDZ (parent) vdev sector of the row. */
+		uint64_t b = (offset >> ashift) + row * logical_cols;
+
+		/*
+		 * If we are in the middle of a reflow, and any part of this
+		 * row has not been copied, then use the old location of
+		 * this row.
+		 */
+		int row_phys_cols = physical_cols;
+		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
+			row_phys_cols--;
+
+		/* starting child of this row */
+		uint64_t child_id = b % row_phys_cols;
+		/* The starting byte offset on each child vdev. */
+		uint64_t child_offset = (b / row_phys_cols) << ashift;
+
+		/*
+		 * We set cols to the entire width of the block, even
+		 * if this row is shorter.  This is needed because parity
+		 * generation (for Q and R) needs to know the entire width,
+		 * because it treats the short row as though it was
+		 * full-width (and the "phantom" sectors were zero-filled).
+		 *
+		 * Another approach to this would be to set cols shorter
+		 * (to just the number of columns that we might do i/o to)
+		 * and have another mechanism to tell the parity generation
+		 * about the "entire width".  Reconstruction (at least
+		 * vdev_raidz_reconstruct_general()) would also need to
+		 * know about the "entire width".
+		 */
+		rr->rr_cols = cols;
+		rr->rr_bigcols = bc;
+		rr->rr_missingdata = 0;
+		rr->rr_missingparity = 0;
+		rr->rr_firstdatacol = nparity;
+		rr->rr_abd_empty = NULL;
+		rr->rr_nempty = 0;
+
+		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
+			/* Wrap around to the first child, one sector down. */
+			if (child_id >= row_phys_cols) {
+				child_id -= row_phys_cols;
+				child_offset += 1ULL << ashift;
+			}
+			rr->rr_col[c].rc_devidx = child_id;
+			rr->rr_col[c].rc_offset = child_offset;
+			rr->rr_col[c].rc_orig_data = NULL;
+			rr->rr_col[c].rc_error = 0;
+			rr->rr_col[c].rc_tried = 0;
+			rr->rr_col[c].rc_skipped = 0;
+			rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+			/* Only meaningful once c >= rr_firstdatacol. */
+			uint64_t dc = c - rr->rr_firstdatacol;
+			if (c < rr->rr_firstdatacol) {
+				/* Parity column: gets its own buffer. */
+				rr->rr_col[c].rc_size = 1ULL << ashift;
+				rr->rr_col[c].rc_abd =
+				    abd_alloc_linear(rr->rr_col[c].rc_size,
+				    B_TRUE);
+			} else if (row == rows - 1 && bc != 0 && c >= bc) {
+				/*
+				 * Past the end of the data in the last row;
+				 * this empty column exists only so that
+				 * parity generation sees the full width.
+				 */
+				rr->rr_col[c].rc_size = 0;
+				rr->rr_col[c].rc_abd = NULL;
+			} else {
+				/*
+				 * "data column" (col excluding parity)
+				 *
+				 * "off" is the sector index into abd for
+				 * this row of this data column.  Sectors of
+				 * one column are consecutive in abd: each
+				 * of the first r data columns (the "big"
+				 * ones) contributes "rows" sectors and each
+				 * later one contributes "rows - 1".  When
+				 * r == 0 every column contributes "rows".
+				 */
+				uint64_t off;
+
+				if (c < bc || r == 0) {
+					off = dc * rows + row;
+				} else {
+					off = r * rows +
+					    (dc - r) * (rows - 1) + row;
+				}
+				rr->rr_col[c].rc_size = 1ULL << ashift;
+				rr->rr_col[c].rc_abd = abd_get_offset_struct(
+				    &rr->rr_col[c].rc_abdstruct,
+				    abd, off << ashift, 1ULL << ashift);
+			}
+
+			asize += rr->rr_col[c].rc_size;
+		}
+		/*
+		 * If all data stored spans all columns, there's a danger that
+		 * parity will always be on the same device and, since parity
+		 * isn't read during normal operation, that that device's I/O
+		 * bandwidth won't be used effectively. We therefore switch
+		 * the parity every 1MB.
+		 *
+		 * ...at least that was, ostensibly, the theory. As a practical
+		 * matter unless we juggle the parity between all devices
+		 * evenly, we won't see any benefit. Further, occasional writes
+		 * that aren't a multiple of the LCM of the number of children
+		 * and the minimum stripe width are sufficient to avoid pessimal
+		 * behavior. Unfortunately, this decision created an implicit
+		 * on-disk format requirement that we need to support for all
+		 * eternity, but only for single-parity RAID-Z.
+		 *
+		 * If we intend to skip a sector in the zeroth column for
+		 * padding we must make sure to note this swap. We will never
+		 * intend to skip the first column since at least one data and
+		 * one parity column must appear in each row.
+		 */
+		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+		    (offset & (1ULL << 20))) {
+			ASSERT(rr->rr_cols >= 2);
+			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+			/* Swap device and offset of the first two columns. */
+			devidx = rr->rr_col[0].rc_devidx;
+			uint64_t o = rr->rr_col[0].rc_offset;
+			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+			rr->rr_col[1].rc_devidx = devidx;
+			rr->rr_col[1].rc_offset = o;
+		}
+	}
+	/* Every data and parity sector must have been accounted for. */
+	ASSERT3U(asize, ==, tot << ashift);
+
+	/* init RAIDZ parity ops */
+	rm->rm_ops = vdev_raidz_math_get_ops();
+
+	return (rm);
+}
+
 static raidz_map_t *
 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 {
@@ -330,8 +557,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 	(*zio)->io_abd = raidz_alloc(alloc_dsize);
 	init_zio_abd(*zio);

-	rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
-	    total_ncols, parity);
+	if (opts->rto_expand) {
+		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
+		    (*zio)->io_size, (*zio)->io_offset,
+		    opts->rto_ashift, total_ncols+1, total_ncols,
+		    parity, opts->rto_expand_offset);
+	} else {
+		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
+		    total_ncols, parity);
+	}
 	VERIFY(rm);

 	/* Make sure code columns are destroyed */
@@ -420,7 +654,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
 	if (fn < RAIDZ_REC_PQ) {
 		/* can reconstruct 1 failed data disk */
 		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
-			if (x0 >= rm->rm_cols - raidz_parity(rm))
+			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
 				continue;

 			/* Check if should stop */
@@ -445,10 +679,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
 	} else if (fn < RAIDZ_REC_PQR) {
 		/* can reconstruct 2 failed data disk */
 		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
-			if (x0 >= rm->rm_cols - raidz_parity(rm))
+			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
 				continue;
 			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
-				if (x1 >= rm->rm_cols - raidz_parity(rm))
+				if (x1 >= rm->rm_row[0]->rr_cols -
+				    raidz_parity(rm))
 					continue;

 				/* Check if should stop */
@@ -475,14 +710,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
 	} else {
 		/* can reconstruct 3 failed data disk */
 		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
-			if (x0 >= rm->rm_cols - raidz_parity(rm))
+			if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
 				continue;
 			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
-				if (x1 >= rm->rm_cols - raidz_parity(rm))
+				if (x1 >= rm->rm_row[0]->rr_cols -
+				    raidz_parity(rm))
 					continue;
 				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
-					if (x2 >=
-					    rm->rm_cols - raidz_parity(rm))
+					if (x2 >= rm->rm_row[0]->rr_cols -
+					    raidz_parity(rm))
 						continue;

 					/* Check if should stop */
@@ -700,6 +936,8 @@ run_sweep(void)
 		opts->rto_dcols = dcols_v[d];
 		opts->rto_offset = (1 << ashift_v[a]) * rand();
 		opts->rto_dsize = size_v[s];
+		opts->rto_expand = rto_opts.rto_expand;
+		opts->rto_expand_offset = rto_opts.rto_expand_offset;
 		opts->rto_v = 0; /* be quiet */

 		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
@@ -732,6 +970,7 @@ exit:
 	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
 }

+
 int
 main(int argc, char **argv)
 {
@@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = {

 typedef struct raidz_test_opts {
 	size_t rto_ashift;
-	size_t rto_offset;
+	uint64_t rto_offset;
 	size_t rto_dcols;
 	size_t rto_dsize;
 	size_t rto_v;
 	size_t rto_sweep;
 	size_t rto_sweep_timeout;
 	size_t rto_benchmark;
+	size_t rto_expand;
+	uint64_t rto_expand_offset;
 	size_t rto_sanity;
 	size_t rto_gdb;

@@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = {
 	.rto_v = 0,
 	.rto_sweep = 0,
 	.rto_benchmark = 0,
+	.rto_expand = 0,
+	.rto_expand_offset = -1ULL,
 	.rto_sanity = 0,
 	.rto_gdb = 0,
 	.rto_should_stop = B_FALSE
@@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio);

 void run_raidz_benchmark(void);

+struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
+    uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
+
 #endif /* RAIDZ_TEST_H */
@@ -1 +1,3 @@
+include $(top_srcdir)/config/Shellcheck.am
+
 dist_udev_SCRIPTS = vdev_id
@@ -79,6 +79,34 @@
 # channel 86:00.0 1         A
 # channel 86:00.0 0         B

+# #
+# # Example vdev_id.conf - multipath / multijbod-daisychaining
+# #
+#
+# multipath yes
+# multijbod yes
+#
+# #       PCI_ID  HBA PORT  CHANNEL NAME
+# channel 85:00.0 1         A
+# channel 85:00.0 0         B
+# channel 86:00.0 1         A
+# channel 86:00.0 0         B
+
+# #
+# # Example vdev_id.conf - multipath / mixed
+# #
+#
+# multipath yes
+# slot mix
+#
+# #       PCI_ID  HBA PORT  CHANNEL NAME
+# channel 85:00.0 3         A
+# channel 85:00.0 2         B
+# channel 86:00.0 3         A
+# channel 86:00.0 2         B
+# channel af:00.0 0         C
+# channel af:00.0 1         C
+
 # #
 # # Example vdev_id.conf - alias
 # #
@@ -92,9 +120,10 @@ PATH=/bin:/sbin:/usr/bin:/usr/sbin
 CONFIG=/etc/zfs/vdev_id.conf
 PHYS_PER_PORT=
 DEV=
-MULTIPATH=
 TOPOLOGY=
 BAY=
+ENCL_ID=""
+UNIQ_ENCL_ID=""

 usage() {
 	cat << EOF
@@ -107,6 +136,7 @@ Usage: vdev_id [-h]
  -e    Create enclose device symlinks only (/dev/by-enclosure)
  -g    Storage network topology [default="$TOPOLOGY"]
  -m    Run in multipath mode
+  -j    Run in multijbod mode
  -p    number of phy's per switch port [default=$PHYS_PER_PORT]
  -h    show this summary
 EOF
@@ -117,12 +147,13 @@ map_slot() {
 	LINUX_SLOT=$1
 	CHANNEL=$2

-	MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \
-			\\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG`
+	MAPPED_SLOT=$(awk -v linux_slot="$LINUX_SLOT" -v channel="$CHANNEL" \
+			'$1 == "slot" && $2 == linux_slot && \
+			($4 ~ "^"channel"$" || $4 ~ /^$/) { print $3; exit}' $CONFIG)
 	if [ -z "$MAPPED_SLOT" ] ; then
 		MAPPED_SLOT=$LINUX_SLOT
 	fi
-	printf "%d" ${MAPPED_SLOT}
+	printf "%d" "${MAPPED_SLOT}"
 }

 map_channel() {
@@ -132,40 +163,120 @@ map_channel() {

 	case $TOPOLOGY in
 		"sas_switch")
-		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \
-			{ print \\$3; exit }" $CONFIG`
+		MAPPED_CHAN=$(awk -v port="$PORT" \
+			'$1 == "channel" && $2 == port \
+			{ print $3; exit }' $CONFIG)
 		;;
 		"sas_direct"|"scsi")
-		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \
-			\\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \
-			{ print \\$4; exit }" $CONFIG`
+		MAPPED_CHAN=$(awk -v pciID="$PCI_ID" -v port="$PORT" \
+			'$1 == "channel" && $2 == pciID && $3 == port \
+			{print $4}' $CONFIG)
 		;;
 	esac
-	printf "%s" ${MAPPED_CHAN}
+	printf "%s" "${MAPPED_CHAN}"
+}
+
+# get_encl_id: collect the id of every listed enclosure.
+#   $1 - whitespace-separated list of entry names under
+#        /sys/class/enclosure
+# For each name the content of /sys/class/enclosure/<name>/id is appended,
+# space-separated, to the global ENCL_ID.
+get_encl_id() {
+	# $1 is deliberately left unquoted so that it is split into names.
+	for d in $1; do
+		id=$(cat "/sys/class/enclosure/${d}/id")
+		ENCL_ID="${ENCL_ID} $id"
+	done
+}
+
+# get_uniq_encl_id: reduce ENCL_ID to its distinct entries.
+# Walks the space-separated ids in the global ENCL_ID and appends each id
+# that is not yet present to the global UNIQ_ENCL_ID, keeping the order of
+# first appearance.
+get_uniq_encl_id() {
+	for uuid in ${ENCL_ID}; do
+		found=0
+
+		# Linear search of the ids accepted so far.
+		for count in ${UNIQ_ENCL_ID}; do
+			if [ "$count" = "$uuid" ]; then
+				found=1
+				break
+			fi
+		done
+
+		if [ "$found" -eq 0 ]; then
+			UNIQ_ENCL_ID="${UNIQ_ENCL_ID} $uuid"
+		fi
+	done
+}
+
+# map_jbod explainer: The bsg driver knows the difference between a SAS
+# expander and fanout expander. Use hostX instance along with top-level
+# (whole enclosure) expander instances in /sys/class/enclosure and
+# matching a field in an array of expanders, using the index of the
+# matched array field as the enclosure instance, thereby making jbod IDs
+# dynamic. Avoids reliance on high overhead userspace commands like
+# multipath and lsscsi and instead uses existing sysfs data.  $HOSTCHAN
+# variable derived from devpath gymnastics in sas_handler() function.
+#
+#   $1 - block device name, looked up as /sys/block/<name>/device/
+# Prints the 1-based position, within the list of unique enclosure ids,
+# of the enclosure the device belongs to; prints 0 when none matches.
+map_jbod() {
+	# Take the device from the argument before it is used for the lookup.
+	DEV=$1
+	DEVEXP=$(ls -l "/sys/block/$DEV/device/" | grep enclos | awk -F/ '{print $(NF-1) }')
+	# Start from a defined value so the final printf always gets a number.
+	MAPPED_JBOD=0
+
+	# Use "set --" to create index values (Arrays)
+	set -- $(ls -l /sys/class/enclosure | grep -v "^total" | awk '{print $9}')
+	# Get count of total elements
+	JBOD_COUNT=$#
+	JBOD_ITEM=$*
+
+	# Build JBODs (enclosure)  id from sys/class/enclosure/<dev>/id
+	get_encl_id "$JBOD_ITEM"
+	# Different expander instances for each paths.
+	# Filter out and keep only unique id.
+	get_uniq_encl_id
+
+	# Identify final 'mapped jbod'
+	j=0
+	for count in ${UNIQ_ENCL_ID}; do
+		j=$((j + 1))
+		# Look for this device's enclosure entry carrying the current id.
+		for d in "$@"; do
+			id=$(cat "/sys/class/enclosure/${d}/id")
+			if [ "$d" = "$DEVEXP" ] && [ "$id" = "$count" ] ; then
+				MAPPED_JBOD=$j
+				break
+			fi
+		done
+	done
+
+	printf "%d" "${MAPPED_JBOD}"
+}

 sas_handler() {
 	if [ -z "$PHYS_PER_PORT" ] ; then
-		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
-			{print \\$2; exit}" $CONFIG`
+		PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \
+			{print $2; exit}' $CONFIG)
 	fi
 	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
-	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+
+	if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then
 		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
 		exit 1
 	fi

 	if [ -z "$MULTIPATH_MODE" ] ; then
-		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
-			{print \\$2; exit}" $CONFIG`
+		MULTIPATH_MODE=$(awk '$1 == "multipath" \
+			{print $2; exit}' $CONFIG)
+	fi
+
+	if [ -z "$MULTIJBOD_MODE" ] ; then
+		MULTIJBOD_MODE=$(awk '$1 == "multijbod" \
+			{print $2; exit}' $CONFIG)
 	fi

 	# Use first running component device if we're handling a dm-mpath device
 	if [ "$MULTIPATH_MODE" = "yes" ] ; then
 		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
 		if [ -z "$DM_NAME" ] ; then
-			DM_NAME=`ls -l --full-time /dev/mapper |
-				awk "/\/$DEV$/{print \\$9}"`
+			DM_NAME=$(ls -l --full-time /dev/mapper |
+				grep "$DEV"$ | awk '{print $9}')
 		fi

 		# For raw disks udev exports DEVTYPE=partition when
@@ -175,28 +286,50 @@ sas_handler() {
 		# we have to append the -part suffix directly in the
 		# helper.
 		if [ "$DEVTYPE" != "partition" ] ; then
-			PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+			# Match p[number], remove the 'p' and prepend "-part"
+			PART=$(echo "$DM_NAME" |
+				awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}')
 		fi

 		# Strip off partition information.
-		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+		DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//')
 		if [ -z "$DM_NAME" ] ; then
 			return
 		fi

-		# Get the raw scsi device name from multipath -ll. Strip off
-		# leading pipe symbols to make field numbering consistent.
-		DEV=`multipath -ll $DM_NAME |
-			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+		# Utilize DM device name to gather subordinate block devices
+		# using sysfs to avoid userspace utilities
+
+		# If our DEVNAME is something like /dev/dm-177, then we may be
+		# able to get our DMDEV from it.
+		DMDEV=$(echo $DEVNAME | sed 's;/dev/;;g')
+		if [ ! -e /sys/block/$DMDEV/slaves/* ] ; then
+			# It's not there, try looking in /dev/mapper
+			DMDEV=$(ls -l --full-time /dev/mapper | grep $DM_NAME |
+			awk '{gsub("../", " "); print $NF}')
+		fi
+
+		# Use sysfs pointers in /sys/block/dm-X/slaves because using
+		# userspace tools creates lots of overhead and should be avoided
+		# whenever possible. Use awk to isolate lowest instance of
+		# sd device member in dm device group regardless of string
+		# length.
+		DEV=$(ls "/sys/block/$DMDEV/slaves" | awk '
+			{ len=sprintf ("%20s",length($0)); gsub(/ /,0,str); a[NR]=len "_" $0; }
+			END {
+				asort(a)
+				print substr(a[1],22)
+			}')
+
 		if [ -z "$DEV" ] ; then
 			return
 		fi
 	fi

-	if echo $DEV | grep -q ^/devices/ ; then
+	if echo "$DEV" | grep -q ^/devices/ ; then
 		sys_path=$DEV
 	else
-		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+		sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null)
 	fi

 	# Use positional parameters as an ad-hoc array
@@ -206,84 +339,104 @@ sas_handler() {

 	# Get path up to /sys/.../hostX
 	i=1
-	while [ $i -le $num_dirs ] ; do
-		d=$(eval echo \${$i})
+
+	while [ $i -le "$num_dirs" ] ; do
+		d=$(eval echo '$'{$i})
 		scsi_host_dir="$scsi_host_dir/$d"
-		echo $d | grep -q -E '^host[0-9]+$' && break
-		i=$(($i + 1))
+		echo "$d" | grep -q -E '^host[0-9]+$' && break
+		i=$((i + 1))
 	done

-	if [ $i = $num_dirs ] ; then
+	# Lets grab the SAS host channel number and save it for JBOD sorting later
+	HOSTCHAN=$(echo "$d" | awk -F/ '{ gsub("host","",$NF); print $NF}')
+
+	if [ $i = "$num_dirs" ] ; then
 		return
 	fi

-	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+	PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}')

 	# In sas_switch mode, the directory four levels beneath
 	# /sys/.../hostX contains symlinks to phy devices that reveal
 	# the switch port number.  In sas_direct mode, the phy links one
 	# directory down reveal the HBA port.
 	port_dir=$scsi_host_dir
+
 	case $TOPOLOGY in
-		"sas_switch") j=$(($i + 4)) ;;
-		"sas_direct") j=$(($i + 1)) ;;
+		"sas_switch") j=$((i + 4)) ;;
+		"sas_direct") j=$((i + 1)) ;;
 	esac

-	i=$(($i + 1))
+	i=$((i + 1))
+
 	while [ $i -le $j ] ; do
-		port_dir="$port_dir/$(eval echo \${$i})"
-		i=$(($i + 1))
+		port_dir="$port_dir/$(eval echo '$'{$i})"
+		i=$((i + 1))
 	done

-	PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'`
+	PHY=$(ls -d "$port_dir"/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}')
 	if [ -z "$PHY" ] ; then
 		PHY=0
 	fi
-	PORT=$(( $PHY / $PHYS_PER_PORT ))
+	PORT=$((PHY / PHYS_PER_PORT))

 	# Look in /sys/.../sas_device/end_device-X for the bay_identifier
 	# attribute.
 	end_device_dir=$port_dir
-	while [ $i -lt $num_dirs ] ; do
-		d=$(eval echo \${$i})
+
+	while [ $i -lt "$num_dirs" ] ; do
+		d=$(eval echo '$'{$i})
 		end_device_dir="$end_device_dir/$d"
-		if echo $d | grep -q '^end_device' ; then
+		if echo "$d" | grep -q '^end_device' ; then
 			end_device_dir="$end_device_dir/sas_device/$d"
 			break
 		fi
-		i=$(($i + 1))
+		i=$((i + 1))
 	done

+	# Add 'mix' slot type for environments where dm-multipath devices
+	# include end-devices connected via SAS expanders or direct connection
+	# to SAS HBA. A mixed connectivity environment such as pool devices
+	# contained in a SAS JBOD and spare drives or log devices directly
+	# connected in a server backplane without expanders in the I/O path.
 	SLOT=
+
 	case $BAY in
 	"bay")
-		SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null`
+		SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null)
+		;;
+	"mix")
+		if [ $(cat "$end_device_dir/bay_identifier" 2>/dev/null) ] ; then
+			SLOT=$(cat "$end_device_dir/bay_identifier" 2>/dev/null)
+		else
+			SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null)
+		fi
 		;;
 	"phy")
-		SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null`
+		SLOT=$(cat "$end_device_dir/phy_identifier" 2>/dev/null)
 		;;
 	"port")
-		d=$(eval echo \${$i})
-		SLOT=`echo $d | sed -e 's/^.*://'`
+		d=$(eval echo '$'{$i})
+		SLOT=$(echo "$d" | sed -e 's/^.*://')
 		;;
 	"id")
-		i=$(($i + 1))
-		d=$(eval echo \${$i})
-		SLOT=`echo $d | sed -e 's/^.*://'`
+		i=$((i + 1))
+		d=$(eval echo '$'{$i})
+		SLOT=$(echo "$d" | sed -e 's/^.*://')
 		;;
 	"lun")
-		i=$(($i + 2))
-		d=$(eval echo \${$i})
-		SLOT=`echo $d | sed -e 's/^.*://'`
+		i=$((i + 2))
+		d=$(eval echo '$'{$i})
+		SLOT=$(echo "$d" | sed -e 's/^.*://')
 		;;
 	"ses")
 		# look for this SAS path in all SCSI Enclosure Services
 		# (SES) enclosures
-		sas_address=`cat $end_device_dir/sas_address 2>/dev/null`
-		enclosures=`lsscsi -g | \
-			sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'`
+		sas_address=$(cat "$end_device_dir/sas_address" 2>/dev/null)
+		enclosures=$(lsscsi -g | \
+			sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p')
 		for enclosure in $enclosures; do
-			set -- $(sg_ses -p aes $enclosure | \
+			set -- $(sg_ses -p aes "$enclosure" | \
 				awk "/device slot number:/{slot=\$12} \
 					/SAS address: $sas_address/\
 					{print slot}")
@@ -298,42 +451,55 @@ sas_handler() {
 		return
 	fi

-	CHAN=`map_channel $PCI_ID $PORT`
-	SLOT=`map_slot $SLOT $CHAN`
-	if [ -z "$CHAN" ] ; then
-		return
+	if [ "$MULTIJBOD_MODE" = "yes" ] ; then
+		CHAN=$(map_channel "$PCI_ID" "$PORT")
+		SLOT=$(map_slot "$SLOT" "$CHAN")
+		JBOD=$(map_jbod "$DEV")
+
+		if [ -z "$CHAN" ] ; then
+			return
+		fi
+		echo "${CHAN}"-"${JBOD}"-"${SLOT}${PART}"
+	else
+		CHAN=$(map_channel "$PCI_ID" "$PORT")
+		SLOT=$(map_slot "$SLOT" "$CHAN")
+
+		if [ -z "$CHAN" ] ; then
+			return
+		fi
+		echo "${CHAN}${SLOT}${PART}"
 	fi
-	echo ${CHAN}${SLOT}${PART}
 }

 scsi_handler() {
 	if [ -z "$FIRST_BAY_NUMBER" ] ; then
-		FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \
-			{print \\$2; exit}" $CONFIG`
+		FIRST_BAY_NUMBER=$(awk '$1 == "first_bay_number" \
+			{print $2; exit}' $CONFIG)
 	fi
 	FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0}

 	if [ -z "$PHYS_PER_PORT" ] ; then
-		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
-			{print \\$2; exit}" $CONFIG`
+		PHYS_PER_PORT=$(awk '$1 == "phys_per_port" \
+			{print $2; exit}' $CONFIG)
 	fi
 	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
-	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+
+	if ! echo "$PHYS_PER_PORT" | grep -q -E '^[0-9]+$' ; then
 		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
 		exit 1
 	fi

 	if [ -z "$MULTIPATH_MODE" ] ; then
-		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
-			{print \\$2; exit}" $CONFIG`
+		MULTIPATH_MODE=$(awk '$1 == "multipath" \
+			{print $2; exit}' $CONFIG)
 	fi

 	# Use first running component device if we're handling a dm-mpath device
 	if [ "$MULTIPATH_MODE" = "yes" ] ; then
 		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
 		if [ -z "$DM_NAME" ] ; then
-			DM_NAME=`ls -l --full-time /dev/mapper |
-				awk "/\/$DEV$/{print \\$9}"`
+			DM_NAME=$(ls -l --full-time /dev/mapper |
+				grep "$DEV"$ | awk '{print $9}')
 		fi

 		# For raw disks udev exports DEVTYPE=partition when
@@ -343,28 +509,30 @@ scsi_handler() {
 		# we have to append the -part suffix directly in the
 		# helper.
 		if [ "$DEVTYPE" != "partition" ] ; then
-			PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+			# Match p[number], remove the 'p' and prepend "-part"
+			PART=$(echo "$DM_NAME" |
+			    awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}')
 		fi

 		# Strip off partition information.
-		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+		DM_NAME=$(echo "$DM_NAME" | sed 's/p[0-9][0-9]*$//')
 		if [ -z "$DM_NAME" ] ; then
 			return
 		fi

 		# Get the raw scsi device name from multipath -ll. Strip off
 		# leading pipe symbols to make field numbering consistent.
-		DEV=`multipath -ll $DM_NAME |
-			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+		DEV=$(multipath -ll "$DM_NAME" |
+			awk '/running/{gsub("^[|]"," "); print $3 ; exit}')
 		if [ -z "$DEV" ] ; then
 			return
 		fi
 	fi

-	if echo $DEV | grep -q ^/devices/ ; then
+	if echo "$DEV" | grep -q ^/devices/ ; then
 		sys_path=$DEV
 	else
-		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+		sys_path=$(udevadm info -q path -p "/sys/block/$DEV" 2>/dev/null)
 	fi

 	# expect sys_path like this, for example:
@@ -377,44 +545,47 @@ scsi_handler() {

 	# Get path up to /sys/.../hostX
 	i=1
-	while [ $i -le $num_dirs ] ; do
-		d=$(eval echo \${$i})
+
+	while [ $i -le "$num_dirs" ] ; do
+		d=$(eval echo '$'{$i})
 		scsi_host_dir="$scsi_host_dir/$d"
-		echo $d | grep -q -E '^host[0-9]+$' && break
-		i=$(($i + 1))
+
+		echo "$d" | grep -q -E '^host[0-9]+$' && break
+		i=$((i + 1))
 	done

-	if [ $i = $num_dirs ] ; then
+	if [ $i = "$num_dirs" ] ; then
 		return
 	fi

-	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+	PCI_ID=$(eval echo '$'{$((i -1))} | awk -F: '{print $2":"$3}')

 	# In scsi mode, the directory two levels beneath
 	# /sys/.../hostX reveals the port and slot.
 	port_dir=$scsi_host_dir
-	j=$(($i + 2))
+	j=$((i + 2))

-	i=$(($i + 1))
+	i=$((i + 1))
 	while [ $i -le $j ] ; do
-		port_dir="$port_dir/$(eval echo \${$i})"
-		i=$(($i + 1))
+		port_dir="$port_dir/$(eval echo '$'{$i})"
+		i=$((i + 1))
 	done

-	set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/')
+	set -- $(echo "$port_dir" | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/')
 	PORT=$1
-	SLOT=$(($2 + $FIRST_BAY_NUMBER))
+	SLOT=$(($2 + FIRST_BAY_NUMBER))

 	if [ -z "$SLOT" ] ; then
 		return
 	fi

-	CHAN=`map_channel $PCI_ID $PORT`
-	SLOT=`map_slot $SLOT $CHAN`
+	CHAN=$(map_channel "$PCI_ID" "$PORT")
+	SLOT=$(map_slot "$SLOT" "$CHAN")
+
 	if [ -z "$CHAN" ] ; then
 		return
 	fi
-	echo ${CHAN}${SLOT}${PART}
+	echo "${CHAN}${SLOT}${PART}"
 }

 # Figure out the name for the enclosure symlink
@@ -425,7 +596,7 @@ enclosure_handler () {

 	# Get the enclosure ID ("0:0:0:0")
 	ENC=$(basename $(readlink -m "/sys/$DEVPATH/../.."))
-	if [ ! -d /sys/class/enclosure/$ENC ] ; then
+	if [ ! -d "/sys/class/enclosure/$ENC" ] ; then
 		# Not an enclosure, bail out
 		return
 	fi
@@ -433,14 +604,14 @@ enclosure_handler () {
 	# Get the long sysfs device path to our enclosure. Looks like:
 	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0

-	ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC)
+	ENC_DEVICE=$(readlink "/sys/class/enclosure/$ENC")

 	# Grab the full path to the hosts port dir:
 	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0
-	PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+')
+	PORT_DIR=$(echo "$ENC_DEVICE" | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+')

 	# Get the port number
-	PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$")
+	PORT_ID=$(echo "$PORT_DIR" | grep -Eo "[0-9]+$")

 	# The PCI directory is two directories up from the port directory
 	# /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0
@@ -450,8 +621,8 @@ enclosure_handler () {
 	PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g')

 	# Name our device according to vdev_id.conf (like "L0" or "U1").
-	NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \
-		\$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG)
+	# The awk program is single-quoted, so shell variables are not
+	# expanded inside it; hand PCI_ID and PORT_ID to awk with -v instead.
+	# The suffix is the number of earlier "channel" lines in the config
+	# that used the same channel name.
+	NAME=$(awk -v pci="$PCI_ID" -v port="$PORT_ID" \
+		'/channel/{if ($1 == "channel" && $2 == pci && $3 == port) \
+		{print $4 int(count[$4])}; count[$4]++}' "$CONFIG")

 	echo "${NAME}"
 }
@@ -487,9 +658,11 @@ alias_handler () {
 	#          ambiguity seems unavoidable, so devices using this facility
 	#          must not use such names.
 	DM_PART=
-	if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then
+	if echo "$DM_NAME" | grep -q -E 'p[0-9][0-9]*$' ; then
 		if [ "$DEVTYPE" != "partition" ] ; then
-			DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+			# Match p[number], remove the 'p' and prepend "-part"
+			DM_PART=$(echo "$DM_NAME" |
+			    awk 'match($0,/p[0-9]+$/) {print "-part"substr($0,RSTART+1,RLENGTH-1)}')
 		fi
 	fi

@@ -497,21 +670,25 @@ alias_handler () {
 	for link in $DEVLINKS ; do
 		# Remove partition information to match key of top-level device.
 		if [ -n "$DM_PART" ] ; then
-			link=`echo $link | sed 's/p[0-9][0-9]*$//'`
+			link=$(echo "$link" | sed 's/p[0-9][0-9]*$//')
 		fi
 		# Check both the fully qualified and the base name of link.
-		for l in $link `basename $link` ; do
-			alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \
-					{ print \\$2; exit }" $CONFIG`
-			if [ -n "$alias" ] ; then
-				echo ${alias}${DM_PART}
-				return
+		for l in $link $(basename "$link") ; do
+			if [ ! -z "$l" ]; then
+				alias=$(awk -v var="$l" '($1 == "alias") && \
+					($3 == var) \
+					{ print $2; exit }' $CONFIG)
+				if [ -n "$alias" ] ; then
+					echo "${alias}${DM_PART}"
+					return
+				fi
 			fi
 		done
 	done
 }

-while getopts 'c:d:eg:mp:h' OPTION; do
+# main
+while getopts 'c:d:eg:jmp:h' OPTION; do
 	case ${OPTION} in
 	c)
 		CONFIG=${OPTARG}
@@ -524,7 +701,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do
 	# create the enclosure device symlinks only.  We also need
 	# "enclosure_symlinks yes" set in vdev_id.config to actually create the
 	# symlink.
-	ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG)
+	ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") \
+		print $2}' "$CONFIG")
+
 	if [ "$ENCLOSURE_MODE" != "yes" ] ; then
 		exit 0
 	fi
@@ -535,6 +714,9 @@ while getopts 'c:d:eg:mp:h' OPTION; do
 	p)
 		PHYS_PER_PORT=${OPTARG}
 		;;
+	j)
+		MULTIJBOD_MODE=yes
+		;;
 	m)
 		MULTIPATH_MODE=yes
 		;;
@@ -544,7 +726,7 @@ while getopts 'c:d:eg:mp:h' OPTION; do
 	esac
 done

-if [ ! -r $CONFIG ] ; then
+if [ ! -r "$CONFIG" ] ; then
 	echo "Error: Config file \"$CONFIG\" not found"
 	exit 0
 fi
@@ -555,11 +737,11 @@ if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then
 fi

 if [ -z "$TOPOLOGY" ] ; then
-	TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG`
+	TOPOLOGY=$(awk '($1 == "topology") {print $2; exit}' "$CONFIG")
 fi

 if [ -z "$BAY" ] ; then
-	BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG`
+	BAY=$(awk '($1 == "slot") {print $2; exit}' "$CONFIG")
 fi

 TOPOLOGY=${TOPOLOGY:-sas_direct}
@@ -572,7 +754,7 @@ if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then
 	fi

 	# Just create the symlinks to the enclosure devices and then exit.
-	ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG)
+	ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' "$CONFIG")
 	if [ -z "$ENCLOSURE_PREFIX" ] ; then
 		ENCLOSURE_PREFIX="enc"
 	fi
@@ -582,16 +764,16 @@ if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then
 fi

 # First check if an alias was defined for this device.
-ID_VDEV=`alias_handler`
+ID_VDEV=$(alias_handler)

 if [ -z "$ID_VDEV" ] ; then
 	BAY=${BAY:-bay}
 	case $TOPOLOGY in
 		sas_direct|sas_switch)
-			ID_VDEV=`sas_handler`
+			ID_VDEV=$(sas_handler)
 			;;
 		scsi)
-			ID_VDEV=`scsi_handler`
+			ID_VDEV=$(scsi_handler)
 			;;
 		*)
 			echo "Error: unknown topology $TOPOLOGY"
@@ -31,6 +31,7 @@
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
+ * Copyright (c) 2021 Allan Jude
 */

 #include <stdio.h>
@@ -782,13 +783,14 @@ usage(void)
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
 	    "\t%s -O <dataset> <path>\n"
+	    "\t%s -r <dataset> <path> <destination>\n"
 	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
 	    "<poolname>\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
-	    cmdname, cmdname, cmdname);
+	    cmdname, cmdname, cmdname, cmdname);

 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
@@ -827,6 +829,7 @@ usage(void)
 	(void) fprintf(stderr, "        -m metaslabs\n");
 	(void) fprintf(stderr, "        -M metaslab groups\n");
 	(void) fprintf(stderr, "        -O perform object lookups by path\n");
+	(void) fprintf(stderr, "        -r copy an object by path to file\n");
 	(void) fprintf(stderr, "        -R read and display block from a "
 	    "device\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
@@ -1669,7 +1672,11 @@ dump_metaslab(metaslab_t *msp)
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}

-	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+	if (vd->vdev_ops == &vdev_draid_ops)
+		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
+	else
+		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
+
 	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
@@ -4229,6 +4236,8 @@ dump_l2arc_log_entries(uint64_t log_entries,
 		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
 		(void) printf("|\t\t\t\taddress: %llu\n",
 		    (u_longlong_t)le[j].le_daddr);
+		(void) printf("|\t\t\t\tARC state: %llu\n",
+		    (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));
 		(void) printf("|\n");
 	}
 	(void) printf("\n");
@@ -4511,7 +4520,7 @@ static char curpath[PATH_MAX];
 * for the last one.
 */
 static int
-dump_path_impl(objset_t *os, uint64_t obj, char *name)
+dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
 {
 	int err;
 	boolean_t header = B_TRUE;
@@ -4561,10 +4570,15 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
 	switch (doi.doi_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		if (s != NULL && *(s + 1) != '\0')
-			return (dump_path_impl(os, child_obj, s + 1));
+			return (dump_path_impl(os, child_obj, s + 1, retobj));
 		/*FALLTHROUGH*/
 	case DMU_OT_PLAIN_FILE_CONTENTS:
-		dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0);
+		if (retobj != NULL) {
+			*retobj = child_obj;
+		} else {
+			dump_object(os, child_obj, dump_opt['v'], &header,
+			    NULL, 0);
+		}
 		return (0);
 	default:
 		(void) fprintf(stderr, "object %llu has non-file/directory "
@@ -4579,7 +4593,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
 * Dump the blocks for the object specified by path inside the dataset.
 */
 static int
-dump_path(char *ds, char *path)
+dump_path(char *ds, char *path, uint64_t *retobj)
 {
 	int err;
 	objset_t *os;
@@ -4599,12 +4613,89 @@ dump_path(char *ds, char *path)

 	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);

-	err = dump_path_impl(os, root_obj, path);
+	err = dump_path_impl(os, root_obj, path, retobj);

 	close_objset(os, FTAG);
 	return (err);
 }

+/*
+ * Copy the contents of object 'srcobj' in objset 'os' into the file
+ * 'destfile', which is created if necessary and truncated.  The object size
+ * is taken from its ZPL_SIZE system attribute and the data is read through
+ * dmu_read() in chunks of at most 1 MiB.
+ *
+ * Returns 0 on success, otherwise an errno value: the error from the SA
+ * lookup or dmu_read(), EINVAL for a zero-length object, ENOMEM if the
+ * buffer cannot be allocated, the errno from open()/write()/close(), or
+ * EIO on a short write.
+ */
+static int
+zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
+{
+	int err = 0;
+	uint64_t size, readsize, oursize, offset;
+	ssize_t writesize;
+	sa_handle_t *hdl;
+
+	(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
+	    destfile);
+
+	VERIFY3P(os, ==, sa_os);
+	if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
+		(void) printf("Failed to get handle for SA znode\n");
+		return (err);
+	}
+	if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
+		(void) sa_handle_destroy(hdl);
+		return (err);
+	}
+	(void) sa_handle_destroy(hdl);
+
+	(void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
+	    size);
+	if (size == 0) {
+		return (EINVAL);
+	}
+
+	int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd == -1) {
+		/* Save errno before fprintf() has a chance to change it. */
+		err = errno;
+		(void) fprintf(stderr, "Failed to open %s: %s\n", destfile,
+		    strerror(err));
+		return (err);
+	}
+
+	/*
+	 * We cap the size at 1 mebibyte here to prevent
+	 * allocation failures and nigh-infinite printing if the
+	 * object is extremely large.
+	 */
+	oursize = MIN(size, 1 << 20);
+	offset = 0;
+	char *buf = kmem_alloc(oursize, KM_NOSLEEP);
+	if (buf == NULL) {
+		(void) close(fd);
+		return (ENOMEM);
+	}
+
+	while (offset < size) {
+		readsize = MIN(size - offset, 1 << 20);
+		err = dmu_read(os, srcobj, offset, readsize, buf, 0);
+		if (err != 0) {
+			(void) printf("got error %d from dmu_read\n", err);
+			break;
+		}
+		if (dump_opt['v'] > 3) {
+			(void) printf("Read offset=%" PRIu64 " size=%" PRIu64
+			    " error=%d\n", offset, readsize, err);
+		}
+
+		writesize = write(fd, buf, readsize);
+		if (writesize < 0) {
+			err = errno;
+			break;
+		} else if (writesize != readsize) {
+			/* Incomplete write: report it instead of returning 0 */
+			(void) fprintf(stderr, "Short write, only wrote %llu of"
+			    " %" PRIu64 " bytes, exiting...\n",
+			    (u_longlong_t)writesize, readsize);
+			err = EIO;
+			break;
+		}
+
+		offset += readsize;
+	}
+
+	/*
+	 * Every exit from the loop funnels through here so the descriptor
+	 * and buffer are always released.  A failing close() is only
+	 * reported when no earlier error is pending.
+	 */
+	if (close(fd) != 0 && err == 0)
+		err = errno;
+	kmem_free(buf, oursize);
+
+	return (err);
+}
+
 static int
 dump_label(const char *dev)
 {
@@ -5228,8 +5319,6 @@ zdb_blkptr_done(zio_t *zio)
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;

-	abd_free(zio->io_abd);
-
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
@@ -5256,6 +5345,8 @@ zdb_blkptr_done(zio_t *zio)
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
+
+	abd_free(zio->io_abd);
 }

 static int
@@ -5865,6 +5956,7 @@ zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 		 * metaslabs.  We want to set them up for
 		 * zio_claim().
 		 */
+		vdev_metaslab_group_create(vd);
 		VERIFY0(vdev_metaslab_init(vd, 0));

 		vdev_indirect_mapping_t *vim __maybe_unused =
@@ -5904,6 +5996,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 	 */
 	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;

 	zcb->zcb_vd_obsolete_counts =
 	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
@@ -6037,7 +6130,6 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
-		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;

 		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
 			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
@@ -6045,7 +6137,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)

 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
-			ASSERT3P(mg, ==, msp->ms_group);
+			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
+			    spa_embedded_log_class(spa)) ?
+			    vd->vdev_log_mg : vd->vdev_mg);

 			/*
 			 * ms_allocatable has been overloaded
@@ -6252,6 +6346,8 @@ dump_block_stats(spa_t *spa)
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
 	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+	zcb.zcb_totalasize +=
+	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);

@@ -6299,6 +6395,7 @@ dump_block_stats(spa_t *spa)

 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
+	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
@@ -6344,7 +6441,7 @@ dump_block_stats(spa_t *spa)
 	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);

-	if (spa_special_class(spa)->mc_rotor != NULL) {
+	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_special_class(spa));
 		uint64_t space = metaslab_class_get_space(
@@ -6355,7 +6452,7 @@ dump_block_stats(spa_t *spa)
 		    100.0 * alloc / space);
 	}

-	if (spa_dedup_class(spa)->mc_rotor != NULL) {
+	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_dedup_class(spa));
 		uint64_t space = metaslab_class_get_space(
@@ -6366,6 +6463,17 @@ dump_block_stats(spa_t *spa)
 		    100.0 * alloc / space);
 	}

+	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_embedded_log_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_embedded_log_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Embedded log class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb.zcb_embedded_blocks[i] == 0)
 			continue;
@@ -8172,6 +8280,7 @@ main(int argc, char **argv)
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int64_t objset_id = -1;
+	uint64_t object;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env, *objset_str;
@@ -8200,7 +8309,7 @@ main(int argc, char **argv)
 	zfs_btree_verify_intensity = 3;

 	while ((c = getopt(argc, argv,
-	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) {
+	    "AbcCdDeEFGhiI:klLmMo:Op:PqrRsSt:uU:vVx:XYyZ")) != -1) {
 		switch (c) {
 		case 'b':
 		case 'c':
@@ -8215,6 +8324,7 @@ main(int argc, char **argv)
 		case 'm':
 		case 'M':
 		case 'O':
+		case 'r':
 		case 'R':
 		case 's':
 		case 'S':
@@ -8304,7 +8414,7 @@ main(int argc, char **argv)
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
-	if (dump_opt['d']) {
+	if (dump_opt['d'] || dump_opt['r']) {
 		/* <pool>[/<dataset | objset id> is accepted */
 		if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL &&
 		    objset_str++ != NULL) {
@@ -8363,7 +8473,7 @@ main(int argc, char **argv)
 		verbose = MAX(verbose, 1);

 	for (c = 0; c < 256; c++) {
-		if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
+		if (dump_all && strchr("AeEFklLOPrRSXy", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
@@ -8399,7 +8509,13 @@ main(int argc, char **argv)
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
-		return (dump_path(argv[0], argv[1]));
+		return (dump_path(argv[0], argv[1], NULL));
+	}
+	if (dump_opt['r']) {
+		if (argc != 3)
+			usage();
+		dump_opt['v'] = verbose;
+		error = dump_path(argv[0], argv[1], &object);
+		/*
+		 * 'object' is only filled in when the path lookup succeeds.
+		 * Stop here on failure rather than later copying from an
+		 * uninitialized object number.
+		 */
+		if (error != 0)
+			return (error);
 	}

 	if (dump_opt['X'] || dump_opt['F'])
@@ -8577,7 +8693,9 @@ main(int argc, char **argv)

 	argv++;
 	argc--;
-	if (!dump_opt['R']) {
+	if (dump_opt['r']) {
+		error = zdb_copy_object(os, object, argv[1]);
+	} else if (!dump_opt['R']) {
 		flagbits['d'] = ZOR_FLAG_DIRECTORY;
 		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
 		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
@@ -1,8 +1,10 @@
 include $(top_srcdir)/config/Rules.am
+include $(top_srcdir)/config/Shellcheck.am

 AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)

 SUBDIRS = zed.d
+SHELLCHECKDIRS = $(SUBDIRS)

 sbin_PROGRAMS = zed

@@ -43,7 +45,7 @@ zed_LDADD = \
 	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
 	$(abs_top_builddir)/lib/libuutil/libuutil.la

-zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
+zed_LDADD += -lrt $(LIBATOMIC_LIBS) $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
 zed_LDFLAGS = -pthread

 EXTRA_DIST = agents/README.md
@@ -13,6 +13,7 @@
 /*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 */

 #include <libnvpair.h>
@@ -211,12 +212,18 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
 		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
 		 * ZFS_EV_POOL_GUID may be missing so find them.
 		 */
-		(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
-		    &search.gs_devid);
-		(void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
-		pool_guid = search.gs_pool_guid;
-		vdev_guid = search.gs_vdev_guid;
-		devtype = search.gs_vdev_type;
+		if (pool_guid == 0 || vdev_guid == 0) {
+			if ((nvlist_lookup_string(nvl, DEV_IDENTIFIER,
+			    &search.gs_devid) == 0) &&
+			    (zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search)
+			    == 1)) {
+				if (pool_guid == 0)
+					pool_guid = search.gs_pool_guid;
+				if (vdev_guid == 0)
+					vdev_guid = search.gs_vdev_guid;
+				devtype = search.gs_vdev_type;
+			}
+		}

 		/*
 		 * We want to avoid reporting "remove" events coming from
@@ -385,6 +392,7 @@ zfs_agent_init(libzfs_handle_t *zfs_hdl)
 		list_destroy(&agent_events);
 		zed_log_die("Failed to initialize agents");
 	}
+	pthread_setname_np(g_agents_tid, "agents");
 }

 void
@@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 		return;
 	}

-	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+	/*
+	 * Prefer sequential resilvering when supported (mirrors and dRAID),
+	 * otherwise fallback to a traditional healing resilver.
+	 */
+	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
+	if (ret != 0) {
+		ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
+		    B_TRUE, B_FALSE);
+	}

 	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
 	    fullpath, path, (ret == 0) ? "no errors" :
@@ -910,6 +918,7 @@ zfs_slm_init()
 		return (-1);
 	}

+	pthread_setname_np(g_zfs_tid, "enum-pools");
 	list_create(&g_device_list, sizeof (struct pendingdev),
 	    offsetof(struct pendingdev, pd_node));

@@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 	 * replace it.
 	 */
 	for (s = 0; s < nspares; s++) {
-		char *spare_name;
+		boolean_t rebuild = B_FALSE;
+		char *spare_name, *type;

 		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
 		    &spare_name) != 0)
 			continue;

+		/* prefer sequential resilvering for distributed spares */
+		if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
+		    &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+			rebuild = B_TRUE;
+
 		/* if set, add the "ashift" pool property to the spare nvlist */
 		if (source != ZPROP_SRC_DEFAULT)
 			(void) nvlist_add_uint64(spares[s],
@@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
 		    dev_name, basename(spare_name));

 		if (zpool_vdev_attach(zhp, dev_name, spare_name,
-		    replacement, B_TRUE, B_FALSE) == 0) {
+		    replacement, B_TRUE, rebuild) == 0) {
 			free(dev_name);
 			nvlist_free(replacement);
 			return (B_TRUE);
@@ -328,7 +334,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 	 */
 	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
 	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
-	    state == VDEV_STATE_REMOVED)) {
+	    (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) {
 		char *devtype;
 		char *devname;

@@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 		 * Attempt to substitute a hot spare.
 		 */
 		(void) replace_with_spare(hdl, zhp, vdev);
+
 		zpool_close(zhp);
 	}

@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -60,8 +60,8 @@ _setup_sig_handlers(void)
 		zed_log_die("Failed to initialize sigset");

 	sa.sa_flags = SA_RESTART;
-	sa.sa_handler = SIG_IGN;

+	sa.sa_handler = SIG_IGN;
 	if (sigaction(SIGPIPE, &sa, NULL) < 0)
 		zed_log_die("Failed to ignore SIGPIPE");

@@ -75,6 +75,10 @@ _setup_sig_handlers(void)
 	sa.sa_handler = _hup_handler;
 	if (sigaction(SIGHUP, &sa, NULL) < 0)
 		zed_log_die("Failed to register SIGHUP handler");
+
+	/*
+	 * Add SIGCHLD to the mask and block that set in the calling thread.
+	 * pthread_sigmask() returns 0 on success and a positive error number
+	 * on failure (it does not return -1), so test against zero.
+	 */
+	(void) sigaddset(&sa.sa_mask, SIGCHLD);
+	if (pthread_sigmask(SIG_BLOCK, &sa.sa_mask, NULL) != 0)
+		zed_log_die("Failed to block SIGCHLD");
 }

 /*
@@ -212,22 +216,20 @@ _finish_daemonize(void)
 int
 main(int argc, char *argv[])
 {
-	struct zed_conf *zcp;
+	struct zed_conf zcp;
 	uint64_t saved_eid;
 	int64_t saved_etime[2];

 	zed_log_init(argv[0]);
 	zed_log_stderr_open(LOG_NOTICE);
-	zcp = zed_conf_create();
-	zed_conf_parse_opts(zcp, argc, argv);
-	if (zcp->do_verbose)
+	zed_conf_init(&zcp);
+	zed_conf_parse_opts(&zcp, argc, argv);
+	if (zcp.do_verbose)
 		zed_log_stderr_open(LOG_INFO);

 	if (geteuid() != 0)
 		zed_log_die("Must be run as root");

-	zed_conf_parse_file(zcp);
-
 	zed_file_close_from(STDERR_FILENO + 1);

 	(void) umask(0);
@@ -235,32 +237,32 @@ main(int argc, char *argv[])
 	if (chdir("/") < 0)
 		zed_log_die("Failed to change to root directory");

-	if (zed_conf_scan_dir(zcp) < 0)
+	if (zed_conf_scan_dir(&zcp) < 0)
 		exit(EXIT_FAILURE);

-	if (!zcp->do_foreground) {
+	if (!zcp.do_foreground) {
 		_start_daemonize();
 		zed_log_syslog_open(LOG_DAEMON);
 	}
 	_setup_sig_handlers();

-	if (zcp->do_memlock)
+	if (zcp.do_memlock)
 		_lock_memory();

-	if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force))
+	if ((zed_conf_write_pid(&zcp) < 0) && (!zcp.do_force))
 		exit(EXIT_FAILURE);

-	if (!zcp->do_foreground)
+	if (!zcp.do_foreground)
 		_finish_daemonize();

 	zed_log_msg(LOG_NOTICE,
 	    "ZFS Event Daemon %s-%s (PID %d)",
 	    ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid());

-	if (zed_conf_open_state(zcp) < 0)
+	if (zed_conf_open_state(&zcp) < 0)
 		exit(EXIT_FAILURE);

-	if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0)
+	if (zed_conf_read_state(&zcp, &saved_eid, saved_etime) < 0)
 		exit(EXIT_FAILURE);

 idle:
@@ -269,24 +271,24 @@ idle:
 	 * successful.
 	 */
 	do {
-		if (!zed_event_init(zcp))
+		if (!zed_event_init(&zcp))
 			break;
 		/* Wait for some time and try again. tunable? */
 		sleep(30);
-	} while (!_got_exit && zcp->do_idle);
+	} while (!_got_exit && zcp.do_idle);

 	if (_got_exit)
 		goto out;

-	zed_event_seek(zcp, saved_eid, saved_etime);
+	zed_event_seek(&zcp, saved_eid, saved_etime);

 	while (!_got_exit) {
 		int rv;
 		if (_got_hup) {
 			_got_hup = 0;
-			(void) zed_conf_scan_dir(zcp);
+			(void) zed_conf_scan_dir(&zcp);
 		}
-		rv = zed_event_service(zcp);
+		rv = zed_event_service(&zcp);

 		/* ENODEV: When kernel module is unloaded (osx) */
 		if (rv == ENODEV)
@@ -294,13 +296,13 @@ idle:
 	}

 	zed_log_msg(LOG_NOTICE, "Exiting");
-	zed_event_fini(zcp);
+	zed_event_fini(&zcp);

-	if (zcp->do_idle && !_got_exit)
+	if (zcp.do_idle && !_got_exit)
 		goto idle;

 out:
-	zed_conf_destroy(zcp);
+	zed_conf_destroy(&zcp);
 	zed_log_fini();
 	exit(EXIT_SUCCESS);
 }
@@ -1,5 +1,6 @@
 include $(top_srcdir)/config/Rules.am
 include $(top_srcdir)/config/Substfiles.am
+include $(top_srcdir)/config/Shellcheck.am

 EXTRA_DIST += README

@@ -51,3 +52,6 @@ install-data-hook:
 	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
 	done
 	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
+
+# False positive: 1>&"${ZED_FLOCK_FD}" looks suspiciously similar to a >&filename bash extension
+CHECKBASHISMS_IGNORE = -e 'should be >word 2>&1' -e '&"$${ZED_FLOCK_FD}"'
@@ -12,15 +12,11 @@

 zed_exit_if_ignoring_this_event

-lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock"
+zed_lock "${ZED_DEBUG_LOG}"
+{
+	printenv | sort
+	echo
+} 1>&"${ZED_FLOCK_FD}"
+zed_unlock "${ZED_DEBUG_LOG}"

-umask 077
-zed_lock "${lockfile}"
-exec >> "${ZED_DEBUG_LOG}"
-
-printenv | sort
-echo
-
-exec >&-
-zed_unlock "${lockfile}"
 exit 0
@@ -42,6 +42,7 @@ fi
    msg="${msg} delay=$((ZEVENT_ZIO_DELAY / 1000000))ms"

 # list the bookmark data together
+# shellcheck disable=SC2153
 [ -n "${ZEVENT_ZIO_OBJSET}" ] && \
    msg="${msg} bookmark=${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}:${ZEVENT_ZIO_LEVEL}:${ZEVENT_ZIO_BLKID}"

@@ -25,7 +25,7 @@ zed_rate_limit "${rate_limit_tag}" || exit 3

 umask 077
 note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"
-note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+note_pathname="$(mktemp)"
 {
    echo "ZFS has detected a data error:"
    echo
@@ -31,7 +31,7 @@ umask 077
 pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}"
 host_str=" on $(hostname)"
 note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}"
-note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+note_pathname="$(mktemp)"
 {
    echo "ZFS has posted the following event:"
    echo
@@ -3,9 +3,8 @@
 # Track changes to enumerated pools for use in early-boot
 set -ef

-FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache"
-FSLIST_TMP="@runstatedir@/zfs-list.cache.new"
-FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}"
+FSLIST="@sysconfdir@/zfs/zfs-list.cache/${ZEVENT_POOL}"
+FSLIST_TMP="@runstatedir@/zfs-list.cache@${ZEVENT_POOL}"

 # If the pool specific cache file is not writeable, abort
 [ -w "${FSLIST}" ] || exit 0
@@ -14,20 +13,20 @@ FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}"
 . "${ZED_ZEDLET_DIR}/zed-functions.sh"

 [ "$ZEVENT_SUBCLASS" != "history_event" ] && exit 0
-zed_check_cmd "${ZFS}" sort diff grep
+zed_check_cmd "${ZFS}" sort diff

 # If we are acting on a snapshot, we have nothing to do
-printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0
+[ "${ZEVENT_HISTORY_DSNAME%@*}" = "${ZEVENT_HISTORY_DSNAME}" ] || exit 0

-# We obtain a lock on zfs-list to avoid any simultaneous writes.
+# We lock the output file to avoid simultaneous writes.
 # If we run into trouble, log and drop the lock
 abort_alter() {
-  zed_log_msg "Error updating zfs-list.cache!"
-  zed_unlock zfs-list
+  zed_log_msg "Error updating zfs-list.cache for ${ZEVENT_POOL}!"
+  zed_unlock "${FSLIST}"
 }

 finished() {
-  zed_unlock zfs-list
+  zed_unlock "${FSLIST}"
  trap - EXIT
  exit 0
 }
@@ -37,7 +36,7 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
      ;;

    export)
-        zed_lock zfs-list
+        zed_lock "${FSLIST}"
        trap abort_alter EXIT
        echo > "${FSLIST}"
        finished
@@ -63,7 +62,7 @@ case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
      ;;
 esac

-zed_lock zfs-list
+zed_lock "${FSLIST}"
 trap abort_alter EXIT

 PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\
@@ -79,7 +78,7 @@ PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\
 sort "${FSLIST_TMP}" -o "${FSLIST_TMP}"

 # Don't modify the file if it hasn't changed
-diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}"
+diff -q "${FSLIST_TMP}" "${FSLIST}" || cat "${FSLIST_TMP}" > "${FSLIST}"
 rm -f "${FSLIST_TMP}"

 finished
@@ -41,7 +41,7 @@ fi

 umask 077
 note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
-note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+note_pathname="$(mktemp)"
 {
    echo "ZFS has finished a ${action}:"
    echo
@@ -1,21 +1,21 @@
 #!/bin/sh
 #
-# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
+# Turn off/on vdevs' enclosure fault LEDs when their pool's state changes.
 #
-# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL.
-# Turn the LED off when it's back ONLINE again.
+# Turn a vdev's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL.
+# Turn its LED off when it's back ONLINE again.
 #
-# This script run in two basic modes:
+# This script runs in two basic modes:
 #
 # 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then
-# only set the LED for that particular VDEV. This is the case for statechange
+# only set the LED for that particular vdev. This is the case for statechange
 # events and some vdev_* events.
 #
-# 2. If those vars are not set, then check the state of all VDEVs in the pool
+# 2. If those vars are not set, then check the state of all vdevs in the pool
 # and set the LEDs accordingly.  This is the case for pool_import events.
 #
 # Note that this script requires that your enclosure be supported by the
-# Linux SCSI enclosure services (ses) driver.  The script will do nothing
+# Linux SCSI Enclosure Services (SES) driver.  The script will do nothing
 # if you have no enclosure, or if your enclosure isn't supported.
 #
 # Exit codes:
@@ -59,6 +59,10 @@ check_and_set_led()
 	file="$1"
 	val="$2"

+	if [ -z "$val" ]; then
+		return 0
+	fi
+
 	if [ ! -e "$file" ] ; then
 		return 3
 	fi
@@ -66,11 +70,11 @@ check_and_set_led()
 	# If another process is accessing the LED when we attempt to update it,
 	# the update will be lost so retry until the LED actually changes or we
 	# timeout.
-	for _ in $(seq 1 5); do
+	for _ in 1 2 3 4 5; do
 		# We want to check the current state first, since writing to the
 		# 'fault' entry always causes a SES command, even if the
 		# current state is already what you want.
-		current=$(cat "${file}")
+		read -r current < "${file}"

 		# On some enclosures if you write 1 to fault, and read it back,
 		# it will return 2.  Treat all non-zero values as 1 for
@@ -85,27 +89,29 @@ check_and_set_led()
 		else
 			break
 		fi
-        done
+	done
 }

 state_to_val()
 {
 	state="$1"
-	if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \
-	   [ "$state" = "UNAVAIL" ] ; then
-		echo 1
-	elif [ "$state" = "ONLINE" ] ; then
-		echo 0
-	fi
+	case "$state" in
+		FAULTED|DEGRADED|UNAVAIL)
+			echo 1
+			;;
+		ONLINE)
+			echo 0
+			;;
+	esac
 }

-# process_pool ([pool])
+# process_pool (pool)
 #
-# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to
-# the VDEV's state.
+# Iterate through a pool and set the vdevs' enclosure slot LEDs to
+# those vdevs' state.
 #
 # Arguments
-#   pool:	Optional pool name.  If not specified, iterate though all pools.
+#   pool:	Pool name.
 #
 # Return
 #  0 on success, 3 on missing sysfs path
@@ -113,19 +119,22 @@ state_to_val()
 process_pool()
 {
 	pool="$1"
+
+	# The output will be the vdevs only (from "grep '/dev/'"):
+	#
+	#    U45     ONLINE       0     0     0   /dev/sdk          0
+	#    U46     ONLINE       0     0     0   /dev/sdm          0
+	#    U47     ONLINE       0     0     0   /dev/sdn          0
+	#    U50     ONLINE       0     0     0  /dev/sdbn          0
+	#
+	ZPOOL_SCRIPTS_AS_ROOT=1 $ZPOOL status -c upath,fault_led "$pool" | grep '/dev/' | (
 	rc=0
-
-	# Lookup all the current LED values and paths in parallel
-	#shellcheck disable=SC2016
-	cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",'
-	out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=')
-
-	#shellcheck disable=SC2034
-	echo "$out" | while read -r vdev state read write chksum therest; do
+	while read -r vdev state _ _ _ therest; do
 		# Read out current LED value and path
-		tmp=$(echo "$therest" | sed 's/^.*led_token=//g')
-		vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}')
-		current_val=$(echo "$tmp" | awk -F ',' '{print $1}')
+		# Get dev name (like 'sda')
+		dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')")
+		vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*)
+		current_val=$(echo "$therest" | awk '{print $NF}')

 		if [ "$current_val" != "0" ] ; then
 			current_val=1
@@ -137,36 +146,27 @@ process_pool()
 		fi

 		if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
-			#shellcheck disable=SC2030
-			rc=1
+			rc=3
-			zed_log_msg "vdev $vdev '$file/fault' doesn't exist"
+			zed_log_msg "vdev $vdev '$vdev_enc_sysfs_path/fault' doesn't exist"
-			continue;
+			continue
 		fi

 		val=$(state_to_val "$state")

 		if [ "$current_val" = "$val" ] ; then
 			# LED is already set correctly
-			continue;
+			continue
 		fi

 		if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
-			rc=1
+			rc=3
 		fi
-
 	done
-
-	#shellcheck disable=SC2031
-	if [ "$rc" = "0" ] ; then
-		return 0
-	else
-		# We didn't see a sysfs entry that we wanted to set
-		return 3
-	fi
+	exit "$rc"; )
 }

 if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then
-	# Got a statechange for an individual VDEV
+	# Got a statechange for an individual vdev
 	val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
 	vdev=$(basename "$ZEVENT_VDEV_PATH")
 	check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
@@ -37,7 +37,7 @@ fi

 umask 077
 note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)"
-note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+note_pathname="$(mktemp)"
 {
    if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then
        echo "The number of I/O errors associated with a ZFS device exceeded"
@@ -19,7 +19,7 @@ zed_check_cmd "${ZPOOL}" || exit 9

 umask 077
 note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
-note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+note_pathname="$(mktemp)"
 {
    echo "ZFS has finished a trim:"
    echo
@@ -126,10 +126,8 @@ zed_lock()

    # Obtain a lock on the file bound to the given file descriptor.
    #
-    eval "exec ${fd}> '${lockfile}'"
-    err="$(flock --exclusive "${fd}" 2>&1)"
-    # shellcheck disable=SC2181
-    if [ $? -ne 0 ]; then
+    eval "exec ${fd}>> '${lockfile}'"
+    if ! err="$(flock --exclusive "${fd}" 2>&1)"; then
        zed_log_err "failed to lock \"${lockfile}\": ${err}"
    fi

@@ -165,9 +163,7 @@ zed_unlock()
    fi

    # Release the lock and close the file descriptor.
-    err="$(flock --unlock "${fd}" 2>&1)"
-    # shellcheck disable=SC2181
-    if [ $? -ne 0 ]; then
+    if ! err="$(flock --unlock "${fd}" 2>&1)"; then
        zed_log_err "failed to unlock \"${lockfile}\": ${err}"
    fi
    eval "exec ${fd}>&-"
@@ -267,7 +263,7 @@ zed_notify_email()
                -e "s/@SUBJECT@/${subject}/g")"

    # shellcheck disable=SC2086
-    eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1
+    eval ${ZED_EMAIL_PROG} ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1
    rv=$?
    if [ "${rv}" -ne 0 ]; then
        zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}"
@@ -367,7 +363,7 @@ zed_notify_pushbullet()
 #
 # Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>.
 # The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the
-# Slack channel. 
+# Slack channel.
 #
 # Requires awk, curl, and sed executables to be installed in the standard PATH.
 #
@@ -511,10 +507,8 @@ zed_guid_to_pool()
 		return
 	fi

-	guid=$(printf "%llu" "$1")
-	if [ -n "$guid" ] ; then
-		$ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}'
-	fi
+	guid="$(printf "%u" "$1")"
+	$ZPOOL get -H -ovalue,name guid | awk '$1 == '"$guid"' {print $2; exit}'
 }

 # zed_exit_if_ignoring_this_event
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -15,11 +15,6 @@
 #ifndef	ZED_H
 #define	ZED_H

-/*
- * Absolute path for the default zed configuration file.
- */
-#define	ZED_CONF_FILE		SYSCONFDIR "/zfs/zed.conf"
-
 /*
 * Absolute path for the default zed pid file.
 */
@@ -35,16 +30,6 @@
 */
 #define	ZED_ZEDLET_DIR		SYSCONFDIR "/zfs/zed.d"

-/*
- * Reserved for future use.
- */
-#define	ZED_MAX_EVENTS		0
-
-/*
- * Reserved for future use.
- */
-#define	ZED_MIN_EVENTS		0
-
 /*
 * String prefix for ZED variables passed via environment variables.
 */
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -32,43 +32,26 @@
 #include "zed_strings.h"

 /*
- * Return a new configuration with default values.
+ * Initialise the configuration with default values.
 */
-struct zed_conf *
-zed_conf_create(void)
+void
+zed_conf_init(struct zed_conf *zcp)
 {
-	struct zed_conf *zcp;
+	memset(zcp, 0, sizeof (*zcp));

-	zcp = calloc(1, sizeof (*zcp));
-	if (!zcp)
-		goto nomem;
+	/* zcp->zfs_hdl opened in zed_event_init() */
+	/* zcp->zedlets created in zed_conf_scan_dir() */

-	zcp->syslog_facility = LOG_DAEMON;
-	zcp->min_events = ZED_MIN_EVENTS;
-	zcp->max_events = ZED_MAX_EVENTS;
-	zcp->pid_fd = -1;
-	zcp->zedlets = NULL;		/* created via zed_conf_scan_dir() */
-	zcp->state_fd = -1;		/* opened via zed_conf_open_state() */
-	zcp->zfs_hdl = NULL;		/* opened via zed_event_init() */
-	zcp->zevent_fd = -1;		/* opened via zed_event_init() */
+	zcp->pid_fd = -1;		/* opened in zed_conf_write_pid() */
+	zcp->state_fd = -1;		/* opened in zed_conf_open_state() */
+	zcp->zevent_fd = -1;		/* opened in zed_event_init() */

-	if (!(zcp->conf_file = strdup(ZED_CONF_FILE)))
-		goto nomem;
+	zcp->max_jobs = 16;

-	if (!(zcp->pid_file = strdup(ZED_PID_FILE)))
-		goto nomem;
-
-	if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)))
-		goto nomem;
-
-	if (!(zcp->state_file = strdup(ZED_STATE_FILE)))
-		goto nomem;
-
-	return (zcp);
-
-nomem:
-	zed_log_die("Failed to create conf: %s", strerror(errno));
-	return (NULL);
+	if (!(zcp->pid_file = strdup(ZED_PID_FILE)) ||
+	    !(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)) ||
+	    !(zcp->state_file = strdup(ZED_STATE_FILE)))
+		zed_log_die("Failed to create conf: %s", strerror(errno));
 }

 /*
@@ -79,9 +62,6 @@ nomem:
 void
 zed_conf_destroy(struct zed_conf *zcp)
 {
-	if (!zcp)
-		return;
-
 	if (zcp->state_fd >= 0) {
 		if (close(zcp->state_fd) < 0)
 			zed_log_msg(LOG_WARNING,
@@ -102,10 +82,6 @@ zed_conf_destroy(struct zed_conf *zcp)
 			    zcp->pid_file, strerror(errno));
 		zcp->pid_fd = -1;
 	}
-	if (zcp->conf_file) {
-		free(zcp->conf_file);
-		zcp->conf_file = NULL;
-	}
 	if (zcp->pid_file) {
 		free(zcp->pid_file);
 		zcp->pid_file = NULL;
@@ -122,7 +98,6 @@ zed_conf_destroy(struct zed_conf *zcp)
 		zed_strings_destroy(zcp->zedlets);
 		zcp->zedlets = NULL;
 	}
-	free(zcp);
 }

 /*
@@ -132,46 +107,52 @@ zed_conf_destroy(struct zed_conf *zcp)
 * otherwise, output to stderr and exit with a failure status.
 */
 static void
-_zed_conf_display_help(const char *prog, int got_err)
+_zed_conf_display_help(const char *prog, boolean_t got_err)
 {
+	struct opt { const char *o, *d, *v; };
+
 	FILE *fp = got_err ? stderr : stdout;
-	int w1 = 4;			/* width of leading whitespace */
-	int w2 = 8;			/* width of L-justified option field */
+
+	struct opt *oo;
+	struct opt iopts[] = {
+		{ .o = "-h", .d = "Display help" },
+		{ .o = "-L", .d = "Display license information" },
+		{ .o = "-V", .d = "Display version information" },
+		{},
+	};
+	struct opt nopts[] = {
+		{ .o = "-v", .d = "Be verbose" },
+		{ .o = "-f", .d = "Force daemon to run" },
+		{ .o = "-F", .d = "Run daemon in the foreground" },
+		{ .o = "-I",
+		    .d = "Idle daemon until kernel module is (re)loaded" },
+		{ .o = "-M", .d = "Lock all pages in memory" },
+		{ .o = "-P", .d = "$PATH for ZED to use (only used by ZTS)" },
+		{ .o = "-Z", .d = "Zero state file" },
+		{},
+	};
+	struct opt vopts[] = {
+		{ .o = "-d DIR", .d = "Read enabled ZEDLETs from DIR.",
+		    .v = ZED_ZEDLET_DIR },
+		{ .o = "-p FILE", .d = "Write daemon's PID to FILE.",
+		    .v = ZED_PID_FILE },
+		{ .o = "-s FILE", .d = "Write daemon's state to FILE.",
+		    .v = ZED_STATE_FILE },
+		{ .o = "-j JOBS", .d = "Start at most JOBS at once.",
+		    .v = "16" },
+		{},
+	};

 	fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed"));
 	fprintf(fp, "\n");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h",
-	    "Display help.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L",
-	    "Display license information.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V",
-	    "Display version information.");
+	for (oo = iopts; oo->o; ++oo)
+		fprintf(fp, "    %*s %s\n", -8, oo->o, oo->d);
 	fprintf(fp, "\n");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v",
-	    "Be verbose.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f",
-	    "Force daemon to run.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F",
-	    "Run daemon in the foreground.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I",
-	    "Idle daemon until kernel module is (re)loaded.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M",
-	    "Lock all pages in memory.");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P",
-	    "$PATH for ZED to use (only used by ZTS).");
-	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z",
-	    "Zero state file.");
+	for (oo = nopts; oo->o; ++oo)
+		fprintf(fp, "    %*s %s\n", -8, oo->o, oo->d);
 	fprintf(fp, "\n");
-#if 0
-	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE",
-	    "Read configuration from FILE.", ZED_CONF_FILE);
-#endif
-	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR",
-	    "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR);
-	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE",
-	    "Write daemon's PID to FILE.", ZED_PID_FILE);
-	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE",
-	    "Write daemon's state to FILE.", ZED_STATE_FILE);
+	for (oo = vopts; oo->o; ++oo)
+		fprintf(fp, "    %*s %s [%s]\n", -8, oo->o, oo->d, oo->v);
 	fprintf(fp, "\n");

 	exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS);
@@ -183,20 +164,14 @@ _zed_conf_display_help(const char *prog, int got_err)
 static void
 _zed_conf_display_license(void)
 {
-	const char **pp;
-	const char *text[] = {
-	    "The ZFS Event Daemon (ZED) is distributed under the terms of the",
-	    "  Common Development and Distribution License (CDDL-1.0)",
-	    "  <http://opensource.org/licenses/CDDL-1.0>.",
-	    "",
+	printf(
+	    "The ZFS Event Daemon (ZED) is distributed under the terms of the\n"
+	    "  Common Development and Distribution License (CDDL-1.0)\n"
+	    "  <http://opensource.org/licenses/CDDL-1.0>.\n"
+	    "\n"
 	    "Developed at Lawrence Livermore National Laboratory"
-	    " (LLNL-CODE-403049).",
-	    "",
-	    NULL
-	};
-
-	for (pp = text; *pp; pp++)
-		printf("%s\n", *pp);
+	    " (LLNL-CODE-403049).\n"
+	    "\n");

 	exit(EXIT_SUCCESS);
 }
@@ -231,16 +206,19 @@ _zed_conf_parse_path(char **resultp, const char *path)

 	if (path[0] == '/') {
 		*resultp = strdup(path);
-	} else if (!getcwd(buf, sizeof (buf))) {
-		zed_log_die("Failed to get current working dir: %s",
-		    strerror(errno));
-	} else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) {
-		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
-	} else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) {
-		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
 	} else {
+		if (!getcwd(buf, sizeof (buf)))
+			zed_log_die("Failed to get current working dir: %s",
+			    strerror(errno));
+
+		if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf) ||
+		    strlcat(buf, path, sizeof (buf)) >= sizeof (buf))
+			zed_log_die("Failed to copy path: %s",
+			    strerror(ENAMETOOLONG));
+
 		*resultp = strdup(buf);
 	}
+
 	if (!*resultp)
 		zed_log_die("Failed to copy path: %s", strerror(ENOMEM));
 }
@@ -251,8 +229,9 @@ _zed_conf_parse_path(char **resultp, const char *path)
 void
 zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
 {
-	const char * const opts = ":hLVc:d:p:P:s:vfFMZI";
+	const char * const opts = ":hLVd:p:P:s:vfFMZIj:";
 	int opt;
+	unsigned long raw;

 	if (!zcp || !argv || !argv[0])
 		zed_log_die("Failed to parse options: Internal error");
@@ -262,7 +241,7 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
 	while ((opt = getopt(argc, argv, opts)) != -1) {
 		switch (opt) {
 		case 'h':
-			_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+			_zed_conf_display_help(argv[0], B_FALSE);
 			break;
 		case 'L':
 			_zed_conf_display_license();
@@ -270,9 +249,6 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
 		case 'V':
 			_zed_conf_display_version();
 			break;
-		case 'c':
-			_zed_conf_parse_path(&zcp->conf_file, optarg);
-			break;
 		case 'd':
 			_zed_conf_parse_path(&zcp->zedlet_dir, optarg);
 			break;
@@ -303,31 +279,30 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
 		case 'Z':
 			zcp->do_zero = 1;
 			break;
+		case 'j':
+			/* Accept 1..INT16_MAX only: max_jobs is an int16_t. */
+			errno = 0;
+			raw = strtoul(optarg, NULL, 0);
+			if (errno == ERANGE || raw > INT16_MAX) {
+				zed_log_die("%lu is too many jobs", raw);
+			} else if (raw == 0) {
+				zed_log_die("0 jobs makes no sense");
+			} else {
+				zcp->max_jobs = raw;
+			}
+			break;
 		case '?':
 		default:
 			if (optopt == '?')
-				_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+				_zed_conf_display_help(argv[0], B_FALSE);

-			fprintf(stderr, "%s: %s '-%c'\n\n", argv[0],
-			    "Invalid option", optopt);
-			_zed_conf_display_help(argv[0], EXIT_FAILURE);
+			fprintf(stderr, "%s: Invalid option '-%c'\n\n",
+			    argv[0], optopt);
+			_zed_conf_display_help(argv[0], B_TRUE);
 			break;
 		}
 	}
 }

-/*
- * Parse the configuration file into the configuration [zcp].
- *
- * FIXME: Not yet implemented.
- */
-void
-zed_conf_parse_file(struct zed_conf *zcp)
-{
-	if (!zcp)
-		zed_log_die("Failed to parse config: %s", strerror(EINVAL));
-}
-
 /*
 * Scan the [zcp] zedlet_dir for files to exec based on the event class.
 * Files must be executable by user, but not writable by group or other.
@@ -335,8 +310,6 @@ zed_conf_parse_file(struct zed_conf *zcp)
 *
 * Return 0 on success with an updated set of zedlets,
 * or -1 on error with errno set.
- *
- * FIXME: Check if zedlet_dir and all parent dirs are secure.
 */
 int
 zed_conf_scan_dir(struct zed_conf *zcp)
@@ -452,8 +425,6 @@ zed_conf_scan_dir(struct zed_conf *zcp)
 int
 zed_conf_write_pid(struct zed_conf *zcp)
 {
-	const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
-	const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
 	char buf[PATH_MAX];
 	int n;
 	char *p;
@@ -481,7 +452,7 @@ zed_conf_write_pid(struct zed_conf *zcp)
 	if (p)
 		*p = '\0';

-	if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) {
+	if ((mkdirp(buf, 0755) < 0) && (errno != EEXIST)) {
 		zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s",
 		    buf, strerror(errno));
 		goto err;
@@ -491,7 +462,7 @@ zed_conf_write_pid(struct zed_conf *zcp)
 	 */
 	mask = umask(0);
 	umask(mask | 022);
-	zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode);
+	zcp->pid_fd = open(zcp->pid_file, O_RDWR | O_CREAT | O_CLOEXEC, 0644);
 	umask(mask);
 	if (zcp->pid_fd < 0) {
 		zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s",
@@ -528,7 +499,7 @@ zed_conf_write_pid(struct zed_conf *zcp)
 		errno = ERANGE;
 		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
-	} else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) {
+	} else if (write(zcp->pid_fd, buf, n) != n) {
 		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
 		    zcp->pid_file, strerror(errno));
 	} else if (fdatasync(zcp->pid_fd) < 0) {
@@ -556,7 +527,6 @@ int
 zed_conf_open_state(struct zed_conf *zcp)
 {
 	char dirbuf[PATH_MAX];
-	mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
 	int n;
 	char *p;
 	int rv;
@@ -578,7 +548,7 @@ zed_conf_open_state(struct zed_conf *zcp)
 	if (p)
 		*p = '\0';

-	if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) {
+	if ((mkdirp(dirbuf, 0755) < 0) && (errno != EEXIST)) {
 		zed_log_msg(LOG_WARNING,
 		    "Failed to create directory \"%s\": %s",
 		    dirbuf, strerror(errno));
@@ -596,7 +566,7 @@ zed_conf_open_state(struct zed_conf *zcp)
 		(void) unlink(zcp->state_file);

 	zcp->state_fd = open(zcp->state_file,
-	    (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+	    O_RDWR | O_CREAT | O_CLOEXEC, 0644);
 	if (zcp->state_fd < 0) {
 		zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s",
 		    zcp->state_file, strerror(errno));
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -20,43 +20,39 @@
 #include "zed_strings.h"

 struct zed_conf {
-	unsigned	do_force:1;		/* true if force enabled */
-	unsigned	do_foreground:1;	/* true if run in foreground */
-	unsigned	do_memlock:1;		/* true if locking memory */
-	unsigned	do_verbose:1;		/* true if verbosity enabled */
-	unsigned	do_zero:1;		/* true if zeroing state */
-	unsigned	do_idle:1;		/* true if idle enabled */
-	int		syslog_facility;	/* syslog facility value */
-	int		min_events;		/* RESERVED FOR FUTURE USE */
-	int		max_events;		/* RESERVED FOR FUTURE USE */
-	char		*conf_file;		/* abs path to config file */
 	char		*pid_file;		/* abs path to pid file */
-	int		pid_fd;			/* fd to pid file for lock */
 	char		*zedlet_dir;		/* abs path to zedlet dir */
-	zed_strings_t	*zedlets;		/* names of enabled zedlets */
 	char		*state_file;		/* abs path to state file */
-	int		state_fd;		/* fd to state file */
+
 	libzfs_handle_t	*zfs_hdl;		/* handle to libzfs */
-	int		zevent_fd;		/* fd for access to zevents */
+	zed_strings_t	*zedlets;		/* names of enabled zedlets */
 	char		*path;		/* custom $PATH for zedlets to use */
+
+	int		pid_fd;			/* fd to pid file for lock */
+	int		state_fd;		/* fd to state file */
+	int		zevent_fd;		/* fd for access to zevents */
+
+	int16_t max_jobs;		/* max zedlets to run at one time */
+
+	boolean_t	do_force:1;		/* true if force enabled */
+	boolean_t	do_foreground:1;	/* true if run in foreground */
+	boolean_t	do_memlock:1;		/* true if locking memory */
+	boolean_t	do_verbose:1;		/* true if verbosity enabled */
+	boolean_t	do_zero:1;		/* true if zeroing state */
+	boolean_t	do_idle:1;		/* true if idle enabled */
 };

-struct zed_conf *zed_conf_create(void);
-
+void zed_conf_init(struct zed_conf *zcp);
 void zed_conf_destroy(struct zed_conf *zcp);

 void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv);

-void zed_conf_parse_file(struct zed_conf *zcp);
-
 int zed_conf_scan_dir(struct zed_conf *zcp);

 int zed_conf_write_pid(struct zed_conf *zcp);

 int zed_conf_open_state(struct zed_conf *zcp);
-
 int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]);
-
 int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]);

 #endif	/* !ZED_CONF_H */
@@ -379,6 +379,7 @@ zed_disk_event_init()
 		return (-1);
 	}

+	pthread_setname_np(g_mon_tid, "udev monitor");
 	zed_log_msg(LOG_INFO, "zed_disk_event_init");

 	return (0);
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -15,7 +15,7 @@
 #include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
-#include <libzfs.h>			/* FIXME: Replace with libzfs_core. */
+#include <libzfs_core.h>
 #include <paths.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -54,7 +54,7 @@ zed_event_init(struct zed_conf *zcp)
 		zed_log_die("Failed to initialize libzfs");
 	}

-	zcp->zevent_fd = open(ZFS_DEV, O_RDWR);
+	zcp->zevent_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC);
 	if (zcp->zevent_fd < 0) {
 		if (zcp->do_idle)
 			return (-1);
@@ -96,6 +96,47 @@ zed_event_fini(struct zed_conf *zcp)
 		libzfs_fini(zcp->zfs_hdl);
 		zcp->zfs_hdl = NULL;
 	}
+
+	zed_exec_fini();
+}
+
+/*
+ * Grow the kernel's zevent queue after events have been dropped.
+ *
+ * Reads the current limit from the zfs_zevent_len_max module parameter,
+ * doubles it (or resets it to 512 if the value read is not positive),
+ * caps the result at INT_MAX and writes it back.  A warning is logged only
+ * when a new value was actually written.
+ *
+ * This is best-effort: if the parameter cannot be opened, read, parsed or
+ * written, the function returns silently and the limit is left as it was.
+ */
+static void
+_bump_event_queue_length(void)
+{
+	int zzlm = -1, wr;
+	char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */
+	long int qlen, orig_qlen;
+
+	zzlm = open("/sys/module/zfs/parameters/zfs_zevent_len_max", O_RDWR);
+	if (zzlm < 0)
+		goto done;
+
+	if (read(zzlm, qlen_buf, sizeof (qlen_buf)) < 0)
+		goto done;
+	/* A full-length read leaves no terminator; force one. */
+	qlen_buf[sizeof (qlen_buf) - 1] = '\0';
+
+	errno = 0;
+	orig_qlen = qlen = strtol(qlen_buf, NULL, 10);
+	if (errno == ERANGE)
+		goto done;
+
+	if (qlen <= 0)
+		qlen = 512; /* default zfs_zevent_len_max value */
+	else if (qlen > INT_MAX / 2)
+		qlen = INT_MAX; /* clamp before doubling: no signed overflow */
+	else
+		qlen *= 2;
+
+	/* Already at the cap: nothing to write and nothing to report. */
+	if (qlen == orig_qlen)
+		goto done;
+
+	wr = snprintf(qlen_buf, sizeof (qlen_buf), "%ld", qlen);
+	if (wr < 0 || pwrite(zzlm, qlen_buf, wr, 0) != wr)
+		goto done;
+
+	zed_log_msg(LOG_WARNING, "Bumping queue length to %ld", qlen);
+
+done:
+	if (zzlm > -1)
+		(void) close(zzlm);
+}

 /*
@@ -136,10 +177,7 @@ zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[])

 		if (n_dropped > 0) {
 			zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
-			/*
-			 * FIXME: Increase max size of event nvlist in
-			 *   /sys/module/zfs/parameters/zfs_zevent_len_max ?
-			 */
+			_bump_event_queue_length();
 		}
 		if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
 			zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
@@ -211,7 +249,7 @@ _zed_event_value_is_hex(const char *name)
 *
 * All environment variables in [zsp] should be added through this function.
 */
-static int
+static __attribute__((format(printf, 5, 6))) int
 _zed_event_add_var(uint64_t eid, zed_strings_t *zsp,
    const char *prefix, const char *name, const char *fmt, ...)
 {
@@ -586,8 +624,6 @@ _zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp,
 * Convert the nvpair [nvp] to a string which is added to the environment
 * of the child process.
 * Return 0 on success, -1 on error.
- *
- * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()?
 */
 static void
 _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
@@ -686,23 +722,11 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
 		_zed_event_add_var(eid, zsp, prefix, name,
 		    "%llu", (u_longlong_t)i64);
 		break;
-	case DATA_TYPE_NVLIST:
-		_zed_event_add_var(eid, zsp, prefix, name,
-		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
-		break;
 	case DATA_TYPE_STRING:
 		(void) nvpair_value_string(nvp, &str);
 		_zed_event_add_var(eid, zsp, prefix, name,
 		    "%s", (str ? str : "<NULL>"));
 		break;
-	case DATA_TYPE_BOOLEAN_ARRAY:
-		_zed_event_add_var(eid, zsp, prefix, name,
-		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
-		break;
-	case DATA_TYPE_BYTE_ARRAY:
-		_zed_event_add_var(eid, zsp, prefix, name,
-		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
-		break;
 	case DATA_TYPE_INT8_ARRAY:
 		_zed_event_add_int8_array(eid, zsp, prefix, nvp);
 		break;
@@ -730,9 +754,11 @@ _zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
 	case DATA_TYPE_STRING_ARRAY:
 		_zed_event_add_string_array(eid, zsp, prefix, nvp);
 		break;
+	case DATA_TYPE_NVLIST:
+	case DATA_TYPE_BOOLEAN_ARRAY:
+	case DATA_TYPE_BYTE_ARRAY:
 	case DATA_TYPE_NVLIST_ARRAY:
-		_zed_event_add_var(eid, zsp, prefix, name,
-		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
+		_zed_event_add_var(eid, zsp, prefix, name, "_NOT_IMPLEMENTED_");
 		break;
 	default:
 		errno = EINVAL;
@@ -912,10 +938,7 @@ zed_event_service(struct zed_conf *zcp)

 	if (n_dropped > 0) {
 		zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
-		/*
-		 * FIXME: Increase max size of event nvlist in
-		 * /sys/module/zfs/parameters/zfs_zevent_len_max ?
-		 */
+		_bump_event_queue_length();
 	}
 	if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
 		zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
@@ -953,8 +976,7 @@ zed_event_service(struct zed_conf *zcp)

 		_zed_event_add_time_strings(eid, zsp, etime);

-		zed_exec_process(eid, class, subclass,
-		    zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);
+		zed_exec_process(eid, class, subclass, zcp, zsp);

 		zed_conf_write_state(zcp, eid, etime);

@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -18,17 +18,53 @@
 #include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stddef.h>
+#include <sys/avl.h>
+#include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <time.h>
 #include <unistd.h>
+#include <pthread.h>
 #include "zed_exec.h"
-#include "zed_file.h"
 #include "zed_log.h"
 #include "zed_strings.h"

 #define	ZEVENT_FILENO	3

+struct launched_process_node {
+	avl_node_t node;	/* linkage into _launched_processes */
+	pid_t pid;		/* child pid from fork(); the tree's sort key */
+	uint64_t eid;		/* event id the zedlet was launched for */
+	char *name;		/* strdup() of the zedlet name (may be NULL) */
+};
+
+/*
+ * AVL ordering callback for _launched_processes: compare two nodes by pid.
+ * Returns -1, 0 or 1 when [x1] sorts before, equal to, or after [x2].
+ */
+static int
+_launched_process_node_compare(const void *x1, const void *x2)
+{
+	const struct launched_process_node *lhs = x1;
+	const struct launched_process_node *rhs = x2;
+
+	assert(lhs != NULL);
+	assert(rhs != NULL);
+
+	if (lhs->pid < rhs->pid)
+		return (-1);
+	if (lhs->pid > rhs->pid)
+		return (1);
+	return (0);
+}
+
+static pthread_t _reap_children_tid = (pthread_t)-1;
+static volatile boolean_t _reap_children_stop;
+static avl_tree_t _launched_processes;
+static pthread_mutex_t _launched_processes_lock = PTHREAD_MUTEX_INITIALIZER;
+static int16_t _launched_processes_limit;
+
 /*
 * Create an environment string array for passing to execve() using the
 * NAME=VALUE strings in container [zsp].
@@ -79,20 +115,26 @@ _zed_exec_create_env(zed_strings_t *zsp)
 */
 static void
 _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
-    char *env[], int zfd)
+    char *env[], int zfd, boolean_t in_foreground)
 {
 	char path[PATH_MAX];
 	int n;
 	pid_t pid;
 	int fd;
-	pid_t wpid;
-	int status;
+	struct launched_process_node *node;
+	sigset_t mask;
+	struct timespec launch_timeout =
+		{ .tv_sec = 0, .tv_nsec = 200 * 1000 * 1000, };

 	assert(dir != NULL);
 	assert(prog != NULL);
 	assert(env != NULL);
 	assert(zfd >= 0);

+	while (__atomic_load_n(&_launched_processes_limit,
+	    __ATOMIC_SEQ_CST) <= 0)
+		(void) nanosleep(&launch_timeout, NULL);
+
 	n = snprintf(path, sizeof (path), "%s/%s", dir, prog);
 	if ((n < 0) || (n >= sizeof (path))) {
 		zed_log_msg(LOG_WARNING,
@@ -100,101 +142,179 @@ _zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
 		    prog, eid, strerror(ENAMETOOLONG));
 		return;
 	}
+	(void) pthread_mutex_lock(&_launched_processes_lock);
 	pid = fork();
 	if (pid < 0) {
+		(void) pthread_mutex_unlock(&_launched_processes_lock);
 		zed_log_msg(LOG_WARNING,
 		    "Failed to fork \"%s\" for eid=%llu: %s",
 		    prog, eid, strerror(errno));
 		return;
 	} else if (pid == 0) {
+		(void) sigemptyset(&mask);
+		(void) sigprocmask(SIG_SETMASK, &mask, NULL);
+
 		(void) umask(022);
-		if ((fd = open("/dev/null", O_RDWR)) != -1) {
+		if (in_foreground && /* we're already devnulled if daemonised */
+		    (fd = open("/dev/null", O_RDWR | O_CLOEXEC)) != -1) {
 			(void) dup2(fd, STDIN_FILENO);
 			(void) dup2(fd, STDOUT_FILENO);
 			(void) dup2(fd, STDERR_FILENO);
 		}
 		(void) dup2(zfd, ZEVENT_FILENO);
-		zed_file_close_from(ZEVENT_FILENO + 1);
 		execle(path, prog, NULL, env);
 		_exit(127);
 	}

 	/* parent process */

+	node = calloc(1, sizeof (*node));
+	if (node) {
+		node->pid = pid;
+		node->eid = eid;
+		node->name = strdup(prog);
+
+		avl_add(&_launched_processes, node);
+	}
+	(void) pthread_mutex_unlock(&_launched_processes_lock);
+
+	__atomic_sub_fetch(&_launched_processes_limit, 1, __ATOMIC_SEQ_CST);
 	zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d",
 	    prog, eid, pid);
+}

-	/* FIXME: Timeout rogue child processes with sigalarm? */
+static void
+_nop(int sig)
+{}	/* empty SIGCHLD handler: installed so that pause() gets interrupted */

-	/*
-	 * Wait for child process using WNOHANG to limit
-	 * the time spent waiting to 10 seconds (10,000ms).
-	 */
-	for (n = 0; n < 1000; n++) {
-		wpid = waitpid(pid, &status, WNOHANG);
-		if (wpid == (pid_t)-1) {
-			if (errno == EINTR)
-				continue;
-			zed_log_msg(LOG_WARNING,
-			    "Failed to wait for \"%s\" eid=%llu pid=%d",
-			    prog, eid, pid);
-			break;
-		} else if (wpid == 0) {
-			struct timespec t;
+/* Thread body: reap finished ZEDLETs, log them and free their job slots. */
+static void *
+_reap_children(void *arg)
+{
+	struct launched_process_node node, *pnode;
+	pid_t pid;
+	int status;
+	struct rusage usage;
+	struct sigaction sa = {};

-			/* child still running */
-			t.tv_sec = 0;
-			t.tv_nsec = 10000000;	/* 10ms */
-			(void) nanosleep(&t, NULL);
-			continue;
-		}
+	(void) sigfillset(&sa.sa_mask);
+	(void) sigdelset(&sa.sa_mask, SIGCHLD);
+	(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL);

-		if (WIFEXITED(status)) {
-			zed_log_msg(LOG_INFO,
-			    "Finished \"%s\" eid=%llu pid=%d exit=%d",
-			    prog, eid, pid, WEXITSTATUS(status));
-		} else if (WIFSIGNALED(status)) {
-			zed_log_msg(LOG_INFO,
-			    "Finished \"%s\" eid=%llu pid=%d sig=%d/%s",
-			    prog, eid, pid, WTERMSIG(status),
-			    strsignal(WTERMSIG(status)));
+	(void) sigemptyset(&sa.sa_mask);
+	sa.sa_handler = _nop;
+	sa.sa_flags = SA_NOCLDSTOP;
+	(void) sigaction(SIGCHLD, &sa, NULL);
+	/* Loop until zed_exec_fini() sets the stop flag and signals us. */
+	for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) {
+		(void) pthread_mutex_lock(&_launched_processes_lock);
+		pid = wait4(0, &status, WNOHANG, &usage);
+
+		if (pid == 0 || pid == (pid_t)-1) {
+			(void) pthread_mutex_unlock(&_launched_processes_lock);
+			if (pid == 0 || errno == ECHILD)
+				pause();
+			else if (errno != EINTR)
+				zed_log_msg(LOG_WARNING,
+				    "Failed to wait for children: %s",
+				    strerror(errno));
 		} else {
-			zed_log_msg(LOG_INFO,
-			    "Finished \"%s\" eid=%llu pid=%d status=0x%X",
-			    prog, eid, (unsigned int) status);
+			memset(&node, 0, sizeof (node));
+			node.pid = pid;
+			pnode = avl_find(&_launched_processes, &node, NULL);
+			if (pnode) {
+				memcpy(&node, pnode, sizeof (node));
+				avl_remove(&_launched_processes, pnode);
+				free(pnode);
+			}
+			(void) pthread_mutex_unlock(&_launched_processes_lock);
+			__atomic_add_fetch(&_launched_processes_limit, 1,
+			    __ATOMIC_SEQ_CST);
+			/* fold system time into user time for the log */
+			usage.ru_utime.tv_sec += usage.ru_stime.tv_sec;
+			usage.ru_utime.tv_usec += usage.ru_stime.tv_usec;
+			usage.ru_utime.tv_sec +=
+			    usage.ru_utime.tv_usec / (1000 * 1000);
+			usage.ru_utime.tv_usec %= 1000 * 1000;
+
+			if (WIFEXITED(status)) {
+				zed_log_msg(LOG_INFO,
+				    "Finished \"%s\" eid=%llu pid=%d "
+				    "time=%llu.%06us exit=%d",
+				    node.name, node.eid, pid,
+				    (unsigned long long) usage.ru_utime.tv_sec,
+				    (unsigned int) usage.ru_utime.tv_usec,
+				    WEXITSTATUS(status));
+			} else if (WIFSIGNALED(status)) {
+				zed_log_msg(LOG_INFO,
+				    "Finished \"%s\" eid=%llu pid=%d "
+				    "time=%llu.%06us sig=%d/%s",
+				    node.name, node.eid, pid,
+				    (unsigned long long) usage.ru_utime.tv_sec,
+				    (unsigned int) usage.ru_utime.tv_usec,
+				    WTERMSIG(status),
+				    strsignal(WTERMSIG(status)));
+			} else {
+				zed_log_msg(LOG_INFO,
+				    "Finished \"%s\" eid=%llu pid=%d "
+				    "time=%llu.%06us status=0x%X",
+				    node.name, node.eid, pid,
+				    (unsigned long long) usage.ru_utime.tv_sec,
+				    (unsigned int) usage.ru_utime.tv_usec,
+				    (unsigned int) status);
+			}
+			/* the name string was not freed along with pnode */
+			free(node.name);
 		}
-		break;
 	}

-	/*
-	 * kill child process after 10 seconds
-	 */
-	if (wpid == 0) {
-		zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d",
-		    prog, pid);
-		(void) kill(pid, SIGKILL);
-		(void) waitpid(pid, &status, 0);
+	return (NULL);
+}
+
+void
+zed_exec_fini(void)
+{
+	struct launched_process_node *node;
+	void *ck = NULL;
+
+	if (_reap_children_tid == (pthread_t)-1)
+		return;
+	/* Ask the reaper to stop; SIGCHLD interrupts its pause(). */
+	_reap_children_stop = B_TRUE;
+	(void) pthread_kill(_reap_children_tid, SIGCHLD);
+	(void) pthread_join(_reap_children_tid, NULL);
+	/* Drop the records of zedlets that were never reaped. */
+	while ((node = avl_destroy_nodes(&_launched_processes, &ck)) != NULL) {
+		free(node->name);
+		free(node);
 	}
+	avl_destroy(&_launched_processes);
+	/* Back to the initial state that zed_exec_process() tests for. */
+	(void) pthread_mutex_destroy(&_launched_processes_lock);
+	(void) pthread_mutex_init(&_launched_processes_lock, NULL);
+
+	_reap_children_tid = (pthread_t)-1;
 }

 /*
 * Process the event [eid] by synchronously invoking all zedlets with a
 * matching class prefix.
 *
- * Each executable in [zedlets] from the directory [dir] is matched against
- * the event's [class], [subclass], and the "all" class (which matches
- * all events).  Every zedlet with a matching class prefix is invoked.
+ * Each executable in [zcp->zedlets] from the directory [zcp->zedlet_dir]
+ * is matched against the event's [class], [subclass], and the "all" class
+ * (which matches all events).
+ * Every zedlet with a matching class prefix is invoked.
 * The NAME=VALUE strings in [envs] will be passed to the zedlet as
 * environment variables.
 *
- * The file descriptor [zfd] is the zevent_fd used to track the
+ * The file descriptor [zcp->zevent_fd] is the zevent_fd used to track the
 * current cursor location within the zevent nvlist.
 *
 * Return 0 on success, -1 on error.
 */
 int
 zed_exec_process(uint64_t eid, const char *class, const char *subclass,
-    const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd)
+    struct zed_conf *zcp, zed_strings_t *envs)
 {
 	const char *class_strings[4];
 	const char *allclass = "all";
@@ -203,9 +323,22 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass,
 	char **e;
 	int n;

-	if (!dir || !zedlets || !envs || zfd < 0)
+	if (!zcp->zedlet_dir || !zcp->zedlets || !envs || zcp->zevent_fd < 0)
 		return (-1);

+	if (_reap_children_tid == (pthread_t)-1) {
+		_launched_processes_limit = zcp->max_jobs;
+
+		if (pthread_create(&_reap_children_tid, NULL,
+		    _reap_children, NULL) != 0)
+			return (-1);
+		pthread_setname_np(_reap_children_tid, "reap ZEDLETs");
+
+		avl_create(&_launched_processes, _launched_process_node_compare,
+		    sizeof (struct launched_process_node),
+		    offsetof(struct launched_process_node, node));
+	}
+
 	csp = class_strings;

 	if (class)
@@ -221,11 +354,13 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass,

 	e = _zed_exec_create_env(envs);

-	for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) {
+	for (z = zed_strings_first(zcp->zedlets); z;
+	    z = zed_strings_next(zcp->zedlets)) {
 		for (csp = class_strings; *csp; csp++) {
 			n = strlen(*csp);
 			if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n]))
-				_zed_exec_fork_child(eid, dir, z, e, zfd);
+				_zed_exec_fork_child(eid, zcp->zedlet_dir,
+				    z, e, zcp->zevent_fd, zcp->do_foreground);
 		}
 	}
 	free(e);
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -17,9 +17,11 @@

 #include <stdint.h>
 #include "zed_strings.h"
+#include "zed_conf.h"
+
+void zed_exec_fini(void);

 int zed_exec_process(uint64_t eid, const char *class, const char *subclass,
-    const char *dir, zed_strings_t *zedlets, zed_strings_t *envs,
-    int zevent_fd);
+    struct zed_conf *zcp, zed_strings_t *envs);

 #endif	/* !ZED_EXEC_H */
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -12,73 +12,17 @@
 * You may not use this file except in compliance with the license.
 */

+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <limits.h>
 #include <string.h>
-#include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include "zed_file.h"
 #include "zed_log.h"

-/*
- * Read up to [n] bytes from [fd] into [buf].
- * Return the number of bytes read, 0 on EOF, or -1 on error.
- */
-ssize_t
-zed_file_read_n(int fd, void *buf, size_t n)
-{
-	unsigned char *p;
-	size_t n_left;
-	ssize_t n_read;
-
-	p = buf;
-	n_left = n;
-	while (n_left > 0) {
-		if ((n_read = read(fd, p, n_left)) < 0) {
-			if (errno == EINTR)
-				continue;
-			else
-				return (-1);
-
-		} else if (n_read == 0) {
-			break;
-		}
-		n_left -= n_read;
-		p += n_read;
-	}
-	return (n - n_left);
-}
-
-/*
- * Write [n] bytes from [buf] out to [fd].
- * Return the number of bytes written, or -1 on error.
- */
-ssize_t
-zed_file_write_n(int fd, void *buf, size_t n)
-{
-	const unsigned char *p;
-	size_t n_left;
-	ssize_t n_written;
-
-	p = buf;
-	n_left = n;
-	while (n_left > 0) {
-		if ((n_written = write(fd, p, n_left)) < 0) {
-			if (errno == EINTR)
-				continue;
-			else
-				return (-1);
-
-		}
-		n_left -= n_written;
-		p += n_written;
-	}
-	return (n);
-}
-
 /*
 * Set an exclusive advisory lock on the open file descriptor [fd].
 * Return 0 on success, 1 if a conflicting lock is held by another process,
@@ -160,6 +104,13 @@ zed_file_is_locked(int fd)
 	return (lock.l_pid);
 }

+
+#if __APPLE__
+#define	PROC_SELF_FD "/dev/fd"
+#else /* Linux-compatible layout */
+#define	PROC_SELF_FD "/proc/self/fd"
+#endif
+
 /*
 * Close all open file descriptors greater than or equal to [lowfd].
 * Any errors encountered while closing file descriptors are ignored.
@@ -167,51 +118,24 @@ zed_file_is_locked(int fd)
 void
 zed_file_close_from(int lowfd)
 {
-	const int maxfd_def = 256;
-	int errno_bak;
-	struct rlimit rl;
-	int maxfd;
+	int errno_bak = errno;
+	int maxfd = 0;		/* one past the highest fd to close */
 	int fd;
+	DIR *fddir;
+	struct dirent *fdent;

-	errno_bak = errno;
-
-	if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
-		maxfd = maxfd_def;
-	} else if (rl.rlim_max == RLIM_INFINITY) {
-		maxfd = maxfd_def;
+	if ((fddir = opendir(PROC_SELF_FD)) != NULL) {
+		while ((fdent = readdir(fddir)) != NULL) {
+			fd = atoi(fdent->d_name);
+			if (fd >= maxfd && fd != dirfd(fddir))
+				maxfd = fd + 1;	/* loop bound is exclusive */
+		}
+		(void) closedir(fddir);
 	} else {
-		maxfd = rl.rlim_max;
+		maxfd = sysconf(_SC_OPEN_MAX);
 	}
 	for (fd = lowfd; fd < maxfd; fd++)
 		(void) close(fd);

 	errno = errno_bak;
 }
-
-/*
- * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically
- * closed upon successful execution of one of the exec functions.
- * Return 0 on success, or -1 on error.
- *
- * FIXME: No longer needed?
- */
-int
-zed_file_close_on_exec(int fd)
-{
-	int flags;
-
-	if (fd < 0) {
-		errno = EBADF;
-		return (-1);
-	}
-	flags = fcntl(fd, F_GETFD);
-	if (flags == -1)
-		return (-1);
-
-	flags |= FD_CLOEXEC;
-
-	if (fcntl(fd, F_SETFD, flags) == -1)
-		return (-1);
-
-	return (0);
-}
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -18,10 +18,6 @@
 #include <sys/types.h>
 #include <unistd.h>

-ssize_t zed_file_read_n(int fd, void *buf, size_t n);
-
-ssize_t zed_file_write_n(int fd, void *buf, size_t n);
-
 int zed_file_lock(int fd);

 int zed_file_unlock(int fd);
@@ -30,6 +26,4 @@ pid_t zed_file_is_locked(int fd);

 void zed_file_close_from(int fd);

-int zed_file_close_on_exec(int fd);
-
 #endif	/* !ZED_FILE_H */
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -3,7 +3,7 @@
 *
 * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
 * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
- * Refer to the ZoL git commit log for authoritative copyright attribution.
+ * Refer to the OpenZFS git commit log for authoritative copyright attribution.
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
@@ -53,7 +53,6 @@
 #include <grp.h>
 #include <pwd.h>
 #include <signal.h>
-#include <sys/debug.h>
 #include <sys/list.h>
 #include <sys/mkdev.h>
 #include <sys/mntent.h>
@@ -71,7 +70,6 @@
 #include <zfs_prop.h>
 #include <zfs_deleg.h>
 #include <libzutil.h>
-#include <libuutil.h>
 #ifdef HAVE_IDMAP
 #include <aclutils.h>
 #include <directory.h>
@@ -270,7 +268,7 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tclone [-p] [-o property=value] ... "
 		    "<snapshot> <filesystem|volume>\n"));
 	case HELP_CREATE:
-		return (gettext("\tcreate [-Pnpv] [-o property=value] ... "
+		return (gettext("\tcreate [-Pnpuv] [-o property=value] ... "
 		    "<filesystem>\n"
 		    "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... "
 		    "-V <size> <volume>\n"));
@@ -730,6 +728,32 @@ finish_progress(char *done)
 	pt_header = NULL;
 }

+/* Return B_TRUE if fd is /dev/null or /dev/zero; B_FALSE if fstat() fails. */
+#ifdef __linux__
+static boolean_t
+is_dev_nullzero(int fd)
+{
+	struct stat st;
+	return (fstat(fd, &st) == 0 && S_ISCHR(st.st_mode) &&
+	    major(st.st_rdev) == 1 && (minor(st.st_rdev) == 3 /* null */ ||
+	    minor(st.st_rdev) == 5 /* zero */));
+}
+#endif
+
+/* Linux only: explain an EINVAL from writing to /dev/null or /dev/zero. */
+static void
+note_dev_error(int err, int fd)
+{
+#ifdef __linux__
+	if (err != EINVAL || !is_dev_nullzero(fd))
+		return;
+	(void) fprintf(stderr,
+	    gettext("Error: Writing directly to /dev/{null,zero} files"
+	    " on certain kernels is not currently implemented.\n"
+	    "(As a workaround, try \"zfs send [...] | cat > /dev/null\")\n"));
+#endif
+}
+
 static int
 zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type)
 {
@@ -892,6 +916,107 @@ usage:
 	return (-1);
 }

+/*
+ * Return the volblocksize for a new volume.  A value already in [props] is
+ * kept, warning on stderr if it wastes space; otherwise one using more than
+ * half of the pool's minimum allocation size is chosen and added to [props].
+ */
+static uint64_t
+default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+{
+	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+	nvlist_t *tree, **vdevs;
+	uint_t nvdevs;
+
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
+	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
+	    &vdevs, &nvdevs) != 0) {
+		return (ZVOL_DEFAULT_BLOCKSIZE);
+	}
+
+	for (int i = 0; i < nvdevs; i++) {
+		nvlist_t *nv = vdevs[i];
+		uint64_t ashift, ndata, nparity;
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
+			continue;
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA,
+		    &ndata) == 0) {
+			/* dRAID minimum allocation width */
+			asize = MAX(asize, ndata * (1ULL << ashift));
+		} else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+		    &nparity) == 0) {
+			/* raidz minimum allocation width */
+			if (nparity == 1)
+				asize = MAX(asize, 2 * (1ULL << ashift));
+			else
+				asize = MAX(asize, 4 * (1ULL << ashift));
+		} else {
+			/* mirror or (non-redundant) leaf vdev */
+			asize = MAX(asize, 1ULL << ashift);
+		}
+	}
+
+	/*
+	 * Calculate the target volblocksize such that more than half
+	 * of the asize is used. The following table is for 4k sectors.
+	 *
+	 * n   asize   blksz  used  |   n   asize   blksz  used
+	 * -------------------------+---------------------------------
+	 * 1   4,096   8,192  100%  |   9  36,864  32,768   88%
+	 * 2   8,192   8,192  100%  |  10  40,960  32,768   80%
+	 * 3  12,288   8,192   66%  |  11  45,056  32,768   72%
+	 * 4  16,384  16,384  100%  |  12  49,152  32,768   66%
+	 * 5  20,480  16,384   80%  |  13  53,248  32,768   61%
+	 * 6  24,576  16,384   66%  |  14  57,344  32,768   57%
+	 * 7  28,672  16,384   57%  |  15  61,440  32,768   53%
+	 * 8  32,768  32,768  100%  |  16  65,536  65,536  100%
+	 *
+	 * This is primarily a concern for dRAID which always allocates
+	 * a full stripe width.  For dRAID the default stripe width is
+	 * n=8 in which case the volblocksize is set to 32k. Ignoring
+	 * compression there are no unused sectors.  This same reasoning
+	 * applies to raidz[2,3] so target 4 sectors to minimize waste.
+	 */
+	uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+	while (tgt_volblocksize * 2 <= asize)
+		tgt_volblocksize *= 2;
+
+	const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
+	if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) {
+
+		/* Issue a warning when a non-optimal size is requested. */
+		if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) {
+			(void) fprintf(stderr, gettext("Warning: "
+			    "volblocksize (%llu) is less than the default "
+			    "minimum block size (%llu).\nTo reduce wasted "
+			    "space a volblocksize of %llu is recommended.\n"),
+			    (u_longlong_t)volblocksize,
+			    (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE,
+			    (u_longlong_t)tgt_volblocksize);
+		} else if (volblocksize < tgt_volblocksize) {
+			(void) fprintf(stderr, gettext("Warning: "
+			    "volblocksize (%llu) is much less than the "
+			    "minimum allocation\nunit (%llu), which wastes "
+			    "at least %llu%% of space. To reduce wasted "
+			    "space,\nuse a larger volblocksize (%llu is "
+			    "recommended), fewer dRAID data disks\n"
+			    "per group, or smaller sector size (ashift).\n"),
+			    (u_longlong_t)volblocksize, (u_longlong_t)asize,
+			    (u_longlong_t)((100 * (asize - volblocksize)) /
+			    asize), (u_longlong_t)tgt_volblocksize);
+		}
+	} else {
+		volblocksize = tgt_volblocksize;
+		fnvlist_add_uint64(props, prop, volblocksize);
+	}
+
+	return (volblocksize);
+}
+
 /*
 * zfs create [-Pnpv] [-o prop=value] ... fs
 * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
@@ -911,6 +1036,8 @@ usage:
 * check of arguments and properties, but does not check for permissions,
 * available space, etc.
 *
+ * The '-u' flag prevents the newly created file system from being mounted.
+ *
 * The '-v' flag is for verbose output.
 *
 * The '-P' flag is used for parseable output.  It implies '-v'.
@@ -927,17 +1054,19 @@ zfs_do_create(int argc, char **argv)
 	boolean_t bflag = B_FALSE;
 	boolean_t parents = B_FALSE;
 	boolean_t dryrun = B_FALSE;
+	boolean_t nomount = B_FALSE;
 	boolean_t verbose = B_FALSE;
 	boolean_t parseable = B_FALSE;
 	int ret = 1;
 	nvlist_t *props;
 	uint64_t intval;
+	char *strval;

 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
 		nomem();

 	/* check options */
-	while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) {
+	while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) {
 		switch (c) {
 		case 'V':
 			type = ZFS_TYPE_VOLUME;
@@ -984,6 +1113,9 @@ zfs_do_create(int argc, char **argv)
 		case 's':
 			noreserve = B_TRUE;
 			break;
+		case 'u':
+			nomount = B_TRUE;
+			break;
 		case 'v':
 			verbose = B_TRUE;
 			break;
@@ -1003,6 +1135,11 @@ zfs_do_create(int argc, char **argv)
 		    "used when creating a volume\n"));
 		goto badusage;
 	}
+	if (nomount && type != ZFS_TYPE_FILESYSTEM) {
+		(void) fprintf(stderr, gettext("'-u' can only be "
+		    "used when creating a filesystem\n"));
+		goto badusage;
+	}

 	argc -= optind;
 	argv += optind;
@@ -1018,7 +1155,7 @@ zfs_do_create(int argc, char **argv)
 		goto badusage;
 	}

-	if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
+	if (dryrun || type == ZFS_TYPE_VOLUME) {
 		char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
 		char *p;

@@ -1040,18 +1177,24 @@ zfs_do_create(int argc, char **argv)
 		}
 	}

-	/*
-	 * if volsize is not a multiple of volblocksize, round it up to the
-	 * nearest multiple of the volblocksize
-	 */
 	if (type == ZFS_TYPE_VOLUME) {
-		uint64_t volblocksize;
+		const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
+		uint64_t volblocksize = default_volblocksize(zpool_handle,
+		    real_props);

-		if (nvlist_lookup_uint64(props,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-		    &volblocksize) != 0)
-			volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+		if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE &&
+		    nvlist_lookup_string(props, prop, &strval) != 0) {
+			if (asprintf(&strval, "%llu",
+			    (u_longlong_t)volblocksize) == -1)
+				nomem();
+			nvlist_add_string(props, prop, strval);
+			free(strval);
+		}

+		/*
+		 * If volsize is not a multiple of volblocksize, round it
+		 * up to the nearest multiple of the volblocksize.
+		 */
 		if (volsize % volblocksize) {
 			volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
 			    uint64_t);
@@ -1064,11 +1207,9 @@ zfs_do_create(int argc, char **argv)
 		}
 	}

-
 	if (type == ZFS_TYPE_VOLUME && !noreserve) {
 		uint64_t spa_version;
 		zfs_prop_t resv_prop;
-		char *strval;

 		spa_version = zpool_get_prop_int(zpool_handle,
 		    ZPOOL_PROP_VERSION, NULL);
@@ -1159,6 +1300,11 @@ zfs_do_create(int argc, char **argv)
 		log_history = B_FALSE;
 	}

+	if (nomount) {
+		ret = 0;
+		goto error;
+	}
+
 	ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
 error:
 	nvlist_free(props);
@@ -4256,6 +4402,7 @@ zfs_do_send(int argc, char **argv)

 	struct option long_options[] = {
 		{"replicate",	no_argument,		NULL, 'R'},
+		{"skip-missing",	no_argument,		NULL, 's'},
 		{"redact",	required_argument,	NULL, 'd'},
 		{"props",	no_argument,		NULL, 'p'},
 		{"parsable",	no_argument,		NULL, 'P'},
@@ -4274,7 +4421,7 @@ zfs_do_send(int argc, char **argv)
 	};

 	/* check options */
-	while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S",
+	while ((c = getopt_long(argc, argv, ":i:I:RsDpvnPLeht:cwbd:S",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'i':
@@ -4291,6 +4438,9 @@ zfs_do_send(int argc, char **argv)
 		case 'R':
 			flags.replicate = B_TRUE;
 			break;
+		case 's':
+			flags.skipmissing = B_TRUE;
+			break;
 		case 'd':
 			redactbook = optarg;
 			break;
@@ -4448,11 +4598,23 @@ zfs_do_send(int argc, char **argv)

 		err = zfs_send_saved(zhp, &flags, STDOUT_FILENO,
 		    resume_token);
+		if (err != 0)
+			note_dev_error(errno, STDOUT_FILENO);
 		zfs_close(zhp);
 		return (err != 0);
 	} else if (resume_token != NULL) {
-		return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
-		    resume_token));
+		err = zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+		    resume_token);
+		if (err != 0)
+			note_dev_error(errno, STDOUT_FILENO);
+		return (err);
+	}
+
+	if (flags.skipmissing && !flags.replicate) {
+		(void) fprintf(stderr,
+		    gettext("skip-missing flag can only be used in "
+		    "conjunction with replicate\n"));
+		usage(B_FALSE);
 	}

 	/*
@@ -4496,6 +4658,8 @@ zfs_do_send(int argc, char **argv)
 		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags,
 		    redactbook);
 		zfs_close(zhp);
+		if (err != 0)
+			note_dev_error(errno, STDOUT_FILENO);
 		return (err != 0);
 	}

@@ -4572,6 +4736,7 @@ zfs_do_send(int argc, char **argv)
 		nvlist_free(dbgnv);
 	}
 	zfs_close(zhp);
+	note_dev_error(errno, STDOUT_FILENO);

 	return (err != 0);
 }
@@ -35,7 +35,7 @@ libzfs_handle_t *g_zfs;
 static void
 usage(int err)
 {
-	fprintf(stderr, "Usage: [-v] zfs_ids_to_path <pool> <objset id> "
+	fprintf(stderr, "Usage: zfs_ids_to_path [-v] <pool> <objset id> "
 	    "<object id>\n");
 	exit(err);
 }
@@ -63,11 +63,11 @@ main(int argc, char **argv)

 	uint64_t objset, object;
 	if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) {
-		(void) fprintf(stderr, "Invalid objset id: %s\n", argv[2]);
+		(void) fprintf(stderr, "Invalid objset id: %s\n", argv[1]);
 		usage(2);
 	}
 	if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) {
-		(void) fprintf(stderr, "Invalid object id: %s\n", argv[3]);
+		(void) fprintf(stderr, "Invalid object id: %s\n", argv[2]);
 		usage(3);
 	}
 	if ((g_zfs = libzfs_init()) == NULL) {
@@ -76,7 +76,7 @@ main(int argc, char **argv)
 	}
 	zpool_handle_t *pool = zpool_open(g_zfs, argv[0]);
 	if (pool == NULL) {
-		fprintf(stderr, "Could not open pool %s\n", argv[1]);
+		fprintf(stderr, "Could not open pool %s\n", argv[0]);
 		libzfs_fini(g_zfs);
 		return (5);
 	}
@@ -36,8 +36,6 @@
 #include <time.h>
 #include <unistd.h>

-static void usage(void);
-
 static void
 usage(void)
 {
@@ -60,12 +58,11 @@ int
 main(int argc, char **argv)
 {
 	/* default file path, can be optionally set by user */
-	char path[PATH_MAX] = "/etc/hostid";
+	const char *path = "/etc/hostid";
 	/* holds converted user input or lrand48() generated value */
 	unsigned long input_i = 0;

 	int opt;
-	int pathlen;
 	int force_fwrite = 0;
 	while ((opt = getopt_long(argc, argv, "fo:h?", 0, 0)) != -1) {
 		switch (opt) {
@@ -73,14 +70,7 @@ main(int argc, char **argv)
 			force_fwrite = 1;
 			break;
 		case 'o':
-			pathlen = snprintf(path, sizeof (path), "%s", optarg);
-			if (pathlen >= sizeof (path)) {
-				fprintf(stderr, "%s\n", strerror(EOVERFLOW));
-				exit(EXIT_FAILURE);
-			} else if (pathlen < 1) {
-				fprintf(stderr, "%s\n", strerror(EINVAL));
-				exit(EXIT_FAILURE);
-			}
+			path = optarg;
 			break;
 		case 'h':
 		case '?':
@@ -118,7 +108,7 @@ main(int argc, char **argv)
 	if (force_fwrite == 0 && stat(path, &fstat) == 0 &&
 	    S_ISREG(fstat.st_mode)) {
 		fprintf(stderr, "%s: %s\n", path, strerror(EEXIST));
-			exit(EXIT_FAILURE);
+		exit(EXIT_FAILURE);
 	}

 	/*
@@ -137,7 +127,7 @@ main(int argc, char **argv)
 	}

 	/*
-	 * we need just 4 bytes in native endianess
+	 * we need just 4 bytes in native endianness
 	 * not using sethostid() because it may be missing or just a stub
 	 */
 	uint32_t hostid = input_i;
@@ -1,4 +1,5 @@
 include $(top_srcdir)/config/Rules.am
+include $(top_srcdir)/config/Shellcheck.am

 AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS)

@@ -39,7 +40,7 @@ include $(top_srcdir)/config/CppCheck.am
 zpoolconfdir = $(sysconfdir)/zfs/zpool.d
 zpoolexecdir = $(zfsexecdir)/zpool.d

-EXTRA_DIST = zpool.d/README
+EXTRA_DIST = zpool.d/README compatibility.d

 dist_zpoolexec_SCRIPTS = \
 	zpool.d/dm-deps \
@@ -129,6 +130,52 @@ zpoolconfdefaults = \
 	test_progress \
 	test_ended

+zpoolcompatdir = $(pkgdatadir)/compatibility.d
+
+dist_zpoolcompat_DATA = \
+	compatibility.d/compat-2018 \
+	compatibility.d/compat-2019 \
+	compatibility.d/compat-2020 \
+	compatibility.d/compat-2021 \
+	compatibility.d/freebsd-11.0 \
+	compatibility.d/freebsd-11.2 \
+	compatibility.d/freebsd-11.3 \
+	compatibility.d/freenas-9.10.2 \
+	compatibility.d/grub2 \
+	compatibility.d/openzfsonosx-1.7.0 \
+	compatibility.d/openzfsonosx-1.8.1 \
+	compatibility.d/openzfsonosx-1.9.3 \
+	compatibility.d/openzfs-2.0-freebsd \
+	compatibility.d/openzfs-2.0-linux \
+	compatibility.d/openzfs-2.1-freebsd \
+	compatibility.d/openzfs-2.1-linux \
+	compatibility.d/zol-0.6.1 \
+	compatibility.d/zol-0.6.4 \
+	compatibility.d/zol-0.6.5 \
+	compatibility.d/zol-0.7 \
+	compatibility.d/zol-0.8
+
+# canonical <- alias symbolic link pairs
+# eg: "2018" is a link to "compat-2018"
+zpoolcompatlinks = \
+	"compat-2018		2018" \
+	"compat-2019		2019" \
+	"compat-2020		2020" \
+	"compat-2021		2021" \
+	"freebsd-11.0		freebsd-11.1" \
+	"freebsd-11.0		freenas-11.0" \
+	"freebsd-11.2		freenas-11.2" \
+	"freebsd-11.3		freebsd-11.4" \
+	"freebsd-11.3		freebsd-12.0" \
+	"freebsd-11.3		freebsd-12.1" \
+	"freebsd-11.3		freebsd-12.2" \
+	"freebsd-11.3		freenas-11.3" \
+	"freenas-11.0		freenas-11.1" \
+	"openzfsonosx-1.9.3	openzfsonosx-1.9.4" \
+	"openzfs-2.0-freebsd	truenas-12.0" \
+	"zol-0.7		ubuntu-18.04" \
+	"zol-0.8		ubuntu-20.04"
+
 install-data-hook:
 	$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
 	for f in $(zpoolconfdefaults); do \
@@ -136,3 +183,6 @@ install-data-hook:
 	       -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \
 	    ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \
 	done
+	for l in $(zpoolcompatlinks); do \
+		(cd "$(DESTDIR)$(zpoolcompatdir)" && ln -sf $${l}); \
+	done
@@ -0,0 +1,12 @@
+# Features supported by all Tier 1 platforms as of 2018
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+spacemap_histogram
@@ -0,0 +1,15 @@
+# Features supported by all Tier 1 platforms as of 2019
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+sha512
+skein
+spacemap_histogram
@@ -0,0 +1,15 @@
+# Features supported by all Tier 1 platforms as of 2020
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+sha512
+skein
+spacemap_histogram
@@ -0,0 +1,19 @@
+# Features supported by all Tier 1 platforms as of 2021
+async_destroy
+bookmarks
+device_removal
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+zpool_checkpoint
@@ -0,0 +1,15 @@
+# Features supported by FreeBSD 11.0
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+sha512
+skein
+spacemap_histogram
@@ -0,0 +1,18 @@
+# Features supported by FreeBSD 11.2
+async_destroy
+bookmarks
+device_removal
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+sha512
+skein
+spacemap_histogram
+zpool_checkpoint
@@ -0,0 +1,19 @@
+# Features supported by FreeBSD 11.3
+async_destroy
+bookmarks
+device_removal
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+zpool_checkpoint
@@ -0,0 +1,13 @@
+# Features supported by FreeNAS 9.10.2
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+spacemap_histogram
@@ -0,0 +1,12 @@
+# Features which are supported by GRUB2
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+spacemap_histogram
@@ -0,0 +1,33 @@
+# Features supported by OpenZFS 2.0 on FreeBSD
+allocation_classes
+async_destroy
+bookmark_v2
+bookmark_written
+bookmarks
+device_rebuild
+device_removal
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+livelist
+log_spacemap
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+redacted_datasets
+redaction_bookmarks
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
+zstd_compress
@@ -0,0 +1,34 @@
+# Features supported by OpenZFS 2.0 on Linux
+allocation_classes
+async_destroy
+bookmark_v2
+bookmark_written
+bookmarks
+device_rebuild
+device_removal
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+livelist
+log_spacemap
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+redacted_datasets
+redaction_bookmarks
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
+zstd_compress
@@ -0,0 +1,34 @@
+# Features supported by OpenZFS 2.1 on FreeBSD
+allocation_classes
+async_destroy
+bookmark_v2
+bookmark_written
+bookmarks
+device_rebuild
+device_removal
+draid
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+livelist
+log_spacemap
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+redacted_datasets
+redaction_bookmarks
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
+zstd_compress
@@ -0,0 +1,35 @@
+# Features supported by OpenZFS 2.1 on Linux
+allocation_classes
+async_destroy
+bookmark_v2
+bookmark_written
+bookmarks
+device_rebuild
+device_removal
+draid
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+livelist
+log_spacemap
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+redacted_datasets
+redaction_bookmarks
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
+zstd_compress
@@ -0,0 +1,16 @@
+# Features supported by OpenZFSonOSX 1.7.0
+async_destroy
+bookmarks
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+sha512
+skein
+spacemap_histogram
@@ -0,0 +1,21 @@
+# Features supported by OpenZFSonOSX 1.8.1
+async_destroy
+bookmarks
+device_removal
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+zpool_checkpoint
@@ -0,0 +1,27 @@
+# Features supported by OpenZFSonOSX 1.9.3
+allocation_classes
+async_destroy
+bookmark_v2
+bookmarks
+device_removal
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
@@ -0,0 +1,4 @@
+# Features supported by ZFSonLinux v0.6.1
+async_destroy
+empty_bpobj
+lz4_compress
@@ -0,0 +1,10 @@
+# Features supported by ZFSonLinux v0.6.4
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+hole_birth
+lz4_compress
+spacemap_histogram
@@ -0,0 +1,12 @@
+# Features supported by ZFSonLinux v0.6.5
+async_destroy
+bookmarks
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+lz4_compress
+spacemap_histogram
@@ -0,0 +1,18 @@
+# Features supported by ZFSonLinux v0.7
+async_destroy
+bookmarks
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+lz4_compress
+multi_vdev_crash_dump
+sha512
+skein
+spacemap_histogram
+userobj_accounting
@@ -0,0 +1,27 @@
+# Features supported by ZFSonLinux v0.8
+allocation_classes
+async_destroy
+bookmark_v2
+bookmarks
+device_removal
+edonr
+embedded_data
+empty_bpobj
+enabled_txg
+encryption
+extensible_dataset
+filesystem_limits
+hole_birth
+large_blocks
+large_dnode
+lz4_compress
+multi_vdev_crash_dump
+obsolete_counts
+project_quota
+resilver_defer
+sha512
+skein
+spacemap_histogram
+spacemap_v2
+userobj_accounting
+zpool_checkpoint
@@ -101,3 +101,18 @@ check_sector_size_database(char *path, int *sector_size)
 {
 	return (0);
 }
+
+/* Tell the user the boot code may need updating when bootfs is set. */
+void
+after_zpool_upgrade(zpool_handle_t *zhp)
+{
+	char bootfs_val[ZPOOL_MAXPROPLEN];
+
+	if (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs_val,
+	    sizeof (bootfs_val), NULL, B_FALSE) != 0 ||
+	    strcmp(bootfs_val, "-") == 0)
+		return;
+	(void) printf(gettext("Pool '%s' has the bootfs property set, "
+	    "you might need to update\nthe boot code. See gptzfsboot(8) "
+	    "and loader.efi(8) for details.\n"), zpool_get_name(zhp));
+}
@@ -79,9 +79,6 @@

 #include <scsi/scsi.h>
 #include <scsi/sg.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
 #include <sys/efi_partition.h>
 #include <sys/stat.h>
 #include <sys/vtoc.h>
@@ -408,3 +405,13 @@ check_device(const char *path, boolean_t force,

 	return (error);
 }
+
+/*
+ * Hook invoked by the zpool upgrade paths once a pool has been modified.
+ * This variant deliberately takes no action.
+ */
+void
+after_zpool_upgrade(zpool_handle_t *zhp)
+{
+	(void) zhp;	/* unused; silences -Wunused-parameter */
+}
@@ -53,7 +53,7 @@ get_filename_from_dir()
 	num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
 	mod=$((pid % num_files))
 	i=0
-	find "$dir" -type f -printf "%f\n" | while read -r file ; do
+	find "$dir" -type f -printf '%f\n' | while read -r file ; do
 		if [ "$mod" = "$i" ] ; then
 			echo "$file"
 			break
@@ -62,17 +62,14 @@ get_filename_from_dir()
 	done
 }

-script=$(basename "$0")
+script="${0##*/}"

 if [ "$1" = "-h" ] ; then
        echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
        exit
 fi

-smartctl_path=$(command -v smartctl)
-
-# shellcheck disable=SC2015
-if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
+if [ -b "$VDEV_UPATH" ] && PATH="/usr/sbin:$PATH" command -v smartctl > /dev/null || [ -n "$samples" ] ; then
 	if [ -n "$samples" ] ; then
 		# cat a smartctl output text file instead of running smartctl
 		# on a vdev (only used for developer testing).
@@ -80,7 +77,7 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
 		echo "file=$file"
 		raw_out=$(cat "$samples/$file")
 	else
-		raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
+		raw_out=$(sudo smartctl -a "$VDEV_UPATH")
 	fi

 	# What kind of drive are we?  Look for the right line in smartctl:
@@ -231,11 +228,11 @@ esac
 with_vals=$(echo "$out" | grep -E "$scripts")
 if [ -n "$with_vals" ]; then
 	echo "$with_vals"
-	without_vals=$(echo "$scripts" | tr "|" "\n" |
+	without_vals=$(echo "$scripts" | tr '|' '\n' |
 		grep -v -E "$(echo "$with_vals" |
 		awk -F "=" '{print $1}')" | awk '{print $0"="}')
 else
-	without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}')
+	without_vals=$(echo "$scripts" | tr '|' '\n' | awk '{print $0"="}')
 fi

 if [ -n "$without_vals" ]; then
@@ -494,19 +494,25 @@ vdev_run_cmd(vdev_cmd_data_t *data, char *cmd)
 	/* Setup our custom environment variables */
 	rc = asprintf(&env[1], "VDEV_PATH=%s",
 	    data->path ? data->path : "");
-	if (rc == -1)
+	if (rc == -1) {
+		env[1] = NULL;
 		goto out;
+	}

 	rc = asprintf(&env[2], "VDEV_UPATH=%s",
 	    data->upath ? data->upath : "");
-	if (rc == -1)
+	if (rc == -1) {
+		env[2] = NULL;
 		goto out;
+	}

 	rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s",
 	    data->vdev_enc_sysfs_path ?
 	    data->vdev_enc_sysfs_path : "");
-	if (rc == -1)
+	if (rc == -1) {
+		env[3] = NULL;
 		goto out;
+	}

 	/* Run the command */
 	rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines,
@@ -525,8 +531,7 @@ out:

 	/* Start with i = 1 since env[0] was statically allocated */
 	for (i = 1; i < ARRAY_SIZE(env); i++)
-		if (env[i] != NULL)
-			free(env[i]);
+		free(env[i]);
 }

 /*
@@ -31,6 +31,8 @@
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright [2021] Hewlett Packard Enterprise Development LP
 */

 #include <assert.h>
@@ -124,6 +126,9 @@ static int zpool_do_version(int, char **);

 static int zpool_do_wait(int, char **);

+static zpool_compat_status_t zpool_do_load_compat(
+    const char *, boolean_t *);
+
 /*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
@@ -528,7 +533,7 @@ usage(boolean_t requested)
 		(void) fprintf(fp, "YES   disabled | enabled | active\n");

 		(void) fprintf(fp, gettext("\nThe feature@ properties must be "
-		    "appended with a feature name.\nSee zpool-features(5).\n"));
+		    "appended with a feature name.\nSee zpool-features(7).\n"));
 	}

 	/*
@@ -782,6 +787,8 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,

 	if (poolprop) {
 		const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
+		const char *cname =
+		    zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY);

 		if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL &&
 		    !zpool_prop_feature(propname)) {
@@ -804,6 +811,22 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,
 			return (2);
 		}

+		/*
+		 * if version is specified, only "legacy" compatibility
+		 * may be requested
+		 */
+		if ((prop == ZPOOL_PROP_COMPATIBILITY &&
+		    strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 &&
+		    nvlist_exists(proplist, vname)) ||
+		    (prop == ZPOOL_PROP_VERSION &&
+		    nvlist_exists(proplist, cname) &&
+		    strcmp(fnvlist_lookup_string(proplist, cname),
+		    ZPOOL_COMPAT_LEGACY) != 0)) {
+			(void) fprintf(stderr, gettext("when 'version' is "
+			    "specified, the 'compatibility' feature may only "
+			    "be set to '" ZPOOL_COMPAT_LEGACY "'\n"));
+			return (2);
+		}

 		if (zpool_prop_feature(propname))
 			normnm = propname;
@@ -1046,7 +1069,7 @@ zpool_do_add(int argc, char **argv)
 				free(vname);
 			}
 		}
-		/* And finaly the spares */
+		/* And finally the spares */
 		if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES,
 		    &sparechild, &sparechildren) == 0 && sparechildren > 0) {
 			hadspare = B_TRUE;
@@ -1374,13 +1397,15 @@ zpool_do_create(int argc, char **argv)
 {
 	boolean_t force = B_FALSE;
 	boolean_t dryrun = B_FALSE;
-	boolean_t enable_all_pool_feat = B_TRUE;
+	boolean_t enable_pool_features = B_TRUE;
+
 	int c;
 	nvlist_t *nvroot = NULL;
 	char *poolname;
 	char *tname = NULL;
 	int ret = 1;
 	char *altroot = NULL;
+	char *compat = NULL;
 	char *mountpoint = NULL;
 	nvlist_t *fsprops = NULL;
 	nvlist_t *props = NULL;
@@ -1396,7 +1421,7 @@ zpool_do_create(int argc, char **argv)
 			dryrun = B_TRUE;
 			break;
 		case 'd':
-			enable_all_pool_feat = B_FALSE;
+			enable_pool_features = B_FALSE;
 			break;
 		case 'R':
 			altroot = optarg;
@@ -1434,11 +1459,14 @@ zpool_do_create(int argc, char **argv)
 				ver = strtoull(propval, &end, 10);
 				if (*end == '\0' &&
 				    ver < SPA_VERSION_FEATURES) {
-					enable_all_pool_feat = B_FALSE;
+					enable_pool_features = B_FALSE;
 				}
 			}
 			if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
 				altroot = propval;
+			if (zpool_name_to_prop(optarg) ==
+			    ZPOOL_PROP_COMPATIBILITY)
+				compat = propval;
 			break;
 		case 'O':
 			if ((propval = strchr(optarg, '=')) == NULL) {
@@ -1632,10 +1660,27 @@ zpool_do_create(int argc, char **argv)
 		ret = 0;
 	} else {
 		/*
-		 * Hand off to libzfs.
+		 * Load in feature set.
+		 * Note: if compatibility property not given, we'll have
+		 * NULL, which means 'all features'.
 		 */
-		spa_feature_t i;
-		for (i = 0; i < SPA_FEATURES; i++) {
+		boolean_t requested_features[SPA_FEATURES];
+		if (zpool_do_load_compat(compat, requested_features) !=
+		    ZPOOL_COMPATIBILITY_OK)
+			goto errout;
+
+		/*
+		 * props contains list of features to enable.
+		 * For each feature:
+		 *  - remove it if feature@name=disabled
+		 *  - leave it there if feature@name=enabled
+		 *  - add it if:
+		 *    - enable_pool_features (ie: no '-d' or '-o version')
+		 *    - it's supported by the kernel module
+		 *    - it's in the requested feature set
+		 *  - warn if it's enabled but not in compat
+		 */
+		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			char propname[MAXPATHLEN];
 			char *propval;
 			zfeature_info_t *feat = &spa_feature_table[i];
@@ -1643,17 +1688,22 @@ zpool_do_create(int argc, char **argv)
 			(void) snprintf(propname, sizeof (propname),
 			    "feature@%s", feat->fi_uname);

-			/*
-			 * Only features contained in props will be enabled:
-			 * remove from the nvlist every ZFS_FEATURE_DISABLED
-			 * value and add every missing ZFS_FEATURE_ENABLED if
-			 * enable_all_pool_feat is set.
-			 */
 			if (!nvlist_lookup_string(props, propname, &propval)) {
 				if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0)
 					(void) nvlist_remove_all(props,
 					    propname);
-			} else if (enable_all_pool_feat) {
+				if (strcmp(propval,
+				    ZFS_FEATURE_ENABLED) == 0 &&
+				    !requested_features[i])
+					(void) fprintf(stderr, gettext(
+					    "Warning: feature \"%s\" enabled "
+					    "but is not in specified "
+					    "'compatibility' feature set.\n"),
+					    feat->fi_uname);
+			} else if (
+			    enable_pool_features &&
+			    feat->fi_zfs_mod_supported &&
+			    requested_features[i]) {
 				ret = add_prop_list(propname,
 				    ZFS_FEATURE_ENABLED, &props, B_TRUE);
 				if (ret != 0)
@@ -2009,7 +2059,7 @@ zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path)
 			 * Mark empty values with dashes to make output
 			 * awk-able.
 			 */
-			if (is_blank_str(val))
+			if (val == NULL || is_blank_str(val))
 				val = "-";

 			printf("%*s", vcdl->uniq_cols_width[j], val);
@@ -2330,6 +2380,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 			(void) printf(gettext("all children offline"));
 			break;

+		case VDEV_AUX_BAD_LABEL:
+			(void) printf(gettext("invalid label"));
+			break;
+
 		default:
 			(void) printf(gettext("corrupted data"));
 			break;
@@ -2375,7 +2429,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 		}
 	}

-	/* Display vdev initialization and trim status for leaves */
+	/* Display vdev initialization and trim status for leaves. */
 	if (children == 0) {
 		print_status_initialize(vs, cb->cb_print_vdev_init);
 		print_status_trim(vs, cb->cb_print_vdev_trim);
@@ -2472,6 +2526,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
 			(void) printf(gettext("all children offline"));
 			break;

+		case VDEV_AUX_BAD_LABEL:
+			(void) printf(gettext("invalid label"));
+			break;
+
 		default:
 			(void) printf(gettext("corrupted data"));
 			break;
@@ -2680,8 +2738,24 @@ show_import(nvlist_t *config, boolean_t report_error)

 	case ZPOOL_STATUS_FEAT_DISABLED:
 		printf_color(ANSI_BOLD, gettext("status: "));
-		printf_color(ANSI_YELLOW, gettext("Some supported features are "
-		    "not enabled on the pool.\n"));
+		printf_color(ANSI_YELLOW, gettext("Some supported "
+		    "features are not enabled on the pool.\n\t"
+		    "(Note that they may be intentionally disabled "
+		    "if the\n\t'compatibility' property is set.)\n"));
+		break;
+
+	case ZPOOL_STATUS_COMPATIBILITY_ERR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Error reading or parsing "
+		    "the file(s) indicated by the 'compatibility'\n"
+		    "property.\n"));
+		break;
+
+	case ZPOOL_STATUS_INCOMPATIBLE_FEAT:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more features "
+		    "are enabled on the pool despite not being\n"
+		    "requested by the 'compatibility' property.\n"));
 		break;

 	case ZPOOL_STATUS_UNSUP_FEAT_READ:
@@ -2773,6 +2847,12 @@ show_import(nvlist_t *config, boolean_t report_error)
 			    "imported using its name or numeric identifier, "
 			    "though\n\tsome features will not be available "
 			    "without an explicit 'zpool upgrade'.\n"));
+		} else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) {
+			(void) printf(gettext(" action: The pool can be "
+			    "imported using its name or numeric\n\tidentifier, "
+			    "though the file(s) indicated by its "
+			    "'compatibility'\n\tproperty cannot be parsed at "
+			    "this time.\n"));
 		} else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
 			(void) printf(gettext(" action: The pool can be "
 			    "imported using its name or numeric "
@@ -8004,7 +8084,9 @@ status_callback(zpool_handle_t *zhp, void *data)
 	if (cbp->cb_explain &&
 	    (reason == ZPOOL_STATUS_OK ||
 	    reason == ZPOOL_STATUS_VERSION_OLDER ||
-	    reason == ZPOOL_STATUS_FEAT_DISABLED)) {
+	    reason == ZPOOL_STATUS_FEAT_DISABLED ||
+	    reason == ZPOOL_STATUS_COMPATIBILITY_ERR ||
+	    reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) {
 		if (!cbp->cb_allpools) {
 			(void) printf(gettext("pool '%s' is healthy\n"),
 			    zpool_get_name(zhp));
@@ -8179,14 +8261,40 @@ status_callback(zpool_handle_t *zhp, void *data)

 	case ZPOOL_STATUS_FEAT_DISABLED:
 		printf_color(ANSI_BOLD, gettext("status: "));
-		printf_color(ANSI_YELLOW, gettext("Some supported features are "
-		    "not enabled on the pool. The pool can\n\tstill be used, "
-		    "but some features are unavailable.\n"));
+		printf_color(ANSI_YELLOW, gettext("Some supported and "
+		    "requested features are not enabled on the pool.\n\t"
+		    "The pool can still be used, but some features are "
+		    "unavailable.\n"));
 		printf_color(ANSI_BOLD, gettext("action: "));
 		printf_color(ANSI_YELLOW, gettext("Enable all features using "
 		    "'zpool upgrade'. Once this is done,\n\tthe pool may no "
 		    "longer be accessible by software that does not support\n\t"
-		    "the features. See zpool-features(5) for details.\n"));
+		    "the features. See zpool-features(7) for details.\n"));
+		break;
+
+	case ZPOOL_STATUS_COMPATIBILITY_ERR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("This pool has a "
+		    "compatibility list specified, but it could not be\n\t"
+		    "read/parsed at this time. The pool can still be used, "
+		    "but this\n\tshould be investigated.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Check the value of the "
+		    "'compatibility' property against the\n\t"
+		    "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or "
+		    ZPOOL_DATA_COMPAT_D ".\n"));
+		break;
+
+	case ZPOOL_STATUS_INCOMPATIBLE_FEAT:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more features "
+		    "are enabled on the pool despite not being\n\t"
+		    "requested by the 'compatibility' property.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Consider setting "
+		    "'compatibility' to an appropriate value, or\n\t"
+		    "adding needed features to the relevant file in\n\t"
+		    ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n"));
 		break;

 	case ZPOOL_STATUS_UNSUP_FEAT_READ:
@@ -8648,6 +8756,11 @@ upgrade_version(zpool_handle_t *zhp, uint64_t version)
 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &oldversion) == 0);

+	char compat[ZFS_MAXPROPLEN];
+	if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat,
+	    ZFS_MAXPROPLEN, NULL, B_FALSE) != 0)
+		compat[0] = '\0';
+
 	assert(SPA_VERSION_IS_SUPPORTED(oldversion));
 	assert(oldversion < version);

@@ -8662,6 +8775,13 @@ upgrade_version(zpool_handle_t *zhp, uint64_t version)
 		return (1);
 	}

+	if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) {
+		(void) fprintf(stderr, gettext("Upgrade not performed because "
+		    "'compatibility' property set to '"
+		    ZPOOL_COMPAT_LEGACY "'.\n"));
+		return (1);
+	}
+
 	ret = zpool_upgrade(zhp, version);
 	if (ret != 0)
 		return (ret);
@@ -8687,11 +8807,25 @@ upgrade_enable_all(zpool_handle_t *zhp, int *countp)
 	boolean_t firstff = B_TRUE;
 	nvlist_t *enabled = zpool_get_features(zhp);

+	char compat[ZFS_MAXPROPLEN];
+	if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat,
+	    ZFS_MAXPROPLEN, NULL, B_FALSE) != 0)
+		compat[0] = '\0';
+
+	boolean_t requested_features[SPA_FEATURES];
+	if (zpool_do_load_compat(compat, requested_features) !=
+	    ZPOOL_COMPATIBILITY_OK)
+		return (-1);
+
 	count = 0;
 	for (i = 0; i < SPA_FEATURES; i++) {
 		const char *fname = spa_feature_table[i].fi_uname;
 		const char *fguid = spa_feature_table[i].fi_guid;
-		if (!nvlist_exists(enabled, fguid)) {
+
+		if (!spa_feature_table[i].fi_zfs_mod_supported)
+			continue;
+
+		if (!nvlist_exists(enabled, fguid) && requested_features[i]) {
 			char *propname;
 			verify(-1 != asprintf(&propname, "feature@%s", fname));
 			ret = zpool_set_prop(zhp, propname,
@@ -8724,7 +8858,7 @@ upgrade_cb(zpool_handle_t *zhp, void *arg)
 	upgrade_cbdata_t *cbp = arg;
 	nvlist_t *config;
 	uint64_t version;
-	boolean_t printnl = B_FALSE;
+	boolean_t modified_pool = B_FALSE;
 	int ret;

 	config = zpool_get_config(zhp, NULL);
@@ -8738,7 +8872,7 @@ upgrade_cb(zpool_handle_t *zhp, void *arg)
 		ret = upgrade_version(zhp, cbp->cb_version);
 		if (ret != 0)
 			return (ret);
-		printnl = B_TRUE;
+		modified_pool = B_TRUE;

 		/*
 		 * If they did "zpool upgrade -a", then we could
@@ -8758,12 +8892,13 @@ upgrade_cb(zpool_handle_t *zhp, void *arg)

 		if (count > 0) {
 			cbp->cb_first = B_FALSE;
-			printnl = B_TRUE;
+			modified_pool = B_TRUE;
 		}
 	}

-	if (printnl) {
-		(void) printf(gettext("\n"));
+	if (modified_pool) {
+		(void) printf("\n");
+		(void) after_zpool_upgrade(zhp);
 	}

 	return (0);
@@ -8789,7 +8924,10 @@ upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
 			    "be upgraded to use feature flags.  After "
 			    "being upgraded, these pools\nwill no "
 			    "longer be accessible by software that does not "
-			    "support feature\nflags.\n\n"));
+			    "support feature\nflags.\n\n"
+			    "Note that setting a pool's 'compatibility' "
+			    "feature to '" ZPOOL_COMPAT_LEGACY "' will\n"
+			    "inhibit upgrades.\n\n"));
 			(void) printf(gettext("VER  POOL\n"));
 			(void) printf(gettext("---  ------------\n"));
 			cbp->cb_first = B_FALSE;
@@ -8821,6 +8959,10 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
 		for (i = 0; i < SPA_FEATURES; i++) {
 			const char *fguid = spa_feature_table[i].fi_guid;
 			const char *fname = spa_feature_table[i].fi_uname;
+
+			if (!spa_feature_table[i].fi_zfs_mod_supported)
+				continue;
+
 			if (!nvlist_exists(enabled, fguid)) {
 				if (cbp->cb_first) {
 					(void) printf(gettext("\nSome "
@@ -8830,8 +8972,12 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
 					    "pool may become incompatible with "
 					    "software\nthat does not support "
 					    "the feature. See "
-					    "zpool-features(5) for "
-					    "details.\n\n"));
+					    "zpool-features(7) for "
+					    "details.\n\n"
+					    "Note that the pool "
+					    "'compatibility' feature can be "
+					    "used to inhibit\nfeature "
+					    "upgrades.\n\n"));
 					(void) printf(gettext("POOL  "
 					    "FEATURE\n"));
 					(void) printf(gettext("------"
@@ -8865,7 +9011,7 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
 static int
 upgrade_one(zpool_handle_t *zhp, void *data)
 {
-	boolean_t printnl = B_FALSE;
+	boolean_t modified_pool = B_FALSE;
 	upgrade_cbdata_t *cbp = data;
 	uint64_t cur_version;
 	int ret;
@@ -8893,7 +9039,7 @@ upgrade_one(zpool_handle_t *zhp, void *data)
 	}

 	if (cur_version != cbp->cb_version) {
-		printnl = B_TRUE;
+		modified_pool = B_TRUE;
 		ret = upgrade_version(zhp, cbp->cb_version);
 		if (ret != 0)
 			return (ret);
@@ -8906,16 +9052,17 @@ upgrade_one(zpool_handle_t *zhp, void *data)
 			return (ret);

 		if (count != 0) {
-			printnl = B_TRUE;
+			modified_pool = B_TRUE;
 		} else if (cur_version == SPA_VERSION) {
 			(void) printf(gettext("Pool '%s' already has all "
-			    "supported features enabled.\n"),
+			    "supported and requested features enabled.\n"),
 			    zpool_get_name(zhp));
 		}
 	}

-	if (printnl) {
-		(void) printf(gettext("\n"));
+	if (modified_pool) {
+		(void) printf("\n");
+		(void) after_zpool_upgrade(zhp);
 	}

 	return (0);
@@ -9010,6 +9157,8 @@ zpool_do_upgrade(int argc, char **argv)
 		    "---------------\n");
 		for (i = 0; i < SPA_FEATURES; i++) {
 			zfeature_info_t *fi = &spa_feature_table[i];
+			if (!fi->fi_zfs_mod_supported)
+				continue;
 			const char *ro =
 			    (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
 			    " (read-only compatible)" : "";
@@ -9070,8 +9219,8 @@ zpool_do_upgrade(int argc, char **argv)
 				(void) printf(gettext("All pools are already "
 				    "formatted using feature flags.\n\n"));
 				(void) printf(gettext("Every feature flags "
-				    "pool already has all supported features "
-				    "enabled.\n"));
+				    "pool already has all supported and "
+				    "requested features enabled.\n"));
 			} else {
 				(void) printf(gettext("All pools are already "
 				    "formatted with version %llu or higher.\n"),
@@ -9097,7 +9246,7 @@ zpool_do_upgrade(int argc, char **argv)

 		if (cb.cb_first) {
 			(void) printf(gettext("Every feature flags pool has "
-			    "all supported features enabled.\n"));
+			    "all supported and requested features enabled.\n"));
 		} else {
 			(void) printf(gettext("\n"));
 		}
@@ -9126,7 +9275,7 @@ print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb)
 	    &records, &numrecords) == 0);
 	for (i = 0; i < numrecords; i++) {
 		nvlist_t *rec = records[i];
-		char tbuf[30] = "";
+		char tbuf[64] = "";

 		if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
 			time_t tsec;
@@ -9138,6 +9287,14 @@ print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb)
 			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		}

+		if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) {
+			uint64_t elapsed_ns = fnvlist_lookup_int64(records[i],
+			    ZPOOL_HIST_ELAPSED_NS);
+			(void) snprintf(tbuf + strlen(tbuf),
+			    sizeof (tbuf) - strlen(tbuf),
+			    " (%lldms)", (long long)elapsed_ns / 1000 / 1000);
+		}
+
 		if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
 			(void) printf("%s %s", tbuf,
 			    fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
@@ -9877,6 +10034,63 @@ set_callback(zpool_handle_t *zhp, void *data)
 	int error;
 	set_cbdata_t *cb = (set_cbdata_t *)data;

+	/* Check if we have out-of-bounds features */
+	if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) {
+		boolean_t features[SPA_FEATURES];
+		if (zpool_do_load_compat(cb->cb_value, features) !=
+		    ZPOOL_COMPATIBILITY_OK)
+			return (-1);
+
+		nvlist_t *enabled = zpool_get_features(zhp);
+		spa_feature_t i;
+		for (i = 0; i < SPA_FEATURES; i++) {
+			const char *fguid = spa_feature_table[i].fi_guid;
+			if (nvlist_exists(enabled, fguid) && !features[i])
+				break;
+		}
+		if (i < SPA_FEATURES)
+			(void) fprintf(stderr, gettext("Warning: one or "
+			    "more features already enabled on pool '%s'\n"
+			    "are not present in this compatibility set.\n"),
+			    zpool_get_name(zhp));
+	}
+
+	/* if we're setting a feature, check it's in compatibility set */
+	if (zpool_prop_feature(cb->cb_propname) &&
+	    strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) {
+		char *fname = strchr(cb->cb_propname, '@') + 1;
+		spa_feature_t f;
+
+		if (zfeature_lookup_name(fname, &f) == 0) {
+			char compat[ZFS_MAXPROPLEN];
+			if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY,
+			    compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0)
+				compat[0] = '\0';
+
+			boolean_t features[SPA_FEATURES];
+			if (zpool_do_load_compat(compat, features) !=
+			    ZPOOL_COMPATIBILITY_OK) {
+				(void) fprintf(stderr, gettext("Error: "
+				    "cannot enable feature '%s' on pool '%s'\n"
+				    "because the pool's 'compatibility' "
+				    "property cannot be parsed.\n"),
+				    fname, zpool_get_name(zhp));
+				return (-1);
+			}
+
+			if (!features[f]) {
+				(void) fprintf(stderr, gettext("Error: "
+				    "cannot enable feature '%s' on pool '%s'\n"
+				    "as it is not specified in this pool's "
+				    "current compatibility set.\n"
+				    "Consider setting 'compatibility' to a "
+				    "less restrictive set, or to 'off'.\n"),
+				    fname, zpool_get_name(zhp));
+				return (-1);
+			}
+		}
+	}
+
 	error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value);

 	if (!error)
@@ -10007,7 +10221,8 @@ vdev_any_spare_replacing(nvlist_t *nv)
 	(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type);

 	if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
-	    strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) {
+	    strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 ||
+	    strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) {
 		return (B_TRUE);
 	}

@@ -10392,6 +10607,36 @@ zpool_do_version(int argc, char **argv)
 	return (0);
 }

+/*
+ * Wrapper around zpool_load_compat() which prints the generated report
+ * to stderr when something went wrong.
+ *
+ * NOFILES, BADFILE and BADTOKEN results are printed with an "Error: "
+ * prefix and returned unchanged.  A WARNTOKEN result is printed with a
+ * "Warning: " prefix and then reported to the caller as
+ * ZPOOL_COMPATIBILITY_OK.  Any other status is returned silently.
+ */
+static zpool_compat_status_t
+zpool_do_load_compat(const char *compat, boolean_t *list)
+{
+	char report[1024];
+	zpool_compat_status_t status;
+
+	status = zpool_load_compat(compat, list, report, sizeof (report));
+
+	/* Warnings are shown to the user but never treated as a failure. */
+	if (status == ZPOOL_COMPATIBILITY_WARNTOKEN) {
+		(void) fprintf(stderr, "Warning: %s\n", report);
+		return (ZPOOL_COMPATIBILITY_OK);
+	}
+
+	if (status == ZPOOL_COMPATIBILITY_NOFILES ||
+	    status == ZPOOL_COMPATIBILITY_BADFILE ||
+	    status == ZPOOL_COMPATIBILITY_BADTOKEN) {
+		(void) fprintf(stderr, "Error: %s\n", report);
+	}
+
+	return (status);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -129,6 +129,7 @@ int check_device(const char *path, boolean_t force,
 boolean_t check_sector_size_database(char *path, int *sector_size);
 void vdev_error(const char *fmt, ...);
 int check_file(const char *file, boolean_t force, boolean_t isspare);
+void after_zpool_upgrade(zpool_handle_t *zhp);

 #ifdef	__cplusplus
 }
@@ -86,9 +86,6 @@
 boolean_t error_seen;
 boolean_t is_force;

-
-
-
 /*PRINTFLIKE1*/
 void
 vdev_error(const char *fmt, ...)
@@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path)
 	uint_t i, nspares;
 	boolean_t inuse;

+	if (zpool_is_draid_spare(path))
+		return (B_TRUE);
+
 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
 		return (B_FALSE);

@@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path)
 *	/dev/xxx	Complete disk path
 *	/xxx		Full path to file
 *	xxx		Shorthand for <zfs_vdev_paths>/xxx
+ *	draid*		Virtual dRAID spare
 */
 static nvlist_t *
-make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
+make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
 {
 	char path[MAXPATHLEN];
 	struct stat64 statbuf;
@@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)

 		/* After whole disk check restore original passed path */
 		strlcpy(path, arg, sizeof (path));
+	} else if (zpool_is_draid_spare(arg)) {
+		if (!is_primary) {
+			(void) fprintf(stderr,
+			    gettext("cannot open '%s': dRAID spares can only "
+			    "be used to replace primary vdevs\n"), arg);
+			return (NULL);
+		}
+
+		wholedisk = B_TRUE;
+		strlcpy(path, arg, sizeof (path));
+		type = VDEV_TYPE_DRAID_SPARE;
 	} else {
 		err = is_shorthand_path(arg, path, sizeof (path),
 		    &statbuf, &wholedisk);
@@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
 		}
 	}

-	/*
-	 * Determine whether this is a device or a file.
-	 */
-	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
-		type = VDEV_TYPE_DISK;
-	} else if (S_ISREG(statbuf.st_mode)) {
-		type = VDEV_TYPE_FILE;
-	} else {
-		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
-		    "block device or regular file\n"), path);
-		return (NULL);
+	if (type == NULL) {
+		/*
+		 * Determine whether this is a device or a file.
+		 */
+		if (wholedisk || S_ISBLK(statbuf.st_mode)) {
+			type = VDEV_TYPE_DISK;
+		} else if (S_ISREG(statbuf.st_mode)) {
+			type = VDEV_TYPE_FILE;
+		} else {
+			fprintf(stderr, gettext("cannot use '%s': must "
+			    "be a block device or regular file\n"), path);
+			return (NULL);
+		}
 	}

 	/*
@@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
-	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
-	if (is_log)
-		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
-		    VDEV_ALLOC_BIAS_LOG) == 0);
+
 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
 		    (uint64_t)wholedisk) == 0);
@@ -432,11 +443,16 @@ typedef struct replication_level {

 #define	ZPOOL_FUZZ	(16 * 1024 * 1024)

+/*
+ * N.B. For the purposes of comparing replication levels dRAID can be
+ * considered functionally equivalent to raidz.
+ */
 static boolean_t
 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
    replication_level_t **raidz, replication_level_t **mirror)
 {
-	if (strcmp(a->zprl_type, "raidz") == 0 &&
+	if ((strcmp(a->zprl_type, "raidz") == 0 ||
+	    strcmp(a->zprl_type, "draid") == 0) &&
 	    strcmp(b->zprl_type, "mirror") == 0) {
 		*raidz = a;
 		*mirror = b;
@@ -445,6 +461,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b,
 	return (B_FALSE);
 }

+/*
+ * Returns B_TRUE when both replication levels are of type "raidz" or
+ * "draid" (in any combination and in either order), B_FALSE otherwise.
+ */
+static boolean_t
+is_raidz_draid(replication_level_t *a, replication_level_t *b)
+{
+	if (strcmp(a->zprl_type, "raidz") != 0 &&
+	    strcmp(a->zprl_type, "draid") != 0)
+		return (B_FALSE);
+
+	if (strcmp(b->zprl_type, "raidz") != 0 &&
+	    strcmp(b->zprl_type, "draid") != 0)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
 /*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
@@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
 			rep.zprl_type = type;
 			rep.zprl_children = 0;

-			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
+			    strcmp(type, VDEV_TYPE_DRAID) == 0) {
 				verify(nvlist_lookup_uint64(nv,
 				    ZPOOL_CONFIG_NPARITY,
 				    &rep.zprl_parity) == 0);
@@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
 					else
 						return (NULL);
 				}
+			} else if (is_raidz_draid(&lastrep, &rep)) {
+				/*
+				 * Accepted raidz and draid when they can
+				 * handle the same number of disk failures.
+				 */
+				if (lastrep.zprl_parity != rep.zprl_parity) {
+					if (ret != NULL)
+						free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "mismatched replication "
+						    "level: %s and %s vdevs "
+						    "with different "
+						    "redundancy, %llu vs. "
+						    "%llu are present\n"),
+						    lastrep.zprl_type,
+						    rep.zprl_type,
+						    lastrep.zprl_parity,
+						    rep.zprl_parity);
+					else
+						return (NULL);
+				}
 			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
 			    0) {
 				if (ret != NULL)
@@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
 	return (anyinuse);
 }

+/*
+ * Parse the parity level encoded in a raidz or draid type string.
+ *
+ * The type must begin with VDEV_TYPE_RAIDZ or VDEV_TYPE_DRAID.  When
+ * nothing follows the prefix (or, for draid only, when a ':' follows it)
+ * single parity is assumed.  Otherwise the text after the prefix is parsed
+ * as a decimal number which must not start with '0', must lie between 1
+ * and the per-type maximum, and must be followed by the end of the string
+ * (or, for draid only, by a ':').
+ *
+ * Returns the parity level, or zero if it cannot be determined.
+ */
+static int
+get_parity(const char *type)
+{
+	const size_t raidz_len = strlen(VDEV_TYPE_RAIDZ);
+	const size_t draid_len = strlen(VDEV_TYPE_DRAID);
+	const char *digits;
+	char *stop;
+	char sep;
+	long limit, level;
+
+	if (strncmp(type, VDEV_TYPE_RAIDZ, raidz_len) == 0) {
+		digits = type + raidz_len;
+		limit = VDEV_RAIDZ_MAXPARITY;
+		/* raidz takes no suffix, only end-of-string is accepted */
+		sep = '\0';
+	} else if (strncmp(type, VDEV_TYPE_DRAID, draid_len) == 0) {
+		digits = type + draid_len;
+		limit = VDEV_DRAID_MAXPARITY;
+		/* draid options may follow the parity after a ':' */
+		sep = ':';
+	} else {
+		return (0);
+	}
+
+	/* when unspecified default to single parity */
+	if (*digits == '\0' || *digits == sep)
+		return (1);
+
+	/* no zero prefixes allowed */
+	if (*digits == '0')
+		return (0);
+
+	errno = 0;
+	level = strtol(digits, &stop, 10);
+	if (errno != 0 || level < 1 || level > limit)
+		return (0);
+
+	/* only the end of the string or the type's separator may follow */
+	if (*stop != '\0' && *stop != sep)
+		return (0);
+
+	return ((int)level);
+}
+
+/*
+ * Assign the minimum and maximum number of devices allowed for
+ * the specified type.  On error NULL is returned, otherwise the
+ * type prefix is returned (raidz, mirror, etc).
+ */
 static const char *
 is_grouping(const char *type, int *mindev, int *maxdev)
 {
-	if (strncmp(type, "raidz", 5) == 0) {
-		const char *p = type + 5;
-		char *end;
-		long nparity;
-
-		if (*p == '\0') {
-			nparity = 1;
-		} else if (*p == '0') {
-			return (NULL); /* no zero prefixes allowed */
-		} else {
-			errno = 0;
-			nparity = strtol(p, &end, 10);
-			if (errno != 0 || nparity < 1 || nparity >= 255 ||
-			    *end != '\0')
-				return (NULL);
-		}
+	int nparity;

+	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+	    strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
+		nparity = get_parity(type);
+		if (nparity == 0)
+			return (NULL);
 		if (mindev != NULL)
 			*mindev = nparity + 1;
 		if (maxdev != NULL)
 			*maxdev = 255;
-		return (VDEV_TYPE_RAIDZ);
+
+		if (strncmp(type, VDEV_TYPE_RAIDZ,
+		    strlen(VDEV_TYPE_RAIDZ)) == 0) {
+			return (VDEV_TYPE_RAIDZ);
+		} else {
+			return (VDEV_TYPE_DRAID);
+		}
 	}

 	if (maxdev != NULL)
@@ -1167,6 +1279,163 @@ is_grouping(const char *type, int *mindev, int *maxdev)
 	return (NULL);
 }

+/*
+ * Extract the configuration parameters encoded in the dRAID type and
+ * use them to generate a dRAID configuration.  The expected format is:
+ *
+ * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
+ *
+ * The intent is to be able to generate a good configuration when no
+ * additional information is provided.  The only mandatory component
+ * of the 'type' is the 'draid' prefix.  If a value is not provided
+ * then reasonable defaults are used.  The optional components may
+ * appear in any order but the d/s/c suffix is required.
+ *
+ * Valid inputs:
+ * - data:     number of data devices per group (1-255)
+ * - parity:   number of parity blocks per group (1-3)
+ * - spares:   number of distributed spares (0-100)
+ * - children: total number of devices (1-255)
+ *
+ * Examples:
+ * - zpool create tank draid <devices...>
+ * - zpool create tank draid2:8d:51c:2s <devices...>
+ *
+ * Arguments:
+ * - nv:       nvlist which receives the NPARITY, DRAID_NDATA,
+ *             DRAID_NSPARES and DRAID_NGROUPS entries on success
+ * - type:     the full type string as given on the command line
+ * - children: the number of leaf devices actually provided
+ *
+ * Returns 0 on success.  On failure EINVAL is returned, 'nv' is left
+ * untouched and, except for a bad prefix or parity, a message describing
+ * the problem is printed to stderr.
+ */
+static int
+draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
+{
+	uint64_t nparity = 1;
+	uint64_t nspares = 0;
+	uint64_t ndata = UINT64_MAX;
+	uint64_t ngroups = 1;
+	long value;
+
+	if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
+		return (EINVAL);
+
+	nparity = (uint64_t)get_parity(type);
+	if (nparity == 0)
+		return (EINVAL);
+
+	/* Walk every ':'-introduced option following the draid prefix. */
+	const char *p = type;
+	while ((p = strchr(p, ':')) != NULL) {
+		char *end;
+
+		p = p + 1;
+		errno = 0;
+
+		/* Requiring a leading digit also rejects signs and blanks. */
+		if (!isdigit((unsigned char)p[0])) {
+			(void) fprintf(stderr, gettext("invalid dRAID "
+			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
+			    type);
+			return (EINVAL);
+		}
+
+		/* Expect a decimal value followed by a c/d/s suffix */
+		value = strtol(p, &end, 10);
+		char suffix = tolower((unsigned char)*end);
+		if (errno != 0 ||
+		    (suffix != 'c' && suffix != 'd' && suffix != 's')) {
+			(void) fprintf(stderr, gettext("invalid dRAID "
+			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
+			    type);
+			return (EINVAL);
+		}
+
+		if (suffix == 'c') {
+			if ((uint64_t)value != children) {
+				fprintf(stderr,
+				    gettext("invalid number of dRAID children; "
+				    "%llu required but %llu provided\n"),
+				    (u_longlong_t)value,
+				    (u_longlong_t)children);
+				return (EINVAL);
+			}
+		} else if (suffix == 'd') {
+			ndata = (uint64_t)value;
+		} else if (suffix == 's') {
+			nspares = (uint64_t)value;
+		} else {
+			verify(0); /* Unreachable */
+		}
+	}
+
+	/*
+	 * When a specific number of data disks is not provided limit a
+	 * redundancy group to 8 data disks.  This value was selected to
+	 * provide a reasonable tradeoff between capacity and performance.
+	 */
+	if (ndata == UINT64_MAX) {
+		if (children > nspares + nparity) {
+			ndata = MIN(children - nspares - nparity, 8);
+		} else {
+			fprintf(stderr, gettext("requested number of "
+			    "distributed spares %llu and parity level %llu\n"
+			    "leaves no disks available for data\n"),
+			    (u_longlong_t)nspares, (u_longlong_t)nparity);
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * Verify the maximum allowed group size is never exceeded.  The
+	 * subtraction is only evaluated when children > nspares so that it
+	 * cannot wrap around; the nspares >= children case is rejected by
+	 * the spares check below.
+	 */
+	if (ndata == 0 ||
+	    (children > nspares && ndata + nparity > children - nspares)) {
+		uint64_t avail = 0;
+
+		if (children > nspares + nparity)
+			avail = children - nspares - nparity;
+
+		fprintf(stderr, gettext("requested number of dRAID data "
+		    "disks per group %llu is too high,\nat most %llu disks "
+		    "are available for data\n"), (u_longlong_t)ndata,
+		    (u_longlong_t)avail);
+		return (EINVAL);
+	}
+
+	if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
+		fprintf(stderr,
+		    gettext("invalid dRAID parity level %llu; must be "
+		    "between 1 and %d\n"), (u_longlong_t)nparity,
+		    VDEV_DRAID_MAXPARITY);
+		return (EINVAL);
+	}
+
+	/*
+	 * Verify the requested number of spares can be satisfied.
+	 * An arbitrary limit of 100 distributed spares is applied.
+	 * Too few children for a single group is caught here as well so
+	 * that the unsigned subtraction cannot wrap around.
+	 */
+	if (nspares > 100 || children < ndata + nparity ||
+	    nspares > children - (ndata + nparity)) {
+		fprintf(stderr,
+		    gettext("invalid number of dRAID spares %llu; additional "
+		    "disks would be required\n"), (u_longlong_t)nspares);
+		return (EINVAL);
+	}
+
+	/* Verify the requested number children is sufficient. */
+	if (children < (ndata + nparity + nspares)) {
+		fprintf(stderr, gettext("%llu disks were provided, but at "
+		    "least %llu disks are required for this config\n"),
+		    (u_longlong_t)children,
+		    (u_longlong_t)(ndata + nparity + nspares));
+		return (EINVAL);
+	}
+
+	if (children > VDEV_DRAID_MAX_CHILDREN) {
+		fprintf(stderr, gettext("%llu disks were provided, but "
+		    "dRAID only supports up to %u disks\n"),
+		    (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
+		return (EINVAL);
+	}
+
+	/*
+	 * Calculate the minimum number of groups required to fill a slice.
+	 * This is the LCM of the stripe width (ndata + nparity) and the
+	 * number of data drives (children - nspares).  The checks above
+	 * guarantee children - nspares >= ndata + nparity, so the divisor
+	 * is never zero.
+	 */
+	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
+		ngroups++;
+
+	/* Store the basic dRAID configuration. */
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
+
+	return (0);
+}
+
 /*
 * Construct a syntactically valid vdev specification,
 * and ensure that all devices and files exist and can be opened.
@@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 {
 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
-	const char *type;
-	uint64_t is_log, is_special, is_dedup;
+	const char *type, *fulltype;
+	boolean_t is_log, is_special, is_dedup, is_spare;
 	boolean_t seen_logs;

 	top = NULL;
@@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 	nspares = 0;
 	nlogs = 0;
 	nl2cache = 0;
-	is_log = is_special = is_dedup = B_FALSE;
+	is_log = is_special = is_dedup = is_spare = B_FALSE;
 	seen_logs = B_FALSE;
 	nvroot = NULL;

 	while (argc > 0) {
+		fulltype = argv[0];
 		nv = NULL;

 		/*
-		 * If it's a mirror or raidz, the subsequent arguments are
-		 * its leaves -- until we encounter the next mirror or raidz.
+		 * If it's a mirror, raidz, or draid the subsequent arguments
+		 * are its leaves -- until we encounter the next mirror,
+		 * raidz or draid.
 		 */
-		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
+		if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
 			nvlist_t **child = NULL;
 			int c, children = 0;

@@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 					    "specified only once\n"));
 					goto spec_out;
 				}
+				is_spare = B_TRUE;
 				is_log = is_special = is_dedup = B_FALSE;
 			}

@@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 				}
 				seen_logs = B_TRUE;
 				is_log = B_TRUE;
-				is_special = B_FALSE;
-				is_dedup = B_FALSE;
+				is_special = is_dedup = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				/*
@@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)

 			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
 				is_special = B_TRUE;
-				is_log = B_FALSE;
-				is_dedup = B_FALSE;
+				is_log = is_dedup = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				continue;
@@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)

 			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
 				is_dedup = B_TRUE;
-				is_log = B_FALSE;
-				is_special = B_FALSE;
+				is_log = is_special = is_spare = B_FALSE;
 				argc--;
 				argv++;
 				continue;
@@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 					    "specified only once\n"));
 					goto spec_out;
 				}
-				is_log = is_special = is_dedup = B_FALSE;
+				is_log = is_special = B_FALSE;
+				is_dedup = is_spare = B_FALSE;
 			}

 			if (is_log || is_special || is_dedup) {
@@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 			for (c = 1; c < argc; c++) {
 				if (is_grouping(argv[c], NULL, NULL) != NULL)
 					break;
+
 				children++;
 				child = realloc(child,
 				    children * sizeof (nvlist_t *));
 				if (child == NULL)
 					zpool_no_memory();
 				if ((nv = make_leaf_vdev(props, argv[c],
-				    B_FALSE)) == NULL) {
+				    !(is_log || is_special || is_dedup ||
+				    is_spare))) == NULL) {
 					for (c = 0; c < children - 1; c++)
 						nvlist_free(child[c]);
 					free(child);
@@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 				    type) == 0);
 				verify(nvlist_add_uint64(nv,
 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
-				if (is_log)
+				if (is_log) {
 					verify(nvlist_add_string(nv,
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
 					    VDEV_ALLOC_BIAS_LOG) == 0);
+				}
 				if (is_special) {
 					verify(nvlist_add_string(nv,
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
@@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 					    ZPOOL_CONFIG_NPARITY,
 					    mindev - 1) == 0);
 				}
+				if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
+					if (draid_config_by_type(nv,
+					    fulltype, children) != 0) {
+						for (c = 0; c < children; c++)
+							nvlist_free(child[c]);
+						free(child);
+						goto spec_out;
+					}
+				}
 				verify(nvlist_add_nvlist_array(nv,
 				    ZPOOL_CONFIG_CHILDREN, child,
 				    children) == 0);
@@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 			 * We have a device.  Pass off to make_leaf_vdev() to
 			 * construct the appropriate nvlist describing the vdev.
 			 */
-			if ((nv = make_leaf_vdev(props, argv[0],
-			    is_log)) == NULL)
+			if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
+			    is_special || is_dedup || is_spare))) == NULL)
 				goto spec_out;

-			if (is_log)
+			verify(nvlist_add_uint64(nv,
+			    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+			if (is_log) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_ALLOCATION_BIAS,
+				    VDEV_ALLOC_BIAS_LOG) == 0);
 				nlogs++;
+			}
+
 			if (is_special) {
 				verify(nvlist_add_string(nv,
 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
@@ -0,0 +1 @@
+/zpool_influxdb
@@ -0,0 +1,13 @@
+include $(top_srcdir)/config/Rules.am
+
+zfsexec_PROGRAMS = zpool_influxdb
+
+zpool_influxdb_SOURCES = \
+	zpool_influxdb.c
+
+zpool_influxdb_LDADD = \
+	$(top_builddir)/lib/libspl/libspl.la \
+	$(top_builddir)/lib/libnvpair/libnvpair.la \
+	$(top_builddir)/lib/libzfs/libzfs.la
+
+include $(top_srcdir)/config/CppCheck.am
@@ -0,0 +1,294 @@
+# Influxdb Metrics for ZFS Pools
+The _zpool_influxdb_ program produces
+[influxdb](https://github.com/influxdata/influxdb) line protocol
+compatible metrics from zpools. In the UNIX tradition, _zpool_influxdb_
+does one thing: read statistics from a pool and print them to
+stdout. In many ways, this is a metrics-friendly output of
+statistics normally observed via the `zpool` command.
+
+## Usage
+When run without arguments, _zpool_influxdb_ runs once, reading data
+from all imported pools, and prints to stdout.
+```shell
+zpool_influxdb [options] [poolname]
+```
+If no poolname is specified, then all pools are sampled.
+
+| option | short option | description |
+|---|---|---|
+| --execd | -e | For use with telegraf's `execd` plugin. When [enter] is pressed, the pools are sampled. To exit, use [ctrl+D] |
+| --no-histogram | -n | Do not print histogram information |
+| --signed-int | -i | Use signed integer data type (default=unsigned) |
+| --sum-histogram-buckets | -s | Sum histogram bucket values |
+| --tags key=value[,key=value...] | -t | Add tags to data points. No tag sanity checking is performed. |
+| --help | -h | Print a short usage message |
+
+#### Histogram Bucket Values
+The histogram data collected by ZFS is stored as independent bucket values.
+This works well out-of-the-box with an influxdb data source and grafana's
+heatmap visualization. The influxdb query for a grafana heatmap
+visualization looks like:
+```
+field(disk_read) last() non_negative_derivative(1s)
+```
+
+Another method for storing histogram data sums the values for lower-value
+buckets. For example, a latency bucket tagged "le=10" includes the values
+in the bucket "le=1".
+This method is often used for prometheus histograms.
+The `zpool_influxdb --sum-histogram-buckets` option presents the data from ZFS
+as summed values.
+
+## Measurements
+The following measurements are collected:
+
+| measurement | description | zpool equivalent |
+|---|---|---|
+| zpool_stats | general size and data | zpool list |
+| zpool_scan_stats | scrub, rebuild, and resilver statistics (omitted if no scan has been requested) | zpool status |
+| zpool_vdev_stats | per-vdev statistics | zpool iostat -q |
+| zpool_io_size | per-vdev I/O size histogram | zpool iostat -r |
+| zpool_latency | per-vdev I/O latency histogram | zpool iostat -w |
+| zpool_vdev_queue | per-vdev instantaneous queue depth | zpool iostat -q |
+
+### zpool_stats Description
+zpool_stats contains top-level summary statistics for the pool.
+Performance counters measure the I/Os to the pool's devices.
+
+#### zpool_stats Tags
+
+| label | description |
+|---|---|
+| name | pool name |
+| path | for leaf vdevs, the pathname |
+| state | pool state, as shown by _zpool status_ |
+| vdev | vdev name (root = entire pool) |
+
+#### zpool_stats Fields
+
+| field | units | description |
+|---|---|---|
+| alloc | bytes | allocated space |
+| free | bytes | unallocated space |
+| size | bytes | total pool size |
+| read_bytes | bytes | bytes read since pool import |
+| read_errors | count | number of read errors |
+| read_ops | count | number of read operations |
+| write_bytes | bytes | bytes written since pool import |
+| write_errors | count | number of write errors |
+| write_ops | count | number of write operations |
+
+### zpool_scan_stats Description
+Once a pool has been scrubbed, resilvered, or rebuilt, the zpool_scan_stats
+contain information about the status and performance of the operation.
+Otherwise, the zpool_scan_stats do not exist in the kernel, and therefore
+cannot be reported by this collector.
+
+#### zpool_scan_stats Tags
+
+| label | description |
+|---|---|
+| name | pool name |
+| function | name of the scan function running or recently completed |
+| state | scan state, as shown by _zpool status_ |
+
+#### zpool_scan_stats Fields
+
+| field | units | description |
+|---|---|---|
+| errors | count | number of errors encountered by scan |
+| examined | bytes | total data examined during scan |
+| to_examine | bytes | prediction of total bytes to be scanned |
+| pass_examined | bytes | data examined during current scan pass |
+| issued | bytes | size of I/Os issued to disks |
+| pass_issued | bytes | size of I/Os issued to disks for current pass |
+| processed | bytes | data reconstructed during scan |
+| to_process | bytes | total bytes to be repaired |
+| rate | bytes/sec | examination rate |
+| start_ts | epoch timestamp | start timestamp for scan |
+| pause_ts | epoch timestamp | timestamp for a scan pause request |
+| end_ts | epoch timestamp | completion timestamp for scan |
+| paused_t | seconds | elapsed time while paused |
+| remaining_t | seconds | estimate of time remaining for scan |
+
+### zpool_vdev_stats Description
+The ZFS I/O (ZIO) scheduler uses five queues to schedule I/Os to each vdev.
+These queues are further divided into active and pending states.
+An I/O is pending prior to being issued to the vdev. An active
+I/O has been issued to the vdev. The scheduler and its tunable
+parameters are described at the
+[ZFS documentation for ZIO Scheduler](https://openzfs.github.io/openzfs-docs/Performance%20and%20Tuning/ZIO%20Scheduler.html).
+The ZIO scheduler reports the queue depths as gauges where the value
+represents an instantaneous snapshot of the queue depth at
+the sample time. Therefore, it is not unusual to see all zeroes
+for an idle pool.
+
+#### zpool_vdev_stats Tags
+| label | description |
+|---|---|
+| name | pool name |
+| vdev | vdev name (root = entire pool) |
+
+#### zpool_vdev_stats Fields
+| field | units | description |
+|---|---|---|
+| sync_r_active_queue | entries | synchronous read active queue depth |
+| sync_w_active_queue | entries | synchronous write active queue depth |
+| async_r_active_queue | entries | asynchronous read active queue depth |
+| async_w_active_queue | entries | asynchronous write active queue depth |
+| async_scrub_active_queue | entries | asynchronous scrub active queue depth |
+| sync_r_pend_queue | entries | synchronous read pending queue depth |
+| sync_w_pend_queue | entries | synchronous write pending queue depth |
+| async_r_pend_queue | entries | asynchronous read pending queue depth |
+| async_w_pend_queue | entries | asynchronous write pending queue depth |
+| async_scrub_pend_queue | entries | asynchronous scrub pending queue depth |
+
+### zpool_latency Histogram
+ZFS tracks the latency of each I/O in the ZIO pipeline. This latency can
+be useful for observing latency-related issues that are not easily observed
+using the averaged latency statistics.
+
+The histogram fields show cumulative values from lowest to highest.
+The largest bucket is tagged "le=+Inf", representing the total count
+of I/Os by type and vdev.
+
+#### zpool_latency Histogram Tags
+| label | description |
+|---|---|
+| le | bucket for histogram, latency is less than or equal to bucket value in seconds |
+| name | pool name |
+| path | for leaf vdevs, the device path name, otherwise omitted |
+| vdev | vdev name (root = entire pool) |
+
+#### zpool_latency Histogram Fields
+| field | units | description |
+|---|---|---|
+| total_read | operations | read operations of all types |
+| total_write | operations | write operations of all types |
+| disk_read | operations | disk read operations |
+| disk_write | operations | disk write operations |
+| sync_read | operations | ZIO sync reads |
+| sync_write | operations | ZIO sync writes |
+| async_read | operations | ZIO async reads |
+| async_write | operations | ZIO async writes |
+| scrub | operations | ZIO scrub/scan reads |
+| trim | operations | ZIO trim (aka unmap) writes |
+
+### zpool_io_size Histogram
+ZFS tracks I/O throughout the ZIO pipeline. The size of each I/O is used
+to create a histogram of the size by I/O type and vdev. For example, a
+4KiB write to a mirrored pool will show a 4KiB write to the top-level vdev
+(root) and a 4KiB write to each of the mirror leaf vdevs.
+
+The ZIO pipeline can aggregate I/O operations. For example, a contiguous
+series of writes can be aggregated into a single, larger I/O to the leaf
+vdev. The independent I/O operations reflect the logical operations and
+the aggregated I/O operations reflect the physical operations.
+
+The histogram fields show cumulative values from lowest to highest.
+The largest bucket is tagged "le=+Inf", representing the total count
+of I/Os by type and vdev.
+
+Note: trim I/Os can be larger than 16MiB, but the larger sizes are
+accounted in the 16MiB bucket.
+
+#### zpool_io_size Histogram Tags
+| label | description |
+|---|---|
+| le | bucket for histogram, I/O size is less than or equal to bucket value in bytes |
+| name | pool name |
+| path | for leaf vdevs, the device path name, otherwise omitted |
+| vdev | vdev name (root = entire pool) |
+
+#### zpool_io_size Histogram Fields
+| field | units | description |
+|---|---|---|
+| sync_read_ind | blocks | independent sync reads |
+| sync_write_ind | blocks | independent sync writes |
+| async_read_ind | blocks | independent async reads |
+| async_write_ind | blocks | independent async writes |
+| scrub_read_ind | blocks | independent scrub/scan reads |
+| trim_write_ind | blocks | independent trim (aka unmap) writes |
+| sync_read_agg | blocks | aggregated sync reads |
+| sync_write_agg | blocks | aggregated sync writes |
+| async_read_agg | blocks | aggregated async reads |
+| async_write_agg | blocks | aggregated async writes |
+| scrub_read_agg | blocks | aggregated scrub/scan reads |
+| trim_write_agg | blocks | aggregated trim (aka unmap) writes |
+
+#### About unsigned integers
+Telegraf v1.6.2 and later support unsigned 64-bit integers, which more
+closely match the uint64_t values used by ZFS. By default, zpool_influxdb
+uses ZFS' uint64_t values and influxdb line protocol unsigned integer type.
+If you are using an older version of telegraf or influxdb where unsigned
+integers are not available, use the `--signed-int` option.
+
+## Using _zpool_influxdb_
+
+The simplest method is to use the execd input agent in telegraf. For older
+versions of telegraf which lack execd, the exec input agent can be used.
+For convenience, one of the sample config files below can be placed in the
+telegraf config-directory (often /etc/telegraf/telegraf.d). Telegraf can
+be restarted to read the config-directory files.
+
+### Example telegraf execd configuration
+```toml
+# # Read metrics from zpool_influxdb
+[[inputs.execd]]
+#   ## default installation location for zpool_influxdb command
+  command = ["/usr/libexec/zfs/zpool_influxdb", "--execd"]
+
+    ## Define how the process is signaled on each collection interval.
+    ## Valid values are:
+    ##   "none"    : Do not signal anything. (Recommended for service inputs)
+    ##               The process must output metrics by itself.
+    ##   "STDIN"   : Send a newline on STDIN. (Recommended for gather inputs)
+    ##   "SIGHUP"  : Send a HUP signal. Not available on Windows. (not recommended)
+    ##   "SIGUSR1" : Send a USR1 signal. Not available on Windows.
+    ##   "SIGUSR2" : Send a USR2 signal. Not available on Windows.
+  signal = "STDIN"
+
+  ## Delay before the process is restarted after an unexpected termination
+  restart_delay = "10s"
+
+    ## Data format to consume.
+    ## Each data format has its own unique set of configuration options, read
+    ## more about them here:
+    ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
+```
+
+### Example telegraf exec configuration
+```toml
+# # Read metrics from zpool_influxdb
+[[inputs.exec]]
+#   ## default installation location for zpool_influxdb command
+  commands = ["/usr/libexec/zfs/zpool_influxdb"]
+  data_format = "influx"
+```
+
+## Caveat Emptor
+* Like the _zpool_ command, _zpool_influxdb_ takes a reader
+  lock on spa_config for each imported pool. If this lock blocks,
+  then the command will also block indefinitely and might be
+  unkillable. This is not a normal condition, but can occur if
+  there are bugs in the kernel modules.
+  For this reason, care should be taken:
+  * avoid spawning many of these commands hoping that one might
+    finish
+  * avoid frequent updates or short sample time
+    intervals, because the locks can interfere with the performance
+    of other instances of _zpool_ or _zpool_influxdb_
+
+## Other collectors
+There are a few other collectors for zpool statistics roaming around
+the Internet. Many attempt to screen-scrape `zpool` output in various
+ways. The screen-scrape method works poorly for `zpool` output because
+of its human-friendly nature. Also, they suffer from the same caveats
+as this implementation. This implementation is optimized for directly
+collecting the metrics and is much more efficient than the screen-scrapers.
+
+## Feedback Encouraged
+Pull requests and issues are greatly appreciated at
+https://github.com/openzfs/zfs
@@ -0,0 +1,3 @@
+### Dashboards for zpool_influxdb
+This directory contains a collection of dashboards related to ZFS with data
+collected from the zpool_influxdb collector.
@@ -0,0 +1,7 @@
+This directory contains sample telegraf configurations for
+adding `zpool_influxdb` as an input plugin. Depending on your
+telegraf configuration, the installation can be as simple as
+copying one of these to the `/etc/telegraf/telegraf.d` directory
+and restarting telegraf with `systemctl restart telegraf`
+
+See the telegraf docs for more information on input plugins.
@@ -0,0 +1,15 @@
+# # Read metrics from zpool_influxdb
+[[inputs.exec]]
+#   ## default installation location for zpool_influxdb command
+  commands = ["/usr/local/libexec/zfs/zpool_influxdb"]
+#   ## Timeout for each command to complete.
+#   timeout = "5s"
+#
+#   ## measurement name suffix (for separating different commands)
+#   name_suffix = "_mycollector"
+#
+#   ## Data format to consume.
+#   ## Each data format has its own unique set of configuration options, read
+#   ## more about them here:
+#   ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
@@ -0,0 +1,23 @@
+# # Read metrics from zpool_influxdb
+[[inputs.execd]]
+#   ## default installation location for zpool_influxdb command
+  command = ["/usr/local/libexec/zfs/zpool_influxdb", "--execd"]
+
+    ## Define how the process is signaled on each collection interval.
+    ## Valid values are:
+    ##   "none"    : Do not signal anything. (Recommended for service inputs)
+    ##               The process must output metrics by itself.
+    ##   "STDIN"   : Send a newline on STDIN. (Recommended for gather inputs)
+    ##   "SIGHUP"  : Send a HUP signal. Not available on Windows. (not recommended)
+    ##   "SIGUSR1" : Send a USR1 signal. Not available on Windows.
+    ##   "SIGUSR2" : Send a USR2 signal. Not available on Windows.
+  signal = "STDIN"
+
+  ## Delay before the process is restarted after an unexpected termination
+  restart_delay = "10s"
+
+    ## Data format to consume.
+    ## Each data format has its own unique set of configuration options, read
+    ## more about them here:
+    ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
@@ -0,0 +1,843 @@
+/*
+ * Gather top-level ZFS pool and resilver/scan statistics and print using
+ * influxdb line protocol
+ * usage: [options] [pool_name]
+ * where options are:
+ *   --execd, -e           run in telegraf execd input plugin mode, [CR] on
+ *                         stdin causes a sample to be printed and wait for
+ *                         the next [CR]
+ *   --no-histograms, -n   don't print histogram data (reduces cardinality
+ *                         if you don't care about histograms)
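+ *   --sum-histogram-buckets, -s sum histogram bucket values
+ *   --signed-int, -i      print values as signed integers ('i' suffix)
+ *                         instead of unsigned integers ('u' suffix)
+ *   --tags, -t <tags>     append the given tags (key=value, comma-separated)
+ *                         to every measurement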
+ *
+ * To integrate into telegraf use one of:
+ * 1. the `inputs.execd` plugin with the `--execd` option
+ * 2. the `inputs.exec` plugin to simply run with no options
+ *
+ * NOTE: libzfs is an unstable interface. YMMV.
+ *
+ * The design goals of this software include:
+ * + be as lightweight as possible
+ * + reduce the number of external dependencies as far as possible, hence
+ *   there is no dependency on a client library for managing the metric
+ *   collection -- info is printed, KISS
+ * + broken pools or kernel bugs can cause this process to hang in an
+ *   unkillable state. For this reason, it is best to keep the damage limited
+ *   to a small process like zpool_influxdb rather than a larger collector.
+ *
+ * Copyright 2018-2020 Richard Elling
+ *
+ * This software is dual-licensed MIT and CDDL.
+ *
+ * The MIT License (MIT)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * CDDL HEADER END
+ */
+#include <string.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <libzfs_impl.h>
+
+/* influxdb measurement names emitted by the printers in this file */
+#define	POOL_MEASUREMENT	"zpool_stats"
+#define	SCAN_MEASUREMENT	"zpool_scan_stats"
+#define	VDEV_MEASUREMENT	"zpool_vdev_stats"
+#define	POOL_LATENCY_MEASUREMENT	"zpool_latency"
+#define	POOL_QUEUE_MEASUREMENT	"zpool_vdev_queue"
+#define	MIN_LAT_INDEX	10  /* minimum latency index 10 = 1024ns */
+#define	POOL_IO_SIZE_MEASUREMENT	"zpool_io_size"
+#define	MIN_SIZE_INDEX	9  /* minimum size index 9 = 512 bytes */
+
+/* global options */
+/* set by --execd: print one sample per line read from stdin */
+int execd_mode = 0;
+/* set by --no-histograms: skip the latency, size and queue printers */
+int no_histograms = 0;
+/* set by --sum-histogram-buckets: print cumulative bucket values */
+int sum_histogram_buckets = 0;
+/* type suffix appended to every value by print_kv(); 'i' with --signed-int */
+char metric_data_type = 'u';
+/* mask applied to every value by print_kv(); INT64_MAX with --signed-int */
+uint64_t metric_value_mask = UINT64_MAX;
+/* sample time in nanoseconds, refreshed by print_stats() for each pool */
+uint64_t timestamp = 0;
+/* counter used by print_scan_status() when scan stats cannot be decoded */
+int complained_about_sync = 0;
+/* extra tags from --tags, stored with a leading comma; empty by default */
+char *tags = "";
+
+/* signature shared by the per-vdev printers run by print_recursive_stats() */
+typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *);
+
+/*
+ * Escape a string for use in influxdb line protocol. Pool names, vdev types
+ * and vdev paths can include characters that need to be escaped: each space,
+ * comma, equals sign and backslash is prefixed with a backslash.
+ *
+ * The result buffer is sized from the input itself, so strings longer than
+ * a dataset name (for example long vdev paths) cannot overflow it.
+ *
+ * Returns a newly allocated NUL-terminated string; the caller is responsible
+ * for freeing the result. Exits the process if memory cannot be allocated.
+ */
+static char *
+escape_string(char *s)
+{
+	char *c, *d;
+	/* worst case every byte is escaped: two bytes each, plus the NUL */
+	char *t = (char *)malloc(strlen(s) * 2 + 1);
+	if (t == NULL) {
+		fprintf(stderr, "error: cannot allocate memory\n");
+		exit(1);
+	}
+
+	for (c = s, d = t; *c != '\0'; c++, d++) {
+		switch (*c) {
+		case ' ':
+		case ',':
+		case '=':
+		case '\\':
+			*d++ = '\\';
+			/* FALLTHROUGH: the escaped character itself follows */
+		default:
+			*d = *c;
+		}
+	}
+	*d = '\0';
+	return (t);
+}
+
+/*
+ * Emit one influxdb field as key=value followed by the data type suffix.
+ * The value is masked with metric_value_mask before printing and the
+ * suffix comes from metric_data_type.
+ */
+static void
+print_kv(char *key, uint64_t value)
+{
+	u_longlong_t masked = (u_longlong_t)value & metric_value_mask;
+
+	printf("%s=%llu%c", key, masked, metric_data_type);
+}
+
+/*
+ * print_scan_status() prints the details as often seen in the "zpool status"
+ * output. However, unlike the zpool command, which is intended for humans,
+ * this output is suitable for long-term tracking in influxdb.
+ * TODO: update to include issued scan data
+ *
+ * Returns 0 when a line was printed or the pool has no scan stats, and 1
+ * when the scan state or function is outside the range this program was
+ * compiled with.
+ */
+static int
+print_scan_status(nvlist_t *nvroot, const char *pool_name)
+{
+	uint_t c;
+	int64_t elapsed;
+	uint64_t examined, pass_exam, paused_time, paused_ts, rate;
+	uint64_t remaining_time;
+	pool_scan_stat_t *ps = NULL;
+	double pct_done;
+	char *state[DSS_NUM_STATES] = {
+	    "none", "scanning", "finished", "canceled"};
+	char *func;
+
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &c);
+
+	/*
+	 * ignore if there are no stats
+	 */
+	if (ps == NULL)
+		return (0);
+
+	/*
+	 * return error if state is bogus
+	 */
+	if (ps->pss_state >= DSS_NUM_STATES ||
+	    ps->pss_func >= POOL_SCAN_FUNCS) {
+		/*
+		 * rate-limit the complaint: print it on the first failure
+		 * and then once every 1000 failures. The counter wraps
+		 * here so it can never overflow.
+		 */
+		if (complained_about_sync == 0) {
+			fprintf(stderr, "error: cannot decode scan stats: "
+			    "ZFS is out of sync with compiled "
+			    "zpool_influxdb\n");
+		}
+		complained_about_sync = (complained_about_sync + 1) % 1000;
+		return (1);
+	}
+
+	switch (ps->pss_func) {
+	case POOL_SCAN_NONE:
+		func = "none_requested";
+		break;
+	case POOL_SCAN_SCRUB:
+		func = "scrub";
+		break;
+	case POOL_SCAN_RESILVER:
+		func = "resilver";
+		break;
+#ifdef POOL_SCAN_REBUILD
+	case POOL_SCAN_REBUILD:
+		func = "rebuild";
+		break;
+#endif
+	default:
+		func = "scan";
+	}
+
+	/* overall progress; examined is floored at 1 */
+	examined = ps->pss_examined ? ps->pss_examined : 1;
+	pct_done = 0.0;
+	if (ps->pss_to_examine > 0)
+		pct_done = 100.0 * examined / ps->pss_to_examine;
+
+	/*
+	 * NOTE(review): the pause fields are only read when
+	 * EZFS_SCRUB_PAUSED is visible to the preprocessor -- confirm it is
+	 * a macro in the libzfs headers being built against.
+	 */
+#ifdef EZFS_SCRUB_PAUSED
+	paused_ts = ps->pss_pass_scrub_pause;
+	paused_time = ps->pss_pass_scrub_spent_paused;
+#else
+	paused_ts = 0;
+	paused_time = 0;
+#endif
+
+	/* calculations for this pass */
+	if (ps->pss_state == DSS_SCANNING) {
+		elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start -
+		    (int64_t)paused_time;
+		elapsed = (elapsed > 0) ? elapsed : 1;
+		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+		rate = pass_exam / elapsed;
+		rate = (rate > 0) ? rate : 1;
+		/*
+		 * time left is the amount still to examine divided by the
+		 * rate; guard the subtraction so it cannot wrap when more
+		 * has been examined than was originally estimated
+		 */
+		if (ps->pss_to_examine > examined) {
+			remaining_time =
+			    (ps->pss_to_examine - examined) / rate;
+		} else {
+			remaining_time = 0;
+		}
+	} else {
+		elapsed =
+		    (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start -
+		    (int64_t)paused_time;
+		elapsed = (elapsed > 0) ? elapsed : 1;
+		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+		rate = pass_exam / elapsed;
+		remaining_time = 0;
+	}
+	rate = rate ? rate : 1;
+
+	/* influxdb line protocol format: "tags metrics timestamp" */
+	printf("%s%s,function=%s,name=%s,state=%s ",
+	    SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]);
+	print_kv("end_ts", ps->pss_end_time);
+	print_kv(",errors", ps->pss_errors);
+	print_kv(",examined", examined);
+	print_kv(",issued", ps->pss_issued);
+	print_kv(",pass_examined", pass_exam);
+	print_kv(",pass_issued", ps->pss_pass_issued);
+	print_kv(",paused_ts", paused_ts);
+	print_kv(",paused_t", paused_time);
+	printf(",pct_done=%.2f", pct_done);
+	print_kv(",processed", ps->pss_processed);
+	print_kv(",rate", rate);
+	print_kv(",remaining_t", remaining_time);
+	print_kv(",start_ts", ps->pss_start_time);
+	print_kv(",to_examine", ps->pss_to_examine);
+	print_kv(",to_process", ps->pss_to_process);
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * Build the hierarchical vdev name used as the parent name for child vdevs.
+ *
+ * Without a parent the name is just the vdev type; with a parent it is
+ * "<parent>/<type>-<id>". A missing type is reported as "unknown" and a
+ * missing id as UINT64_MAX.
+ *
+ * The result lives in a static buffer that is overwritten by the next call,
+ * so callers that need to keep it must copy it.
+ */
+static char *
+get_vdev_name(nvlist_t *nvroot, const char *parent_name)
+{
+	static char vdev_name[256];
+	char *type = NULL;
+	uint64_t id = 0;
+
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &type) != 0)
+		type = "unknown";
+	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &id) != 0)
+		id = UINT64_MAX;
+
+	if (parent_name != NULL) {
+		(void) snprintf(vdev_name, sizeof (vdev_name), "%s/%s-%llu",
+		    parent_name, type, (u_longlong_t)id);
+		return (vdev_name);
+	}
+
+	(void) snprintf(vdev_name, sizeof (vdev_name), "%s", type);
+	return (vdev_name);
+}
+
+/*
+ * Build the influxdb tag string that identifies a vdev.
+ *
+ * The "vdev" tag holds the hierarchical name, separated by '/'. When the
+ * vdev has an associated path, which is typical of leaf vdevs, a "path"
+ * tag is placed in front of it. The devid would be nicer than the path,
+ * but under Linux a devid is not guaranteed to exist and something is
+ * better than nothing, so the path is used.
+ *
+ * Every component is escaped for line protocol. The result lives in a
+ * static buffer that is overwritten by the next call.
+ */
+static char *
+get_vdev_desc(nvlist_t *nvroot, const char *parent_name)
+{
+	static char vdev_desc[2 * MAXPATHLEN];
+	char vdev_tag[MAXPATHLEN];
+	char *type = NULL;
+	char *path = NULL;
+	char *esc_parent, *esc_type, *esc_path;
+	uint64_t id = 0;
+
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &type) != 0)
+		type = "unknown";
+	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &id) != 0)
+		id = UINT64_MAX;
+	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &path) != 0)
+		path = NULL;
+
+	/* first the vdev=... tag, with or without the parent prefix */
+	if (parent_name != NULL) {
+		esc_parent = escape_string((char *)parent_name);
+		esc_type = escape_string(type);
+		(void) snprintf(vdev_tag, sizeof (vdev_tag),
+		    "vdev=%s/%s-%llu", esc_parent, esc_type,
+		    (u_longlong_t)id);
+		free(esc_parent);
+		free(esc_type);
+	} else {
+		esc_type = escape_string(type);
+		(void) snprintf(vdev_tag, sizeof (vdev_tag), "vdev=%s",
+		    esc_type);
+		free(esc_type);
+	}
+
+	/* then prepend the path=... tag when the vdev has a path */
+	if (path != NULL) {
+		esc_path = escape_string(path);
+		(void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s",
+		    esc_path, vdev_tag);
+		free(esc_path);
+	} else {
+		(void) snprintf(vdev_desc, sizeof (vdev_desc), "%s",
+		    vdev_tag);
+	}
+	return (vdev_desc);
+}
+
+/*
+ * Print the summary line for one vdev: a combination of the data shown by
+ * `zpool status` and `zpool list -v` (space usage, I/O counters, error
+ * counters and fragmentation).
+ *
+ * Returns 0 on success, 1 if the vdev has no vdev stats array.
+ */
+static int
+print_summary_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t nstats;
+	vdev_stat_t *stats;
+	const char *state_name;
+	char *desc = get_vdev_desc(nvroot, parent_name);
+
+	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&stats, &nstats) != 0)
+		return (1);
+
+	state_name = zpool_state_to_name((vdev_state_t)stats->vs_state,
+	    (vdev_aux_t)stats->vs_aux);
+
+	/* tags */
+	printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags,
+	    pool_name, state_name, desc);
+
+	/* fields */
+	print_kv("alloc", stats->vs_alloc);
+	print_kv(",free", stats->vs_space - stats->vs_alloc);
+	print_kv(",size", stats->vs_space);
+	print_kv(",read_bytes", stats->vs_bytes[ZIO_TYPE_READ]);
+	print_kv(",read_errors", stats->vs_read_errors);
+	print_kv(",read_ops", stats->vs_ops[ZIO_TYPE_READ]);
+	print_kv(",write_bytes", stats->vs_bytes[ZIO_TYPE_WRITE]);
+	print_kv(",write_errors", stats->vs_write_errors);
+	print_kv(",write_ops", stats->vs_ops[ZIO_TYPE_WRITE]);
+	print_kv(",checksum_errors", stats->vs_checksum_errors);
+	print_kv(",fragmentation", stats->vs_fragmentation);
+
+	/* timestamp */
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * Print the latency histograms of one vdev, one output line per bucket.
+ *
+ * The histograms are stored as nvlist arrays of uint64 and cover the ZIO
+ * scheduler classes plus lower-level vdev latencies. Buckets below
+ * MIN_LAT_INDEX are not printed; their counts are folded into the first
+ * printed bucket. The last bucket is labelled le=+Inf.
+ *
+ * In many cases, the top-level "root" view obscures the underlying
+ * top-level vdev operations. For example, if a pool has a log, special,
+ * or cache device, then each can behave very differently, so it is useful
+ * to see how each is responding.
+ *
+ * Returns 0 on success, 6 if the vdev has no extended stats and 3 if one
+ * of the histograms is missing.
+ */
+static int
+print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t count, last = 0;
+	nvlist_t *nv_ex;
+	char *desc = NULL;
+
+	/* short_names become part of the metric name and are influxdb-ready */
+	struct lat_lookup {
+	    char *name;
+	    char *short_name;
+	    uint64_t sum;
+	    uint64_t *array;
+	};
+	struct lat_lookup lat_type[] = {
+	    {ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,   "total_read", 0},
+	    {ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,   "total_write", 0},
+	    {ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,  "disk_read", 0},
+	    {ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,  "disk_write", 0},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,  "sync_read", 0},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,  "sync_write", 0},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,   "scrub", 0},
+#ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
+	    {ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,    "trim", 0},
+#endif
+	    {NULL,	NULL}
+	};
+
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0)
+		return (6);
+
+	desc = get_vdev_desc(nvroot, parent_name);
+
+	/* fetch every histogram before anything is printed */
+	for (int t = 0; lat_type[t].name != NULL; t++) {
+		if (nvlist_lookup_uint64_array(nv_ex,
+		    lat_type[t].name, &lat_type[t].array, &count) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    lat_type[t].name);
+			return (3);
+		}
+		/* index of the last bucket; the arrays are the same size */
+		last = count - 1;
+	}
+
+	for (int bucket = 0; bucket <= last; bucket++) {
+		int accumulate;
+
+		if (bucket < MIN_LAT_INDEX) {
+			/* don't print, but collect the sum */
+			for (int t = 0; lat_type[t].name != NULL; t++)
+				lat_type[t].sum += lat_type[t].array[bucket];
+			continue;
+		}
+
+		/* tags: the final bucket is the catch-all +Inf bucket */
+		if (bucket == last) {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
+			    desc);
+		} else {
+			printf("%s%s,le=%0.6f,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags,
+			    (float)(1ULL << bucket) * 1e-9,
+			    pool_name, desc);
+		}
+
+		/*
+		 * the first printed bucket always carries the sum of the
+		 * skipped buckets; later buckets only keep a running sum
+		 * when sum_histogram_buckets is set
+		 */
+		accumulate =
+		    (bucket <= MIN_LAT_INDEX || sum_histogram_buckets);
+
+		/* fields, comma separated */
+		for (int t = 0; lat_type[t].name != NULL; t++) {
+			if (t > 0)
+				printf(",");
+			if (!accumulate)
+				lat_type[t].sum = 0;
+			lat_type[t].sum += lat_type[t].array[bucket];
+			print_kv(lat_type[t].short_name, lat_type[t].sum);
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * Print the request size histograms of one vdev, one output line per
+ * bucket.
+ *
+ * The histograms are stored as nvlist arrays of uint64 and cover the ZIO
+ * scheduler classes plus lower-level vdev sizes. Both independent (ind)
+ * and aggregated (agg) sizes are reported. Buckets below MIN_SIZE_INDEX
+ * are not printed; their counts are folded into the first printed bucket.
+ * The last bucket is labelled le=+Inf.
+ *
+ * In many cases, the top-level "root" view obscures the underlying
+ * top-level vdev operations. For example, if a pool has a log, special,
+ * or cache device, then each can behave very differently, so it is useful
+ * to see how each is responding.
+ *
+ * Returns 0 on success, 6 if the vdev has no extended stats and 3 if one
+ * of the histograms is missing.
+ */
+static int
+print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t count, last = 0;
+	nvlist_t *nv_ex;
+	char *desc = NULL;
+
+	/* short_names become the field name */
+	struct size_lookup {
+	    char *name;
+	    char *short_name;
+	    uint64_t sum;
+	    uint64_t *array;
+	};
+	struct size_lookup size_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,   "sync_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,   "sync_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,  "async_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,  "async_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,    "scrub_read_ind"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,   "sync_read_agg"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,   "sync_write_agg"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,  "async_read_agg"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,  "async_write_agg"},
+	    {ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,    "scrub_read_agg"},
+#ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
+	    {ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,    "trim_write_ind"},
+	    {ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,    "trim_write_agg"},
+#endif
+	    {NULL,	NULL}
+	};
+
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0)
+		return (6);
+
+	desc = get_vdev_desc(nvroot, parent_name);
+
+	/* fetch every histogram before anything is printed */
+	for (int t = 0; size_type[t].name != NULL; t++) {
+		if (nvlist_lookup_uint64_array(nv_ex, size_type[t].name,
+		    &size_type[t].array, &count) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    size_type[t].name);
+			return (3);
+		}
+		/* index of the last bucket; the arrays are the same size */
+		last = count - 1;
+	}
+
+	for (int bucket = 0; bucket <= last; bucket++) {
+		int accumulate;
+
+		if (bucket < MIN_SIZE_INDEX) {
+			/* don't print, but collect the sum */
+			for (int t = 0; size_type[t].name != NULL; t++)
+				size_type[t].sum += size_type[t].array[bucket];
+			continue;
+		}
+
+		/* tags: the final bucket is the catch-all +Inf bucket */
+		if (bucket == last) {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
+			    desc);
+		} else {
+			printf("%s%s,le=%llu,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
+			    pool_name, desc);
+		}
+
+		/*
+		 * the first printed bucket always carries the sum of the
+		 * skipped buckets; later buckets only keep a running sum
+		 * when sum_histogram_buckets is set
+		 */
+		accumulate =
+		    (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets);
+
+		/* fields, comma separated */
+		for (int t = 0; size_type[t].name != NULL; t++) {
+			if (t > 0)
+				printf(",");
+			if (!accumulate)
+				size_type[t].sum = 0;
+			size_type[t].sum += size_type[t].array[bucket];
+			print_kv(size_type[t].short_name, size_type[t].sum);
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * Print the ZIO scheduler queue depths of one vdev.
+ *
+ * Queue stats are stored as gauges. This is unfortunate because the values
+ * can change very rapidly and any point-in-time value will quickly be
+ * obsoleted. It is also not easy to downsample. Thus only the top-level
+ * queue stats might be beneficial... maybe.
+ *
+ * Returns 0 on success, 6 if the vdev has no extended stats and 3 if one
+ * of the queue values is missing. In the last case the line that was
+ * started is left unterminated.
+ */
+static int
+print_queue_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t depth;
+
+	/* short_names are used for the field name */
+	struct queue_lookup {
+	    char *name;
+	    char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,	"sync_r_active"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,	"sync_w_active"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,	"async_r_active"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,	"async_w_active"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,	"async_scrub_active"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,	"sync_r_pend"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,	"sync_w_pend"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,	"async_r_pend"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,	"async_w_pend"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,	"async_scrub_pend"},
+	    {NULL,	NULL}
+	};
+	struct queue_lookup *q;
+
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0)
+		return (6);
+
+	/* tags */
+	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
+	    get_vdev_desc(nvroot, parent_name));
+
+	/* fields: a comma follows every field except the last one */
+	for (q = queue_type; q->name != NULL; q++) {
+		if (nvlist_lookup_uint64(nv_ex, q->name, &depth) != 0) {
+			fprintf(stderr, "error: can't get %s\n", q->name);
+			return (3);
+		}
+		print_kv(q->short_name, depth);
+		if ((q + 1)->name != NULL)
+			printf(",");
+	}
+
+	/* timestamp */
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * Print the pool-level (vdev=root) queue stats taken from the extended
+ * stats of the root of the vdev tree.
+ *
+ * Returns 0 on success, 6 if there are no extended stats and 3 if one of
+ * the queue values is missing. In the last case the line that was started
+ * is left unterminated.
+ */
+static int
+print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t depth;
+
+	/* short_names become part of the metric name */
+	struct queue_lookup {
+	    char *name;
+	    char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
+	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
+	    {NULL, NULL}
+	};
+	struct queue_lookup *q;
+
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0)
+		return (6);
+
+	/* tags */
+	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
+	    pool_name);
+
+	/* fields: a comma precedes every field except the first one */
+	for (q = queue_type; q->name != NULL; q++) {
+		if (nvlist_lookup_uint64(nv_ex, q->name, &depth) != 0) {
+			fprintf(stderr, "error: can't get %s\n", q->name);
+			return (3);
+		}
+		if (q != queue_type)
+			printf(",");
+		print_kv(q->short_name, depth);
+	}
+
+	/* timestamp */
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * Run a stats printer on a vdev and, when descend is non-zero, on all of
+ * its descendants.
+ *
+ * Returns the printer's error for this vdev, otherwise 0. Errors returned
+ * for child vdevs are not propagated.
+ */
+static int
+print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
+    const char *pool_name, const char *parent_name, int descend)
+{
+	uint_t nchildren;
+	nvlist_t **children;
+	char vdev_name[256];
+	int rc;
+
+	rc = func(nvroot, pool_name, parent_name);
+	if (rc != 0)
+		return (rc);
+
+	if (!descend)
+		return (0);
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &children, &nchildren) != 0)
+		return (0);
+
+	/*
+	 * get_vdev_name() hands back a static buffer that the recursive
+	 * calls below overwrite, so keep a private copy for the children
+	 */
+	(void) snprintf(vdev_name, sizeof (vdev_name), "%s",
+	    get_vdev_name(nvroot, parent_name));
+
+	for (uint_t i = 0; i < nchildren; i++) {
+		(void) print_recursive_stats(func, children[i], pool_name,
+		    vdev_name, descend);
+	}
+	return (0);
+}
+
+/*
+ * zpool_iter() call-back that prints every measurement for one pool.
+ *
+ * data, when not NULL, is the name of the only pool to report; other pools
+ * are closed and skipped. The pool handle is always closed before
+ * returning.
+ *
+ * Returns 0 on success or when the pool is skipped, 1 if the stats cannot
+ * be refreshed, 2 if the config has no vdev tree, 3 if the root vdev has
+ * no vdev stats, otherwise the first error returned by a printer (the
+ * remaining printers are then skipped).
+ *
+ * Note: if the pool is broken, this can hang indefinitely and perhaps in an
+ * unkillable state.
+ */
+static int
+print_stats(zpool_handle_t *zhp, void *data)
+{
+	uint_t c;
+	int err = 0;
+	boolean_t missing;
+	nvlist_t *config, *nvroot;
+	vdev_stat_t *vs;
+	struct timespec tv;
+	char *pool_name = NULL;
+
+	/* if not this pool return quickly */
+	if (data != NULL &&
+	    strncmp(data, zhp->zpool_name, ZFS_MAX_DATASET_NAME_LEN) != 0)
+		goto out;
+
+	if (zpool_refresh_stats(zhp, &missing) != 0) {
+		err = 1;
+		goto out;
+	}
+
+	config = zpool_get_config(zhp, NULL);
+
+	/* one nanosecond timestamp is shared by all lines of this pool */
+	if (clock_gettime(CLOCK_REALTIME, &tv) == 0) {
+		timestamp =
+		    ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec;
+	} else {
+		timestamp = (uint64_t)time(NULL) * 1000000000;
+	}
+
+	if (nvlist_lookup_nvlist(
+	    config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
+		err = 2;
+		goto out;
+	}
+	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) != 0) {
+		err = 3;
+		goto out;
+	}
+
+	pool_name = escape_string(zhp->zpool_name);
+
+	/* if any of these return an error, skip the rest */
+	err = print_recursive_stats(print_summary_stats, nvroot,
+	    pool_name, NULL, 1);
+	if (err == 0)
+		err = print_top_level_vdev_stats(nvroot, pool_name);
+
+	if (no_histograms == 0) {
+		if (err == 0) {
+			err = print_recursive_stats(print_vdev_latency_stats,
+			    nvroot, pool_name, NULL, 1);
+		}
+		if (err == 0) {
+			err = print_recursive_stats(print_vdev_size_stats,
+			    nvroot, pool_name, NULL, 1);
+		}
+		if (err == 0) {
+			err = print_recursive_stats(print_queue_stats,
+			    nvroot, pool_name, NULL, 0);
+		}
+	}
+	if (err == 0)
+		err = print_scan_status(nvroot, pool_name);
+
+out:
+	/* pool_name is still NULL on the early exits; free(NULL) is a no-op */
+	free(pool_name);
+	zpool_close(zhp);
+	return (err);
+}
+
+/*
+ * Print the command line synopsis to stderr and exit with EXIT_FAILURE.
+ * name is the program name to show, normally argv[0].
+ */
+static void
+usage(char *name)
+{
+	fprintf(stderr, "usage: %s [--execd] [--no-histograms] "
+	    "[--sum-histogram-buckets] [--signed-int] [--tags key=value] "
+	    "[poolname]\n", name);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Parse the options, initialize libzfs and print the stats of the pool
+ * named on the command line, or of all pools when no name is given.
+ *
+ * Without --execd the stats are printed once. With --execd one sample is
+ * printed and flushed for every line read from stdin, until end of input.
+ *
+ * Returns the result of the last zpool_iter() call, or 8 when --execd mode
+ * ends without having read any line.
+ */
+int
+main(int argc, char *argv[])
+{
+	int opt;
+	int ret = 8;
+	char *line = NULL;
+	/* getline() is handed &len, so give it a defined starting value */
+	size_t len = 0, tagslen = 0;
+	struct option long_options[] = {
+	    {"execd", no_argument, NULL, 'e'},
+	    {"help", no_argument, NULL, 'h'},
+	    {"no-histograms", no_argument, NULL, 'n'},
+	    {"signed-int", no_argument, NULL, 'i'},
+	    {"sum-histogram-buckets", no_argument, NULL, 's'},
+	    {"tags", required_argument, NULL, 't'},
+	    {0, 0, 0, 0}
+	};
+	while ((opt = getopt_long(
+	    argc, argv, "ehinst:", long_options, NULL)) != -1) {
+		switch (opt) {
+		case 'e':
+			execd_mode = 1;
+			break;
+		case 'i':
+			/* signed output: 'i' suffix, values kept positive */
+			metric_data_type = 'i';
+			metric_value_mask = INT64_MAX;
+			break;
+		case 'n':
+			no_histograms = 1;
+			break;
+		case 's':
+			sum_histogram_buckets = 1;
+			break;
+		case 't':
+			/* room for the leading comma and the NUL */
+			tagslen = strlen(optarg) + 2;
+			tags = calloc(tagslen, 1);
+			if (tags == NULL) {
+				fprintf(stderr,
+				    "error: cannot allocate memory "
+				    "for tags\n");
+				exit(1);
+			}
+			(void) snprintf(tags, tagslen, ",%s", optarg);
+			break;
+		default:
+			/* includes --help; usage() does not return */
+			usage(argv[0]);
+		}
+	}
+
+	libzfs_handle_t *g_zfs;
+	if ((g_zfs = libzfs_init()) == NULL) {
+		fprintf(stderr,
+		    "error: cannot initialize libzfs. "
+		    "Is the zfs module loaded or zrepl running?\n");
+		exit(EXIT_FAILURE);
+	}
+	if (execd_mode == 0) {
+		/* argv[optind] is NULL when no pool name was given */
+		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
+		return (ret);
+	}
+	while (getline(&line, &len, stdin) != -1) {
+		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
+		fflush(stdout);
+	}
+	/* getline() may have allocated the buffer even if it then failed */
+	free(line);
+	return (ret);
+}
@@ -15,3 +15,6 @@ zstream_LDADD = \
 	$(abs_top_builddir)/lib/libnvpair/libnvpair.la

 include $(top_srcdir)/config/CppCheck.am
+
+install-exec-hook:
+	cd $(DESTDIR)$(sbindir) && $(LN_S) -f zstream zstreamdump
@@ -49,6 +49,11 @@ zstream_usage(void)
 int
 main(int argc, char *argv[])
 {
+	char *basename = strrchr(argv[0], '/');
+	basename = basename ? (basename + 1) : argv[0];
+	if (argc >= 1 && strcmp(basename, "zstreamdump") == 0)
+		return (zstream_do_dump(argc, argv));
+
 	if (argc < 2)
 		zstream_usage();

--- a/Show More
+++ b/Show More