Tag zfs-2.3.4

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
CI: Add Debian 13 to the FULL_OS runner list
2026-06-10 07:56:39 +03:00 · 2025-08-20 09:29:36 -07:00 · 2025-08-20 09:29:36 -07:00 · 2025-08-20 09:20:45 -07:00 · 2025-08-20 09:12:26 -07:00 · 2025-08-19 10:30:04 -07:00
136 changed files with 2997 additions and 866 deletions
@@ -1,21 +0,0 @@
-env:
-  CIRRUS_CLONE_DEPTH: 1
-  ARCH: amd64
-
-build_task:
-  matrix:
-    freebsd_instance:
-      image_family: freebsd-13-5
-    freebsd_instance:
-      image_family: freebsd-14-2
-    freebsd_instance:
-      image_family: freebsd-15-0-snap
-  prepare_script:
-    - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py311-packaging py311-cffi py311-sysctl
-  configure_script:
-    - env MAKE=gmake ./autogen.sh
-    - env MAKE=gmake ./configure --with-config="user" --with-python=3.11
-  build_script:
-    - gmake -j `sysctl -n kern.smp.cpus`
-  install_script:
-    - gmake install
@@ -2,3 +2,4 @@ name: "Custom CodeQL Analysis"

 queries:
  - uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql
+  - uses: ./.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql
@@ -0,0 +1,34 @@
+/**
+ * @name Detect mismatched dsl_dataset_hold/_rele pairs
+ * @description Flags instances of issue #12014 where
+ *   - a dataset held with dsl_dataset_hold_obj() ends up in dsl_dataset_rele_flags(), or
+ *   - a dataset held with dsl_dataset_hold_obj_flags() ends up in dsl_dataset_rele().
+ * @kind problem
+ * @severity error
+ * @tags correctness
+ * @id cpp/dslDatasetHoldReleMismatch
+ */
+
+import cpp
+
+from Variable ds, Call holdCall, Call releCall, string message
+where
+    ds.getType().toString() = "dsl_dataset_t *" and
+    holdCall.getASuccessor*() = releCall and
+    (
+        (holdCall.getTarget().getName() = "dsl_dataset_hold_obj_flags" and
+         holdCall.getArgument(4).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
+         releCall.getTarget().getName() = "dsl_dataset_rele" and
+         releCall.getArgument(0).(VariableAccess).getTarget() = ds and
+         message = "Held with dsl_dataset_hold_obj_flags but released with dsl_dataset_rele")
+        or
+        (holdCall.getTarget().getName() = "dsl_dataset_hold_obj" and
+         holdCall.getArgument(3).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
+         releCall.getTarget().getName() = "dsl_dataset_rele_flags" and
+         releCall.getArgument(0).(VariableAccess).getTarget() = ds and
+         message = "Held with dsl_dataset_hold_obj but released with dsl_dataset_rele_flags")
+    )
+select releCall,
+       "Mismatched release: held with $@ but released with " + releCall.getTarget().getName() + " for dataset $@",
+       holdCall, holdCall.getTarget().getName(),
+       ds, ds.toString()
@@ -12,10 +12,10 @@ OS="$1"
 # OS variant (virt-install --os-variant list)
 OSv=$OS

-# compressed with .zst extension
-REPO="https://github.com/mcmilk/openzfs-freebsd-images"
-FREEBSD="$REPO/releases/download/v2025-04-13"
-URLzs=""
+# FreeBSD URLs
+FREEBSD_REL="https://download.freebsd.org/releases/CI-IMAGES"
+FREEBSD_SNAP="https://download.freebsd.org/snapshots/CI-IMAGES"
+URLxz=""

 # Ubuntu mirrors
 UBMIRROR="https://cloud-images.ubuntu.com"
@@ -25,6 +25,10 @@ UBMIRROR="https://cloud-images.ubuntu.com"
 # default nic model for vm's
 NIC="virtio"

+# additional options for virt-install
+OPTS[0]=""
+OPTS[1]=""
+
 case "$OS" in
  almalinux8)
    OSNAME="AlmaLinux 8"
@@ -61,6 +65,14 @@ case "$OS" in
    OSNAME="Debian 12"
    URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2"
    ;;
+  debian13)
+    OSNAME="Debian 13"
+    # TODO: Use the debian12 OS variant for virt-install until debian13 is added to osinfo
+    OSv="debian12"
+    URL="https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-amd64.qcow2"
+    OPTS[0]="--boot"
+    OPTS[1]="uefi=on"
+    ;;
  fedora41)
    OSNAME="Fedora 41"
    OSv="fedora-unknown"
@@ -71,50 +83,49 @@ case "$OS" in
    OSv="fedora-unknown"
    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
    ;;
-  freebsd13-4r)
-    OSNAME="FreeBSD 13.4-RELEASE"
-    OSv="freebsd13.0"
-    URLzs="$FREEBSD/amd64-freebsd-13.4-RELEASE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
-    NIC="rtl8139"
-    ;;
  freebsd13-5r)
-    OSNAME="FreeBSD 13.5-RELEASE"
+    FreeBSD="13.5-RELEASE"
+    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd13.0"
-    URLzs="$FREEBSD/amd64-freebsd-13.5-RELEASE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
+    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
+    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
    NIC="rtl8139"
    ;;
-  freebsd14-1r)
-    OSNAME="FreeBSD 14.1-RELEASE"
-    OSv="freebsd14.0"
-    URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
-    ;;
  freebsd14-2r)
-    OSNAME="FreeBSD 14.2-RELEASE"
+    FreeBSD="14.2-RELEASE"
+    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd14.0"
-    URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
+    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
+    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
+    ;;
+  freebsd14-3r)
+    FreeBSD="14.3-RELEASE"
+    OSNAME="FreeBSD $FreeBSD"
+    OSv="freebsd14.0"
+    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
+    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
    ;;
  freebsd13-5s)
-    OSNAME="FreeBSD 13.5-STABLE"
+    FreeBSD="13.5-STABLE"
+    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd13.0"
-    URLzs="$FREEBSD/amd64-freebsd-13.5-STABLE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
+    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
+    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
    NIC="rtl8139"
    ;;
-  freebsd14-2s)
-    OSNAME="FreeBSD 14.2-STABLE"
+  freebsd14-3s)
+    FreeBSD="14.3-STABLE"
+    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd14.0"
-    URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst"
-    BASH="/usr/local/bin/bash"
+    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
+    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
    ;;
  freebsd15-0c)
-    OSNAME="FreeBSD 15.0-CURRENT"
+    FreeBSD="15.0-PRERELEASE"
+    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd14.0"
-    URLzs="$FREEBSD/amd64-freebsd-15.0-CURRENT.qcow2.zst"
-    BASH="/usr/local/bin/bash"
+    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
+    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
    ;;
  tumbleweed)
    OSNAME="openSUSE Tumbleweed"
@@ -168,31 +179,37 @@ echo "CPU=\"$CPU\"" >> $ENV
 sudo mkdir -p "/mnt/tests"
 sudo chown -R $(whoami) /mnt/tests

+DISK="/dev/zvol/zpool/openzfs"
+sudo zfs create -ps -b 64k -V 80g zpool/openzfs
+while true; do test -b $DISK && break; sleep 1; done
+
 # we are downloading via axel, curl and wget are mostly slower and
 # require more return value checking
-IMG="/mnt/tests/cloudimg.qcow2"
-if [ ! -z "$URLzs" ]; then
-  echo "Loading image $URLzs ..."
-  time axel -q -o "$IMG.zst" "$URLzs"
-  zstd -q -d --rm "$IMG.zst"
+IMG="/mnt/tests/cloud-image"
+if [ ! -z "$URLxz" ]; then
+  echo "Loading $URLxz ..."
+  time axel -q -o "$IMG" "$URLxz"
+  echo "Loading $KSRC ..."
+  time axel -q -o ~/src.txz $KSRC
 else
-  echo "Loading image $URL ..."
+  echo "Loading $URL ..."
  time axel -q -o "$IMG" "$URL"
 fi

-DISK="/dev/zvol/zpool/openzfs"
-FORMAT="raw"
-sudo zfs create -ps -b 64k -V 80g zpool/openzfs
-while true; do test -b $DISK && break; sleep 1; done
 echo "Importing VM image to zvol..."
-sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M
+if [ ! -z "$URLxz" ]; then
+  xzcat -T0 $IMG | sudo dd of=$DISK bs=4M
+else
+  sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M
+fi
 rm -f $IMG

 PUBKEY=$(cat ~/.ssh/id_ed25519.pub)
-cat <<EOF > /tmp/user-data
+if [ ${OS:0:7} != "freebsd" ]; then
+  cat <<EOF > /tmp/user-data
 #cloud-config

-fqdn: $OS
+hostname: $OS

 users:
 - name: root
@@ -208,6 +225,19 @@ growpart:
  devices: ['/']
  ignore_growroot_disabled: false
 EOF
+else
+  cat <<EOF > /tmp/user-data
+#cloud-config
+
+hostname: $OS
+
+# minimized config without sudo for nuageinit of FreeBSD
+growpart:
+  mode: auto
+  devices: ['/']
+  ignore_growroot_disabled: false
+EOF
+fi

 sudo virsh net-update default add ip-dhcp-host \
  "<host mac='52:54:00:83:79:00' ip='192.168.122.10'/>" --live --config
@@ -223,15 +253,8 @@ sudo virt-install \
  --graphics none \
  --network bridge=virbr0,model=$NIC,mac='52:54:00:83:79:00' \
  --cloud-init user-data=/tmp/user-data \
-  --disk $DISK,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
-  --import --noautoconsole >/dev/null
-
-# enable KSM on Linux
-if [ ${OS:0:7} != "freebsd" ]; then
-  sudo virsh dommemstat --domain "openzfs" --period 5
-  sudo virsh node-memory-tune 100 50 1
-  echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null
-fi
+  --disk $DISK,bus=virtio,cache=none,format=raw,driver.discard=unmap \
+  --import --noautoconsole ${OPTS[0]} ${OPTS[1]} >/dev/null

 # Give the VMs hostnames so we don't have to refer to them with
 # hardcoded IP addresses.
@@ -252,3 +275,29 @@ StrictHostKeyChecking no
 # small timeout, used in while loops later
 ConnectTimeout 1
 EOF
+
+if [ ${OS:0:7} != "freebsd" ]; then
+  # enable KSM on Linux
+  sudo virsh dommemstat --domain "openzfs" --period 5
+  sudo virsh node-memory-tune 100 50 1
+  echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null
+else
+  # on FreeBSD we need some more init stuff, because of nuageinit
+  BASH="/usr/local/bin/bash"
+  while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do
+    ssh 2>/dev/null root@vm0 "uname -a" && break
+  done
+  ssh root@vm0 "pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init"
+  ssh root@vm0 "chsh -s $BASH root"
+  ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"'
+  ssh root@vm0 'sysrc cloudinit_enable="YES"'
+  ssh root@vm0 "pw add user zfs -w no -s $BASH"
+  ssh root@vm0 'mkdir -p ~zfs/.ssh'
+  ssh root@vm0 'echo "zfs ALL=(ALL:ALL) NOPASSWD: ALL" >> /usr/local/etc/sudoers'
+  ssh root@vm0 'echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config'
+  scp ~/.ssh/id_ed25519.pub "root@vm0:~zfs/.ssh/authorized_keys"
+  ssh root@vm0 'chown -R zfs ~zfs'
+  ssh root@vm0 'service sshd restart'
+  scp ~/src.txz "root@vm0:/tmp/src.txz"
+  ssh root@vm0 'tar -C / -zxf /tmp/src.txz'
+fi
@@ -28,6 +28,7 @@ function debian() {
  export DEBIAN_FRONTEND="noninteractive"

  echo "##[group]Running apt-get update+upgrade"
+  sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list
  sudo apt-get update -y
  sudo apt-get upgrade -y
  echo "##[endgroup]"
@@ -40,7 +41,7 @@ function debian() {
    libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \
    libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \
    lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
-    python3-cffi python3-dev python3-distlib python3-packaging \
+    python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \
    python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
    rsync samba sysstat uuid-dev watchdog wget xfslibs-dev  xxhash zlib1g-dev
  echo "##[endgroup]"
@@ -51,7 +52,7 @@ function freebsd() {

  echo "##[group]Install Development Tools"
  sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \
-    gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \
+    gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \
    pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash
  sudo pkg install -xy \
    '^samba4[[:digit:]]+$' \
@@ -5,12 +5,13 @@
 #
 # Usage:
 #
-#       qemu-4-build-vm.sh OS [--enable-debug][--dkms][--poweroff]
-#           [--release][--repo][--tarball]
+#       qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM]
+#               [--poweroff][--release][--repo][--tarball]
 #
 # OS:           OS name like 'fedora41'
 # --enable-debug:  Build RPMs with '--enable-debug' (for testing)
 # --dkms:       Build DKMS RPMs as well
+# --patch-level NUM:    Use a custom patch level number for packages.
 # --poweroff:   Power-off the VM after building
 # --release     Build zfs-release*.rpm as well
 # --repo        After building everything, copy RPMs into /tmp/repo
@@ -21,6 +22,7 @@

 ENABLE_DEBUG=""
 DKMS=""
+PATCH_LEVEL=""
 POWEROFF=""
 RELEASE=""
 REPO=""
@@ -35,6 +37,11 @@ while [[ $# -gt 0 ]]; do
      DKMS=1
      shift
      ;;
+    --patch-level)
+      PATCH_LEVEL=$2
+      shift
+      shift
+      ;;
    --poweroff)
      POWEROFF=1
      shift
@@ -215,6 +222,10 @@ function rpm_build_and_install() {
  run ./autogen.sh
  echo "##[endgroup]"

+  if [ -n "$PATCH_LEVEL" ] ; then
+    sed -i -E 's/(Release:\s+)1/\1'$PATCH_LEVEL'/g' META
+  fi
+
  echo "##[group]Configure"
  run ./configure --enable-debuginfo $extra
  echo "##[endgroup]"
@@ -328,7 +339,13 @@ fi
 # almalinux9.5
 # fedora42
 source /etc/os-release
-sudo hostname "$ID$VERSION_ID"
+ if which hostnamectl &> /dev/null ; then
+  # Fedora 42+ use hostnamectl
+  sudo hostnamectl set-hostname "$ID$VERSION_ID"
+  sudo hostnamectl set-hostname --pretty "$ID$VERSION_ID"
+else
+  sudo hostname "$ID$VERSION_ID"
+fi

 # save some sysinfo
 uname -a > /var/tmp/uname.txt
@@ -12,16 +12,26 @@ source /var/tmp/env.txt
 # wait for poweroff to succeed
 PID=$(pidof /usr/bin/qemu-system-x86_64)
 tail --pid=$PID -f /dev/null
-sudo virsh undefine openzfs
+sudo virsh undefine --nvram openzfs

 # cpu pinning
 CPUSET=("0,1" "2,3")

+# additional options for virt-install
+OPTS[0]=""
+OPTS[1]=""
+
 case "$OS" in
  freebsd*)
    # FreeBSD needs only 6GiB
    RAM=6
    ;;
+  debian13)
+    RAM=8
+    # Boot Debian 13 with uefi=on and secureboot=off (ZFS Kernel Module not signed)
+    OPTS[0]="--boot"
+    OPTS[1]="firmware=efi,firmware.feature0.name=secure-boot,firmware.feature0.enabled=no"
+    ;;
  *)
    # Linux needs more memory, but can be optimized to share it via KSM
    RAM=8
@@ -79,7 +89,7 @@ EOF
    --network bridge=virbr0,model=$NIC,mac="52:54:00:83:79:0$i" \
    --disk $DISK-system,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
    --disk $DISK-tests,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
-    --import --noautoconsole >/dev/null
+    --import --noautoconsole ${OPTS[0]} ${OPTS[1]}
 done

 # generate some memory stats
@@ -21,11 +21,13 @@ function prefix() {
  S=$((DIFF-(M*60)))

  CTR=$(cat /tmp/ctr)
-  echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr
+  echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr

  BASE="$HOME/work/zfs/zfs"
  COLOR="$BASE/scripts/zfs-tests-color.sh"
-  CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \
+  CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \
+    | sed -e 's|^\[.*] Test|Test|g' \
+    | sed -e 's|/usr/local|/usr|g' \
    | sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR)
  if [ -z "$CLINE" ]; then
    printf "vm${ID}: %s\n" "$LINE"
@@ -32,6 +32,11 @@ on:
        options:
        - "Build RPMs"
        - "Test repo"
+      patch_level:
+        type: string
+        required: false
+        default: ""
+        description: "(optional) patch level number"
      repo_url:
        type: string
        required: false
@@ -78,7 +83,13 @@ jobs:
                mkdir -p /tmp/repo
                ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }}
        else
-                .github/workflows/scripts/qemu-4-build.sh --repo --release --dkms --tarball ${{ matrix.os }}
+                EXTRA=""
+                if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
+                        EXTRA="--patch-level ${{ github.event.inputs.patch_level }}"
+                fi
+
+                .github/workflows/scripts/qemu-4-build.sh $EXTRA \
+                        --repo --release --dkms --tarball ${{ matrix.os }}
        fi

    - name: Prepare artifacts
@@ -5,16 +5,6 @@ on:
  pull_request:
  workflow_dispatch:
    inputs:
-      include_stream9:
-        type: boolean
-        required: false
-        default: false
-        description: 'Test on CentOS 9 stream'
-      include_stream10:
-        type: boolean
-        required: false
-        default: false
-        description: 'Test on CentOS 10 stream'
      fedora_kernel_ver:
        type: string
        required: false
@@ -39,8 +29,8 @@ jobs:
      - name: Generate OS config and CI type
        id: os
        run: |
-          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-2s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
-          QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-2r", "ubuntu24"]'
+          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
+          QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
          # determine CI type when running on PR
          ci_type="full"
          if ${{ github.event_name == 'pull_request' }}; then
@@ -63,14 +53,6 @@ jobs:
              os_json=$(echo ${os_selection} | jq -c)
          fi

-          # Add optional runners
-          if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then
-            os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]')
-          fi
-          if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then
-            os_json=$(echo $os_json | jq -c '. += ["centos-stream10"]')
-          fi
-
          echo $os_json
          echo "os=$os_json" >> $GITHUB_OUTPUT
          echo "ci_type=$ci_type" >> $GITHUB_OUTPUT
@@ -81,12 +63,12 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # rhl:     almalinux8, almalinux9, centos-stream9, fedora41
-        # debian:  debian11, debian12, ubuntu22, ubuntu24
+        # rhl:     almalinux8, almalinux9, centos-stream9, fedora4x
+        # debian:  debian12, debian13, ubuntu22, ubuntu24
        # misc:    archlinux, tumbleweed
-        # FreeBSD variants of 2024-12:
-        # FreeBSD Release: freebsd13-4r, freebsd14-2r
-        # FreeBSD Stable:  freebsd13-4s, freebsd14-2s
+        # FreeBSD variants of 2025-06:
+        # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
+        # FreeBSD Stable:  freebsd13-5s, freebsd14-3s
        # FreeBSD Current: freebsd15-0c
        os: ${{ fromJson(needs.test-config.outputs.test_os) }}
    runs-on: ubuntu-24.04
@@ -12,7 +12,8 @@ jobs:
  zloop:
    runs-on: ubuntu-24.04
    env:
-      TEST_DIR: /var/tmp/zloop
+      WORK_DIR: /mnt/zloop
+      CORE_DIR: /mnt/zloop/cores
    steps:
    - uses: actions/checkout@v4
      with:
@@ -40,38 +41,37 @@ jobs:
        sudo modprobe zfs
    - name: Tests
      run: |
-        sudo mkdir -p $TEST_DIR
-        # run for 10 minutes or at most 6 iterations for a maximum runner
-        # time of 60 minutes.
-        sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60
+        sudo truncate -s 256G /mnt/vdev
+        sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev
+        sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60
    - name: Prepare artifacts
      if: failure()
      run: |
-        sudo chmod +r -R $TEST_DIR/
+        sudo chmod +r -R $WORK_DIR/
    - name: Ztest log
      if: failure()
      run: |
-        grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out
+        grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out
    - name: Gdb log
      if: failure()
      run: |
-        sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb
+        sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb
    - name: Zdb log
      if: failure()
      run: |
-        cat $TEST_DIR/*/ztest.zdb
+        cat $CORE_DIR/*/ztest.zdb
    - uses: actions/upload-artifact@v4
      if: failure()
      with:
        name: Logs
        path: |
-          /var/tmp/zloop/*/
-          !/var/tmp/zloop/*/vdev/
+          /mnt/zloop/*/
+          !/mnt/zloop/cores/*/vdev/
        if-no-files-found: ignore
    - uses: actions/upload-artifact@v4
      if: failure()
      with:
        name: Pool files
        path: |
-          /var/tmp/zloop/*/vdev/
+          /mnt/zloop/cores/*/vdev/
        if-no-files-found: ignore
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.3.3
+Version:       2.3.4
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.15
+Linux-Maximum: 6.16
 Linux-Minimum: 4.18
@@ -559,6 +559,7 @@ def section_arc(kstats_dict):
    print()

    compressed_size = arc_stats['compressed_size']
+    uncompressed_size = arc_stats['uncompressed_size']
    overhead_size = arc_stats['overhead_size']
    bonus_size = arc_stats['bonus_size']
    dnode_size = arc_stats['dnode_size']
@@ -671,6 +672,8 @@ def section_arc(kstats_dict):
    print()

    print('ARC misc:')
+    prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size),
+           f_bytes(uncompressed_size))
    prt_i1('Memory throttles:', arc_stats['memory_throttle_count'])
    prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count'])
    prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count'])
@@ -619,8 +619,9 @@ livelist_metaslab_validate(spa_t *spa)
 			    metaslab_calculate_range_tree_type(vd, m,
 			    &start, &shift);
 			metaslab_verify_t mv;
-			mv.mv_allocated = zfs_range_tree_create(NULL,
-			    type, NULL, start, shift);
+			mv.mv_allocated = zfs_range_tree_create_flags(
+			    NULL, type, NULL, start, shift,
+			    0, "livelist_metaslab_validate:mv_allocated");
 			mv.mv_vdid = vd->vdev_id;
 			mv.mv_msid = m->ms_id;
 			mv.mv_start = m->ms_start;
@@ -2545,12 +2546,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,

 	blkbuf[0] = '\0';

-	for (i = 0; i < ndvas; i++)
+	for (i = 0; i < ndvas; i++) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
-		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+		    buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
-		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]),
+		    (DVA_GET_GANG(&dva[i]) ? "G" : ""));
+	}

 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
@@ -6320,8 +6323,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)

 	ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));

-	zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *allocs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    0, "zdb_claim_removing:allocs");
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];

@@ -7704,7 +7708,8 @@ zdb_set_skip_mmp(char *target)
 * applies to the new_path parameter if allocated.
 */
 static char *
-import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa,
+    char **new_path)
 {
 	int error = 0;
 	char *poolname, *bogus_name = NULL;
@@ -7712,11 +7717,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)

 	/* If the target is not a pool, the extract the pool name */
 	char *path_start = strchr(target, '/');
-	if (path_start != NULL) {
+	if (target_is_spa || path_start == NULL) {
+		poolname = target;
+	} else {
 		size_t poolname_len = path_start - target;
 		poolname = strndup(target, poolname_len);
-	} else {
-		poolname = target;
 	}

 	if (cfg == NULL) {
@@ -7747,10 +7752,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 		    "with error %d\n", bogus_name, error);
 	}

-	if (new_path != NULL && path_start != NULL) {
-		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+	if (new_path != NULL && !target_is_spa) {
+		if (asprintf(new_path, "%s%s", bogus_name,
+		    path_start != NULL ? path_start : "") == -1) {
 			free(bogus_name);
-			if (path_start != NULL)
+			if (!target_is_spa && path_start != NULL)
 				free(poolname);
 			return (NULL);
 		}
@@ -7979,7 +7985,7 @@ verify_checkpoint_blocks(spa_t *spa)
 	 * name) so we can do verification on it against the current state
 	 * of the pool.
 	 */
-	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
+	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE,
 	    NULL);
 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);

@@ -8449,8 +8455,9 @@ dump_zpool(spa_t *spa)

 	if (dump_opt['d'] || dump_opt['i']) {
 		spa_feature_t f;
-		mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		mos_refd_objs = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    0, "dump_zpool:mos_refd_objs");
 		dump_objset(dp->dp_meta_objset);

 		if (dump_opt['d'] >= 3) {
@@ -8981,7 +8988,7 @@ zdb_read_block(char *thing, spa_t *spa)

 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
-	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_GANG(&dva[0], 0);
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));

 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
@@ -8996,7 +9003,7 @@ zdb_read_block(char *thing, spa_t *spa)
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-	zio = zio_root(spa, NULL, NULL, 0);
+	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

 	if (vd == vd->vdev_top) {
 		/*
@@ -9118,7 +9125,7 @@ zdb_read_block(char *thing, spa_t *spa)
 				ck_zio->io_offset =
 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
 				ck_zio->io_bp = bp;
-				zio_checksum_compute(ck_zio, ck, pabd, lsize);
+				zio_checksum_compute(ck_zio, ck, pabd, psize);
 				printf(
 				    "%12s\t"
 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
@@ -9695,7 +9702,7 @@ main(int argc, char **argv)
 	char *checkpoint_target = NULL;
 	if (dump_opt['k']) {
 		checkpoint_pool = import_checkpointed_state(target, cfg,
-		    &checkpoint_target);
+		    target_is_spa, &checkpoint_target);

 		if (checkpoint_target != NULL)
 			target = checkpoint_target;
@@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
 	 * of blkid cache and L2ARC VDEV does not contain pool guid in its
 	 * blkid, so this is a special case for L2ARC VDEV.
 	 */
-	else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
+	else if (gsp->gs_vdev_guid != 0 &&
 	    nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
 	    gsp->gs_vdev_guid == vdev_guid) {
-		(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
-		    &gsp->gs_devid);
+		if (gsp->gs_devid == NULL) {
+			(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
+			    &gsp->gs_devid);
+		}
 		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
 		    &gsp->gs_vdev_expandtime);
 		return (B_TRUE);
@@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
 	/*
 	 * For each vdev in this pool, look for a match by devid
 	 */
-	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
-		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-		    &nvl) == 0) {
-			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
-		}
-	}
-	/*
-	 * if a match was found then grab the pool guid
-	 */
-	if (gsp->gs_vdev_guid && gsp->gs_devid) {
-		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
-		    &gsp->gs_pool_guid);
-	}
+	boolean_t found = B_FALSE;
+	uint64_t pool_guid;

+	/* Get pool configuration and extract pool GUID */
+	if ((config = zpool_get_config(zhp, NULL)) == NULL ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0)
+		goto out;
+
+	/* Skip this pool if we're looking for a specific pool */
+	if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid)
+		goto out;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0)
+		found = zfs_agent_iter_vdev(zhp, nvl, gsp);
+
+	if (found && gsp->gs_pool_guid == 0)
+		gsp->gs_pool_guid = pool_guid;
+
+out:
 	zpool_close(zhp);
-	return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0);
+	return (found);
 }

 void
@@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
 		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
 		 * ZFS_EV_POOL_GUID may be missing so find them.
 		 */
-		if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
-			if (devid == NULL)
-				search.gs_vdev_guid = vdev_guid;
-			else
-				search.gs_devid = devid;
-			zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
-			if (devid == NULL)
-				devid = search.gs_devid;
-			if (pool_guid == 0)
-				pool_guid = search.gs_pool_guid;
-			if (vdev_guid == 0)
-				vdev_guid = search.gs_vdev_guid;
-			devtype = search.gs_vdev_type;
-		}
+		search.gs_devid = devid;
+		search.gs_vdev_guid = vdev_guid;
+		search.gs_pool_guid = pool_guid;
+		zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+		if (devid == NULL)
+			devid = search.gs_devid;
+		if (pool_guid == 0)
+			pool_guid = search.gs_pool_guid;
+		if (vdev_guid == 0)
+			vdev_guid = search.gs_vdev_guid;
+		devtype = search.gs_vdev_type;

 		/*
 		 * We want to avoid reporting "remove" events coming from
@@ -441,8 +441,9 @@ zed_notify_slack_webhook()
        "${pathname}")"

    # Construct the JSON message for posting.
+    # shellcheck disable=SC2016
    #
-    msg_json="$(printf '{"text": "*%s*\\n%s"}' "${subject}" "${msg_body}" )"
+    msg_json="$(printf '{"text": "*%s*\\n```%s```"}' "${subject}" "${msg_body}" )"

    # Send the POST request and check for errors.
    #
@@ -37,6 +37,7 @@
 #include <assert.h>
 #include <ctype.h>
 #include <sys/debug.h>
+#include <dirent.h>
 #include <errno.h>
 #include <getopt.h>
 #include <libgen.h>
@@ -121,6 +122,7 @@ static int zfs_do_change_key(int argc, char **argv);
 static int zfs_do_project(int argc, char **argv);
 static int zfs_do_version(int argc, char **argv);
 static int zfs_do_redact(int argc, char **argv);
+static int zfs_do_rewrite(int argc, char **argv);
 static int zfs_do_wait(int argc, char **argv);

 #ifdef __FreeBSD__
@@ -193,6 +195,7 @@ typedef enum {
 	HELP_CHANGE_KEY,
 	HELP_VERSION,
 	HELP_REDACT,
+	HELP_REWRITE,
 	HELP_JAIL,
 	HELP_UNJAIL,
 	HELP_WAIT,
@@ -227,7 +230,7 @@ static zfs_command_t command_table[] = {
 	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
 	{ "rename",	zfs_do_rename,		HELP_RENAME		},
 	{ "bookmark",	zfs_do_bookmark,	HELP_BOOKMARK		},
-	{ "program",    zfs_do_channel_program, HELP_CHANNEL_PROGRAM    },
+	{ "diff",	zfs_do_diff,		HELP_DIFF		},
 	{ NULL },
 	{ "list",	zfs_do_list,		HELP_LIST		},
 	{ NULL },
@@ -249,27 +252,31 @@ static zfs_command_t command_table[] = {
 	{ NULL },
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
+	{ "redact",	zfs_do_redact,		HELP_REDACT		},
 	{ NULL },
 	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
-	{ NULL },
 	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
 	{ NULL },
 	{ "hold",	zfs_do_hold,		HELP_HOLD		},
 	{ "holds",	zfs_do_holds,		HELP_HOLDS		},
 	{ "release",	zfs_do_release,		HELP_RELEASE		},
-	{ "diff",	zfs_do_diff,		HELP_DIFF		},
+	{ NULL },
 	{ "load-key",	zfs_do_load_key,	HELP_LOAD_KEY		},
 	{ "unload-key",	zfs_do_unload_key,	HELP_UNLOAD_KEY		},
 	{ "change-key",	zfs_do_change_key,	HELP_CHANGE_KEY		},
-	{ "redact",	zfs_do_redact,		HELP_REDACT		},
+	{ NULL },
+	{ "program",	zfs_do_channel_program,	HELP_CHANNEL_PROGRAM	},
+	{ "rewrite",	zfs_do_rewrite,		HELP_REWRITE		},
 	{ "wait",	zfs_do_wait,		HELP_WAIT		},

 #ifdef __FreeBSD__
+	{ NULL },
 	{ "jail",	zfs_do_jail,		HELP_JAIL		},
 	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
 #endif

 #ifdef __linux__
+	{ NULL },
 	{ "zone",	zfs_do_zone,		HELP_ZONE		},
 	{ "unzone",	zfs_do_unzone,		HELP_UNZONE		},
 #endif
@@ -432,6 +439,9 @@ get_usage(zfs_help_t idx)
 	case HELP_REDACT:
 		return (gettext("\tredact <snapshot> <bookmark> "
 		    "<redaction_snapshot> ...\n"));
+	case HELP_REWRITE:
+		return (gettext("\trewrite [-rvx] [-o <offset>] [-l <length>] "
+		    "<directory|file ...>\n"));
 	case HELP_JAIL:
 		return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
 	case HELP_UNJAIL:
@@ -7716,6 +7726,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 	struct extmnttab entry;
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	ino_t path_inode;
+	char *zfs_mntpnt, *entry_mntpnt;

 	/*
 	 * Search for the given (major,minor) pair in the mount table.
@@ -7757,6 +7768,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 		goto out;
 	}

+	/*
+	 * If the filesystem is mounted, check that the mountpoint matches
+	 * the one in the mnttab entry w.r.t. provided path. If it doesn't,
+	 * then we should not proceed further.
+	 */
+	entry_mntpnt = strdup(entry.mnt_mountp);
+	if (zfs_is_mounted(zhp, &zfs_mntpnt)) {
+		if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) {
+			(void) fprintf(stderr, gettext("cannot %s '%s': "
+			    "not an original mountpoint\n"), cmdname, path);
+			free(zfs_mntpnt);
+			free(entry_mntpnt);
+			goto out;
+		}
+		free(zfs_mntpnt);
+	}
+	free(entry_mntpnt);
+
 	if (op == OP_SHARE) {
 		char nfs_mnt_prop[ZFS_MAXPROPLEN];
 		char smbshare_prop[ZFS_MAXPROPLEN];
@@ -9013,6 +9042,192 @@ zfs_do_project(int argc, char **argv)
 	return (ret);
 }

+/*
+ * Rewrite a single file: open it write-only and issue ZFS_IOC_REWRITE on
+ * the descriptor with the caller's arguments.  A failure of either step is
+ * reported on stderr and its errno value is returned.  When 'verbose' is
+ * set, the path of each successfully rewritten file is printed to stdout.
+ * Returns 0 on success.
+ */
+static int
+zfs_rewrite_file(const char *path, boolean_t verbose, zfs_rewrite_args_t *args)
+{
+	int error = 0;
+	int fd = open(path, O_WRONLY);
+
+	if (fd < 0) {
+		error = errno;
+		(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+		    path, strerror(error));
+		return (error);
+	}
+
+	if (ioctl(fd, ZFS_IOC_REWRITE, args) >= 0) {
+		if (verbose)
+			printf("%s\n", path);
+	} else {
+		error = errno;
+		(void) fprintf(stderr, gettext("failed to rewrite %s: %s\n"),
+		    path, strerror(error));
+	}
+
+	close(fd);
+	return (error);
+}
+
+/*
+ * Rewrite the regular files directly inside 'path' and add every
+ * subdirectory found there to 'dirs' (keyed by its full path, with 'dev' as
+ * the value) so the caller can scan it later; this function does not
+ * descend itself.  Entries that are neither regular files nor directories
+ * are skipped.  With 'xdev' set, entries whose st_dev differs from 'dev' are
+ * skipped as well.  A directory that cannot be opened because it does not
+ * exist is silently ignored.  Returns 0, or the most recent error seen.
+ */
+static int
+zfs_rewrite_dir(const char *path, boolean_t verbose, boolean_t xdev, dev_t dev,
+    zfs_rewrite_args_t *args, nvlist_t *dirs)
+{
+	int ret = 0;
+	DIR *dp = opendir(path);
+
+	if (dp == NULL) {
+		if (errno == ENOENT)
+			return (0);
+		ret = errno;
+		(void) fprintf(stderr, gettext("failed to opendir %s: %s\n"),
+		    path, strerror(ret));
+		return (ret);
+	}
+
+	/* Length of "<path>/", used to bound the joined name. */
+	const size_t prefix_len = strlen(path) + 1;
+	struct dirent *de;
+
+	while ((de = readdir(dp)) != NULL) {
+		const char *name = de->d_name;
+		const int is_file = (de->d_type == DT_REG);
+		char *child;
+
+		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+			continue;
+
+		if (!is_file && de->d_type != DT_DIR)
+			continue;
+
+		if (prefix_len + strlen(name) >= PATH_MAX) {
+			(void) fprintf(stderr, gettext("path too long %s/%s\n"),
+			    path, name);
+			ret = ENAMETOOLONG;
+			continue;
+		}
+
+		if (asprintf(&child, "%s/%s", path, name) == -1) {
+			(void) fprintf(stderr,
+			    gettext("failed to allocate memory\n"));
+			ret = ENOMEM;
+			continue;
+		}
+
+		/* From here on 'child' is owned and released at 'next'. */
+		if (xdev) {
+			struct stat sb;
+
+			if (lstat(child, &sb) < 0) {
+				ret = errno;
+				(void) fprintf(stderr,
+				    gettext("failed to stat %s: %s\n"),
+				    child, strerror(ret));
+				goto next;
+			}
+			if (sb.st_dev != dev)
+				goto next;
+		}
+
+		if (is_file) {
+			int err = zfs_rewrite_file(child, verbose, args);
+
+			if (err != 0)
+				ret = err;
+		} else {
+			/* Directory: queue it for the caller. */
+			fnvlist_add_uint64(dirs, child, dev);
+		}
+next:
+		free(child);
+	}
+
+	closedir(dp);
+	return (ret);
+}
+
+/*
+ * Handle one command-line target.  A regular file is rewritten directly.  A
+ * directory is scanned only when 'recurse' is set, using the target's own
+ * st_dev as the device for 'xdev' filtering.  The target is examined with
+ * lstat(); anything that is neither a regular file nor a directory is
+ * ignored and 0 is returned.  A failed lstat() is reported on stderr and its
+ * errno is returned.
+ */
+static int
+zfs_rewrite_path(const char *path, boolean_t verbose, boolean_t recurse,
+    boolean_t xdev, zfs_rewrite_args_t *args, nvlist_t *dirs)
+{
+	struct stat sb;
+
+	if (lstat(path, &sb) < 0) {
+		int error = errno;
+
+		(void) fprintf(stderr, gettext("failed to stat %s: %s\n"),
+		    path, strerror(error));
+		return (error);
+	}
+
+	if (S_ISREG(sb.st_mode))
+		return (zfs_rewrite_file(path, verbose, args));
+
+	if (recurse && S_ISDIR(sb.st_mode)) {
+		return (zfs_rewrite_dir(path, verbose, xdev, sb.st_dev, args,
+		    dirs));
+	}
+
+	return (0);
+}
+
+/*
+ * Parse a non-negative numeric argument for "zfs rewrite" (-l or -o).
+ * The entire string must be a number in a form strtoll() accepts with
+ * base 0.  On success the value is stored in *valp and 0 is returned;
+ * otherwise EINVAL is returned and *valp is left untouched.
+ */
+static int
+zfs_rewrite_parse_num(const char *str, long long *valp)
+{
+	char *end;
+	long long val;
+
+	errno = 0;
+	val = strtoll(str, &end, 0);
+	if (errno != 0 || end == str || *end != '\0' || val < 0)
+		return (EINVAL);
+
+	*valp = val;
+	return (0);
+}
+
+/*
+ * zfs rewrite [-rvx] [-o <offset>] [-l <length>] <directory|file ...>
+ *
+ * Issue ZFS_IOC_REWRITE on every named regular file.
+ *
+ *	-l	Value stored in args.len for the ioctl.
+ *	-o	Value stored in args.off for the ioctl.
+ *	-r	Also process the contents of named directories.
+ *	-v	Print the path of each file rewritten successfully.
+ *	-x	Skip directory entries whose st_dev differs from that of the
+ *		directory named on the command line.
+ *
+ * Returns 0 if everything succeeded, otherwise the last error seen.
+ */
+static int
+zfs_do_rewrite(int argc, char **argv)
+{
+	int ret = 0, err, c;
+	long long num = 0;
+	boolean_t recurse = B_FALSE, verbose = B_FALSE, xdev = B_FALSE;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	zfs_rewrite_args_t args;
+	memset(&args, 0, sizeof (args));
+
+	while ((c = getopt(argc, argv, "l:o:rvx")) != -1) {
+		switch (c) {
+		case 'l':
+			/* Reject garbage instead of silently using 0. */
+			if (zfs_rewrite_parse_num(optarg, &num) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid length '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			args.len = num;
+			break;
+		case 'o':
+			if (zfs_rewrite_parse_num(optarg, &num) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid offset '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			args.off = num;
+			break;
+		case 'r':
+			recurse = B_TRUE;
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case 'x':
+			xdev = B_TRUE;
+			break;
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    gettext("missing file or directory target(s)\n"));
+		usage(B_FALSE);
+	}
+
+	/*
+	 * Directories discovered while scanning are collected in 'dirs'
+	 * rather than visited recursively.
+	 */
+	nvlist_t *dirs = fnvlist_alloc();
+	for (int i = 0; i < argc; i++) {
+		err = zfs_rewrite_path(argv[i], verbose, recurse, xdev, &args,
+		    dirs);
+		if (err)
+			ret = err;
+	}
+
+	/*
+	 * Drain the queue: take the first pair, scan that directory (which
+	 * may add more pairs to 'dirs'), then remove the pair just handled.
+	 */
+	nvpair_t *dir;
+	while ((dir = nvlist_next_nvpair(dirs, NULL)) != NULL) {
+		err = zfs_rewrite_dir(nvpair_name(dir), verbose, xdev,
+		    fnvpair_value_uint64(dir), &args, dirs);
+		if (err)
+			ret = err;
+		fnvlist_remove_nvpair(dirs, dir);
+	}
+	fnvlist_free(dirs);
+
+	return (ret);
+}
+
 static int
 zfs_do_wait(int argc, char **argv)
 {
@@ -3881,7 +3881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 	 * If newvd is too small, it should fail with EOVERFLOW.
 	 *
 	 * If newvd is a distributed spare and it's being attached to a
-	 * dRAID which is not its parent it should fail with EINVAL.
+	 * dRAID which is not its parent it should fail with ENOTSUP.
 	 */
 	if (pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
@@ -3900,7 +3900,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 	else if (ashift > oldvd->vdev_top->vdev_ashift)
 		expected_error = EDOM;
 	else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
-		expected_error = EINVAL;
+		expected_error = ENOTSUP;
 	else
 		expected_error = 0;

@@ -7812,6 +7812,9 @@ ztest_dataset_open(int d)

 	ztest_dataset_name(name, ztest_opts.zo_pool, d);

+	if (ztest_opts.zo_verbose >= 6)
+		(void) printf("Opening %s\n", name);
+
 	(void) pthread_rwlock_rdlock(&ztest_name_lock);

 	error = ztest_dataset_create(name);
@@ -8307,41 +8310,44 @@ static void
 ztest_generic_run(ztest_shared_t *zs, spa_t *spa)
 {
 	kthread_t **run_threads;
-	int t;
+	int i, ndatasets;

 	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
 	    UMEM_NOFAIL);

+	/*
+	 * Actual number of datasets to be used.
+	 */
+	ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads);
+
+	/*
+	 * Prepare the datasets first.
+	 */
+	for (i = 0; i < ndatasets; i++)
+		VERIFY0(ztest_dataset_open(i));
+
 	/*
 	 * Kick off all the tests that run in parallel.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++) {
-		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
-			umem_free(run_threads, ztest_opts.zo_threads *
-			    sizeof (kthread_t *));
-			return;
-		}
-
-		run_threads[t] = thread_create(NULL, 0, ztest_thread,
-		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
+	for (i = 0; i < ztest_opts.zo_threads; i++) {
+		run_threads[i] = thread_create(NULL, 0, ztest_thread,
+		    (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE,
 		    defclsyspri);
 	}

 	/*
 	 * Wait for all of the tests to complete.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++)
-		VERIFY0(thread_join(run_threads[t]));
+	for (i = 0; i < ztest_opts.zo_threads; i++)
+		VERIFY0(thread_join(run_threads[i]));

 	/*
 	 * Close all datasets. This must be done after all the threads
 	 * are joined so we can be sure none of the datasets are in-use
 	 * by any of the threads.
 	 */
-	for (t = 0; t < ztest_opts.zo_threads; t++) {
-		if (t < ztest_opts.zo_datasets)
-			ztest_dataset_close(t);
-	}
+	for (i = 0; i < ndatasets; i++)
+		ztest_dataset_close(i);

 	txg_wait_synced(spa_get_dsl(spa), 0);

@@ -8464,6 +8470,7 @@ ztest_run(ztest_shared_t *zs)

 		int d = ztest_random(ztest_opts.zo_datasets);
 		ztest_dataset_destroy(d);
+		txg_wait_synced(spa_get_dsl(spa), 0);
 	}
 	zs->zs_enospc_count = 0;

@@ -72,7 +72,7 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.

-#serial 36
+#serial 37

 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
 AC_DEFUN([AX_PYTHON_DEVEL],[
@@ -316,7 +316,7 @@ EOD`
 			PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version"
 		fi

-		if test -z "PYTHON_LIBS"; then
+		if test -z "$PYTHON_LIBS"; then
 			AC_MSG_WARN([
  Cannot determine location of your Python DSO. Please check it was installed with
  dynamic libraries enabled, or try setting PYTHON_LIBS by hand.
@@ -0,0 +1,24 @@
+dnl #
+dnl # Linux 5.2 API change
+dnl #
+dnl # The test callback names its parameter on purpose: leaving the name out
+dnl # of a function definition is a C23 feature that older compilers reject
+dnl # ("parameter name omitted"), which would make this check fail even on
+dnl # kernels that do provide sops->free_inode().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [
+	ZFS_LINUX_TEST_SRC([super_operations_free_inode], [
+		#include <linux/fs.h>
+
+		static void free_inode(struct inode *ip) { }
+
+		static struct super_operations sops __attribute__ ((unused)) = {
+			.free_inode = free_inode,
+		};
+	],[])
+])
+
+dnl #
+dnl # Report the outcome of the super_operations_free_inode test and define
+dnl # HAVE_SOPS_FREE_INODE in the "yes" branch.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [
+	AC_MSG_CHECKING([whether sops->free_inode() exists])
+	ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -49,6 +49,15 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [
 		#error "STACK_FRAME_NON_STANDARD is not defined."
 		#endif
 	])
+
+	dnl # 6.15 made CONFIG_OBJTOOL_WERROR=y the default. We need to handle
+	dnl # this or our build will fail.
+	ZFS_LINUX_TEST_SRC([config_objtool_werror], [
+		#if !defined(CONFIG_OBJTOOL_WERROR)
+		#error "CONFIG_OBJTOOL_WERROR is not defined."
+		#endif
+	])
+
 ])

 AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [
@@ -84,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [
 		],[
 			AC_MSG_RESULT(no)
 		])
+
+		AC_MSG_CHECKING([whether CONFIG_OBJTOOL_WERROR is defined])
+		ZFS_LINUX_TEST_RESULT([config_objtool_werror],[
+			AC_MSG_RESULT(yes)
+			CONFIG_OBJTOOL_WERROR_DEFINED=yes
+		],[
+			AC_MSG_RESULT(no)
+		])
 	],[
 		AC_MSG_RESULT(no)
 	])
@@ -0,0 +1,23 @@
+dnl #
+dnl # Linux 6.16 removed readahead_page
+dnl #
+dnl # The test source calls readahead_page() on a struct readahead_control
+dnl # pointer after including <linux/pagemap.h>, so it only builds when that
+dnl # header still provides the function.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE], [
+	ZFS_LINUX_TEST_SRC([pagemap_has_readahead_page], [
+		#include <linux/pagemap.h>
+	], [
+		struct page *p __attribute__ ((unused)) = NULL;
+		struct readahead_control *ractl __attribute__ ((unused)) = NULL;
+		p = readahead_page(ractl);
+	])
+])
+
+dnl #
+dnl # Report the outcome of the pagemap_has_readahead_page test and define
+dnl # HAVE_PAGEMAP_READAHEAD_PAGE in the "yes" branch.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE], [
+	AC_MSG_CHECKING([whether readahead_page() exists])
+	ZFS_LINUX_TEST_RESULT([pagemap_has_readahead_page], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_PAGEMAP_READAHEAD_PAGE, 1,
+			[readahead_page() exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
@@ -0,0 +1,24 @@
+dnl #
+dnl # Linux 6.16 removes address_space_operations ->writepage
+dnl #
+dnl # The test source initializes the .writepage member of a
+dnl # struct address_space_operations, so it only builds while that member
+dnl # is still declared in <linux/fs.h>.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE], [
+	ZFS_LINUX_TEST_SRC([vfs_has_writepage], [
+		#include <linux/fs.h>
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.writepage = NULL,
+		};
+	],[])
+])
+
+dnl #
+dnl # Report the outcome of the vfs_has_writepage test and define
+dnl # HAVE_VFS_WRITEPAGE in the "yes" branch.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_VFS_WRITEPAGE], [
+	AC_MSG_CHECKING([whether aops->writepage exists])
+	ZFS_LINUX_TEST_RESULT([vfs_has_writepage], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_WRITEPAGE, 1,
+			[address_space_operations->writepage exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
@@ -82,6 +82,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_VFS_MIGRATEPAGE
 	ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_SRC_VFS_READPAGES
+	ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE
 	ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
@@ -111,6 +112,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG
 	ZFS_AC_KERNEL_SRC_STRLCPY
 	ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT
+	ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE
 	ZFS_AC_KERNEL_SRC_ADD_DISK
 	ZFS_AC_KERNEL_SRC_KTHREAD
 	ZFS_AC_KERNEL_SRC_ZERO_PAGE
@@ -132,6 +134,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
 	ZFS_AC_KERNEL_SRC_TIMER
 	ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR
+	ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -197,6 +200,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_VFS_MIGRATEPAGE
 	ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_VFS_READPAGES
+	ZFS_AC_KERNEL_VFS_WRITEPAGE
 	ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS
 	ZFS_AC_KERNEL_VFS_IOV_ITER
 	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
@@ -226,6 +230,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG
 	ZFS_AC_KERNEL_STRLCPY
 	ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT
+	ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE
 	ZFS_AC_KERNEL_ADD_DISK
 	ZFS_AC_KERNEL_KTHREAD
 	ZFS_AC_KERNEL_ZERO_PAGE
@@ -248,6 +253,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_PIN_USER_PAGES
 	ZFS_AC_KERNEL_TIMER
 	ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR
+	ZFS_AC_KERNEL_SOPS_FREE_INODE
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -38,9 +38,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [
 	AC_MSG_CHECKING([whether host toolchain supports SSE])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("xorps %xmm0, %xmm1");
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE])
@@ -57,9 +58,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [
 	AC_MSG_CHECKING([whether host toolchain supports SSE2])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("pxor %xmm0, %xmm1");
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2])
@@ -76,10 +78,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [
 	AC_MSG_CHECKING([whether host toolchain supports SSE3])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			char v[16];
 			__asm__ __volatile__("lddqu %0,%%xmm0" :: "m"(v[0]));
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3])
@@ -96,9 +99,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [
 	AC_MSG_CHECKING([whether host toolchain supports SSSE3])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("pshufb %xmm0,%xmm1");
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3])
@@ -115,9 +119,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [
 	AC_MSG_CHECKING([whether host toolchain supports SSE4.1])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("pmaxsb %xmm0,%xmm1");
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1])
@@ -134,9 +139,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [
 	AC_MSG_CHECKING([whether host toolchain supports SSE4.2])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("pcmpgtq %xmm0, %xmm1");
+			return (0);
 		}
 	]])], [
 		AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2])
@@ -153,10 +159,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [
 	AC_MSG_CHECKING([whether host toolchain supports AVX])

 	AC_LINK_IFELSE([AC_LANG_SOURCE([[
-		void main()
+		int main()
 		{
 			char v[32];
 			__asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0]));
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -174,9 +181,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -194,9 +202,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -214,9 +223,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vplzcntd %zmm0,%zmm1");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -234,9 +244,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -254,9 +265,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -274,9 +286,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -294,9 +307,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -314,9 +328,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -334,9 +349,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vexp2pd %zmm0,%zmm1");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -354,9 +370,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("vpabsq %zmm0,%zmm1");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -374,9 +391,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("aesenc %xmm0, %xmm1");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -394,9 +412,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0));
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -414,9 +433,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 			__asm__ __volatile__("movbe 0(%eax), %eax");
+			return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -434,10 +454,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 		  char b[4096] __attribute__ ((aligned (64)));
 		  __asm__ __volatile__("xsave %[b]\n" : : [b] "m" (*b) : "memory");
+		  return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -455,10 +476,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 		  char b[4096] __attribute__ ((aligned (64)));
 		  __asm__ __volatile__("xsaveopt %[b]\n" : : [b] "m" (*b) : "memory");
+		  return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -476,10 +498,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES], [

 	AC_LINK_IFELSE([AC_LANG_SOURCE([
 	[
-		void main()
+		int main()
 		{
 		  char b[4096] __attribute__ ((aligned (64)));
 		  __asm__ __volatile__("xsaves %[b]\n" : : [b] "m" (*b) : "memory");
+		  return (0);
 		}
 	]])], [
 		AC_MSG_RESULT([yes])
@@ -0,0 +1,34 @@
+dnl #
+dnl # Check for statx() function and STATX_MNT_ID availability
+dnl #
+dnl # Steps:
+dnl #  1. Look for <linux/stat.h>; without it only a warning is printed and
+dnl #     nothing is defined.
+dnl #  2. If statx() is found, define HAVE_STATX.
+dnl #  3. In that case also compile a snippet that uses STATX_MNT_ID and the
+dnl #     stx_mnt_id member of struct statx; on success define
+dnl #     HAVE_STATX_MNT_ID.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
+	AC_CHECK_HEADERS([linux/stat.h],
+		[have_stat_headers=yes],
+		[have_stat_headers=no])
+
+	AS_IF([test "x$have_stat_headers" = "xyes"], [
+		AC_CHECK_FUNC([statx], [
+			AC_DEFINE([HAVE_STATX], [1], [statx() is available])
+
+			dnl Check for STATX_MNT_ID availability
+			AC_MSG_CHECKING([for STATX_MNT_ID])
+			AC_COMPILE_IFELSE([
+				AC_LANG_PROGRAM([[
+					#include <linux/stat.h>
+				]], [[
+					struct statx stx;
+					int mask = STATX_MNT_ID;
+					(void)mask;
+					(void)stx.stx_mnt_id;
+				]])
+			], [
+				AC_MSG_RESULT([yes])
+				AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available])
+			], [
+				AC_MSG_RESULT([no])
+			])
+		])
+	], [
+		AC_MSG_WARN([linux/stat.h not found; skipping statx support])
+	])
+])  dnl end AC_DEFUN
@@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 		ZFS_AC_CONFIG_USER_LIBUDEV
 		ZFS_AC_CONFIG_USER_LIBUUID
 		ZFS_AC_CONFIG_USER_LIBBLKID
+		ZFS_AC_CONFIG_USER_STATX
 	])
 	ZFS_AC_CONFIG_USER_LIBTIRPC
 	ZFS_AC_CONFIG_USER_LIBCRYPTO
@@ -205,6 +205,46 @@ AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [
 	AC_MSG_RESULT([$enable_invariants])
 ])

+dnl # Disabled by default. If enabled, allows a configured "turn objtool
+dnl # warnings into errors" (CONFIG_OBJTOOL_WERROR) behavior to take effect.
+dnl # If disabled, objtool warnings are never turned into errors. It can't
+dnl # be enabled if the kernel wasn't compiled with CONFIG_OBJTOOL_WERROR=y.
+dnl #
+dnl # When CONFIG_OBJTOOL_WERROR_DEFINED is "yes" and the option is left
+dnl # disabled, OBJTOOL_DISABLE_WERROR is set to "y" and abs_objtool_binary
+dnl # to $kernelsrc/tools/objtool/objtool, which must be executable or
+dnl # configure aborts. Both variables are passed to AC_SUBST.
+dnl #
+AC_DEFUN([ZFS_AC_OBJTOOL_WERROR], [
+	AC_MSG_CHECKING([whether objtool error on warning behavior is enabled])
+	AC_ARG_ENABLE([objtool-werror],
+		[AS_HELP_STRING([--enable-objtool-werror],
+		[Enable objtool's error on warning behaviour if present @<:@default=no@:>@])],
+		[enable_objtool_werror=$enableval],
+		[enable_objtool_werror=no])
+	AC_MSG_RESULT([$enable_objtool_werror])
+
+	AS_IF([test x$CONFIG_OBJTOOL_WERROR_DEFINED = xyes],[
+		AS_IF([test x$enable_objtool_werror = xyes],[
+			AC_MSG_NOTICE([enable-objtool-werror defined, keeping -Werror ])
+		],[
+			AC_MSG_NOTICE([enable-objtool-werror undefined, disabling -Werror ])
+			OBJTOOL_DISABLE_WERROR=y
+			abs_objtool_binary=$kernelsrc/tools/objtool/objtool
+			AS_IF([test -x $abs_objtool_binary],[],[
+				AC_MSG_ERROR([*** objtool binary $abs_objtool_binary not found])
+			])
+			dnl # The path to the wrapper is defined in modules/Makefile.in.
+		])
+	],[
+		dnl # We can't enable --Werror if it's not there.
+		AS_IF([test x$enable_objtool_werror = xyes],[
+			AC_MSG_ERROR([
+	*** Cannot enable objtool-werror,
+	*** a kernel built with CONFIG_OBJTOOL_WERROR=y is required.
+			])
+		],[])
+	])
+
+	AC_SUBST(OBJTOOL_DISABLE_WERROR)
+	AC_SUBST(abs_objtool_binary)
+])
+
 AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [
 	AX_COUNT_CPUS([])
 	AC_SUBST(CPU_COUNT)
@@ -65,6 +65,7 @@ ZFS_AC_DEBUGINFO
 ZFS_AC_DEBUG_KMEM
 ZFS_AC_DEBUG_KMEM_TRACKING
 ZFS_AC_DEBUG_INVARIANTS
+ZFS_AC_OBJTOOL_WERROR

 AC_CONFIG_FILES([
 	contrib/debian/rules
@@ -86,6 +87,7 @@ AC_CONFIG_FILES([
 	zfs.release
 ])

+AC_CONFIG_FILES([scripts/objtool-wrapper], [chmod +x scripts/objtool-wrapper])

 AC_OUTPUT

@@ -100,8 +100,8 @@ Depends: ${misc:Depends}, ${shlibs:Depends}
 # The libcurl4 is loaded through dlopen("libcurl.so.4").
 # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521
 Recommends: libcurl4
-Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux
-Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux
+Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
+Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
 Conflicts: libzfs6linux
 Description: OpenZFS filesystem library for Linux - general support
 OpenZFS is a storage platform that encompasses the functionality of
@@ -128,8 +128,8 @@ Package: openzfs-libzpool6
 Section: contrib/libs
 Architecture: linux-any
 Depends: ${misc:Depends}, ${shlibs:Depends}
-Breaks: libzpool2, libzpool5, libzpool5linux, libzpool6linux
-Replaces: libzpool2, libzpool5, libzpool5linux, libzpool6linux
+Breaks: libzpool2, libzpool5, libzpool6linux
+Replaces: libzpool2, libzpool5, libzpool6linux
 Conflicts: libzpool6linux
 Description: OpenZFS pool library for Linux
 OpenZFS is a storage platform that encompasses the functionality of
@@ -8,6 +8,7 @@ lib/systemd/system/zfs-import-scan.service
 lib/systemd/system/zfs-import.target
 lib/systemd/system/zfs-load-key.service
 lib/systemd/system/zfs-mount.service
+lib/systemd/system/zfs-mount@.service
 lib/systemd/system/zfs-scrub-monthly@.timer
 lib/systemd/system/zfs-scrub-weekly@.timer
 lib/systemd/system/zfs-scrub@.service
@@ -73,6 +74,7 @@ usr/share/man/man8/zfs-recv.8
 usr/share/man/man8/zfs-redact.8
 usr/share/man/man8/zfs-release.8
 usr/share/man/man8/zfs-rename.8
+usr/share/man/man8/zfs-rewrite.8
 usr/share/man/man8/zfs-rollback.8
 usr/share/man/man8/zfs-send.8
 usr/share/man/man8/zfs-set.8
@@ -93,7 +93,7 @@ override_dh_auto_install:
 	@# Install the DKMS source.
 	@# We only want the files needed to build the modules
 	install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \
-		'$(CURDIR)/scripts/dkms.postbuild'
+		'$(CURDIR)/scripts/dkms.postbuild' '$(CURDIR)/scripts/objtool-wrapper.in'
 	$(foreach file,$(DKMSFILES),mv '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/$(file)' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)' || exit 1;)

 	@# Only ever build Linux modules
@@ -108,8 +108,8 @@ override_dh_auto_install:
 	@#        - zfs.release$
 	@#  * Takes care of spaces and tabs
 	@#  * Remove reference to ZFS_AC_PACKAGE
-	awk '/^AC_CONFIG_FILES\(\[/,/^\]\)/ {\
-		if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))/) \
+	awk '/^AC_CONFIG_FILES\(\[/,/\]\)/ {\
+		if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))|scripts\/objtool-wrapper.*\]\)$$/) \
 		{next} } {print}' \
 		'$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' | sed '/ZFS_AC_PACKAGE/d' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac'
 	@# Set "SUBDIRS = module include" for CONFIG_KERNEL and remove SUBDIRS for all other configs.
@@ -56,6 +56,7 @@ systemdunit_DATA = \
 	%D%/systemd/system/zfs-import-scan.service \
 	%D%/systemd/system/zfs-import.target \
 	%D%/systemd/system/zfs-mount.service \
+	%D%/systemd/system/zfs-mount@.service \
 	%D%/systemd/system/zfs-scrub-monthly@.timer \
 	%D%/systemd/system/zfs-scrub-weekly@.timer \
 	%D%/systemd/system/zfs-scrub@.service \
@@ -0,0 +1,26 @@
+[Unit]
+Description=Mount ZFS filesystem %I
+Documentation=man:zfs(8)
+DefaultDependencies=no
+After=systemd-udev-settle.service
+After=zfs-import.target
+After=zfs-mount.service
+After=systemd-remount-fs.service
+Before=local-fs.target
+ConditionPathIsDirectory=/sys/module/zfs
+
+# This merely tells the service manager
+# that unmounting everything undoes the
+# effect of this service. No extra logic
+# is run as a result of these settings.
+Conflicts=umount.target
+Before=umount.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+EnvironmentFile=-@initconfdir@/zfs
+ExecStart=@sbindir@/zfs mount -R %I
+
+[Install]
+WantedBy=zfs.target
@@ -56,4 +56,9 @@ struct opensolaris_utsname {
 #define	task_io_account_read(n)
 #define	task_io_account_write(n)

+/*
+ * Check if the current thread is a memory reclaim thread.
+ */
+extern int current_is_reclaim_thread(void);
+
 #endif	/* _OPENSOLARIS_SYS_MISC_H_ */
@@ -45,7 +45,9 @@
 #ifdef _KERNEL
 #define	CPU		curcpu
 #define	minclsyspri	PRIBIO
-#define	defclsyspri minclsyspri
+#define	defclsyspri	minclsyspri
+/* Write issue taskq priority. */
+#define	wtqclsyspri	((PVM + PRIBIO) / 2)
 #define	maxclsyspri	PVM
 #define	max_ncpus	(mp_maxid + 1)
 #define	boot_max_ncpus	(mp_maxid + 1)
@@ -8,6 +8,7 @@ kernel_linux_HEADERS = \
 	%D%/kernel/linux/mm_compat.h \
 	%D%/kernel/linux/mod_compat.h \
 	%D%/kernel/linux/page_compat.h \
+	%D%/kernel/linux/pagemap_compat.h \
 	%D%/kernel/linux/simd.h \
 	%D%/kernel/linux/simd_aarch64.h \
 	%D%/kernel/linux/simd_arm.h \
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
+#ifndef _ZFS_PAGEMAP_COMPAT_H
+#define	_ZFS_PAGEMAP_COMPAT_H
+
+#include <linux/pagemap.h>
+
+/*
+ * Fallback used when configure did not define HAVE_PAGEMAP_READAHEAD_PAGE:
+ * take the folio returned by __readahead_folio() and yield the address of
+ * its 'page' member, so callers can keep using readahead_page().
+ */
+#ifndef HAVE_PAGEMAP_READAHEAD_PAGE
+#define	readahead_page(ractl) (&(__readahead_folio(ractl)->page))
+#endif
+
+#endif	/* _ZFS_PAGEMAP_COMPAT_H */
@@ -139,15 +139,6 @@
 */
 #if defined(HAVE_KERNEL_FPU_INTERNAL)

-/*
- * For kernels not exporting *kfpu_{begin,end} we have to use inline assembly
- * with the XSAVE{,OPT,S} instructions, so we need the toolchain to support at
- * least XSAVE.
- */
-#if !defined(HAVE_XSAVE)
-#error "Toolchain needs to support the XSAVE assembler instruction"
-#endif
-
 #ifndef XFEATURE_MASK_XTILE
 /*
 * For kernels where this doesn't exist yet, we still don't want to break
@@ -335,9 +326,13 @@ kfpu_begin(void)
 		return;
 	}
 #endif
+#if defined(HAVE_XSAVE)
 	if (static_cpu_has(X86_FEATURE_XSAVE)) {
 		kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE);
-	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
+		return;
+	}
+#endif
+	if (static_cpu_has(X86_FEATURE_FXSR)) {
 		kfpu_save_fxsr(state);
 	} else {
 		kfpu_save_fsave(state);
@@ -390,9 +385,13 @@ kfpu_end(void)
 		goto out;
 	}
 #endif
+#if defined(HAVE_XSAVE)
 	if (static_cpu_has(X86_FEATURE_XSAVE)) {
 		kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE);
-	} else if (static_cpu_has(X86_FEATURE_FXSR)) {
+		goto out;
+	}
+#endif
+	if (static_cpu_has(X86_FEATURE_FXSR)) {
 		kfpu_restore_fxsr(state);
 	} else {
 		kfpu_restore_fsave(state);
@@ -24,7 +24,13 @@
 #define	_OS_LINUX_SPL_MISC_H

 #include <linux/kobject.h>
+#include <linux/swap.h>

 extern void spl_signal_kobj_evt(struct block_device *bdev);

+/*
+ * Check if the current thread is a memory reclaim thread.
+ */
+extern int current_is_reclaim_thread(void);
+
 #endif
@@ -92,8 +92,10 @@
 * Treat shim tasks as SCHED_NORMAL tasks
 */
 #define	minclsyspri			(MAX_PRIO-1)
-#define	maxclsyspri			(MAX_RT_PRIO)
 #define	defclsyspri			(DEFAULT_PRIO)
+/* Write issue taskq priority. */
+#define	wtqclsyspri			(MAX_RT_PRIO + 1)
+#define	maxclsyspri			(MAX_RT_PRIO)

 #ifndef NICE_TO_PRIO
 #define	NICE_TO_PRIO(nice)		(MAX_RT_PRIO + (nice) + 20)
@@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __field(uint64_t,		z_size)
 	    __field(uint64_t,		z_pflags)
 	    __field(uint32_t,		z_sync_cnt)
-	    __field(uint32_t,		z_sync_writes_cnt)
-	    __field(uint32_t,		z_async_writes_cnt)
 	    __field(mode_t,		z_mode)
 	    __field(boolean_t,		z_is_sa)
 	    __field(boolean_t,		z_is_ctldir)
@@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __entry->z_size		= zn->z_size;
 	    __entry->z_pflags		= zn->z_pflags;
 	    __entry->z_sync_cnt		= zn->z_sync_cnt;
-	    __entry->z_sync_writes_cnt	= zn->z_sync_writes_cnt;
-	    __entry->z_async_writes_cnt	= zn->z_async_writes_cnt;
 	    __entry->z_mode		= zn->z_mode;
 	    __entry->z_is_sa		= zn->z_is_sa;
 	    __entry->z_is_ctldir	= zn->z_is_ctldir;
@@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	TP_printk("zn { id %llu unlinked %u atime_dirty %u "
 	    "zn_prefetch %u blksz %u seq %u "
 	    "mapcnt %llu size %llu pflags %llu "
-	    "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
+	    "sync_cnt %u "
 	    "mode 0x%x is_sa %d is_ctldir %d "
 	    "inode { uid %u gid %u ino %lu nlink %u size %lli "
 	    "blkbits %u bytes %u mode 0x%x generation %x } } "
@@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __entry->z_zn_prefetch, __entry->z_blksz,
 	    __entry->z_seq, __entry->z_mapcnt, __entry->z_size,
 	    __entry->z_pflags, __entry->z_sync_cnt,
-	    __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
 	    __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir,
 	    __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink,
 	    __entry->i_size, __entry->i_blkbits,
@@ -157,6 +157,7 @@ struct znode;

 extern int	zfs_sync(struct super_block *, int, cred_t *);
 extern int	zfs_inode_alloc(struct super_block *, struct inode **ip);
+extern void	zfs_inode_free(struct inode *);
 extern void	zfs_inode_destroy(struct inode *);
 extern void	zfs_mark_inode_dirty(struct inode *);
 extern boolean_t zfs_relatime_need_update(const struct inode *);
@@ -954,7 +954,7 @@ typedef struct arc_sums {
 	wmsum_t arcstat_data_size;
 	wmsum_t arcstat_metadata_size;
 	wmsum_t arcstat_dbuf_size;
-	wmsum_t arcstat_dnode_size;
+	aggsum_t arcstat_dnode_size;
 	wmsum_t arcstat_bonus_size;
 	wmsum_t arcstat_l2_hits;
 	wmsum_t arcstat_l2_misses;
@@ -174,6 +174,7 @@ typedef struct dbuf_dirty_record {
 			arc_buf_t *dr_data;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
+			uint8_t dr_gang_copies;
 			boolean_t dr_nopwrite;
 			boolean_t dr_brtwrite;
 			boolean_t dr_diowrite;
@@ -286,14 +286,11 @@ typedef struct {
 	ddt_log_t	*ddt_log_active;	/* pointers into ddt_log */
 	ddt_log_t	*ddt_log_flushing;	/* swapped when flush starts */

-	hrtime_t	ddt_flush_start;	/* log flush start this txg */
-	uint32_t	ddt_flush_pass;		/* log flush pass this txg */
-
-	int32_t		ddt_flush_count;	/* entries flushed this txg */
-	int32_t		ddt_flush_min;		/* min rem entries to flush */
 	int32_t		ddt_log_ingest_rate;	/* rolling log ingest rate */
 	int32_t		ddt_log_flush_rate;	/* rolling log flush rate */
 	int32_t		ddt_log_flush_time_rate; /* avg time spent flushing */
+	uint32_t	ddt_log_flush_pressure;	/* pressure to apply for cap */
+	uint32_t	ddt_log_flush_prev_backlog; /* prev backlog size */

 	uint64_t	ddt_flush_force_txg;	/* flush hard before this txg */

@@ -144,9 +144,9 @@ typedef enum dmu_object_byteswap {
 #define	DMU_OT_IS_DDT(ot) \
 	((ot) == DMU_OT_DDT_ZAP)

-#define	DMU_OT_IS_CRITICAL(ot) \
+#define	DMU_OT_IS_CRITICAL(ot, level) \
 	(DMU_OT_IS_METADATA(ot) && \
-	(ot) != DMU_OT_DNODE && \
+	((ot) != DMU_OT_DNODE || (level) > 0) && \
 	(ot) != DMU_OT_DIRECTORY_CONTENTS && \
 	(ot) != DMU_OT_SA)

@@ -1614,6 +1614,15 @@ typedef enum zfs_ioc {

 #endif

+typedef struct zfs_rewrite_args {
+	uint64_t	off;
+	uint64_t	len;
+	uint64_t	flags;
+	uint64_t	arg;
+} zfs_rewrite_args_t;
+
+#define	ZFS_IOC_REWRITE		_IOW(0x83, 3, zfs_rewrite_args_t)
+
 /*
 * ZFS-specific error codes used for returning descriptive errors
 * to the userland through zfs ioctls.
@@ -568,6 +568,8 @@ typedef struct metaslab_unflushed_phys {
 	uint64_t	msp_unflushed_txg;
 } metaslab_unflushed_phys_t;

+char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *);
+
 #ifdef	__cplusplus
 }
 #endif
@@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type {
 	ZFS_RANGE_SEG_NUM_TYPES,
 } zfs_range_seg_type_t;

+#define	ZFS_RT_NAME(rt)		(((rt)->rt_name != NULL) ? (rt)->rt_name : "")
+#define	ZFS_RT_F_DYN_NAME	(1ULL << 0) /* if rt_name must be freed */
+
 /*
 * Note: the range_tree may not be accessed concurrently; consumers
 * must provide external locking if required.
@@ -68,6 +71,9 @@ typedef struct zfs_range_tree {
 	void		*rt_arg;
 	uint64_t	rt_gap;		/* allowable inter-segment gap */

+	uint64_t	rt_flags;
+	const char	*rt_name;	/* details for debugging */
+
 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
 	 * rt_histogram[i], contains the number of ranges whose size is:
@@ -281,6 +287,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    uint64_t gap);
 zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
+zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t flags, const char *name);
 void zfs_range_tree_destroy(zfs_range_tree_t *rt);
 boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
    uint64_t size);
@@ -173,6 +173,7 @@ extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
 extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
+extern boolean_t vdev_queue_pool_busy(spa_t *spa);

 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
@@ -651,6 +651,7 @@ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
 int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
 #endif
 int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
+char *vdev_rt_name(vdev_t *vd, const char *name);

 /*
 * Vdev ashift optimization tunables
@@ -236,6 +236,11 @@ typedef pthread_t	kthread_t;
 #define	thread_join(t)	pthread_join((pthread_t)(t), NULL)

 #define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Always returns false in userspace (no memory reclaim thread).
+ */
+#define	current_is_reclaim_thread()	(0)

 /* in libzpool, p0 exists only to have its address taken */
 typedef struct proc {
@@ -623,8 +628,10 @@ extern void delay(clock_t ticks);
 * Process priorities as defined by setpriority(2) and getpriority(2).
 */
 #define	minclsyspri	19
-#define	maxclsyspri	-20
 #define	defclsyspri	0
+/* Write issue taskq priority. */
+#define	wtqclsyspri	-19
+#define	maxclsyspri	-20

 #define	CPU_SEQID	((uintptr_t)pthread_self() & (max_ncpus - 1))
 #define	CPU_SEQID_UNSTABLE	CPU_SEQID
@@ -60,6 +60,7 @@ extern int zfs_dbgmsg_enable;
 #define	ZFS_DEBUG_METASLAB_ALLOC	(1 << 13)
 #define	ZFS_DEBUG_BRT			(1 << 14)
 #define	ZFS_DEBUG_RAIDZ_RECONSTRUCT	(1 << 15)
+#define	ZFS_DEBUG_DDT			(1 << 16)

 extern void __set_error(const char *file, const char *func, int line, int err);
 extern void __zfs_dbgmsg(char *buf);
@@ -40,6 +40,7 @@ extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
    uint64_t *, cred_t *);
 extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
    const blkptr_t *, size_t);
+extern int zfs_rewrite(znode_t *, uint64_t, uint64_t, uint64_t, uint64_t);

 extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
 extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
@@ -201,8 +201,6 @@ typedef struct znode {
 	uint64_t	z_size;		/* file size (cached) */
 	uint64_t	z_pflags;	/* pflags (cached) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
-	uint32_t	z_sync_writes_cnt; /* synchronous write count */
-	uint32_t	z_async_writes_cnt; /* asynchronous write count */
 	mode_t		z_mode;		/* mode (cached) */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	zfs_acl_t	*z_acl_cached;	/* cached acl */
@@ -350,6 +350,7 @@ typedef struct zio_prop {
 	uint8_t			zp_complevel;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
+	uint8_t			zp_gang_copies;
 	dmu_object_type_t	zp_type;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
@@ -575,7 +576,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);

 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
-    boolean_t nopwrite, boolean_t brtwrite);
+    int gang_copies, boolean_t nopwrite, boolean_t brtwrite);

 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

@@ -31,6 +31,11 @@

 #include <sys/mount.h> /* for BLKGETSIZE64 */

+#ifdef HAVE_STATX
+#include <fcntl.h>
+#include <linux/stat.h>
+#endif
+
 /*
 * Emulate Solaris' behavior of returning the block device size in fstat64().
 */
@@ -85,13 +85,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp)
 }

 static int
-getextmntent_impl(FILE *fp, struct extmnttab *mp)
+getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id)
 {
 	int ret;
 	struct stat64 st;

+	*mnt_id = 0;
 	ret = _sol_getmntent(fp, (struct mnttab *)mp);
 	if (ret == 0) {
+#ifdef HAVE_STATX_MNT_ID
+		struct statx stx;
+		if (statx(AT_FDCWD, mp->mnt_mountp,
+		    AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
+		    STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID))
+			*mnt_id = stx.stx_mnt_id;
+#endif
 		if (stat64(mp->mnt_mountp, &st) != 0) {
 			mp->mnt_major = 0;
 			mp->mnt_minor = 0;
@@ -110,6 +118,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 	struct stat64 st;
 	FILE *fp;
 	int match;
+	boolean_t have_mnt_id = B_FALSE;
+	uint64_t target_mnt_id = 0;
+	uint64_t entry_mnt_id;
+#ifdef HAVE_STATX_MNT_ID
+	struct statx stx;
+#endif

 	if (strlen(path) >= MAXPATHLEN) {
 		(void) fprintf(stderr, "invalid object; pathname too long\n");
@@ -128,6 +142,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 		return (-1);
 	}

+#ifdef HAVE_STATX_MNT_ID
+	if (statx(AT_FDCWD, path, AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
+	    STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) {
+		have_mnt_id = B_TRUE;
+		target_mnt_id = stx.stx_mnt_id;
+	}
+#endif

 	if ((fp = fopen(MNTTAB, "re")) == NULL) {
 		(void) fprintf(stderr, "cannot open %s\n", MNTTAB);
@@ -139,12 +160,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
 	 */

 	match = 0;
-	while (getextmntent_impl(fp, entry) == 0) {
-		if (makedev(entry->mnt_major, entry->mnt_minor) ==
-		    statbuf->st_dev) {
-			match = 1;
-			break;
+	while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) {
+		if (have_mnt_id) {
+			match = (entry_mnt_id == target_mnt_id);
+		} else {
+			match = makedev(entry->mnt_major, entry->mnt_minor) ==
+			    statbuf->st_dev;
 		}
+		if (match)
+			break;
 	}
 	(void) fclose(fp);

@@ -50,6 +50,7 @@ dist_man_MANS = \
 	%D%/man8/zfs-redact.8 \
 	%D%/man8/zfs-release.8 \
 	%D%/man8/zfs-rename.8 \
+	%D%/man8/zfs-rewrite.8 \
 	%D%/man8/zfs-rollback.8 \
 	%D%/man8/zfs-send.8 \
 	%D%/man8/zfs-set.8 \
@@ -1057,27 +1057,6 @@ milliseconds until the operation completes.
 .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Enable prefetching dedup-ed blocks which are going to be freed.
 .
-.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
-Maximum number of dedup log flush passes (iterations) each transaction.
-.Pp
-At the start of each transaction, OpenZFS will estimate how many entries it
-needs to flush out to keep up with the change rate, taking the amount and time
-taken to flush on previous txgs into account (see
-.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
-It will spread this amount into a number of passes.
-At each pass, it will use the amount already flushed and the total time taken
-by flushing and by other IO to recompute how much it should do for the remainder
-of the txg.
-.Pp
-Reducing the max number of passes will make flushing more aggressive, flushing
-out more entries on each pass.
-This can be faster, but also more likely to compete with other IO.
-Increasing the max number of passes will put fewer entries onto each pass,
-keeping the overhead of dedup changes to a minimum but possibly causing a large
-number of changes to be dumped on the last pass, which can blow out the txg
-sync time beyond
-.Sy zfs_txg_timeout .
-.
 .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
 Minimum time to spend on dedup log flush each transaction.
 .Pp
@@ -1087,22 +1066,58 @@ up to
 This occurs even if doing so would delay the transaction, that is, other IO
 completes under this time.
 .
-.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
+.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint
 Flush at least this many entries each transaction.
 .Pp
-OpenZFS will estimate how many entries it needs to flush each transaction to
-keep up with the ingest rate (see
-.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
-This sets the minimum for that estimate.
-Raising it can force OpenZFS to flush more aggressively, keeping the log small
-and so reducing pool import times, but can make it less able to back off if
-log flushing would compete with other IO too much.
+OpenZFS will flush a fraction of the log every TXG, to keep the size
+proportional to the ingest rate (see
+.Sy zfs_dedup_log_flush_txgs ) .
+This sets the minimum for that estimate, which prevents the backlog from
+completely draining if the ingest rate falls.
+Raising it can force OpenZFS to flush more aggressively, reducing the backlog
+to zero more quickly, but can make it less able to back off if log
+flushing would compete with other IO too much.
 .
+.It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint
+Flush at most this many entries each transaction.
+.Pp
+Mostly used for debugging purposes.
+.It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint
+Target number of TXGs to process the whole dedup log.
+.Pp
+Every TXG, OpenZFS will process the inverse of this number times the size
+of the DDT backlog.
+This will keep the backlog at a size roughly equal to the ingest rate
+times this value.
+This offers a balance between a more efficient DDT log, with better
+aggregation, and shorter import times, which increase as the size of the
+DDT log increases.
+Increasing this value will result in a more efficient DDT log, but longer
+import times.
+.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint
+Soft cap for the size of the current dedup log.
+.Pp
+If the log is larger than this size, we increase the aggressiveness of
+the flushing to try to bring it back down to the soft cap.
+Setting it will reduce import times, but will reduce the efficiency of
+the DDT log, increasing the expected number of IOs required to flush the same
+amount of data.
+.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Whether to treat the log cap as a firm cap or not.
+.Pp
+When set to 0 (the default), the
+.Sy zfs_dedup_log_cap
+will increase the maximum number of log entries we flush in a given txg.
+This will bring the backlog size down towards the cap, but not at the expense
+of making TXG syncs take longer.
+If this is set to 1, the cap acts more like a hard cap than a soft cap; it will
+also increase the minimum number of log entries we flush per TXG.
+Enabling it will reduce worst-case import times, at the cost of increased TXG
+sync times.
 .It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
 Number of transactions to use to compute the flow rate.
 .Pp
-OpenZFS will estimate how many entries it needs to flush each transaction by
-monitoring the number of entries changed (ingest rate), number of entries
+OpenZFS estimates the number of entries changed (ingest rate), number of entries
 flushed (flush rate) and time spent flushing (flush time rate) and combining
 these into an overall "flow rate".
 It will use an exponential weighted moving average over some number of recent
@@ -1369,14 +1384,15 @@ If this setting is 0, then even if feature@block_cloning is enabled,
 using functions and system calls that attempt to clone blocks will act as
 though the feature is disabled.
 .
-.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
-When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
-written to disk.
-This allows the clone operation to reliably succeed when a file is
+.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int
+When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
+data to be written to disk before proceeding.
+This ensures that the clone operation reliably succeeds, even if a file is
 modified and then immediately cloned.
-For small files this may be slower than making a copy of the file.
-Therefore, this setting defaults to 0 which causes a clone operation to
-immediately fail when encountering a dirty block.
+Note that for small files this may be slower than simply copying the file.
+When set to 0 the clone operation will immediately fail if it encounters
+any dirty blocks.
+By default waiting is enabled.
 .
 .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
 Select a BLAKE3 implementation.
@@ -1638,6 +1654,10 @@ _
 	2048	ZFS_DEBUG_TRIM	Verify TRIM ranges are always within the allocatable range tree.
 	4096	ZFS_DEBUG_LOG_SPACEMAP	Verify that the log summary is consistent with the spacemap log
 			       and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing.
+	8192	ZFS_DEBUG_METASLAB_ALLOC	Enable debugging messages when allocations fail.
+	16384	ZFS_DEBUG_BRT	Enable BRT-related debugging messages.
+	32768	ZFS_DEBUG_RAIDZ_RECONSTRUCT	Enable debugging messages for raidz reconstruction.
+	65536	ZFS_DEBUG_DDT	Enable DDT-related debugging messages.
 .TE
 .Sy \& * No Requires debug build .
 .
@@ -1596,7 +1596,8 @@ When set to
 ZFS stores an extra copy of only critical metadata.
 This can improve file create performance since less metadata
 needs to be written.
-If a single on-disk block is corrupt, at worst a single user file can be lost.
+If a single on-disk block is corrupt, multiple user files or directories
+can be lost.
 .Pp
 When set to
 .Sy none ,
@@ -0,0 +1,76 @@
+.\" SPDX-License-Identifier: CDDL-1.0
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or https://opensource.org/licenses/CDDL-1.0.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\" Copyright (c) 2025 iXsystems, Inc.
+.\"
+.Dd May 6, 2025
+.Dt ZFS-REWRITE 8
+.Os
+.
+.Sh NAME
+.Nm zfs-rewrite
+.Nd rewrite specified files without modification
+.Sh SYNOPSIS
+.Nm zfs
+.Cm rewrite
+.Oo Fl rvx Ns Oc
+.Op Fl l Ar length
+.Op Fl o Ar offset
+.Ar file Ns | Ns Ar directory Ns …
+.
+.Sh DESCRIPTION
+Rewrite blocks of specified
+.Ar file
+as is without modification at a new location and possibly with new
+properties, such as checksum, compression, dedup, copies, etc,
+as if they were atomically read and written back.
+.Bl -tag -width "-r"
+.It Fl l Ar length
+Rewrite at most this number of bytes.
+.It Fl o Ar offset
+Start at this offset in bytes.
+.It Fl r
+Recurse into directories.
+.It Fl v
+Print names of all successfully rewritten files.
+.It Fl x
+Don't cross file system mount points when recursing.
+.El
+.Sh NOTES
+Rewriting cloned blocks and blocks that are part of any snapshot,
+as well as some property changes, may increase pool space usage.
+Holes that were never written or were previously zero-compressed are
+not rewritten and will remain holes even if compression is disabled.
+.Pp
+Rewritten blocks will be seen as modified in the next snapshot and, as such,
+will be included in the incremental
+.Nm zfs Cm send
+stream.
+.Pp
+If a
+.Fl l
+or
+.Fl o
+value requests a rewrite of regions past the end of the file, then those
+regions are silently ignored, and no error is reported.
+.
+.Sh SEE ALSO
+.Xr zfsprops 7
@@ -37,7 +37,7 @@
 .\" Copyright 2018 Nexenta Systems, Inc.
 .\" Copyright 2019 Joyent, Inc.
 .\"
-.Dd May 12, 2022
+.Dd April 18, 2025
 .Dt ZFS 8
 .Os
 .
@@ -299,6 +299,12 @@ Execute ZFS administrative operations
 programmatically via a Lua script-language channel program.
 .El
 .
+.Ss Data rewrite
+.Bl -tag -width ""
+.It Xr zfs-rewrite 8
+Rewrite specified files without modification.
+.El
+.
 .Ss Jails
 .Bl -tag -width ""
 .It Xr zfs-jail 8
@@ -494,3 +494,34 @@ UBSAN_SANITIZE_zfs/sa.o := n
 ifeq ($(CONFIG_ALTIVEC),y)
 $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec
 endif
+
+# The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the
+# subdir %.c/%.S -> %.o targets will work as expected. The in-kernel pattern targets do not seem to
+# be working on subdirs since about 6.10
+zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs))                                             \
+  $(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64)))  \
+  $(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC)))
+
+z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \
+  $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/)                                          \
+  $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/)                                             \
+  $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
+z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/)     \
+  $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/)                                          \
+  $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/)                                             \
+  $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
+
+define ZKMOD_C_O_MAKE_TARGET
+$1%.o: $(src)/$1%.c FORCE
+	$$(call if_changed_rule,cc_o_c)
+	$$(call cmd,force_checksrc)
+endef
+
+define ZKMOD_S_O_MAKE_TARGET
+$1%.o: $(src)/$1%.S FORCE
+	$$(call if_changed_rule,as_o_S)
+	$$(call cmd,force_checksrc)
+endef
+
+$(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target))))
+$(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target))))
@@ -57,6 +57,7 @@ modules-Linux:
 		$(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \
 		$(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \
 		$(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \
+		$(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \
 		M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules

 modules-FreeBSD:
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
 	va_end(ap);
 }

+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if curproc is pageproc (FreeBSD's page daemon).
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (curproc == pageproc);
+}

 SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
    opensolaris_utsname_init, NULL);
@@ -306,6 +306,18 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
 		*(offset_t *)data = off;
 		return (0);
 	}
+	case ZFS_IOC_REWRITE: {
+		zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
+		if ((flag & FWRITE) == 0)
+			return (SET_ERROR(EBADF));
+		error = vn_lock(vp, LK_SHARED);
+		if (error)
+			return (error);
+		error = zfs_rewrite(VTOZ(vp), args->off, args->len,
+		    args->flags, args->arg);
+		VOP_UNLOCK(vp);
+		return (error);
+	}
 	}
 	return (SET_ERROR(ENOTTY));
 }
@@ -4228,17 +4240,46 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);

-		putpage_commit_arg_t *pca = kmem_alloc(
-		    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
-		    KM_SLEEP);
-		pca->pca_npages = ncount;
-		memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
+		if (commit) {
+			/*
+			 * Caller requested that we commit immediately. We set
+			 * a callback on the log entry, to be called once its
+			 * on disk after the call to zil_commit() below. The
+			 * pages will be undirtied and unbusied there.
+			 */
+			putpage_commit_arg_t *pca = kmem_alloc(
+			    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
+			    KM_SLEEP);
+			pca->pca_npages = ncount;
+			memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);

-		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp,
-		    off, len, commit, B_FALSE, zfs_putpage_commit_cb, pca);
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);

-		for (i = 0; i < ncount; i++)
-			rtvals[i] = zfs_vm_pagerret_pend;
+			for (i = 0; i < ncount; i++)
+				rtvals[i] = zfs_vm_pagerret_pend;
+		} else {
+			/*
+			 * Caller just wants the page written back somewhere,
+			 * but doesn't need it committed yet. We've already
+			 * written it back to the DMU, so we just need to put
+			 * it on the async log, then undirty the page and
+			 * return.
+			 *
+			 * We cannot use a callback here, because it would keep
+			 * the page busy (locked) until it is eventually
+			 * written down at txg sync.
+			 */
+			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
+			    B_FALSE, B_FALSE, NULL, NULL);
+
+			zfs_vmobject_wlock(object);
+			for (i = 0; i < ncount; i++) {
+				rtvals[i] = zfs_vm_pagerret_ok;
+				vm_page_undirty(ma[i]);
+			}
+			zfs_vmobject_wunlock(object);
+		}

 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
@@ -5201,6 +5242,11 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 			return (0);
 		}
 		return (EINVAL);
+#ifdef _PC_HAS_HIDDENSYSTEM
+	case _PC_HAS_HIDDENSYSTEM:
+		*ap->a_retval = 1;
+		return (0);
+#endif
 	default:
 		return (vop_stdpathconf(ap));
 	}
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_vnode = NULL;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;

 	return (0);
 }
@@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)

 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }


@@ -293,6 +288,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 	sharezp->z_atime_dirty = 0;
 	sharezp->z_zfsvfs = zfsvfs;
 	sharezp->z_is_sa = zfsvfs->z_use_sa;
+	sharezp->z_pflags = 0;

 	VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
 	    kcred, NULL, &acl_ids, NULL));
@@ -455,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	atomic_store_ptr(&zp->z_cached_symlink, NULL);

 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@@ -1729,6 +1723,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_is_sa = USE_SA(version, os);
+	rootzp->z_pflags = 0;

 	zfsvfs->z_os = os;
 	zfsvfs->z_parent = zfsvfs;
@@ -28,6 +28,7 @@
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/string.h>
+#include <sys/misc.h>

 /*
 * Thread interfaces
@@ -197,3 +198,14 @@ issig(void)
 }

 EXPORT_SYMBOL(issig);
+
+/*
+ * Check if the current thread is a memory reclaim thread.
+ * Returns true if current thread is kswapd.
+ */
+int
+current_is_reclaim_thread(void)
+{
+	return (current_is_kswapd());
+}
+EXPORT_SYMBOL(current_is_reclaim_thread);
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	zp->z_pflags = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;
 	ip->i_generation = 0;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRWXUGO);
@@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
 	return (error);
 }

+/*
+ * Dentry and inode caches referenced by a task in a non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if the
+ * kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+	znode_t **zp_array, *zp;
+	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+	int objects = 0;
+	int i = 0, j = 0;
+
+	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+		if ((i++ > nr_to_scan) || (j >= max_array))
+			break;
+
+		ASSERT(list_link_active(&zp->z_link_node));
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+		/* Skip active znodes and .zfs entries */
+		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+			continue;
+
+		if (igrab(ZTOI(zp)) == NULL)
+			continue;
+
+		zp_array[j] = zp;
+		j++;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	for (i = 0; i < j; i++) {
+		zp = zp_array[i];
+
+		ASSERT3P(zp, !=, NULL);
+		d_prune_aliases(ZTOI(zp));
+
+		if (atomic_read(&ZTOI(zp)->i_count) == 1)
+			objects++;
+
+		zrele(zp);
+	}
+
+	vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+	return (objects);
+}
+
 /*
 * The ARC has requested that the filesystem drop entries from the dentry
 * and inode caches.  This can occur when the ARC needs to free meta data
@@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
 #endif

+	/*
+	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
+	 * due to dentry and inode caches being referenced by a task running
+	 * in non-root memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
 	zfs_exit(zfsvfs, FTAG);

 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
@@ -25,6 +25,7 @@
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2025, Klara, Inc.
 */

 /* Portions Copyright 2007 Jeremy Teo */
@@ -3682,7 +3683,7 @@ top:
 }

 static void
-zfs_putpage_sync_commit_cb(void *arg)
+zfs_putpage_commit_cb(void *arg)
 {
 	struct page *pp = arg;

@@ -3690,17 +3691,6 @@ zfs_putpage_sync_commit_cb(void *arg)
 	end_page_writeback(pp);
 }

-static void
-zfs_putpage_async_commit_cb(void *arg)
-{
-	struct page *pp = arg;
-	znode_t *zp = ITOZ(pp->mapping->host);
-
-	ClearPageError(pp);
-	end_page_writeback(pp);
-	atomic_dec_32(&zp->z_async_writes_cnt);
-}
-
 /*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
@@ -3818,15 +3808,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 		zfs_rangelock_exit(lr);

 		if (wbc->sync_mode != WB_SYNC_NONE) {
-			/*
-			 * Speed up any non-sync page writebacks since
-			 * they may take several seconds to complete.
-			 * Refer to the comment in zpl_fsync() for details.
-			 */
-			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-				zil_commit(zfsvfs->z_log, zp->z_id);
-			}
-
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3852,8 +3833,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
-	if (!for_sync)
-		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);

@@ -3872,8 +3851,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 #endif
 		ClearPageError(pp);
 		end_page_writeback(pp);
-		if (!for_sync)
-			atomic_dec_32(&zp->z_async_writes_cnt);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (err);
@@ -3899,35 +3876,61 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,

 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

-	boolean_t commit = B_FALSE;
-	if (wbc->sync_mode != WB_SYNC_NONE) {
-		/*
-		 * Note that this is rarely called under writepages(), because
-		 * writepages() normally handles the entire commit for
-		 * performance reasons.
-		 */
-		commit = B_TRUE;
-	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
-		/*
-		 * If the caller does not intend to wait synchronously
-		 * for this page writeback to complete and there are active
-		 * synchronous calls on this file, do a commit so that
-		 * the latter don't accidentally end up waiting for
-		 * our writeback to complete. Refer to the comment in
-		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
-		 */
-		commit = B_TRUE;
-	}
+	/*
+	 * A note about for_sync vs wbc->sync_mode.
+	 *
+	 * for_sync indicates that this is a syncing writeback, that is, kernel
+	 * caller expects the data to be durably stored before being notified.
+	 * Often, but not always, the call was triggered by a userspace syncing
+	 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
+	 * means that the page should remain "locked" (in the writeback state)
+	 * until it is definitely on disk (ie zil_commit() or spa_sync()).
+	 * Otherwise, we can unlock and return as soon as it is on the
+	 * in-memory ZIL.
+	 *
+	 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
+	 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
+	 * indicates this is a regular async writeback (eg a cache eviction) and
+	 * so does not need a durability guarantee, while WB_SYNC_ALL indicates
+	 * a syncing op that must be waited on (by convention, we test for
+	 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
+	 * performance should there ever be a new mode that we have not yet
+	 * added support for).
+	 *
+	 * So, why a separate for_sync field? This is because zpl_writepages()
+	 * calls zfs_putpage() multiple times for a single "logical" operation.
+	 * It wants all the individual pages to be for_sync==TRUE ie only
+	 * unlocked once durably stored, but it only wants one call to
+	 * zil_commit() at the very end, once all the pages are synced. So,
+	 * it repurposes sync_mode slightly to indicate who issues and waits for
+	 * the IO: for NONE, the caller to zfs_putpage() will do it, while for
+	 * ALL, zfs_putpage should do it.
+	 *
+	 * Summary:
+	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
+	 *   sync_mode: NONE=caller will commit; ALL=we will commit
+	 */
+	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);

-	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
-	    B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
-	    zfs_putpage_async_commit_cb, pp);
+	/*
+	 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
+	 * because it is a policy flag that indicates "someone will call
+	 * zil_commit() soon". for_sync=TRUE means exactly that; the only
+	 * question is whether it will be us, or zpl_writepages().
+	 */
+	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
+	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
+
+	if (!for_sync) {
+		ClearPageError(pp);
+		end_page_writeback(pp);
+	}

 	dmu_tx_commit(tx);

 	zfs_rangelock_exit(lr);

-	if (commit)
+	if (need_commit)
 		zil_commit(zfsvfs->z_log, zp->z_id);

 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;

 	return (0);
 }
@@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	ASSERT3P(zp->z_dirlocks, ==, NULL);
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
-
-	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
-	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }

 static int
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 	return (0);
 }

+void
+zfs_inode_free(struct inode *ip)
+{
+	kmem_cache_free(znode_cache, ITOZ(ip));
+}
+
 /*
 * Called in multiple places when an inode should be destroyed.
 */
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
-
-	kmem_cache_free(znode_cache, zp);
+#ifndef HAVE_SOPS_FREE_INODE
+	/*
+	 * The inode needs to be freed in an RCU callback.  If we have
+	 * super_operations->free_inode, the Linux kernel will do call_rcu
+	 * for us.  But if we don't have it, since call_rcu is a GPL-only
+	 * symbol, we can only free synchronously and accept the risk.
+	 */
+	zfs_inode_free(ip);
+#endif
 }

 static void
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_sync_writes_cnt = 0;
-	zp->z_async_writes_cnt = 0;

 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

@@ -36,10 +36,7 @@
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_project.h>
-#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
-    defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
-#include <linux/pagemap.h>
-#endif
+#include <linux/pagemap_compat.h>
 #include <linux/fadvise.h>
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 #include <linux/writeback.h>
@@ -114,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	znode_t *zp = ITOZ(inode);
-	zfsvfs_t *zfsvfs = ITOZSB(inode);
 	cred_t *cr = CRED();
 	int error;
 	fstrans_cookie_t cookie;

-	/*
-	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
-	 * tandem so that sync writes can detect if there are any non-sync
-	 * writes going on and vice-versa. The "vice-versa" part to this logic
-	 * is located in zfs_putpage() where non-sync writes check if there are
-	 * any ongoing sync writes. If any sync and non-sync writes overlap,
-	 * we do a commit to complete the non-sync writes since the latter can
-	 * potentially take several seconds to complete and thus block sync
-	 * writes in the upcoming call to filemap_write_and_wait_range().
-	 */
-	atomic_inc_32(&zp->z_sync_writes_cnt);
-	/*
-	 * If the following check does not detect an overlapping non-sync write
-	 * (say because it's just about to start), then it is guaranteed that
-	 * the non-sync write will detect this sync write. This is because we
-	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
-	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
-	 * zfs_putpage() respectively.
-	 */
-	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
-		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
-			atomic_dec_32(&zp->z_sync_writes_cnt);
-			return (error);
-		}
-		zil_commit(zfsvfs->z_log, zp->z_id);
-		zpl_exit(zfsvfs, FTAG);
-	}
-
 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-
-	/*
-	 * The sync write is not complete yet but we decrement
-	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
-	 * it internally. If a non-sync write starts just after the decrement
-	 * operation but before we call zfs_fsync(), it may not detect this
-	 * overlapping sync write but it does not matter since we have already
-	 * gone past filemap_write_and_wait_range() and we won't block due to
-	 * the non-sync write.
-	 */
-	atomic_dec_32(&zp->z_sync_writes_cnt);
-
 	if (error)
 		return (error);

@@ -555,6 +511,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return (result);
 }

+#ifdef HAVE_VFS_WRITEPAGE
 /*
 * Write out dirty pages to the ARC, this function is only required to
 * support mmap(2).  Mapped pages may be dirtied by memory operations
@@ -571,6 +528,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)

 	return (zpl_putpage(pp, wbc, &for_sync));
 }
+#endif

 /*
 * The flag combination which matches the behavior of zfs_space() is
@@ -985,6 +943,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
 	return (err);
 }

+static int
+zpl_ioctl_rewrite(struct file *filp, void __user *arg)
+{
+	struct inode *ip = file_inode(filp);
+	zfs_rewrite_args_t args;
+	fstrans_cookie_t cookie;
+	int err;
+
+	if (copy_from_user(&args, arg, sizeof (args)))
+		return (-EFAULT);
+
+	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
+		return (-EBADF);
+
+	cookie = spl_fstrans_mark();
+	err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
+	spl_fstrans_unmark(cookie);
+
+	return (err);
+}
+
 static long
 zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
@@ -1003,6 +982,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return (zpl_ioctl_getdosflags(filp, (void *)arg));
 	case ZFS_IOC_SETDOSFLAGS:
 		return (zpl_ioctl_setdosflags(filp, (void *)arg));
+	case ZFS_IOC_REWRITE:
+		return (zpl_ioctl_rewrite(filp, (void *)arg));
 	default:
 		return (-ENOTTY);
 	}
@@ -1040,7 +1021,9 @@ const struct address_space_operations zpl_address_space_operations = {
 #else
 	.readpage	= zpl_readpage,
 #endif
+#ifdef HAVE_VFS_WRITEPAGE
 	.writepage	= zpl_writepage,
+#endif
 	.writepages	= zpl_writepages,
 	.direct_IO	= zpl_direct_IO,
 #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
@@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb)
 	return (ip);
 }

+#ifdef HAVE_SOPS_FREE_INODE
+static void
+zpl_inode_free(struct inode *ip)
+{
+	ASSERT(atomic_read(&ip->i_count) == 0);
+	zfs_inode_free(ip);
+}
+#endif
+
 static void
 zpl_inode_destroy(struct inode *ip)
 {
@@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)

 const struct super_operations zpl_super_operations = {
 	.alloc_inode		= zpl_inode_alloc,
+#ifdef HAVE_SOPS_FREE_INODE
+	.free_inode		= zpl_inode_free,
+#endif
 	.destroy_inode		= zpl_inode_destroy,
 	.dirty_inode		= zpl_dirty_inode,
 	.write_inode		= NULL,
@@ -558,8 +558,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 #ifdef HAVE_BLK_MQ_RQ_HCTX
 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
 #else
-		blk_mq_hw_queue =
-		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+		blk_mq_hw_queue = rq->q->queue_hw_ctx[
+		    rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
 #endif
 	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 	    blk_mq_hw_queue);
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, -space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4490,7 +4490,7 @@ arc_evict(void)
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
 	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
 	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
 	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
+	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
 	    zfs_max_recordsize;
+	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
+	    arc_dnode_limit;

 	/* Always allow at least one block of overflow. */
-	if (over < 0)
+	if (arc_over < 0 && dn_over <= 0)
 		return (ARC_OVF_NONE);

 	/* If we are under memory pressure, report severe overflow. */
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
 	if (use_reserve)
 		overflow *= 3;
-	return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }

 static abd_t *
@@ -6627,27 +6629,11 @@ arc_release(arc_buf_t *buf, const void *tag)
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
+	ASSERT3P(state, !=, arc_l2c_only);

 	/* this buffer is not on any list */
 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);

-	if (HDR_HAS_L2HDR(hdr)) {
-		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-
-		/*
-		 * We have to recheck this conditional again now that
-		 * we're holding the l2ad_mtx to prevent a race with
-		 * another thread which might be concurrently calling
-		 * l2arc_evict(). In that case, l2arc_evict() might have
-		 * destroyed the header's L2 portion as we were waiting
-		 * to acquire the l2ad_mtx.
-		 */
-		if (HDR_HAS_L2HDR(hdr))
-			arc_hdr_l2hdr_destroy(hdr);
-
-		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
-	}
-
 	/*
 	 * Do we have more than one buf?
 	 */
@@ -6659,10 +6645,6 @@ arc_release(arc_buf_t *buf, const void *tag)
 		boolean_t protected = HDR_PROTECTED(hdr);
 		enum zio_compress compress = arc_hdr_get_compress(hdr);
 		arc_buf_contents_t type = arc_buf_type(hdr);
-		VERIFY3U(hdr->b_type, ==, type);
-
-		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
-		VERIFY3S(remove_reference(hdr, tag), >, 0);

 		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
@@ -6670,10 +6652,10 @@ arc_release(arc_buf_t *buf, const void *tag)
 		}

 		/*
-		 * Pull the data off of this hdr and attach it to
-		 * a new anonymous hdr. Also find the last buffer
+		 * Pull the buffer off of this hdr and find the last buffer
 		 * in the hdr's buffer list.
 		 */
+		VERIFY3S(remove_reference(hdr, tag), >, 0);
 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 		ASSERT3P(lastbuf, !=, NULL);

@@ -6682,7 +6664,6 @@ arc_release(arc_buf_t *buf, const void *tag)
 		 * buffer, then we must stop sharing that block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
-			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(!arc_buf_is_shared(lastbuf));

 			/*
@@ -6704,7 +6685,6 @@ arc_release(arc_buf_t *buf, const void *tag)
 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
 				    buf->b_data, psize);
 			}
-			VERIFY3P(lastbuf->b_data, !=, NULL);
 		} else if (HDR_SHARED_DATA(hdr)) {
 			/*
 			 * Uncompressed shared buffers are always at the end
@@ -6720,18 +6700,10 @@ arc_release(arc_buf_t *buf, const void *tag)
 		}

 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
-		ASSERT3P(state, !=, arc_l2c_only);

 		(void) zfs_refcount_remove_many(&state->arcs_size[type],
 		    arc_buf_size(buf), buf);

-		if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
-			ASSERT3P(state, !=, arc_l2c_only);
-			(void) zfs_refcount_remove_many(
-			    &state->arcs_esize[type],
-			    arc_buf_size(buf), buf);
-		}
-
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);

@@ -6759,6 +6731,15 @@ arc_release(arc_buf_t *buf, const void *tag)
 		/* protected by hash lock, or hdr is on arc_anon */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+		if (HDR_HAS_L2HDR(hdr)) {
+			mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+			/* Recheck to prevent race with l2arc_evict(). */
+			if (HDR_HAS_L2HDR(hdr))
+				arc_hdr_l2hdr_destroy(hdr);
+			mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+		}
+
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
@@ -7086,6 +7067,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
 			localprop.zp_nopwrite = B_FALSE;
 			localprop.zp_copies =
 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
+			localprop.zp_gang_copies =
+			    MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
 		}
 		zio_flags |= ZIO_FLAG_RAW;
 	} else if (ARC_BUF_COMPRESSED(buf)) {
@@ -7343,7 +7326,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 #if defined(COMPAT_FREEBSD11)
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
-	    wmsum_value(&arc_sums.arcstat_dnode_size) +
+	    aggsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif

@@ -7385,7 +7368,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 	    &as->arcstat_uncached_evictable_metadata);

 	as->arcstat_dnode_size.value.ui64 =
-	    wmsum_value(&arc_sums.arcstat_dnode_size);
+	    aggsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
 	as->arcstat_l2_hits.value.ui64 =
@@ -7755,7 +7738,7 @@ arc_state_init(void)
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
+	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7914,7 +7897,7 @@ arc_state_fini(void)
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
-	wmsum_fini(&arc_sums.arcstat_dnode_size);
+	aggsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
 	 * and grabbing the lock results in massive lock contention.
 	 */
 	if (size > dbuf_cache_target_bytes()) {
-		if (size > dbuf_cache_hiwater_bytes())
+		/*
+		 * Avoid calling dbuf_evict_one() from memory reclaim context
+		 * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
+		 * Memory reclaim threads can get stuck waiting for the dbuf
+		 * hash lock.
+		 */
+		if (size > dbuf_cache_hiwater_bytes() &&
+		    !current_is_reclaim_thread()) {
 			dbuf_evict_one();
+		}
 		cv_signal(&dbuf_evict_cv);
 	}
 }
@@ -1185,16 +1193,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
-			/*
-			 * dnode_grow_indblksz() can make this fail if we don't
-			 * have the parent's rwlock.  XXX indblksz no longer
-			 * grows.  safe to do this now?
-			 */
-			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
-				ASSERT3P(db->db_blkptr, ==,
-				    ((blkptr_t *)db->db_parent->db.db_data +
-				    db->db_blkid % epb));
-			}
+			ASSERT3P(db->db_blkptr, ==,
+			    ((blkptr_t *)db->db_parent->db.db_data +
+			    db->db_blkid % epb));
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
@@ -3391,12 +3392,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 			*parentp = NULL;
 			return (err);
 		}
-		rw_enter(&(*parentp)->db_rwlock, RW_READER);
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
-		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
-			ASSERT(BP_IS_HOLE(*bpp));
-		rw_exit(&(*parentp)->db_rwlock);
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
@@ -5375,8 +5372,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
-		    dr->dt.dl.dr_brtwrite);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
+		    dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (data == NULL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
@@ -250,11 +250,6 @@ static uint32_t zfs_ddt_prunes_per_txg = 50000;
 boolean_t ddt_prune_artificial_age = B_FALSE;
 boolean_t ddt_dump_prune_histogram = B_FALSE;

-/*
- * Don't do more than this many incremental flush passes per txg.
- */
-uint_t zfs_dedup_log_flush_passes_max = 8;
-
 /*
 * Minimum time to flush per txg.
 */
@@ -263,7 +258,32 @@ uint_t zfs_dedup_log_flush_min_time_ms = 1000;
 /*
 * Minimum entries to flush per txg.
 */
-uint_t zfs_dedup_log_flush_entries_min = 1000;
+uint_t zfs_dedup_log_flush_entries_min = 200;
+
+/*
+ * Target number of TXGs until the whole dedup log has been flushed.
+ * The log size will float around this value times the ingest rate.
+ */
+uint_t zfs_dedup_log_flush_txgs = 100;
+
+/*
+ * Maximum entries to flush per txg. Used for testing the dedup log.
+ */
+uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
+
+/*
+ * Soft cap for the size of the current dedup log. If the log is larger
+ * than this size, we slightly increase the aggressiveness of the flushing to
+ * try to bring it back down to the soft cap.
+ */
+uint_t zfs_dedup_log_cap = UINT_MAX;
+
+/*
+ * If this is set to B_TRUE, the cap above acts more like a hard cap:
+ * flushing is significantly more aggressive, increasing the minimum amount we
+ * flush per txg, as well as the maximum.
+ */
+boolean_t zfs_dedup_log_hard_cap = B_FALSE;

 /*
 * Number of txgs to average flow rates across.
@@ -1600,6 +1620,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 	ddt->ddt_spa = spa;
 	ddt->ddt_os = spa->spa_meta_objset;
 	ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
+	ddt->ddt_log_flush_pressure = 10;

 	ddt_log_alloc(ddt);
 	ddt_table_alloc_kstats(ddt);
@@ -2013,146 +2034,6 @@ _ewma(int32_t val, int32_t prev, uint32_t weight)
 	return (new);
 }

-/* Returns true if done for this txg */
-static boolean_t
-ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
-{
-	if (ddt->ddt_flush_pass == 0) {
-		if (spa_sync_pass(ddt->ddt_spa) == 1) {
-			/* First run this txg, get set up */
-			ddt->ddt_flush_start = gethrtime();
-			ddt->ddt_flush_count = 0;
-
-			/*
-			 * How many entries we need to flush. We want to at
-			 * least match the ingest rate.
-			 */
-			ddt->ddt_flush_min = MAX(
-			    ddt->ddt_log_ingest_rate,
-			    zfs_dedup_log_flush_entries_min);
-
-			/*
-			 * If we've been asked to flush everything in a hurry,
-			 * try to dump as much as possible on this txg. In
-			 * this case we're only limited by time, not amount.
-			 */
-			if (ddt->ddt_flush_force_txg > 0)
-				ddt->ddt_flush_min =
-				    MAX(ddt->ddt_flush_min, avl_numnodes(
-				    &ddt->ddt_log_flushing->ddl_tree));
-		} else {
-			/* We already decided we're done for this txg */
-			return (B_FALSE);
-		}
-	} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
-		/*
-		 * We already did some flushing on this pass, skip it. This
-		 * happens when dsl_process_async_destroys() runs during a scan
-		 * (on pass 1) and does an additional ddt_sync() to update
-		 * freed blocks.
-		 */
-		return (B_FALSE);
-	}
-
-	if (spa_sync_pass(ddt->ddt_spa) >
-	    MAX(zfs_dedup_log_flush_passes_max, 1)) {
-		/* Too many passes this txg, defer until next. */
-		ddt->ddt_flush_pass = 0;
-		return (B_TRUE);
-	}
-
-	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
-		/* Nothing to flush, done for this txg. */
-		ddt->ddt_flush_pass = 0;
-		return (B_TRUE);
-	}
-
-	uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
-	    MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
-	    SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);
-
-	uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;
-
-	if (elapsed_time >= target_time) {
-		/* Too long since we started, done for this txg. */
-		ddt->ddt_flush_pass = 0;
-		return (B_TRUE);
-	}
-
-	ddt->ddt_flush_pass++;
-	ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);
-
-	/*
-	 * Estimate how much time we'll need to flush the remaining entries
-	 * based on how long it normally takes.
-	 */
-	uint32_t want_time;
-	if (ddt->ddt_flush_pass == 1) {
-		/* First pass, use the average time/entries */
-		if (ddt->ddt_log_flush_rate == 0)
-			/* Zero rate, just assume the whole time */
-			want_time = target_time;
-		else
-			want_time = ddt->ddt_flush_min *
-			    ddt->ddt_log_flush_time_rate /
-			    ddt->ddt_log_flush_rate;
-	} else {
-		/* Later pass, calculate from this txg so far */
-		want_time = ddt->ddt_flush_min *
-		    elapsed_time / ddt->ddt_flush_count;
-	}
-
-	/* Figure out how much time we have left */
-	uint32_t remain_time = target_time - elapsed_time;
-
-	/* Smear the remaining entries over the remaining passes. */
-	uint32_t nentries = ddt->ddt_flush_min /
-	    (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
-	if (want_time > remain_time) {
-		/*
-		 * We're behind; try to catch up a bit by doubling the amount
-		 * this pass. If we're behind that means we're in a later
-		 * pass and likely have most of the remaining time to
-		 * ourselves. If we're in the last couple of passes, then
-		 * doubling might just take us over the timeout, but probably
-		 * not be much, and it stops us falling behind. If we're
-		 * in the middle passes, there'll be more to do, but it
-		 * might just help us catch up a bit and we'll recalculate on
-		 * the next pass anyway.
-		 */
-		nentries = MIN(ddt->ddt_flush_min, nentries*2);
-	}
-
-	ddt_lightweight_entry_t ddlwe;
-	uint32_t count = 0;
-	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
-		ddt_sync_flush_entry(ddt, &ddlwe,
-		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
-
-		/* End this pass if we've synced as much as we need to. */
-		if (++count >= nentries)
-			break;
-	}
-	ddt->ddt_flush_count += count;
-	ddt->ddt_flush_min -= count;
-
-	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
-		/* We emptied it, so truncate on-disk */
-		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
-		ddt_log_truncate(ddt, tx);
-		/* No more passes needed this txg */
-		ddt->ddt_flush_pass = 0;
-	} else {
-		/* More to do next time, save checkpoint */
-		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
-		ddt_log_checkpoint(ddt, &ddlwe, tx);
-	}
-
-	ddt_sync_update_stats(ddt, tx);
-
-	return (ddt->ddt_flush_pass == 0);
-}
-
 static inline void
 ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
 {
@@ -2190,19 +2071,135 @@ ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
 static void
 ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 {
+	spa_t *spa = ddt->ddt_spa;
 	ASSERT(avl_is_empty(&ddt->ddt_tree));

-	/* Don't do any flushing when the pool is ready to shut down */
-	if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
+	/*
+	 * Don't do any flushing when the pool is ready to shut down, or in
+	 * passes beyond the first.
+	 */
+	if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
 		return;

-	/* Try to flush some. */
-	if (!ddt_sync_flush_log_incremental(ddt, tx))
-		/* More to do next time */
-		return;
+	hrtime_t flush_start = gethrtime();
+	uint32_t count = 0;

-	/* No more flushing this txg, so we can do end-of-txg housekeeping */
+	/*
+	 * How many entries we need to flush. We need to at
+	 * least match the ingest rate, and also consider the
+	 * current backlog of entries.
+	 */
+	uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
+	    avl_numnodes(&ddt->ddt_log_active->ddl_tree);

+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
+		goto housekeeping;
+
+	uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
+	uint64_t cap = MAX(1, zfs_dedup_log_cap);
+	uint64_t flush_min = MAX(backlog / txgs,
+	    zfs_dedup_log_flush_entries_min);
+
+	/*
+	 * The theory for this block is that if we increase the pressure while
+	 * we're growing above the cap, and remove it when we're significantly
+	 * below the cap, we'll stay near cap while not bouncing around too
+	 * much.
+	 *
+	 * The factor of 10 is to smooth the pressure effect by expressing it
+	 * in tenths. The addition of the cap to the backlog in the second
+	 * block is to round up, instead of down. We never let the pressure go
+	 * below 1 (10 tenths).
+	 */
+	if (cap != UINT_MAX && backlog > cap &&
+	    backlog > ddt->ddt_log_flush_prev_backlog) {
+		ddt->ddt_log_flush_pressure += 10 * backlog / cap;
+	} else if (cap != UINT_MAX && backlog < cap) {
+		ddt->ddt_log_flush_pressure -=
+		    11 - (((10 * backlog) + cap - 1) / cap);
+		ddt->ddt_log_flush_pressure =
+		    MAX(ddt->ddt_log_flush_pressure, 10);
+	}
+
+	if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
+		flush_min = MAX(flush_min, MIN(backlog - cap,
+		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
+
+	uint64_t flush_max;
+
+	/*
+	 * If we've been asked to flush everything in a hurry,
+	 * try to dump as much as possible on this txg. In
+	 * this case we're only limited by time, not amount.
+	 *
+	 * Otherwise, if we are over the cap, try to get back down to it.
+	 *
+	 * Finally if there is no cap (or no pressure), just set the max a
+	 * little higher than the min to help smooth out variations in flush
+	 * times.
+	 */
+	if (ddt->ddt_flush_force_txg > 0)
+		flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
+	else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
+		flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
+		    (flush_min * ddt->ddt_log_flush_pressure) / 10));
+	else
+		flush_max = flush_min * 5 / 4;
+	flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
+
+	/*
+	 * When the pool is busy or someone is explicitly waiting for this txg
+	 * to complete, use the zfs_dedup_log_flush_min_time_ms.  Otherwise use
+	 * half of the time in the txg timeout.
+	 */
+	uint64_t target_time;
+
+	if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
+	    vdev_queue_pool_busy(spa)) {
+		target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
+		    SEC2NSEC(zfs_txg_timeout) / 2);
+	} else {
+		target_time = SEC2NSEC(zfs_txg_timeout) / 2;
+	}
+
+	ddt_lightweight_entry_t ddlwe;
+	while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
+		ddt_sync_flush_entry(ddt, &ddlwe,
+		    ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
+
+		/* End if we've synced as much as we needed to. */
+		if (++count >= flush_max)
+			break;
+
+		/*
+		 * As long as we've flushed the absolute minimum,
+		 * stop if we're way over our target time.
+		 */
+		uint64_t diff = gethrtime() - flush_start;
+		if (count > zfs_dedup_log_flush_entries_min &&
+		    diff >= target_time * 2)
+			break;
+
+		/*
+		 * End if we've passed the minimum flush and we're out of time.
+		 */
+		if (count > flush_min && diff >= target_time)
+			break;
+	}
+
+	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
+		/* We emptied it, so truncate on-disk */
+		DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
+		ddt_log_truncate(ddt, tx);
+	} else {
+		/* More to do next time, save checkpoint */
+		DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
+		ddt_log_checkpoint(ddt, &ddlwe, tx);
+	}
+
+	ddt_sync_update_stats(ddt, tx);
+
+housekeeping:
 	if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
 	    !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
 		/*
@@ -2219,12 +2216,13 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 	/* If force flush is no longer necessary, turn it off. */
 	ddt_flush_force_update_txg(ddt, 0);

+	ddt->ddt_log_flush_prev_backlog = backlog;
+
 	/*
-	 * Update flush rate. This is an exponential weighted moving average of
-	 * the number of entries flushed over recent txgs.
+	 * Update flush rate. This is an exponential weighted moving
+	 * average of the number of entries flushed over recent txgs.
 	 */
-	ddt->ddt_log_flush_rate = _ewma(
-	    ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
+	ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
 	    zfs_dedup_log_flush_flow_rate_txgs);
 	DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);

@@ -2232,12 +2230,21 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
 	 * Update flush time rate. This is an exponential weighted moving
 	 * average of the total time taken to flush over recent txgs.
 	 */
-	ddt->ddt_log_flush_time_rate = _ewma(
-	    ddt->ddt_log_flush_time_rate,
-	    ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
+	ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
+	    (int32_t)NSEC2MSEC(gethrtime() - flush_start),
 	    zfs_dedup_log_flush_flow_rate_txgs);
 	DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
 	    ddt->ddt_log_flush_time_rate);
+	if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
+	    zfs_flags & ZFS_DEBUG_DDT) {
+		zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ "
+		    "txg %llu, in %llu ms, flush rate %d, time rate %d",
+		    (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
+		    (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
+		    count, (u_longlong_t)tx->tx_txg,
+		    (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
+		    ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
+	}
 }

 static void
@@ -2785,14 +2792,23 @@ ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
 	"Enable prefetching dedup-ed blks");

-ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
-	"Max number of incremental dedup log flush passes per transaction");
-
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
 	"Min time to spend on incremental dedup log flush each transaction");

 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
 	"Min number of log entries to flush each transaction");

+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
+	"Max number of log entries to flush each transaction");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
+	"Number of TXGs to try to rotate the log in");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
+	"Soft cap for the size of the current dedup log");
+
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
+	"Whether to use the soft cap as a hard cap");
+
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
 	"Number of txgs to average flow rates across");
@@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+		dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;

 		/*
 		 * Old style holes are filled with all zeros, whereas
@@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
+	int gang_copies = os->os_copies;

 	/*
 	 * We maintain different write policies for each of the following
@@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 		switch (os->os_redundant_metadata) {
 		case ZFS_REDUNDANT_METADATA_ALL:
 			copies++;
+			gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_MOST:
 			if (level >= zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				copies++;
+			if (level + 1 >=
+			    zfs_redundant_metadata_most_ditto_level ||
+			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
+				gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_SOME:
-			if (DMU_OT_IS_CRITICAL(type))
+			if (DMU_OT_IS_CRITICAL(type, level)) {
 				copies++;
+				gang_copies++;
+			} else if (DMU_OT_IS_METADATA(type)) {
+				gang_copies++;
+			}
 			break;
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
@@ -2445,6 +2456,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+
+		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+		    (os->os_redundant_metadata ==
+		    ZFS_REDUNDANT_METADATA_MOST &&
+		    zfs_redundant_metadata_most_ditto_level <= 1))
+			gang_copies++;
 	}

 	/*
@@ -2461,6 +2478,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)

 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+			gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
@@ -2478,6 +2496,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+	zp->zp_gang_copies = MIN(MAX(gang_copies, copies),
+	    spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
@@ -2310,6 +2310,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
 					zp.zp_nopwrite = B_FALSE;
 					zp.zp_copies = MIN(zp.zp_copies,
 					    SPA_DVAS_PER_BP - 1);
+					zp.zp_gang_copies =
+					    MIN(zp.zp_gang_copies,
+					    SPA_DVAS_PER_BP - 1);
 				}
 				zio_flags |= ZIO_FLAG_RAW;
 			} else if (DRR_WRITE_COMPRESSED(drrw)) {
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */

+/* Range tree label for a dnode: pool name, objset, object and name. */
+static char *
+rt_name(dnode_t *dn, const char *name)
+{
+	struct objset *os = dn->dn_objset;
+	uint64_t dsobj = (os->os_dsl_dataset != NULL) ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET;
+
+	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
+	    spa_name(os->os_spa), (u_longlong_t)dsobj,
+	    (u_longlong_t)dn->dn_object, name));
+}
+
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
@@ -2436,8 +2449,10 @@ done:
 	{
 		int txgoff = tx->tx_txg & TXG_MASK;
 		if (dn->dn_free_ranges[txgoff] == NULL) {
-			dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
-			    ZFS_RANGE_SEG64, NULL, 0, 0);
+			dn->dn_free_ranges[txgoff] =
+			    zfs_range_tree_create_flags(
+			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+			    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
 		}
 		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 		zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
@@ -2559,6 +2574,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		error = 0;
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
+		if (dn->dn_dbuf != NULL)
+			rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
+		else if (dmu_objset_ds(dn->dn_objset) != NULL)
+			rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
+			    RW_READER, FTAG);
 	} else {
 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
@@ -2663,6 +2683,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	if (db != NULL) {
 		rw_exit(&db->db_rwlock);
 		dbuf_rele(db, FTAG);
+	} else {
+		if (dn->dn_dbuf != NULL)
+			rw_exit(&dn->dn_dbuf->db_rwlock);
+		else if (dmu_objset_ds(dn->dn_objset) != NULL)
+			rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
+			    FTAG);
 	}

 	return (error);
@@ -235,6 +235,9 @@ static uint_t zfs_resilver_defer_percent = 10;
 #define	DSL_SCAN_IS_SCRUB(scn)		\
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB)

+#define	DSL_SCAN_IS_RESILVER(scn) \
+	((scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
 /*
 * Enable/disable the processing of the free_bpobj object.
 */
@@ -1169,7 +1172,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);

-			if (scn->scn_phys.scn_min_txg) {
+			if (DSL_SCAN_IS_RESILVER(scn)) {
 				nvlist_t *aux = fnvlist_alloc();
 				fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 				    "healing");
@@ -370,6 +370,16 @@ static metaslab_stats_t metaslab_stats = {
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);

+/* Format a range tree label naming the pool, vdev guid and metaslab id. */
+char *
+metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
+{
+	vdev_t *vd = mg->mg_vd;
+
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
+	    spa_name(vd->vdev_spa), (u_longlong_t)vd->vdev_guid,
+	    (u_longlong_t)ms->ms_id, name));
+}

 static kstat_t *metaslab_ksp;

@@ -969,14 +979,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 	}
@@ -2755,30 +2767,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);

-	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
-	    shift);
+	ms->ms_allocatable = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
 	for (int t = 0; t < TXG_SIZE; t++) {
-		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
-		    NULL, start, shift);
+		ms->ms_allocating[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME,
+		    metaslab_rt_name(mg, ms, "ms_allocating"));
 	}
-	ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_freeing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
+	ms->ms_freed = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
-		    start, shift);
+		ms->ms_defer[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
 	}
-	ms->ms_checkpointing =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_unflushed_allocs =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_checkpointing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
+	ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));

 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
-	ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
-	    type, mrap, start, shift);
+	ms->ms_unflushed_frees = zfs_range_tree_create_flags(
+	    &metaslab_rt_ops, type, mrap, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));

-	ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_trim = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));

 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
@@ -3752,7 +3777,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);

-	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	condense_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "condense_tree"));

 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
@@ -3809,8 +3837,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
-	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
-	    start, shift);
+	zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
 	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
@@ -5073,29 +5103,16 @@ next:

 		/*
 		 * We were unable to allocate from this metaslab so determine
-		 * a new weight for this metaslab. Now that we have loaded
-		 * the metaslab we can provide a better hint to the metaslab
-		 * selector.
-		 *
-		 * For space-based metaslabs, we use the maximum block size.
-		 * This information is only available when the metaslab
-		 * is loaded and is more accurate than the generic free
-		 * space weight that was calculated by metaslab_weight().
-		 * This information allows us to quickly compare the maximum
-		 * available allocation in the metaslab to the allocation
-		 * size being requested.
-		 *
-		 * For segment-based metaslabs, determine the new weight
-		 * based on the highest bucket in the range tree. We
-		 * explicitly use the loaded segment weight (i.e. the range
-		 * tree histogram) since it contains the space that is
-		 * currently available for allocation and is accurate
-		 * even within a sync pass.
+		 * a new weight for this metaslab. The weight was last
+		 * recalculated either when we loaded it (if this is the first
+		 * TXG it's been loaded in), or the last time a txg was synced
+		 * out.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
-			weight = metaslab_largest_allocatable(msp);
-			WEIGHT_SET_SPACEBASED(weight);
+			metaslab_set_fragmentation(msp, B_TRUE);
+			weight = metaslab_space_weight(msp) &
+			    ~METASLAB_ACTIVE_MASK;
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
@@ -5107,13 +5124,6 @@ next:
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
-			 *
-			 * Note that we could attempt to use something like
-			 * metaslab_recalculate_weight_and_sort() that
-			 * retains the activation mask here. That function
-			 * uses metaslab_weight() to set the weight though
-			 * which is not as accurate as the calculations
-			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);
@@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
 	ml->ml_num_sublists = num;
 	ml->ml_index_func = index_func;

-	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) *
 	    ml->ml_num_sublists, KM_SLEEP);

 	ASSERT3P(ml->ml_sublists, !=, NULL);
@@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml)
 	}

 	ASSERT3P(ml->ml_sublists, !=, NULL);
-	kmem_free(ml->ml_sublists,
+	vmem_free(ml->ml_sublists,
 	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);

 	ml->ml_num_sublists = 0;
@@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

-zfs_range_tree_t *
-zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+static zfs_range_tree_t *
+zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
-    uint64_t gap)
+    uint64_t gap, uint64_t flags, const char *name)
 {
 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

@@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,

 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
+	rt->rt_flags = flags;
+	rt->rt_name = name;
 	rt->rt_arg = arg;
 	rt->rt_type = type;
 	rt->rt_start = start;
@@ -247,11 +249,30 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
 	return (rt);
 }

+zfs_range_tree_t *
+zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t gap)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
+	    0, NULL));
+}
+
 zfs_range_tree_t *
 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
 {
-	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    0, NULL));
+}
+
+zfs_range_tree_t *
+zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t flags, const char *name)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    flags, name));
 }

 void
@@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt)
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

+	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
+		kmem_strfree((char *)(uintptr_t)rt->rt_name);
+
 	zfs_btree_destroy(&rt->rt_root);
 	kmem_free(rt, sizeof (*rt));
 }
@@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
 {
 	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to decrease fill to or "
-		    "below 0; probable double remove in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
+		    "or below 0; probable double remove in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
 	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
 	    zfs_rs_get_start(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to increase fill beyond "
-		    "max; probable double add in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
+		    "beyond max; probable double add in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
@@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
-		if (gap == 0) {
-			zfs_panic_recover("zfs: adding existent segment to "
-			    "range tree (offset=%llx size=%llx)",
-			    (longlong_t)start, (longlong_t)size);
-			return;
-		}
 		uint64_t rstart = zfs_rs_get_start(rs, rt);
 		uint64_t rend = zfs_rs_get_end(rs, rt);
+		if (gap == 0) {
+			zfs_panic_recover("zfs: rt=%s: adding segment "
+			    "(offset=%llx size=%llx) overlapping with existing "
+			    "one (offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
+			return;
+		}
 		if (rstart <= start && rend >= end) {
 			zfs_range_tree_adjust_fill(rt, rs, fill);
 			return;
@@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
+	uint64_t rstart, rend;
 	boolean_t left_over, right_over;

 	VERIFY3U(size, !=, 0);
@@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,

 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
-		zfs_panic_recover("zfs: removing nonexistent segment from "
-		    "range tree (offset=%llx size=%llx)",
-		    (longlong_t)start, (longlong_t)size);
+		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
+		    "from range tree (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
 		return;
 	}

+	rstart = zfs_rs_get_start(rs, rt);
+	rend = zfs_rs_get_end(rs, rt);
+
 	/*
 	 * Range trees with gap support must only remove complete segments
 	 * from the tree. This allows us to maintain accurate fill accounting
@@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
 			if (zfs_rs_get_fill(rs, rt) == size) {
-				start = zfs_rs_get_start(rs, rt);
-				end = zfs_rs_get_end(rs, rt);
+				start = rstart;
+				end = rend;
 				size = end - start;
 			} else {
 				zfs_range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
-		} else if (zfs_rs_get_start(rs, rt) != start ||
-		    zfs_rs_get_end(rs, rt) != end) {
-			zfs_panic_recover("zfs: freeing partial segment of "
-			    "gap tree (offset=%llx size=%llx) of "
+		} else if (rstart != start || rend != end) {
+			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
+			    "of gap tree (offset=%llx size=%llx) of "
 			    "(offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
 			    (longlong_t)start, (longlong_t)size,
-			    (longlong_t)zfs_rs_get_start(rs, rt),
-			    (longlong_t)zfs_rs_get_end(rs, rt) -
-			    zfs_rs_get_start(rs, rt));
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
 			return;
 		}
 	}

-	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
-	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
+	/* Use the recoverable panic so the return below is reachable. */
+	if (!(rstart <= start && rend >= end)) {
+		zfs_panic_recover("zfs: rt=%s: removing segment "
+		    "(offset=%llx size=%llx) not completely overlapped by "
+		    "existing one (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rstart, (longlong_t)(rend - rstart));
+		return;
+	}

-	left_over = (zfs_rs_get_start(rs, rt) != start);
-	right_over = (zfs_rs_get_end(rs, rt) != end);
+	left_over = (rstart != start);
+	right_over = (rend != end);

 	zfs_range_tree_stat_decr(rt, rs);

@@ -1231,29 +1231,14 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
-			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly less important
 			 * priority than the other taskqs.
-			 *
-			 * Under Linux and FreeBSD this means incrementing
-			 * the priority value as opposed to platforms like
-			 * illumos where it should be decremented.
-			 *
-			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
-			 * are equal then a difference between them is
-			 * insignificant.
 			 */
-			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
-#if defined(__linux__)
-				pri++;
-#elif defined(__FreeBSD__)
-				pri += 4;
-#else
-#error "unknown OS"
-#endif
-			}
+			const pri_t pri = (t == ZIO_TYPE_WRITE &&
+			    q == ZIO_TASKQ_ISSUE) ?
+			    wtqclsyspri : maxclsyspri;
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef HAVE_SYSDC
@@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }

+/* Format a range tree label naming the pool and the vdev guid. */
+char *
+vdev_rt_name(vdev_t *vd, const char *name)
+{
+	const char *pool = spa_name(vd->vdev_spa);
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
+	    pool, (u_longlong_t)vd->vdev_guid, name));
+}
+
+/* Format a range tree label for one DTL: pool, vdev guid, name[type]. */
+static char *
+vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
+{
+	const char *pool = spa_name(vd->vdev_spa);
+
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
+	    pool, (u_longlong_t)vd->vdev_guid, name, dtl_type));
+}
+
 /*
 * Virtual device management.
 */
@@ -540,6 +559,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)

 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
+	pvd->vdev_nonrot &= cvd->vdev_nonrot;

 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
@@ -678,8 +698,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)

 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
-	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
-	    ZFS_RANGE_SEG64, NULL, 0, 0);
+	vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));

 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
@@ -733,8 +754,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);

 	for (int t = 0; t < DTL_TYPES; t++) {
-		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		vd->vdev_dtl[t] = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
 	}

 	txg_list_create(&vd->vdev_ms_list, spa,
@@ -1361,6 +1383,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
+	mvd->vdev_nonrot = cvd->vdev_nonrot;

 	vdev_remove_child(pvd, cvd);
 	vdev_add_child(pvd, mvd);
@@ -1567,6 +1590,18 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;

+	/*
+	 * Weighting algorithms can depend on the number of metaslabs in the
+	 * vdev. In order to ensure that all weights are correct at all times,
+	 * we need to recalculate here.
+	 */
+	for (uint64_t m = 0; m < oldc; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+		mutex_enter(&msp->ms_lock);
+		metaslab_recalculate_weight_and_sort(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	for (uint64_t m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 		/*
@@ -1948,6 +1983,10 @@ vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 		taskq_wait(tq);
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
+
+		if (open_func(cvd) == B_FALSE ||
+		    cvd->vdev_state <= VDEV_STATE_FAULTED)
+			continue;
 		vd->vdev_nonrot &= cvd->vdev_nonrot;
 	}

@@ -3419,7 +3458,9 @@ vdev_dtl_load(vdev_t *vd)
 			return (error);
 		ASSERT(vd->vdev_dtl_sm != NULL);

-		rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+		rt = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt"));
 		error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
 		if (error == 0) {
 			mutex_enter(&vd->vdev_dtl_lock);
@@ -3567,7 +3608,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}

-	rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync"));

 	mutex_enter(&vd->vdev_dtl_lock);
 	zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
@@ -2482,6 +2482,7 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
 	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;

 	vds->vds_draid_vdev = tvd;
+	vd->vdev_nonrot = tvd->vdev_nonrot;

 	return (0);
 }
@@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg)

 	abd_t *deadbeef = vdev_initialize_block_alloc();

-	vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	vd->vdev_initialize_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree"));

 	for (uint64_t i = 0; !vd->vdev_detached &&
 	    i < vd->vdev_top->vdev_ms_count; i++) {
@@ -1050,6 +1050,16 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 	mutex_exit(&vq->vq_lock);
 }

+/* Is the pool's dirty data above the async write min-dirty percentage? */
+boolean_t
+vdev_queue_pool_busy(spa_t *spa)
+{
+	uint64_t threshold = zfs_dirty_data_max *
+	    zfs_vdev_async_write_active_min_dirty_percent / 100;
+	dsl_pool_t *pool = spa_get_dsl(spa);
+	return (pool->dp_dirty_total > threshold);
+}
+
 /*
 * As these two methods are only used for load calculations we're not
 * concerned if we get an incorrect value on 32bit platforms due to lack of
@@ -4556,8 +4556,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		uint64_t shift, start;
 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
 		    raidvd, msp, &start, &shift);
-		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
-		    start, shift);
+		zfs_range_tree_t *rt = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
+		    metaslab_rt_name(msp->ms_group, msp,
+		    "spa_raidz_expand_thread:rt"));
 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
 		    rt);
@@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg)
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	vr->vr_top_vdev = vd;
 	vr->vr_scan_msp = NULL;
-	vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
-	    0, 0);
+	vr->vr_scan_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree"));
 	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);

@@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd)
 	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
 	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
-	svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	svr->svr_allocd_segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs"));
 	svr->svr_vdev_id = vd->vdev_id;

 	for (int i = 0; i < TXG_SIZE; i++) {
-		svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-		    NULL, 0, 0);
+		svr->svr_frees[i] = zfs_range_tree_create_flags(
+		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+		    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees"));
 		list_create(&svr->svr_new_segments[i],
 		    sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
@@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
 	 * relative to the start of the range to be copied (i.e. relative to the
 	 * local variable "start").
 	 */
-	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
-	    ZFS_RANGE_SEG64, NULL, 0, 0);
+	zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs"));

 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
@@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
 	 * allocated segments that we are copying.  We may also be copying
 	 * free segments (of up to vdev_removal_max_span bytes).
 	 */
-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs"));
 	for (;;) {
 		zfs_range_tree_t *rt = svr->svr_allocd_segs;
 		zfs_range_seg_t *rs = zfs_range_tree_first(rt);
@@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg)
 	vca.vca_read_error_bytes = 0;
 	vca.vca_write_error_bytes = 0;

-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs"));

 	mutex_enter(&svr->svr_lock);

@@ -1894,8 +1899,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 		    vdev_indirect_mapping_max_offset(vim));
 	}

-	zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-	    NULL, 0, 0);
+	zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME,
+	    vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs"));
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];

@@ -902,7 +902,9 @@ vdev_trim_thread(void *arg)
 	ta.trim_vdev = vd;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_flags = 0;

@@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg)
 			 * Allocate an empty range tree which is swapped in
 			 * for the existing ms_trim tree while it is processed.
 			 */
-			trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
-			    NULL, 0, 0);
+			trim_tree = zfs_range_tree_create_flags(
+			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+			    ZFS_RT_F_DYN_NAME,
+			    vdev_rt_name(vd, "autotrim_tree"));
 			zfs_range_tree_swap(&msp->ms_trim, &trim_tree);
 			ASSERT(zfs_range_tree_is_empty(msp->ms_trim));

@@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg)
 				if (!cvd->vdev_ops->vdev_op_leaf)
 					continue;

-				ta->trim_tree = zfs_range_tree_create(NULL,
-				    ZFS_RANGE_SEG64, NULL, 0, 0);
+				ta->trim_tree = zfs_range_tree_create_flags(
+				    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+				    ZFS_RT_F_DYN_NAME,
+				    vdev_rt_name(vd, "autotrim_tree"));
 				zfs_range_tree_walk(trim_tree,
 				    vdev_trim_range_add, ta);
 			}
@@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg)
 	vd->vdev_trim_secure = 0;

 	ta.trim_vdev = vd;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
@@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
 	ASSERT(!vd->vdev_top->vdev_rz_expanding);

 	ta.trim_vdev = vd;
-	ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+	ta.trim_tree = zfs_range_tree_create_flags(
+	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
 	ta.trim_type = TRIM_TYPE_SIMPLE;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
@@ -32,9 +32,6 @@
 #include <sys/blake3.h>
 #include <sys/sha2.h>

-/* limit benchmarking to max 256KiB, when EdonR is slower then this: */
-#define	LIMIT_PERF_MBS	300
-
 typedef struct {
 	const char *name;
 	const char *impl;
@@ -52,9 +49,15 @@ typedef struct {
 	zio_checksum_tmpl_free_t *(free);
 } chksum_stat_t;

+#define	AT_STARTUP	0
+#define	AT_BENCHMARK	1
+#define	AT_DONE		2
+
 static chksum_stat_t *chksum_stat_data = 0;
-static int chksum_stat_cnt = 0;
 static kstat_t *chksum_kstat = NULL;
+static int chksum_stat_limit = AT_STARTUP;
+static int chksum_stat_cnt = 0;
+static void chksum_benchmark(void);

 /*
 * Sample output on i3-1005G1 System:
@@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data)
 static void *
 chksum_kstat_addr(kstat_t *ksp, loff_t n)
 {
+	/* full benchmark */
+	chksum_benchmark();
+
 	if (n < chksum_stat_cnt)
 		ksp->ks_private = (void *)(chksum_stat_data + n);
 	else
@@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
 	kpreempt_enable();

 	run_bw = size * run_count * NANOSEC;
-	run_bw /= run_time_ns;	/* B/s */
+	run_bw /= run_time_ns; /* B/s */
 	*result = run_bw/1024/1024; /* MiB/s */
 }

-#define	LIMIT_INIT	0
-#define	LIMIT_NEEDED	1
-#define	LIMIT_NOLIMIT	2
-
 static void
 chksum_benchit(chksum_stat_t *cs)
 {
 	abd_t *abd;
 	void *ctx = 0;
 	void *salt = &cs->salt.zcs_bytes;
-	static int chksum_stat_limit = LIMIT_INIT;

 	memset(salt, 0, sizeof (cs->salt.zcs_bytes));
 	if (cs->init)
 		ctx = cs->init(&cs->salt);

+	/* benchmarks in startup mode */
+	if (chksum_stat_limit == AT_STARTUP) {
+		abd = abd_alloc_linear(1<<18, B_FALSE);
+		chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+		goto done;
+	}
+
 	/* allocate test memory via abd linear interface */
 	abd = abd_alloc_linear(1<<20, B_FALSE);
+
+	/* benchmarks when requested */
 	chksum_run(cs, abd, ctx, 1, &cs->bs1k);
 	chksum_run(cs, abd, ctx, 2, &cs->bs4k);
 	chksum_run(cs, abd, ctx, 3, &cs->bs16k);
 	chksum_run(cs, abd, ctx, 4, &cs->bs64k);
-	chksum_run(cs, abd, ctx, 5, &cs->bs256k);
-
-	/* check if we ran on a slow cpu */
-	if (chksum_stat_limit == LIMIT_INIT) {
-		if (cs->bs1k < LIMIT_PERF_MBS) {
-			chksum_stat_limit = LIMIT_NEEDED;
-		} else {
-			chksum_stat_limit = LIMIT_NOLIMIT;
-		}
-	}
-
-	/* skip benchmarks >= 1MiB when the CPU is to slow */
-	if (chksum_stat_limit == LIMIT_NEEDED)
-		goto abort;
-
 	chksum_run(cs, abd, ctx, 6, &cs->bs1m);
 	abd_free(abd);

@@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs)
 	chksum_run(cs, abd, ctx, 7, &cs->bs4m);
 	chksum_run(cs, abd, ctx, 8, &cs->bs16m);

-abort:
+done:
 	abd_free(abd);

 	/* free up temp memory */
@@ -243,7 +238,6 @@ chksum_benchmark(void)
 	/* we need the benchmark only for the kernel module */
 	return;
 #endif
-
 	chksum_stat_t *cs;
 	uint64_t max;
 	uint32_t id, cbid = 0, id_save;
@@ -251,8 +245,14 @@ chksum_benchmark(void)
 	const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
 	const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");

+	/* benchmarks are done */
+	if (chksum_stat_limit == AT_DONE)
+		return;
+
+
 	/* count implementations */
-	chksum_stat_cnt = 2;
+	chksum_stat_cnt = 1;  /* edonr */
+	chksum_stat_cnt += 1; /* skein */
 	chksum_stat_cnt += sha256->getcnt();
 	chksum_stat_cnt += sha512->getcnt();
 	chksum_stat_cnt += blake3->getcnt();
@@ -332,6 +332,17 @@ chksum_benchmark(void)
 		}
 	}
 	blake3->setid(id_save);
+
+	switch (chksum_stat_limit) {
+	case AT_STARTUP:
+		/* next time we want a full benchmark */
+		chksum_stat_limit = AT_BENCHMARK;
+		break;
+	case AT_BENCHMARK:
+		/* no further benchmarks */
+		chksum_stat_limit = AT_DONE;
+		break;
+	}
 }

 void
@@ -341,7 +352,7 @@ chksum_init(void)
 	blake3_per_cpu_ctx_init();
 #endif

-	/* Benchmark supported implementations */
+	/* 256KiB benchmark */
 	chksum_benchmark();

 	/* Install kstats for all implementations */
@@ -67,13 +67,14 @@
 int zfs_bclone_enabled = 1;

 /*
- * When set zfs_clone_range() waits for dirty data to be written to disk.
- * This allows the clone operation to reliably succeed when a file is modified
- * and then immediately cloned. For small files this may be slower than making
- * a copy of the file and is therefore not the default.  However, in certain
- * scenarios this behavior may be desirable so a tunable is provided.
+ * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
+ * data to be written to disk before proceeding. This ensures that the clone
+ * operation reliably succeeds, even if a file is modified and then immediately
+ * cloned. Note that for small files this may be slower than simply copying
+ * the file. When set to 0 the clone operation will immediately fail if it
+ * encounters any dirty blocks. By default waiting is enabled.
 */
-int zfs_bclone_wait_dirty = 0;
+int zfs_bclone_wait_dirty = 1;

 /*
 * Enable Direct I/O. If this setting is 0, then all I/O requests will be
@@ -108,9 +109,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (error);
-		atomic_inc_32(&zp->z_sync_writes_cnt);
 		zil_commit(zfsvfs->z_log, zp->z_id);
-		atomic_dec_32(&zp->z_sync_writes_cnt);
 		zfs_exit(zfsvfs, FTAG);
 	}
 	return (error);
@@ -1058,6 +1057,143 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 	return (0);
 }

+/*
+ * Rewrite a range of a file as-is without modification.
+ *
+ * The requested range is clamped to the current file size and then walked
+ * one run of data at a time, skipping holes.  Each run is processed in
+ * chunks, every chunk in its own transaction, with the chunk size limited
+ * to the rest of the run and to DMU_MAX_ACCESS / 2 less the phase of the
+ * offset within a block.  Within a chunk every held dbuf that is not yet
+ * dirty in that transaction is marked dirty so that it is written again.
+ * On the way out, the bytes held (nr) are reported to the dataset read
+ * kstats and the bytes newly dirtied (nw) to the dataset write kstats.
+ *
+ *	IN:	zp	- znode of file to be rewritten.
+ *		off	- Offset of the range to rewrite.
+ *		len	- Length of the range to rewrite; 0 means up to the
+ *			  end of the file.
+ *		flags	- Rewrite parameters; must currently be 0.
+ *		arg	- flags-specific argument; must currently be 0.
+ *
+ *	RETURN:	0 if success (also when off is at or past the file size)
+ *		EINVAL if flags or arg is non-zero
+ *		EROFS if zfs_is_readonly() reports a read-only filesystem
+ *		EDQUOT if the file's user, group or (non-default) project
+ *		       is over its block quota
+ *		EINTR if issig() is true after a chunk has been processed
+ *		other error code if failure
+ */
+int
+zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
+    uint64_t arg)
+{
+	int error;
+
+	if (flags != 0 || arg != 0)
+		return (SET_ERROR(EINVAL));
+
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+		return (error);
+
+	if (zfs_is_readonly(zfsvfs)) {
+		zfs_exit(zfsvfs, FTAG);
+		return (SET_ERROR(EROFS));
+	}
+
+	if (off >= zp->z_size) {
+		zfs_exit(zfsvfs, FTAG);
+		return (0);
+	}
+	if (len == 0 || len > zp->z_size - off)
+		len = zp->z_size - off;
+
+	/* Flush any mmap()'d data to disk */
+	if (zn_has_cached_data(zp, off, off + len - 1))
+		zn_flush_cached_data(zp, B_TRUE);
+
+	zfs_locked_range_t *lr;
+	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
+	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
+	const uint64_t projid = zp->z_projid;
+
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+	DB_DNODE_ENTER(db);
+	dnode_t *dn = DB_DNODE(db);
+
+	uint64_t n, noff = off, nr = 0, nw = 0;
+	while (len > 0) {
+		/*
+		 * Rewrite only actual data, skipping any holes.  This might
+		 * be inaccurate for dirty files, but we don't really care.
+		 *
+		 * noff marks the end of the current run of data.  When it
+		 * equals off (initially, or once a run has been consumed)
+		 * the next run has to be located first.
+		 */
+		if (noff == off) {
+			/* Find next data in the file. */
+			error = dnode_next_offset(dn, 0, &noff, 1, 1, 0);
+			if (error || noff >= off + len) {
+				if (error == ESRCH)	/* No more data. */
+					error = 0;
+				break;
+			}
+			ASSERT3U(noff, >=, off);
+			len -= noff - off;
+			off = noff;
+
+			/*
+			 * Find where the data end.  If the lookup fails,
+			 * treat the rest of the range as data.
+			 */
+			error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff,
+			    1, 1, 0);
+			if (error != 0)
+				noff = off + len;
+		}
+		ASSERT3U(noff, >, off);
+
+		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+		    (projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+		    projid))) {
+			error = SET_ERROR(EDQUOT);
+			break;
+		}
+
+		n = MIN(MIN(len, noff - off),
+		    DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz));
+
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_write_by_dnode(tx, dn, off, n);
+		error = dmu_tx_assign(tx, DMU_TX_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			break;
+		}
+
+		/*
+		 * Mark all dbufs within range as dirty to trigger rewrite.
+		 * Every held buffer counts towards nr; buffers for which
+		 * dmu_buf_is_dirty() is already true are left alone and do
+		 * not count towards nw.
+		 */
+		dmu_buf_t **dbp;
+		int numbufs;
+		error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
+		    &numbufs, &dbp, DMU_READ_PREFETCH);
+		if (error) {
+			dmu_tx_commit(tx);
+			break;
+		}
+		for (int i = 0; i < numbufs; i++) {
+			nr += dbp[i]->db_size;
+			if (dmu_buf_is_dirty(dbp[i], tx))
+				continue;
+			nw += dbp[i]->db_size;
+			dmu_buf_will_dirty(dbp[i], tx);
+		}
+		dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+		dmu_tx_commit(tx);
+
+		len -= n;
+		off += n;
+
+		if (issig()) {
+			error = SET_ERROR(EINTR);
+			break;
+		}
+	}
+
+	DB_DNODE_EXIT(db);
+
+	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
+	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);
+
+	zfs_rangelock_exit(lr);
+	zfs_exit(zfsvfs, FTAG);
+	return (error);
+}
+
 int
 zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
 {
@@ -1691,7 +1691,7 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
-	 * how we can defer the flush commands for each lwb.
+	 * how we can defer the flush commands for any lwb without waiters.
 	 *
 	 * When the flush commands are deferred, the previous lwb will rely on
 	 * this lwb to flush the vdevs written to by that previous lwb. Thus,
@@ -1708,7 +1708,10 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 	 */
 	if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
 		ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
-		zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
+		if (list_is_empty(&prev_lwb->lwb_waiters)) {
+			zio_add_child(lwb->lwb_write_zio,
+			    prev_lwb->lwb_write_zio);
+		}
 	} else {
 		ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	}
@@ -2898,19 +2901,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)

 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

-	/*
-	 * Return if there's nothing to commit before we dirty the fs by
-	 * calling zil_create().
-	 */
-	if (list_is_empty(&zilog->zl_itx_commit_list))
-		return;
-
-	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
-	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
-	    offsetof(zil_commit_waiter_t, zcw_node));
-
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
+		/*
+		 * Return if there's nothing to commit before we dirty the fs.
+		 */
+		if (list_is_empty(&zilog->zl_itx_commit_list))
+			return;
+
 		lwb = zil_create(zilog);
 	} else {
 		/*
@@ -2938,6 +2936,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		}
 	}

+	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
@@ -3107,7 +3109,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
-		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+		if (lwb->lwb_state == LWB_STATE_OPENED &&
+		    (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
 			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
--- a/Show More
+++ b/Show More