Tag zfs-2.3.6

META file and changelog updated. Signed-off-by: Tony Hutter <hutter2@llnl.gov>
CI: Test & fix Linux ZFS built-in build
2026-06-10 16:06:38 +03:00 · 2026-02-19 14:58:21 -08:00 · 2026-02-19 14:58:21 -08:00 · 2026-02-17 10:18:14 -08:00 · 2026-02-11 16:18:01 -08:00 · 2026-02-11 16:18:01 -08:00
144 changed files with 3975 additions and 1125 deletions
@@ -14,7 +14,7 @@ Please check our issue tracker before opening a new feature request.
 Filling out the following template will help other contributors better understand your proposed feature.
 -->

-### Describe the feature would like to see added to OpenZFS
+### Describe the feature you would like to see added to OpenZFS

 <!--
 Provide a clear and concise description of the feature.
@@ -2,11 +2,6 @@

 <!--- Provide a general summary of your changes in the Title above -->

-<!---
-Documentation on ZFS Buildbot options can be found at
-https://openzfs.github.io/openzfs-docs/Developer%20Resources/Buildbot%20Options.html
-->
-
 ### Motivation and Context
 <!--- Why is this change required? What problem does it solve? -->
 <!--- If it fixes an open issue, please link to the issue here. -->
@@ -7,7 +7,7 @@ Prints "quick" if (explicity required by user):
 - the *last* commit message contains 'ZFS-CI-Type: quick'
 or if (heuristics):
 - the files changed are not in the list of specified directories, and
- all commit messages do not contain 'ZFS-CI-Type: full'
+- all commit messages do not contain 'ZFS-CI-Type: (full|linux|freebsd)'

 Otherwise prints "full".
 """
@@ -65,12 +65,12 @@ if __name__ == '__main__':

    # check last (HEAD) commit message
    last_commit_message_raw = subprocess.run([
-        'git', 'show', '-s', '--format=%B', 'HEAD'
+        'git', 'show', '-s', '--format=%B', head
    ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    for line in last_commit_message_raw.stdout.decode().splitlines():
        if line.strip().lower() == 'zfs-ci-type: quick':
-            output_type('quick', f'explicitly requested by HEAD commit {head}')
+            output_type('quick', f'requested by HEAD commit {head}')

    # check all commit messages
    all_commit_message_raw = subprocess.run([
@@ -83,8 +83,12 @@ if __name__ == '__main__':
    for line in all_commit_message:
        if line.startswith('ZFS-CI-Commit:'):
            commit_ref = line.lstrip('ZFS-CI-Commit:').rstrip()
+        if line.strip().lower() == 'zfs-ci-type: freebsd':
+            output_type('freebsd', f'requested by commit {commit_ref}')
+        if line.strip().lower() == 'zfs-ci-type: linux':
+            output_type('linux', f'requested by commit {commit_ref}')
        if line.strip().lower() == 'zfs-ci-type: full':
-            output_type('full', f'explicitly requested by commit {commit_ref}')
+            output_type('full', f'requested by commit {commit_ref}')

    # check changed files
    changed_files_raw = subprocess.run([
@@ -6,6 +6,20 @@

 set -eu

+# The default 'azure.archive.ubuntu.com' mirrors can be really slow.
+# Prioritize the official Ubuntu mirrors.
+#
+# The normal apt-mirrors.txt will look like:
+#
+# http://azure.archive.ubuntu.com/ubuntu/       priority:1
+# https://archive.ubuntu.com/ubuntu/    priority:2
+# https://security.ubuntu.com/ubuntu/   priority:3
+#
+# Just delete the 'azure.archive.ubuntu.com' line.
+sudo sed -i '/azure.archive.ubuntu.com/d' /etc/apt/apt-mirrors.txt
+echo "Using mirrors:"
+cat /etc/apt/apt-mirrors.txt
+
 # install needed packages
 export DEBIAN_FRONTEND="noninteractive"
 sudo apt-get -y update
@@ -20,35 +34,89 @@ ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -q -N ""
 sudo systemctl stop docker.socket
 sudo systemctl stop multipathd.socket

-# remove default swapfile and /mnt
 sudo swapoff -a
-sudo umount -l /mnt
-DISK="/dev/disk/cloud/azure_resource-part1"
-sudo sed -e "s|^$DISK.*||g" -i /etc/fstab
-sudo wipefs -aq $DISK
-sudo systemctl daemon-reload
+
+# Special case:
+#
+# For reasons unknown, the runner can boot-up with two different block device
+# configurations.  On one config you get two 75GB block devices, and on the
+# other you get a single 150GB block device. Here's what both look like:
+#
+# --- One 150GB block device ---
+# NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINTS
+# sda       8:0    0  150G  0 disk
+# ├─sda1    8:1    0  149G  0 part /
+# ├─sda14   8:14   0    4M  0 part
+# ├─sda15   8:15   0  106M  0 part /boot/efi
+# └─sda16 259:0    0  913M  0 part /boot
+#
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_root -> ../../sda
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_root-part1 -> ../../sda1
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part14 -> ../../sda14
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part15 -> ../../sda15
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part16 -> ../../sda16
+#
+# --- Two 75GB block devices ---
+# NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINTS
+# sda       8:0    0   75G  0 disk
+# ├─sda1    8:1    0   74G  0 part /
+# ├─sda14   8:14   0    4M  0 part
+# ├─sda15   8:15   0  106M  0 part /boot/efi
+# └─sda16 259:0    0  913M  0 part /boot
+# sdb       8:16   0   75G  0 disk
+# └─sdb1    8:17   0   75G  0 part
+#
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_resource -> ../../sdb
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_resource-part1 -> ../../sdb1
+# lrwxrwxrwx 1 root root  9 Jan 29 18:07 azure_root -> ../../sda
+# lrwxrwxrwx 1 root root 10 Jan 29 18:07 azure_root-part1 -> ../../sda1
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part14 -> ../../sda14
+# lrwxrwxrwx 1 root root 11 Jan 29 18:07 azure_root-part15 -> ../../sda15
+#
+# If we have the azure_resource-part1 partition, umount it, partition it, and
+# use it as our ZFS disk and swap partition.  If not, just create a file VDEV
+# and swap file and use that instead.
+
+# remove default swapfile and /mnt
+if [ -e /dev/disk/cloud/azure_resource-part1 ] ; then
+  sudo umount -l /mnt
+  DISK="/dev/disk/cloud/azure_resource-part1"
+  sudo sed -e "s|^$DISK.*||g" -i /etc/fstab
+  sudo wipefs -aq $DISK
+  sudo systemctl daemon-reload
+fi

 sudo modprobe loop
 sudo modprobe zfs

-# partition the disk as needed
-DISK="/dev/disk/cloud/azure_resource"
-sudo sgdisk --zap-all $DISK
-sudo sgdisk -p \
- -n 1:0:+16G -c 1:"swap" \
- -n 2:0:0    -c 2:"tests" \
-$DISK
-sync
-sleep 1
+if [ -e /dev/disk/cloud/azure_resource-part1 ] ; then
+  echo "We have two 75GB block devices"
+  # partition the disk as needed
+  DISK="/dev/disk/cloud/azure_resource"
+  sudo sgdisk --zap-all $DISK
+  sudo sgdisk -p \
+   -n 1:0:+16G -c 1:"swap" \
+   -n 2:0:0    -c 2:"tests" \
+   $DISK
+  sync
+  sleep 1
+
+  sudo fallocate -l 12G /test.ssd2
+  DISKS="$DISK-part2 /test.ssd2"
+
+  SWAP=$DISK-part1
+else
+  echo "We have a single 150GB block device"
+  sudo fallocate -l 72G /test.ssd2
+  SWAP=/swapfile.ssd
+  sudo fallocate -l 16G $SWAP
+  sudo chmod 600 $SWAP
+  DISKS="/test.ssd2"
+fi

 # swap with same size as RAM (16GiB)
-sudo mkswap $DISK-part1
-sudo swapon $DISK-part1
-
-# JBOD 2xdisk for OpenZFS storage (test vm's)
-SSD1="$DISK-part2"
-sudo fallocate -l 12G /test.ssd2
-SSD2=$(sudo losetup -b 4096 -f /test.ssd2 --show)
+sudo mkswap $SWAP
+sudo swapon $SWAP

 # adjust zfs module parameter and create pool
 exec 1>/dev/null
@@ -57,7 +125,7 @@ ARC_MAX=$((1024*1024*512))
 echo $ARC_MIN | sudo tee /sys/module/zfs/parameters/zfs_arc_min
 echo $ARC_MAX | sudo tee /sys/module/zfs/parameters/zfs_arc_max
 echo 1 | sudo tee /sys/module/zfs/parameters/zvol_use_blk_mq
-sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \
+sudo zpool create -f -o ashift=12 zpool $DISKS -O relatime=off \
  -O atime=off -O xattr=sa -O compression=lz4 -O sync=disabled \
  -O redundant_metadata=none -O mountpoint=/mnt/tests

@@ -43,20 +43,25 @@ case "$OS" in
    OSv="almalinux9"
    URL="https://repo.almalinux.org/almalinux/10/cloud/x86_64/images/AlmaLinux-10-GenericCloud-latest.x86_64.qcow2"
    ;;
+  alpine3-23)
+    OSNAME="Alpine Linux 3.23.2"
+    # Alpine Linux v3.22 and v3.23 are unknown to osinfo as of 2025-12-26.
+    OSv="alpinelinux3.21"
+    URL="https://dl-cdn.alpinelinux.org/alpine/v3.23/releases/cloud/generic_alpine-3.23.2-x86_64-bios-cloudinit-r0.qcow2"
+    ;;
  archlinux)
    OSNAME="Archlinux"
    URL="https://geo.mirror.pkgbuild.com/images/latest/Arch-Linux-x86_64-cloudimg.qcow2"
    ;;
-  centos-stream10)
-    OSNAME="CentOS Stream 10"
-    # TODO: #16903 Overwrite OSv to stream9 for virt-install until it's added to osinfo
-    OSv="centos-stream9"
-    URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2"
-    ;;
  centos-stream9)
    OSNAME="CentOS Stream 9"
    URL="https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-latest.x86_64.qcow2"
    ;;
+  centos-stream10)
+    OSNAME="CentOS Stream 10"
+    OSv="centos-stream9"
+    URL="https://cloud.centos.org/centos/10-stream/x86_64/images/CentOS-Stream-GenericCloud-10-latest.x86_64.qcow2"
+    ;;
  debian11)
    OSNAME="Debian 11"
    URL="https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-amd64.qcow2"
@@ -83,6 +88,11 @@ case "$OS" in
    OSv="fedora-unknown"
    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
    ;;
+  fedora43)
+    OSNAME="Fedora 43"
+    OSv="fedora-unknown"
+    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-43-1.6.x86_64.qcow2"
+    ;;
  freebsd13-5r)
    FreeBSD="13.5-RELEASE"
    OSNAME="FreeBSD $FreeBSD"
@@ -95,8 +105,8 @@ case "$OS" in
    FreeBSD="14.2-RELEASE"
    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd14.0"
-    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
    URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
+    KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
    ;;
  freebsd14-3r)
    FreeBSD="14.3-RELEASE"
@@ -120,8 +130,15 @@ case "$OS" in
    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
    ;;
-  freebsd15-0c)
-    FreeBSD="15.0-PRERELEASE"
+  freebsd15-0s)
+    FreeBSD="15.0-STABLE"
+    OSNAME="FreeBSD $FreeBSD"
+    OSv="freebsd14.0"
+    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
+    KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
+    ;;
+  freebsd16-0c)
+    FreeBSD="16.0-CURRENT"
    OSNAME="FreeBSD $FreeBSD"
    OSv="freebsd14.0"
    URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
@@ -212,13 +229,21 @@ if [ ${OS:0:7} != "freebsd" ]; then
 hostname: $OS

 users:
- name: root
-  shell: $BASH
- name: zfs
-  sudo: ALL=(ALL) NOPASSWD:ALL
-  shell: $BASH
-  ssh_authorized_keys:
-    - $PUBKEY
+  - name: root
+    shell: /bin/bash
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+  - name: zfs
+    shell: /bin/bash
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+    ssh_authorized_keys:
+      - $PUBKEY
+    # Workaround for Alpine Linux.
+    lock_passwd: false
+    passwd: '*'
+
+packages:
+  - sudo
+  - bash

 growpart:
  mode: auto
@@ -287,7 +312,7 @@ else
  while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do
    ssh 2>/dev/null root@vm0 "uname -a" && break
  done
-  ssh root@vm0 "pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init"
+  ssh root@vm0 "env IGNORE_OSVERSION=yes pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init"
  ssh root@vm0 "chsh -s $BASH root"
  ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"'
  ssh root@vm0 'sysrc cloudinit_enable="YES"'
@@ -301,3 +326,23 @@ else
  scp ~/src.txz "root@vm0:/tmp/src.txz"
  ssh root@vm0 'tar -C / -zxf /tmp/src.txz'
 fi
+
+#
+# Config for Alpine Linux similar to FreeBSD.
+#
+if [ ${OS:0:6} == "alpine" ]; then
+  while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do
+    ssh 2>/dev/null zfs@vm0 "uname -a" && break
+  done
+  # Enable community and testing repositories.
+  ssh zfs@vm0 "sudo rm -rf /etc/apk/repositories"
+  ssh zfs@vm0 "sudo setup-apkrepos -c1"
+  ssh zfs@vm0 "echo '@testing http://dl-cdn.alpinelinux.org/alpine/edge/testing' | sudo tee -a /etc/apk/repositories"
+  # Upgrade to edge or latest-stable.
+  #ssh zfs@vm0 "sudo sed -i 's#/v[0-9]\+\.[0-9]\+/#/edge/#g' /etc/apk/repositories"
+  #ssh zfs@vm0 "sudo sed -i 's#/v[0-9]\+\.[0-9]\+/#/latest-stable/#g' /etc/apk/repositories"
+  # Update and upgrade after repository setup.
+  ssh zfs@vm0 "sudo apk update"
+  ssh zfs@vm0 "sudo apk add --upgrade apk-tools"
+  ssh zfs@vm0 "sudo apk upgrade --available"
+fi
@@ -10,6 +10,32 @@

 set -eu

+# Dependency setup for Alpine Linux guests.  Runs three steps, each wrapped
+# in a "##[group]" / "##[endgroup]" log section:
+#   1. install the development and test packages with apk,
+#   2. switch the device manager to eudev ('setup-devd udev'),
+#   3. clone ksh93 and build/install it from source.
+function alpine() {
+  echo "##[group]Install Development Tools"
+  # NOTE(review): 'pamtester@testing' presumably relies on an apk repository
+  # tagged '@testing' having been configured on the VM beforehand - confirm.
+  sudo apk add \
+    acl alpine-sdk attr autoconf automake bash build-base clang21 coreutils \
+    cpio cryptsetup curl curl-dev dhcpcd eudev eudev-dev eudev-libs findutils \
+    fio gawk gdb gettext-dev git grep jq libaio libaio-dev libcurl \
+    libtirpc-dev libtool libunwind libunwind-dev linux-headers linux-tools \
+    linux-virt linux-virt-dev lsscsi m4 make nfs-utils openssl-dev parted \
+    pax procps py3-cffi py3-distlib py3-packaging py3-setuptools python3 \
+    python3-dev qemu-guest-agent rng-tools rsync samba samba-server sed \
+    strace sysstat util-linux util-linux-dev wget words xfsprogs xxhash \
+    zlib-dev pamtester@testing
+  echo "##[endgroup]"
+
+  echo "##[group]Switch to eudev"
+  sudo setup-devd udev
+  echo "##[endgroup]"
+
+  echo "##[group]Install ksh93 from Source"
+  # Shallow clone into /tmp/ksh, then build and install into '/'.
+  git clone --depth 1 https://github.com/ksh93/ksh.git /tmp/ksh
+  cd /tmp/ksh
+  ./bin/package make
+  sudo ./bin/package install /
+  echo "##[endgroup]"
+}
+
 function archlinux() {
  echo "##[group]Running pacman -Syu"
  sudo btrfs filesystem resize max /
@@ -20,13 +46,17 @@ function archlinux() {
  sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \
    fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \
    parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \
-    samba sysstat rng-tools rsync wget xxhash
+    samba strace sysstat rng-tools rsync wget xxhash
  echo "##[endgroup]"
 }

 function debian() {
  export DEBIAN_FRONTEND="noninteractive"

+  echo "##[group]Wait for cloud-init to finish"
+  cloud-init status --wait
+  echo "##[endgroup]"
+
  echo "##[group]Running apt-get update+upgrade"
  sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list
  sudo apt-get update -y
@@ -43,7 +73,8 @@ function debian() {
    lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
    python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \
    python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
-    rsync samba sysstat uuid-dev watchdog wget xfslibs-dev  xxhash zlib1g-dev
+    rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \
+    zlib1g-dev
  echo "##[endgroup]"
 }

@@ -87,8 +118,13 @@ function rhel() {
    libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \
    parted perf python3 python3-cffi python3-devel python3-packaging \
    kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \
-    rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \
-    zlib-devel
+    rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \
+    xxhash zlib-devel
+
+  # These are needed for building Lustre.  We only install these on EL VMs since
+  # we don't plan to test build Lustre on other platforms.
+  sudo dnf install -y libnl3-devel libyaml-devel libmount-devel
+
  echo "##[endgroup]"
 }

@@ -104,7 +140,7 @@ function install_fedora_experimental_kernel {
  our_version="$1"
  sudo dnf -y copr enable @kernel-vanilla/stable
  sudo dnf -y copr enable @kernel-vanilla/mainline
-  all="$(sudo dnf list --showduplicates kernel-*)"
+  all="$(sudo dnf list --showduplicates kernel-* python3-perf* perf* bpftool*)"
  echo "Available versions:"
  echo "$all"

@@ -139,6 +175,9 @@ case "$1" in
    sudo dnf install -y kernel-abi-stablelists
    echo "##[endgroup]"
    ;;
+  alpine*)
+    alpine
+    ;;
  archlinux)
    archlinux
    ;;
@@ -187,6 +226,16 @@ test -z "${ONLY_DEPS:-}" || exit 0
 # Start services
 echo "##[group]Enable services"
 case "$1" in
+  alpine*)
+    sudo -E rc-update add qemu-guest-agent
+    sudo -E rc-update add nfs
+    sudo -E rc-update add samba
+    sudo -E rc-update add dhcpcd
+    # Remove services related to cloud-init.
+    sudo -E rc-update del cloud-init default
+    sudo -E rc-update del cloud-final default
+    sudo -E rc-update del cloud-config default
+    ;;
  freebsd*)
    # add virtio things
    echo 'virtio_load="YES"' | sudo -E tee -a /boot/loader.conf
@@ -242,7 +291,7 @@ case "$1" in
 esac

 case "$1" in
-  archlinux|freebsd*)
+  alpine*|archlinux|freebsd*)
    true
    ;;
  *)
@@ -58,13 +58,21 @@ for ((i=1; i<=VMs; i++)); do
 fqdn: vm$i

 users:
- name: root
-  shell: $BASH
- name: zfs
-  sudo: ALL=(ALL) NOPASSWD:ALL
-  shell: $BASH
-  ssh_authorized_keys:
-    - $PUBKEY
+  - name: root
+    shell: /bin/bash
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+  - name: zfs
+    shell: /bin/bash
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+    ssh_authorized_keys:
+      - $PUBKEY
+    # Workaround for Alpine Linux.
+    lock_passwd: false
+    passwd: '*'
+
+packages:
+  - sudo
+  - bash

 growpart:
  mode: auto
@@ -108,19 +116,30 @@ echo '*/5 * * * *  /root/cronjob.sh' > crontab.txt
 sudo crontab crontab.txt
 rm crontab.txt

-# check if the machines are okay
-echo "Waiting for vm's to come up...  (${VMs}x CPU=$CPU RAM=$RAM)"
-for ((i=1; i<=VMs; i++)); do
-  .github/workflows/scripts/qemu-wait-for-vm.sh vm$i
-done
-echo "All $VMs VMs are up now."
-
 # Save the VM's serial output (ttyS0) to /var/tmp/console.txt
 # - ttyS0 on the VM corresponds to a local /dev/pty/N entry
 # - use 'virsh ttyconsole' to lookup the /dev/pty/N entry
 for ((i=1; i<=VMs; i++)); do
  mkdir -p $RESPATH/vm$i
  read "pty" <<< $(sudo virsh ttyconsole vm$i)
+
+  # Create the file so we can tail it, even if there's no output.
+  touch $RESPATH/vm$i/console.txt
+
  sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" &
+
+  # Write all VM boot lines to the console to aid in debugging failed boots.
+  # The boot lines from all the VMs will be munged together, so prepend each
+  # line with the vm hostname (like 'vm1:').
+  (while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) &
+
 done
 echo "Console logging for ${VMs}x $OS started."
+
+
+# check if the machines are okay
+echo "Waiting for vm's to come up...  (${VMs}x CPU=$CPU RAM=$RAM)"
+for ((i=1; i<=VMs; i++)); do
+  .github/workflows/scripts/qemu-wait-for-vm.sh vm$i
+done
+echo "All $VMs VMs are up now."
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+######################################################################
+# 6) Test if Lustre can still build against ZFS
+######################################################################
+set -e
+
+# Build from the latest Lustre tag rather than the master branch.  We do this
+# under the assumption that master is going to have a lot of churn thus will be
+# more prone to breaking the build than a point release.  We don't want ZFS
+# PR's reporting bad test results simply because upstream Lustre accidentally
+# broke their build.
+#
+# Skip any RC tags, or any tags where the last version digit is 50 or more.
+# Versions with 50 or more are development versions of Lustre.
+repo=https://github.com/lustre/lustre-release.git
+tag="$(git ls-remote --refs --exit-code --sort=version:refname --tags $repo | \
+	awk -F '_' '/-RC/{next}; /refs\/tags\/v/{if ($NF < 50){print}}' | \
+	tail -n 1 | sed 's/.*\///')"
+
+echo "Cloning Lustre tag $tag"
+git clone --depth 1 --branch "$tag" "$repo"
+
+cd lustre-release
+
+# Include Lustre patches to build against master/zfs-2.4.x.  Once these
+# patches are merged we can remove these lines.
+patches=('https://review.whamcloud.com/changes/fs%2Flustre-release~62101/revisions/2/patch?download'
+	'https://review.whamcloud.com/changes/fs%2Flustre-release~63267/revisions/9/patch?download')
+
+for p in "${patches[@]}" ; do
+	curl $p | base64 -d > patch
+	patch -p1 < patch || true
+done
+
+echo "Configure Lustre"
+./autogen.sh
+# EL 9 needs '--disable-gss-keyring'
+./configure --with-zfs --disable-gss-keyring
+echo "Building Lustre RPMs"
+make rpms
+ls *.rpm
+
+# There's only a handful of Lustre RPMs we actually need to install
+lustrerpms="$(ls *.rpm | grep -E 'kmod-lustre-osd-zfs-[0-9]|kmod-lustre-[0-9]|lustre-osd-zfs-mount-[0-9]')"
+echo "Installing: $lustrerpms"
+sudo dnf -y install $lustrerpms
+sudo modprobe -v lustre
+
+# Should see some Lustre lines in dmesg
+sudo dmesg | grep -Ei 'lnet|lustre'
@@ -4,7 +4,10 @@
 # 6) load openzfs module and run the tests
 #
 # called on runner:  qemu-6-tests.sh
-# called on qemu-vm: qemu-6-tests.sh $OS $2/$3
+# called on qemu-vm: qemu-6-tests.sh $OS $2 $3 [--lustre|--builtin] [quick|default]
+#
+# --lustre: Test build lustre in addition to the normal tests
+# --builtin: Test build ZFS as a kernel built-in in addition to the normal tests
 ######################################################################

 set -eu
@@ -38,6 +41,54 @@ function prefix() {
  fi
 }

+# Run the Lustre test-build script and record its result.
+#
+# The script's combined stdout/stderr goes to /var/tmp/lustre.txt and its
+# exit status to /var/tmp/lustre-exitcode.txt.  A non-zero status is also
+# written to /var/tmp/tests-exitcode.txt.
+#
+# NOTE(review): the main test run later writes its own status to
+# tests-exitcode.txt as well; lustre-exitcode.txt is the file that reliably
+# holds the build result.
+function do_lustre_build() {
+  local rc=0
+  # '|| rc=$?' captures the failure status without aborting the caller.
+  $HOME/zfs/.github/workflows/scripts/qemu-6-lustre-tests-vm.sh &> /var/tmp/lustre.txt || rc=$?
+  echo "$rc" > /var/tmp/lustre-exitcode.txt
+  if [ "$rc" != "0" ] ; then
+      echo "$rc" > /var/tmp/tests-exitcode.txt
+  fi
+}
+# Exported so child bash processes can call the function too.
+export -f do_lustre_build
+
+# Test build ZFS into the kernel directly (as a built-in, not as a module).
+#
+# Downloads the kernel.org source matching the running kernel version,
+# creates a minimal config, copies ZFS into the tree and builds the kernel.
+# The build log goes to /var/tmp/builtin.txt and the exit status to
+# /var/tmp/builtin-exitcode.txt.  A non-zero status is also written to
+# /var/tmp/tests-exitcode.txt.
+function do_builtin_build() {
+  local rc=0
+  # Get the full version of the running kernel (like '6.18.8')
+  fullver=$(uname -r | grep -Eo '^[0-9]+\.[0-9]+\.[0-9]+')
+
+  # Get just the major ('6')
+  major=$(echo $fullver | grep -Eo '^[0-9]+')
+  (
+  # 'set -e' has no effect inside a subshell that is the left-hand side of
+  # '||', so every step is chained with '&&' to stop at the first failure.
+  # Otherwise a failed download or 'cd' would let the final 'make' run in
+  # the wrong directory and report a bogus result.
+  #
+  # The tarball and the kernel tree are both expected in $HOME.
+  cd $HOME &&
+  wget https://cdn.kernel.org/pub/linux/kernel/v${major}.x/linux-$fullver.tar.xz &&
+  tar -xf $HOME/linux-$fullver.tar.xz &&
+  cd $HOME/linux-$fullver &&
+  make tinyconfig &&
+  ./scripts/config --enable EFI_PARTITION &&
+  ./scripts/config --enable BLOCK &&
+  # BTRFS_FS is easiest config option to enable CONFIG_ZLIB_INFLATE|DEFLATE
+  ./scripts/config --enable BTRFS_FS &&
+  yes "" | make oldconfig &&
+  make prepare &&
+
+  cd $HOME/zfs &&
+  ./configure --with-linux=$HOME/linux-$fullver --enable-linux-builtin --enable-debug &&
+  ./copy-builtin $HOME/linux-$fullver &&
+
+  cd $HOME/linux-$fullver &&
+  ./scripts/config --enable ZFS &&
+  yes "" | make oldconfig &&
+  make -j `nproc`
+  ) &> /var/tmp/builtin.txt || rc=$?
+  echo "$rc" > /var/tmp/builtin-exitcode.txt
+  if [ "$rc" != "0" ] ; then
+      echo "$rc" > /var/tmp/tests-exitcode.txt
+  fi
+}
+export -f do_builtin_build
+
 # called directly on the runner
 if [ -z ${1:-} ]; then
  cd "/var/tmp"
@@ -49,8 +100,24 @@ if [ -z ${1:-} ]; then

  for ((i=1; i<=VMs; i++)); do
    IP="192.168.122.1$i"
+
+    # We do an additional test build of Lustre against ZFS if we're vm2
+    # on almalinux*.  At the time of writing, the vm2 tests were
+    # completing roughly 15min before the vm1 tests, so it makes sense
+    # to have vm2 do the build.
+    #
+    # In addition, we do an additional test build of ZFS as a Linux
+    # kernel built-in on Fedora.  Again, we do it on vm2 to exploit vm2's
+    # early finish time.
+    extra=""
+    if [[ "$OS" == almalinux* ]] && [[ "$i" == "2" ]] ; then
+        extra="--lustre"
+    elif [[ "$OS" == fedora* ]] && [[ "$i" == "2" ]] ; then
+        extra="--builtin"
+    fi
+
    daemonize -c /var/tmp -p vm${i}.pid -o vm${i}log.txt -- \
-      $SSH zfs@$IP $TESTS $OS $i $VMs $CI_TYPE
+      $SSH zfs@$IP $TESTS $OS $i $VMs $extra $CI_TYPE
    # handle line by line and add info prefix
    stdbuf -oL tail -fq vm${i}log.txt \
      | while read -r line; do prefix "$i" "$line"; done &
@@ -70,9 +137,35 @@ if [ -z ${1:-} ]; then
  exit 0
 fi

-# this part runs inside qemu vm
+
+#############################################
+# Everything from here on runs inside qemu vm
+#############################################
+
+# Process cmd line args
+OS="$1"
+shift
+NUM="$1"
+shift
+DEN="$1"
+shift
+
+BUILD_LUSTRE=0
+BUILD_BUILTIN=0
+if [ "$1" == "--lustre" ] ; then
+  BUILD_LUSTRE=1
+  shift
+elif [ "$1" == "--builtin" ] ; then
+  BUILD_BUILTIN=1
+  shift
+fi
+
+if [ "$1" == "quick" ] ; then
+  export RUNFILES="sanity.run"
+fi
+
 export PATH="$PATH:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/sbin:/usr/local/bin"
-case "$1" in
+case "$OS" in
  freebsd*)
    TDIR="/usr/local/share/zfs"
    sudo kldstat -n zfs 2>/dev/null && sudo kldunload zfs
@@ -95,23 +188,42 @@ case "$1" in
    ;;
 esac

-# enable io_uring on el9/el10
-case "$1" in
+# Distribution-specific settings.
+case "$OS" in
  almalinux9|almalinux10|centos-stream*)
+    # Enable io_uring on Enterprise Linux 9 and 10.
    sudo sysctl kernel.io_uring_disabled=0 > /dev/null
    ;;
+  alpine*)
+    # Ensure `/etc/zfs/zpool.cache` exists.
+    sudo mkdir -p /etc/zfs
+    sudo touch /etc/zfs/zpool.cache
+    sudo chmod 644 /etc/zfs/zpool.cache
+    ;;
 esac

+# Lustre calls a number of exported ZFS module symbols.  To make sure we don't
+# change the symbols and break Lustre, do a quick Lustre build of the latest
+# released Lustre against ZFS.
+#
+# Note that we do the Lustre test build in parallel with ZTS.  ZTS isn't very
+# CPU intensive, so we can use idle CPU cycles "guilt free" for the build.
+# The Lustre build on its own takes ~15min.
+if [ "$BUILD_LUSTRE" == "1" ] ; then
+  do_lustre_build &
+elif [ "$BUILD_BUILTIN" == "1" ] ; then
+  # Try building ZFS directly into the Linux kernel (not as a module)
+  do_builtin_build &
+fi
+
 # run functional testings and save exitcode
 cd /var/tmp
-TAGS=$2/$3
-if [ "$4" == "quick" ]; then
-  export RUNFILES="sanity.run"
-fi
+TAGS=$NUM/$DEN
 sudo dmesg -c > dmesg-prerun.txt
 mount > mount.txt
 df -h > df-prerun.txt
-$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS
+$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS
+
 RV=$?
 df -h > df-postrun.txt
 echo $RV > tests-exitcode.txt
@@ -31,6 +31,12 @@ EOF
  rm -f tmp$$
 }

+# Print the last 80 lines of a file inside a "##[group]" log section.
+#   $1: path of the file to show
+#   $2: title for the group header (" (final lines)" is appended)
+function showfile_tail() {
+  echo "##[group]$2 (final lines)"
+  tail -n 80 $1
+  echo "##[endgroup]"
+}
+
 # overview
 cat /tmp/summary.txt
 echo ""
@@ -46,6 +52,32 @@ fi
 echo -e "\nFull logs for download:\n    $1\n"

 for ((i=1; i<=VMs; i++)); do
+
+  # Print Lustre build test results (the build is only done on vm2)
+  if [ -f vm$i/lustre-exitcode.txt ] ; then
+    rv=$(< vm$i/lustre-exitcode.txt)
+    if [ $rv = 0 ]; then
+      vm="[92mvm$i[0m"
+    else
+      vm="[1;91mvm$i[0m"
+      touch /tmp/have_failed_tests
+    fi
+    file="vm$i/lustre.txt"
+    test -s "$file" && showfile_tail "$file" "$vm: Lustre build"
+  fi
+
+  if [ -f vm$i/builtin-exitcode.txt ] ; then
+    rv=$(< vm$i/builtin-exitcode.txt)
+    if [ $rv = 0 ]; then
+      vm="[92mvm$i[0m"
+    else
+      vm="[1;91mvm$i[0m"
+      touch /tmp/have_failed_tests
+    fi
+    file="vm$i/builtin.txt"
+    test -s "$file" && showfile_tail "$file" "$vm: Linux built-in build"
+  fi
+
  rv=$(cat vm$i/tests-exitcode.txt)

  if [ $rv = 0 ]; then
@@ -4,7 +4,11 @@
 #
 # USAGE:
 #
-# 	./qemu-test-repo-vm [URL]
+# 	./qemu-test-repo-vm [--lookup] [URL]
+#
+# --lookup:	When testing a repo, only lookup the latest package versions,
+#		don't try to install them.  Installing all of them takes over
+#		an hour, so this is much quicker.
 #
 # URL:		URL to use instead of http://download.zfsonlinux.org
 #		If blank, use the default repo from zfs-release RPM.
@@ -15,6 +19,13 @@ source /etc/os-release
 OS="$ID"
 VERSION="$VERSION_ID"

+
+LOOKUP=""
+if [ -n "$1" ] && [ "$1" == "--lookup" ] ; then
+	LOOKUP=1
+	shift
+fi
+
 ALTHOST=""
 if [ -n "$1" ] ; then
 	ALTHOST="$1"
@@ -42,7 +53,19 @@ function test_install {
 		sudo sed -i "s;baseurl=http://download.zfsonlinux.org;baseurl=$host;g" /etc/yum.repos.d/zfs.repo
 	fi

-	sudo dnf -y install $args zfs zfs-test
+	baseurl=$(grep -A 5 "\[$repo\]" /etc/yum.repos.d/zfs.repo  | awk -F'=' '/baseurl=/{print $2; exit}')
+
+	# Just do a version lookup - don't try to install any RPMs
+	if [ "$LOOKUP" == "1" ] ; then
+		 package="$(dnf list $args zfs | tail -n 1 | awk '{print $2}')"
+		 echo "$repo ${package} $baseurl" >> $SUMMARY
+		 return
+	fi
+
+	if ! sudo dnf -y install $args zfs zfs-test ; then
+		echo "$repo ${package}...[FAILED] $baseurl" >> $SUMMARY
+		return
+	fi

 	# Load modules and create a simple pool as a sanity test.
 	sudo /usr/share/zfs/zfs.sh -r
@@ -51,7 +74,6 @@ function test_install {
 	sudo zpool status

 	# Print out repo name, rpm installed (kmod or dkms), and repo URL
-	baseurl=$(grep -A 5 "\[$repo\]" /etc/yum.repos.d/zfs.repo  | awk -F'=' '/baseurl=/{print $2; exit}')
 	package=$(sudo rpm -qa | grep zfs | grep -E 'kmod|dkms')

 	echo "$repo $package $baseurl" >> $SUMMARY
@@ -70,16 +92,19 @@ almalinux*)
 	name=$(curl -Ls $url | grep 'dnf install' | grep -Eo 'zfs-release-[0-9]+-[0-9]+')
 	sudo dnf -y install https://zfsonlinux.org/epel/$name$(rpm --eval "%{dist}").noarch.rpm 2>&1
 	sudo rpm -qi zfs-release
-	test_install zfs $ALTHOST
-	test_install zfs-kmod $ALTHOST
-	test_install zfs-testing $ALTHOST
-	test_install zfs-testing-kmod $ALTHOST
+	for i in zfs zfs-kmod zfs-testing zfs-testing-kmod zfs-latest \
+		zfs-latest-kmod zfs-legacy zfs-legacy-kmod zfs-2.2 \
+		zfs-2.2-kmod zfs-2.3 zfs-2.3-kmod zfs-2.4 zfs-2.4-kmod; do
+		test_install $i $ALTHOST
+	done
 	;;
 fedora*)
 	url='https://raw.githubusercontent.com/openzfs/openzfs-docs/refs/heads/master/docs/Getting%20Started/Fedora/index.rst'
 	name=$(curl -Ls $url | grep 'dnf install' | grep -Eo 'zfs-release-[0-9]+-[0-9]+')
 	sudo dnf -y install -y https://zfsonlinux.org/fedora/$name$(rpm --eval "%{dist}").noarch.rpm
-	test_install zfs $ALTHOST
+	for i in zfs zfs-latest zfs-legacy zfs-2.2 zfs-2.3 zfs-2.4 ; do
+		test_install $i $ALTHOST
+	done
 	;;
 esac
 echo "##[endgroup]"
@@ -0,0 +1,52 @@
+name: smatch
+
+on:
+  push:
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  smatch:
+    runs-on: ubuntu-24.04
+    steps:
+    - name: Checkout smatch
+      uses: actions/checkout@v4
+      with:
+        repository: error27/smatch
+        ref: master
+        path: smatch
+    - name: Install smatch dependencies
+      run: |
+        sudo apt-get install -y llvm gcc make sqlite3 libsqlite3-dev libdbd-sqlite3-perl libssl-dev libtry-tiny-perl
+    - name: Make smatch
+      run: |
+        cd $GITHUB_WORKSPACE/smatch
+        make -j$(nproc)
+    - name: Checkout OpenZFS
+      uses: actions/checkout@v4
+      with:
+        ref: ${{ github.event.pull_request.head.sha }}
+        path: zfs
+    - name: Install OpenZFS dependencies
+      run: |
+        cd $GITHUB_WORKSPACE/zfs
+        sudo apt-get purge -y snapd google-chrome-stable firefox
+        ONLY_DEPS=1 .github/workflows/scripts/qemu-3-deps-vm.sh ubuntu24
+    - name: Autogen.sh OpenZFS
+      run: |
+        cd $GITHUB_WORKSPACE/zfs
+        ./autogen.sh
+    - name: Configure OpenZFS
+      run: |
+        cd $GITHUB_WORKSPACE/zfs
+        ./configure --enable-debug
+    - name: Make OpenZFS
+      run: |
+        cd $GITHUB_WORKSPACE/zfs
+        make -j$(nproc) CHECK="$GITHUB_WORKSPACE/smatch/smatch" CC=$GITHUB_WORKSPACE/smatch/cgcc | tee $GITHUB_WORKSPACE/smatch.log
+    - name: Smatch results log
+      run: |
+        grep -E 'error:|warn:|warning:' $GITHUB_WORKSPACE/smatch.log
@@ -42,6 +42,12 @@ on:
        required: false
        default: ""
        description: "(optional) repo URL (blank: use http://download.zfsonlinux.org)"
+      lookup:
+        type: boolean
+        required: false
+        default: false
+        description: "(optional) do version lookup only on repo test"
+
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
@@ -52,7 +58,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora41', 'fedora42']
+        os: ['almalinux8', 'almalinux9', 'almalinux10', 'fedora41', 'fedora42', 'fedora43']
    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v4
@@ -60,20 +66,16 @@ jobs:
        ref: ${{ github.event.pull_request.head.sha }}

    - name: Setup QEMU
-      timeout-minutes: 10
      run: .github/workflows/scripts/qemu-1-setup.sh

    - name: Start build machine
-      timeout-minutes: 10
      run: .github/workflows/scripts/qemu-2-start.sh ${{ matrix.os }}

    - name: Install dependencies
-      timeout-minutes: 20
      run: |
        .github/workflows/scripts/qemu-3-deps.sh ${{ matrix.os }}

    - name: Build modules or Test repo
-      timeout-minutes: 30
      run: |
        set -e
        if [ "${{ github.event.inputs.test_type }}" == "Test repo" ] ; then
@@ -81,7 +83,12 @@ jobs:
                .github/workflows/scripts/qemu-prepare-for-build.sh

                mkdir -p /tmp/repo
-                ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }}
+                EXTRA=""
+                if [ "${{ github.event.inputs.lookup }}" == 'true' ] ; then
+                        EXTRA="--lookup"
+                fi
+
+                ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' $EXTRA ${{ github.event.inputs.repo_url }}
        else
                EXTRA=""
                if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
@@ -94,7 +101,6 @@ jobs:

    - name: Prepare artifacts
      if: always()
-      timeout-minutes: 10
      run: |
        rsync -a zfs@vm0:/tmp/repo /tmp || true
        .github/workflows/scripts/replace-dupes-with-symlinks.sh /tmp/repo
@@ -10,6 +10,11 @@ on:
        required: false
        default: ""
        description: "(optional) Experimental kernel version to install on Fedora (like '6.14' or '6.13.3-0.rc3')"
+      specific_os:
+        type: string
+        required: false
+        default: ""
+        description: "(optional) Only run on this specific OS (like 'fedora42' or 'alpine3-23')"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -29,33 +34,45 @@ jobs:
      - name: Generate OS config and CI type
        id: os
        run: |
-          FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
-          QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
+          ci_type="default"
+
          # determine CI type when running on PR
-          ci_type="full"
          if ${{ github.event_name == 'pull_request' }}; then
            head=${{ github.event.pull_request.head.sha }}
            base=${{ github.event.pull_request.base.sha }}
            ci_type=$(python3 .github/workflows/scripts/generate-ci-type.py $head $base)
          fi
-          if [ "$ci_type" == "quick" ]; then
-            os_selection="$QUICK_OS"
-          else
-            os_selection="$FULL_OS"
-          fi

-          if [ ${{ github.event.inputs.fedora_kernel_ver }} != "" ] ; then
-              # They specified a custom kernel version for Fedora.  Use only
-              # Fedora runners.
+          case "$ci_type" in
+          quick)
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd15-0s", "ubuntu24"]'
+            ;;
+          linux)
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "debian13", "fedora41", "fedora42", "fedora43", "ubuntu22", "ubuntu24"]'
+            ;;
+          freebsd)
+            os_selection='["freebsd13-5r", "freebsd14-2r", "freebsd14-3r", "freebsd13-5s", "freebsd14-3s", "freebsd15-0s", "freebsd16-0c"]'
+            ;;
+          *)
+            # default list
+            os_selection='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora42", "fedora43", "freebsd14-3r", "freebsd15-0s", "freebsd16-0c", "ubuntu22", "ubuntu24"]'
+            ;;
+          esac
+
+          if ${{ github.event.inputs.fedora_kernel_ver != '' }}; then
+              # They specified a custom kernel version for Fedora.
+              # Use only Fedora runners.
              os_json=$(echo ${os_selection} | jq -c '[.[] | select(startswith("fedora"))]')
+          elif ${{ github.event.inputs.specific_os != '' }}; then
+              # Use only the specified runner.
+              os_json=$(jq -cn --arg os "${{ github.event.inputs.specific_os }}" '[ $os ]')
          else
              # Normal case
              os_json=$(echo ${os_selection} | jq -c)
          fi

-          echo $os_json
-          echo "os=$os_json" >> $GITHUB_OUTPUT
-          echo "ci_type=$ci_type" >> $GITHUB_OUTPUT
+          echo "os=$os_json" | tee -a $GITHUB_OUTPUT
+          echo "ci_type=$ci_type" | tee -a $GITHUB_OUTPUT

  qemu-vm:
    name: qemu-x86
@@ -63,13 +80,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # rhl:     almalinux8, almalinux9, centos-stream9, fedora4x
+        # rhl:     almalinux8, almalinux9, centos-streamX, fedora4x
        # debian:  debian12, debian13, ubuntu22, ubuntu24
        # misc:    archlinux, tumbleweed
-        # FreeBSD variants of 2025-06:
+        # FreeBSD variants of November 2025:
        # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
-        # FreeBSD Stable:  freebsd13-5s, freebsd14-3s
-        # FreeBSD Current: freebsd15-0c
+        # FreeBSD Stable:  freebsd13-5s, freebsd14-3s, freebsd15-0s
+        # FreeBSD Current: freebsd16-0c
        os: ${{ fromJson(needs.test-config.outputs.test_os) }}
    runs-on: ubuntu-24.04
    steps:
@@ -78,7 +95,7 @@ jobs:
        ref: ${{ github.event.pull_request.head.sha }}

    - name: Setup QEMU
-      timeout-minutes: 10
+      timeout-minutes: 60
      run: .github/workflows/scripts/qemu-1-setup.sh

    - name: Start build machine
@@ -86,7 +103,7 @@ jobs:
      run: .github/workflows/scripts/qemu-2-start.sh ${{ matrix.os }}

    - name: Install dependencies
-      timeout-minutes: 20
+      timeout-minutes: 60
      run: .github/workflows/scripts/qemu-3-deps.sh ${{ matrix.os }} ${{ github.event.inputs.fedora_kernel_ver }}

    - name: Build modules
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.3.4
+Version:       2.3.6
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.16
+Linux-Maximum: 6.19
 Linux-Minimum: 4.18
@@ -264,9 +264,21 @@ cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)

 static int
 init_rand(void *data, size_t size, void *private)
+{
+	size_t *offsetp = (size_t *)private;
+	size_t offset = *offsetp;
+
+	VERIFY3U(offset + size, <=, SPA_MAXBLOCKSIZE);
+	memcpy(data, (char *)rand_data + offset, size);
+	*offsetp = offset + size;
+	return (0);
+}
+
+static int
+corrupt_rand_fill(void *data, size_t size, void *private)
 {
 	(void) private;
-	memcpy(data, rand_data, size);
+	memset(data, 0xAA, size);
 	return (0);
 }

@@ -278,7 +290,7 @@ corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
 		for (int i = 0; i < cnt; i++) {
 			raidz_col_t *col = &rr->rr_col[tgts[i]];
 			abd_iterate_func(col->rc_abd, 0, col->rc_size,
-			    init_rand, NULL);
+			    corrupt_rand_fill, NULL);
 		}
 	}
 }
@@ -286,7 +298,8 @@ corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
 void
 init_zio_abd(zio_t *zio)
 {
-	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
+	size_t offset = 0;
+	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, &offset);
 }

 static void
@@ -373,7 +386,7 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)

 	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

-	(*zio)->io_offset = 0;
+	(*zio)->io_offset = opts->rto_offset;
 	(*zio)->io_size = alloc_dsize;
 	(*zio)->io_abd = raidz_alloc(alloc_dsize);
 	init_zio_abd(*zio);
@@ -834,6 +847,8 @@ main(int argc, char **argv)
 		err = run_test(NULL);
 	}

+	mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ | PROT_WRITE);
+
 	umem_free(rand_data, SPA_MAXBLOCKSIZE);
 	kernel_fini();

@@ -72,7 +72,7 @@ typedef struct raidz_test_opts {

 static const raidz_test_opts_t rto_opts_defaults = {
 	.rto_ashift = 9,
-	.rto_offset = 1ULL << 0,
+	.rto_offset = 0,
 	.rto_dcols = 8,
 	.rto_dsize = 1<<19,
 	.rto_v = D_ALL,
@@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max;
 extern uint_t zfs_btree_verify_intensity;

 static const char cmdname[] = "zdb";
-uint8_t dump_opt[256];
+uint8_t dump_opt[512];
+
+#define	ALLOCATED_OPT	256

 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

@@ -381,7 +383,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
 	sublivelist_verify_block_t svb = {{{0}}};
 	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
 	DVA_SET_OFFSET(&svb.svb_dva, offset);
-	DVA_SET_ASIZE(&svb.svb_dva, size);
+	DVA_SET_ASIZE(&svb.svb_dva, 0);
 	zfs_btree_index_t where;
 	uint64_t end_offset = offset + size;

@@ -1651,6 +1653,16 @@ dump_metaslab_stats(metaslab_t *msp)
 	dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }

+/*
+ * Range tree walk callback for the allocated map.  'arg' points to a cursor
+ * holding the end of the previously visited segment.  Whatever lies between
+ * the cursor and this segment's start is reported as an "ALLOC:" line
+ * (offset, length); the cursor then moves to the end of this segment.
+ */
+static void
+dump_allocated(void *arg, uint64_t start, uint64_t size)
+{
+	uint64_t *cursor = arg;
+	uint64_t gap_start = *cursor;
+
+	if (gap_start != start) {
+		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", gap_start,
+		    start - gap_start);
+	}
+	*cursor = start + size;
+}
+
 static void
 dump_metaslab(metaslab_t *msp)
 {
@@ -1667,13 +1679,24 @@ dump_metaslab(metaslab_t *msp)
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);

-	if (dump_opt['m'] > 2 && !dump_opt['L']) {
+	if (dump_opt[ALLOCATED_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
+	}
+
+	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
-		metaslab_unload(msp);
-		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt[ALLOCATED_OPT]) {
+		/*
+		 * Walk the allocatable (free) segments; dump_allocated()
+		 * prints the gaps between them as allocated ranges and
+		 * leaves 'off' at the end of the last free segment.
+		 */
+		uint64_t off = msp->ms_start;
+		uint64_t end = msp->ms_start + msp->ms_size;
+		zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
+		    &off);
+		/*
+		 * Anything left between the last free segment and the end
+		 * of the metaslab is allocated too.  'off' is an absolute
+		 * offset, so the length is measured from the metaslab's
+		 * absolute end rather than from its size alone.
+		 */
+		if (off != end)
+			(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
+			    end - off);
 	}

 	if (dump_opt['m'] > 1 && sm != NULL &&
@@ -1688,6 +1711,12 @@ dump_metaslab(metaslab_t *msp)
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}

+	if (dump_opt[ALLOCATED_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
+		metaslab_unload(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (vd->vdev_ops == &vdev_draid_ops)
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	else
@@ -1724,8 +1753,9 @@ print_vdev_metaslab_header(vdev_t *vd)
 		}
 	}

-	(void) printf("\tvdev %10llu   %s",
-	    (u_longlong_t)vd->vdev_id, bias_str);
+	(void) printf("\tvdev %10llu\t%s  metaslab shift %4llu",
+	    (u_longlong_t)vd->vdev_id, bias_str,
+	    (u_longlong_t)vd->vdev_ms_shift);

 	if (ms_flush_data_obj != 0) {
 		(void) printf("   ms_unflushed_phys object %llu",
@@ -9318,6 +9348,8 @@ main(int argc, char **argv)
 		{"all-reconstruction",	no_argument,		NULL, 'Y'},
 		{"livelist",		no_argument,		NULL, 'y'},
 		{"zstd-headers",	no_argument,		NULL, 'Z'},
+		{"allocated-map",	no_argument,		NULL,
+		    ALLOCATED_OPT},
 		{0, 0, 0, 0}
 	};

@@ -9348,6 +9380,7 @@ main(int argc, char **argv)
 		case 'u':
 		case 'y':
 		case 'Z':
+		case ALLOCATED_OPT:
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
@@ -29,6 +29,6 @@
 #define	_ZDB_H

 void dump_intent_log(zilog_t *);
-extern uint8_t dump_opt[256];
+extern uint8_t dump_opt[512];

 #endif	/* _ZDB_H */
@@ -48,8 +48,6 @@

 #include "zdb.h"

-extern uint8_t dump_opt[256];
-
 static char tab_prefix[4] = "\t\t\t";

 static void
@@ -930,19 +930,15 @@ usage:
 }

 /*
- * Return a default volblocksize for the pool which always uses more than
- * half of the data sectors.  This primarily applies to dRAID which always
- * writes full stripe widths.
+ * Calculate the minimum allocation size based on the top-level vdevs.
 */
 static uint64_t
-default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+calculate_volblocksize(nvlist_t *config)
 {
-	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+	uint64_t asize = SPA_MINBLOCKSIZE;
 	nvlist_t *tree, **vdevs;
 	uint_t nvdevs;

-	nvlist_t *config = zpool_get_config(zhp, NULL);
-
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
 	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
 	    &vdevs, &nvdevs) != 0) {
@@ -973,6 +969,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
 		}
 	}

+	return (asize);
+}
+
+/*
+ * Return a default volblocksize for the pool which always uses more than
+ * half of the data sectors.  This primarily applies to dRAID which always
+ * writes full stripe widths.
+ */
+static uint64_t
+default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+{
+	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0)
+		asize = calculate_volblocksize(config);
+
 	/*
 	 * Calculate the target volblocksize such that more than half
 	 * of the asize is used. The following table is for 4k sectors.
@@ -145,11 +145,11 @@ zfs_project_handle_one(const char *name, zfs_project_control_t *zpc)
 	switch (zpc->zpc_op) {
 	case ZFS_PROJECT_OP_LIST:
 		(void) printf("%5u %c %s\n", fsx.fsx_projid,
-		    (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) ? 'P' : '-', name);
+		    (fsx.fsx_xflags & FS_XFLAG_PROJINHERIT) ? 'P' : '-', name);
 		goto out;
 	case ZFS_PROJECT_OP_CHECK:
 		if (fsx.fsx_projid == zpc->zpc_expected_projid &&
-		    fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
+		    fsx.fsx_xflags & FS_XFLAG_PROJINHERIT)
 			goto out;

 		if (!zpc->zpc_newline) {
@@ -164,29 +164,30 @@ zfs_project_handle_one(const char *name, zfs_project_control_t *zpc)
 			    "(%u/%u)\n", name, fsx.fsx_projid,
 			    (uint32_t)zpc->zpc_expected_projid);

-		if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+		if (!(fsx.fsx_xflags & FS_XFLAG_PROJINHERIT))
 			(void) printf("%s - project inherit flag is not set\n",
 			    name);

 		goto out;
 	case ZFS_PROJECT_OP_CLEAR:
-		if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL) &&
+		if (!(fsx.fsx_xflags & FS_XFLAG_PROJINHERIT) &&
 		    (zpc->zpc_keep_projid ||
 		    fsx.fsx_projid == ZFS_DEFAULT_PROJID))
 			goto out;

-		fsx.fsx_xflags &= ~ZFS_PROJINHERIT_FL;
+		fsx.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
 		if (!zpc->zpc_keep_projid)
 			fsx.fsx_projid = ZFS_DEFAULT_PROJID;
 		break;
 	case ZFS_PROJECT_OP_SET:
 		if (fsx.fsx_projid == zpc->zpc_expected_projid &&
-		    (!zpc->zpc_set_flag || fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+		    (!zpc->zpc_set_flag ||
+		    fsx.fsx_xflags & FS_XFLAG_PROJINHERIT))
 			goto out;

 		fsx.fsx_projid = zpc->zpc_expected_projid;
 		if (zpc->zpc_set_flag)
-			fsx.fsx_xflags |= ZFS_PROJINHERIT_FL;
+			fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT;
 		break;
 	default:
 		ASSERT(0);
@@ -194,11 +195,30 @@ zfs_project_handle_one(const char *name, zfs_project_control_t *zpc)
 	}

 	ret = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx);
-	if (ret)
+	if (ret) {
+		/*
+		 * Remember the ioctl's errno now: the stdio/gettext calls
+		 * below are free to overwrite it before we test it.
+		 */
+		int ioctl_errno = errno;
+
 		(void) fprintf(stderr,
 		    gettext("failed to set xattr for %s: %s\n"),
 		    name, strerror(errno));
 
+		if (ioctl_errno == ENOTSUP) {
+			/* NULL when the module version cannot be read. */
+			char *kver = zfs_version_kernel();
+			/*
+			 * Special case: a module/userspace version mismatch can
+			 * return ENOTSUP due to us fixing the XFLAGs bits in
+			 * #17884.  In that case give a hint to the user that
+			 * they should take action to make the versions match.
+			 */
+			if (kver != NULL &&
+			    strcmp(kver, ZFS_META_ALIAS) != 0) {
+				fprintf(stderr,
+				    gettext("Warning: The zfs module version "
+				    "(%s) and userspace\nversion (%s) do not "
+				    "match up.  This may be the\ncause of the "
+				    "\"Operation not supported\" error.\n"),
+				    kver, ZFS_META_ALIAS);
+			}
+			/* The version string is allocated for the caller. */
+			free(kver);
+		}
+	}
+
 out:
 	close(fd);
 	return (ret);
@@ -52,12 +52,15 @@
 #include <sys/zio_compress.h>
 #include <sys/zfeature.h>
 #include <sys/dmu_tx.h>
+#include <sys/backtrace.h>
 #include <zfeature_common.h>
 #include <libzutil.h>
+#include <sys/metaslab_impl.h>

 static importargs_t g_importargs;
 static char *g_pool;
 static boolean_t g_readonly;
+static boolean_t g_dump_dbgmsg;

 typedef enum {
 	ZHACK_REPAIR_OP_UNKNOWN  = 0,
@@ -69,11 +72,23 @@ static __attribute__((noreturn)) void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage: zhack [-c cachefile] [-d dir] <subcommand> <args> ...\n"
-	    "where <subcommand> <args> is one of the following:\n"
+	    "Usage: zhack [-o tunable] [-c cachefile] [-d dir] [-G] "
+	    "<subcommand> <args> ...\n"
+	    "       where <subcommand> <args> is one of the following:\n"
 	    "\n");

 	(void) fprintf(stderr,
+	    "    global options:\n"
+	    "    -c <cachefile>   reads config from the given cachefile\n"
+	    "    -d <dir>         directory with vdevs for import\n"
+	    "    -o var=value...  set global variable to an unsigned "
+	    "32-bit integer\n"
+	    "    -G               dump zfs_dbgmsg buffer before exiting\n"
+	    "\n"
+	    "    action idle <pool> [-f] [-t seconds]\n"
+	    "        import the pool for a set time then export it\n"
+	    "        -t <seconds> sets the time the pool is imported\n"
+	    "\n"
 	    "    feature stat <pool>\n"
 	    "        print information about enabled features\n"
 	    "    feature enable [-r] [-d desc] <pool> <feature>\n"
@@ -93,10 +108,46 @@ usage(void)
 	    "        -c repair corrupted label checksums\n"
 	    "        -u restore the label on a detached device\n"
 	    "\n"
-	    "    <device> : path to vdev\n");
+	    "    <device> : path to vdev\n"
+	    "\n"
+	    "    metaslab leak <pool>\n"
+	    "        apply allocation map from zdb to specified pool\n");
 	exit(1);
 }

+/*
+ * Print the zfs_dbgmsg buffer to stderr, but only when dumping was
+ * requested through g_dump_dbgmsg; otherwise this is a no-op.
+ */
+static void
+dump_debug_buffer(void)
+{
+	if (g_dump_dbgmsg) {
+		/*
+		 * write() is used rather than printf() so that calling
+		 * this from a signal handler remains safe.
+		 */
+		ssize_t nwritten __attribute__((unused)) =
+		    write(STDERR_FILENO, "\n", 1);
+		zfs_dbgmsg_print(STDERR_FILENO, "zhack");
+	}
+}
+
+/*
+ * Signal handler: print a backtrace and (if enabled) the debug message
+ * buffer to stderr, then let the signal take its default course.
+ */
+static void
+sig_handler(int signo)
+{
+	struct sigaction dfl;
+
+	/* Diagnostics first, while we still can. */
+	libspl_backtrace(STDERR_FILENO);
+	dump_debug_buffer();
+
+	/*
+	 * Reinstall the default disposition and raise the signal again,
+	 * so that SIGSEGV and SIGABRT can still produce a core dump.
+	 */
+	dfl.sa_handler = SIG_DFL;
+	dfl.sa_flags = 0;
+	sigemptyset(&dfl.sa_mask);
+	(void) sigaction(signo, &dfl, NULL);
+	raise(signo);
+}

 static __attribute__((format(printf, 3, 4))) __attribute__((noreturn)) void
 fatal(spa_t *spa, const void *tag, const char *fmt, ...)
@@ -114,6 +165,8 @@ fatal(spa_t *spa, const void *tag, const char *fmt, ...)
 	va_end(ap);
 	(void) fputc('\n', stderr);

+	dump_debug_buffer();
+
 	exit(1);
 }

@@ -169,7 +222,7 @@ zhack_import(char *target, boolean_t readonly)

 	zfeature_checks_disable = B_TRUE;
 	error = spa_import(target, config, props,
-	    (readonly ?  ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
+	    (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
 	fnvlist_free(config);
 	zfeature_checks_disable = B_FALSE;
 	if (error == EEXIST)
@@ -363,10 +416,12 @@ feature_incr_sync(void *arg, dmu_tx_t *tx)
 	zfeature_info_t *feature = arg;
 	uint64_t refcount;

+	mutex_enter(&spa->spa_feat_stats_lock);
 	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
 	feature_sync(spa, feature, refcount + 1, tx);
 	spa_history_log_internal(spa, "zhack feature incr", tx,
 	    "name=%s", feature->fi_guid);
+	mutex_exit(&spa->spa_feat_stats_lock);
 }

 static void
@@ -376,10 +431,12 @@ feature_decr_sync(void *arg, dmu_tx_t *tx)
 	zfeature_info_t *feature = arg;
 	uint64_t refcount;

+	mutex_enter(&spa->spa_feat_stats_lock);
 	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
 	feature_sync(spa, feature, refcount - 1, tx);
 	spa_history_log_internal(spa, "zhack feature decr", tx,
 	    "name=%s", feature->fi_guid);
+	mutex_exit(&spa->spa_feat_stats_lock);
 }

 static void
@@ -496,6 +553,259 @@ zhack_do_feature(int argc, char **argv)
 	return (0);
 }

+/*
+ * "zhack action idle [-t seconds] <pool>": open the pool, keep it open for
+ * the requested number of seconds (0 when -t is not given), then close it.
+ *
+ * The -t argument must be a complete, non-negative integer that fits in an
+ * int; anything else prints an error followed by the usage text and exits.
+ */
+static void
+zhack_do_action_idle(int argc, char **argv)
+{
+	spa_t *spa;
+	char *target, *tmp;
+	int idle_time = 0;
+	long val;
+	int c;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "+t:")) != -1) {
+		switch (c) {
+		case 't':
+			/* strtol() reports overflow only through errno. */
+			errno = 0;
+			val = strtol(optarg, &tmp, 0);
+			if (tmp == optarg || *tmp != '\0') {
+				(void) fprintf(stderr, "error: time must "
+				    "be an integer in seconds: %s\n", optarg);
+				usage();
+			}
+			/*
+			 * Reject values that strtol() clamped or that do
+			 * not survive the narrowing to int, instead of
+			 * silently idling for a truncated duration.
+			 */
+			if (errno == ERANGE || val != (long)(int)val) {
+				(void) fprintf(stderr, "error: time is "
+				    "out of range: %s\n", optarg);
+				usage();
+			}
+			idle_time = (int)val;
+			if (idle_time < 0) {
+				(void) fprintf(stderr, "error: time must "
+				    "not be negative: %d\n", idle_time);
+				usage();
+			}
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, "error: missing pool name\n");
+		usage();
+	}
+	target = argv[0];
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+
+	fprintf(stdout, "Imported pool %s, idle for %d seconds\n",
+	    target, idle_time);
+	sleep(idle_time);
+
+	spa_close(spa, FTAG);
+}
+
+/*
+ * Dispatcher for "zhack action <operation> ...".  argv[0] is the word
+ * "action" on entry; it is dropped so the operation handler sees its own
+ * name as argv[0].  Only "idle" is implemented.  A missing or unknown
+ * operation prints an error followed by the usage text, which exits.
+ * Always returns 0.
+ */
+static int
+zhack_do_action(int argc, char **argv)
+{
+	char *subcommand;
+
+	argc--;
+	argv++;
+	if (argc == 0) {
+		/* Name the subcommand the user actually typed. */
+		(void) fprintf(stderr,
+		    "error: no action operation specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+	if (strcmp(subcommand, "idle") == 0) {
+		zhack_do_action_idle(argc, argv);
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+
+	return (0);
+}
+
+
+/*
+ * Return whether string 'a' begins with the prefix 'b'.
+ */
+static boolean_t
+strstarts(const char *a, const char *b)
+{
+	size_t prefix_len = strlen(b);
+
+	return (strncmp(a, b, prefix_len) == 0);
+}
+
+/*
+ * Mark the range [start, start + size) of a metaslab as allocated in the
+ * txg of 'tx'.  Every piece of the range that is still present in
+ * ms_allocatable is removed from that tree and added to the txg's
+ * ms_allocating tree; pieces that are not allocatable are skipped.
+ * The metaslab must be disabled and its ms_lock held by the caller.
+ */
+static void
+metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
+    dmu_tx_t *tx)
+{
+	ASSERT(msp->ms_disabled);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	uint64_t txg = dmu_tx_get_txg(tx);
+	uint64_t slot = txg & TXG_MASK;
+	uint64_t end = start + size;
+	uint64_t cursor = start;
+
+	while (cursor < end) {
+		uint64_t seg_start, seg_size;
+
+		/* Next allocatable piece at or after the cursor, if any. */
+		if (!zfs_range_tree_find_in(msp->ms_allocatable, cursor,
+		    end - cursor, &seg_start, &seg_size))
+			break;
+		zfs_range_tree_remove(msp->ms_allocatable, seg_start,
+		    seg_size);
+
+		/* First allocation in this txg: dirty the metaslab. */
+		if (zfs_range_tree_is_empty(msp->ms_allocating[slot])) {
+			vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp,
+			    txg);
+		}
+
+		zfs_range_tree_add(msp->ms_allocating[slot], seg_start,
+		    seg_size);
+		msp->ms_allocating_total += seg_size;
+		cursor = seg_start + seg_size;
+	}
+}
+
+/*
+ * Finish the work on one metaslab for "metaslab leak": commit the open
+ * transaction, drop the metaslab lock and re-enable the metaslab.
+ */
+static void
+metaslab_leak_done(metaslab_t *msp, dmu_tx_t *tx)
+{
+	dmu_tx_commit(tx);
+	mutex_exit(&msp->ms_lock);
+	metaslab_enable(msp, B_FALSE, B_FALSE);
+}
+
+/*
+ * "zhack metaslab leak [-f] <pool>": read an allocation map from stdin and
+ * force the listed ranges to be allocated in the given pool.
+ *
+ * Three kinds of input lines are recognized; all others are ignored:
+ *   "\tvdev ..."       selects the top-level vdev and checks its metaslab
+ *                      shift against the pool,
+ *   "\tmetaslabs ..."  checks the metaslab count (skipped with -f),
+ *   "ALLOC: <offset> <length>"  marks that range allocated.
+ *
+ * Processing stops at the first inconsistency, after an error message.
+ * Work done before that point has already been committed.
+ */
+static void
+zhack_do_metaslab_leak(int argc, char **argv)
+{
+	int c;
+	char *target;
+	spa_t *spa;
+
+	optind = 1;
+	boolean_t force = B_FALSE;
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, "error: missing pool name\n");
+		usage();
+	}
+	target = argv[0];
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER);
+
+	char *line = NULL;
+	size_t cap = 0;
+
+	vdev_t *vd = NULL;
+	metaslab_t *prev = NULL;
+	dmu_tx_t *tx = NULL;
+	while (getline(&line, &cap, stdin) > 0) {
+		if (strstarts(line, "\tvdev ")) {
+			uint64_t vdev_id, ms_shift;
+			/*
+			 * Try the header form with a bias string first.
+			 * Whenever that does not yield both values, fall
+			 * back to the form without one and insist on it, so
+			 * neither variable is ever used uninitialized.
+			 */
+			if (sscanf(line,
+			    "\tvdev %10"PRIu64"\t%*s  metaslab shift %4"PRIu64,
+			    &vdev_id, &ms_shift) != 2) {
+				VERIFY3U(sscanf(line, "\tvdev %"PRIu64
+				    "\t  metaslab shift %4"PRIu64,
+				    &vdev_id, &ms_shift), ==, 2);
+			}
+			vd = vdev_lookup_top(spa, vdev_id);
+			if (vd == NULL) {
+				fprintf(stderr, "error: no such vdev with "
+				    "id %"PRIu64"\n", vdev_id);
+				break;
+			}
+			/* Close out the metaslab of the previous vdev. */
+			if (tx) {
+				metaslab_leak_done(prev, tx);
+				tx = NULL;
+				prev = NULL;
+			}
+			if (vd->vdev_ms_shift != ms_shift) {
+				fprintf(stderr, "error: ms_shift mismatch: %"
+				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift,
+				    ms_shift);
+				break;
+			}
+		} else if (strstarts(line, "\tmetaslabs ")) {
+			uint64_t ms_count;
+			VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count),
+			    ==, 1);
+			/* Input is external: check at run time, not ASSERT. */
+			if (vd == NULL) {
+				fprintf(stderr, "error: metaslabs line "
+				    "before any vdev line\n");
+				break;
+			}
+			if (!force && vd->vdev_ms_count != ms_count) {
+				fprintf(stderr, "error: ms_count mismatch: %"
+				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_count,
+				    ms_count);
+				break;
+			}
+		} else if (strstarts(line, "ALLOC:")) {
+			uint64_t start, size;
+			VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n",
+			    &start, &size), ==, 2);
+
+			if (vd == NULL) {
+				fprintf(stderr, "error: ALLOC line before "
+				    "any vdev line\n");
+				break;
+			}
+			/*
+			 * With -f the metaslab count may differ from the
+			 * map's, so never index past the vdev's array.
+			 */
+			uint64_t ms_id = start >> vd->vdev_ms_shift;
+			if (ms_id >= vd->vdev_ms_count) {
+				fprintf(stderr, "error: offset %"PRIu64
+				    " is outside vdev %"PRIu64"\n", start,
+				    vd->vdev_id);
+				break;
+			}
+			metaslab_t *cur = vd->vdev_ms[ms_id];
+			if (prev != cur) {
+				if (prev)
+					metaslab_leak_done(prev, tx);
+				ASSERT(cur);
+				metaslab_disable(cur);
+				mutex_enter(&cur->ms_lock);
+				VERIFY0(metaslab_load(cur));
+				prev = cur;
+				tx = dmu_tx_create_dd(
+				    spa_get_dsl(vd->vdev_spa)->dp_root_dir);
+				VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
+			}
+
+			metaslab_force_alloc(cur, start, size, tx);
+		} else {
+			continue;
+		}
+	}
+	/* Close out whichever metaslab was still being worked on. */
+	if (tx)
+		metaslab_leak_done(prev, tx);
+	free(line);
+
+	spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG);
+	spa_close(spa, FTAG);
+}
+
+/*
+ * Dispatcher for "zhack metaslab <operation> ...".  argv[0] is the word
+ * "metaslab" on entry.  Only the "leak" operation exists; a missing or
+ * unknown operation prints an error and the usage text, which does not
+ * return.  Always returns 0.
+ */
+static int
+zhack_do_metaslab(int argc, char **argv)
+{
+	char *op;
+
+	/* Step past the "metaslab" word itself. */
+	argv++;
+	argc--;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no metaslab operation specified\n");
+		usage();
+	}
+
+	op = argv[0];
+	if (strcmp(op, "leak") != 0) {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    op);
+		usage();
+	}
+
+	zhack_do_metaslab_leak(argc, argv);
+	return (0);
+}
+
 #define	ASHIFT_UBERBLOCK_SHIFT(ashift)	\
 	MIN(MAX(ashift, UBERBLOCK_SHIFT), \
 	MAX_UBERBLOCK_SHIFT)
@@ -971,17 +1281,35 @@ zhack_do_label(int argc, char **argv)
 int
 main(int argc, char **argv)
 {
+	struct sigaction action;
 	char *path[MAX_NUM_PATHS];
 	const char *subcommand;
 	int rv = 0;
 	int c;

+	/*
+	 * Set up signal handlers, so if we crash due to bad on-disk data we
+	 * can get more info. Unlike ztest, we don't bail out if we can't set
+	 * up signal handlers, because zhack is very useful without them.
+	 */
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zhack: cannot catch SIGSEGV: %s\n",
+		    strerror(errno));
+	}
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "zhack: cannot catch SIGABRT: %s\n",
+		    strerror(errno));
+	}
+
 	g_importargs.path = path;

 	dprintf_setup(&argc, argv);
 	zfs_prop_init();

-	while ((c = getopt(argc, argv, "+c:d:")) != -1) {
+	while ((c = getopt(argc, argv, "+c:d:Go:")) != -1) {
 		switch (c) {
 		case 'c':
 			g_importargs.cachefile = optarg;
@@ -990,6 +1318,13 @@ main(int argc, char **argv)
 			assert(g_importargs.paths < MAX_NUM_PATHS);
 			g_importargs.path[g_importargs.paths++] = optarg;
 			break;
+		case 'G':
+			g_dump_dbgmsg = B_TRUE;
+			break;
+		case 'o':
+			if (set_global_var(optarg) != 0)
+				exit(1);
+			break;
 		default:
 			usage();
 			break;
@@ -1007,10 +1342,14 @@ main(int argc, char **argv)

 	subcommand = argv[0];

-	if (strcmp(subcommand, "feature") == 0) {
+	if (strcmp(subcommand, "action") == 0) {
+		rv = zhack_do_action(argc, argv);
+	} else if (strcmp(subcommand, "feature") == 0) {
 		rv = zhack_do_feature(argc, argv);
 	} else if (strcmp(subcommand, "label") == 0) {
 		return (zhack_do_label(argc, argv));
+	} else if (strcmp(subcommand, "metaslab") == 0) {
+		rv = zhack_do_metaslab(argc, argv);
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
@@ -1022,6 +1361,9 @@ main(int argc, char **argv)
 		    "changes may not be committed to disk\n");
 	}

+	if (g_dump_dbgmsg)
+		dump_debug_buffer();
+
 	kernel_fini();

 	return (rv);
@@ -3883,6 +3883,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
 			    hostid, ctime(&timestamp));
 		}

+		if (getenv("ZFS_LOAD_INFO_DEBUG"))
+			dump_nvlist(nvinfo, 4);
+
 		return (1);
 	}

@@ -270,14 +270,13 @@ is_spare(nvlist_t *config, const char *path)
 *	draid*		Virtual dRAID spare
 */
 static nvlist_t *
-make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
+make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift)
 {
 	char path[MAXPATHLEN];
 	struct stat64 statbuf;
 	nvlist_t *vdev = NULL;
 	const char *type = NULL;
 	boolean_t wholedisk = B_FALSE;
-	uint64_t ashift = 0;
 	int err;

 	/*
@@ -381,31 +380,6 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
 		    (uint64_t)wholedisk) == 0);

-	/*
-	 * Override defaults if custom properties are provided.
-	 */
-	if (props != NULL) {
-		const char *value = NULL;
-
-		if (nvlist_lookup_string(props,
-		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
-			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
-				(void) fprintf(stderr,
-				    gettext("ashift must be a number.\n"));
-				return (NULL);
-			}
-			if (ashift != 0 &&
-			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
-				(void) fprintf(stderr,
-				    gettext("invalid 'ashift=%" PRIu64 "' "
-				    "property: only values between %" PRId32 " "
-				    "and %" PRId32 " are allowed.\n"),
-				    ashift, ASHIFT_MIN, ASHIFT_MAX);
-				return (NULL);
-			}
-		}
-	}
-
 	/*
 	 * If the device is known to incorrectly report its physical sector
 	 * size explicitly provide the known correct value.
@@ -610,22 +584,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
 				    ZPOOL_CONFIG_PATH, &path) == 0);

 				/*
-				 * If we have a raidz/mirror that combines disks
-				 * with files, report it as an error.
+				 * Skip active spares; they should never cause
+				 * the pool to be evaluated as inconsistent.
 				 */
-				if (!dontreport && type != NULL &&
+				if (is_spare(NULL, path))
+					continue;
+
+				/*
+				 * If we have a raidz/mirror that combines disks
+				 * with files, only report it as an error when
+				 * fatal is set to ensure all the replication
+				 * checks aren't skipped in check_replication().
+				 */
+				if (fatal && !dontreport && type != NULL &&
 				    strcmp(type, childtype) != 0) {
 					if (ret != NULL)
 						free(ret);
 					ret = NULL;
-					if (fatal)
-						vdev_error(gettext(
-						    "mismatched replication "
-						    "level: %s contains both "
-						    "files and devices\n"),
-						    rep.zprl_type);
-					else
-						return (NULL);
+					vdev_error(gettext(
+					    "mismatched replication "
+					    "level: %s contains both "
+					    "files and devices\n"),
+					    rep.zprl_type);
 					dontreport = B_TRUE;
 				}

@@ -1496,6 +1476,29 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 	const char *type, *fulltype;
 	boolean_t is_log, is_special, is_dedup, is_spare;
 	boolean_t seen_logs;
+	uint64_t ashift = 0;
+
+	if (props != NULL) {
+		const char *value = NULL;
+
+		if (nvlist_lookup_string(props,
+		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
+			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
+				(void) fprintf(stderr,
+				    gettext("ashift must be a number.\n"));
+				return (NULL);
+			}
+			if (ashift != 0 &&
+			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
+				(void) fprintf(stderr,
+				    gettext("invalid 'ashift=%" PRIu64 "' "
+				    "property: only values between %" PRId32 " "
+				    "and %" PRId32 " are allowed.\n"),
+				    ashift, ASHIFT_MIN, ASHIFT_MAX);
+				return (NULL);
+			}
+		}
+	}

 	top = NULL;
 	toplevels = 0;
@@ -1602,9 +1605,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 				    children * sizeof (nvlist_t *));
 				if (child == NULL)
 					zpool_no_memory();
-				if ((nv = make_leaf_vdev(props, argv[c],
+				if ((nv = make_leaf_vdev(argv[c],
 				    !(is_log || is_special || is_dedup ||
-				    is_spare))) == NULL) {
+				    is_spare), ashift)) == NULL) {
 					for (c = 0; c < children - 1; c++)
 						nvlist_free(child[c]);
 					free(child);
@@ -1668,6 +1671,10 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
 					    VDEV_ALLOC_BIAS_DEDUP) == 0);
 				}
+				if (ashift > 0) {
+					fnvlist_add_uint64(nv,
+					    ZPOOL_CONFIG_ASHIFT, ashift);
+				}
 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
 					verify(nvlist_add_uint64(nv,
 					    ZPOOL_CONFIG_NPARITY,
@@ -1695,8 +1702,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)
 			 * We have a device.  Pass off to make_leaf_vdev() to
 			 * construct the appropriate nvlist describing the vdev.
 			 */
-			if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
-			    is_special || is_dedup || is_spare))) == NULL)
+			if ((nv = make_leaf_vdev(argv[0], !(is_log ||
+			    is_special || is_dedup || is_spare),
+			    ashift)) == NULL)
 				goto spec_out;

 			verify(nvlist_add_uint64(nv,
@@ -29,9 +29,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [
 		const char *path = "path";
 		fmode_t mode = 0;
 		void *holder = NULL;
-		struct blk_holder_ops h;

-		bdev = blkdev_get_by_path(path, mode, holder, &h);
+		bdev = blkdev_get_by_path(path, mode, holder, NULL);
 	])
 ])

@@ -48,9 +47,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [
 		const char *path = "path";
 		fmode_t mode = 0;
 		void *holder = NULL;
-		struct blk_holder_ops h;

-		bdh = bdev_open_by_path(path, mode, holder, &h);
+		bdh = bdev_open_by_path(path, mode, holder, NULL);
 	])
 ])

@@ -68,9 +66,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [
 		const char *path = "path";
 		fmode_t mode = 0;
 		void *holder = NULL;
-		struct blk_holder_ops h;

-		file = bdev_file_open_by_path(path, mode, holder, &h);
+		file = bdev_file_open_by_path(path, mode, holder, NULL);
 	])
 ])

@@ -119,15 +119,49 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [
 	])
 ])

+dnl #
+dnl # 6.18 API change
+dnl # block_device_operations->getgeo takes struct gendisk* as first arg
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK], [
+	ZFS_LINUX_TEST_SRC([block_device_operations_getgeo_gendisk], [
+		#include <linux/blkdev.h>
+
+		static int blk_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+		{
+			(void) disk, (void) geo;
+			return (0);
+		}
+
+		static const struct block_device_operations
+		    bops __attribute__ ((unused)) = {
+			.getgeo	= blk_getgeo,
+		};
+	], [], [])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK], [
+	AC_MSG_CHECKING([whether bops->getgeo() takes gendisk as first arg])
+	ZFS_LINUX_TEST_RESULT([block_device_operations_getgeo_gendisk], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK], [1],
+			[Define if getgeo() in block_device_operations takes struct gendisk * as its first arg])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [
 	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
 	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
 	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
+	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
 ])

 AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS], [
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
+	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
 ])
@@ -24,6 +24,9 @@ dnl #
 dnl # 2.6.38 API change
 dnl # Added d_set_d_op() helper function.
 dnl #
+dnl # 6.17 API change
+dnl # d_set_d_op() removed. No direct replacement.
+dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [
 	ZFS_LINUX_TEST_SRC([d_set_d_op], [
 		#include <linux/dcache.h>
@@ -34,22 +37,46 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [

 AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [
 	AC_MSG_CHECKING([whether d_set_d_op() is available])
-	ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op],
-	    [d_set_d_op], [fs/dcache.c], [
+	ZFS_LINUX_TEST_RESULT([d_set_d_op], [
 		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_D_SET_D_OP, 1,
+		    [Define if d_set_d_op() is available])
 	], [
-		ZFS_LINUX_TEST_ERROR([d_set_d_op])
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 6.17 API change
+dnl # sb->s_d_op removed; set_default_d_op(sb, dop) added
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP], [
+	ZFS_LINUX_TEST_SRC([set_default_d_op], [
+		#include <linux/dcache.h>
+	], [
+		set_default_d_op(NULL, NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SET_DEFAULT_D_OP], [
+	AC_MSG_CHECKING([whether set_default_d_op() is available])
+	ZFS_LINUX_TEST_RESULT([set_default_d_op], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SET_DEFAULT_D_OP, 1,
+		    [Define if set_default_d_op() is available])
+	], [
+		AC_MSG_RESULT(no)
 	])
 ])

 AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [
        ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS
        ZFS_AC_KERNEL_SRC_D_SET_D_OP
-        ZFS_AC_KERNEL_SRC_S_D_OP
+        ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP
 ])

 AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [
        ZFS_AC_KERNEL_D_OBTAIN_ALIAS
        ZFS_AC_KERNEL_D_SET_D_OP
-        ZFS_AC_KERNEL_S_D_OP
+        ZFS_AC_KERNEL_SET_DEFAULT_D_OP
 ])
@@ -0,0 +1,24 @@
+dnl #
+dnl # 6.18 API change
+dnl # - generic_drop_inode() renamed to inode_generic_drop()
+dnl # - generic_delete_inode() renamed to inode_just_drop()
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GENERIC_DROP], [
+	ZFS_LINUX_TEST_SRC([inode_generic_drop], [
+		#include <linux/fs.h>
+	],[
+		struct inode *ip = NULL;
+		inode_generic_drop(ip);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_GENERIC_DROP], [
+	AC_MSG_CHECKING([whether inode_generic_drop() exists])
+	ZFS_LINUX_TEST_RESULT([inode_generic_drop], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_GENERIC_DROP, 1,
+			[inode_generic_drop() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -0,0 +1,23 @@
+dnl #
+dnl # 6.19 API change. inode->i_state no longer accessible directly; helper
+dnl # functions exist.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_STATE_READ_ONCE], [
+	ZFS_LINUX_TEST_SRC([inode_state_read_once], [
+		#include <linux/fs.h>
+	], [
+		struct inode i = {};
+		inode_state_read_once(&i);
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_STATE_READ_ONCE], [
+	AC_MSG_CHECKING([whether inode_state_read_once() exists])
+	ZFS_LINUX_TEST_RESULT([inode_state_read_once], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_STATE_READ_ONCE, 1,
+		    [inode_state_read_once() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
@@ -7,7 +7,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS], [
 	ZFS_LINUX_TEST_SRC([kmap_atomic], [
 		#include <linux/pagemap.h>
 	],[
-		struct page page;
+		struct page page = {};
 		kmap_atomic(&page);
 	])
 ])
@@ -16,9 +16,36 @@ AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_FLAG_ERROR], [
 	])
 ])

+dnl #
+dnl # Linux 6.18+ uses a struct typedef (memdesc_flags_t) instead of an
+dnl # 'unsigned long' for the 'flags' field in 'struct page'.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_FLAGS_STRUCT], [
+	ZFS_LINUX_TEST_SRC([mm_page_flags_struct], [
+		#include <linux/mm.h>
+
+		static const struct page p __attribute__ ((unused)) = {
+			.flags = { .f = 0 }
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_FLAGS_STRUCT], [
+	AC_MSG_CHECKING([whether 'flags' in 'struct page' is a struct])
+	ZFS_LINUX_TEST_RESULT([mm_page_flags_struct], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_MM_PAGE_FLAGS_STRUCT, 1,
+			['flags' in 'struct page' is a struct])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_FLAGS], [
 	ZFS_AC_KERNEL_SRC_MM_PAGE_FLAG_ERROR
+	ZFS_AC_KERNEL_SRC_MM_PAGE_FLAGS_STRUCT
 ])
 AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_FLAGS], [
 	ZFS_AC_KERNEL_MM_PAGE_FLAG_ERROR
+	ZFS_AC_KERNEL_MM_PAGE_FLAGS_STRUCT
 ])
@@ -0,0 +1,31 @@
+dnl #
+dnl # 6.18 API change
+dnl # ns->ops->type was moved to ns->ns.ns_type (struct ns_common)
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_NS_COMMON_TYPE], [
+	ZFS_LINUX_TEST_SRC([ns_common_type], [
+		#include <linux/user_namespace.h>
+	],[
+		struct user_namespace ns;
+		ns.ns.ns_type = 0;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_NS_COMMON_TYPE], [
+	AC_MSG_CHECKING([whether ns_type is accessible through ns_common])
+	ZFS_LINUX_TEST_RESULT([ns_common_type], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE([HAVE_NS_COMMON_TYPE], 1,
+			[Define if ns_type is accessible through ns_common])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_NAMESPACE], [
+	ZFS_AC_KERNEL_SRC_NS_COMMON_TYPE
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_NAMESPACE], [
+	ZFS_AC_KERNEL_NS_COMMON_TYPE
+])
@@ -1,79 +0,0 @@
-dnl #
-dnl # 2.6.38 API change
-dnl # ns_capable() was introduced
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_NS_CAPABLE], [
-	ZFS_LINUX_TEST_SRC([ns_capable], [
-		#include <linux/capability.h>
-	],[
-		ns_capable((struct user_namespace *)NULL, CAP_SYS_ADMIN);
-	])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_NS_CAPABLE], [
-	AC_MSG_CHECKING([whether ns_capable exists])
-	ZFS_LINUX_TEST_RESULT([ns_capable], [
-		AC_MSG_RESULT(yes)
-	],[
-		ZFS_LINUX_TEST_ERROR([ns_capable()])
-	])
-])
-
-dnl #
-dnl # 2.6.39 API change
-dnl # struct user_namespace was added to struct cred_t as cred->user_ns member
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_CRED_USER_NS], [
-	ZFS_LINUX_TEST_SRC([cred_user_ns], [
-		#include <linux/cred.h>
-	],[
-		struct cred cr;
-		cr.user_ns = (struct user_namespace *)NULL;
-	])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_CRED_USER_NS], [
-	AC_MSG_CHECKING([whether cred_t->user_ns exists])
-	ZFS_LINUX_TEST_RESULT([cred_user_ns], [
-		AC_MSG_RESULT(yes)
-	],[
-		ZFS_LINUX_TEST_ERROR([cred_t->user_ns()])
-	])
-])
-
-dnl #
-dnl # 3.4 API change
-dnl # kuid_has_mapping() and kgid_has_mapping() were added to distinguish
-dnl # between internal kernel uids/gids and user namespace uids/gids.
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING], [
-	ZFS_LINUX_TEST_SRC([kuid_has_mapping], [
-		#include <linux/uidgid.h>
-	],[
-		kuid_has_mapping((struct user_namespace *)NULL, KUIDT_INIT(0));
-		kgid_has_mapping((struct user_namespace *)NULL, KGIDT_INIT(0));
-	])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_KUID_HAS_MAPPING], [
-	AC_MSG_CHECKING([whether kuid_has_mapping/kgid_has_mapping exist])
-	ZFS_LINUX_TEST_RESULT([kuid_has_mapping], [
-		AC_MSG_RESULT(yes)
-	],[
-		ZFS_LINUX_TEST_ERROR([kuid_has_mapping()])
-	])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES], [
-	ZFS_AC_KERNEL_SRC_NS_CAPABLE
-	ZFS_AC_KERNEL_SRC_HAS_CAPABILITY
-	ZFS_AC_KERNEL_SRC_CRED_USER_NS
-	ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_USERNS_CAPABILITIES], [
-	ZFS_AC_KERNEL_NS_CAPABLE
-	ZFS_AC_KERNEL_HAS_CAPABILITY
-	ZFS_AC_KERNEL_CRED_USER_NS
-	ZFS_AC_KERNEL_KUID_HAS_MAPPING
-])
@@ -0,0 +1,58 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITEPAGE_T], [
+	dnl #
+	dnl # 6.3 API change
+	dnl # The writepage_t function type now has its first argument as
+	dnl # struct folio* instead of struct page*
+	dnl #
+	ZFS_LINUX_TEST_SRC([writepage_t_folio], [
+		#include <linux/writeback.h>
+		static int putpage(struct folio *folio,
+		    struct writeback_control *wbc, void *data)
+		{ return 0; }
+		writepage_t func = putpage;
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WRITEPAGE_T], [
+	AC_MSG_CHECKING([whether int (*writepage_t)() takes struct folio*])
+	ZFS_LINUX_TEST_RESULT([writepage_t_folio], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_WRITEPAGE_T_FOLIO, 1,
+		   [int (*writepage_t)() takes struct folio*])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITE_CACHE_PAGES], [
+	dnl #
+	dnl # 6.18 API change
+	dnl # write_cache_pages() has been removed.
+	dnl #
+	ZFS_LINUX_TEST_SRC([write_cache_pages], [
+		#include <linux/writeback.h>
+	], [
+		(void) write_cache_pages(NULL, NULL, NULL, NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WRITE_CACHE_PAGES], [
+	AC_MSG_CHECKING([whether write_cache_pages() is available])
+	ZFS_LINUX_TEST_RESULT([write_cache_pages], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_WRITE_CACHE_PAGES, 1,
+		    [write_cache_pages() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITEBACK], [
+	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
+	ZFS_AC_KERNEL_SRC_WRITE_CACHE_PAGES
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WRITEBACK], [
+	ZFS_AC_KERNEL_WRITEPAGE_T
+	ZFS_AC_KERNEL_WRITE_CACHE_PAGES
+])
@@ -1,26 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITEPAGE_T], [
-	dnl #
-	dnl # 6.3 API change
-	dnl # The writepage_t function type now has its first argument as
-	dnl # struct folio* instead of struct page*
-	dnl #
-	ZFS_LINUX_TEST_SRC([writepage_t_folio], [
-		#include <linux/writeback.h>
-		static int putpage(struct folio *folio,
-		    struct writeback_control *wbc, void *data)
-		{ return 0; }
-		writepage_t func = putpage;
-	],[])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_WRITEPAGE_T], [
-	AC_MSG_CHECKING([whether int (*writepage_t)() takes struct folio*])
-	ZFS_LINUX_TEST_RESULT([writepage_t_folio], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_WRITEPAGE_T_FOLIO, 1,
-		   [int (*writepage_t)() takes struct folio*])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])
-
@@ -59,6 +59,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_ACL
 	ZFS_AC_KERNEL_SRC_INODE_SETATTR
 	ZFS_AC_KERNEL_SRC_INODE_GETATTR
+	ZFS_AC_KERNEL_SRC_INODE_STATE_READ_ONCE
 	ZFS_AC_KERNEL_SRC_SHOW_OPTIONS
 	ZFS_AC_KERNEL_SRC_SHRINKER
 	ZFS_AC_KERNEL_SRC_MKDIR
@@ -70,6 +71,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_COMMIT_METADATA
 	ZFS_AC_KERNEL_SRC_SETATTR_PREPARE
 	ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED
+	ZFS_AC_KERNEL_SRC_DENTRY
 	ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SRC_SECURITY_INODE
 	ZFS_AC_KERNEL_SRC_FST_MOUNT
@@ -120,7 +122,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_IDMAP_MNT_API
 	ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS
 	ZFS_AC_KERNEL_SRC_IATTR_VFSID
-	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
+	ZFS_AC_KERNEL_SRC_WRITEBACK
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
@@ -135,6 +137,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_TIMER
 	ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR
 	ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
+	ZFS_AC_KERNEL_SRC_NAMESPACE
+	ZFS_AC_KERNEL_SRC_INODE_GENERIC_DROP
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -177,6 +181,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_ACL
 	ZFS_AC_KERNEL_INODE_SETATTR
 	ZFS_AC_KERNEL_INODE_GETATTR
+	ZFS_AC_KERNEL_INODE_STATE_READ_ONCE
 	ZFS_AC_KERNEL_SHOW_OPTIONS
 	ZFS_AC_KERNEL_SHRINKER
 	ZFS_AC_KERNEL_MKDIR
@@ -188,6 +193,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_COMMIT_METADATA
 	ZFS_AC_KERNEL_SETATTR_PREPARE
 	ZFS_AC_KERNEL_INSERT_INODE_LOCKED
+	ZFS_AC_KERNEL_DENTRY
 	ZFS_AC_KERNEL_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SECURITY_INODE
 	ZFS_AC_KERNEL_FST_MOUNT
@@ -238,7 +244,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_IDMAP_MNT_API
 	ZFS_AC_KERNEL_IDMAP_NO_USERNS
 	ZFS_AC_KERNEL_IATTR_VFSID
-	ZFS_AC_KERNEL_WRITEPAGE_T
+	ZFS_AC_KERNEL_WRITEBACK
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
@@ -254,6 +260,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_TIMER
 	ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR
 	ZFS_AC_KERNEL_SOPS_FREE_INODE
+	ZFS_AC_KERNEL_NAMESPACE
+	ZFS_AC_KERNEL_INODE_GENERIC_DROP
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -2,7 +2,7 @@ dnl #
 dnl # Check for statx() function and STATX_MNT_ID availability
 dnl #
 AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
-	AC_CHECK_HEADERS([linux/stat.h],
+	AC_CHECK_HEADERS([sys/stat.h],
 		[have_stat_headers=yes],
 		[have_stat_headers=no])

@@ -14,7 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
 			AC_MSG_CHECKING([for STATX_MNT_ID])
 			AC_COMPILE_IFELSE([
 				AC_LANG_PROGRAM([[
-					#include <linux/stat.h>
+					#include <sys/stat.h>
 				]], [[
 					struct statx stx;
 					int mask = STATX_MNT_ID;
@@ -29,6 +29,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
 			])
 		])
 	], [
-		AC_MSG_WARN([linux/stat.h not found; skipping statx support])
+		AC_MSG_WARN([sys/stat.h not found; skipping statx support])
 	])
 ])  dnl end AC_DEFUN
@@ -979,7 +979,8 @@ mountroot()

 	touch /run/zfs_unlock_complete
 	if [ -e /run/zfs_unlock_complete_notify ]; then
-		read -r < /run/zfs_unlock_complete_notify
+		# shellcheck disable=SC2034
+		read -r zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify
 	fi

 	# ------------
@@ -8,7 +8,7 @@ This contrib contains community compatibility patches to get Intel QAT working o
 These patches are based on the following Intel QAT version:
 [1.7.l.4.10.0-00014](https://01.org/sites/default/files/downloads/qat1.7.l.4.10.0-00014.tar.gz)

-When using QAT with above kernels versions, the following patches needs to be applied using:
+When using QAT with the above kernel versions, the following patches need to be applied using:
 patch -p1 < _$PATCH_
 _Where $PATCH refers to the path of the patch in question_

@@ -604,5 +604,4 @@ class RaidzExpansionRunning(ZFSError):
    errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS
    message = "A raidz device is currently expanding"

-
 # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4
@@ -4223,7 +4223,7 @@ class _TempPool(object):
            self.getRoot().reset()
            return

-        # On the Buildbot builders this may fail with "pool is busy"
+        # On the CI builders this may fail with "pool is busy"
        # Retry 5 times before raising an error
        retry = 0
        while True:
@@ -8,7 +8,9 @@ usage()
 	exit 1
 }

-[ "$#" -eq 1 ] || usage
+if ! [ -d "$1" ] ; then
+	usage
+fi
 KERNEL_DIR="$1"

 if ! [ -e 'zfs_config.h' ]
@@ -31,6 +33,7 @@ cat > "$KERNEL_DIR/fs/zfs/Kconfig" <<EOF
 config ZFS
 	tristate "ZFS filesystem support"
 	depends on EFI_PARTITION
+	depends on BLOCK
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	help
@@ -1,5 +1,5 @@
 DESCRIPTION
-  These script were written with the primary intention of being portable and
+  These scripts were written with the primary intention of being portable and
  usable on as many systems as possible.

  This is, in practice, usually not possible. But the intention is there.
@@ -104,6 +104,9 @@
 #define	spa_taskq_write_param_set_args(var) \
    CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A"

+#define	spa_taskq_free_param_set_args(var) \
+    CTLTYPE_STRING, NULL, 0, spa_taskq_free_param, "A"
+
 #define	fletcher_4_param_set_args(var) \
    CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A"

@@ -290,80 +290,11 @@ extern unsigned char bcd_to_byte[256];
 #define	offsetof(type, field)	__offsetof(type, field)
 #endif

-/*
- * Find highest one bit set.
- *      Returns bit number + 1 of highest bit that is set, otherwise returns 0.
- * High order bit is 31 (or 63 in _LP64 kernel).
- */
-static __inline int
-highbit(ulong_t i)
-{
-#if defined(HAVE_INLINE_FLSL)
-	return (flsl(i));
-#else
-	int h = 1;
+#define	highbit(x)		flsl(x)
+#define	lowbit(x)		ffsl(x)

-	if (i == 0)
-		return (0);
-#ifdef _LP64
-	if (i & 0xffffffff00000000ul) {
-		h += 32; i >>= 32;
-	}
-#endif
-	if (i & 0xffff0000) {
-		h += 16; i >>= 16;
-	}
-	if (i & 0xff00) {
-		h += 8; i >>= 8;
-	}
-	if (i & 0xf0) {
-		h += 4; i >>= 4;
-	}
-	if (i & 0xc) {
-		h += 2; i >>= 2;
-	}
-	if (i & 0x2) {
-		h += 1;
-	}
-	return (h);
-#endif
-}
-
-/*
- * Find highest one bit set.
- *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
- */
-static __inline int
-highbit64(uint64_t i)
-{
-#if defined(HAVE_INLINE_FLSLL)
-	return (flsll(i));
-#else
-	int h = 1;
-
-	if (i == 0)
-		return (0);
-	if (i & 0xffffffff00000000ULL) {
-		h += 32; i >>= 32;
-	}
-	if (i & 0xffff0000) {
-		h += 16; i >>= 16;
-	}
-	if (i & 0xff00) {
-		h += 8; i >>= 8;
-	}
-	if (i & 0xf0) {
-		h += 4; i >>= 4;
-	}
-	if (i & 0xc) {
-		h += 2; i >>= 2;
-	}
-	if (i & 0x2) {
-		h += 1;
-	}
-	return (h);
-#endif
-}
+#define	highbit64(x)		flsll(x)
+#define	lowbit64(x)		ffsll(x)

 #ifdef	__cplusplus
 }
@@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
 }
 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

-/*
- * All the io_*() helper functions below can operate on a bio, or a rq, but
- * not both.  The older submit_bio() codepath will pass a bio, and the
- * newer blk-mq codepath will pass a rq.
- */
-static inline int
-io_data_dir(struct bio *bio, struct request *rq)
-{
-	if (rq != NULL) {
-		if (op_is_write(req_op(rq))) {
-			return (WRITE);
-		} else {
-			return (READ);
-		}
-	}
-	return (bio_data_dir(bio));
-}
-
 static inline int
 io_is_flush(struct bio *bio, struct request *rq)
 {
@@ -34,6 +34,17 @@

 #define	d_alias			d_u.d_alias

+#ifdef HAVE_MM_PAGE_FLAGS_STRUCT
+/*
+ * Starting from Linux 6.18, the 'flags' field in 'struct page' is defined
+ * to a struct ('memdesc_flags_t' typedef) instead of an unsigned long for
+ * improved type safety.
+ */
+#define	page_flags flags.f
+#else
+#define	page_flags flags
+#endif
+
 /*
 * Starting from Linux 5.13, flush_dcache_page() becomes an inline function
 * and under some configurations, may indirectly referencing GPL-only
@@ -44,8 +55,8 @@
 #include <linux/simd_powerpc.h>
 #define	flush_dcache_page(page)	do {					\
 		if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&	\
-		    test_bit(PG_dcache_clean, &(page)->flags))		\
-			clear_bit(PG_dcache_clean, &(page)->flags);	\
+		    test_bit(PG_dcache_clean, &(page)->page_flags))	\
+			clear_bit(PG_dcache_clean, &(page)->page_flags);\
 	} while (0)
 #endif
 /*
@@ -55,37 +66,11 @@
 */
 #if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY
 #define	flush_dcache_page(page)	do {					\
-		if (test_bit(PG_dcache_clean, &(page)->flags))		\
-			clear_bit(PG_dcache_clean, &(page)->flags);	\
+		if (test_bit(PG_dcache_clean, &(page)->page_flags))	\
+			clear_bit(PG_dcache_clean, &(page)->page_flags);\
 	} while (0)
 #endif

-/*
- * 2.6.30 API change,
- * The const keyword was added to the 'struct dentry_operations' in
- * the dentry structure.  To handle this we define an appropriate
- * dentry_operations_t typedef which can be used.
- */
-typedef const struct dentry_operations	dentry_operations_t;
-
-/*
- * 2.6.38 API addition,
- * Added d_clear_d_op() helper function which clears some flags and the
- * registered dentry->d_op table.  This is required because d_set_d_op()
- * issues a warning when the dentry operations table is already set.
- * For the .zfs control directory to work properly we must be able to
- * override the default operations table and register custom .d_automount
- * and .d_revalidate callbacks.
- */
-static inline void
-d_clear_d_op(struct dentry *dentry)
-{
-	dentry->d_op = NULL;
-	dentry->d_flags &= ~(
-	    DCACHE_OP_HASH | DCACHE_OP_COMPARE |
-	    DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE);
-}
-
 /*
 * Walk and invalidate all dentry aliases of an inode
 * unless it's a mountpoint
@@ -23,6 +23,7 @@
 /*
 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
 * Copyright (C) 2015 Jörg Thalheim.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */

 #ifndef _ZFS_VFS_H
@@ -262,4 +263,18 @@ zpl_is_32bit_api(void)
 #define	zpl_generic_fillattr(user_ns, ip, sp)	generic_fillattr(ip, sp)
 #endif

+#ifdef HAVE_INODE_GENERIC_DROP
+/* 6.18 API change. These were renamed, alias the old names to the new. */
+#define	generic_delete_inode(ip)	inode_just_drop(ip)
+#define	generic_drop_inode(ip)		inode_generic_drop(ip)
+#endif
+
+#ifndef HAVE_INODE_STATE_READ_ONCE
+/*
+ * 6.19 API change. We should no longer access i_state directly. If the new
+ * helper isn't provided, define our own; (ip) may be any pointer expression.
+ */
+#define	inode_state_read_once(ip)	READ_ONCE((ip)->i_state)
+#endif
+
 #endif /* _ZFS_VFS_H */
@@ -25,6 +25,6 @@
 #ifndef _SPL_STAT_H
 #define	_SPL_STAT_H

-#include <linux/stat.h>
+#include <sys/stat.h>

 #endif /* SPL_STAT_H */
@@ -55,6 +55,7 @@ extern const struct file_operations zpl_dir_file_operations;
 extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg);

 extern const struct super_operations zpl_super_operations;
+extern const struct dentry_operations zpl_dentry_operations;
 extern const struct export_operations zpl_export_operations;
 extern struct file_system_type zpl_fs_type;

@@ -65,7 +65,7 @@ _Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
 */
 #define	BRT_BLOCKSIZE	(32 * 1024)
 #define	BRT_RANGESIZE_TO_NBLOCKS(size)					\
-	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+	(((size) - 1) / (BRT_BLOCKSIZE / sizeof (uint16_t)) + 1)

 #define	BRT_LITTLE_ENDIAN	0
 #define	BRT_BIG_ENDIAN		1
@@ -740,6 +740,8 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_METASLAB_SHIFT	"metaslab_shift"
 #define	ZPOOL_CONFIG_ASHIFT		"ashift"
 #define	ZPOOL_CONFIG_ASIZE		"asize"
+#define	ZPOOL_CONFIG_MIN_ALLOC		"min_alloc"
+#define	ZPOOL_CONFIG_MAX_ALLOC		"max_alloc"
 #define	ZPOOL_CONFIG_DTL		"DTL"
 #define	ZPOOL_CONFIG_SCAN_STATS		"scan_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_REMOVAL_STATS	"removal_stats"	/* not stored on disk */
@@ -861,6 +863,10 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_MMP_SEQ		"mmp_seq"	/* not stored on disk */
 #define	ZPOOL_CONFIG_MMP_HOSTNAME	"mmp_hostname"	/* not stored on disk */
 #define	ZPOOL_CONFIG_MMP_HOSTID		"mmp_hostid"	/* not stored on disk */
+#define	ZPOOL_CONFIG_MMP_RESULT		"mmp_result"	/* not stored on disk */
+#define	ZPOOL_CONFIG_MMP_TRYIMPORT_NS	"mmp_tryimport_ns"	/* not stored */
+#define	ZPOOL_CONFIG_MMP_IMPORT_NS	"mmp_import_ns"	/* not stored on disk */
+#define	ZPOOL_CONFIG_MMP_CLAIM_NS	"mmp_claim_ns"	/* not stored on disk */
 #define	ZPOOL_CONFIG_ALLOCATION_BIAS	"alloc_bias"	/* not stored on disk */
 #define	ZPOOL_CONFIG_EXPANSION_TIME	"expansion_time"	/* not stored */
 #define	ZPOOL_CONFIG_REBUILD_STATS	"org.openzfs:rebuild_stats"
@@ -33,6 +33,7 @@ extern "C" {
 #define	MMP_DEFAULT_IMPORT_INTERVALS	20
 #define	MMP_DEFAULT_FAIL_INTERVALS	10
 #define	MMP_MIN_FAIL_INTERVALS		2	/* min if != 0 */
+#define	MMP_IMPORT_VERIFY_ITERS		10
 #define	MMP_IMPORT_SAFETY_FACTOR	200	/* pct */
 #define	MMP_INTERVAL_OK(interval)	MAX(interval, MMP_MIN_INTERVAL)
 #define	MMP_FAIL_INTVS_OK(fails)	(fails == 0 ? 0 : MAX(fails, \
@@ -53,6 +54,9 @@ typedef struct mmp_thread {
 	vdev_t		*mmp_last_leaf;	/* last mmp write sent here */
 	uint64_t	mmp_leaf_last_gen;	/* last mmp write sent here */
 	uint32_t	mmp_seq;	/* intra-second update counter */
+	uint64_t	mmp_tryimport_ns; /* tryimport activity check time */
+	uint64_t	mmp_import_ns;	/* import activity check time */
+	uint64_t	mmp_claim_ns;	/* claim activity check time */
 } mmp_thread_t;


@@ -62,6 +66,7 @@ extern void mmp_thread_start(struct spa *spa);
 extern void mmp_thread_stop(struct spa *spa);
 extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub);
 extern void mmp_signal_all_threads(void);
+extern int mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub);

 /* Global tuning */
 extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS);
@@ -1044,6 +1044,7 @@ extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern uint32_t spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
+extern char *spa_load_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
@@ -1055,6 +1056,7 @@ extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
+extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
@@ -224,6 +224,7 @@ struct spa {
 	 * Fields protected by spa_namespace_lock.
 	 */
 	char		spa_name[ZFS_MAX_DATASET_NAME_LEN];	/* pool name */
+	char		*spa_load_name;		/* unmodified pool name */
 	char		*spa_comment;		/* comment */
 	avl_node_t	spa_avl;		/* node in spa_namespace_avl */
 	nvlist_t	*spa_config;		/* last synced config */
@@ -267,6 +268,7 @@ struct spa {
 	uint64_t	spa_min_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_max_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_min_alloc;		/* of vdevs in normal class */
+	uint64_t	spa_max_alloc;		/* of vdevs in normal class */
 	uint64_t	spa_gcd_alloc;		/* of vdevs in normal class */
 	uint64_t	spa_config_guid;	/* config pool guid */
 	uint64_t	spa_load_guid;		/* spa_load initialized guid */
@@ -302,6 +304,7 @@ struct spa {
 	void		*spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
+	boolean_t	spa_activity_check; 	/* activity check required */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub bytes */
@@ -51,6 +51,12 @@ extern "C" {
 #define	MMP_SEQ_VALID_BIT	0x02
 #define	MMP_FAIL_INT_VALID_BIT	0x04

+#define	MMP_INTERVAL_MASK	0x00000000FFFFFF00
+#define	MMP_SEQ_MASK		0x0000FFFF00000000
+#define	MMP_FAIL_INT_MASK	0xFFFF000000000000
+
+#define	MMP_SEQ_MAX		UINT16_MAX
+
 #define	MMP_VALID(ubp)		((ubp)->ub_magic == UBERBLOCK_MAGIC && \
 				    (ubp)->ub_mmp_magic == MMP_MAGIC)
 #define	MMP_INTERVAL_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
@@ -60,21 +66,25 @@ extern "C" {
 #define	MMP_FAIL_INT_VALID(ubp)	(MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
 				    MMP_FAIL_INT_VALID_BIT))

-#define	MMP_INTERVAL(ubp)	(((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
+#define	MMP_INTERVAL(ubp)	(((ubp)->ub_mmp_config & MMP_INTERVAL_MASK) \
 				    >> 8)
-#define	MMP_SEQ(ubp)		(((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
+#define	MMP_SEQ(ubp)		(((ubp)->ub_mmp_config & MMP_SEQ_MASK) \
 				    >> 32)
-#define	MMP_FAIL_INT(ubp)	(((ubp)->ub_mmp_config & 0xFFFF000000000000) \
+#define	MMP_FAIL_INT(ubp)	(((ubp)->ub_mmp_config & MMP_FAIL_INT_MASK) \
 				    >> 48)

 #define	MMP_INTERVAL_SET(write) \
-	    (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)
+	    (((uint64_t)((write) & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT)

 #define	MMP_SEQ_SET(seq) \
-	    (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)
+	    (((uint64_t)((seq) & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT)

 #define	MMP_FAIL_INT_SET(fail) \
-	    (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
+	    (((uint64_t)((fail) & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT)
+
+
+#define	MMP_SEQ_CLEAR(ubp) \
+	    ((ubp)->ub_mmp_config &= ~(MMP_SEQ_MASK | MMP_SEQ_VALID_BIT))

 /*
 * RAIDZ expansion reflow information.
@@ -212,6 +212,8 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
 extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int);
+extern int vdev_uberblock_compare(const struct uberblock *,
+    const struct uberblock *);
 extern int vdev_check_boot_reserve(spa_t *, vdev_t *);

 typedef enum {
@@ -46,7 +46,7 @@ void zfs_file_close(zfs_file_t *fp);

 int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid);
 int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off,
-    ssize_t *resid);
+    uint8_t ashift, ssize_t *resid);
 int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid);
 int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off,
    ssize_t *resid);
@@ -35,18 +35,16 @@

 #include <sys/vfs.h>

-#ifdef FS_PROJINHERIT_FL
-#define	ZFS_PROJINHERIT_FL	FS_PROJINHERIT_FL
-#else
-#define	ZFS_PROJINHERIT_FL	0x20000000
-#endif
-
 #ifdef FS_IOC_FSGETXATTR
 typedef struct fsxattr zfsxattr_t;

 #define	ZFS_IOC_FSGETXATTR	FS_IOC_FSGETXATTR
 #define	ZFS_IOC_FSSETXATTR	FS_IOC_FSSETXATTR
 #else
+/* Non-Linux OS */
+#define	FS_PROJINHERIT_FL	0x20000000
+#define	FS_XFLAG_PROJINHERIT	FS_PROJINHERIT_FL
+
 struct zfsxattr {
 	uint32_t	fsx_xflags;	/* xflags field value (get/set) */
 	uint32_t	fsx_extsize;	/* extsize field value (get/set) */
@@ -76,6 +76,7 @@ libspl_sys_HEADERS += \
 	%D%/os/linux/sys/param.h \
 	%D%/os/linux/sys/stat.h \
 	%D%/os/linux/sys/sysmacros.h \
+	%D%/os/linux/sys/vfs.h \
 	%D%/os/linux/sys/zfs_context_os.h

 libspl_ia32_HEADERS = \
@@ -33,7 +33,7 @@

 #ifdef HAVE_STATX
 #include <fcntl.h>
-#include <linux/stat.h>
+#include <sys/stat.h>
 #endif

 /*
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright 2025 by Lawrence Livermore National Security, LLC. */
+
+/* This is the Linux userspace version of include/os/linux/spl/sys/vfs.h */
+
+#ifndef	_LIBSPL_SYS_VFS_H
+#define	_LIBSPL_SYS_VFS_H
+
+#include <linux/fs.h>
+#include <sys/statfs.h>
+
+#endif
@@ -2238,6 +2238,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,

 	zpool_get_load_policy(config, &policy);

+	if (getenv("ZFS_LOAD_INFO_DEBUG") && nv != NULL &&
+	    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) {
+		dump_nvlist(nvinfo, 4);
+	}
+
 	if (error) {
 		char desc[1024];
 		char aux[256];
@@ -98,57 +98,57 @@ static const char *const zfs_msgid_table[] = {
 #define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))

 static int
-vdev_missing(vdev_stat_t *vs, uint_t vsc)
+vdev_missing(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
 	    vs->vs_aux == VDEV_AUX_OPEN_FAILED);
 }

 static int
-vdev_faulted(vdev_stat_t *vs, uint_t vsc)
+vdev_faulted(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_FAULTED);
 }

 static int
-vdev_errors(vdev_stat_t *vs, uint_t vsc)
+vdev_errors(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_DEGRADED ||
 	    vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
 	    vs->vs_checksum_errors != 0);
 }

 static int
-vdev_broken(vdev_stat_t *vs, uint_t vsc)
+vdev_broken(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_CANT_OPEN);
 }

 static int
-vdev_offlined(vdev_stat_t *vs, uint_t vsc)
+vdev_offlined(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_OFFLINE);
 }

 static int
-vdev_removed(vdev_stat_t *vs, uint_t vsc)
+vdev_removed(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	(void) vsc;
+	(void) vsc, (void) arg;
 	return (vs->vs_state == VDEV_STATE_REMOVED);
 }

 static int
-vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
+vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc, void *arg)
 {
-	if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL)
-		return (0);
+	uint64_t ashift = *(uint64_t *)arg;

 	return (VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+	    (ashift == 0 || vs->vs_configured_ashift < ashift) &&
 	    vs->vs_configured_ashift < vs->vs_physical_ashift);
 }

@@ -156,8 +156,8 @@ vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
 * Detect if any leaf devices that have seen errors or could not be opened.
 */
 static boolean_t
-find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
-    boolean_t ignore_replacing)
+find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t, void *),
+    void *arg, boolean_t ignore_replacing)
 {
 	nvlist_t **child;
 	uint_t c, children;
@@ -177,14 +177,16 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),

 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) == 0) {
-		for (c = 0; c < children; c++)
-			if (find_vdev_problem(child[c], func, ignore_replacing))
+		for (c = 0; c < children; c++) {
+			if (find_vdev_problem(child[c], func, arg,
+			    ignore_replacing))
 				return (B_TRUE);
+		}
 	} else {
 		uint_t vsc;
 		vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array(
 		    vdev, ZPOOL_CONFIG_VDEV_STATS, &vsc);
-		if (func(vs, vsc) != 0)
+		if (func(vs, vsc, arg) != 0)
 			return (B_TRUE);
 	}

@@ -193,9 +195,11 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
 	 */
 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child,
 	    &children) == 0) {
-		for (c = 0; c < children; c++)
-			if (find_vdev_problem(child[c], func, ignore_replacing))
+		for (c = 0; c < children; c++) {
+			if (find_vdev_problem(child[c], func, arg,
+			    ignore_replacing))
 				return (B_TRUE);
+		}
 	}

 	return (B_FALSE);
@@ -220,7 +224,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
 */
 static zpool_status_t
 check_status(nvlist_t *config, boolean_t isimport,
-    zpool_errata_t *erratap, const char *compat)
+    zpool_errata_t *erratap, const char *compat, uint64_t ashift)
 {
 	pool_scan_stat_t *ps = NULL;
 	uint_t vsc, psc;
@@ -371,15 +375,15 @@ check_status(nvlist_t *config, boolean_t isimport,
 	 * Bad devices in non-replicated config.
 	 */
 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
+	    find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE))
 		return (ZPOOL_STATUS_FAULTED_DEV_NR);

 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_missing, B_TRUE))
+	    find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE))
 		return (ZPOOL_STATUS_MISSING_DEV_NR);

 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_broken, B_TRUE))
+	    find_vdev_problem(nvroot, vdev_broken, NULL, B_TRUE))
 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);

 	/*
@@ -402,35 +406,37 @@ check_status(nvlist_t *config, boolean_t isimport,
 	/*
 	 * Missing devices in a replicated config.
 	 */
-	if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
+	if (find_vdev_problem(nvroot, vdev_faulted, NULL, B_TRUE))
 		return (ZPOOL_STATUS_FAULTED_DEV_R);
-	if (find_vdev_problem(nvroot, vdev_missing, B_TRUE))
+	if (find_vdev_problem(nvroot, vdev_missing, NULL, B_TRUE))
 		return (ZPOOL_STATUS_MISSING_DEV_R);
-	if (find_vdev_problem(nvroot, vdev_broken, B_TRUE))
+	if (find_vdev_problem(nvroot, vdev_broken, NULL, B_TRUE))
 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);

 	/*
 	 * Devices with errors
 	 */
-	if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE))
+	if (!isimport && find_vdev_problem(nvroot, vdev_errors, NULL, B_TRUE))
 		return (ZPOOL_STATUS_FAILING_DEV);

 	/*
 	 * Offlined devices
 	 */
-	if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE))
+	if (find_vdev_problem(nvroot, vdev_offlined, NULL, B_TRUE))
 		return (ZPOOL_STATUS_OFFLINE_DEV);

 	/*
 	 * Removed device
 	 */
-	if (find_vdev_problem(nvroot, vdev_removed, B_TRUE))
+	if (find_vdev_problem(nvroot, vdev_removed, NULL, B_TRUE))
 		return (ZPOOL_STATUS_REMOVED_DEV);

 	/*
 	 * Suboptimal, but usable, ashift configuration.
 	 */
-	if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE))
+	if (!isimport &&
+	    getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL &&
+	    find_vdev_problem(nvroot, vdev_non_native_ashift, &ashift, B_FALSE))
 		return (ZPOOL_STATUS_NON_NATIVE_ASHIFT);

 	/*
@@ -509,8 +515,10 @@ zpool_get_status(zpool_handle_t *zhp, const char **msgid,
 	    ZFS_MAXPROPLEN, NULL, B_FALSE) != 0)
 		compatibility[0] = '\0';

+	uint64_t ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, NULL);
+
 	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata,
-	    compatibility);
+	    compatibility, ashift);

 	if (msgid != NULL) {
 		if (ret >= NMSGID)
@@ -525,7 +533,7 @@ zpool_status_t
 zpool_import_status(nvlist_t *config, const char **msgid,
    zpool_errata_t *errata)
 {
-	zpool_status_t ret = check_status(config, B_TRUE, errata, NULL);
+	zpool_status_t ret = check_status(config, B_TRUE, errata, NULL, 0);

 	if (ret >= NMSGID)
 		*msgid = NULL;
@@ -1175,7 +1175,7 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 */
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf,
-    size_t count, loff_t pos, ssize_t *resid)
+    size_t count, loff_t pos, uint8_t ashift, ssize_t *resid)
 {
 	ssize_t rc, split, done;
 	int sectors;
@@ -1185,8 +1185,8 @@ zfs_file_pwrite(zfs_file_t *fp, const void *buf,
 	 * system calls so that the process can be killed in between.
 	 * This is used by ztest to simulate realistic failure modes.
 	 */
-	sectors = count >> SPA_MINBLOCKSHIFT;
-	split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT;
+	sectors = count >> ashift;
+	split = (sectors > 0 ? rand() % sectors : 0) << ashift;
 	rc = pwrite64(fp->f_fd, buf, split, pos);
 	if (rc != -1) {
 		done = rc;
@@ -122,6 +122,24 @@ Example:
 .Nm zhack Cm label repair Fl cu Ar device
  Fix checksums and undetach a device
 .
+.It Xo
+.Nm zhack
+.Cm metaslab leak
+.Op Fl f
+.Ar pool
+.Xc
+Apply a fragmentation profile generated by
+.Sy zdb
+to the specified
+.Ar pool Ns
+\&.
+.Pp
+The
+.Fl f
+flag forces the profile to apply even if the vdevs in the
+.Ar pool
+don't have the same number of metaslabs as the fragmentation profile.
+.
 .El
 .
 .Sh GLOBAL OPTIONS
@@ -143,6 +161,8 @@ Search for
 members in
 .Ar dir .
 Can be specified more than once.
+.It Fl o Ar var Ns = Ns Ar value
+Set the given tunable to the provided value.
 .El
 .
 .Sh EXAMPLES
@@ -17,7 +17,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd May 24, 2025
+.Dd September 15, 2025
 .Dt ZFS 4
 .Os
 .
@@ -2551,6 +2551,49 @@ the xattr so as to not accumulate duplicates.
 .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Prioritize requeued I/O.
 .
+.It Sy zfs_delete_inode Ns = Ns Sy 0 Ns | Ns 1 Pq int
+Sets whether the kernel should free an inode structure when the last reference
+is released, or cache it in memory.
+Intended for testing/debugging.
+.Pp
+A live inode structure "pins" various internal OpenZFS structures in memory,
+which can result in large amounts of "unusable" memory on systems with lots of
+infrequently-accessed files, until the kernel's memory pressure mechanism
+asks OpenZFS to release them.
+.Pp
+The default value of
+.Sy 0
+always caches inodes that appear to still exist on disk.
+Setting it to
+.Sy 1
+will immediately release unused inodes and their associated memory back to the
+dbuf cache or the ARC for reuse, but may reduce performance if inodes are
+frequently evicted and reloaded.
+.Pp
+This parameter is only available on Linux.
+.
+.It Sy zfs_delete_dentry Ns = Ns Sy 0 Ns | Ns 1 Pq int
+Sets whether the kernel should free a dentry structure when it is no longer
+required, or hold it in the dentry cache.
+Intended for testing/debugging.
+.Pp
+Since a dentry structure holds an inode reference, a cached dentry can "pin"
+an inode in memory indefinitely, along with associated OpenZFS structures (See
+.Sy zfs_delete_inode ) .
+.Pp
+The default value of
+.Sy 0
+instructs the kernel to cache entries and their associated inodes when they
+are no longer directly referenced.
+They will be reclaimed as part of the kernel's normal cache management
+processes.
+Setting it to
+.Sy 1
+will instruct the kernel to release directory entries and their inodes as soon
+as they are no longer referenced by the filesystem.
+.Pp
+This parameter is only available on Linux.
+.
 .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
 Percentage of online CPUs which will run a worker thread for I/O.
 These workers are responsible for I/O work such as compression, encryption,
@@ -2585,12 +2628,50 @@ Set value only applies to pools imported/created after that.
 Set the queue and thread configuration for the IO read queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Each of the four values corresponds to the issue, issue high-priority,
+interrupt, and interrupt high-priority queues.
+Valid values are
+.Sy fixed,N,M
+(M queues with N threads each),
+.Sy scale[,MIN]
+(scale with CPUs, minimum MIN total threads),
+.Sy sync ,
+and
+.Sy null .
 Set values only apply to pools imported/created after that.
 .
 .It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp
 Set the queue and thread configuration for the IO write queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
+Each of the four values corresponds to the issue, issue high-priority,
+interrupt, and interrupt high-priority queues.
+Valid values are
+.Sy fixed,N,M
+(M queues with N threads each),
+.Sy scale[,MIN]
+(scale with CPUs, minimum MIN total threads),
+.Sy sync ,
+and
+.Sy null .
+Set values only apply to pools imported/created after that.
+.
+.It Sy zio_taskq_free Ns = Ns Sy scale,32 null null null Pq charp
+Set the queue and thread configuration for the IO free queues.
+This is an advanced debugging parameter.
+Don't change this unless you understand what it does.
+Each of the four values corresponds to the issue, issue high-priority,
+interrupt, and interrupt high-priority queues.
+Valid values are
+.Sy fixed,N,M
+(M queues with N threads each),
+.Sy scale[,MIN]
+(scale with CPUs, minimum MIN total threads),
+.Sy sync ,
+and
+.Sy null .
+The default uses a minimum of 32 threads to improve parallelism for
+DDT and BRT metadata operations during frees.
 Set values only apply to pools imported/created after that.
 .
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
@@ -69,6 +69,13 @@
 .Op Fl U Ar cache
 .Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
 .Nm
+.Fl -allocated-map
+.Op Fl mAFLPXY
+.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
+.Op Fl t Ar txg
+.Op Fl U Ar cache
+.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns …
+.Nm
 .Fl O
 .Op Fl K Ar key
 .Ar dataset path
@@ -128,6 +135,11 @@ that zdb may interpret inconsistent pool data and behave erratically.
 .Sh OPTIONS
 Display options:
 .Bl -tag -width Ds
+.It Fl Sy -allocated-map
+Prints out a list of all the allocated regions in the pool.
+Primarily intended for use with the
+.Nm zhack metaslab leak
+subcommand.
 .It Fl b , -block-stats
 Display statistics regarding the number, size
 .Pq logical, physical and allocated
@@ -2,7 +2,7 @@
 # first.  This ensures its module initialization function is run before
 # any of the other module initialization functions which depend on it.

-ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement
+ZFS_MODULE_CFLAGS += -std=gnu11 -Wno-declaration-after-statement
 ZFS_MODULE_CFLAGS += -Wmissing-prototypes
 ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@  @NO_FORMAT_ZERO_LENGTH@

@@ -293,10 +293,9 @@ ZSTD_UPSTREAM_OBJS := \

 zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS))

-# Disable aarch64 neon SIMD instructions for kernel mode
 $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS)
 $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include)
-$(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
+$(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
 $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h


@@ -62,11 +62,6 @@ CFLAGS+= -DZFS_DEBUG -g
 CFLAGS += -DNDEBUG
 .endif

-.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true"
-# kernel must also be built with this option for this to work
-CFLAGS+= -DDEBUG_VFS_LOCKS
-.endif
-
 .if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
 CFLAGS+=	 -fprofile-arcs -ftest-coverage
 .endif
@@ -521,30 +516,6 @@ CFLAGS.zstd_ldm.c= -U__BMI__ -fno-tree-vectorize ${NO_WBITWISE_INSTEAD_OF_LOGICA
 CFLAGS.zstd_opt.c= -U__BMI__ -fno-tree-vectorize ${NO_WBITWISE_INSTEAD_OF_LOGICAL}

 .if ${MACHINE_ARCH} == "aarch64"
-__ZFS_ZSTD_AARCH64_FLAGS= -include ${SRCDIR}/zstd/include/aarch64_compat.h
-CFLAGS.zstd.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.entropy_common.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.error_private.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.fse_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.fse_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.hist.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.huf_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.huf_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.pool.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.xxhash.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_common.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_compress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_compress_literals.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_compress_sequences.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_compress_superblock.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_ddict.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_decompress.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_decompress_block.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_double_fast.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_fast.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_lazy.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_ldm.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
-CFLAGS.zstd_opt.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}

 sha256-armv8.o: sha256-armv8.S
 	${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \
@@ -77,7 +77,8 @@ static const uint32_t SHA256_K[64] = {
 	h = g, g = f, f = e, e = d + T1; \
 	d = c, c = b, b = a, a = T1 + T2;

-static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
+static void
+icp_sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
 {
 	uint64_t blk;

@@ -173,7 +174,8 @@ static const uint64_t SHA512_K[80] = {
 	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
 };

-static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
+static void
+icp_sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
 {
 	uint64_t blk;

@@ -226,7 +228,8 @@ static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
 	}
 }

-static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
+static void
+icp_sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
 {
 	uint64_t pos = ctx->count[0];
 	uint64_t total = ctx->count[1];
@@ -258,7 +261,8 @@ static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
 	ctx->count[1] = total;
 }

-static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
+static void
+icp_sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
 {
 	uint64_t pos = ctx->count[0];
 	uint64_t total = ctx->count[1];
@@ -290,7 +294,8 @@ static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
 	ctx->count[1] = total;
 }

-static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
+static void
+icp_sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
 {
 	uint64_t mlen, pos = ctx->count[0];
 	uint8_t *m = ctx->wbuf;
@@ -334,7 +339,8 @@ static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
 	memset(ctx, 0, sizeof (*ctx));
 }

-static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
+static void
+icp_sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
 {
 	uint64_t mlen, pos = ctx->count[0];
 	uint8_t *m = ctx->wbuf, *r;
@@ -461,14 +467,14 @@ SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)

 	switch (ctx->algotype) {
 		case SHA256:
-			sha256_update(&ctx->sha256, data, len);
+			icp_sha256_update(&ctx->sha256, data, len);
 			break;
 		case SHA512:
 		case SHA512_HMAC_MECH_INFO_TYPE:
-			sha512_update(&ctx->sha512, data, len);
+			icp_sha512_update(&ctx->sha512, data, len);
 			break;
 		case SHA512_256:
-			sha512_update(&ctx->sha512, data, len);
+			icp_sha512_update(&ctx->sha512, data, len);
 			break;
 	}
 }
@@ -479,32 +485,33 @@ SHA2Final(void *digest, SHA2_CTX *ctx)
 {
 	switch (ctx->algotype) {
 		case SHA256:
-			sha256_final(&ctx->sha256, digest, 256);
+			icp_sha256_final(&ctx->sha256, digest, 256);
 			break;
 		case SHA512:
 		case SHA512_HMAC_MECH_INFO_TYPE:
-			sha512_final(&ctx->sha512, digest, 512);
+			icp_sha512_final(&ctx->sha512, digest, 512);
 			break;
 		case SHA512_256:
-			sha512_final(&ctx->sha512, digest, 256);
+			icp_sha512_final(&ctx->sha512, digest, 256);
 			break;
 	}
 }

 /* the generic implementation is always okay */
-static boolean_t sha2_is_supported(void)
+static boolean_t
+icp_sha2_is_supported(void)
 {
 	return (B_TRUE);
 }

 const sha256_ops_t sha256_generic_impl = {
 	.name = "generic",
-	.transform = sha256_generic,
-	.is_supported = sha2_is_supported
+	.transform = icp_sha256_generic,
+	.is_supported = icp_sha2_is_supported
 };

 const sha512_ops_t sha512_generic_impl = {
 	.name = "generic",
-	.transform = sha512_generic,
-	.is_supported = sha2_is_supported
+	.transform = icp_sha512_generic,
+	.is_supported = icp_sha2_is_supported
 };
@@ -3246,7 +3246,8 @@ nvs_xdr_nvl_fini(nvstream_t *nvs)
 * xdrproc_t-compatible callbacks for xdr_array()
 */

-#if defined(_KERNEL) && defined(__linux__) /* Linux kernel */
+#if (defined(__FreeBSD_version) && __FreeBSD_version >= 1600010) || \
+    defined(_KERNEL) && defined(__linux__) /* Linux kernel */

 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
@@ -3255,7 +3256,7 @@ nvs_xdr_nvp_##type(XDR *xdrs, void *ptr)	\
 	return (xdr_##type(xdrs, ptr));		\
 }

-#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */
+#elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc, FreeBSD < 16 */

 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
@@ -3271,7 +3272,7 @@ nvs_xdr_nvp_##type(XDR *xdrs, ...)		\
 	return (xdr_##type(xdrs, ptr));		\
 }

-#else /* FreeBSD, sunrpc */
+#else /* FreeBSD kernel < 16, sunrpc */

 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
@@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 	int			count = 0;
 	zfs_acl_phys_t		acl_phys;

-	if (zp->z_zfsvfs->z_replay == B_FALSE) {
+	if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) {
 		ASSERT_VOP_IN_SEQC(ZTOV(zp));
 	}

@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)

 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
-    ssize_t *resid)
+    uint8_t ashift, ssize_t *resid)
 {
+	(void) ashift;
 	return (zfs_file_write_impl(fp, buf, count, &off, resid));
 }

@@ -100,14 +100,6 @@

 VFS_SMR_DECLARE;

-#ifdef DEBUG_VFS_LOCKS
-#define	VNCHECKREF(vp)				  \
-	VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,	\
-	    ("%s: wrong ref counts", __func__));
-#else
-#define	VNCHECKREF(vp)
-#endif
-
 #if __FreeBSD_version >= 1400045
 typedef uint64_t cookie_t;
 #else
@@ -965,9 +957,6 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype;
-#ifdef DEBUG_VFS_LOCKS
-	vnode_t	*dvp = ZTOV(dzp);
-#endif

 	if (is_nametoolong(zfsvfs, name))
 		return (SET_ERROR(ENAMETOOLONG));
@@ -1097,7 +1086,8 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 	getnewvnode_drop_reserve();

 out:
-	VNCHECKREF(dvp);
+	VNASSERT(ZTOV(dzp)->v_holdcnt > 0 && ZTOV(dzp)->v_usecount > 0,
+	    ZTOV(dzp), ("%s: wrong ref counts", __func__));
 	if (error == 0) {
 		*zpp = zp;
 	}
@@ -794,6 +794,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 	(*zpp)->z_mode = mode;
 	(*zpp)->z_dnodesize = dnodesize;

+	vnode_t *vp = ZTOV(*zpp);
+	if (!(flag & IS_ROOT_NODE))
+		vn_seqc_write_begin(vp);
+
 	if (vap->va_mask & AT_XVATTR)
 		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);

@@ -802,7 +806,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	if (!(flag & IS_ROOT_NODE)) {
-		vnode_t *vp = ZTOV(*zpp);
+		vn_seqc_write_end(vp);
 		vp->v_vflag |= VV_FORCEINSMQ;
 		int err = insmntque(vp, zfsvfs->z_vfs);
 		vp->v_vflag &= ~VV_FORCEINSMQ;
@@ -371,9 +371,6 @@ error:
 	return (ret);
 }

-void *failed_decrypt_buf;
-int failed_decrypt_size;
-
 /*
 * This function handles all encryption and decryption in zfs. When
 * encrypting it expects puio to reference the plaintext and cuio to
@@ -1663,9 +1660,6 @@ error:
 	return (ret);
 }

-void *failed_decrypt_buf;
-int faile_decrypt_size;
-
 /*
 * Primary encryption / decryption entrypoint for zio data.
 */
@@ -1758,13 +1752,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
 	return (0);

 error:
-	if (!encrypt) {
-		if (failed_decrypt_buf != NULL)
-			kmem_free(failed_decrypt_buf, failed_decrypt_size);
-		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
-		failed_decrypt_size = datalen;
-		memcpy(failed_decrypt_buf, cipherbuf, datalen);
-	}
 	if (locked)
 		rw_exit(&key->zk_salt_lock);
 	if (authbuf != NULL)
@@ -25,6 +25,10 @@
 * SUCH DAMAGE.
 */

+/*
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ */
+
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
@@ -56,6 +60,19 @@ typedef struct zone_dataset {
 } zone_dataset_t;

 #ifdef CONFIG_USER_NS
+
+/*
+ * Linux 6.18 moved the generic namespace type away from ns->ops->type onto
+ * ns_common itself.
+ */
+#ifdef HAVE_NS_COMMON_TYPE
+#define	ns_is_newuser(ns)	\
+	((ns)->ns_type == CLONE_NEWUSER)
+#else
+#define	ns_is_newuser(ns)	\
+	((ns)->ops != NULL && (ns)->ops->type == CLONE_NEWUSER)
+#endif
+
 /*
 * Returns:
 * - 0 on success
@@ -84,7 +101,7 @@ user_ns_get(int fd, struct user_namespace **userns)
 		goto done;
 	}
 	ns = get_proc_ns(file_inode(nsfile));
-	if (ns->ops->type != CLONE_NEWUSER) {
+	if (!ns_is_newuser(ns)) {
 		error = ENOTTY;
 		goto done;
 	}
@@ -23,6 +23,7 @@
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */

 /*
@@ -891,6 +892,14 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount)
 	}
 }

+#ifndef nth_page
+/*
+ * Since 6.18 nth_page() no longer exists, and is no longer required to iterate
+ * within a single SG entry, so we replace it with a simple addition.
+ */
+#define	nth_page(p, n)	((p)+(n))
+#endif
+
 /*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already exhausted, in which case this does nothing.
@@ -918,7 +927,14 @@ abd_iter_map(struct abd_iter *aiter)
 		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
 		    aiter->iter_abd->abd_size - aiter->iter_pos);

-		paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
+		struct page *page = sg_page(aiter->iter_sg);
+		if (PageHighMem(page)) {
+			page = nth_page(page, offset / PAGE_SIZE);
+			offset &= PAGE_SIZE - 1;
+			aiter->iter_mapsize = MIN(aiter->iter_mapsize,
+			    PAGE_SIZE - offset);
+		}
+		paddr = zfs_kmap_local(page);
 	}

 	aiter->iter_mapaddr = (char *)paddr + offset;
@@ -936,8 +952,14 @@ abd_iter_unmap(struct abd_iter *aiter)
 		return;

 	if (!abd_is_linear(aiter->iter_abd)) {
+		size_t offset = aiter->iter_offset;
+
+		struct page *page = sg_page(aiter->iter_sg);
+		if (PageHighMem(page))
+			offset &= PAGE_SIZE - 1;
+
 		/* LINTED E_FUNC_SET_NOT_USED */
-		zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
+		zfs_kunmap_local(aiter->iter_mapaddr - offset);
 	}

 	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 */
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
-    ssize_t *resid)
+    uint8_t ashift, ssize_t *resid)
 {
+	(void) ashift;
 	ssize_t rc;

 	rc  = kernel_write(fp, buf, count, &off);
@@ -100,15 +100,17 @@ zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)

 	while (n && uio->uio_resid) {
 		void *paddr;
-		cnt = MIN(bv->bv_len - skip, n);
+		size_t offset = bv->bv_offset + skip;
+		cnt = MIN(PAGE_SIZE - (offset & ~PAGE_MASK),
+		    MIN(bv->bv_len - skip, n));

-		paddr = zfs_kmap_local(bv->bv_page);
+		paddr = zfs_kmap_local(bv->bv_page + (offset >> PAGE_SHIFT));
 		if (rw == UIO_READ) {
 			/* Copy from buffer 'p' to the bvec data */
-			memcpy(paddr + bv->bv_offset + skip, p, cnt);
+			memcpy(paddr + (offset & ~PAGE_MASK), p, cnt);
 		} else {
 			/* Copy from bvec data to buffer 'p' */
-			memcpy(p, paddr + bv->bv_offset + skip, cnt);
+			memcpy(p, paddr + (offset & ~PAGE_MASK), cnt);
 		}
 		zfs_kunmap_local(paddr);

@@ -1521,6 +1521,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
 	sb->s_xattr = zpl_xattr_handlers;
 	sb->s_export_op = &zpl_export_operations;

+#ifdef HAVE_SET_DEFAULT_D_OP
+	set_default_d_op(sb, &zpl_dentry_operations);
+#else
+	sb->s_d_op = &zpl_dentry_operations;
+#endif
+
 	/* Set features for file system. */
 	zfs_set_fuid_feature(zfsvfs);

@@ -3516,7 +3516,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
 	boolean_t	is_tmpfile = 0;
 	uint64_t	txg;

-	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+	is_tmpfile = (sip->i_nlink == 0 &&
+	    (inode_state_read_once(sip) & I_LINKABLE));

 	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

@@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
 	return (!!dentry->d_inode);
 }

-static dentry_operations_t zpl_dops_snapdirs = {
+static const struct dentry_operations zpl_dops_snapdirs = {
 /*
 * Auto mounting of snapshots is only supported for 2.6.37 and
 * newer kernels.  Prior to this kernel the ops->follow_link()
@@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = {
 	.d_revalidate	= zpl_snapdir_revalidate,
 };

+/*
+ * For the .zfs control directory to work properly we must be able to override
+ * the default operations table and register custom .d_automount and
+ * .d_revalidate callbacks.
+ */
+static void
+set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) {
+	static const unsigned int op_flags =
+	    DCACHE_OP_HASH | DCACHE_OP_COMPARE |
+	    DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE |
+	    DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL;
+
+#ifdef HAVE_D_SET_D_OP
+	/*
+	 * d_set_d_op() will set the DCACHE_OP_ flags according to what it
+	 * finds in the passed dentry_operations, so we don't have to.
+	 *
+	 * We clear the flags and the old op table before calling d_set_d_op()
+	 * because it issues a warning when the dentry operations table is
+	 * already set.
+	 */
+	dentry->d_op = NULL;
+	dentry->d_flags &= ~op_flags;
+	d_set_d_op(dentry, &zpl_dops_snapdirs);
+	dentry->d_flags |= extraflags;
+#else
+	/*
+	 * Since 6.17 there's no exported way to modify dentry ops, so we have
+	 * to reach in and do it ourselves. This should be safe for our very
+	 * narrow use case, which is to create or splice in an entry to give
+	 * access to a snapshot.
+	 *
+	 * We need to set the op flags directly. We hardcode
+	 * DCACHE_OP_REVALIDATE because that's the only operation we have; if
+	 * we ever extend zpl_dops_snapdirs we will need to update the op flags
+	 * to match.
+	 */
+	spin_lock(&dentry->d_lock);
+	dentry->d_op = &zpl_dops_snapdirs;
+	dentry->d_flags &= ~op_flags;
+	dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags;
+	spin_unlock(&dentry->d_lock);
+#endif
+}
+
 static struct dentry *
 zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
    unsigned int flags)
@@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
 		return (ERR_PTR(error));

 	ASSERT(error == 0 || ip == NULL);
-	d_clear_d_op(dentry);
-	d_set_d_op(dentry, &zpl_dops_snapdirs);
-	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
-
+	set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT);
 	return (d_splice_alias(ip, dentry));
 }

@@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)

 	error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
 	if (error == 0) {
-		d_clear_d_op(dentry);
-		d_set_d_op(dentry, &zpl_dops_snapdirs);
+		set_snapdir_dentry_ops(dentry, 0);
 		d_instantiate(dentry, ip);
 	}

@@ -22,6 +22,7 @@
 /*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */


@@ -444,6 +445,7 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 	return (ret);
 }

+#ifdef HAVE_WRITE_CACHE_PAGES
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
@@ -465,6 +467,78 @@ zpl_write_cache_pages(struct address_space *mapping,
 #endif
 	return (result);
 }
+#else
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data)
+{
+	pgoff_t start = wbc->range_start >> PAGE_SHIFT;
+	pgoff_t end = wbc->range_end >> PAGE_SHIFT;
+
+	struct folio_batch fbatch;
+	folio_batch_init(&fbatch);
+
+	/*
+	 * This atomically (-ish) tags all DIRTY pages in the range with
+	 * TOWRITE, allowing users to continue dirtying or undirtying pages
+	 * while we get on with writeback, without us treading on each other.
+	 */
+	tag_pages_for_writeback(mapping, start, end);
+
+	int err = 0;
+	unsigned int npages;
+
+	/*
+	 * Grab references to the TOWRITE pages just flagged. This may not get
+	 * all of them, so we do it in a loop until there are none left.
+	 */
+	while ((npages = filemap_get_folios_tag(mapping, &start, end,
+	    PAGECACHE_TAG_TOWRITE, &fbatch)) != 0) {
+
+		/* Loop over each page and write it out. */
+		struct folio *folio;
+		while ((folio = folio_batch_next(&fbatch)) != NULL) {
+			folio_lock(folio);
+
+			/*
+			 * If the folio has been remapped, or is no longer
+			 * dirty, then there's nothing to do.
+			 */
+			if (folio->mapping != mapping ||
+			    !folio_test_dirty(folio)) {
+				folio_unlock(folio);
+				continue;
+			}
+
+			/*
+			 * If writeback is already in progress, wait for it to
+			 * finish. We continue after this even if the page
+			 * ends up clean; zfs_putpage() will skip it if no
+			 * further work is required.
+			 */
+			while (folio_test_writeback(folio))
+				folio_wait_bit(folio, PG_writeback);
+
+			/*
+			 * Write it out and collect any error. zfs_putpage()
+			 * will clear the TOWRITE and DIRTY flags, and return
+			 * with the page unlocked.
+			 */
+			int ferr = zpl_putpage(&folio->page, wbc, data);
+			if (err == 0 && ferr != 0)
+				err = ferr;
+
+			/* Housekeeping for the caller. */
+			wbc->nr_to_write -= folio_nr_pages(folio);
+		}
+
+		/* Release any remaining references on the batch. */
+		folio_batch_release(&fbatch);
+	}
+
+	return (err);
+}
+#endif

 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -684,28 +758,44 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
 	return (error);
 }

-#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
-#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)
+#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL)
+
+
+static const struct {
+	uint64_t zfs_flag;	/* ZFS_* bit tested against z_pflags */
+	uint32_t fs_flag;	/* equivalent FS_*_FL flag (flags ioctls) */
+	uint32_t xflag;		/* equivalent FS_XFLAG_* flag (fsx_xflags) */
+} flags_lookup[] = {
+	{ZFS_IMMUTABLE, FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE},
+	{ZFS_APPENDONLY, FS_APPEND_FL, FS_XFLAG_APPEND},
+	{ZFS_NODUMP, FS_NODUMP_FL, FS_XFLAG_NODUMP},
+	{ZFS_PROJINHERIT, FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT}
+};

 static uint32_t
 __zpl_ioctl_getflags(struct inode *ip)
 {
 	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
 	uint32_t ioctl_flags = 0;
+	for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
+		if (zfs_flags & flags_lookup[i].zfs_flag)
+			ioctl_flags |= flags_lookup[i].fs_flag;

-	if (zfs_flags & ZFS_IMMUTABLE)
-		ioctl_flags |= FS_IMMUTABLE_FL;
+	return (ioctl_flags);
+}

-	if (zfs_flags & ZFS_APPENDONLY)
-		ioctl_flags |= FS_APPEND_FL;
+static uint32_t
+__zpl_ioctl_getxflags(struct inode *ip)
+{
+	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+	uint32_t ioctl_flags = 0;

-	if (zfs_flags & ZFS_NODUMP)
-		ioctl_flags |= FS_NODUMP_FL;
+	for (int i = 0; i < ARRAY_SIZE(flags_lookup); i++)
+		if (zfs_flags & flags_lookup[i].zfs_flag)
+			ioctl_flags |= flags_lookup[i].xflag;

-	if (zfs_flags & ZFS_PROJINHERIT)
-		ioctl_flags |= ZFS_PROJINHERIT_FL;
-
-	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+	return (ioctl_flags);
 }

 /*
@@ -719,6 +809,7 @@ zpl_ioctl_getflags(struct file *filp, void __user *arg)
 	int err;

 	flags = __zpl_ioctl_getflags(file_inode(filp));
+	flags = flags & ZFS_FL_USER_VISIBLE;
 	err = copy_to_user(arg, &flags, sizeof (flags));

 	return (err);
@@ -742,7 +833,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
 	xoptattr_t *xoap;

 	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
-	    ZFS_PROJINHERIT_FL))
+	    FS_PROJINHERIT_FL))
 		return (-EOPNOTSUPP);

 	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
@@ -773,7 +864,51 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
 	    xoap->xoa_appendonly);
 	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
 	    xoap->xoa_nodump);
-	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+	FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
+	    xoap->xoa_projinherit);
+
+#undef	FLAG_CHANGE
+
+	return (0);
+}
+
+static int
+__zpl_ioctl_setxflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+	xoptattr_t *xoap;
+
+	if (ioctl_flags & ~(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND |
+	    FS_XFLAG_NODUMP | FS_XFLAG_PROJINHERIT))
+		return (-EOPNOTSUPP);
+
+	if ((fchange(ioctl_flags, zfs_flags, FS_XFLAG_IMMUTABLE,
+	    ZFS_IMMUTABLE) ||
+	    fchange(ioctl_flags, zfs_flags, FS_XFLAG_APPEND, ZFS_APPENDONLY)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return (-EPERM);
+
+	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
+		return (-EACCES);
+
+	xva_init(xva);
+	xoap = xva_getxoptattr(xva);
+
+#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
+	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
+	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
+		XVA_SET_REQ(xva, (xflag));	\
+		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
+	}	\
+} while (0)
+
+	FLAG_CHANGE(FS_XFLAG_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+	    xoap->xoa_immutable);
+	FLAG_CHANGE(FS_XFLAG_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+	    xoap->xoa_appendonly);
+	FLAG_CHANGE(FS_XFLAG_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+	    xoap->xoa_nodump);
+	FLAG_CHANGE(FS_XFLAG_PROJINHERIT, ZFS_PROJINHERIT, XAT_PROJINHERIT,
 	    xoap->xoa_projinherit);

 #undef	FLAG_CHANGE
@@ -814,7 +949,7 @@ zpl_ioctl_getxattr(struct file *filp, void __user *arg)
 	struct inode *ip = file_inode(filp);
 	int err;

-	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+	fsx.fsx_xflags = __zpl_ioctl_getxflags(ip);
 	fsx.fsx_projid = ITOZ(ip)->z_projid;
 	err = copy_to_user(arg, &fsx, sizeof (fsx));

@@ -838,7 +973,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg)
 	if (!zpl_is_valid_projid(fsx.fsx_projid))
 		return (-EINVAL);

-	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+	err = __zpl_ioctl_setxflags(ip, fsx.fsx_xflags, &xva);
 	if (err)
 		return (err);

@@ -22,6 +22,8 @@
 /*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */


@@ -32,7 +34,22 @@
 #include <sys/zpl.h>
 #include <linux/iversion.h>
 #include <linux/version.h>
+#include <linux/vfs_compat.h>

+/*
+ * What to do when the last reference to an inode is released. If 0, the kernel
+ * will cache it on the superblock. If 1, the inode will be freed immediately.
+ * See zpl_drop_inode().
+ */
+int zfs_delete_inode = 0;
+
+/*
+ * What to do when the last reference to a dentry is released. If 0, the kernel
+ * will cache it until the entry (file) is destroyed. If 1, the dentry will be
+ * marked for cleanup, at which time its inode reference will be released. See
+ * zpl_dentry_delete().
+ */
+int zfs_delete_dentry = 0;

 static struct inode *
 zpl_inode_alloc(struct super_block *sb)
@@ -77,11 +94,36 @@ zpl_dirty_inode(struct inode *ip, int flags)
 }

 /*
- * When ->drop_inode() is called its return value indicates if the
- * inode should be evicted from the inode cache.  If the inode is
- * unhashed and has no links the default policy is to evict it
- * immediately.
+ * ->drop_inode() is called when the last reference to an inode is released.
+ * Its return value indicates if the inode should be destroyed immediately, or
+ * cached on the superblock structure.
 *
+ * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
+ * "destroy immediately" if the inode is unhashed or has no links (roughly: no
+ * longer exists on disk). On datasets with millions of rarely-accessed files,
+ * this can cause a large amount of memory to be "pinned" by cached inodes,
+ * which in turn pin their associated dnodes and dbufs, until the kernel starts
+ * reporting memory pressure and requests OpenZFS release some memory (see
+ * zfs_prune()).
+ *
+ * When set to 1, we call generic_delete_inode(), which always returns "destroy
+ * immediately", resulting in inodes being destroyed immediately, releasing
+ * their associated dnodes and dbufs to the dbuf cache and the ARC to be
+ * evicted as normal.
+ *
+ * Note that the "last reference" doesn't always mean the last _userspace_
+ * reference; the dentry cache also holds a reference, so "busy" inodes will
+ * still be kept alive that way (subject to dcache tuning).
+ */
+static int
+zpl_drop_inode(struct inode *ip)
+{
+	/* zfs_delete_inode selects "always destroy" over the default policy. */
+	return (zfs_delete_inode ?
+	    generic_delete_inode(ip) : generic_drop_inode(ip));
+}
+
+/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
@@ -470,6 +512,7 @@ const struct super_operations zpl_super_operations = {
 	.destroy_inode		= zpl_inode_destroy,
 	.dirty_inode		= zpl_dirty_inode,
 	.write_inode		= NULL,
+	.drop_inode		= zpl_drop_inode,
 	.evict_inode		= zpl_evict_inode,
 	.put_super		= zpl_put_super,
 	.sync_fs		= zpl_sync_fs,
@@ -480,6 +523,35 @@ const struct super_operations zpl_super_operations = {
 	.show_stats		= NULL,
 };

+/*
+ * ->d_delete() is called when the last reference to a dentry is released. Its
+ *  return value indicates if the dentry should be destroyed immediately, or
+ *  retained in the dentry cache.
+ *
+ * By default (zfs_delete_dentry=0) the kernel will always cache unused
+ * entries.  Each dentry holds an inode reference, so cached dentries can hold
+ * the final inode reference indefinitely, leading to the inode and its related
+ * data being pinned (see zpl_drop_inode()).
+ *
+ * When set to 1, we signal that the dentry should be destroyed immediately and
+ * never cached. This reduces memory usage, at the cost of higher overheads to
+ * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
+ * reloaded and reinflated.
+ *
+ * Note that userspace does not have direct control over dentry references and
+ * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
+ * (eg vm.vfs_cache_pressure).
+ */
+static int
+zpl_dentry_delete(const struct dentry *dentry)
+{
+	return (zfs_delete_dentry != 0);
+}
+
+const struct dentry_operations zpl_dentry_operations = {
+	.d_delete = zpl_dentry_delete,
+};
+
 struct file_system_type zpl_fs_type = {
 	.owner			= THIS_MODULE,
 	.name			= ZFS_DRIVER,
@@ -491,3 +563,10 @@ struct file_system_type zpl_fs_type = {
 	.mount			= zpl_mount,
 	.kill_sb		= zpl_kill_sb,
 };
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
+	"Delete inodes as soon as the last reference is released.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
+	"Delete dentries from dentry cache as soon as the last reference is "
+	"released.");
@@ -21,7 +21,7 @@
 */
 /*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

@@ -377,16 +377,14 @@ zvol_discard(zv_request_t *zvr)
 	}

 	/*
-	 * Align the request to volume block boundaries when a secure erase is
-	 * not required.  This will prevent dnode_free_range() from zeroing out
-	 * the unaligned parts which is slow (read-modify-write) and useless
-	 * since we are not freeing any space by doing so.
+	 * Align the request to volume block boundaries. This will prevent
+	 * dnode_free_range() from zeroing out the unaligned parts which is
+	 * slow (read-modify-write) and useless since we are not freeing any
+	 * space by doing so.
 	 */
-	if (!io_is_secure_erase(bio, rq)) {
-		start = P2ROUNDUP(start, zv->zv_volblocksize);
-		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
-		size = end - start;
-	}
+	start = P2ROUNDUP(start, zv->zv_volblocksize);
+	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
+	size = end - start;

 	if (start >= end)
 		goto unlock;
@@ -506,6 +504,24 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }

+/*
+ * Note:
+ *
+ * The kernel uses different enum names for the IO opcode, depending on the
+ * kernel version ('req_opf', 'req_op').  To sidestep this, use macros rather
+ * than inline functions for these checks.
+ */
+/* Should this IO go down the zvol write path? NB: evaluates op 3 times. */
+#define	ZVOL_OP_IS_WRITE(op) \
+	((op) == REQ_OP_WRITE || \
+	(op) == REQ_OP_FLUSH || \
+	(op) == REQ_OP_DISCARD)
+
+/* Is this IO type supported by zvols? (read, or any write-path op) */
+#define	ZVOL_OP_IS_SUPPORTED(op) ((op) == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
+
+/* Get the IO opcode from the BIO when one is given, else from the request */
+#define	ZVOL_OP(bio, rq) ((bio) != NULL ? bio_op(bio) : req_op(rq))

 /*
 * Process a BIO or request
@@ -523,7 +539,33 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
+		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
+		    rq != NULL ? "request" : "BIO",
+		    ZVOL_OP(bio, rq),
+		    rq != NULL ? rq->cmd_flags : bio->bi_opf);
+		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
+		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
+		goto out;
+	}
+
+	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
+		rw = WRITE;
+	} else {
+		rw = READ;
+	}
+
+	/*
+	 * Sanity check
+	 *
+	 * If we're a BIO, check our rw matches the kernel's
+	 * bio_data_dir(bio) rw.  We need to check because we support fewer
+	 * IO operations, and want to verify that what we think are reads and
+	 * writes from those operations match what the kernel thinks.
+	 */
+	ASSERT(rq != NULL || rw == bio_data_dir(bio));

 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
@@ -628,7 +670,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
-		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+		if (io_is_discard(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
@@ -1024,12 +1066,13 @@ zvol_os_clear_private(zvol_state_t *zv)
 * tiny devices.  For devices over 1 Mib a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
-static int
-zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+static inline int
+zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
 {
-	zvol_state_t *zv = bdev->bd_disk->private_data;
+	zvol_state_t *zv = disk->private_data;
 	sector_t sectors;

+	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);

 	sectors = get_capacity(zv->zv_zso->zvo_disk);
@@ -1048,6 +1091,20 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return (0);
 }

+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
+static int
+zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(disk, geo));
+}
+#else
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	return (zvol_getgeo_impl(bdev->bd_disk, geo));
+}
+#endif
+
 /*
 * Why have two separate block_device_operations structs?
 *
@@ -1489,7 +1546,7 @@ zvol_os_free(zvol_state_t *zv)
 	if (zv->zv_zso->use_blk_mq)
 		blk_mq_free_tag_set(&zv->zv_zso->tag_set);

-	ida_simple_remove(&zvol_ida,
+	ida_free(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

 	cv_destroy(&zv->zv_removing_cv);
@@ -1623,7 +1680,7 @@ zvol_os_create_minor(const char *name)
 	if (zvol_inhibit_dev)
 		return (0);

-	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
@@ -1631,7 +1688,7 @@ zvol_os_create_minor(const char *name)
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}

@@ -1639,7 +1696,7 @@ zvol_os_create_minor(const char *name)
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}

@@ -1741,7 +1798,7 @@ out_doi:
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
-		ida_simple_remove(&zvol_ida, idx);
+		ida_free(&zvol_ida, idx);
 	}

 	return (error);
@@ -1111,13 +1111,6 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,

 		func_raidz_gen(caddrs, daddr, len, dlen);

-		for (i = parity-1; i >= 0; i--) {
-			abd_iter_unmap(&caiters[i]);
-			c_cabds[i] =
-			    abd_advance_abd_iter(cabds[i], c_cabds[i],
-			    &caiters[i], len);
-		}
-
 		if (dsize > 0) {
 			abd_iter_unmap(&daiter);
 			c_dabd =
@@ -1126,6 +1119,13 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
 			dsize -= dlen;
 		}

+		for (i = parity - 1; i >= 0; i--) {
+			abd_iter_unmap(&caiters[i]);
+			c_cabds[i] =
+			    abd_advance_abd_iter(cabds[i], c_cabds[i],
+			    &caiters[i], len);
+		}
+
 		csize -= len;
 	}
 	abd_exit_critical(flags);
@@ -1194,7 +1194,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,

 		func_raidz_rec(xaddrs, len, caddrs, mul);

-		for (i = parity-1; i >= 0; i--) {
+		for (i = parity - 1; i >= 0; i--) {
 			abd_iter_unmap(&xiters[i]);
 			abd_iter_unmap(&citers[i]);
 			c_tabds[i] =
@@ -212,7 +212,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
 	char *ds_name;

 	ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
-	ASSERT3S(ds_name, !=, NULL);
+	ASSERT3P(ds_name, !=, NULL);
 	(void) strlcpy(ds_name, name,
 	    KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
 }
@@ -1508,7 +1508,7 @@ ddt_configure(ddt_t *ddt, boolean_t new)
 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
 		    &ddt->ddt_dir_object);
 		if (error == 0) {
-			ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os);
+			ASSERT3P(spa->spa_meta_objset, ==, ddt->ddt_os);

 			error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
 			    DDT_DIR_VERSION, sizeof (uint64_t), 1,
@@ -262,7 +262,7 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
 void
 ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
 {
-	ASSERT3U(dlu->dlu_dbp, !=, NULL);
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);

 	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
 	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
@@ -312,7 +312,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
 void
 ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
 {
-	ASSERT3U(dlu->dlu_dbp, !=, NULL);
+	ASSERT3P(dlu->dlu_dbp, !=, NULL);
 	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
 	ASSERT3U(dlu->dlu_offset, >, 0);

@@ -770,6 +770,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
 		 */
 		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
 		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
+		if (limit == 0)
+			end2 = start2;
 		do {
 			level2++;
 			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
@@ -1684,8 +1686,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object,

 	dmu_object_info_from_dnode(dn, &doi);

-	for (uint64_t off = 0; off < doi.doi_max_offset;
-	    off += dmu_prefetch_max) {
+	for (uint64_t off = 0; off < doi.doi_max_offset &&
+	    dmu_prefetch_max > 0; off += dmu_prefetch_max) {
 		/* dbuf_read doesn't prefetch L1 blocks. */
 		dmu_prefetch_by_dnode(dn, 1, off,
 		    dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);
@@ -2694,6 +2694,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	return (error);
 }

+/*
+ * Move *offset to the first byte of the next lvl block, or to the last byte
+ * of the previous one for DNODE_FIND_BACKWARDS. B_FALSE if there is none.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int span = lvl * epbs + dn->dn_datablkshift;
+	uint64_t blkid, maxblkid;
+
+	if (span >= 8 * sizeof (uint64_t))
+		return (B_FALSE);
+
+	blkid = *offset >> span;
+	maxblkid = ~0ULL >> span;	/* highest blkid; defined for span 0 */
+	if (!(flags & DNODE_FIND_BACKWARDS) && blkid < maxblkid)
+		*offset = (blkid + 1) << span;
+	else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+		*offset = (blkid << span) - 1;
+	else
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
 /*
 * Find the next hole, data, or sparse region at or after *offset.
 * The value 'blkfill' tells us how many items we expect to find
@@ -2721,7 +2747,7 @@ int
 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
    int minlvl, uint64_t blkfill, uint64_t txg)
 {
-	uint64_t initial_offset = *offset;
+	uint64_t matched = *offset;
 	int lvl, maxlvl;
 	int error = 0;

@@ -2745,16 +2771,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,

 	maxlvl = dn->dn_phys->dn_nlevels;

-	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+	for (lvl = minlvl; lvl <= maxlvl; ) {
 		error = dnode_next_offset_level(dn,
 		    flags, offset, lvl, blkfill, txg);
-		if (error != ESRCH)
+		if (error == 0 && lvl > minlvl) {
+			--lvl;
+			matched = *offset;
+		} else if (error == ESRCH && lvl < maxlvl &&
+		    dnode_next_block(dn, flags, &matched, lvl)) {
+			/*
+			 * Continue search at next/prev offset in lvl+1 block.
+			 *
+			 * Usually we only search upwards at the start of the
+			 * search as higher level blocks point at a matching
+			 * minlvl block in most cases, but we backtrack if not.
+			 *
+			 * This can happen for txg > 0 searches if the block
+			 * contains only BPs/dnodes freed at that txg. It also
+			 * happens if we are still syncing out the tree, and
+			 * some BP's at higher levels are not updated yet.
+			 *
+			 * We must adjust offset to avoid coming back to the
+			 * same offset and getting stuck looping forever. This
+			 * also deals with the case where offset is already at
+			 * the beginning or end of the object.
+			 */
+			++lvl;
+			*offset = matched;
+		} else {
 			break;
-	}
-
-	while (error == 0 && --lvl >= minlvl) {
-		error = dnode_next_offset_level(dn,
-		    flags, offset, lvl, blkfill, txg);
+		}
 	}

 	/*
@@ -2766,9 +2812,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
 		error = 0;
 	}

-	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
-	    initial_offset < *offset : initial_offset > *offset))
-		error = SET_ERROR(ESRCH);
 out:
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_exit(&dn->dn_struct_rwlock);
@@ -145,6 +145,15 @@
 * Additionally, the duration is then extended by a random 25% to attempt to to
 * detect simultaneous imports.  For example, if both partner hosts are rebooted
 * at the same time and automatically attempt to import the pool.
+ *
+ * Once the read-only activity check completes and the pool is determined to
+ * be inactive a second check is performed to claim the pool.  During this
+ * phase the host writes out MMP uberblocks to each of the devices which are
+ * identical to the best uberblock but with a randomly selected sequence id.
+ * The "best" uberblock is then read back and it must contain this new sequence
+ * number.  This check is performed multiple times to ensure that there is
+ * no window where a concurrently importing system can incorrectly determine
+ * the pool to be inactive.
 */

 /*
@@ -237,8 +246,8 @@ mmp_thread_start(spa_t *spa)
 		if (!mmp->mmp_thread) {
 			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
 			    spa, 0, &p0, TS_RUN, defclsyspri);
-			zfs_dbgmsg("MMP thread started pool '%s' "
-			    "gethrtime %llu", spa_name(spa), gethrtime());
+			zfs_dbgmsg("mmp: mmp thread started spa=%s "
+			    "gethrtime=%llu", spa_name(spa), gethrtime());
 		}
 		mutex_exit(&mmp->mmp_thread_lock);
 	}
@@ -257,7 +266,7 @@ mmp_thread_stop(spa_t *spa)
 		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
 	}
 	mutex_exit(&mmp->mmp_thread_lock);
-	zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
+	zfs_dbgmsg("mmp: mmp thread stopped spa=%s gethrtime=%llu",
 	    spa_name(spa), gethrtime());

 	ASSERT(mmp->mmp_thread == NULL);
@@ -449,9 +458,9 @@ mmp_write_uberblock(spa_t *spa)
 	spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
 	lock_acquire_time = gethrtime() - lock_acquire_time;
 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
-		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
-		    "gethrtime %llu", spa_name(spa), lock_acquire_time,
-		    gethrtime());
+		zfs_dbgmsg("mmp: long SCL_STATE acquisition, spa=%s "
+		    "acquire_time=%llu gethrtime=%llu", spa_name(spa),
+		    lock_acquire_time, gethrtime());

 	mutex_enter(&mmp->mmp_io_lock);

@@ -474,8 +483,8 @@ mmp_write_uberblock(spa_t *spa)
 			spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
 			    gethrestime_sec(), mmp->mmp_delay, NULL, 0,
 			    mmp->mmp_kstat_id++, error);
-			zfs_dbgmsg("MMP error choosing leaf pool '%s' "
-			    "gethrtime %llu fail_mask %#x", spa_name(spa),
+			zfs_dbgmsg("mmp: error choosing leaf, spa=%s "
+			    "gethrtime=%llu fail_mask=%#x", spa_name(spa),
 			    gethrtime(), error);
 		}
 		mutex_exit(&mmp->mmp_io_lock);
@@ -485,11 +494,11 @@ mmp_write_uberblock(spa_t *spa)

 	vd = spa->spa_mmp.mmp_last_leaf;
 	if (mmp->mmp_skip_error != 0) {
-		mmp->mmp_skip_error = 0;
-		zfs_dbgmsg("MMP write after skipping due to unavailable "
-		    "leaves, pool '%s' gethrtime %llu leaf %llu",
+		zfs_dbgmsg("mmp: write after skipping due to unavailable "
+		    "leaves, spa=%s gethrtime=%llu vdev=%llu error=%d",
 		    spa_name(spa), (u_longlong_t)gethrtime(),
-		    (u_longlong_t)vd->vdev_guid);
+		    (u_longlong_t)vd->vdev_guid, mmp->mmp_skip_error);
+		mmp->mmp_skip_error = 0;
 	}

 	if (mmp->mmp_zio_root == NULL)
@@ -540,6 +549,108 @@ mmp_write_uberblock(spa_t *spa)
 	zio_nowait(zio);
 }

+static void
+mmp_claim_uberblock_sync_done(zio_t *zio)
+{
+	/* Count only error-free writes whose top-level vdev has an ms array. */
+	if (zio->io_error != 0 || zio->io_vd->vdev_top->vdev_ms_array == 0)
+		return;
+	atomic_inc_64(zio->io_private);
+}
+
+/*
+ * Write the uberblock to the first label of all leaves of the specified vdev.
+ * Two writes required for each mirror, one for a singleton, and parity+1 for
+ * raidz or draid vdevs.
+ */
+static void
+mmp_claim_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+    uint64_t *req_writes, uberblock_t *ub, vdev_t *vd, int flags)
+{
+	for (uint64_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_islog || cvd->vdev_isspare || cvd->vdev_isl2cache)
+			continue;
+
+		if (cvd->vdev_top == cvd) {
+			uint64_t nparity = vdev_get_nparity(cvd);
+			if (nparity) {
+				*req_writes += nparity + 1;
+			} else {
+				*req_writes +=
+				    MIN(MAX(cvd->vdev_children, 1), 2);
+			}
+		}
+
+		mmp_claim_uberblock_sync(zio, good_writes, req_writes,
+		    ub, cvd, flags);
+	}
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return;
+
+	if (!vdev_writeable(vd))
+		return;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return;
+
+	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+	abd_zero_off(ub_abd, sizeof (uberblock_t),
+	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
+
+	vdev_label_write(zio, vd, 0, ub_abd,
+	    VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
+	    MMP_BLOCKS_PER_LABEL), VDEV_UBERBLOCK_SIZE(vd),
+	    mmp_claim_uberblock_sync_done, good_writes,
+	    flags | ZIO_FLAG_DONT_PROPAGATE);
+
+	abd_free(ub_abd);
+}
+
+int
+mmp_claim_uberblock(spa_t *spa, vdev_t *vd, uberblock_t *ub)
+{
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+	uint64_t good_writes = 0;
+	uint64_t req_writes = 0;
+	zio_t *zio;
+
+	ASSERT(MMP_VALID(ub));
+	ASSERT(MMP_SEQ_VALID(ub));
+
+	spa_config_enter(spa, SCL_ALL, mmp_tag, RW_WRITER);
+
+	/* Sync the uberblock to all writeable leaves */
+	zio = zio_root(spa, NULL, NULL, flags);
+	mmp_claim_uberblock_sync(zio, &good_writes, &req_writes, ub, vd, flags);
+	(void) zio_wait(zio);
+
+	/* Flush the new uberblocks so they're immediately visible */
+	zio = zio_root(spa, NULL, NULL, flags);
+	zio_flush(zio, vd);
+	(void) zio_wait(zio);
+
+	spa_config_exit(spa, SCL_ALL, mmp_tag);
+
+	zfs_dbgmsg("mmp: claiming uberblock, spa=%s txg=%llu seq=%llu "
+	    "req_writes=%llu good_writes=%llu", spa_load_name(spa),
+	    (u_longlong_t)ub->ub_txg, (u_longlong_t)MMP_SEQ(ub),
+	    (u_longlong_t)req_writes, (u_longlong_t)good_writes);
+
+	/*
+	 * To guarantee visibility from a remote host we require a minimum
+	 * number of good writes. For raidz/draid vdevs parity+1 writes, for
+	 * mirrors 2 writes, and for singletons 1 write.
+	 */
+	if (req_writes == 0 || good_writes < req_writes)
+		return (SET_ERROR(EIO));
+
+	return (0);
+}
+
 static __attribute__((noreturn)) void
 mmp_thread(void *arg)
 {
@@ -616,11 +727,11 @@ mmp_thread(void *arg)
 			next_time = gethrtime() + mmp_interval / leaves;

 		if (mmp_fail_ns != last_mmp_fail_ns) {
-			zfs_dbgmsg("MMP interval change pool '%s' "
-			    "gethrtime %llu last_mmp_interval %llu "
-			    "mmp_interval %llu last_mmp_fail_intervals %u "
-			    "mmp_fail_intervals %u mmp_fail_ns %llu "
-			    "skip_wait %d leaves %d next_time %llu",
+			zfs_dbgmsg("mmp: interval change, spa=%s "
+			    "gethrtime=%llu last_mmp_interval=%llu "
+			    "mmp_interval=%llu last_mmp_fail_intervals=%u "
+			    "mmp_fail_intervals=%u mmp_fail_ns=%llu "
+			    "skip_wait=%d leaves=%d next_time=%llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)last_mmp_interval,
 			    (u_longlong_t)mmp_interval, last_mmp_fail_intervals,
@@ -635,9 +746,9 @@ mmp_thread(void *arg)
 		 */
 		if ((!last_spa_multihost && multihost) ||
 		    (last_spa_suspended && !suspended)) {
-			zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
-			    "last_spa_multihost %u multihost %u "
-			    "last_spa_suspended %u suspended %u",
+			zfs_dbgmsg("mmp: state change spa=%s: gethrtime=%llu "
+			    "last_spa_multihost=%u multihost=%u "
+			    "last_spa_suspended=%u suspended=%u",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    last_spa_multihost, multihost, last_spa_suspended,
 			    suspended);
@@ -663,9 +774,10 @@ mmp_thread(void *arg)
 		 */
 		if (multihost && !suspended && mmp_fail_intervals &&
 		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
-			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
-			    "mmp_last_write %llu mmp_interval %llu "
-			    "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
+			zfs_dbgmsg("mmp: suspending pool, spa=%s "
+			    "gethrtime=%llu mmp_last_write=%llu "
+			    "mmp_interval=%llu mmp_fail_intervals=%llu "
+			    "mmp_fail_ns=%llu txg=%llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)mmp->mmp_last_write,
 			    (u_longlong_t)mmp_interval,
@@ -461,6 +461,8 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MIN_ALLOC, spa->spa_min_alloc);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, spa->spa_max_alloc);
 	if (spa->spa_comment != NULL)
 		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
 		    spa->spa_comment);
@@ -413,7 +413,7 @@ spa_load_failed(spa_t *spa, const char *fmt, ...)
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);

-	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa_load_name(spa),
 	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
 }

@@ -427,7 +427,7 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);

-	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+	zfs_dbgmsg("spa_load(%s, config %s): %s", spa_load_name(spa),
 	    spa->spa_trust_config ? "trusted" : "untrusted", buf);

 	spa_import_progress_set_notes_nolog(spa, "%s", buf);
@@ -814,6 +814,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
+	spa->spa_max_alloc = 0;
 	spa->spa_gcd_alloc = INT_MAX;

 	/* Reset cached value */
@@ -856,6 +857,9 @@ spa_remove(spa_t *spa)
 	if (spa->spa_root)
 		spa_strfree(spa->spa_root);

+	if (spa->spa_load_name)
+		spa_strfree(spa->spa_load_name);
+
 	while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path != NULL)
 			spa_strfree(dp->scd_path);
@@ -1241,7 +1245,7 @@ spa_vdev_enter(spa_t *spa)
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);

-	ASSERT0(spa->spa_export_thread);
+	ASSERT0P(spa->spa_export_thread);

 	vdev_autotrim_stop_all(spa);

@@ -1260,7 +1264,7 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);

-	ASSERT0(spa->spa_export_thread);
+	ASSERT0P(spa->spa_export_thread);

 	vdev_autotrim_stop_all(spa);

@@ -1776,6 +1780,19 @@ spa_name(spa_t *spa)
 	return (spa->spa_name);
 }

+char *
+spa_load_name(spa_t *spa)
+{
+	/*
+	 * During spa_tryimport() the pool name includes a unique prefix.
+	 * Prefer the original name (for log messages); else use spa_name.
+	 */
+	if (spa->spa_load_name)
+		return (spa->spa_load_name);
+
+	return (spa->spa_name);
+}
+
 uint64_t
 spa_guid(spa_t *spa)
 {
@@ -1876,6 +1893,19 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 	return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation);
 }

+/*
+ * Return the range of minimum allocation sizes for the normal allocation
+ * class via *min_alloc (smallest) and *max_alloc (largest). External DMU
+ * consumers can use it to estimate potential wasted capacity when setting
+ * an object's recordsize; mainly for dRAID, which pads to a full stripe.
+ */
+void
+spa_get_min_alloc_range(spa_t *spa, uint64_t *min_alloc, uint64_t *max_alloc)
+{
+	*min_alloc = spa->spa_min_alloc;
+	*max_alloc = spa->spa_max_alloc;
+}
+
 /*
 * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
 * (3.2%), minus the embedded log space.  On very small pools, it may be
@@ -3075,6 +3105,7 @@ EXPORT_SYMBOL(spa_set_rootblkptr);
 EXPORT_SYMBOL(spa_altroot);
 EXPORT_SYMBOL(spa_sync_pass);
 EXPORT_SYMBOL(spa_name);
+EXPORT_SYMBOL(spa_load_name);
 EXPORT_SYMBOL(spa_guid);
 EXPORT_SYMBOL(spa_last_synced_txg);
 EXPORT_SYMBOL(spa_first_txg);
@@ -3083,6 +3114,7 @@ EXPORT_SYMBOL(spa_version);
 EXPORT_SYMBOL(spa_state);
 EXPORT_SYMBOL(spa_load_state);
 EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_min_alloc_range); /* for Lustre */
 EXPORT_SYMBOL(spa_get_dspace);
 EXPORT_SYMBOL(spa_update_dspace);
 EXPORT_SYMBOL(spa_deflate);
@@ -1474,12 +1474,14 @@ vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (min_alloc < spa->spa_min_alloc)
 		spa->spa_min_alloc = min_alloc;
-	if (spa->spa_gcd_alloc == INT_MAX) {
+
+	if (min_alloc > spa->spa_max_alloc)
+		spa->spa_max_alloc = min_alloc;
+
+	if (spa->spa_gcd_alloc == INT_MAX)
 		spa->spa_gcd_alloc = min_alloc;
-	} else {
-		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
-		    spa->spa_gcd_alloc);
-	}
+	else
+		spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc);
 }

 void
@@ -1533,8 +1535,7 @@ vdev_metaslab_group_create(vdev_t *vd)
 			if (vd->vdev_ashift < spa->spa_min_ashift)
 				spa->spa_min_ashift = vd->vdev_ashift;

-			uint64_t min_alloc = vdev_get_min_alloc(vd);
-			vdev_spa_set_alloc(spa, min_alloc);
+			vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 		}
 	}
 }
@@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg)
 		abd_return_buf_copy(zio->io_abd, buf, size);
 	} else {
 		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
-		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+		err = zfs_file_pwrite(vf->vf_file, buf, size, off,
+		    vd->vdev_ashift, &resid);
 		abd_return_buf(zio->io_abd, buf, size);
 	}
 	zio->io_error = err;
--- a/Show More
+++ b/Show More