mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-06-10 07:56:39 +03:00
Compare commits
72 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 34f96a15c7 | |||
| fdb5078d82 | |||
| 8cca55f18b | |||
| ef1ee9421d | |||
| 435006d81d | |||
| 725886d67a | |||
| bce049389d | |||
| c3d74a0d6f | |||
| 9cf069b366 | |||
| 3f87c9c276 | |||
| 9d14ce4db7 | |||
| 3b64a9619f | |||
| a072611eef | |||
| 0fe10361ba | |||
| 3e78905ffb | |||
| 46de04d2e9 | |||
| 41ca2296cd | |||
| 9651668457 | |||
| a49c957299 | |||
| d1d706350e | |||
| 11f844175e | |||
| 57b614e025 | |||
| 0c7d6e20e6 | |||
| b9c45fe68c | |||
| f72226a75c | |||
| 97fe86837c | |||
| df5e02d253 | |||
| 245adb6a4f | |||
| 82a0868ce4 | |||
| e7e0bb3b61 | |||
| 6af1f61ad4 | |||
| 8c4f625c12 | |||
| 7882e85a9b | |||
| 6b38d0f7ff | |||
| 80b6457fcd | |||
| 2518f4b124 | |||
| 90d2c4407a | |||
| f7698f47e8 | |||
| 6c1130a730 | |||
| 74b539d3dc | |||
| 024e60b927 | |||
| 5289f6f961 | |||
| 094305c937 | |||
| a826f7a993 | |||
| 86bf73c1eb | |||
| 1d293b377a | |||
| 22eb2bdce3 | |||
| 809b553940 | |||
| abb6211e7a | |||
| c405a7a35c | |||
| 4808641e71 | |||
| 30fa92bff3 | |||
| fd5a27c9db | |||
| 3ad3f439bb | |||
| a46ce73ca8 | |||
| 90790955a6 | |||
| 95abbc71c3 | |||
| fc658b9935 | |||
| 271b9797c5 | |||
| 582e7847f6 | |||
| 6d378564b4 | |||
| 0c928f7a37 | |||
| c79d5e4f33 | |||
| 347d68048a | |||
| acf3871ef8 | |||
| 21d5f25724 | |||
| 7e945a5b3f | |||
| 85ce6b8ab2 | |||
| 954894ee53 | |||
| a4e775d2ca | |||
| 661310ff5c | |||
| f9d59b579e |
-21
@@ -1,21 +0,0 @@
|
||||
env:
|
||||
CIRRUS_CLONE_DEPTH: 1
|
||||
ARCH: amd64
|
||||
|
||||
build_task:
|
||||
matrix:
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-5
|
||||
freebsd_instance:
|
||||
image_family: freebsd-14-2
|
||||
freebsd_instance:
|
||||
image_family: freebsd-15-0-snap
|
||||
prepare_script:
|
||||
- pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py311-packaging py311-cffi py311-sysctl
|
||||
configure_script:
|
||||
- env MAKE=gmake ./autogen.sh
|
||||
- env MAKE=gmake ./configure --with-config="user" --with-python=3.11
|
||||
build_script:
|
||||
- gmake -j `sysctl -n kern.smp.cpus`
|
||||
install_script:
|
||||
- gmake install
|
||||
@@ -2,3 +2,4 @@ name: "Custom CodeQL Analysis"
|
||||
|
||||
queries:
|
||||
- uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql
|
||||
- uses: ./.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
/**
|
||||
* @name Detect mismatched dsl_dataset_hold/_rele pairs
|
||||
* @description Flags instances of issue #12014 where
|
||||
* - a dataset held with dsl_dataset_hold_obj() ends up in dsl_dataset_rele_flags(), or
|
||||
* - a dataset held with dsl_dataset_hold_obj_flags() ends up in dsl_dataset_rele().
|
||||
* @kind problem
|
||||
* @severity error
|
||||
* @tags correctness
|
||||
* @id cpp/dslDatasetHoldReleMismatch
|
||||
*/
|
||||
|
||||
import cpp
|
||||
|
||||
from Variable ds, Call holdCall, Call releCall, string message
|
||||
where
|
||||
ds.getType().toString() = "dsl_dataset_t *" and
|
||||
holdCall.getASuccessor*() = releCall and
|
||||
(
|
||||
(holdCall.getTarget().getName() = "dsl_dataset_hold_obj_flags" and
|
||||
holdCall.getArgument(4).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
|
||||
releCall.getTarget().getName() = "dsl_dataset_rele" and
|
||||
releCall.getArgument(0).(VariableAccess).getTarget() = ds and
|
||||
message = "Held with dsl_dataset_hold_obj_flags but released with dsl_dataset_rele")
|
||||
or
|
||||
(holdCall.getTarget().getName() = "dsl_dataset_hold_obj" and
|
||||
holdCall.getArgument(3).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and
|
||||
releCall.getTarget().getName() = "dsl_dataset_rele_flags" and
|
||||
releCall.getArgument(0).(VariableAccess).getTarget() = ds and
|
||||
message = "Held with dsl_dataset_hold_obj but released with dsl_dataset_rele_flags")
|
||||
)
|
||||
select releCall,
|
||||
"Mismatched release: held with $@ but released with " + releCall.getTarget().getName() + " for dataset $@",
|
||||
holdCall, holdCall.getTarget().getName(),
|
||||
ds, ds.toString()
|
||||
@@ -12,10 +12,10 @@ OS="$1"
|
||||
# OS variant (virt-install --os-variant list)
|
||||
OSv=$OS
|
||||
|
||||
# compressed with .zst extension
|
||||
REPO="https://github.com/mcmilk/openzfs-freebsd-images"
|
||||
FREEBSD="$REPO/releases/download/v2025-04-13"
|
||||
URLzs=""
|
||||
# FreeBSD URLs
|
||||
FREEBSD_REL="https://download.freebsd.org/releases/CI-IMAGES"
|
||||
FREEBSD_SNAP="https://download.freebsd.org/snapshots/CI-IMAGES"
|
||||
URLxz=""
|
||||
|
||||
# Ubuntu mirrors
|
||||
UBMIRROR="https://cloud-images.ubuntu.com"
|
||||
@@ -25,6 +25,10 @@ UBMIRROR="https://cloud-images.ubuntu.com"
|
||||
# default NIC model for VMs
|
||||
NIC="virtio"
|
||||
|
||||
# additional options for virt-install
|
||||
OPTS[0]=""
|
||||
OPTS[1]=""
|
||||
|
||||
case "$OS" in
|
||||
almalinux8)
|
||||
OSNAME="AlmaLinux 8"
|
||||
@@ -61,6 +65,14 @@ case "$OS" in
|
||||
OSNAME="Debian 12"
|
||||
URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2"
|
||||
;;
|
||||
debian13)
|
||||
OSNAME="Debian 13"
|
||||
# TODO: Override OSv with debian12 for virt-install until debian13 is added to osinfo
|
||||
OSv="debian12"
|
||||
URL="https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-amd64.qcow2"
|
||||
OPTS[0]="--boot"
|
||||
OPTS[1]="uefi=on"
|
||||
;;
|
||||
fedora41)
|
||||
OSNAME="Fedora 41"
|
||||
OSv="fedora-unknown"
|
||||
@@ -71,50 +83,49 @@ case "$OS" in
|
||||
OSv="fedora-unknown"
|
||||
URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2"
|
||||
;;
|
||||
freebsd13-4r)
|
||||
OSNAME="FreeBSD 13.4-RELEASE"
|
||||
OSv="freebsd13.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-13.4-RELEASE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
NIC="rtl8139"
|
||||
;;
|
||||
freebsd13-5r)
|
||||
OSNAME="FreeBSD 13.5-RELEASE"
|
||||
FreeBSD="13.5-RELEASE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd13.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-13.5-RELEASE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
|
||||
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
|
||||
NIC="rtl8139"
|
||||
;;
|
||||
freebsd14-1r)
|
||||
OSNAME="FreeBSD 14.1-RELEASE"
|
||||
OSv="freebsd14.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
;;
|
||||
freebsd14-2r)
|
||||
OSNAME="FreeBSD 14.2-RELEASE"
|
||||
FreeBSD="14.2-RELEASE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd14.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
|
||||
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
|
||||
;;
|
||||
freebsd14-3r)
|
||||
FreeBSD="14.3-RELEASE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd14.0"
|
||||
URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
|
||||
KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz"
|
||||
;;
|
||||
freebsd13-5s)
|
||||
OSNAME="FreeBSD 13.5-STABLE"
|
||||
FreeBSD="13.5-STABLE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd13.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-13.5-STABLE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz"
|
||||
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
|
||||
NIC="rtl8139"
|
||||
;;
|
||||
freebsd14-2s)
|
||||
OSNAME="FreeBSD 14.2-STABLE"
|
||||
freebsd14-3s)
|
||||
FreeBSD="14.3-STABLE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd14.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
|
||||
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
|
||||
;;
|
||||
freebsd15-0c)
|
||||
OSNAME="FreeBSD 15.0-CURRENT"
|
||||
FreeBSD="15.0-PRERELEASE"
|
||||
OSNAME="FreeBSD $FreeBSD"
|
||||
OSv="freebsd14.0"
|
||||
URLzs="$FREEBSD/amd64-freebsd-15.0-CURRENT.qcow2.zst"
|
||||
BASH="/usr/local/bin/bash"
|
||||
URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz"
|
||||
KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz"
|
||||
;;
|
||||
tumbleweed)
|
||||
OSNAME="openSUSE Tumbleweed"
|
||||
@@ -168,31 +179,37 @@ echo "CPU=\"$CPU\"" >> $ENV
|
||||
sudo mkdir -p "/mnt/tests"
|
||||
sudo chown -R $(whoami) /mnt/tests
|
||||
|
||||
DISK="/dev/zvol/zpool/openzfs"
|
||||
sudo zfs create -ps -b 64k -V 80g zpool/openzfs
|
||||
while true; do test -b $DISK && break; sleep 1; done
|
||||
|
||||
# we are downloading via axel; curl and wget are mostly slower and
|
||||
# require more return value checking
|
||||
IMG="/mnt/tests/cloudimg.qcow2"
|
||||
if [ ! -z "$URLzs" ]; then
|
||||
echo "Loading image $URLzs ..."
|
||||
time axel -q -o "$IMG.zst" "$URLzs"
|
||||
zstd -q -d --rm "$IMG.zst"
|
||||
IMG="/mnt/tests/cloud-image"
|
||||
if [ ! -z "$URLxz" ]; then
|
||||
echo "Loading $URLxz ..."
|
||||
time axel -q -o "$IMG" "$URLxz"
|
||||
echo "Loading $KSRC ..."
|
||||
time axel -q -o ~/src.txz $KSRC
|
||||
else
|
||||
echo "Loading image $URL ..."
|
||||
echo "Loading $URL ..."
|
||||
time axel -q -o "$IMG" "$URL"
|
||||
fi
|
||||
|
||||
DISK="/dev/zvol/zpool/openzfs"
|
||||
FORMAT="raw"
|
||||
sudo zfs create -ps -b 64k -V 80g zpool/openzfs
|
||||
while true; do test -b $DISK && break; sleep 1; done
|
||||
echo "Importing VM image to zvol..."
|
||||
sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M
|
||||
if [ ! -z "$URLxz" ]; then
|
||||
xzcat -T0 $IMG | sudo dd of=$DISK bs=4M
|
||||
else
|
||||
sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M
|
||||
fi
|
||||
rm -f $IMG
|
||||
|
||||
PUBKEY=$(cat ~/.ssh/id_ed25519.pub)
|
||||
cat <<EOF > /tmp/user-data
|
||||
if [ ${OS:0:7} != "freebsd" ]; then
|
||||
cat <<EOF > /tmp/user-data
|
||||
#cloud-config
|
||||
|
||||
fqdn: $OS
|
||||
hostname: $OS
|
||||
|
||||
users:
|
||||
- name: root
|
||||
@@ -208,6 +225,19 @@ growpart:
|
||||
devices: ['/']
|
||||
ignore_growroot_disabled: false
|
||||
EOF
|
||||
else
|
||||
cat <<EOF > /tmp/user-data
|
||||
#cloud-config
|
||||
|
||||
hostname: $OS
|
||||
|
||||
# minimized config without sudo for nuageinit of FreeBSD
|
||||
growpart:
|
||||
mode: auto
|
||||
devices: ['/']
|
||||
ignore_growroot_disabled: false
|
||||
EOF
|
||||
fi
|
||||
|
||||
sudo virsh net-update default add ip-dhcp-host \
|
||||
"<host mac='52:54:00:83:79:00' ip='192.168.122.10'/>" --live --config
|
||||
@@ -223,15 +253,8 @@ sudo virt-install \
|
||||
--graphics none \
|
||||
--network bridge=virbr0,model=$NIC,mac='52:54:00:83:79:00' \
|
||||
--cloud-init user-data=/tmp/user-data \
|
||||
--disk $DISK,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
|
||||
--import --noautoconsole >/dev/null
|
||||
|
||||
# enable KSM on Linux
|
||||
if [ ${OS:0:7} != "freebsd" ]; then
|
||||
sudo virsh dommemstat --domain "openzfs" --period 5
|
||||
sudo virsh node-memory-tune 100 50 1
|
||||
echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null
|
||||
fi
|
||||
--disk $DISK,bus=virtio,cache=none,format=raw,driver.discard=unmap \
|
||||
--import --noautoconsole ${OPTS[0]} ${OPTS[1]} >/dev/null
|
||||
|
||||
# Give the VMs hostnames so we don't have to refer to them with
|
||||
# hardcoded IP addresses.
|
||||
@@ -252,3 +275,29 @@ StrictHostKeyChecking no
|
||||
# small timeout, used in while loops later
|
||||
ConnectTimeout 1
|
||||
EOF
|
||||
|
||||
if [ ${OS:0:7} != "freebsd" ]; then
|
||||
# enable KSM on Linux
|
||||
sudo virsh dommemstat --domain "openzfs" --period 5
|
||||
sudo virsh node-memory-tune 100 50 1
|
||||
echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null
|
||||
else
|
||||
# on FreeBSD we need some more init stuff, because of nuageinit
|
||||
BASH="/usr/local/bin/bash"
|
||||
while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do
|
||||
ssh 2>/dev/null root@vm0 "uname -a" && break
|
||||
done
|
||||
ssh root@vm0 "pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init"
|
||||
ssh root@vm0 "chsh -s $BASH root"
|
||||
ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"'
|
||||
ssh root@vm0 'sysrc cloudinit_enable="YES"'
|
||||
ssh root@vm0 "pw add user zfs -w no -s $BASH"
|
||||
ssh root@vm0 'mkdir -p ~zfs/.ssh'
|
||||
ssh root@vm0 'echo "zfs ALL=(ALL:ALL) NOPASSWD: ALL" >> /usr/local/etc/sudoers'
|
||||
ssh root@vm0 'echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config'
|
||||
scp ~/.ssh/id_ed25519.pub "root@vm0:~zfs/.ssh/authorized_keys"
|
||||
ssh root@vm0 'chown -R zfs ~zfs'
|
||||
ssh root@vm0 'service sshd restart'
|
||||
scp ~/src.txz "root@vm0:/tmp/src.txz"
|
||||
ssh root@vm0 'tar -C / -zxf /tmp/src.txz'
|
||||
fi
|
||||
|
||||
@@ -28,6 +28,7 @@ function debian() {
|
||||
export DEBIAN_FRONTEND="noninteractive"
|
||||
|
||||
echo "##[group]Running apt-get update+upgrade"
|
||||
sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get upgrade -y
|
||||
echo "##[endgroup]"
|
||||
@@ -40,7 +41,7 @@ function debian() {
|
||||
libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \
|
||||
libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \
|
||||
lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
|
||||
python3-cffi python3-dev python3-distlib python3-packaging \
|
||||
python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \
|
||||
python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
|
||||
rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev
|
||||
echo "##[endgroup]"
|
||||
@@ -51,7 +52,7 @@ function freebsd() {
|
||||
|
||||
echo "##[group]Install Development Tools"
|
||||
sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \
|
||||
gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \
|
||||
gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \
|
||||
pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash
|
||||
sudo pkg install -xy \
|
||||
'^samba4[[:digit:]]+$' \
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
#
|
||||
# Usage:
|
||||
#
|
||||
# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--poweroff]
|
||||
# [--release][--repo][--tarball]
|
||||
# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM]
|
||||
# [--poweroff][--release][--repo][--tarball]
|
||||
#
|
||||
# OS: OS name like 'fedora41'
|
||||
# --enable-debug: Build RPMs with '--enable-debug' (for testing)
|
||||
# --dkms: Build DKMS RPMs as well
|
||||
# --patch-level NUM: Use a custom patch level number for packages.
|
||||
# --poweroff: Power-off the VM after building
|
||||
# --release Build zfs-release*.rpm as well
|
||||
# --repo After building everything, copy RPMs into /tmp/repo
|
||||
@@ -21,6 +22,7 @@
|
||||
|
||||
ENABLE_DEBUG=""
|
||||
DKMS=""
|
||||
PATCH_LEVEL=""
|
||||
POWEROFF=""
|
||||
RELEASE=""
|
||||
REPO=""
|
||||
@@ -35,6 +37,11 @@ while [[ $# -gt 0 ]]; do
|
||||
DKMS=1
|
||||
shift
|
||||
;;
|
||||
--patch-level)
|
||||
PATCH_LEVEL=$2
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
--poweroff)
|
||||
POWEROFF=1
|
||||
shift
|
||||
@@ -215,6 +222,10 @@ function rpm_build_and_install() {
|
||||
run ./autogen.sh
|
||||
echo "##[endgroup]"
|
||||
|
||||
if [ -n "$PATCH_LEVEL" ] ; then
|
||||
sed -i -E 's/(Release:\s+)1/\1'$PATCH_LEVEL'/g' META
|
||||
fi
|
||||
|
||||
echo "##[group]Configure"
|
||||
run ./configure --enable-debuginfo $extra
|
||||
echo "##[endgroup]"
|
||||
@@ -328,7 +339,13 @@ fi
|
||||
# almalinux9.5
|
||||
# fedora42
|
||||
source /etc/os-release
|
||||
sudo hostname "$ID$VERSION_ID"
|
||||
if which hostnamectl &> /dev/null ; then
|
||||
# Fedora 42+ use hostnamectl
|
||||
sudo hostnamectl set-hostname "$ID$VERSION_ID"
|
||||
sudo hostnamectl set-hostname --pretty "$ID$VERSION_ID"
|
||||
else
|
||||
sudo hostname "$ID$VERSION_ID"
|
||||
fi
|
||||
|
||||
# save some sysinfo
|
||||
uname -a > /var/tmp/uname.txt
|
||||
|
||||
@@ -12,16 +12,26 @@ source /var/tmp/env.txt
|
||||
# wait for poweroff to succeed
|
||||
PID=$(pidof /usr/bin/qemu-system-x86_64)
|
||||
tail --pid=$PID -f /dev/null
|
||||
sudo virsh undefine openzfs
|
||||
sudo virsh undefine --nvram openzfs
|
||||
|
||||
# cpu pinning
|
||||
CPUSET=("0,1" "2,3")
|
||||
|
||||
# additional options for virt-install
|
||||
OPTS[0]=""
|
||||
OPTS[1]=""
|
||||
|
||||
case "$OS" in
|
||||
freebsd*)
|
||||
# FreeBSD needs only 6GiB
|
||||
RAM=6
|
||||
;;
|
||||
debian13)
|
||||
RAM=8
|
||||
# Boot Debian 13 with uefi=on and secureboot=off (ZFS Kernel Module not signed)
|
||||
OPTS[0]="--boot"
|
||||
OPTS[1]="firmware=efi,firmware.feature0.name=secure-boot,firmware.feature0.enabled=no"
|
||||
;;
|
||||
*)
|
||||
# Linux needs more memory, but can be optimized to share it via KSM
|
||||
RAM=8
|
||||
@@ -79,7 +89,7 @@ EOF
|
||||
--network bridge=virbr0,model=$NIC,mac="52:54:00:83:79:0$i" \
|
||||
--disk $DISK-system,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
|
||||
--disk $DISK-tests,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \
|
||||
--import --noautoconsole >/dev/null
|
||||
--import --noautoconsole ${OPTS[0]} ${OPTS[1]}
|
||||
done
|
||||
|
||||
# generate some memory stats
|
||||
|
||||
@@ -21,11 +21,13 @@ function prefix() {
|
||||
S=$((DIFF-(M*60)))
|
||||
|
||||
CTR=$(cat /tmp/ctr)
|
||||
echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr
|
||||
echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr
|
||||
|
||||
BASE="$HOME/work/zfs/zfs"
|
||||
COLOR="$BASE/scripts/zfs-tests-color.sh"
|
||||
CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \
|
||||
CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \
|
||||
| sed -e 's|^\[.*] Test|Test|g' \
|
||||
| sed -e 's|/usr/local|/usr|g' \
|
||||
| sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR)
|
||||
if [ -z "$CLINE" ]; then
|
||||
printf "vm${ID}: %s\n" "$LINE"
|
||||
|
||||
@@ -32,6 +32,11 @@ on:
|
||||
options:
|
||||
- "Build RPMs"
|
||||
- "Test repo"
|
||||
patch_level:
|
||||
type: string
|
||||
required: false
|
||||
default: ""
|
||||
description: "(optional) patch level number"
|
||||
repo_url:
|
||||
type: string
|
||||
required: false
|
||||
@@ -78,7 +83,13 @@ jobs:
|
||||
mkdir -p /tmp/repo
|
||||
ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }}
|
||||
else
|
||||
.github/workflows/scripts/qemu-4-build.sh --repo --release --dkms --tarball ${{ matrix.os }}
|
||||
EXTRA=""
|
||||
if [ -n "${{ github.event.inputs.patch_level }}" ] ; then
|
||||
EXTRA="--patch-level ${{ github.event.inputs.patch_level }}"
|
||||
fi
|
||||
|
||||
.github/workflows/scripts/qemu-4-build.sh $EXTRA \
|
||||
--repo --release --dkms --tarball ${{ matrix.os }}
|
||||
fi
|
||||
|
||||
- name: Prepare artifacts
|
||||
|
||||
@@ -5,16 +5,6 @@ on:
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
include_stream9:
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
description: 'Test on CentOS 9 stream'
|
||||
include_stream10:
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
description: 'Test on CentOS 10 stream'
|
||||
fedora_kernel_ver:
|
||||
type: string
|
||||
required: false
|
||||
@@ -39,8 +29,8 @@ jobs:
|
||||
- name: Generate OS config and CI type
|
||||
id: os
|
||||
run: |
|
||||
FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-2s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
|
||||
QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-2r", "ubuntu24"]'
|
||||
FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
|
||||
QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
|
||||
# determine CI type when running on PR
|
||||
ci_type="full"
|
||||
if ${{ github.event_name == 'pull_request' }}; then
|
||||
@@ -63,14 +53,6 @@ jobs:
|
||||
os_json=$(echo ${os_selection} | jq -c)
|
||||
fi
|
||||
|
||||
# Add optional runners
|
||||
if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then
|
||||
os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]')
|
||||
fi
|
||||
if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then
|
||||
os_json=$(echo $os_json | jq -c '. += ["centos-stream10"]')
|
||||
fi
|
||||
|
||||
echo $os_json
|
||||
echo "os=$os_json" >> $GITHUB_OUTPUT
|
||||
echo "ci_type=$ci_type" >> $GITHUB_OUTPUT
|
||||
@@ -81,12 +63,12 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# rhl: almalinux8, almalinux9, centos-stream9, fedora41
|
||||
# debian: debian11, debian12, ubuntu22, ubuntu24
|
||||
# rhl: almalinux8, almalinux9, centos-stream9, fedora4x
|
||||
# debian: debian12, debian13, ubuntu22, ubuntu24
|
||||
# misc: archlinux, tumbleweed
|
||||
# FreeBSD variants of 2024-12:
|
||||
# FreeBSD Release: freebsd13-4r, freebsd14-2r
|
||||
# FreeBSD Stable: freebsd13-4s, freebsd14-2s
|
||||
# FreeBSD variants of 2025-06:
|
||||
# FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
|
||||
# FreeBSD Stable: freebsd13-5s, freebsd14-3s
|
||||
# FreeBSD Current: freebsd15-0c
|
||||
os: ${{ fromJson(needs.test-config.outputs.test_os) }}
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
+12
-12
@@ -12,7 +12,8 @@ jobs:
|
||||
zloop:
|
||||
runs-on: ubuntu-24.04
|
||||
env:
|
||||
TEST_DIR: /var/tmp/zloop
|
||||
WORK_DIR: /mnt/zloop
|
||||
CORE_DIR: /mnt/zloop/cores
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -40,38 +41,37 @@ jobs:
|
||||
sudo modprobe zfs
|
||||
- name: Tests
|
||||
run: |
|
||||
sudo mkdir -p $TEST_DIR
|
||||
# run for 10 minutes or at most 6 iterations for a maximum runner
|
||||
# time of 60 minutes.
|
||||
sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60
|
||||
sudo truncate -s 256G /mnt/vdev
|
||||
sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev
|
||||
sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60
|
||||
- name: Prepare artifacts
|
||||
if: failure()
|
||||
run: |
|
||||
sudo chmod +r -R $TEST_DIR/
|
||||
sudo chmod +r -R $WORK_DIR/
|
||||
- name: Ztest log
|
||||
if: failure()
|
||||
run: |
|
||||
grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out
|
||||
grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out
|
||||
- name: Gdb log
|
||||
if: failure()
|
||||
run: |
|
||||
sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb
|
||||
sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb
|
||||
- name: Zdb log
|
||||
if: failure()
|
||||
run: |
|
||||
cat $TEST_DIR/*/ztest.zdb
|
||||
cat $CORE_DIR/*/ztest.zdb
|
||||
- uses: actions/upload-artifact@v4
|
||||
if: failure()
|
||||
with:
|
||||
name: Logs
|
||||
path: |
|
||||
/var/tmp/zloop/*/
|
||||
!/var/tmp/zloop/*/vdev/
|
||||
/mnt/zloop/*/
|
||||
!/mnt/zloop/cores/*/vdev/
|
||||
if-no-files-found: ignore
|
||||
- uses: actions/upload-artifact@v4
|
||||
if: failure()
|
||||
with:
|
||||
name: Pool files
|
||||
path: |
|
||||
/var/tmp/zloop/*/vdev/
|
||||
/mnt/zloop/cores/*/vdev/
|
||||
if-no-files-found: ignore
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
Meta: 1
|
||||
Name: zfs
|
||||
Branch: 1.0
|
||||
Version: 2.3.3
|
||||
Version: 2.3.4
|
||||
Release: 1
|
||||
Release-Tags: relext
|
||||
License: CDDL
|
||||
Author: OpenZFS
|
||||
Linux-Maximum: 6.15
|
||||
Linux-Maximum: 6.16
|
||||
Linux-Minimum: 4.18
|
||||
|
||||
@@ -559,6 +559,7 @@ def section_arc(kstats_dict):
|
||||
print()
|
||||
|
||||
compressed_size = arc_stats['compressed_size']
|
||||
uncompressed_size = arc_stats['uncompressed_size']
|
||||
overhead_size = arc_stats['overhead_size']
|
||||
bonus_size = arc_stats['bonus_size']
|
||||
dnode_size = arc_stats['dnode_size']
|
||||
@@ -671,6 +672,8 @@ def section_arc(kstats_dict):
|
||||
print()
|
||||
|
||||
print('ARC misc:')
|
||||
prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size),
|
||||
f_bytes(uncompressed_size))
|
||||
prt_i1('Memory throttles:', arc_stats['memory_throttle_count'])
|
||||
prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count'])
|
||||
prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count'])
|
||||
|
||||
+28
-21
@@ -619,8 +619,9 @@ livelist_metaslab_validate(spa_t *spa)
|
||||
metaslab_calculate_range_tree_type(vd, m,
|
||||
&start, &shift);
|
||||
metaslab_verify_t mv;
|
||||
mv.mv_allocated = zfs_range_tree_create(NULL,
|
||||
type, NULL, start, shift);
|
||||
mv.mv_allocated = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
0, "livelist_metaslab_validate:mv_allocated");
|
||||
mv.mv_vdid = vd->vdev_id;
|
||||
mv.mv_msid = m->ms_id;
|
||||
mv.mv_start = m->ms_start;
|
||||
@@ -2545,12 +2546,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
|
||||
|
||||
blkbuf[0] = '\0';
|
||||
|
||||
for (i = 0; i < ndvas; i++)
|
||||
for (i = 0; i < ndvas; i++) {
|
||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
|
||||
buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
|
||||
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
|
||||
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
|
||||
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
|
||||
(u_longlong_t)DVA_GET_ASIZE(&dva[i]),
|
||||
(DVA_GET_GANG(&dva[i]) ? "G" : ""));
|
||||
}
|
||||
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||
@@ -6320,8 +6323,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
|
||||
|
||||
ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs));
|
||||
|
||||
zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
zfs_range_tree_t *allocs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
0, "zdb_claim_removing:allocs");
|
||||
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
|
||||
metaslab_t *msp = vd->vdev_ms[msi];
|
||||
|
||||
@@ -7704,7 +7708,8 @@ zdb_set_skip_mmp(char *target)
|
||||
* applies to the new_path parameter if allocated.
|
||||
*/
|
||||
static char *
|
||||
import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
|
||||
import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa,
|
||||
char **new_path)
|
||||
{
|
||||
int error = 0;
|
||||
char *poolname, *bogus_name = NULL;
|
||||
@@ -7712,11 +7717,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
|
||||
|
||||
/* If the target is not a pool, then extract the pool name */
|
||||
char *path_start = strchr(target, '/');
|
||||
if (path_start != NULL) {
|
||||
if (target_is_spa || path_start == NULL) {
|
||||
poolname = target;
|
||||
} else {
|
||||
size_t poolname_len = path_start - target;
|
||||
poolname = strndup(target, poolname_len);
|
||||
} else {
|
||||
poolname = target;
|
||||
}
|
||||
|
||||
if (cfg == NULL) {
|
||||
@@ -7747,10 +7752,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
|
||||
"with error %d\n", bogus_name, error);
|
||||
}
|
||||
|
||||
if (new_path != NULL && path_start != NULL) {
|
||||
if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
|
||||
if (new_path != NULL && !target_is_spa) {
|
||||
if (asprintf(new_path, "%s%s", bogus_name,
|
||||
path_start != NULL ? path_start : "") == -1) {
|
||||
free(bogus_name);
|
||||
if (path_start != NULL)
|
||||
if (!target_is_spa && path_start != NULL)
|
||||
free(poolname);
|
||||
return (NULL);
|
||||
}
|
||||
@@ -7979,7 +7985,7 @@ verify_checkpoint_blocks(spa_t *spa)
|
||||
* name) so we can do verification on it against the current state
|
||||
* of the pool.
|
||||
*/
|
||||
checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
|
||||
checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE,
|
||||
NULL);
|
||||
ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
|
||||
|
||||
@@ -8449,8 +8455,9 @@ dump_zpool(spa_t *spa)
|
||||
|
||||
if (dump_opt['d'] || dump_opt['i']) {
|
||||
spa_feature_t f;
|
||||
mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
mos_refd_objs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
0, "dump_zpool:mos_refd_objs");
|
||||
dump_objset(dp->dp_meta_objset);
|
||||
|
||||
if (dump_opt['d'] >= 3) {
|
||||
@@ -8981,7 +8988,7 @@ zdb_read_block(char *thing, spa_t *spa)
|
||||
|
||||
DVA_SET_VDEV(&dva[0], vd->vdev_id);
|
||||
DVA_SET_OFFSET(&dva[0], offset);
|
||||
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
|
||||
DVA_SET_GANG(&dva[0], 0);
|
||||
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
|
||||
|
||||
BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
|
||||
@@ -8996,7 +9003,7 @@ zdb_read_block(char *thing, spa_t *spa)
|
||||
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
|
||||
|
||||
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
|
||||
zio = zio_root(spa, NULL, NULL, 0);
|
||||
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
|
||||
if (vd == vd->vdev_top) {
|
||||
/*
|
||||
@@ -9118,7 +9125,7 @@ zdb_read_block(char *thing, spa_t *spa)
|
||||
ck_zio->io_offset =
|
||||
DVA_GET_OFFSET(&bp->blk_dva[0]);
|
||||
ck_zio->io_bp = bp;
|
||||
zio_checksum_compute(ck_zio, ck, pabd, lsize);
|
||||
zio_checksum_compute(ck_zio, ck, pabd, psize);
|
||||
printf(
|
||||
"%12s\t"
|
||||
"cksum=%016llx:%016llx:%016llx:%016llx\n",
|
||||
@@ -9695,7 +9702,7 @@ main(int argc, char **argv)
|
||||
char *checkpoint_target = NULL;
|
||||
if (dump_opt['k']) {
|
||||
checkpoint_pool = import_checkpointed_state(target, cfg,
|
||||
&checkpoint_target);
|
||||
target_is_spa, &checkpoint_target);
|
||||
|
||||
if (checkpoint_target != NULL)
|
||||
target = checkpoint_target;
|
||||
|
||||
+36
-31
@@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
|
||||
* of blkid cache and L2ARC VDEV does not contain pool guid in its
|
||||
* blkid, so this is a special case for L2ARC VDEV.
|
||||
*/
|
||||
else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
|
||||
else if (gsp->gs_vdev_guid != 0 &&
|
||||
nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
|
||||
gsp->gs_vdev_guid == vdev_guid) {
|
||||
(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
|
||||
&gsp->gs_devid);
|
||||
if (gsp->gs_devid == NULL) {
|
||||
(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
|
||||
&gsp->gs_devid);
|
||||
}
|
||||
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
|
||||
&gsp->gs_vdev_expandtime);
|
||||
return (B_TRUE);
|
||||
@@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
|
||||
/*
|
||||
* For each vdev in this pool, look for a match by devid
|
||||
*/
|
||||
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
|
||||
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
|
||||
&nvl) == 0) {
|
||||
(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* if a match was found then grab the pool guid
|
||||
*/
|
||||
if (gsp->gs_vdev_guid && gsp->gs_devid) {
|
||||
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
||||
&gsp->gs_pool_guid);
|
||||
}
|
||||
boolean_t found = B_FALSE;
|
||||
uint64_t pool_guid;
|
||||
|
||||
/* Get pool configuration and extract pool GUID */
|
||||
if ((config = zpool_get_config(zhp, NULL)) == NULL ||
|
||||
nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
||||
&pool_guid) != 0)
|
||||
goto out;
|
||||
|
||||
/* Skip this pool if we're looking for a specific pool */
|
||||
if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid)
|
||||
goto out;
|
||||
|
||||
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0)
|
||||
found = zfs_agent_iter_vdev(zhp, nvl, gsp);
|
||||
|
||||
if (found && gsp->gs_pool_guid == 0)
|
||||
gsp->gs_pool_guid = pool_guid;
|
||||
|
||||
out:
|
||||
zpool_close(zhp);
|
||||
return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0);
|
||||
return (found);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
|
||||
* For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
|
||||
* ZFS_EV_POOL_GUID may be missing so find them.
|
||||
*/
|
||||
if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
|
||||
if (devid == NULL)
|
||||
search.gs_vdev_guid = vdev_guid;
|
||||
else
|
||||
search.gs_devid = devid;
|
||||
zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
|
||||
if (devid == NULL)
|
||||
devid = search.gs_devid;
|
||||
if (pool_guid == 0)
|
||||
pool_guid = search.gs_pool_guid;
|
||||
if (vdev_guid == 0)
|
||||
vdev_guid = search.gs_vdev_guid;
|
||||
devtype = search.gs_vdev_type;
|
||||
}
|
||||
search.gs_devid = devid;
|
||||
search.gs_vdev_guid = vdev_guid;
|
||||
search.gs_pool_guid = pool_guid;
|
||||
zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
|
||||
if (devid == NULL)
|
||||
devid = search.gs_devid;
|
||||
if (pool_guid == 0)
|
||||
pool_guid = search.gs_pool_guid;
|
||||
if (vdev_guid == 0)
|
||||
vdev_guid = search.gs_vdev_guid;
|
||||
devtype = search.gs_vdev_type;
|
||||
|
||||
/*
|
||||
* We want to avoid reporting "remove" events coming from
|
||||
|
||||
@@ -441,8 +441,9 @@ zed_notify_slack_webhook()
|
||||
"${pathname}")"
|
||||
|
||||
# Construct the JSON message for posting.
|
||||
# shellcheck disable=SC2016
|
||||
#
|
||||
msg_json="$(printf '{"text": "*%s*\\n%s"}' "${subject}" "${msg_body}" )"
|
||||
msg_json="$(printf '{"text": "*%s*\\n```%s```"}' "${subject}" "${msg_body}" )"
|
||||
|
||||
# Send the POST request and check for errors.
|
||||
#
|
||||
|
||||
+219
-4
@@ -37,6 +37,7 @@
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <sys/debug.h>
|
||||
#include <dirent.h>
|
||||
#include <errno.h>
|
||||
#include <getopt.h>
|
||||
#include <libgen.h>
|
||||
@@ -121,6 +122,7 @@ static int zfs_do_change_key(int argc, char **argv);
|
||||
static int zfs_do_project(int argc, char **argv);
|
||||
static int zfs_do_version(int argc, char **argv);
|
||||
static int zfs_do_redact(int argc, char **argv);
|
||||
static int zfs_do_rewrite(int argc, char **argv);
|
||||
static int zfs_do_wait(int argc, char **argv);
|
||||
|
||||
#ifdef __FreeBSD__
|
||||
@@ -193,6 +195,7 @@ typedef enum {
|
||||
HELP_CHANGE_KEY,
|
||||
HELP_VERSION,
|
||||
HELP_REDACT,
|
||||
HELP_REWRITE,
|
||||
HELP_JAIL,
|
||||
HELP_UNJAIL,
|
||||
HELP_WAIT,
|
||||
@@ -227,7 +230,7 @@ static zfs_command_t command_table[] = {
|
||||
{ "promote", zfs_do_promote, HELP_PROMOTE },
|
||||
{ "rename", zfs_do_rename, HELP_RENAME },
|
||||
{ "bookmark", zfs_do_bookmark, HELP_BOOKMARK },
|
||||
{ "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
|
||||
{ "diff", zfs_do_diff, HELP_DIFF },
|
||||
{ NULL },
|
||||
{ "list", zfs_do_list, HELP_LIST },
|
||||
{ NULL },
|
||||
@@ -249,27 +252,31 @@ static zfs_command_t command_table[] = {
|
||||
{ NULL },
|
||||
{ "send", zfs_do_send, HELP_SEND },
|
||||
{ "receive", zfs_do_receive, HELP_RECEIVE },
|
||||
{ "redact", zfs_do_redact, HELP_REDACT },
|
||||
{ NULL },
|
||||
{ "allow", zfs_do_allow, HELP_ALLOW },
|
||||
{ NULL },
|
||||
{ "unallow", zfs_do_unallow, HELP_UNALLOW },
|
||||
{ NULL },
|
||||
{ "hold", zfs_do_hold, HELP_HOLD },
|
||||
{ "holds", zfs_do_holds, HELP_HOLDS },
|
||||
{ "release", zfs_do_release, HELP_RELEASE },
|
||||
{ "diff", zfs_do_diff, HELP_DIFF },
|
||||
{ NULL },
|
||||
{ "load-key", zfs_do_load_key, HELP_LOAD_KEY },
|
||||
{ "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY },
|
||||
{ "change-key", zfs_do_change_key, HELP_CHANGE_KEY },
|
||||
{ "redact", zfs_do_redact, HELP_REDACT },
|
||||
{ NULL },
|
||||
{ "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
|
||||
{ "rewrite", zfs_do_rewrite, HELP_REWRITE },
|
||||
{ "wait", zfs_do_wait, HELP_WAIT },
|
||||
|
||||
#ifdef __FreeBSD__
|
||||
{ NULL },
|
||||
{ "jail", zfs_do_jail, HELP_JAIL },
|
||||
{ "unjail", zfs_do_unjail, HELP_UNJAIL },
|
||||
#endif
|
||||
|
||||
#ifdef __linux__
|
||||
{ NULL },
|
||||
{ "zone", zfs_do_zone, HELP_ZONE },
|
||||
{ "unzone", zfs_do_unzone, HELP_UNZONE },
|
||||
#endif
|
||||
@@ -432,6 +439,9 @@ get_usage(zfs_help_t idx)
|
||||
case HELP_REDACT:
|
||||
return (gettext("\tredact <snapshot> <bookmark> "
|
||||
"<redaction_snapshot> ...\n"));
|
||||
case HELP_REWRITE:
|
||||
return (gettext("\trewrite [-rvx] [-o <offset>] [-l <length>] "
|
||||
"<directory|file ...>\n"));
|
||||
case HELP_JAIL:
|
||||
return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
|
||||
case HELP_UNJAIL:
|
||||
@@ -7716,6 +7726,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
|
||||
struct extmnttab entry;
|
||||
const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
|
||||
ino_t path_inode;
|
||||
char *zfs_mntpnt, *entry_mntpnt;
|
||||
|
||||
/*
|
||||
* Search for the given (major,minor) pair in the mount table.
|
||||
@@ -7757,6 +7768,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the filesystem is mounted, check that the mountpoint matches
|
||||
* the one in the mnttab entry w.r.t. provided path. If it doesn't,
|
||||
* then we should not proceed further.
|
||||
*/
|
||||
entry_mntpnt = strdup(entry.mnt_mountp);
|
||||
if (zfs_is_mounted(zhp, &zfs_mntpnt)) {
|
||||
if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) {
|
||||
(void) fprintf(stderr, gettext("cannot %s '%s': "
|
||||
"not an original mountpoint\n"), cmdname, path);
|
||||
free(zfs_mntpnt);
|
||||
free(entry_mntpnt);
|
||||
goto out;
|
||||
}
|
||||
free(zfs_mntpnt);
|
||||
}
|
||||
free(entry_mntpnt);
|
||||
|
||||
if (op == OP_SHARE) {
|
||||
char nfs_mnt_prop[ZFS_MAXPROPLEN];
|
||||
char smbshare_prop[ZFS_MAXPROPLEN];
|
||||
@@ -9013,6 +9042,192 @@ zfs_do_project(int argc, char **argv)
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_rewrite_file(const char *path, boolean_t verbose, zfs_rewrite_args_t *args)
|
||||
{
|
||||
int fd, ret = 0;
|
||||
|
||||
fd = open(path, O_WRONLY);
|
||||
if (fd < 0) {
|
||||
ret = errno;
|
||||
(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
|
||||
path, strerror(errno));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
if (ioctl(fd, ZFS_IOC_REWRITE, args) < 0) {
|
||||
ret = errno;
|
||||
(void) fprintf(stderr, gettext("failed to rewrite %s: %s\n"),
|
||||
path, strerror(errno));
|
||||
} else if (verbose) {
|
||||
printf("%s\n", path);
|
||||
}
|
||||
|
||||
close(fd);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_rewrite_dir(const char *path, boolean_t verbose, boolean_t xdev, dev_t dev,
|
||||
zfs_rewrite_args_t *args, nvlist_t *dirs)
|
||||
{
|
||||
struct dirent *ent;
|
||||
DIR *dir;
|
||||
int ret = 0, err;
|
||||
|
||||
dir = opendir(path);
|
||||
if (dir == NULL) {
|
||||
if (errno == ENOENT)
|
||||
return (0);
|
||||
ret = errno;
|
||||
(void) fprintf(stderr, gettext("failed to opendir %s: %s\n"),
|
||||
path, strerror(errno));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
size_t plen = strlen(path) + 1;
|
||||
while ((ent = readdir(dir)) != NULL) {
|
||||
char *fullname;
|
||||
struct stat st;
|
||||
|
||||
if (ent->d_type != DT_REG && ent->d_type != DT_DIR)
|
||||
continue;
|
||||
|
||||
if (strcmp(ent->d_name, ".") == 0 ||
|
||||
strcmp(ent->d_name, "..") == 0)
|
||||
continue;
|
||||
|
||||
if (plen + strlen(ent->d_name) >= PATH_MAX) {
|
||||
(void) fprintf(stderr, gettext("path too long %s/%s\n"),
|
||||
path, ent->d_name);
|
||||
ret = ENAMETOOLONG;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (asprintf(&fullname, "%s/%s", path, ent->d_name) == -1) {
|
||||
(void) fprintf(stderr,
|
||||
gettext("failed to allocate memory\n"));
|
||||
ret = ENOMEM;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (xdev) {
|
||||
if (lstat(fullname, &st) < 0) {
|
||||
ret = errno;
|
||||
(void) fprintf(stderr,
|
||||
gettext("failed to stat %s: %s\n"),
|
||||
fullname, strerror(errno));
|
||||
free(fullname);
|
||||
continue;
|
||||
}
|
||||
if (st.st_dev != dev) {
|
||||
free(fullname);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (ent->d_type == DT_REG) {
|
||||
err = zfs_rewrite_file(fullname, verbose, args);
|
||||
if (err)
|
||||
ret = err;
|
||||
} else { /* DT_DIR */
|
||||
fnvlist_add_uint64(dirs, fullname, dev);
|
||||
}
|
||||
|
||||
free(fullname);
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_rewrite_path(const char *path, boolean_t verbose, boolean_t recurse,
|
||||
boolean_t xdev, zfs_rewrite_args_t *args, nvlist_t *dirs)
|
||||
{
|
||||
struct stat st;
|
||||
int ret = 0;
|
||||
|
||||
if (lstat(path, &st) < 0) {
|
||||
ret = errno;
|
||||
(void) fprintf(stderr, gettext("failed to stat %s: %s\n"),
|
||||
path, strerror(errno));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
if (S_ISREG(st.st_mode)) {
|
||||
ret = zfs_rewrite_file(path, verbose, args);
|
||||
} else if (S_ISDIR(st.st_mode) && recurse) {
|
||||
ret = zfs_rewrite_dir(path, verbose, xdev, st.st_dev, args,
|
||||
dirs);
|
||||
}
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_do_rewrite(int argc, char **argv)
|
||||
{
|
||||
int ret = 0, err, c;
|
||||
boolean_t recurse = B_FALSE, verbose = B_FALSE, xdev = B_FALSE;
|
||||
|
||||
if (argc < 2)
|
||||
usage(B_FALSE);
|
||||
|
||||
zfs_rewrite_args_t args;
|
||||
memset(&args, 0, sizeof (args));
|
||||
|
||||
while ((c = getopt(argc, argv, "l:o:rvx")) != -1) {
|
||||
switch (c) {
|
||||
case 'l':
|
||||
args.len = strtoll(optarg, NULL, 0);
|
||||
break;
|
||||
case 'o':
|
||||
args.off = strtoll(optarg, NULL, 0);
|
||||
break;
|
||||
case 'r':
|
||||
recurse = B_TRUE;
|
||||
break;
|
||||
case 'v':
|
||||
verbose = B_TRUE;
|
||||
break;
|
||||
case 'x':
|
||||
xdev = B_TRUE;
|
||||
break;
|
||||
default:
|
||||
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
|
||||
optopt);
|
||||
usage(B_FALSE);
|
||||
}
|
||||
}
|
||||
|
||||
argv += optind;
|
||||
argc -= optind;
|
||||
if (argc == 0) {
|
||||
(void) fprintf(stderr,
|
||||
gettext("missing file or directory target(s)\n"));
|
||||
usage(B_FALSE);
|
||||
}
|
||||
|
||||
nvlist_t *dirs = fnvlist_alloc();
|
||||
for (int i = 0; i < argc; i++) {
|
||||
err = zfs_rewrite_path(argv[i], verbose, recurse, xdev, &args,
|
||||
dirs);
|
||||
if (err)
|
||||
ret = err;
|
||||
}
|
||||
nvpair_t *dir;
|
||||
while ((dir = nvlist_next_nvpair(dirs, NULL)) != NULL) {
|
||||
err = zfs_rewrite_dir(nvpair_name(dir), verbose, xdev,
|
||||
fnvpair_value_uint64(dir), &args, dirs);
|
||||
if (err)
|
||||
ret = err;
|
||||
fnvlist_remove_nvpair(dirs, dir);
|
||||
}
|
||||
fnvlist_free(dirs);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_do_wait(int argc, char **argv)
|
||||
{
|
||||
|
||||
+25
-18
@@ -3881,7 +3881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
* If newvd is too small, it should fail with EOVERFLOW.
|
||||
*
|
||||
* If newvd is a distributed spare and it's being attached to a
|
||||
* dRAID which is not its parent it should fail with EINVAL.
|
||||
* dRAID which is not its parent it should fail with ENOTSUP.
|
||||
*/
|
||||
if (pvd->vdev_ops != &vdev_mirror_ops &&
|
||||
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
|
||||
@@ -3900,7 +3900,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
|
||||
else if (ashift > oldvd->vdev_top->vdev_ashift)
|
||||
expected_error = EDOM;
|
||||
else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
|
||||
expected_error = EINVAL;
|
||||
expected_error = ENOTSUP;
|
||||
else
|
||||
expected_error = 0;
|
||||
|
||||
@@ -7812,6 +7812,9 @@ ztest_dataset_open(int d)
|
||||
|
||||
ztest_dataset_name(name, ztest_opts.zo_pool, d);
|
||||
|
||||
if (ztest_opts.zo_verbose >= 6)
|
||||
(void) printf("Opening %s\n", name);
|
||||
|
||||
(void) pthread_rwlock_rdlock(&ztest_name_lock);
|
||||
|
||||
error = ztest_dataset_create(name);
|
||||
@@ -8307,41 +8310,44 @@ static void
|
||||
ztest_generic_run(ztest_shared_t *zs, spa_t *spa)
|
||||
{
|
||||
kthread_t **run_threads;
|
||||
int t;
|
||||
int i, ndatasets;
|
||||
|
||||
run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
|
||||
UMEM_NOFAIL);
|
||||
|
||||
/*
|
||||
* Actual number of datasets to be used.
|
||||
*/
|
||||
ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads);
|
||||
|
||||
/*
|
||||
* Prepare the datasets first.
|
||||
*/
|
||||
for (i = 0; i < ndatasets; i++)
|
||||
VERIFY0(ztest_dataset_open(i));
|
||||
|
||||
/*
|
||||
* Kick off all the tests that run in parallel.
|
||||
*/
|
||||
for (t = 0; t < ztest_opts.zo_threads; t++) {
|
||||
if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
|
||||
umem_free(run_threads, ztest_opts.zo_threads *
|
||||
sizeof (kthread_t *));
|
||||
return;
|
||||
}
|
||||
|
||||
run_threads[t] = thread_create(NULL, 0, ztest_thread,
|
||||
(void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
|
||||
for (i = 0; i < ztest_opts.zo_threads; i++) {
|
||||
run_threads[i] = thread_create(NULL, 0, ztest_thread,
|
||||
(void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE,
|
||||
defclsyspri);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for all of the tests to complete.
|
||||
*/
|
||||
for (t = 0; t < ztest_opts.zo_threads; t++)
|
||||
VERIFY0(thread_join(run_threads[t]));
|
||||
for (i = 0; i < ztest_opts.zo_threads; i++)
|
||||
VERIFY0(thread_join(run_threads[i]));
|
||||
|
||||
/*
|
||||
* Close all datasets. This must be done after all the threads
|
||||
* are joined so we can be sure none of the datasets are in-use
|
||||
* by any of the threads.
|
||||
*/
|
||||
for (t = 0; t < ztest_opts.zo_threads; t++) {
|
||||
if (t < ztest_opts.zo_datasets)
|
||||
ztest_dataset_close(t);
|
||||
}
|
||||
for (i = 0; i < ndatasets; i++)
|
||||
ztest_dataset_close(i);
|
||||
|
||||
txg_wait_synced(spa_get_dsl(spa), 0);
|
||||
|
||||
@@ -8464,6 +8470,7 @@ ztest_run(ztest_shared_t *zs)
|
||||
|
||||
int d = ztest_random(ztest_opts.zo_datasets);
|
||||
ztest_dataset_destroy(d);
|
||||
txg_wait_synced(spa_get_dsl(spa), 0);
|
||||
}
|
||||
zs->zs_enospc_count = 0;
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@
|
||||
# modified version of the Autoconf Macro, you may extend this special
|
||||
# exception to the GPL to apply to your modified version as well.
|
||||
|
||||
#serial 36
|
||||
#serial 37
|
||||
|
||||
AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
|
||||
AC_DEFUN([AX_PYTHON_DEVEL],[
|
||||
@@ -316,7 +316,7 @@ EOD`
|
||||
PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version"
|
||||
fi
|
||||
|
||||
if test -z "PYTHON_LIBS"; then
|
||||
if test -z "$PYTHON_LIBS"; then
|
||||
AC_MSG_WARN([
|
||||
Cannot determine location of your Python DSO. Please check it was installed with
|
||||
dynamic libraries enabled, or try setting PYTHON_LIBS by hand.
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
dnl #
dnl # Linux 5.2 API change
dnl #
dnl # Checks whether struct super_operations has a ->free_inode() member.
dnl #
dnl # The callback in the test program names its parameter on purpose: an
dnl # unnamed parameter in a function definition is only valid C as of C23
dnl # and is rejected by older compilers, which would make this check
dnl # report "no" even on kernels that do provide the member.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [
	ZFS_LINUX_TEST_SRC([super_operations_free_inode], [
		#include <linux/fs.h>

		static void free_inode(struct inode *ip) { }

		static struct super_operations sops __attribute__ ((unused)) = {
			.free_inode = free_inode,
		};
	],[])
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [
|
||||
AC_MSG_CHECKING([whether sops->free_inode() exists])
|
||||
ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
@@ -49,6 +49,15 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [
|
||||
#error "STACK_FRAME_NON_STANDARD is not defined."
|
||||
#endif
|
||||
])
|
||||
|
||||
dnl # 6.15 made CONFIG_OBJTOOL_WERROR=y the default. We need to handle
|
||||
dnl # this or our build will fail.
|
||||
ZFS_LINUX_TEST_SRC([config_objtool_werror], [
|
||||
#if !defined(CONFIG_OBJTOOL_WERROR)
|
||||
#error "CONFIG_OBJTOOL_WERROR is not defined."
|
||||
#endif
|
||||
])
|
||||
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [
|
||||
@@ -84,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
|
||||
AC_MSG_CHECKING([whether CONFIG_OBJTOOL_WERROR is defined])
|
||||
ZFS_LINUX_TEST_RESULT([config_objtool_werror],[
|
||||
AC_MSG_RESULT(yes)
|
||||
CONFIG_OBJTOOL_WERROR_DEFINED=yes
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
dnl #
|
||||
dnl # Linux 6.16 removed readahead_page
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE], [
|
||||
ZFS_LINUX_TEST_SRC([pagemap_has_readahead_page], [
|
||||
#include <linux/pagemap.h>
|
||||
], [
|
||||
struct page *p __attribute__ ((unused)) = NULL;
|
||||
struct readahead_control *ractl __attribute__ ((unused)) = NULL;
|
||||
p = readahead_page(ractl);
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE], [
|
||||
AC_MSG_CHECKING([whether readahead_page() exists])
|
||||
ZFS_LINUX_TEST_RESULT([pagemap_has_readahead_page], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE(HAVE_PAGEMAP_READAHEAD_PAGE, 1,
|
||||
[readahead_page() exists])
|
||||
],[
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
@@ -0,0 +1,24 @@
|
||||
dnl #
|
||||
dnl # Linux 6.16 removes address_space_operations ->writepage
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE], [
|
||||
ZFS_LINUX_TEST_SRC([vfs_has_writepage], [
|
||||
#include <linux/fs.h>
|
||||
|
||||
static const struct address_space_operations
|
||||
aops __attribute__ ((unused)) = {
|
||||
.writepage = NULL,
|
||||
};
|
||||
],[])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_VFS_WRITEPAGE], [
|
||||
AC_MSG_CHECKING([whether aops->writepage exists])
|
||||
ZFS_LINUX_TEST_RESULT([vfs_has_writepage], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE(HAVE_VFS_WRITEPAGE, 1,
|
||||
[address_space_operations->writepage exists])
|
||||
],[
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
@@ -82,6 +82,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_VFS_MIGRATEPAGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
|
||||
ZFS_AC_KERNEL_SRC_VFS_READPAGES
|
||||
ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS
|
||||
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
||||
@@ -111,6 +112,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG
|
||||
ZFS_AC_KERNEL_SRC_STRLCPY
|
||||
ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT
|
||||
ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE
|
||||
ZFS_AC_KERNEL_SRC_ADD_DISK
|
||||
ZFS_AC_KERNEL_SRC_KTHREAD
|
||||
ZFS_AC_KERNEL_SRC_ZERO_PAGE
|
||||
@@ -132,6 +134,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_PIN_USER_PAGES
|
||||
ZFS_AC_KERNEL_SRC_TIMER
|
||||
ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR
|
||||
ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
||||
@@ -197,6 +200,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_VFS_MIGRATEPAGE
|
||||
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
|
||||
ZFS_AC_KERNEL_VFS_READPAGES
|
||||
ZFS_AC_KERNEL_VFS_WRITEPAGE
|
||||
ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS
|
||||
ZFS_AC_KERNEL_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
||||
@@ -226,6 +230,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG
|
||||
ZFS_AC_KERNEL_STRLCPY
|
||||
ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT
|
||||
ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE
|
||||
ZFS_AC_KERNEL_ADD_DISK
|
||||
ZFS_AC_KERNEL_KTHREAD
|
||||
ZFS_AC_KERNEL_ZERO_PAGE
|
||||
@@ -248,6 +253,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_PIN_USER_PAGES
|
||||
ZFS_AC_KERNEL_TIMER
|
||||
ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR
|
||||
ZFS_AC_KERNEL_SOPS_FREE_INODE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
||||
|
||||
+46
-23
@@ -38,9 +38,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSE])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("xorps %xmm0, %xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE])
|
||||
@@ -57,9 +58,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSE2])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("pxor %xmm0, %xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2])
|
||||
@@ -76,10 +78,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSE3])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
char v[16];
|
||||
__asm__ __volatile__("lddqu %0,%%xmm0" :: "m"(v[0]));
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3])
|
||||
@@ -96,9 +99,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSSE3])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("pshufb %xmm0,%xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3])
|
||||
@@ -115,9 +119,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSE4.1])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("pmaxsb %xmm0,%xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1])
|
||||
@@ -134,9 +139,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports SSE4.2])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("pcmpgtq %xmm0, %xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2])
|
||||
@@ -153,10 +159,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [
|
||||
AC_MSG_CHECKING([whether host toolchain supports AVX])
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
char v[32];
|
||||
__asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0]));
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -174,9 +181,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -194,9 +202,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -214,9 +223,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vplzcntd %zmm0,%zmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -234,9 +244,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -254,9 +265,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -274,9 +286,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -294,9 +307,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -314,9 +328,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -334,9 +349,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vexp2pd %zmm0,%zmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -354,9 +370,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("vpabsq %zmm0,%zmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -374,9 +391,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("aesenc %xmm0, %xmm1");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -394,9 +412,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0));
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -414,9 +433,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
__asm__ __volatile__("movbe 0(%eax), %eax");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -434,10 +454,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
char b[4096] __attribute__ ((aligned (64)));
|
||||
__asm__ __volatile__("xsave %[b]\n" : : [b] "m" (*b) : "memory");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -455,10 +476,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
char b[4096] __attribute__ ((aligned (64)));
|
||||
__asm__ __volatile__("xsaveopt %[b]\n" : : [b] "m" (*b) : "memory");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
@@ -476,10 +498,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES], [
|
||||
|
||||
AC_LINK_IFELSE([AC_LANG_SOURCE([
|
||||
[
|
||||
void main()
|
||||
int main()
|
||||
{
|
||||
char b[4096] __attribute__ ((aligned (64)));
|
||||
__asm__ __volatile__("xsaves %[b]\n" : : [b] "m" (*b) : "memory");
|
||||
return (0);
|
||||
}
|
||||
]])], [
|
||||
AC_MSG_RESULT([yes])
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
dnl #
|
||||
dnl # Check for statx() function and STATX_MNT_ID availability
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [
|
||||
AC_CHECK_HEADERS([linux/stat.h],
|
||||
[have_stat_headers=yes],
|
||||
[have_stat_headers=no])
|
||||
|
||||
AS_IF([test "x$have_stat_headers" = "xyes"], [
|
||||
AC_CHECK_FUNC([statx], [
|
||||
AC_DEFINE([HAVE_STATX], [1], [statx() is available])
|
||||
|
||||
dnl Check for STATX_MNT_ID availability
|
||||
AC_MSG_CHECKING([for STATX_MNT_ID])
|
||||
AC_COMPILE_IFELSE([
|
||||
AC_LANG_PROGRAM([[
|
||||
#include <linux/stat.h>
|
||||
]], [[
|
||||
struct statx stx;
|
||||
int mask = STATX_MNT_ID;
|
||||
(void)mask;
|
||||
(void)stx.stx_mnt_id;
|
||||
]])
|
||||
], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available])
|
||||
], [
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
], [
|
||||
AC_MSG_WARN([linux/stat.h not found; skipping statx support])
|
||||
])
|
||||
]) dnl end AC_DEFUN
|
||||
@@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
|
||||
ZFS_AC_CONFIG_USER_LIBUDEV
|
||||
ZFS_AC_CONFIG_USER_LIBUUID
|
||||
ZFS_AC_CONFIG_USER_LIBBLKID
|
||||
ZFS_AC_CONFIG_USER_STATX
|
||||
])
|
||||
ZFS_AC_CONFIG_USER_LIBTIRPC
|
||||
ZFS_AC_CONFIG_USER_LIBCRYPTO
|
||||
|
||||
@@ -205,6 +205,46 @@ AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [
|
||||
AC_MSG_RESULT([$enable_invariants])
|
||||
])
|
||||
|
||||
dnl # Disabled by default. If enabled allows a configured "turn objtools
|
||||
dnl # warnings into errors" (CONFIG_OBJTOOL_WERROR) behavior to take effect.
|
||||
dnl # If disabled, objtool warnings are never turned into errors. It can't
|
||||
dnl # be enabled if the kernel wasn't compiled with CONFIG_OBJTOOL_WERROR=y.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_OBJTOOL_WERROR], [
|
||||
AC_MSG_CHECKING([whether objtool error on warning behavior is enabled])
|
||||
AC_ARG_ENABLE([objtool-werror],
|
||||
[AS_HELP_STRING([--enable-objtool-werror],
|
||||
[Enable objtool's error on warning behaviour if present @<:@default=no@:>@])],
|
||||
[enable_objtool_werror=$enableval],
|
||||
[enable_objtool_werror=no])
|
||||
AC_MSG_RESULT([$enable_objtool_werror])
|
||||
|
||||
AS_IF([test x$CONFIG_OBJTOOL_WERROR_DEFINED = xyes],[
|
||||
AS_IF([test x$enable_objtool_werror = xyes],[
|
||||
AC_MSG_NOTICE([enable-objtool-werror defined, keeping -Werror ])
|
||||
],[
|
||||
AC_MSG_NOTICE([enable-objtool-werror undefined, disabling -Werror ])
|
||||
OBJTOOL_DISABLE_WERROR=y
|
||||
abs_objtool_binary=$kernelsrc/tools/objtool/objtool
|
||||
AS_IF([test -x $abs_objtool_binary],[],[
|
||||
AC_MSG_ERROR([*** objtool binary $abs_objtool_binary not found])
|
||||
])
|
||||
dnl # The path to the wrapper is defined in modules/Makefile.in.
|
||||
])
|
||||
],[
|
||||
dnl # We can't enable --Werror if it's not there.
|
||||
AS_IF([test x$enable_objtool_werror = xyes],[
|
||||
AC_MSG_ERROR([
|
||||
*** Cannot enable objtool-werror,
|
||||
*** a kernel built with CONFIG_OBJTOOL_WERROR=y is required.
|
||||
])
|
||||
],[])
|
||||
])
|
||||
|
||||
AC_SUBST(OBJTOOL_DISABLE_WERROR)
|
||||
AC_SUBST(abs_objtool_binary)
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [
|
||||
AX_COUNT_CPUS([])
|
||||
AC_SUBST(CPU_COUNT)
|
||||
|
||||
@@ -65,6 +65,7 @@ ZFS_AC_DEBUGINFO
|
||||
ZFS_AC_DEBUG_KMEM
|
||||
ZFS_AC_DEBUG_KMEM_TRACKING
|
||||
ZFS_AC_DEBUG_INVARIANTS
|
||||
ZFS_AC_OBJTOOL_WERROR
|
||||
|
||||
AC_CONFIG_FILES([
|
||||
contrib/debian/rules
|
||||
@@ -86,6 +87,7 @@ AC_CONFIG_FILES([
|
||||
zfs.release
|
||||
])
|
||||
|
||||
AC_CONFIG_FILES([scripts/objtool-wrapper], [chmod +x scripts/objtool-wrapper])
|
||||
|
||||
AC_OUTPUT
|
||||
|
||||
|
||||
@@ -100,8 +100,8 @@ Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
# The libcurl4 is loaded through dlopen("libcurl.so.4").
|
||||
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521
|
||||
Recommends: libcurl4
|
||||
Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux
|
||||
Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux
|
||||
Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
|
||||
Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4
|
||||
Conflicts: libzfs6linux
|
||||
Description: OpenZFS filesystem library for Linux - general support
|
||||
OpenZFS is a storage platform that encompasses the functionality of
|
||||
@@ -128,8 +128,8 @@ Package: openzfs-libzpool6
|
||||
Section: contrib/libs
|
||||
Architecture: linux-any
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Breaks: libzpool2, libzpool5, libzpool5linux, libzpool6linux
|
||||
Replaces: libzpool2, libzpool5, libzpool5linux, libzpool6linux
|
||||
Breaks: libzpool2, libzpool5, libzpool6linux
|
||||
Replaces: libzpool2, libzpool5, libzpool6linux
|
||||
Conflicts: libzpool6linux
|
||||
Description: OpenZFS pool library for Linux
|
||||
OpenZFS is a storage platform that encompasses the functionality of
|
||||
|
||||
@@ -8,6 +8,7 @@ lib/systemd/system/zfs-import-scan.service
|
||||
lib/systemd/system/zfs-import.target
|
||||
lib/systemd/system/zfs-load-key.service
|
||||
lib/systemd/system/zfs-mount.service
|
||||
lib/systemd/system/zfs-mount@.service
|
||||
lib/systemd/system/zfs-scrub-monthly@.timer
|
||||
lib/systemd/system/zfs-scrub-weekly@.timer
|
||||
lib/systemd/system/zfs-scrub@.service
|
||||
@@ -73,6 +74,7 @@ usr/share/man/man8/zfs-recv.8
|
||||
usr/share/man/man8/zfs-redact.8
|
||||
usr/share/man/man8/zfs-release.8
|
||||
usr/share/man/man8/zfs-rename.8
|
||||
usr/share/man/man8/zfs-rewrite.8
|
||||
usr/share/man/man8/zfs-rollback.8
|
||||
usr/share/man/man8/zfs-send.8
|
||||
usr/share/man/man8/zfs-set.8
|
||||
|
||||
@@ -93,7 +93,7 @@ override_dh_auto_install:
|
||||
@# Install the DKMS source.
|
||||
@# We only want the files needed to build the modules
|
||||
install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \
|
||||
'$(CURDIR)/scripts/dkms.postbuild'
|
||||
'$(CURDIR)/scripts/dkms.postbuild' '$(CURDIR)/scripts/objtool-wrapper.in'
|
||||
$(foreach file,$(DKMSFILES),mv '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/$(file)' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)' || exit 1;)
|
||||
|
||||
@# Only ever build Linux modules
|
||||
@@ -108,8 +108,8 @@ override_dh_auto_install:
|
||||
@# - zfs.release$
|
||||
@# * Takes care of spaces and tabs
|
||||
@# * Remove reference to ZFS_AC_PACKAGE
|
||||
awk '/^AC_CONFIG_FILES\(\[/,/^\]\)/ {\
|
||||
if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))/) \
|
||||
awk '/^AC_CONFIG_FILES\(\[/,/\]\)/ {\
|
||||
if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))|scripts\/objtool-wrapper.*\]\)$$/) \
|
||||
{next} } {print}' \
|
||||
'$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' | sed '/ZFS_AC_PACKAGE/d' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac'
|
||||
@# Set "SUBDIRS = module include" for CONFIG_KERNEL and remove SUBDIRS for all other configs.
|
||||
|
||||
@@ -56,6 +56,7 @@ systemdunit_DATA = \
|
||||
%D%/systemd/system/zfs-import-scan.service \
|
||||
%D%/systemd/system/zfs-import.target \
|
||||
%D%/systemd/system/zfs-mount.service \
|
||||
%D%/systemd/system/zfs-mount@.service \
|
||||
%D%/systemd/system/zfs-scrub-monthly@.timer \
|
||||
%D%/systemd/system/zfs-scrub-weekly@.timer \
|
||||
%D%/systemd/system/zfs-scrub@.service \
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
[Unit]
|
||||
Description=Mount ZFS filesystem %I
|
||||
Documentation=man:zfs(8)
|
||||
DefaultDependencies=no
|
||||
After=systemd-udev-settle.service
|
||||
After=zfs-import.target
|
||||
After=zfs-mount.service
|
||||
After=systemd-remount-fs.service
|
||||
Before=local-fs.target
|
||||
ConditionPathIsDirectory=/sys/module/zfs
|
||||
|
||||
# This merely tells the service manager
|
||||
# that unmounting everything undoes the
|
||||
# effect of this service. No extra logic
|
||||
# is run as a result of these settings.
|
||||
Conflicts=umount.target
|
||||
Before=umount.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=yes
|
||||
EnvironmentFile=-@initconfdir@/zfs
|
||||
ExecStart=@sbindir@/zfs mount -R %I
|
||||
|
||||
[Install]
|
||||
WantedBy=zfs.target
|
||||
@@ -56,4 +56,9 @@ struct opensolaris_utsname {
|
||||
#define task_io_account_read(n)
|
||||
#define task_io_account_write(n)
|
||||
|
||||
/*
|
||||
* Check if the current thread is a memory reclaim thread.
|
||||
*/
|
||||
extern int current_is_reclaim_thread(void);
|
||||
|
||||
#endif /* _OPENSOLARIS_SYS_MISC_H_ */
|
||||
|
||||
@@ -45,7 +45,9 @@
|
||||
#ifdef _KERNEL
|
||||
#define CPU curcpu
|
||||
#define minclsyspri PRIBIO
|
||||
#define defclsyspri minclsyspri
|
||||
#define defclsyspri minclsyspri
|
||||
/* Write issue taskq priority. */
|
||||
#define wtqclsyspri ((PVM + PRIBIO) / 2)
|
||||
#define maxclsyspri PVM
|
||||
#define max_ncpus (mp_maxid + 1)
|
||||
#define boot_max_ncpus (mp_maxid + 1)
|
||||
|
||||
@@ -8,6 +8,7 @@ kernel_linux_HEADERS = \
|
||||
%D%/kernel/linux/mm_compat.h \
|
||||
%D%/kernel/linux/mod_compat.h \
|
||||
%D%/kernel/linux/page_compat.h \
|
||||
%D%/kernel/linux/pagemap_compat.h \
|
||||
%D%/kernel/linux/simd.h \
|
||||
%D%/kernel/linux/simd_aarch64.h \
|
||||
%D%/kernel/linux/simd_arm.h \
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
// SPDX-License-Identifier: CDDL-1.0
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
|
||||
*/
|
||||
|
||||
#ifndef _ZFS_PAGEMAP_COMPAT_H
|
||||
#define _ZFS_PAGEMAP_COMPAT_H
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
/*
 * Fallback readahead_page(), used only when HAVE_PAGEMAP_READAHEAD_PAGE is
 * not defined: take the folio returned by __readahead_folio() and yield the
 * address of its embedded page member.
 * NOTE(review): HAVE_PAGEMAP_READAHEAD_PAGE is presumably set by a configure
 * check when the kernel still provides readahead_page() -- confirm.
 */
#ifndef HAVE_PAGEMAP_READAHEAD_PAGE
|
||||
#define readahead_page(ractl) (&(__readahead_folio(ractl)->page))
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -139,15 +139,6 @@
|
||||
*/
|
||||
#if defined(HAVE_KERNEL_FPU_INTERNAL)
|
||||
|
||||
/*
|
||||
* For kernels not exporting *kfpu_{begin,end} we have to use inline assembly
|
||||
* with the XSAVE{,OPT,S} instructions, so we need the toolchain to support at
|
||||
* least XSAVE.
|
||||
*/
|
||||
#if !defined(HAVE_XSAVE)
|
||||
#error "Toolchain needs to support the XSAVE assembler instruction"
|
||||
#endif
|
||||
|
||||
#ifndef XFEATURE_MASK_XTILE
|
||||
/*
|
||||
* For kernels where this doesn't exist yet, we still don't want to break
|
||||
@@ -335,9 +326,13 @@ kfpu_begin(void)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_XSAVE)
|
||||
if (static_cpu_has(X86_FEATURE_XSAVE)) {
|
||||
kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE);
|
||||
} else if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
kfpu_save_fxsr(state);
|
||||
} else {
|
||||
kfpu_save_fsave(state);
|
||||
@@ -390,9 +385,13 @@ kfpu_end(void)
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_XSAVE)
|
||||
if (static_cpu_has(X86_FEATURE_XSAVE)) {
|
||||
kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE);
|
||||
} else if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
if (static_cpu_has(X86_FEATURE_FXSR)) {
|
||||
kfpu_restore_fxsr(state);
|
||||
} else {
|
||||
kfpu_restore_fsave(state);
|
||||
|
||||
@@ -24,7 +24,13 @@
|
||||
#define _OS_LINUX_SPL_MISC_H
|
||||
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
extern void spl_signal_kobj_evt(struct block_device *bdev);
|
||||
|
||||
/*
|
||||
* Check if the current thread is a memory reclaim thread.
|
||||
*/
|
||||
extern int current_is_reclaim_thread(void);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -92,8 +92,10 @@
|
||||
* Treat shim tasks as SCHED_NORMAL tasks
|
||||
*/
|
||||
#define minclsyspri (MAX_PRIO-1)
|
||||
#define maxclsyspri (MAX_RT_PRIO)
|
||||
#define defclsyspri (DEFAULT_PRIO)
|
||||
/* Write issue taskq priority. */
|
||||
#define wtqclsyspri (MAX_RT_PRIO + 1)
|
||||
#define maxclsyspri (MAX_RT_PRIO)
|
||||
|
||||
#ifndef NICE_TO_PRIO
|
||||
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
|
||||
|
||||
@@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
|
||||
__field(uint64_t, z_size)
|
||||
__field(uint64_t, z_pflags)
|
||||
__field(uint32_t, z_sync_cnt)
|
||||
__field(uint32_t, z_sync_writes_cnt)
|
||||
__field(uint32_t, z_async_writes_cnt)
|
||||
__field(mode_t, z_mode)
|
||||
__field(boolean_t, z_is_sa)
|
||||
__field(boolean_t, z_is_ctldir)
|
||||
@@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
|
||||
__entry->z_size = zn->z_size;
|
||||
__entry->z_pflags = zn->z_pflags;
|
||||
__entry->z_sync_cnt = zn->z_sync_cnt;
|
||||
__entry->z_sync_writes_cnt = zn->z_sync_writes_cnt;
|
||||
__entry->z_async_writes_cnt = zn->z_async_writes_cnt;
|
||||
__entry->z_mode = zn->z_mode;
|
||||
__entry->z_is_sa = zn->z_is_sa;
|
||||
__entry->z_is_ctldir = zn->z_is_ctldir;
|
||||
@@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
|
||||
TP_printk("zn { id %llu unlinked %u atime_dirty %u "
|
||||
"zn_prefetch %u blksz %u seq %u "
|
||||
"mapcnt %llu size %llu pflags %llu "
|
||||
"sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
|
||||
"sync_cnt %u "
|
||||
"mode 0x%x is_sa %d is_ctldir %d "
|
||||
"inode { uid %u gid %u ino %lu nlink %u size %lli "
|
||||
"blkbits %u bytes %u mode 0x%x generation %x } } "
|
||||
@@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
|
||||
__entry->z_zn_prefetch, __entry->z_blksz,
|
||||
__entry->z_seq, __entry->z_mapcnt, __entry->z_size,
|
||||
__entry->z_pflags, __entry->z_sync_cnt,
|
||||
__entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
|
||||
__entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir,
|
||||
__entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink,
|
||||
__entry->i_size, __entry->i_blkbits,
|
||||
|
||||
@@ -157,6 +157,7 @@ struct znode;
|
||||
|
||||
extern int zfs_sync(struct super_block *, int, cred_t *);
|
||||
extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
|
||||
extern void zfs_inode_free(struct inode *);
|
||||
extern void zfs_inode_destroy(struct inode *);
|
||||
extern void zfs_mark_inode_dirty(struct inode *);
|
||||
extern boolean_t zfs_relatime_need_update(const struct inode *);
|
||||
|
||||
@@ -954,7 +954,7 @@ typedef struct arc_sums {
|
||||
wmsum_t arcstat_data_size;
|
||||
wmsum_t arcstat_metadata_size;
|
||||
wmsum_t arcstat_dbuf_size;
|
||||
wmsum_t arcstat_dnode_size;
|
||||
aggsum_t arcstat_dnode_size;
|
||||
wmsum_t arcstat_bonus_size;
|
||||
wmsum_t arcstat_l2_hits;
|
||||
wmsum_t arcstat_l2_misses;
|
||||
|
||||
@@ -174,6 +174,7 @@ typedef struct dbuf_dirty_record {
|
||||
arc_buf_t *dr_data;
|
||||
override_states_t dr_override_state;
|
||||
uint8_t dr_copies;
|
||||
uint8_t dr_gang_copies;
|
||||
boolean_t dr_nopwrite;
|
||||
boolean_t dr_brtwrite;
|
||||
boolean_t dr_diowrite;
|
||||
|
||||
+2
-5
@@ -286,14 +286,11 @@ typedef struct {
|
||||
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
|
||||
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
|
||||
|
||||
hrtime_t ddt_flush_start; /* log flush start this txg */
|
||||
uint32_t ddt_flush_pass; /* log flush pass this txg */
|
||||
|
||||
int32_t ddt_flush_count; /* entries flushed this txg */
|
||||
int32_t ddt_flush_min; /* min rem entries to flush */
|
||||
int32_t ddt_log_ingest_rate; /* rolling log ingest rate */
|
||||
int32_t ddt_log_flush_rate; /* rolling log flush rate */
|
||||
int32_t ddt_log_flush_time_rate; /* avg time spent flushing */
|
||||
uint32_t ddt_log_flush_pressure; /* pressure to apply for cap */
|
||||
uint32_t ddt_log_flush_prev_backlog; /* prev backlog size */
|
||||
|
||||
uint64_t ddt_flush_force_txg; /* flush hard before this txg */
|
||||
|
||||
|
||||
+2
-2
@@ -144,9 +144,9 @@ typedef enum dmu_object_byteswap {
|
||||
#define DMU_OT_IS_DDT(ot) \
|
||||
((ot) == DMU_OT_DDT_ZAP)
|
||||
|
||||
#define DMU_OT_IS_CRITICAL(ot) \
|
||||
#define DMU_OT_IS_CRITICAL(ot, level) \
|
||||
(DMU_OT_IS_METADATA(ot) && \
|
||||
(ot) != DMU_OT_DNODE && \
|
||||
((ot) != DMU_OT_DNODE || (level) > 0) && \
|
||||
(ot) != DMU_OT_DIRECTORY_CONTENTS && \
|
||||
(ot) != DMU_OT_SA)
|
||||
|
||||
|
||||
@@ -1614,6 +1614,15 @@ typedef enum zfs_ioc {
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Argument structure carried by the ZFS_IOC_REWRITE ioctl defined below.
 * All four members are fixed-width 64-bit values.
 * NOTE(review): off and len presumably describe the byte range to rewrite
 * (compare the -o and -l options of zfs-rewrite(8)); the meaning of flags
 * and arg is not visible here -- confirm against the implementation.
 */
typedef struct zfs_rewrite_args {
|
||||
uint64_t off;
|
||||
uint64_t len;
|
||||
uint64_t flags;
|
||||
uint64_t arg;
|
||||
} zfs_rewrite_args_t;
|
||||
|
||||
#define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t)
|
||||
|
||||
/*
|
||||
* ZFS-specific error codes used for returning descriptive errors
|
||||
* to the userland through zfs ioctls.
|
||||
|
||||
@@ -568,6 +568,8 @@ typedef struct metaslab_unflushed_phys {
|
||||
uint64_t msp_unflushed_txg;
|
||||
} metaslab_unflushed_phys_t;
|
||||
|
||||
char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type {
|
||||
ZFS_RANGE_SEG_NUM_TYPES,
|
||||
} zfs_range_seg_type_t;
|
||||
|
||||
#define ZFS_RT_NAME(rt) (((rt)->rt_name != NULL) ? (rt)->rt_name : "")
|
||||
#define ZFS_RT_F_DYN_NAME (1ULL << 0) /* if rt_name must be freed */
|
||||
|
||||
/*
|
||||
* Note: the range_tree may not be accessed concurrently; consumers
|
||||
* must provide external locking if required.
|
||||
@@ -68,6 +71,9 @@ typedef struct zfs_range_tree {
|
||||
void *rt_arg;
|
||||
uint64_t rt_gap; /* allowable inter-segment gap */
|
||||
|
||||
uint64_t rt_flags;
|
||||
const char *rt_name; /* details for debugging */
|
||||
|
||||
/*
|
||||
* The rt_histogram maintains a histogram of ranges. Each bucket,
|
||||
* rt_histogram[i], contains the number of ranges whose size is:
|
||||
@@ -281,6 +287,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
|
||||
uint64_t gap);
|
||||
zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
|
||||
zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
|
||||
uint64_t flags, const char *name);
|
||||
void zfs_range_tree_destroy(zfs_range_tree_t *rt);
|
||||
boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
|
||||
uint64_t size);
|
||||
|
||||
@@ -173,6 +173,7 @@ extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
|
||||
extern uint32_t vdev_queue_length(vdev_t *vd);
|
||||
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
|
||||
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
|
||||
extern boolean_t vdev_queue_pool_busy(spa_t *spa);
|
||||
|
||||
extern void vdev_config_dirty(vdev_t *vd);
|
||||
extern void vdev_config_clean(vdev_t *vd);
|
||||
|
||||
@@ -651,6 +651,7 @@ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
|
||||
int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
|
||||
#endif
|
||||
int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
|
||||
char *vdev_rt_name(vdev_t *vd, const char *name);
|
||||
|
||||
/*
|
||||
* Vdev ashift optimization tunables
|
||||
|
||||
@@ -236,6 +236,11 @@ typedef pthread_t kthread_t;
|
||||
#define thread_join(t) pthread_join((pthread_t)(t), NULL)
|
||||
|
||||
#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS)
|
||||
/*
|
||||
* Check if the current thread is a memory reclaim thread.
|
||||
* Always returns false in userspace (no memory reclaim thread).
|
||||
*/
|
||||
#define current_is_reclaim_thread() (0)
|
||||
|
||||
/* in libzpool, p0 exists only to have its address taken */
|
||||
typedef struct proc {
|
||||
@@ -623,8 +628,10 @@ extern void delay(clock_t ticks);
|
||||
* Process priorities as defined by setpriority(2) and getpriority(2).
|
||||
*/
|
||||
#define minclsyspri 19
|
||||
#define maxclsyspri -20
|
||||
#define defclsyspri 0
|
||||
/* Write issue taskq priority. */
|
||||
#define wtqclsyspri -19
|
||||
#define maxclsyspri -20
|
||||
|
||||
#define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1))
|
||||
#define CPU_SEQID_UNSTABLE CPU_SEQID
|
||||
|
||||
@@ -60,6 +60,7 @@ extern int zfs_dbgmsg_enable;
|
||||
#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13)
|
||||
#define ZFS_DEBUG_BRT (1 << 14)
|
||||
#define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15)
|
||||
#define ZFS_DEBUG_DDT (1 << 16)
|
||||
|
||||
extern void __set_error(const char *file, const char *func, int line, int err);
|
||||
extern void __zfs_dbgmsg(char *buf);
|
||||
|
||||
@@ -40,6 +40,7 @@ extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
|
||||
uint64_t *, cred_t *);
|
||||
extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
|
||||
const blkptr_t *, size_t);
|
||||
extern int zfs_rewrite(znode_t *, uint64_t, uint64_t, uint64_t, uint64_t);
|
||||
|
||||
extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
|
||||
extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
|
||||
|
||||
@@ -201,8 +201,6 @@ typedef struct znode {
|
||||
uint64_t z_size; /* file size (cached) */
|
||||
uint64_t z_pflags; /* pflags (cached) */
|
||||
uint32_t z_sync_cnt; /* synchronous open count */
|
||||
uint32_t z_sync_writes_cnt; /* synchronous write count */
|
||||
uint32_t z_async_writes_cnt; /* asynchronous write count */
|
||||
mode_t z_mode; /* mode (cached) */
|
||||
kmutex_t z_acl_lock; /* acl data lock */
|
||||
zfs_acl_t *z_acl_cached; /* cached acl */
|
||||
|
||||
+2
-1
@@ -350,6 +350,7 @@ typedef struct zio_prop {
|
||||
uint8_t zp_complevel;
|
||||
uint8_t zp_level;
|
||||
uint8_t zp_copies;
|
||||
uint8_t zp_gang_copies;
|
||||
dmu_object_type_t zp_type;
|
||||
boolean_t zp_dedup;
|
||||
boolean_t zp_dedup_verify;
|
||||
@@ -575,7 +576,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
||||
zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
|
||||
|
||||
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
|
||||
boolean_t nopwrite, boolean_t brtwrite);
|
||||
int gang_copies, boolean_t nopwrite, boolean_t brtwrite);
|
||||
|
||||
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
|
||||
|
||||
|
||||
@@ -31,6 +31,11 @@
|
||||
|
||||
#include <sys/mount.h> /* for BLKGETSIZE64 */
|
||||
|
||||
#ifdef HAVE_STATX
|
||||
#include <fcntl.h>
|
||||
#include <linux/stat.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Emulate Solaris' behavior of returning the block device size in fstat64().
|
||||
*/
|
||||
|
||||
@@ -85,13 +85,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp)
|
||||
}
|
||||
|
||||
static int
|
||||
getextmntent_impl(FILE *fp, struct extmnttab *mp)
|
||||
getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id)
|
||||
{
|
||||
int ret;
|
||||
struct stat64 st;
|
||||
|
||||
*mnt_id = 0;
|
||||
ret = _sol_getmntent(fp, (struct mnttab *)mp);
|
||||
if (ret == 0) {
|
||||
#ifdef HAVE_STATX_MNT_ID
|
||||
struct statx stx;
|
||||
if (statx(AT_FDCWD, mp->mnt_mountp,
|
||||
AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
|
||||
STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID))
|
||||
*mnt_id = stx.stx_mnt_id;
|
||||
#endif
|
||||
if (stat64(mp->mnt_mountp, &st) != 0) {
|
||||
mp->mnt_major = 0;
|
||||
mp->mnt_minor = 0;
|
||||
@@ -110,6 +118,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
|
||||
struct stat64 st;
|
||||
FILE *fp;
|
||||
int match;
|
||||
boolean_t have_mnt_id = B_FALSE;
|
||||
uint64_t target_mnt_id = 0;
|
||||
uint64_t entry_mnt_id;
|
||||
#ifdef HAVE_STATX_MNT_ID
|
||||
struct statx stx;
|
||||
#endif
|
||||
|
||||
if (strlen(path) >= MAXPATHLEN) {
|
||||
(void) fprintf(stderr, "invalid object; pathname too long\n");
|
||||
@@ -128,6 +142,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
|
||||
return (-1);
|
||||
}
|
||||
|
||||
#ifdef HAVE_STATX_MNT_ID
|
||||
if (statx(AT_FDCWD, path, AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW,
|
||||
STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) {
|
||||
have_mnt_id = B_TRUE;
|
||||
target_mnt_id = stx.stx_mnt_id;
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((fp = fopen(MNTTAB, "re")) == NULL) {
|
||||
(void) fprintf(stderr, "cannot open %s\n", MNTTAB);
|
||||
@@ -139,12 +160,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf)
|
||||
*/
|
||||
|
||||
match = 0;
|
||||
while (getextmntent_impl(fp, entry) == 0) {
|
||||
if (makedev(entry->mnt_major, entry->mnt_minor) ==
|
||||
statbuf->st_dev) {
|
||||
match = 1;
|
||||
break;
|
||||
while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) {
|
||||
if (have_mnt_id) {
|
||||
match = (entry_mnt_id == target_mnt_id);
|
||||
} else {
|
||||
match = makedev(entry->mnt_major, entry->mnt_minor) ==
|
||||
statbuf->st_dev;
|
||||
}
|
||||
if (match)
|
||||
break;
|
||||
}
|
||||
(void) fclose(fp);
|
||||
|
||||
|
||||
@@ -50,6 +50,7 @@ dist_man_MANS = \
|
||||
%D%/man8/zfs-redact.8 \
|
||||
%D%/man8/zfs-release.8 \
|
||||
%D%/man8/zfs-rename.8 \
|
||||
%D%/man8/zfs-rewrite.8 \
|
||||
%D%/man8/zfs-rollback.8 \
|
||||
%D%/man8/zfs-send.8 \
|
||||
%D%/man8/zfs-set.8 \
|
||||
|
||||
+58
-38
@@ -1057,27 +1057,6 @@ milliseconds until the operation completes.
|
||||
.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
Enable prefetching dedup-ed blocks which are going to be freed.
|
||||
.
|
||||
.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint
|
||||
Maximum number of dedup log flush passes (iterations) each transaction.
|
||||
.Pp
|
||||
At the start of each transaction, OpenZFS will estimate how many entries it
|
||||
needs to flush out to keep up with the change rate, taking the amount and time
|
||||
taken to flush on previous txgs into account (see
|
||||
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
|
||||
It will spread this amount into a number of passes.
|
||||
At each pass, it will use the amount already flushed and the total time taken
|
||||
by flushing and by other IO to recompute how much it should do for the remainder
|
||||
of the txg.
|
||||
.Pp
|
||||
Reducing the max number of passes will make flushing more aggressive, flushing
|
||||
out more entries on each pass.
|
||||
This can be faster, but also more likely to compete with other IO.
|
||||
Increasing the max number of passes will put fewer entries onto each pass,
|
||||
keeping the overhead of dedup changes to a minimum but possibly causing a large
|
||||
number of changes to be dumped on the last pass, which can blow out the txg
|
||||
sync time beyond
|
||||
.Sy zfs_txg_timeout .
|
||||
.
|
||||
.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint
|
||||
Minimum time to spend on dedup log flush each transaction.
|
||||
.Pp
|
||||
@@ -1087,22 +1066,58 @@ up to
|
||||
This occurs even if doing so would delay the transaction, that is, other IO
|
||||
completes under this time.
|
||||
.
|
||||
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint
|
||||
.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint
|
||||
Flush at least this many entries each transaction.
|
||||
.Pp
|
||||
OpenZFS will estimate how many entries it needs to flush each transaction to
|
||||
keep up with the ingest rate (see
|
||||
.Sy zfs_dedup_log_flush_flow_rate_txgs ) .
|
||||
This sets the minimum for that estimate.
|
||||
Raising it can force OpenZFS to flush more aggressively, keeping the log small
|
||||
and so reducing pool import times, but can make it less able to back off if
|
||||
log flushing would compete with other IO too much.
|
||||
OpenZFS will flush a fraction of the log every TXG, to keep the size
|
||||
proportional to the ingest rate (see
|
||||
.Sy zfs_dedup_log_flush_txgs ) .
|
||||
This sets the minimum for that estimate, which prevents the backlog from
|
||||
completely draining if the ingest rate falls.
|
||||
Raising it can force OpenZFS to flush more aggressively, reducing the backlog
|
||||
to zero more quickly, but can make it less able to back off if log
|
||||
flushing would compete with other IO too much.
|
||||
.
|
||||
.It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint
|
||||
Flush at most this many entries each transaction.
|
||||
.Pp
|
||||
Mostly used for debugging purposes.
|
||||
.It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint
|
||||
Target number of TXGs to process the whole dedup log.
|
||||
.Pp
|
||||
Every TXG, OpenZFS will process the inverse of this number times the size
|
||||
of the DDT backlog.
|
||||
This will keep the backlog at a size roughly equal to the ingest rate
|
||||
times this value.
|
||||
This offers a balance between a more efficient DDT log, with better
|
||||
aggregation, and shorter import times, which increase as the size of the
|
||||
DDT log increases.
|
||||
Increasing this value will result in a more efficient DDT log, but longer
|
||||
import times.
|
||||
.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint
|
||||
Soft cap for the size of the current dedup log.
|
||||
.Pp
|
||||
If the log is larger than this size, we increase the aggressiveness of
|
||||
the flushing to try to bring it back down to the soft cap.
|
||||
Setting it will reduce import times, but will reduce the efficiency of
|
||||
the DDT log, increasing the expected number of IOs required to flush the same
|
||||
amount of data.
|
||||
.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
||||
Whether to treat the log cap as a firm cap or not.
|
||||
.Pp
|
||||
When set to 0 (the default), the
|
||||
.Sy zfs_dedup_log_cap
|
||||
will increase the maximum number of log entries we flush in a given txg.
|
||||
This will bring the backlog size down towards the cap, but not at the expense
|
||||
of making TXG syncs take longer.
|
||||
If this is set to 1, the cap acts more like a hard cap than a soft cap; it will
|
||||
also increase the minimum number of log entries we flush per TXG.
|
||||
Enabling it will reduce worst-case import times, at the cost of increased TXG
|
||||
sync times.
|
||||
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
|
||||
Number of transactions to use to compute the flow rate.
|
||||
.Pp
|
||||
OpenZFS will estimate how many entries it needs to flush each transaction by
|
||||
monitoring the number of entries changed (ingest rate), number of entries
|
||||
OpenZFS will estimate the number of entries changed (ingest rate), the number of entries
|
||||
flushed (flush rate) and time spent flushing (flush time rate) and combining
|
||||
these into an overall "flow rate".
|
||||
It will use an exponential weighted moving average over some number of recent
|
||||
@@ -1369,14 +1384,15 @@ If this setting is 0, then even if feature@block_cloning is enabled,
|
||||
using functions and system calls that attempt to clone blocks will act as
|
||||
though the feature is disabled.
|
||||
.
|
||||
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
|
||||
written to disk.
|
||||
This allows the clone operation to reliably succeed when a file is
|
||||
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
||||
When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
|
||||
data to be written to disk before proceeding.
|
||||
This ensures that the clone operation reliably succeeds, even if a file is
|
||||
modified and then immediately cloned.
|
||||
For small files this may be slower than making a copy of the file.
|
||||
Therefore, this setting defaults to 0 which causes a clone operation to
|
||||
immediately fail when encountering a dirty block.
|
||||
Note that for small files this may be slower than simply copying the file.
|
||||
When set to 0 the clone operation will immediately fail if it encounters
|
||||
any dirty blocks.
|
||||
By default waiting is enabled.
|
||||
.
|
||||
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
|
||||
Select a BLAKE3 implementation.
|
||||
@@ -1638,6 +1654,10 @@ _
|
||||
2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree.
|
||||
4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log
|
||||
and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing.
|
||||
8192 ZFS_DEBUG_METASLAB_ALLOC Enable debugging messages when allocations fail.
|
||||
16384 ZFS_DEBUG_BRT Enable BRT-related debugging messages.
|
||||
32768 ZFS_DEBUG_RAIDZ_RECONSTRUCT Enable debugging messages for raidz reconstruction.
|
||||
65536 ZFS_DEBUG_DDT Enable DDT-related debugging messages.
|
||||
.TE
|
||||
.Sy \& * No Requires debug build .
|
||||
.
|
||||
|
||||
+2
-1
@@ -1596,7 +1596,8 @@ When set to
|
||||
ZFS stores an extra copy of only critical metadata.
|
||||
This can improve file create performance since less metadata
|
||||
needs to be written.
|
||||
If a single on-disk block is corrupt, at worst a single user file can be lost.
|
||||
If a single on-disk block is corrupt, multiple user files or directories
|
||||
can be lost.
|
||||
.Pp
|
||||
When set to
|
||||
.Sy none ,
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
.\" SPDX-License-Identifier: CDDL-1.0
|
||||
.\"
|
||||
.\" CDDL HEADER START
|
||||
.\"
|
||||
.\" The contents of this file are subject to the terms of the
|
||||
.\" Common Development and Distribution License (the "License").
|
||||
.\" You may not use this file except in compliance with the License.
|
||||
.\"
|
||||
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
.\" or https://opensource.org/licenses/CDDL-1.0.
|
||||
.\" See the License for the specific language governing permissions
|
||||
.\" and limitations under the License.
|
||||
.\"
|
||||
.\" When distributing Covered Code, include this CDDL HEADER in each
|
||||
.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
.\" If applicable, add the following below this CDDL HEADER, with the
|
||||
.\" fields enclosed by brackets "[]" replaced with your own identifying
|
||||
.\" information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
.\"
|
||||
.\" CDDL HEADER END
|
||||
.\"
|
||||
.\" Copyright (c) 2025 iXsystems, Inc.
|
||||
.\"
|
||||
.Dd May 6, 2025
|
||||
.Dt ZFS-REWRITE 8
|
||||
.Os
|
||||
.
|
||||
.Sh NAME
|
||||
.Nm zfs-rewrite
|
||||
.Nd rewrite specified files without modification
|
||||
.Sh SYNOPSIS
|
||||
.Nm zfs
|
||||
.Cm rewrite
|
||||
.Oo Fl rvx Ns Oc
|
||||
.Op Fl l Ar length
|
||||
.Op Fl o Ar offset
|
||||
.Ar file Ns | Ns Ar directory Ns …
|
||||
.
|
||||
.Sh DESCRIPTION
|
||||
Rewrite blocks of specified
|
||||
.Ar file
|
||||
as is without modification at a new location and possibly with new
|
||||
properties, such as checksum, compression, dedup, copies, etc,
|
||||
as if they were atomically read and written back.
|
||||
.Bl -tag -width "-r"
|
||||
.It Fl l Ar length
|
||||
Rewrite at most this number of bytes.
|
||||
.It Fl o Ar offset
|
||||
Start at this offset in bytes.
|
||||
.It Fl r
|
||||
Recurse into directories.
|
||||
.It Fl v
|
||||
Print names of all successfully rewritten files.
|
||||
.It Fl x
|
||||
Don't cross file system mount points when recursing.
|
||||
.El
|
||||
.Sh NOTES
|
||||
Rewriting cloned blocks or blocks that are part of any snapshot,
|
||||
as well as some property changes, may increase pool space usage.
|
||||
Holes that were never written or were previously zero-compressed are
|
||||
not rewritten and will remain holes even if compression is disabled.
|
||||
.Pp
|
||||
Rewritten blocks will be seen as modified in the next snapshot and as such
|
||||
will be included in the incremental
|
||||
.Nm zfs Cm send
|
||||
stream.
|
||||
.Pp
|
||||
If a
|
||||
.Fl l
|
||||
or
|
||||
.Fl o
|
||||
value requests a rewrite of regions past the end of the file, then those
|
||||
regions are silently ignored, and no error is reported.
|
||||
.
|
||||
.Sh SEE ALSO
|
||||
.Xr zfsprops 7
|
||||
+7
-1
@@ -37,7 +37,7 @@
|
||||
.\" Copyright 2018 Nexenta Systems, Inc.
|
||||
.\" Copyright 2019 Joyent, Inc.
|
||||
.\"
|
||||
.Dd May 12, 2022
|
||||
.Dd April 18, 2025
|
||||
.Dt ZFS 8
|
||||
.Os
|
||||
.
|
||||
@@ -299,6 +299,12 @@ Execute ZFS administrative operations
|
||||
programmatically via a Lua script-language channel program.
|
||||
.El
|
||||
.
|
||||
.Ss Data rewrite
|
||||
.Bl -tag -width ""
|
||||
.It Xr zfs-rewrite 8
|
||||
Rewrite specified files without modification.
|
||||
.El
|
||||
.
|
||||
.Ss Jails
|
||||
.Bl -tag -width ""
|
||||
.It Xr zfs-jail 8
|
||||
|
||||
@@ -494,3 +494,34 @@ UBSAN_SANITIZE_zfs/sa.o := n
|
||||
ifeq ($(CONFIG_ALTIVEC),y)
|
||||
$(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec
|
||||
endif
|
||||
|
||||
# The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the
|
||||
# subdir %.c/%.S -> %.o targets will work as expected. The in-kernel pattern targets do not seem to
|
||||
# be working on subdirs since about 6.10
|
||||
zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs)) \
|
||||
$(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64))) \
|
||||
$(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC)))
|
||||
|
||||
z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
|
||||
z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \
|
||||
$(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs)))
|
||||
|
||||
define ZKMOD_C_O_MAKE_TARGET
|
||||
$1%.o: $(src)/$1%.c FORCE
|
||||
$$(call if_changed_rule,cc_o_c)
|
||||
$$(call cmd,force_checksrc)
|
||||
endef
|
||||
|
||||
define ZKMOD_S_O_MAKE_TARGET
|
||||
$1%.o: $(src)/$1%.S FORCE
|
||||
$$(call if_changed_rule,as_o_S)
|
||||
$$(call cmd,force_checksrc)
|
||||
endef
|
||||
|
||||
$(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target))))
|
||||
$(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target))))
|
||||
|
||||
@@ -57,6 +57,7 @@ modules-Linux:
|
||||
$(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \
|
||||
$(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \
|
||||
$(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \
|
||||
$(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \
|
||||
M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules
|
||||
|
||||
modules-FreeBSD:
|
||||
|
||||
@@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the current thread is a memory reclaim thread.
|
||||
* Returns true if curproc is pageproc (FreeBSD's page daemon).
|
||||
*/
|
||||
int
|
||||
current_is_reclaim_thread(void)
|
||||
{
|
||||
return (curproc == pageproc);
|
||||
}
|
||||
|
||||
SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
|
||||
opensolaris_utsname_init, NULL);
|
||||
|
||||
@@ -306,6 +306,18 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
|
||||
*(offset_t *)data = off;
|
||||
return (0);
|
||||
}
|
||||
case ZFS_IOC_REWRITE: {
|
||||
zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
|
||||
if ((flag & FWRITE) == 0)
|
||||
return (SET_ERROR(EBADF));
|
||||
error = vn_lock(vp, LK_SHARED);
|
||||
if (error)
|
||||
return (error);
|
||||
error = zfs_rewrite(VTOZ(vp), args->off, args->len,
|
||||
args->flags, args->arg);
|
||||
VOP_UNLOCK(vp);
|
||||
return (error);
|
||||
}
|
||||
}
|
||||
return (SET_ERROR(ENOTTY));
|
||||
}
|
||||
@@ -4228,17 +4240,46 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
|
||||
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
||||
ASSERT0(err);
|
||||
|
||||
putpage_commit_arg_t *pca = kmem_alloc(
|
||||
offsetof(putpage_commit_arg_t, pca_pages[ncount]),
|
||||
KM_SLEEP);
|
||||
pca->pca_npages = ncount;
|
||||
memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
|
||||
if (commit) {
|
||||
/*
|
||||
* Caller requested that we commit immediately. We set
|
||||
* a callback on the log entry, to be called once it's
|
||||
* on disk after the call to zil_commit() below. The
|
||||
* pages will be undirtied and unbusied there.
|
||||
*/
|
||||
putpage_commit_arg_t *pca = kmem_alloc(
|
||||
offsetof(putpage_commit_arg_t, pca_pages[ncount]),
|
||||
KM_SLEEP);
|
||||
pca->pca_npages = ncount;
|
||||
memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);
|
||||
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp,
|
||||
off, len, commit, B_FALSE, zfs_putpage_commit_cb, pca);
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
|
||||
B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);
|
||||
|
||||
for (i = 0; i < ncount; i++)
|
||||
rtvals[i] = zfs_vm_pagerret_pend;
|
||||
for (i = 0; i < ncount; i++)
|
||||
rtvals[i] = zfs_vm_pagerret_pend;
|
||||
} else {
|
||||
/*
|
||||
* Caller just wants the page written back somewhere,
|
||||
* but doesn't need it committed yet. We've already
|
||||
* written it back to the DMU, so we just need to put
|
||||
* it on the async log, then undirty the page and
|
||||
* return.
|
||||
*
|
||||
* We cannot use a callback here, because it would keep
|
||||
* the page busy (locked) until it is eventually
|
||||
* written down at txg sync.
|
||||
*/
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
|
||||
B_FALSE, B_FALSE, NULL, NULL);
|
||||
|
||||
zfs_vmobject_wlock(object);
|
||||
for (i = 0; i < ncount; i++) {
|
||||
rtvals[i] = zfs_vm_pagerret_ok;
|
||||
vm_page_undirty(ma[i]);
|
||||
}
|
||||
zfs_vmobject_wunlock(object);
|
||||
}
|
||||
|
||||
VM_CNT_INC(v_vnodeout);
|
||||
VM_CNT_ADD(v_vnodepgsout, ncount);
|
||||
@@ -5201,6 +5242,11 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
|
||||
return (0);
|
||||
}
|
||||
return (EINVAL);
|
||||
#ifdef _PC_HAS_HIDDENSYSTEM
|
||||
case _PC_HAS_HIDDENSYSTEM:
|
||||
*ap->a_retval = 1;
|
||||
return (0);
|
||||
#endif
|
||||
default:
|
||||
return (vop_stdpathconf(ap));
|
||||
}
|
||||
|
||||
@@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
|
||||
zp->z_xattr_cached = NULL;
|
||||
zp->z_xattr_parent = 0;
|
||||
zp->z_vnode = NULL;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
|
||||
|
||||
ASSERT3P(zp->z_acl_cached, ==, NULL);
|
||||
ASSERT3P(zp->z_xattr_cached, ==, NULL);
|
||||
|
||||
ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
|
||||
ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
|
||||
}
|
||||
|
||||
|
||||
@@ -293,6 +288,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
|
||||
sharezp->z_atime_dirty = 0;
|
||||
sharezp->z_zfsvfs = zfsvfs;
|
||||
sharezp->z_is_sa = zfsvfs->z_use_sa;
|
||||
sharezp->z_pflags = 0;
|
||||
|
||||
VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
|
||||
kcred, NULL, &acl_ids, NULL));
|
||||
@@ -455,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
|
||||
zp->z_blksz = blksz;
|
||||
zp->z_seq = 0x7A4653;
|
||||
zp->z_sync_cnt = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
atomic_store_ptr(&zp->z_cached_symlink, NULL);
|
||||
|
||||
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
|
||||
@@ -1729,6 +1723,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
|
||||
rootzp->z_unlinked = 0;
|
||||
rootzp->z_atime_dirty = 0;
|
||||
rootzp->z_is_sa = USE_SA(version, os);
|
||||
rootzp->z_pflags = 0;
|
||||
|
||||
zfsvfs->z_os = os;
|
||||
zfsvfs->z_parent = zfsvfs;
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <sys/kmem.h>
|
||||
#include <sys/tsd.h>
|
||||
#include <sys/string.h>
|
||||
#include <sys/misc.h>
|
||||
|
||||
/*
|
||||
* Thread interfaces
|
||||
@@ -197,3 +198,14 @@ issig(void)
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(issig);
|
||||
|
||||
/*
|
||||
* Check if the current thread is a memory reclaim thread.
|
||||
* Returns true if current thread is kswapd.
|
||||
*/
|
||||
int
|
||||
current_is_reclaim_thread(void)
|
||||
{
|
||||
return (current_is_kswapd());
|
||||
}
|
||||
EXPORT_SYMBOL(current_is_reclaim_thread);
|
||||
|
||||
@@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
|
||||
zp->z_pflags = 0;
|
||||
zp->z_mode = 0;
|
||||
zp->z_sync_cnt = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
ip->i_generation = 0;
|
||||
ip->i_ino = id;
|
||||
ip->i_mode = (S_IFDIR | S_IRWXUGO);
|
||||
|
||||
@@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dentry and inode caches referenced by a task in non-root memcg are
|
||||
* not going to be scanned by the kernel-provided shrinker. So, if
|
||||
* kernel prunes nothing, fall back to this manual walk to free dnodes.
|
||||
* To avoid scanning the same znodes multiple times they are always rotated
|
||||
* to the end of the z_all_znodes list. New znodes are inserted at the
|
||||
* end of the list so we're always scanning the oldest znodes first.
|
||||
*/
|
||||
static int
|
||||
zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
|
||||
{
|
||||
znode_t **zp_array, *zp;
|
||||
int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
|
||||
int objects = 0;
|
||||
int i = 0, j = 0;
|
||||
|
||||
zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
|
||||
|
||||
mutex_enter(&zfsvfs->z_znodes_lock);
|
||||
while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
|
||||
|
||||
if ((i++ > nr_to_scan) || (j >= max_array))
|
||||
break;
|
||||
|
||||
ASSERT(list_link_active(&zp->z_link_node));
|
||||
list_remove(&zfsvfs->z_all_znodes, zp);
|
||||
list_insert_tail(&zfsvfs->z_all_znodes, zp);
|
||||
|
||||
/* Skip active znodes and .zfs entries */
|
||||
if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
|
||||
continue;
|
||||
|
||||
if (igrab(ZTOI(zp)) == NULL)
|
||||
continue;
|
||||
|
||||
zp_array[j] = zp;
|
||||
j++;
|
||||
}
|
||||
mutex_exit(&zfsvfs->z_znodes_lock);
|
||||
|
||||
for (i = 0; i < j; i++) {
|
||||
zp = zp_array[i];
|
||||
|
||||
ASSERT3P(zp, !=, NULL);
|
||||
d_prune_aliases(ZTOI(zp));
|
||||
|
||||
if (atomic_read(&ZTOI(zp)->i_count) == 1)
|
||||
objects++;
|
||||
|
||||
zrele(zp);
|
||||
}
|
||||
|
||||
vmem_free(zp_array, max_array * sizeof (znode_t *));
|
||||
|
||||
return (objects);
|
||||
}
|
||||
|
||||
/*
|
||||
* The ARC has requested that the filesystem drop entries from the dentry
|
||||
* and inode caches. This can occur when the ARC needs to free meta data
|
||||
@@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
|
||||
*objects = (*shrinker->scan_objects)(shrinker, &sc);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Fall back to zfs_prune_aliases if kernel's shrinker did nothing
|
||||
* due to dentry and inode caches being referenced by a task running
|
||||
* in non-root memcg.
|
||||
*/
|
||||
if (*objects == 0)
|
||||
*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
|
||||
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
|
||||
* Copyright 2017 Nexenta Systems, Inc.
|
||||
* Copyright (c) 2025, Klara, Inc.
|
||||
*/
|
||||
|
||||
/* Portions Copyright 2007 Jeremy Teo */
|
||||
@@ -3682,7 +3683,7 @@ top:
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_putpage_sync_commit_cb(void *arg)
|
||||
zfs_putpage_commit_cb(void *arg)
|
||||
{
|
||||
struct page *pp = arg;
|
||||
|
||||
@@ -3690,17 +3691,6 @@ zfs_putpage_sync_commit_cb(void *arg)
|
||||
end_page_writeback(pp);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_putpage_async_commit_cb(void *arg)
|
||||
{
|
||||
struct page *pp = arg;
|
||||
znode_t *zp = ITOZ(pp->mapping->host);
|
||||
|
||||
ClearPageError(pp);
|
||||
end_page_writeback(pp);
|
||||
atomic_dec_32(&zp->z_async_writes_cnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Push a page out to disk, once the page is on stable storage the
|
||||
* registered commit callback will be run as notification of completion.
|
||||
@@ -3818,15 +3808,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
if (wbc->sync_mode != WB_SYNC_NONE) {
|
||||
/*
|
||||
* Speed up any non-sync page writebacks since
|
||||
* they may take several seconds to complete.
|
||||
* Refer to the comment in zpl_fsync() for details.
|
||||
*/
|
||||
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
}
|
||||
|
||||
if (PageWriteback(pp))
|
||||
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
|
||||
folio_wait_bit(page_folio(pp), PG_writeback);
|
||||
@@ -3852,8 +3833,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
* was in fact not skipped and should not be counted as if it were.
|
||||
*/
|
||||
wbc->pages_skipped--;
|
||||
if (!for_sync)
|
||||
atomic_inc_32(&zp->z_async_writes_cnt);
|
||||
set_page_writeback(pp);
|
||||
unlock_page(pp);
|
||||
|
||||
@@ -3872,8 +3851,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
#endif
|
||||
ClearPageError(pp);
|
||||
end_page_writeback(pp);
|
||||
if (!for_sync)
|
||||
atomic_dec_32(&zp->z_async_writes_cnt);
|
||||
zfs_rangelock_exit(lr);
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (err);
|
||||
@@ -3899,35 +3876,61 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
|
||||
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
|
||||
|
||||
boolean_t commit = B_FALSE;
|
||||
if (wbc->sync_mode != WB_SYNC_NONE) {
|
||||
/*
|
||||
* Note that this is rarely called under writepages(), because
|
||||
* writepages() normally handles the entire commit for
|
||||
* performance reasons.
|
||||
*/
|
||||
commit = B_TRUE;
|
||||
} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
|
||||
/*
|
||||
* If the caller does not intend to wait synchronously
|
||||
* for this page writeback to complete and there are active
|
||||
* synchronous calls on this file, do a commit so that
|
||||
* the latter don't accidentally end up waiting for
|
||||
* our writeback to complete. Refer to the comment in
|
||||
* zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
|
||||
*/
|
||||
commit = B_TRUE;
|
||||
}
|
||||
/*
|
||||
* A note about for_sync vs wbc->sync_mode.
|
||||
*
|
||||
* for_sync indicates that this is a syncing writeback, that is, kernel
|
||||
* caller expects the data to be durably stored before being notified.
|
||||
* Often, but not always, the call was triggered by a userspace syncing
|
||||
* op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
|
||||
* means that the page should remain "locked" (in the writeback state)
|
||||
* until it is definitely on disk (ie zil_commit() or spa_sync()).
|
||||
* Otherwise, we can unlock and return as soon as it is on the
|
||||
* in-memory ZIL.
|
||||
*
|
||||
* wbc->sync_mode has similar meaning. wbc is passed from the kernel to
|
||||
* zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
|
||||
* indicates this is a regular async writeback (eg a cache eviction) and
|
||||
* so does not need a durability guarantee, while WB_SYNC_ALL indicates
|
||||
* a syncing op that must be waited on (by convention, we test for
|
||||
* !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
|
||||
* performance should there ever be a new mode that we have not yet
|
||||
* added support for).
|
||||
*
|
||||
* So, why a separate for_sync field? This is because zpl_writepages()
|
||||
* calls zfs_putpage() multiple times for a single "logical" operation.
|
||||
* It wants all the individual pages to be for_sync==TRUE ie only
|
||||
* unlocked once durably stored, but it only wants one call to
|
||||
* zil_commit() at the very end, once all the pages are synced. So,
|
||||
* it repurposes sync_mode slightly to indicate who issues and waits for
|
||||
* the IO: for NONE, the caller to zfs_putpage() will do it, while for
|
||||
* ALL, zfs_putpage should do it.
|
||||
*
|
||||
* Summary:
|
||||
* for_sync: 0=unlock immediately; 1=unlock once on disk
|
||||
* sync_mode: NONE=caller will commit; ALL=we will commit
|
||||
*/
|
||||
boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
|
||||
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
|
||||
B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
|
||||
zfs_putpage_async_commit_cb, pp);
|
||||
/*
|
||||
* We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
|
||||
* because it is a policy flag that indicates "someone will call
|
||||
* zil_commit() soon". for_sync=TRUE means exactly that; the only
|
||||
* question is whether it will be us, or zpl_writepages().
|
||||
*/
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync,
|
||||
B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
|
||||
|
||||
if (!for_sync) {
|
||||
ClearPageError(pp);
|
||||
end_page_writeback(pp);
|
||||
}
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
if (commit)
|
||||
if (need_commit)
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
|
||||
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
|
||||
|
||||
@@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
|
||||
zp->z_acl_cached = NULL;
|
||||
zp->z_xattr_cached = NULL;
|
||||
zp->z_xattr_parent = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
|
||||
ASSERT3P(zp->z_dirlocks, ==, NULL);
|
||||
ASSERT3P(zp->z_acl_cached, ==, NULL);
|
||||
ASSERT3P(zp->z_xattr_cached, ==, NULL);
|
||||
|
||||
ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
|
||||
ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip)
|
||||
return (0);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_inode_free(struct inode *ip)
|
||||
{
|
||||
kmem_cache_free(znode_cache, ITOZ(ip));
|
||||
}
|
||||
|
||||
/*
|
||||
* Called in multiple places when an inode should be destroyed.
|
||||
*/
|
||||
@@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip)
|
||||
nvlist_free(zp->z_xattr_cached);
|
||||
zp->z_xattr_cached = NULL;
|
||||
}
|
||||
|
||||
kmem_cache_free(znode_cache, zp);
|
||||
#ifndef HAVE_SOPS_FREE_INODE
|
||||
/*
|
||||
* inode needs to be freed in RCU callback. If we have
|
||||
* super_operations->free_inode, Linux kernel will do call_rcu
|
||||
* for us. But if we don't have it, since call_rcu is GPL-only
|
||||
* symbol, we can only free synchronously and accept the risk.
|
||||
*/
|
||||
zfs_inode_free(ip);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
|
||||
zp->z_blksz = blksz;
|
||||
zp->z_seq = 0x7A4653;
|
||||
zp->z_sync_cnt = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
|
||||
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
|
||||
|
||||
|
||||
@@ -36,10 +36,7 @@
|
||||
#include <sys/zfs_vfsops.h>
|
||||
#include <sys/zfs_vnops.h>
|
||||
#include <sys/zfs_project.h>
|
||||
#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
|
||||
defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
|
||||
#include <linux/pagemap.h>
|
||||
#endif
|
||||
#include <linux/pagemap_compat.h>
|
||||
#include <linux/fadvise.h>
|
||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
||||
#include <linux/writeback.h>
|
||||
@@ -114,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
|
||||
{
|
||||
struct inode *inode = filp->f_mapping->host;
|
||||
znode_t *zp = ITOZ(inode);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(inode);
|
||||
cred_t *cr = CRED();
|
||||
int error;
|
||||
fstrans_cookie_t cookie;
|
||||
|
||||
/*
|
||||
* The variables z_sync_writes_cnt and z_async_writes_cnt work in
|
||||
* tandem so that sync writes can detect if there are any non-sync
|
||||
* writes going on and vice-versa. The "vice-versa" part to this logic
|
||||
* is located in zfs_putpage() where non-sync writes check if there are
|
||||
* any ongoing sync writes. If any sync and non-sync writes overlap,
|
||||
* we do a commit to complete the non-sync writes since the latter can
|
||||
* potentially take several seconds to complete and thus block sync
|
||||
* writes in the upcoming call to filemap_write_and_wait_range().
|
||||
*/
|
||||
atomic_inc_32(&zp->z_sync_writes_cnt);
|
||||
/*
|
||||
* If the following check does not detect an overlapping non-sync write
|
||||
* (say because it's just about to start), then it is guaranteed that
|
||||
* the non-sync write will detect this sync write. This is because we
|
||||
* always increment z_sync_writes_cnt / z_async_writes_cnt before doing
|
||||
* the check on z_async_writes_cnt / z_sync_writes_cnt here and in
|
||||
* zfs_putpage() respectively.
|
||||
*/
|
||||
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
|
||||
if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
|
||||
atomic_dec_32(&zp->z_sync_writes_cnt);
|
||||
return (error);
|
||||
}
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
zpl_exit(zfsvfs, FTAG);
|
||||
}
|
||||
|
||||
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
|
||||
|
||||
/*
|
||||
* The sync write is not complete yet but we decrement
|
||||
* z_sync_writes_cnt since zfs_fsync() increments and decrements
|
||||
* it internally. If a non-sync write starts just after the decrement
|
||||
* operation but before we call zfs_fsync(), it may not detect this
|
||||
* overlapping sync write but it does not matter since we have already
|
||||
* gone past filemap_write_and_wait_range() and we won't block due to
|
||||
* the non-sync write.
|
||||
*/
|
||||
atomic_dec_32(&zp->z_sync_writes_cnt);
|
||||
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
@@ -555,6 +511,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
return (result);
|
||||
}
|
||||
|
||||
#ifdef HAVE_VFS_WRITEPAGE
|
||||
/*
|
||||
* Write out dirty pages to the ARC, this function is only required to
|
||||
* support mmap(2). Mapped pages may be dirtied by memory operations
|
||||
@@ -571,6 +528,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
|
||||
|
||||
return (zpl_putpage(pp, wbc, &for_sync));
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The flag combination which matches the behavior of zfs_space() is
|
||||
@@ -985,6 +943,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
|
||||
return (err);
|
||||
}
|
||||
|
||||
static int
|
||||
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
|
||||
{
|
||||
struct inode *ip = file_inode(filp);
|
||||
zfs_rewrite_args_t args;
|
||||
fstrans_cookie_t cookie;
|
||||
int err;
|
||||
|
||||
if (copy_from_user(&args, arg, sizeof (args)))
|
||||
return (-EFAULT);
|
||||
|
||||
if (unlikely(!(filp->f_mode & FMODE_WRITE)))
|
||||
return (-EBADF);
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
static long
|
||||
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
@@ -1003,6 +982,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
|
||||
return (zpl_ioctl_getdosflags(filp, (void *)arg));
|
||||
case ZFS_IOC_SETDOSFLAGS:
|
||||
return (zpl_ioctl_setdosflags(filp, (void *)arg));
|
||||
case ZFS_IOC_REWRITE:
|
||||
return (zpl_ioctl_rewrite(filp, (void *)arg));
|
||||
default:
|
||||
return (-ENOTTY);
|
||||
}
|
||||
@@ -1040,7 +1021,9 @@ const struct address_space_operations zpl_address_space_operations = {
|
||||
#else
|
||||
.readpage = zpl_readpage,
|
||||
#endif
|
||||
#ifdef HAVE_VFS_WRITEPAGE
|
||||
.writepage = zpl_writepage,
|
||||
#endif
|
||||
.writepages = zpl_writepages,
|
||||
.direct_IO = zpl_direct_IO,
|
||||
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
|
||||
|
||||
@@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb)
|
||||
return (ip);
|
||||
}
|
||||
|
||||
#ifdef HAVE_SOPS_FREE_INODE
|
||||
static void
|
||||
zpl_inode_free(struct inode *ip)
|
||||
{
|
||||
ASSERT(atomic_read(&ip->i_count) == 0);
|
||||
zfs_inode_free(ip);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
zpl_inode_destroy(struct inode *ip)
|
||||
{
|
||||
@@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg)
|
||||
|
||||
const struct super_operations zpl_super_operations = {
|
||||
.alloc_inode = zpl_inode_alloc,
|
||||
#ifdef HAVE_SOPS_FREE_INODE
|
||||
.free_inode = zpl_inode_free,
|
||||
#endif
|
||||
.destroy_inode = zpl_inode_destroy,
|
||||
.dirty_inode = zpl_dirty_inode,
|
||||
.write_inode = NULL,
|
||||
|
||||
@@ -558,8 +558,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
|
||||
#ifdef HAVE_BLK_MQ_RQ_HCTX
|
||||
blk_mq_hw_queue = rq->mq_hctx->queue_num;
|
||||
#else
|
||||
blk_mq_hw_queue =
|
||||
rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
|
||||
blk_mq_hw_queue = rq->q->queue_hw_ctx[
|
||||
rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
|
||||
#endif
|
||||
taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
|
||||
blk_mq_hw_queue);
|
||||
|
||||
+26
-43
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
||||
ARCSTAT_INCR(arcstat_bonus_size, space);
|
||||
break;
|
||||
case ARC_SPACE_DNODE:
|
||||
ARCSTAT_INCR(arcstat_dnode_size, space);
|
||||
aggsum_add(&arc_sums.arcstat_dnode_size, space);
|
||||
break;
|
||||
case ARC_SPACE_DBUF:
|
||||
ARCSTAT_INCR(arcstat_dbuf_size, space);
|
||||
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
||||
ARCSTAT_INCR(arcstat_bonus_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_DNODE:
|
||||
ARCSTAT_INCR(arcstat_dnode_size, -space);
|
||||
aggsum_add(&arc_sums.arcstat_dnode_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_DBUF:
|
||||
ARCSTAT_INCR(arcstat_dbuf_size, -space);
|
||||
@@ -4490,7 +4490,7 @@ arc_evict(void)
|
||||
* target is not evictable or if they go over arc_dnode_limit.
|
||||
*/
|
||||
int64_t prune = 0;
|
||||
int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
|
||||
int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
|
||||
int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
|
||||
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
|
||||
- zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
|
||||
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
|
||||
* in the ARC. In practice, that's in the tens of MB, which is low
|
||||
* enough to be safe.
|
||||
*/
|
||||
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
|
||||
int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
|
||||
zfs_max_recordsize;
|
||||
int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
|
||||
arc_dnode_limit;
|
||||
|
||||
/* Always allow at least one block of overflow. */
|
||||
if (over < 0)
|
||||
if (arc_over < 0 && dn_over <= 0)
|
||||
return (ARC_OVF_NONE);
|
||||
|
||||
/* If we are under memory pressure, report severe overflow. */
|
||||
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
|
||||
int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
|
||||
if (use_reserve)
|
||||
overflow *= 3;
|
||||
return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
|
||||
return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
|
||||
}
|
||||
|
||||
static abd_t *
|
||||
@@ -6627,27 +6629,11 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
arc_state_t *state = hdr->b_l1hdr.b_state;
|
||||
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
|
||||
ASSERT3P(state, !=, arc_anon);
|
||||
ASSERT3P(state, !=, arc_l2c_only);
|
||||
|
||||
/* this buffer is not on any list */
|
||||
ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
|
||||
|
||||
if (HDR_HAS_L2HDR(hdr)) {
|
||||
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
||||
|
||||
/*
|
||||
* We have to recheck this conditional again now that
|
||||
* we're holding the l2ad_mtx to prevent a race with
|
||||
* another thread which might be concurrently calling
|
||||
* l2arc_evict(). In that case, l2arc_evict() might have
|
||||
* destroyed the header's L2 portion as we were waiting
|
||||
* to acquire the l2ad_mtx.
|
||||
*/
|
||||
if (HDR_HAS_L2HDR(hdr))
|
||||
arc_hdr_l2hdr_destroy(hdr);
|
||||
|
||||
mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do we have more than one buf?
|
||||
*/
|
||||
@@ -6659,10 +6645,6 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
boolean_t protected = HDR_PROTECTED(hdr);
|
||||
enum zio_compress compress = arc_hdr_get_compress(hdr);
|
||||
arc_buf_contents_t type = arc_buf_type(hdr);
|
||||
VERIFY3U(hdr->b_type, ==, type);
|
||||
|
||||
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
|
||||
VERIFY3S(remove_reference(hdr, tag), >, 0);
|
||||
|
||||
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
|
||||
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
|
||||
@@ -6670,10 +6652,10 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
}
|
||||
|
||||
/*
|
||||
* Pull the data off of this hdr and attach it to
|
||||
* a new anonymous hdr. Also find the last buffer
|
||||
* Pull the buffer off of this hdr and find the last buffer
|
||||
* in the hdr's buffer list.
|
||||
*/
|
||||
VERIFY3S(remove_reference(hdr, tag), >, 0);
|
||||
arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
|
||||
ASSERT3P(lastbuf, !=, NULL);
|
||||
|
||||
@@ -6682,7 +6664,6 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
* buffer, then we must stop sharing that block.
|
||||
*/
|
||||
if (ARC_BUF_SHARED(buf)) {
|
||||
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
|
||||
ASSERT(!arc_buf_is_shared(lastbuf));
|
||||
|
||||
/*
|
||||
@@ -6704,7 +6685,6 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
|
||||
buf->b_data, psize);
|
||||
}
|
||||
VERIFY3P(lastbuf->b_data, !=, NULL);
|
||||
} else if (HDR_SHARED_DATA(hdr)) {
|
||||
/*
|
||||
* Uncompressed shared buffers are always at the end
|
||||
@@ -6720,18 +6700,10 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
}
|
||||
|
||||
ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
|
||||
ASSERT3P(state, !=, arc_l2c_only);
|
||||
|
||||
(void) zfs_refcount_remove_many(&state->arcs_size[type],
|
||||
arc_buf_size(buf), buf);
|
||||
|
||||
if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
|
||||
ASSERT3P(state, !=, arc_l2c_only);
|
||||
(void) zfs_refcount_remove_many(
|
||||
&state->arcs_esize[type],
|
||||
arc_buf_size(buf), buf);
|
||||
}
|
||||
|
||||
arc_cksum_verify(buf);
|
||||
arc_buf_unwatch(buf);
|
||||
|
||||
@@ -6759,6 +6731,15 @@ arc_release(arc_buf_t *buf, const void *tag)
|
||||
/* protected by hash lock, or hdr is on arc_anon */
|
||||
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
||||
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
|
||||
|
||||
if (HDR_HAS_L2HDR(hdr)) {
|
||||
mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
||||
/* Recheck to prevent race with l2arc_evict(). */
|
||||
if (HDR_HAS_L2HDR(hdr))
|
||||
arc_hdr_l2hdr_destroy(hdr);
|
||||
mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
|
||||
}
|
||||
|
||||
hdr->b_l1hdr.b_mru_hits = 0;
|
||||
hdr->b_l1hdr.b_mru_ghost_hits = 0;
|
||||
hdr->b_l1hdr.b_mfu_hits = 0;
|
||||
@@ -7086,6 +7067,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
|
||||
localprop.zp_nopwrite = B_FALSE;
|
||||
localprop.zp_copies =
|
||||
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
|
||||
localprop.zp_gang_copies =
|
||||
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
|
||||
}
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
} else if (ARC_BUF_COMPRESSED(buf)) {
|
||||
@@ -7343,7 +7326,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
||||
#if defined(COMPAT_FREEBSD11)
|
||||
as->arcstat_other_size.value.ui64 =
|
||||
wmsum_value(&arc_sums.arcstat_bonus_size) +
|
||||
wmsum_value(&arc_sums.arcstat_dnode_size) +
|
||||
aggsum_value(&arc_sums.arcstat_dnode_size) +
|
||||
wmsum_value(&arc_sums.arcstat_dbuf_size);
|
||||
#endif
|
||||
|
||||
@@ -7385,7 +7368,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
||||
&as->arcstat_uncached_evictable_metadata);
|
||||
|
||||
as->arcstat_dnode_size.value.ui64 =
|
||||
wmsum_value(&arc_sums.arcstat_dnode_size);
|
||||
aggsum_value(&arc_sums.arcstat_dnode_size);
|
||||
as->arcstat_bonus_size.value.ui64 =
|
||||
wmsum_value(&arc_sums.arcstat_bonus_size);
|
||||
as->arcstat_l2_hits.value.ui64 =
|
||||
@@ -7755,7 +7738,7 @@ arc_state_init(void)
|
||||
wmsum_init(&arc_sums.arcstat_data_size, 0);
|
||||
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
|
||||
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
|
||||
wmsum_init(&arc_sums.arcstat_dnode_size, 0);
|
||||
aggsum_init(&arc_sums.arcstat_dnode_size, 0);
|
||||
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
|
||||
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
|
||||
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
|
||||
@@ -7914,7 +7897,7 @@ arc_state_fini(void)
|
||||
wmsum_fini(&arc_sums.arcstat_data_size);
|
||||
wmsum_fini(&arc_sums.arcstat_metadata_size);
|
||||
wmsum_fini(&arc_sums.arcstat_dbuf_size);
|
||||
wmsum_fini(&arc_sums.arcstat_dnode_size);
|
||||
aggsum_fini(&arc_sums.arcstat_dnode_size);
|
||||
wmsum_fini(&arc_sums.arcstat_bonus_size);
|
||||
wmsum_fini(&arc_sums.arcstat_l2_hits);
|
||||
wmsum_fini(&arc_sums.arcstat_l2_misses);
|
||||
|
||||
+14
-17
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
|
||||
* and grabbing the lock results in massive lock contention.
|
||||
*/
|
||||
if (size > dbuf_cache_target_bytes()) {
|
||||
if (size > dbuf_cache_hiwater_bytes())
|
||||
/*
|
||||
* Avoid calling dbuf_evict_one() from memory reclaim context
|
||||
* (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
|
||||
* Memory reclaim threads can get stuck waiting for the dbuf
|
||||
* hash lock.
|
||||
*/
|
||||
if (size > dbuf_cache_hiwater_bytes() &&
|
||||
!current_is_reclaim_thread()) {
|
||||
dbuf_evict_one();
|
||||
}
|
||||
cv_signal(&dbuf_evict_cv);
|
||||
}
|
||||
}
|
||||
@@ -1185,16 +1193,9 @@ dbuf_verify(dmu_buf_impl_t *db)
|
||||
ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
|
||||
ASSERT3U(db->db_parent->db.db_object, ==,
|
||||
db->db.db_object);
|
||||
/*
|
||||
* dnode_grow_indblksz() can make this fail if we don't
|
||||
* have the parent's rwlock. XXX indblksz no longer
|
||||
* grows. safe to do this now?
|
||||
*/
|
||||
if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
|
||||
ASSERT3P(db->db_blkptr, ==,
|
||||
((blkptr_t *)db->db_parent->db.db_data +
|
||||
db->db_blkid % epb));
|
||||
}
|
||||
ASSERT3P(db->db_blkptr, ==,
|
||||
((blkptr_t *)db->db_parent->db.db_data +
|
||||
db->db_blkid % epb));
|
||||
}
|
||||
}
|
||||
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
|
||||
@@ -3391,12 +3392,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
||||
*parentp = NULL;
|
||||
return (err);
|
||||
}
|
||||
rw_enter(&(*parentp)->db_rwlock, RW_READER);
|
||||
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
|
||||
(blkid & ((1ULL << epbs) - 1));
|
||||
if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
|
||||
ASSERT(BP_IS_HOLE(*bpp));
|
||||
rw_exit(&(*parentp)->db_rwlock);
|
||||
return (0);
|
||||
} else {
|
||||
/* the block is referenced from the dnode */
|
||||
@@ -5375,8 +5372,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
||||
mutex_enter(&db->db_mtx);
|
||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
||||
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
|
||||
dr->dt.dl.dr_brtwrite);
|
||||
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
|
||||
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
|
||||
mutex_exit(&db->db_mtx);
|
||||
} else if (data == NULL) {
|
||||
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
|
||||
|
||||
+179
-163
@@ -250,11 +250,6 @@ static uint32_t zfs_ddt_prunes_per_txg = 50000;
|
||||
boolean_t ddt_prune_artificial_age = B_FALSE;
|
||||
boolean_t ddt_dump_prune_histogram = B_FALSE;
|
||||
|
||||
/*
|
||||
* Don't do more than this many incremental flush passes per txg.
|
||||
*/
|
||||
uint_t zfs_dedup_log_flush_passes_max = 8;
|
||||
|
||||
/*
|
||||
* Minimum time to flush per txg.
|
||||
*/
|
||||
@@ -263,7 +258,32 @@ uint_t zfs_dedup_log_flush_min_time_ms = 1000;
|
||||
/*
|
||||
* Minimum entries to flush per txg.
|
||||
*/
|
||||
uint_t zfs_dedup_log_flush_entries_min = 1000;
|
||||
uint_t zfs_dedup_log_flush_entries_min = 200;
|
||||
|
||||
/*
|
||||
* Target number of TXGs until the whole dedup log has been flushed.
|
||||
* The log size will float around this value times the ingest rate.
|
||||
*/
|
||||
uint_t zfs_dedup_log_flush_txgs = 100;
|
||||
|
||||
/*
|
||||
* Maximum entries to flush per txg. Used for testing the dedup log.
|
||||
*/
|
||||
uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
|
||||
|
||||
/*
|
||||
* Soft cap for the size of the current dedup log. If the log is larger
|
||||
* than this size, we slightly increase the aggressiveness of the flushing to
|
||||
* try to bring it back down to the soft cap.
|
||||
*/
|
||||
uint_t zfs_dedup_log_cap = UINT_MAX;
|
||||
|
||||
/*
|
||||
* If this is set to B_TRUE, the cap above acts more like a hard cap:
|
||||
* flushing is significantly more aggressive, increasing the minimum amount we
|
||||
* flush per txg, as well as the maximum.
|
||||
*/
|
||||
boolean_t zfs_dedup_log_hard_cap = B_FALSE;
|
||||
|
||||
/*
|
||||
* Number of txgs to average flow rates across.
|
||||
@@ -1600,6 +1620,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
|
||||
ddt->ddt_spa = spa;
|
||||
ddt->ddt_os = spa->spa_meta_objset;
|
||||
ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
|
||||
ddt->ddt_log_flush_pressure = 10;
|
||||
|
||||
ddt_log_alloc(ddt);
|
||||
ddt_table_alloc_kstats(ddt);
|
||||
@@ -2013,146 +2034,6 @@ _ewma(int32_t val, int32_t prev, uint32_t weight)
|
||||
return (new);
|
||||
}
|
||||
|
||||
/* Returns true if done for this txg */
|
||||
static boolean_t
|
||||
ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
|
||||
{
|
||||
if (ddt->ddt_flush_pass == 0) {
|
||||
if (spa_sync_pass(ddt->ddt_spa) == 1) {
|
||||
/* First run this txg, get set up */
|
||||
ddt->ddt_flush_start = gethrtime();
|
||||
ddt->ddt_flush_count = 0;
|
||||
|
||||
/*
|
||||
* How many entries we need to flush. We want to at
|
||||
* least match the ingest rate.
|
||||
*/
|
||||
ddt->ddt_flush_min = MAX(
|
||||
ddt->ddt_log_ingest_rate,
|
||||
zfs_dedup_log_flush_entries_min);
|
||||
|
||||
/*
|
||||
* If we've been asked to flush everything in a hurry,
|
||||
* try to dump as much as possible on this txg. In
|
||||
* this case we're only limited by time, not amount.
|
||||
*/
|
||||
if (ddt->ddt_flush_force_txg > 0)
|
||||
ddt->ddt_flush_min =
|
||||
MAX(ddt->ddt_flush_min, avl_numnodes(
|
||||
&ddt->ddt_log_flushing->ddl_tree));
|
||||
} else {
|
||||
/* We already decided we're done for this txg */
|
||||
return (B_FALSE);
|
||||
}
|
||||
} else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) {
|
||||
/*
|
||||
* We already did some flushing on this pass, skip it. This
|
||||
* happens when dsl_process_async_destroys() runs during a scan
|
||||
* (on pass 1) and does an additional ddt_sync() to update
|
||||
* freed blocks.
|
||||
*/
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
if (spa_sync_pass(ddt->ddt_spa) >
|
||||
MAX(zfs_dedup_log_flush_passes_max, 1)) {
|
||||
/* Too many passes this txg, defer until next. */
|
||||
ddt->ddt_flush_pass = 0;
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
|
||||
/* Nothing to flush, done for this txg. */
|
||||
ddt->ddt_flush_pass = 0;
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ?
|
||||
MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
|
||||
SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout);
|
||||
|
||||
uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start;
|
||||
|
||||
if (elapsed_time >= target_time) {
|
||||
/* Too long since we started, done for this txg. */
|
||||
ddt->ddt_flush_pass = 0;
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
ddt->ddt_flush_pass++;
|
||||
ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass);
|
||||
|
||||
/*
|
||||
* Estimate how much time we'll need to flush the remaining entries
|
||||
* based on how long it normally takes.
|
||||
*/
|
||||
uint32_t want_time;
|
||||
if (ddt->ddt_flush_pass == 1) {
|
||||
/* First pass, use the average time/entries */
|
||||
if (ddt->ddt_log_flush_rate == 0)
|
||||
/* Zero rate, just assume the whole time */
|
||||
want_time = target_time;
|
||||
else
|
||||
want_time = ddt->ddt_flush_min *
|
||||
ddt->ddt_log_flush_time_rate /
|
||||
ddt->ddt_log_flush_rate;
|
||||
} else {
|
||||
/* Later pass, calculate from this txg so far */
|
||||
want_time = ddt->ddt_flush_min *
|
||||
elapsed_time / ddt->ddt_flush_count;
|
||||
}
|
||||
|
||||
/* Figure out how much time we have left */
|
||||
uint32_t remain_time = target_time - elapsed_time;
|
||||
|
||||
/* Smear the remaining entries over the remaining passes. */
|
||||
uint32_t nentries = ddt->ddt_flush_min /
|
||||
(MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass);
|
||||
if (want_time > remain_time) {
|
||||
/*
|
||||
* We're behind; try to catch up a bit by doubling the amount
|
||||
* this pass. If we're behind that means we're in a later
|
||||
* pass and likely have most of the remaining time to
|
||||
* ourselves. If we're in the last couple of passes, then
|
||||
* doubling might just take us over the timeout, but probably
|
||||
* not be much, and it stops us falling behind. If we're
|
||||
* in the middle passes, there'll be more to do, but it
|
||||
* might just help us catch up a bit and we'll recalculate on
|
||||
* the next pass anyway.
|
||||
*/
|
||||
nentries = MIN(ddt->ddt_flush_min, nentries*2);
|
||||
}
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
uint32_t count = 0;
|
||||
while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
|
||||
ddt_sync_flush_entry(ddt, &ddlwe,
|
||||
ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
|
||||
|
||||
/* End this pass if we've synced as much as we need to. */
|
||||
if (++count >= nentries)
|
||||
break;
|
||||
}
|
||||
ddt->ddt_flush_count += count;
|
||||
ddt->ddt_flush_min -= count;
|
||||
|
||||
if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
|
||||
/* We emptied it, so truncate on-disk */
|
||||
DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
|
||||
ddt_log_truncate(ddt, tx);
|
||||
/* No more passes needed this txg */
|
||||
ddt->ddt_flush_pass = 0;
|
||||
} else {
|
||||
/* More to do next time, save checkpoint */
|
||||
DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
|
||||
ddt_log_checkpoint(ddt, &ddlwe, tx);
|
||||
}
|
||||
|
||||
ddt_sync_update_stats(ddt, tx);
|
||||
|
||||
return (ddt->ddt_flush_pass == 0);
|
||||
}
|
||||
|
||||
static inline void
|
||||
ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
|
||||
{
|
||||
@@ -2190,19 +2071,135 @@ ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg)
|
||||
static void
|
||||
ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = ddt->ddt_spa;
|
||||
ASSERT(avl_is_empty(&ddt->ddt_tree));
|
||||
|
||||
/* Don't do any flushing when the pool is ready to shut down */
|
||||
if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa))
|
||||
/*
|
||||
* Don't do any flushing when the pool is ready to shut down, or in
|
||||
* passes beyond the first.
|
||||
*/
|
||||
if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa))
|
||||
return;
|
||||
|
||||
/* Try to flush some. */
|
||||
if (!ddt_sync_flush_log_incremental(ddt, tx))
|
||||
/* More to do next time */
|
||||
return;
|
||||
hrtime_t flush_start = gethrtime();
|
||||
uint32_t count = 0;
|
||||
|
||||
/* No more flushing this txg, so we can do end-of-txg housekeeping */
|
||||
/*
|
||||
* How many entries we need to flush. We need to at
|
||||
* least match the ingest rate, and also consider the
|
||||
* current backlog of entries.
|
||||
*/
|
||||
uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) +
|
||||
avl_numnodes(&ddt->ddt_log_active->ddl_tree);
|
||||
|
||||
if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree))
|
||||
goto housekeeping;
|
||||
|
||||
uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs);
|
||||
uint64_t cap = MAX(1, zfs_dedup_log_cap);
|
||||
uint64_t flush_min = MAX(backlog / txgs,
|
||||
zfs_dedup_log_flush_entries_min);
|
||||
|
||||
/*
|
||||
* The theory for this block is that if we increase the pressure while
|
||||
* we're growing above the cap, and remove it when we're significantly
|
||||
* below the cap, we'll stay near cap while not bouncing around too
|
||||
* much.
|
||||
*
|
||||
* The factor of 10 is to smooth the pressure effect by expressing it
|
||||
* in tenths. The addition of the cap to the backlog in the second
|
||||
* block is to round up, instead of down. We never let the pressure go
|
||||
* below 1 (10 tenths).
|
||||
*/
|
||||
if (cap != UINT_MAX && backlog > cap &&
|
||||
backlog > ddt->ddt_log_flush_prev_backlog) {
|
||||
ddt->ddt_log_flush_pressure += 10 * backlog / cap;
|
||||
} else if (cap != UINT_MAX && backlog < cap) {
|
||||
ddt->ddt_log_flush_pressure -=
|
||||
11 - (((10 * backlog) + cap - 1) / cap);
|
||||
ddt->ddt_log_flush_pressure =
|
||||
MAX(ddt->ddt_log_flush_pressure, 10);
|
||||
}
|
||||
|
||||
if (zfs_dedup_log_hard_cap && cap != UINT_MAX)
|
||||
flush_min = MAX(flush_min, MIN(backlog - cap,
|
||||
(flush_min * ddt->ddt_log_flush_pressure) / 10));
|
||||
|
||||
uint64_t flush_max;
|
||||
|
||||
/*
|
||||
* If we've been asked to flush everything in a hurry,
|
||||
* try to dump as much as possible on this txg. In
|
||||
* this case we're only limited by time, not amount.
|
||||
*
|
||||
* Otherwise, if we are over the cap, try to get back down to it.
|
||||
*
|
||||
* Finally if there is no cap (or no pressure), just set the max a
|
||||
* little higher than the min to help smooth out variations in flush
|
||||
* times.
|
||||
*/
|
||||
if (ddt->ddt_flush_force_txg > 0)
|
||||
flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
|
||||
else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap)
|
||||
flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap,
|
||||
(flush_min * ddt->ddt_log_flush_pressure) / 10));
|
||||
else
|
||||
flush_max = flush_min * 5 / 4;
|
||||
flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max);
|
||||
|
||||
/*
|
||||
* When the pool is busy or someone is explicitly waiting for this txg
|
||||
* to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use
|
||||
* half of the time in the txg timeout.
|
||||
*/
|
||||
uint64_t target_time;
|
||||
|
||||
if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ||
|
||||
vdev_queue_pool_busy(spa)) {
|
||||
target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms),
|
||||
SEC2NSEC(zfs_txg_timeout) / 2);
|
||||
} else {
|
||||
target_time = SEC2NSEC(zfs_txg_timeout) / 2;
|
||||
}
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) {
|
||||
ddt_sync_flush_entry(ddt, &ddlwe,
|
||||
ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx);
|
||||
|
||||
/* End if we've synced as much as we needed to. */
|
||||
if (++count >= flush_max)
|
||||
break;
|
||||
|
||||
/*
|
||||
* As long as we've flushed the absolute minimum,
|
||||
* stop if we're way over our target time.
|
||||
*/
|
||||
uint64_t diff = gethrtime() - flush_start;
|
||||
if (count > zfs_dedup_log_flush_entries_min &&
|
||||
diff >= target_time * 2)
|
||||
break;
|
||||
|
||||
/*
|
||||
* End if we've passed the minimum flush and we're out of time.
|
||||
*/
|
||||
if (count > flush_min && diff >= target_time)
|
||||
break;
|
||||
}
|
||||
|
||||
if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
|
||||
/* We emptied it, so truncate on-disk */
|
||||
DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
|
||||
ddt_log_truncate(ddt, tx);
|
||||
} else {
|
||||
/* More to do next time, save checkpoint */
|
||||
DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
|
||||
ddt_log_checkpoint(ddt, &ddlwe, tx);
|
||||
}
|
||||
|
||||
ddt_sync_update_stats(ddt, tx);
|
||||
|
||||
housekeeping:
|
||||
if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) &&
|
||||
!avl_is_empty(&ddt->ddt_log_active->ddl_tree)) {
|
||||
/*
|
||||
@@ -2219,12 +2216,13 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
|
||||
/* If force flush is no longer necessary, turn it off. */
|
||||
ddt_flush_force_update_txg(ddt, 0);
|
||||
|
||||
ddt->ddt_log_flush_prev_backlog = backlog;
|
||||
|
||||
/*
|
||||
* Update flush rate. This is an exponential weighted moving average of
|
||||
* the number of entries flushed over recent txgs.
|
||||
* Update flush rate. This is an exponential weighted moving
|
||||
* average of the number of entries flushed over recent txgs.
|
||||
*/
|
||||
ddt->ddt_log_flush_rate = _ewma(
|
||||
ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
|
||||
ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate,
|
||||
zfs_dedup_log_flush_flow_rate_txgs);
|
||||
DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);
|
||||
|
||||
@@ -2232,12 +2230,21 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
|
||||
* Update flush time rate. This is an exponential weighted moving
|
||||
* average of the total time taken to flush over recent txgs.
|
||||
*/
|
||||
ddt->ddt_log_flush_time_rate = _ewma(
|
||||
ddt->ddt_log_flush_time_rate,
|
||||
((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
|
||||
ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate,
|
||||
(int32_t)NSEC2MSEC(gethrtime() - flush_start),
|
||||
zfs_dedup_log_flush_flow_rate_txgs);
|
||||
DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
|
||||
ddt->ddt_log_flush_time_rate);
|
||||
if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 &&
|
||||
zfs_flags & ZFS_DEBUG_DDT) {
|
||||
zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ "
|
||||
"txg %llu, in %llu ms, flush rate %d, time rate %d",
|
||||
(ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree),
|
||||
(ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree),
|
||||
count, (u_longlong_t)tx->tx_txg,
|
||||
(u_longlong_t)NSEC2MSEC(gethrtime() - flush_start),
|
||||
ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -2785,14 +2792,23 @@ ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
|
||||
"Enable prefetching dedup-ed blks");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW,
|
||||
"Max number of incremental dedup log flush passes per transaction");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW,
|
||||
"Min time to spend on incremental dedup log flush each transaction");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW,
|
||||
"Min number of log entries to flush each transaction");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW,
|
||||
"Max number of log entries to flush each transaction");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW,
|
||||
"Number of TXGs to try to rotate the log in");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW,
|
||||
"Soft cap for the size of the current dedup log");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW,
|
||||
"Whether to use the soft cap as a hard cap");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
|
||||
"Number of txgs to average flow rates across");
|
||||
|
||||
+21
-1
@@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
||||
dr->dt.dl.dr_overridden_by = *zio->io_bp;
|
||||
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
|
||||
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
|
||||
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
|
||||
|
||||
/*
|
||||
* Old style holes are filled with all zeros, whereas
|
||||
@@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
boolean_t dedup_verify = os->os_dedup_verify;
|
||||
boolean_t encrypt = B_FALSE;
|
||||
int copies = os->os_copies;
|
||||
int gang_copies = os->os_copies;
|
||||
|
||||
/*
|
||||
* We maintain different write policies for each of the following
|
||||
@@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
switch (os->os_redundant_metadata) {
|
||||
case ZFS_REDUNDANT_METADATA_ALL:
|
||||
copies++;
|
||||
gang_copies++;
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_MOST:
|
||||
if (level >= zfs_redundant_metadata_most_ditto_level ||
|
||||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
|
||||
copies++;
|
||||
if (level + 1 >=
|
||||
zfs_redundant_metadata_most_ditto_level ||
|
||||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
|
||||
gang_copies++;
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_SOME:
|
||||
if (DMU_OT_IS_CRITICAL(type))
|
||||
if (DMU_OT_IS_CRITICAL(type, level)) {
|
||||
copies++;
|
||||
gang_copies++;
|
||||
} else if (DMU_OT_IS_METADATA(type)) {
|
||||
gang_copies++;
|
||||
}
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_NONE:
|
||||
break;
|
||||
@@ -2445,6 +2456,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
|
||||
ZCHECKSUM_FLAG_NOPWRITE) &&
|
||||
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
|
||||
|
||||
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
|
||||
(os->os_redundant_metadata ==
|
||||
ZFS_REDUNDANT_METADATA_MOST &&
|
||||
zfs_redundant_metadata_most_ditto_level <= 1))
|
||||
gang_copies++;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2461,6 +2478,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
|
||||
if (DMU_OT_IS_ENCRYPTED(type)) {
|
||||
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
|
||||
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
|
||||
nopwrite = B_FALSE;
|
||||
} else {
|
||||
dedup = B_FALSE;
|
||||
@@ -2478,6 +2496,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
|
||||
zp->zp_level = level;
|
||||
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
|
||||
zp->zp_gang_copies = MIN(MAX(gang_copies, copies),
|
||||
spa_max_replication(os->os_spa));
|
||||
zp->zp_dedup = dedup;
|
||||
zp->zp_dedup_verify = dedup && dedup_verify;
|
||||
zp->zp_nopwrite = nopwrite;
|
||||
|
||||
@@ -2310,6 +2310,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
|
||||
zp.zp_nopwrite = B_FALSE;
|
||||
zp.zp_copies = MIN(zp.zp_copies,
|
||||
SPA_DVAS_PER_BP - 1);
|
||||
zp.zp_gang_copies =
|
||||
MIN(zp.zp_gang_copies,
|
||||
SPA_DVAS_PER_BP - 1);
|
||||
}
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
} else if (DRR_WRITE_COMPRESSED(drrw)) {
|
||||
|
||||
+28
-2
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
|
||||
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
|
||||
#endif /* _KERNEL */
|
||||
|
||||
static char *
|
||||
rt_name(dnode_t *dn, const char *name)
|
||||
{
|
||||
struct objset *os = dn->dn_objset;
|
||||
|
||||
return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
|
||||
spa_name(os->os_spa),
|
||||
(u_longlong_t)(os->os_dsl_dataset ?
|
||||
os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
|
||||
(u_longlong_t)dn->dn_object,
|
||||
name));
|
||||
}
|
||||
|
||||
static int
|
||||
dbuf_compare(const void *x1, const void *x2)
|
||||
{
|
||||
@@ -2436,8 +2449,10 @@ done:
|
||||
{
|
||||
int txgoff = tx->tx_txg & TXG_MASK;
|
||||
if (dn->dn_free_ranges[txgoff] == NULL) {
|
||||
dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
|
||||
ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
dn->dn_free_ranges[txgoff] =
|
||||
zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
|
||||
}
|
||||
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
|
||||
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
|
||||
@@ -2559,6 +2574,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
|
||||
error = 0;
|
||||
epb = dn->dn_phys->dn_nblkptr;
|
||||
data = dn->dn_phys->dn_blkptr;
|
||||
if (dn->dn_dbuf != NULL)
|
||||
rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
|
||||
else if (dmu_objset_ds(dn->dn_objset) != NULL)
|
||||
rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
|
||||
RW_READER, FTAG);
|
||||
} else {
|
||||
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
|
||||
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
|
||||
@@ -2663,6 +2683,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
|
||||
if (db != NULL) {
|
||||
rw_exit(&db->db_rwlock);
|
||||
dbuf_rele(db, FTAG);
|
||||
} else {
|
||||
if (dn->dn_dbuf != NULL)
|
||||
rw_exit(&dn->dn_dbuf->db_rwlock);
|
||||
else if (dmu_objset_ds(dn->dn_objset) != NULL)
|
||||
rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
|
||||
FTAG);
|
||||
}
|
||||
|
||||
return (error);
|
||||
|
||||
@@ -235,6 +235,9 @@ static uint_t zfs_resilver_defer_percent = 10;
|
||||
#define DSL_SCAN_IS_SCRUB(scn) \
|
||||
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB)
|
||||
|
||||
#define DSL_SCAN_IS_RESILVER(scn) \
|
||||
((scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
|
||||
|
||||
/*
|
||||
* Enable/disable the processing of the free_bpobj object.
|
||||
*/
|
||||
@@ -1169,7 +1172,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
|
||||
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
|
||||
scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
|
||||
|
||||
if (scn->scn_phys.scn_min_txg) {
|
||||
if (DSL_SCAN_IS_RESILVER(scn)) {
|
||||
nvlist_t *aux = fnvlist_alloc();
|
||||
fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
|
||||
"healing");
|
||||
|
||||
+57
-47
@@ -370,6 +370,16 @@ static metaslab_stats_t metaslab_stats = {
|
||||
#define METASLABSTAT_BUMP(stat) \
|
||||
atomic_inc_64(&metaslab_stats.stat.value.ui64);
|
||||
|
||||
char *
|
||||
metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
|
||||
{
|
||||
return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
|
||||
spa_name(mg->mg_vd->vdev_spa),
|
||||
(u_longlong_t)mg->mg_vd->vdev_guid,
|
||||
(u_longlong_t)ms->ms_id,
|
||||
name));
|
||||
}
|
||||
|
||||
|
||||
static kstat_t *metaslab_ksp;
|
||||
|
||||
@@ -969,14 +979,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
metaslab_weight_from_range_tree(msp));
|
||||
metaslab_weight(msp, B_TRUE) &
|
||||
~METASLAB_ACTIVE_MASK);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
msp = mga->mga_secondary;
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
metaslab_weight_from_range_tree(msp));
|
||||
metaslab_weight(msp, B_TRUE) &
|
||||
~METASLAB_ACTIVE_MASK);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
}
|
||||
@@ -2755,30 +2767,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
|
||||
zfs_range_seg_type_t type =
|
||||
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
|
||||
|
||||
ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
|
||||
shift);
|
||||
ms->ms_allocatable = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
|
||||
NULL, start, shift);
|
||||
ms->ms_allocating[t] = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME,
|
||||
metaslab_rt_name(mg, ms, "ms_allocating"));
|
||||
}
|
||||
ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_freeing = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
|
||||
ms->ms_freed = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
ms->ms_defer[t] = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
|
||||
}
|
||||
ms->ms_checkpointing =
|
||||
zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_unflushed_allocs =
|
||||
zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_checkpointing = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
|
||||
ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));
|
||||
|
||||
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
|
||||
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
|
||||
mrap->mra_floor_shift = metaslab_by_size_min_shift;
|
||||
ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
|
||||
type, mrap, start, shift);
|
||||
ms->ms_unflushed_frees = zfs_range_tree_create_flags(
|
||||
&metaslab_rt_ops, type, mrap, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));
|
||||
|
||||
ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_trim = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));
|
||||
|
||||
metaslab_group_add(mg, ms);
|
||||
metaslab_set_fragmentation(ms, B_FALSE);
|
||||
@@ -3752,7 +3777,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
|
||||
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
|
||||
&start, &shift);
|
||||
|
||||
condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
|
||||
condense_tree = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME,
|
||||
metaslab_rt_name(msp->ms_group, msp, "condense_tree"));
|
||||
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
zfs_range_tree_walk(msp->ms_defer[t],
|
||||
@@ -3809,8 +3837,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
|
||||
* followed by FREES (due to space_map_write() in metaslab_sync()) for
|
||||
* sync pass 1.
|
||||
*/
|
||||
zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift,
|
||||
ZFS_RT_F_DYN_NAME,
|
||||
metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
|
||||
zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
|
||||
space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
|
||||
space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
|
||||
@@ -5073,29 +5103,16 @@ next:
|
||||
|
||||
/*
|
||||
* We were unable to allocate from this metaslab so determine
|
||||
* a new weight for this metaslab. Now that we have loaded
|
||||
* the metaslab we can provide a better hint to the metaslab
|
||||
* selector.
|
||||
*
|
||||
* For space-based metaslabs, we use the maximum block size.
|
||||
* This information is only available when the metaslab
|
||||
* is loaded and is more accurate than the generic free
|
||||
* space weight that was calculated by metaslab_weight().
|
||||
* This information allows us to quickly compare the maximum
|
||||
* available allocation in the metaslab to the allocation
|
||||
* size being requested.
|
||||
*
|
||||
* For segment-based metaslabs, determine the new weight
|
||||
* based on the highest bucket in the range tree. We
|
||||
* explicitly use the loaded segment weight (i.e. the range
|
||||
* tree histogram) since it contains the space that is
|
||||
* currently available for allocation and is accurate
|
||||
* even within a sync pass.
|
||||
* a new weight for this metaslab. The weight was last
|
||||
* recalculated either when we loaded it (if this is the first
|
||||
* TXG it's been loaded in), or the last time a txg was synced
|
||||
* out.
|
||||
*/
|
||||
uint64_t weight;
|
||||
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
|
||||
weight = metaslab_largest_allocatable(msp);
|
||||
WEIGHT_SET_SPACEBASED(weight);
|
||||
metaslab_set_fragmentation(msp, B_TRUE);
|
||||
weight = metaslab_space_weight(msp) &
|
||||
~METASLAB_ACTIVE_MASK;
|
||||
} else {
|
||||
weight = metaslab_weight_from_range_tree(msp);
|
||||
}
|
||||
@@ -5107,13 +5124,6 @@ next:
|
||||
* For the case where we use the metaslab that is
|
||||
* active for another allocator we want to make
|
||||
* sure that we retain the activation mask.
|
||||
*
|
||||
* Note that we could attempt to use something like
|
||||
* metaslab_recalculate_weight_and_sort() that
|
||||
* retains the activation mask here. That function
|
||||
* uses metaslab_weight() to set the weight though
|
||||
* which is not as accurate as the calculations
|
||||
* above.
|
||||
*/
|
||||
weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
|
||||
metaslab_group_sort(mg, msp, weight);
|
||||
|
||||
@@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
|
||||
ml->ml_num_sublists = num;
|
||||
ml->ml_index_func = index_func;
|
||||
|
||||
ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
|
||||
ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) *
|
||||
ml->ml_num_sublists, KM_SLEEP);
|
||||
|
||||
ASSERT3P(ml->ml_sublists, !=, NULL);
|
||||
@@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml)
|
||||
}
|
||||
|
||||
ASSERT3P(ml->ml_sublists, !=, NULL);
|
||||
kmem_free(ml->ml_sublists,
|
||||
vmem_free(ml->ml_sublists,
|
||||
sizeof (multilist_sublist_t) * ml->ml_num_sublists);
|
||||
|
||||
ml->ml_num_sublists = 0;
|
||||
|
||||
+68
-30
@@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
|
||||
ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
|
||||
zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
|
||||
|
||||
zfs_range_tree_t *
|
||||
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
|
||||
static zfs_range_tree_t *
|
||||
zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
|
||||
uint64_t gap)
|
||||
uint64_t gap, uint64_t flags, const char *name)
|
||||
{
|
||||
zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
|
||||
|
||||
@@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
|
||||
|
||||
rt->rt_ops = ops;
|
||||
rt->rt_gap = gap;
|
||||
rt->rt_flags = flags;
|
||||
rt->rt_name = name;
|
||||
rt->rt_arg = arg;
|
||||
rt->rt_type = type;
|
||||
rt->rt_start = start;
|
||||
@@ -247,11 +249,30 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
|
||||
return (rt);
|
||||
}
|
||||
|
||||
/*
 * Create a range tree with the given gap value.
 *
 * Thin wrapper around zfs_range_tree_create_impl(): all arguments are
 * forwarded unchanged, and the impl's trailing flags/name parameters are
 * passed as 0 and NULL, so the resulting tree carries no flags and no name.
 */
zfs_range_tree_t *
|
||||
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
|
||||
uint64_t gap)
|
||||
{
|
||||
return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
|
||||
0, NULL));
|
||||
}
|
||||
|
||||
zfs_range_tree_t *
|
||||
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
|
||||
{
|
||||
return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
|
||||
return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
|
||||
0, NULL));
|
||||
}
|
||||
|
||||
/*
 * Create a range tree with caller-supplied flags and name and a gap of 0.
 *
 * Thin wrapper around zfs_range_tree_create_impl(), which stores "flags" in
 * rt_flags and the "name" pointer (not a copy) in rt_name.  When
 * ZFS_RT_F_DYN_NAME is set in flags, zfs_range_tree_destroy() releases a
 * non-NULL name with kmem_strfree(), so in that case the string passed here
 * is freed together with the tree.
 */
zfs_range_tree_t *
|
||||
zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
|
||||
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
|
||||
uint64_t flags, const char *name)
|
||||
{
|
||||
return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
|
||||
flags, name));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
|
||||
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
|
||||
|
||||
if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
|
||||
kmem_strfree((char *)(uintptr_t)rt->rt_name);
|
||||
|
||||
zfs_btree_destroy(&rt->rt_root);
|
||||
kmem_free(rt, sizeof (*rt));
|
||||
}
|
||||
@@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
|
||||
int64_t delta)
|
||||
{
|
||||
if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
|
||||
zfs_panic_recover("zfs: attempting to decrease fill to or "
|
||||
"below 0; probable double remove in segment [%llx:%llx]",
|
||||
zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
|
||||
"or below 0; probable double remove in segment [%llx:%llx]",
|
||||
ZFS_RT_NAME(rt),
|
||||
(longlong_t)zfs_rs_get_start(rs, rt),
|
||||
(longlong_t)zfs_rs_get_end(rs, rt));
|
||||
}
|
||||
if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
|
||||
zfs_rs_get_start(rs, rt)) {
|
||||
zfs_panic_recover("zfs: attempting to increase fill beyond "
|
||||
"max; probable double add in segment [%llx:%llx]",
|
||||
zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
|
||||
"beyond max; probable double add in segment [%llx:%llx]",
|
||||
ZFS_RT_NAME(rt),
|
||||
(longlong_t)zfs_rs_get_start(rs, rt),
|
||||
(longlong_t)zfs_rs_get_end(rs, rt));
|
||||
}
|
||||
@@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
|
||||
* the normal code paths.
|
||||
*/
|
||||
if (rs != NULL) {
|
||||
if (gap == 0) {
|
||||
zfs_panic_recover("zfs: adding existent segment to "
|
||||
"range tree (offset=%llx size=%llx)",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
uint64_t rstart = zfs_rs_get_start(rs, rt);
|
||||
uint64_t rend = zfs_rs_get_end(rs, rt);
|
||||
if (gap == 0) {
|
||||
zfs_panic_recover("zfs: rt=%s: adding segment "
|
||||
"(offset=%llx size=%llx) overlapping with existing "
|
||||
"one (offset=%llx size=%llx)",
|
||||
ZFS_RT_NAME(rt),
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rstart, (longlong_t)(rend - rstart));
|
||||
return;
|
||||
}
|
||||
if (rstart <= start && rend >= end) {
|
||||
zfs_range_tree_adjust_fill(rt, rs, fill);
|
||||
return;
|
||||
@@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
zfs_range_seg_t *rs;
|
||||
zfs_range_seg_max_t rsearch, rs_tmp;
|
||||
uint64_t end = start + size;
|
||||
uint64_t rstart, rend;
|
||||
boolean_t left_over, right_over;
|
||||
|
||||
VERIFY3U(size, !=, 0);
|
||||
@@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
|
||||
/* Make sure we completely overlap with someone */
|
||||
if (rs == NULL) {
|
||||
zfs_panic_recover("zfs: removing nonexistent segment from "
|
||||
"range tree (offset=%llx size=%llx)",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
|
||||
"from range tree (offset=%llx size=%llx)",
|
||||
ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
|
||||
rstart = zfs_rs_get_start(rs, rt);
|
||||
rend = zfs_rs_get_end(rs, rt);
|
||||
|
||||
/*
|
||||
* Range trees with gap support must only remove complete segments
|
||||
* from the tree. This allows us to maintain accurate fill accounting
|
||||
@@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
if (rt->rt_gap != 0) {
|
||||
if (do_fill) {
|
||||
if (zfs_rs_get_fill(rs, rt) == size) {
|
||||
start = zfs_rs_get_start(rs, rt);
|
||||
end = zfs_rs_get_end(rs, rt);
|
||||
start = rstart;
|
||||
end = rend;
|
||||
size = end - start;
|
||||
} else {
|
||||
zfs_range_tree_adjust_fill(rt, rs, -size);
|
||||
return;
|
||||
}
|
||||
} else if (zfs_rs_get_start(rs, rt) != start ||
|
||||
zfs_rs_get_end(rs, rt) != end) {
|
||||
zfs_panic_recover("zfs: freeing partial segment of "
|
||||
"gap tree (offset=%llx size=%llx) of "
|
||||
} else if (rstart != start || rend != end) {
|
||||
zfs_panic_recover("zfs: rt=%s: freeing partial segment "
|
||||
"of gap tree (offset=%llx size=%llx) of "
|
||||
"(offset=%llx size=%llx)",
|
||||
ZFS_RT_NAME(rt),
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)zfs_rs_get_start(rs, rt),
|
||||
(longlong_t)zfs_rs_get_end(rs, rt) -
|
||||
zfs_rs_get_start(rs, rt));
|
||||
(longlong_t)rstart, (longlong_t)(rend - rstart));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
|
||||
VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
|
||||
if (!(rstart <= start && rend >= end)) {
|
||||
panic("zfs: rt=%s: removing segment "
|
||||
"(offset=%llx size=%llx) not completely overlapped by "
|
||||
"existing one (offset=%llx size=%llx)",
|
||||
ZFS_RT_NAME(rt),
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rstart, (longlong_t)(rend - rstart));
|
||||
return;
|
||||
}
|
||||
|
||||
left_over = (zfs_rs_get_start(rs, rt) != start);
|
||||
right_over = (zfs_rs_get_end(rs, rt) != end);
|
||||
left_over = (rstart != start);
|
||||
right_over = (rend != end);
|
||||
|
||||
zfs_range_tree_stat_decr(rt, rs);
|
||||
|
||||
|
||||
+3
-18
@@ -1231,29 +1231,14 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
|
||||
spa->spa_proc, zio_taskq_basedc, flags);
|
||||
} else {
|
||||
#endif
|
||||
pri_t pri = maxclsyspri;
|
||||
/*
|
||||
* The write issue taskq can be extremely CPU
|
||||
* intensive. Run it at slightly less important
|
||||
* priority than the other taskqs.
|
||||
*
|
||||
* Under Linux and FreeBSD this means incrementing
|
||||
* the priority value as opposed to platforms like
|
||||
* illumos where it should be decremented.
|
||||
*
|
||||
* On FreeBSD, if priorities divided by four (RQ_PPQ)
|
||||
* are equal then a difference between them is
|
||||
* insignificant.
|
||||
*/
|
||||
if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
|
||||
#if defined(__linux__)
|
||||
pri++;
|
||||
#elif defined(__FreeBSD__)
|
||||
pri += 4;
|
||||
#else
|
||||
#error "unknown OS"
|
||||
#endif
|
||||
}
|
||||
const pri_t pri = (t == ZIO_TYPE_WRITE &&
|
||||
q == ZIO_TASKQ_ISSUE) ?
|
||||
wtqclsyspri : maxclsyspri;
|
||||
tq = taskq_create_proc(name, value, pri, 50,
|
||||
INT_MAX, spa->spa_proc, flags);
|
||||
#ifdef HAVE_SYSDC
|
||||
|
||||
+48
-6
@@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
|
||||
vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
|
||||
}
|
||||
|
||||
/*
 * Build a descriptive name for a range tree that belongs to a vdev.
 *
 * The result has the form "{spa=<pool name> vdev_guid=<guid> <name>}",
 * where the pool name comes from spa_name(vd->vdev_spa) and "name" is the
 * caller's label for the tree.  The string is produced by kmem_asprintf();
 * the callers visible in this change pass it to
 * zfs_range_tree_create_flags() together with ZFS_RT_F_DYN_NAME.
 */
char *
|
||||
vdev_rt_name(vdev_t *vd, const char *name)
|
||||
{
|
||||
return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
|
||||
spa_name(vd->vdev_spa),
|
||||
(u_longlong_t)vd->vdev_guid,
|
||||
name));
|
||||
}
|
||||
|
||||
/*
 * Build a descriptive name for one of a vdev's DTL range trees.
 *
 * The result has the form
 * "{spa=<pool name> vdev_guid=<guid> <name>[<dtl_type>]}", with dtl_type
 * printed as a decimal integer.  The string is produced by kmem_asprintf().
 */
static char *
|
||||
vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
|
||||
{
|
||||
return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
|
||||
spa_name(vd->vdev_spa),
|
||||
(u_longlong_t)vd->vdev_guid,
|
||||
name,
|
||||
dtl_type));
|
||||
}
|
||||
|
||||
/*
|
||||
* Virtual device management.
|
||||
*/
|
||||
@@ -540,6 +559,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
|
||||
|
||||
pvd->vdev_child = newchild;
|
||||
pvd->vdev_child[id] = cvd;
|
||||
pvd->vdev_nonrot &= cvd->vdev_nonrot;
|
||||
|
||||
cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
|
||||
ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
|
||||
@@ -678,8 +698,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
|
||||
rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
|
||||
ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));
|
||||
|
||||
/*
|
||||
* Initialize rate limit structs for events. We rate limit ZIO delay
|
||||
@@ -733,8 +754,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
vd->vdev_dtl[t] = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
|
||||
}
|
||||
|
||||
txg_list_create(&vd->vdev_ms_list, spa,
|
||||
@@ -1361,6 +1383,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
|
||||
mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
|
||||
mvd->vdev_state = cvd->vdev_state;
|
||||
mvd->vdev_crtxg = cvd->vdev_crtxg;
|
||||
mvd->vdev_nonrot = cvd->vdev_nonrot;
|
||||
|
||||
vdev_remove_child(pvd, cvd);
|
||||
vdev_add_child(pvd, mvd);
|
||||
@@ -1567,6 +1590,18 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
||||
vd->vdev_ms = mspp;
|
||||
vd->vdev_ms_count = newc;
|
||||
|
||||
/*
|
||||
* Weighting algorithms can depend on the number of metaslabs in the
|
||||
* vdev. In order to ensure that all weights are correct at all times,
|
||||
* we need to recalculate here.
|
||||
*/
|
||||
for (uint64_t m = 0; m < oldc; m++) {
|
||||
metaslab_t *msp = vd->vdev_ms[m];
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_recalculate_weight_and_sort(msp);
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
|
||||
for (uint64_t m = oldc; m < newc; m++) {
|
||||
uint64_t object = 0;
|
||||
/*
|
||||
@@ -1948,6 +1983,10 @@ vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
|
||||
taskq_wait(tq);
|
||||
for (int c = 0; c < children; c++) {
|
||||
vdev_t *cvd = vd->vdev_child[c];
|
||||
|
||||
if (open_func(cvd) == B_FALSE ||
|
||||
cvd->vdev_state <= VDEV_STATE_FAULTED)
|
||||
continue;
|
||||
vd->vdev_nonrot &= cvd->vdev_nonrot;
|
||||
}
|
||||
|
||||
@@ -3419,7 +3458,9 @@ vdev_dtl_load(vdev_t *vd)
|
||||
return (error);
|
||||
ASSERT(vd->vdev_dtl_sm != NULL);
|
||||
|
||||
rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
rt = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt"));
|
||||
error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
|
||||
if (error == 0) {
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
@@ -3567,7 +3608,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
|
||||
ASSERT(vd->vdev_dtl_sm != NULL);
|
||||
}
|
||||
|
||||
rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync"));
|
||||
|
||||
mutex_enter(&vd->vdev_dtl_lock);
|
||||
zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
|
||||
|
||||
@@ -2482,6 +2482,7 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
|
||||
*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
|
||||
|
||||
vds->vds_draid_vdev = tvd;
|
||||
vd->vdev_nonrot = tvd->vdev_nonrot;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg)
|
||||
|
||||
abd_t *deadbeef = vdev_initialize_block_alloc();
|
||||
|
||||
vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
vd->vdev_initialize_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree"));
|
||||
|
||||
for (uint64_t i = 0; !vd->vdev_detached &&
|
||||
i < vd->vdev_top->vdev_ms_count; i++) {
|
||||
|
||||
@@ -1050,6 +1050,16 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
|
||||
mutex_exit(&vq->vq_lock);
|
||||
}
|
||||
|
||||
/*
 * Report whether the pool currently holds more dirty data than the
 * async-write "active min" threshold.
 *
 * The threshold is zfs_vdev_async_write_active_min_dirty_percent percent of
 * zfs_dirty_data_max; the function returns true when the pool's
 * dp_dirty_total is strictly greater than that number of bytes.
 */
boolean_t
|
||||
vdev_queue_pool_busy(spa_t *spa)
|
||||
{
|
||||
dsl_pool_t *dp = spa_get_dsl(spa);
|
||||
uint64_t min_bytes = zfs_dirty_data_max *
|
||||
zfs_vdev_async_write_active_min_dirty_percent / 100;
|
||||
|
||||
return (dp->dp_dirty_total > min_bytes);
|
||||
}
|
||||
|
||||
/*
|
||||
* As these two methods are only used for load calculations we're not
|
||||
* concerned if we get an incorrect value on 32bit platforms due to lack of
|
||||
|
||||
@@ -4556,8 +4556,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
|
||||
uint64_t shift, start;
|
||||
zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
|
||||
raidvd, msp, &start, &shift);
|
||||
zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
zfs_range_tree_t *rt = zfs_range_tree_create_flags(
|
||||
NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
|
||||
metaslab_rt_name(msp->ms_group, msp,
|
||||
"spa_raidz_expand_thread:rt"));
|
||||
zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
|
||||
zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
|
||||
rt);
|
||||
|
||||
@@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg)
|
||||
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
|
||||
vr->vr_top_vdev = vd;
|
||||
vr->vr_scan_msp = NULL;
|
||||
vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
|
||||
0, 0);
|
||||
vr->vr_scan_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree"));
|
||||
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
|
||||
+18
-12
@@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd)
|
||||
spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
|
||||
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
|
||||
svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
svr->svr_allocd_segs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs"));
|
||||
svr->svr_vdev_id = vd->vdev_id;
|
||||
|
||||
for (int i = 0; i < TXG_SIZE; i++) {
|
||||
svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
svr->svr_frees[i] = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees"));
|
||||
list_create(&svr->svr_new_segments[i],
|
||||
sizeof (vdev_indirect_mapping_entry_t),
|
||||
offsetof(vdev_indirect_mapping_entry_t, vime_node));
|
||||
@@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
|
||||
* relative to the start of the range to be copied (i.e. relative to the
|
||||
* local variable "start").
|
||||
*/
|
||||
zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
|
||||
ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs"));
|
||||
|
||||
zfs_btree_index_t where;
|
||||
zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
|
||||
@@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
|
||||
* allocated segments that we are copying. We may also be copying
|
||||
* free segments (of up to vdev_removal_max_span bytes).
|
||||
*/
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs"));
|
||||
for (;;) {
|
||||
zfs_range_tree_t *rt = svr->svr_allocd_segs;
|
||||
zfs_range_seg_t *rs = zfs_range_tree_first(rt);
|
||||
@@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg)
|
||||
vca.vca_read_error_bytes = 0;
|
||||
vca.vca_write_error_bytes = 0;
|
||||
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs"));
|
||||
|
||||
mutex_enter(&svr->svr_lock);
|
||||
|
||||
@@ -1894,8 +1899,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
|
||||
vdev_indirect_mapping_max_offset(vim));
|
||||
}
|
||||
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
zfs_range_tree_t *segs = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME,
|
||||
vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs"));
|
||||
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
|
||||
metaslab_t *msp = vd->vdev_ms[msi];
|
||||
|
||||
|
||||
+17
-7
@@ -902,7 +902,9 @@ vdev_trim_thread(void *arg)
|
||||
ta.trim_vdev = vd;
|
||||
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
|
||||
ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
|
||||
ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
ta.trim_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
|
||||
ta.trim_type = TRIM_TYPE_MANUAL;
|
||||
ta.trim_flags = 0;
|
||||
|
||||
@@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg)
|
||||
* Allocate an empty range tree which is swapped in
|
||||
* for the existing ms_trim tree while it is processed.
|
||||
*/
|
||||
trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
|
||||
NULL, 0, 0);
|
||||
trim_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME,
|
||||
vdev_rt_name(vd, "autotrim_tree"));
|
||||
zfs_range_tree_swap(&msp->ms_trim, &trim_tree);
|
||||
ASSERT(zfs_range_tree_is_empty(msp->ms_trim));
|
||||
|
||||
@@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg)
|
||||
if (!cvd->vdev_ops->vdev_op_leaf)
|
||||
continue;
|
||||
|
||||
ta->trim_tree = zfs_range_tree_create(NULL,
|
||||
ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
ta->trim_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME,
|
||||
vdev_rt_name(vd, "autotrim_tree"));
|
||||
zfs_range_tree_walk(trim_tree,
|
||||
vdev_trim_range_add, ta);
|
||||
}
|
||||
@@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg)
|
||||
vd->vdev_trim_secure = 0;
|
||||
|
||||
ta.trim_vdev = vd;
|
||||
ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
ta.trim_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
|
||||
ta.trim_type = TRIM_TYPE_MANUAL;
|
||||
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
|
||||
ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
|
||||
@@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
|
||||
ASSERT(!vd->vdev_top->vdev_rz_expanding);
|
||||
|
||||
ta.trim_vdev = vd;
|
||||
ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
|
||||
ta.trim_tree = zfs_range_tree_create_flags(
|
||||
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
|
||||
ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
|
||||
ta.trim_type = TRIM_TYPE_SIMPLE;
|
||||
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
|
||||
ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
|
||||
|
||||
+40
-29
@@ -32,9 +32,6 @@
|
||||
#include <sys/blake3.h>
|
||||
#include <sys/sha2.h>
|
||||
|
||||
/* limit benchmarking to max 256KiB, when EdonR is slower than this: */
|
||||
#define LIMIT_PERF_MBS 300
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
const char *impl;
|
||||
@@ -52,9 +49,15 @@ typedef struct {
|
||||
zio_checksum_tmpl_free_t *(free);
|
||||
} chksum_stat_t;
|
||||
|
||||
#define AT_STARTUP 0
|
||||
#define AT_BENCHMARK 1
|
||||
#define AT_DONE 2
|
||||
|
||||
static chksum_stat_t *chksum_stat_data = 0;
|
||||
static int chksum_stat_cnt = 0;
|
||||
static kstat_t *chksum_kstat = NULL;
|
||||
static int chksum_stat_limit = AT_STARTUP;
|
||||
static int chksum_stat_cnt = 0;
|
||||
static void chksum_benchmark(void);
|
||||
|
||||
/*
|
||||
* Sample output on i3-1005G1 System:
|
||||
@@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data)
|
||||
static void *
|
||||
chksum_kstat_addr(kstat_t *ksp, loff_t n)
|
||||
{
|
||||
/* full benchmark */
|
||||
chksum_benchmark();
|
||||
|
||||
if (n < chksum_stat_cnt)
|
||||
ksp->ks_private = (void *)(chksum_stat_data + n);
|
||||
else
|
||||
@@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
|
||||
kpreempt_enable();
|
||||
|
||||
run_bw = size * run_count * NANOSEC;
|
||||
run_bw /= run_time_ns; /* B/s */
|
||||
run_bw /= run_time_ns; /* B/s */
|
||||
*result = run_bw/1024/1024; /* MiB/s */
|
||||
}
|
||||
|
||||
#define LIMIT_INIT 0
|
||||
#define LIMIT_NEEDED 1
|
||||
#define LIMIT_NOLIMIT 2
|
||||
|
||||
static void
|
||||
chksum_benchit(chksum_stat_t *cs)
|
||||
{
|
||||
abd_t *abd;
|
||||
void *ctx = 0;
|
||||
void *salt = &cs->salt.zcs_bytes;
|
||||
static int chksum_stat_limit = LIMIT_INIT;
|
||||
|
||||
memset(salt, 0, sizeof (cs->salt.zcs_bytes));
|
||||
if (cs->init)
|
||||
ctx = cs->init(&cs->salt);
|
||||
|
||||
/* benchmarks in startup mode */
|
||||
if (chksum_stat_limit == AT_STARTUP) {
|
||||
abd = abd_alloc_linear(1<<18, B_FALSE);
|
||||
chksum_run(cs, abd, ctx, 5, &cs->bs256k);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* allocate test memory via abd linear interface */
|
||||
abd = abd_alloc_linear(1<<20, B_FALSE);
|
||||
|
||||
/* benchmarks when requested */
|
||||
chksum_run(cs, abd, ctx, 1, &cs->bs1k);
|
||||
chksum_run(cs, abd, ctx, 2, &cs->bs4k);
|
||||
chksum_run(cs, abd, ctx, 3, &cs->bs16k);
|
||||
chksum_run(cs, abd, ctx, 4, &cs->bs64k);
|
||||
chksum_run(cs, abd, ctx, 5, &cs->bs256k);
|
||||
|
||||
/* check if we ran on a slow cpu */
|
||||
if (chksum_stat_limit == LIMIT_INIT) {
|
||||
if (cs->bs1k < LIMIT_PERF_MBS) {
|
||||
chksum_stat_limit = LIMIT_NEEDED;
|
||||
} else {
|
||||
chksum_stat_limit = LIMIT_NOLIMIT;
|
||||
}
|
||||
}
|
||||
|
||||
/* skip benchmarks >= 1MiB when the CPU is too slow */
|
||||
if (chksum_stat_limit == LIMIT_NEEDED)
|
||||
goto abort;
|
||||
|
||||
chksum_run(cs, abd, ctx, 6, &cs->bs1m);
|
||||
abd_free(abd);
|
||||
|
||||
@@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs)
|
||||
chksum_run(cs, abd, ctx, 7, &cs->bs4m);
|
||||
chksum_run(cs, abd, ctx, 8, &cs->bs16m);
|
||||
|
||||
abort:
|
||||
done:
|
||||
abd_free(abd);
|
||||
|
||||
/* free up temp memory */
|
||||
@@ -243,7 +238,6 @@ chksum_benchmark(void)
|
||||
/* we need the benchmark only for the kernel module */
|
||||
return;
|
||||
#endif
|
||||
|
||||
chksum_stat_t *cs;
|
||||
uint64_t max;
|
||||
uint32_t id, cbid = 0, id_save;
|
||||
@@ -251,8 +245,14 @@ chksum_benchmark(void)
|
||||
const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
|
||||
const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
|
||||
|
||||
/* benchmarks are done */
|
||||
if (chksum_stat_limit == AT_DONE)
|
||||
return;
|
||||
|
||||
|
||||
/* count implementations */
|
||||
chksum_stat_cnt = 2;
|
||||
chksum_stat_cnt = 1; /* edonr */
|
||||
chksum_stat_cnt += 1; /* skein */
|
||||
chksum_stat_cnt += sha256->getcnt();
|
||||
chksum_stat_cnt += sha512->getcnt();
|
||||
chksum_stat_cnt += blake3->getcnt();
|
||||
@@ -332,6 +332,17 @@ chksum_benchmark(void)
|
||||
}
|
||||
}
|
||||
blake3->setid(id_save);
|
||||
|
||||
switch (chksum_stat_limit) {
|
||||
case AT_STARTUP:
|
||||
/* next time we want a full benchmark */
|
||||
chksum_stat_limit = AT_BENCHMARK;
|
||||
break;
|
||||
case AT_BENCHMARK:
|
||||
/* no further benchmarks */
|
||||
chksum_stat_limit = AT_DONE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -341,7 +352,7 @@ chksum_init(void)
|
||||
blake3_per_cpu_ctx_init();
|
||||
#endif
|
||||
|
||||
/* Benchmark supported implementations */
|
||||
/* 256KiB benchmark */
|
||||
chksum_benchmark();
|
||||
|
||||
/* Install kstats for all implementations */
|
||||
|
||||
+144
-8
@@ -67,13 +67,14 @@
|
||||
int zfs_bclone_enabled = 1;
|
||||
|
||||
/*
|
||||
* When set zfs_clone_range() waits for dirty data to be written to disk.
|
||||
* This allows the clone operation to reliably succeed when a file is modified
|
||||
* and then immediately cloned. For small files this may be slower than making
|
||||
* a copy of the file and is therefore not the default. However, in certain
|
||||
* scenarios this behavior may be desirable so a tunable is provided.
|
||||
* When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
|
||||
* data to be written to disk before proceeding. This ensures that the clone
|
||||
* operation reliably succeeds, even if a file is modified and then immediately
|
||||
* cloned. Note that for small files this may be slower than simply copying
|
||||
* the file. When set to 0 the clone operation will immediately fail if it
|
||||
* encounters any dirty blocks. By default waiting is enabled.
|
||||
*/
|
||||
int zfs_bclone_wait_dirty = 0;
|
||||
int zfs_bclone_wait_dirty = 1;
|
||||
|
||||
/*
|
||||
* Enable Direct I/O. If this setting is 0, then all I/O requests will be
|
||||
@@ -108,9 +109,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
||||
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
|
||||
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
|
||||
return (error);
|
||||
atomic_inc_32(&zp->z_sync_writes_cnt);
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
atomic_dec_32(&zp->z_sync_writes_cnt);
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
}
|
||||
return (error);
|
||||
@@ -1058,6 +1057,143 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rewrite a range of a file as-is without modification.
|
||||
*
|
||||
* IN: zp - znode of file to be rewritten.
|
||||
* off - Offset of the range to rewrite.
|
||||
* len - Length of the range to rewrite.
|
||||
* flags - Random rewrite parameters.
|
||||
* arg - flags-specific argument.
|
||||
*
|
||||
* RETURN: 0 if success
|
||||
* error code if failure
|
||||
*/
|
||||
int
|
||||
zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
|
||||
uint64_t arg)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (flags != 0 || arg != 0)
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
||||
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
|
||||
return (error);
|
||||
|
||||
if (zfs_is_readonly(zfsvfs)) {
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (SET_ERROR(EROFS));
|
||||
}
|
||||
|
||||
if (off >= zp->z_size) {
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (0);
|
||||
}
|
||||
if (len == 0 || len > zp->z_size - off)
|
||||
len = zp->z_size - off;
|
||||
|
||||
/* Flush any mmap()'d data to disk */
|
||||
if (zn_has_cached_data(zp, off, off + len - 1))
|
||||
zn_flush_cached_data(zp, B_TRUE);
|
||||
|
||||
zfs_locked_range_t *lr;
|
||||
lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
|
||||
|
||||
const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
|
||||
const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
|
||||
const uint64_t projid = zp->z_projid;
|
||||
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
|
||||
DB_DNODE_ENTER(db);
|
||||
dnode_t *dn = DB_DNODE(db);
|
||||
|
||||
uint64_t n, noff = off, nr = 0, nw = 0;
|
||||
while (len > 0) {
|
||||
/*
|
||||
* Rewrite only actual data, skipping any holes. This might
|
||||
* be inaccurate for dirty files, but we don't really care.
|
||||
*/
|
||||
if (noff == off) {
|
||||
/* Find next data in the file. */
|
||||
error = dnode_next_offset(dn, 0, &noff, 1, 1, 0);
|
||||
if (error || noff >= off + len) {
|
||||
if (error == ESRCH) /* No more data. */
|
||||
error = 0;
|
||||
break;
|
||||
}
|
||||
ASSERT3U(noff, >=, off);
|
||||
len -= noff - off;
|
||||
off = noff;
|
||||
|
||||
/* Find where the data end. */
|
||||
error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff,
|
||||
1, 1, 0);
|
||||
if (error != 0)
|
||||
noff = off + len;
|
||||
}
|
||||
ASSERT3U(noff, >, off);
|
||||
|
||||
if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
|
||||
zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
|
||||
(projid != ZFS_DEFAULT_PROJID &&
|
||||
zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
|
||||
projid))) {
|
||||
error = SET_ERROR(EDQUOT);
|
||||
break;
|
||||
}
|
||||
|
||||
n = MIN(MIN(len, noff - off),
|
||||
DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz));
|
||||
|
||||
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_write_by_dnode(tx, dn, off, n);
|
||||
error = dmu_tx_assign(tx, DMU_TX_WAIT);
|
||||
if (error) {
|
||||
dmu_tx_abort(tx);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Mark all dbufs within range as dirty to trigger rewrite. */
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs;
|
||||
error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
|
||||
&numbufs, &dbp, DMU_READ_PREFETCH);
|
||||
if (error) {
|
||||
dmu_tx_commit(tx);
|
||||
break;
|
||||
}
|
||||
for (int i = 0; i < numbufs; i++) {
|
||||
nr += dbp[i]->db_size;
|
||||
if (dmu_buf_is_dirty(dbp[i], tx))
|
||||
continue;
|
||||
nw += dbp[i]->db_size;
|
||||
dmu_buf_will_dirty(dbp[i], tx);
|
||||
}
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
len -= n;
|
||||
off += n;
|
||||
|
||||
if (issig()) {
|
||||
error = SET_ERROR(EINTR);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
|
||||
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);
|
||||
|
||||
zfs_rangelock_exit(lr);
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (error);
|
||||
}
|
||||
|
||||
int
|
||||
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
|
||||
{
|
||||
|
||||
+17
-14
@@ -1691,7 +1691,7 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
|
||||
* If the previous lwb's write hasn't already completed, we also want
|
||||
* to order the completion of the lwb write zios (above, we only order
|
||||
* the completion of the lwb root zios). This is required because of
|
||||
* how we can defer the flush commands for each lwb.
|
||||
* how we can defer the flush commands for any lwb without waiters.
|
||||
*
|
||||
* When the flush commands are deferred, the previous lwb will rely on
|
||||
* this lwb to flush the vdevs written to by that previous lwb. Thus,
|
||||
@@ -1708,7 +1708,10 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
|
||||
*/
|
||||
if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
|
||||
ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
|
||||
zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
|
||||
if (list_is_empty(&prev_lwb->lwb_waiters)) {
|
||||
zio_add_child(lwb->lwb_write_zio,
|
||||
prev_lwb->lwb_write_zio);
|
||||
}
|
||||
} else {
|
||||
ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
|
||||
}
|
||||
@@ -2898,19 +2901,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
|
||||
|
||||
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
|
||||
|
||||
/*
|
||||
* Return if there's nothing to commit before we dirty the fs by
|
||||
* calling zil_create().
|
||||
*/
|
||||
if (list_is_empty(&zilog->zl_itx_commit_list))
|
||||
return;
|
||||
|
||||
list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
|
||||
list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
|
||||
offsetof(zil_commit_waiter_t, zcw_node));
|
||||
|
||||
lwb = list_tail(&zilog->zl_lwb_list);
|
||||
if (lwb == NULL) {
|
||||
/*
|
||||
* Return if there's nothing to commit before we dirty the fs.
|
||||
*/
|
||||
if (list_is_empty(&zilog->zl_itx_commit_list))
|
||||
return;
|
||||
|
||||
lwb = zil_create(zilog);
|
||||
} else {
|
||||
/*
|
||||
@@ -2938,6 +2936,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
|
||||
}
|
||||
}
|
||||
|
||||
list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
|
||||
list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
|
||||
offsetof(zil_commit_waiter_t, zcw_node));
|
||||
|
||||
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
|
||||
lr_t *lrc = &itx->itx_lr;
|
||||
uint64_t txg = lrc->lrc_txg;
|
||||
@@ -3107,7 +3109,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
|
||||
* possible, without significantly impacting the latency
|
||||
* of each individual itx.
|
||||
*/
|
||||
if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
|
||||
if (lwb->lwb_state == LWB_STATE_OPENED &&
|
||||
(!zilog->zl_parallel || zilog->zl_suspend > 0)) {
|
||||
zil_burst_done(zilog);
|
||||
list_insert_tail(ilwbs, lwb);
|
||||
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user