mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-30 21:09:38 +03:00
2a9c572059
If all label checksums are invalid on any vdev, the pool becomes unimportable. On the other hand, zdb with the -l option does not provide any useful information about why this happened. Add notifications about corrupted label checksums. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Signed-off-by: Fedor Uporov <fuporov.vstack@gmail.com> Closes #2509 Closes #12685
668 lines
16 KiB
Plaintext
668 lines
16 KiB
Plaintext
#
|
|
# This file and its contents are supplied under the terms of the
|
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
|
# You may only use this file in accordance with the terms of version
|
|
# 1.0 of the CDDL.
|
|
#
|
|
# A full copy of the text of the CDDL should have accompanied this
|
|
# source. A copy of the CDDL is also available via the Internet at
|
|
# http://www.illumos.org/license/CDDL.
|
|
#
|
|
|
|
#
|
|
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
|
# Use is subject to license terms.
|
|
# Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
|
# Copyright 2016 Nexenta Systems, Inc.
|
|
# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
|
|
# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
|
|
# Copyright (c) 2017 Datto Inc.
|
|
# Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
|
|
# Copyright 2019 Richard Elling
|
|
#
|
|
|
|
#
|
|
# Returns SCSI host number for the given disk
|
|
#
|
|
function get_scsi_host #disk
{
	typeset blkdev=$1

	# The entry name under scsi_device is ':'-separated; its first
	# field is printed as the SCSI host number.
	ls /sys/block/${blkdev}/device/scsi_device | cut -d: -f1
}
|
|
|
|
#
|
|
# Cause a scan of all scsi host adapters by default
|
|
#
|
|
# $1 optional host number
|
|
#
|
|
function scan_scsi_hosts
{
	typeset hostnum=${1}

	# The sysfs scan interface used below is only driven on Linux.
	is_linux || return 0

	if [[ -n $hostnum ]]; then
		# A specific host was requested: rescan only that one.
		log_must eval \
		    "echo /sys/class/scsi_host/host$hostnum/scan" \
		    > /dev/null
		log_must eval \
		    "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan"
	else
		# No host given: write the wildcard scan request to every
		# scsi_host entry.
		for host in /sys/class/scsi_host/host*; do
			log_must eval "echo '- - -' > $host/scan"
		done
	fi
}
|
|
|
|
#
|
|
# Wait for newly created block devices to have their minors created.
|
|
# Additional arguments can be passed to udevadm trigger, with the expected
|
|
# arguments to typically be a block device pathname. This is useful when
|
|
# waiting on a specific device to settle rather than triggering
|
|
# all devices and waiting for them all to settle.
|
|
#
|
|
# The udevadm settle timeout can be 120 or 180 seconds by default for
|
|
# some distros. If a long delay is experienced, it could be due to some
|
|
# strangeness in a malfunctioning device that isn't related to the devices
|
|
# under test. To help debug this condition, a notice is given if settle takes
|
|
# too long.
|
|
#
|
|
# Note: there is no meaningful return code if udevadm fails. Consumers
|
|
# should not expect a return code (do not call as argument to log_must)
|
|
#
|
|
function block_device_wait
{
	# Arguments (optional): device paths. On Linux they are handed to
	# "udevadm trigger"; on every platform they are then polled for
	# existence. No meaningful return code is provided.
	if is_linux; then
		udevadm trigger $* 2>/dev/null
		typeset start=$SECONDS
		udevadm settle
		typeset elapsed=$((SECONDS - start))
		# Compare arithmetically. Inside [[ ]] the '>' operator is a
		# string comparison, which would report 7 seconds as too long
		# ("7" sorts after "60") and stay silent for 100 seconds.
		if (( elapsed > 60 )); then
			log_note udevadm settle time too long: $elapsed
		fi
	elif is_freebsd; then
		if [[ ${#@} -eq 0 ]]; then
			# Do something that has to go through the geom event
			# queue to complete.
			sysctl kern.geom.conftxt >/dev/null
			return
		fi
	fi

	# Poll for the given paths to appear, but give up eventually.
	typeset -i i
	for (( i = 0; i < 5; ++i )); do
		typeset missing=false
		typeset dev
		for dev in "${@}"; do
			if ! [[ -e $dev ]]; then
				missing=true
				break
			fi
		done
		if ! $missing; then
			break
		fi
		# The pause between polls equals the number of paths given.
		sleep ${#@}
	done
}
|
|
|
|
#
|
|
# Check if the given device is physical device
|
|
#
|
|
function is_physical_device #device
{
	# Reduce the argument to a bare device name by dropping a leading
	# $DEV_DSKDIR/ and then a leading $DEV_RDSKDIR/ prefix.
	typeset name=${1#$DEV_DSKDIR/}
	name=${name#$DEV_RDSKDIR/}

	if is_linux; then
		# Must be a disk node, and the loop module's max_part
		# parameter file must exist.
		is_disk_device "$DEV_DSKDIR/$name" && \
		    [[ -f /sys/module/loop/parameters/max_part ]]
		return $?
	fi

	if is_freebsd; then
		# Must be a disk node whose name matches one of the listed
		# driver prefixes followed only by a unit number.
		is_disk_device "$DEV_DSKDIR/$name" && \
		    echo $name | egrep -q \
		    -e '^a?da[0-9]+$' \
		    -e '^md[0-9]+$' \
		    -e '^mfid[0-9]+$' \
		    -e '^nda[0-9]+$' \
		    -e '^nvd[0-9]+$' \
		    -e '^vtbd[0-9]+$'
		return $?
	fi

	# Remaining platforms: accept cXtYdZ-style names.
	echo $name | egrep "^c[0-F]+([td][0-F]+)+$" > /dev/null 2>&1
}
|
|
|
|
#
|
|
# Check if the given device is a real device (ie SCSI device)
|
|
#
|
|
function is_real_device #disk
{
	typeset blkdev=$1

	if [[ -z $blkdev ]]; then
		log_fail "No argument for disk given."
	fi

	# No check is made on other platforms; the function then returns 0.
	is_linux || return 0

	# Succeeds when lsblk's TYPE column for the device contains "disk".
	lsblk $DEV_RDSKDIR/$blkdev -o TYPE 2>/dev/null | egrep disk >/dev/null
}
|
|
|
|
#
|
|
# Check if the given device is a loop device
|
|
#
|
|
function is_loop_device #disk
{
	typeset blkdev=$1

	if [[ -z $blkdev ]]; then
		log_fail "No argument for disk given."
	fi

	# No check is made on other platforms; the function then returns 0.
	is_linux || return 0

	# Succeeds when lsblk's TYPE column for the device contains "loop".
	lsblk $DEV_RDSKDIR/$blkdev -o TYPE 2>/dev/null | egrep loop >/dev/null
}
|
|
|
|
#
|
|
# Linux:
|
|
# Check if the given device is a multipath device and if there is a symbolic
|
|
# link to a device mapper and to a disk
|
|
# Currently no support for dm devices alone without multipath
|
|
#
|
|
# FreeBSD:
|
|
# Check if the given device is a gmultipath device.
|
|
#
|
|
# Others:
|
|
# No multipath detection.
|
|
#
|
|
function is_mpath_device #disk
{
	typeset mpdev=$1

	if [[ -z $mpdev ]]; then
		log_fail "No argument for disk given."
	fi

	if is_linux; then
		# lsblk must report an "mpath" type for the node ...
		if lsblk $DEV_MPATHDIR/$mpdev -o TYPE 2>/dev/null | \
		    egrep mpath >/dev/null; then
			# ... and the node must resolve as a symbolic link.
			readlink $DEV_MPATHDIR/$mpdev > /dev/null 2>&1
			return $?
		fi
		return 1
	elif is_freebsd; then
		is_disk_device $DEV_MPATHDIR/$mpdev
	else
		# No multipath detection on other platforms.
		false
	fi
}
|
|
|
|
#
|
|
# Check if the given path is the appropriate sort of device special node.
|
|
#
|
|
function is_disk_device #path
{
	# $1 path to test
	# Returns 0 when the path is a character device (FreeBSD) or a
	# block device (elsewhere), non-zero otherwise.
	typeset path=$1

	# The path must be quoted: with an empty argument an unquoted
	# "test -b $path" collapses to the one-argument form "test -b",
	# which is always true.
	if is_freebsd; then
		# FreeBSD doesn't have block devices, only character devices.
		test -c "$path"
	else
		test -b "$path"
	fi
}
|
|
|
|
# Set the slice prefix for disk partitioning depending
|
|
# on whether the device is a real, multipath, or loop device.
|
|
# Currently all disks have to be of the same type, so only
|
|
# checks first disk to determine slice prefix.
|
|
#
|
|
function set_slice_prefix
{
	typeset disk
	typeset -i i=0

	# SLICE_PREFIX is only chosen on Linux.
	is_linux || return 0

	while (( i < $DISK_ARRAY_NUM )); do
		# NOTE(review): "i" inside the single-quoted nawk program is
		# an (unset) awk variable, not the shell counter, so this
		# always yields the first word of $DISKS.
		disk="$(echo $DISKS | nawk '{print $(i + 1)}')"

		# No prefix for a multipath device whose 18th character is
		# not a digit, or for a real disk.
		if ( is_mpath_device $disk ) && \
		    [[ -z $(echo $disk | \
		    awk 'substr($1,18,1) ~ /^[[:digit:]]+$/') ]] || \
		    ( is_real_device $disk ); then
			export SLICE_PREFIX=""
			return 0
		fi

		# Any other multipath device, or a loop device, uses "p".
		if ( is_mpath_device $disk || is_loop_device $disk ); then
			export SLICE_PREFIX="p"
			return 0
		fi

		log_fail "$disk not supported for partitioning."
		(( i += 1 ))
	done
}
|
|
|
|
#
|
|
# Set the directory path of the listed devices in $DISK_ARRAY_NUM
|
|
# Currently all disks have to be of the same type, so only
|
|
# checks first disk to determine device directory
|
|
# default = /dev (linux)
|
|
# real disk = /dev (linux)
|
|
# multipath device = /dev/mapper (linux)
|
|
#
|
|
function set_device_dir
{
	typeset disk
	typeset -i i=0

	if ! is_linux; then
		export DEV_DSKDIR=$DEV_RDSKDIR
		return
	fi

	# The loop body always returns, so only one disk is ever examined.
	while (( i < $DISK_ARRAY_NUM )); do
		# NOTE(review): "i" inside the single-quoted nawk program is
		# an (unset) awk variable, so the first word of $DISKS is
		# selected.
		disk="$(echo $DISKS | nawk '{print $(i + 1)}')"
		if is_mpath_device $disk; then
			export DEV_DSKDIR=$DEV_MPATHDIR
		else
			export DEV_DSKDIR=$DEV_RDSKDIR
		fi
		return 0
	done
}
|
|
|
|
#
|
|
# Get the directory path of given device
|
|
#
|
|
function get_device_dir #device
{
	typeset device=$1

	# On FreeBSD, and for physical devices anywhere, the answer is
	# always $DEV_DSKDIR.
	if is_freebsd || is_physical_device $device; then
		echo "$DEV_DSKDIR"
		return
	fi

	# Drop the last path component, leaving a bare "/" untouched.
	[[ $device == "/" ]] || device=${device%/*}

	# If the remainder names a disk node under $DEV_DSKDIR, report
	# $DEV_DSKDIR instead.
	if is_disk_device "$DEV_DSKDIR/$device"; then
		device="$DEV_DSKDIR"
	fi
	echo $device
}
|
|
|
|
#
|
|
# Get persistent name for given disk
|
|
#
|
|
function get_persistent_disk_name #device
{
	typeset device=$1
	typeset dev_id
	typeset link_filter

	# Outside of Linux the name is echoed back unchanged.
	if ! is_linux; then
		echo $device
		return
	fi

	# Pick which udev symlink family to look for.
	if is_real_device $device; then
		link_filter="disk/by-id"
	elif is_mpath_device $device; then
		link_filter="disk/by-id/dm-uuid"
	else
		echo $device
		return
	fi

	# Take the second field of the first matching line and print its
	# third '/'-separated component.
	dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \
	    | egrep $link_filter \
	    | nawk '{print $2; exit}' \
	    | nawk -F / '{print $3}')"
	echo $dev_id
}
|
|
|
|
#
|
|
# Online or offline a disk on the system
|
|
#
|
|
# First checks state of disk. Test will fail if disk is not properly onlined
|
|
# or offlined. Online is a full rescan of SCSI disks by echoing to every
|
|
# host entry.
|
|
#
|
|
function on_off_disk # disk state{online,offline} host
{
	typeset disk=$1
	typeset state=$2
	typeset host=$3

	if [[ -z $disk || -z $state ]]; then
		log_fail "Arguments invalid or missing"
	fi

	# Nothing is done on platforms other than Linux.
	is_linux || return 0

	case $state in
	offline)
		if ( is_mpath_device $disk ); then
			# Walk the device-mapper slaves, offlining and
			# deleting each until none is listed any more.
			dm_name="$(readlink $DEV_DSKDIR/$disk \
			    | nawk -F / '{print $2}')"
			dep="$(ls /sys/block/${dm_name}/slaves \
			    | nawk '{print $1}')"
			while [[ -n $dep ]]; do
				# check if disk is online
				if lsscsi | egrep $dep > /dev/null; then
					dep_dir="/sys/block/${dm_name}/slaves/${dep}/device"
					ss="${dep_dir}/state"
					sd="${dep_dir}/delete"
					log_must eval "echo 'offline' > ${ss}"
					log_must eval "echo '1' > ${sd}"
					# It must no longer be listed.
					if lsscsi | egrep $dep > /dev/null; then
						log_fail "Offlining" \
						    "$disk failed"
					fi
				fi
				dep="$(ls /sys/block/$dm_name/slaves \
				    2>/dev/null | nawk '{print $1}')"
			done
		elif ( is_real_device $disk ); then
			# check if disk is online
			if lsscsi | egrep $disk > /dev/null; then
				dev_state="/sys/block/$disk/device/state"
				dev_delete="/sys/block/$disk/device/delete"
				log_must eval "echo 'offline' > ${dev_state}"
				log_must eval "echo '1' > ${dev_delete}"
				# It must no longer be listed.
				if lsscsi | egrep $disk > /dev/null; then
					log_fail "Offlining $disk" \
					    "failed"
				fi
			else
				log_note "$disk is already offline"
			fi
		else
			log_fail "$disk failed to $state"
		fi
		;;
	online)
		# force a full rescan
		scan_scsi_hosts $host
		block_device_wait
		if is_mpath_device $disk; then
			dm_name="$(readlink $DEV_DSKDIR/$disk \
			    | nawk -F / '{print $2}')"
			dep="$(ls /sys/block/$dm_name/slaves \
			    | nawk '{print $1}')"
			if ! lsscsi | egrep $dep > /dev/null; then
				log_fail "Onlining $disk failed"
			fi
		elif is_real_device $disk; then
			block_device_wait
			# Re-check once per second until the disk is listed;
			# give up once more than two retries have been made.
			typeset -i retries=0
			until lsscsi | egrep -q $disk; do
				if (( $retries > 2 )); then
					log_fail "Onlining $disk failed"
					break
				fi
				(( ++retries ))
				sleep 1
			done
		else
			log_fail "$disk is not a real dev"
		fi
		;;
	*)
		log_fail "$disk failed to $state"
		;;
	esac
}
|
|
|
|
#
|
|
# Simulate disk removal
|
|
#
|
|
function remove_disk #disk
{
	typeset victim=$1

	# Offline the disk, then wait for block devices to settle.
	on_off_disk $victim "offline"
	block_device_wait
}
|
|
|
|
#
|
|
# Simulate disk insertion for the given SCSI host
|
|
#
|
|
function insert_disk #disk scsi_host
{
	typeset newdisk=$1
	typeset hostnum=$2

	# Online the disk via the given SCSI host, then wait for block
	# devices to settle.
	on_off_disk $newdisk "online" $hostnum
	block_device_wait
}
|
|
|
|
#
|
|
# Load scsi_debug module with specified parameters
|
|
# $blksz can be either one of: < 512b | 512e | 4Kn >
|
|
#
|
|
function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz
{
	# $1 dev_size_mb, $2 add_host, $3 num_tgts, $4 max_luns: passed to
	#    the scsi_debug module parameters of the same names
	# $5 blksz: one of 512b, 512e or 4Kn; selects sector_size and
	#    physblk_exp
	typeset devsize=$1
	typeset hosts=$2
	typeset tgts=$3
	typeset luns=$4
	typeset blksz=$5

	[[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \
	    [[ -z $luns ]] || [[ -z $blksz ]] && \
	    log_fail "Arguments invalid or missing"

	case "$blksz" in
		'512b')
			typeset sector=512
			typeset blkexp=0
		;;
		'512e')
			typeset sector=512
			typeset blkexp=3
		;;
		'4Kn')
			typeset sector=4096
			typeset blkexp=0
		;;
		*) log_fail "Unsupported blksz value: $blksz" ;;
	esac

	if is_linux; then
		# Dry run: find out whether the module can be loaded at all.
		if ! modprobe -n scsi_debug; then
			# Keep the message in one command. Split over two
			# lines without a continuation, the trailing word
			# became a separate command and was lost from the
			# message.
			log_unsupported \
			    "Platform does not have scsi_debug module"
		fi
		if lsmod | egrep scsi_debug > /dev/null; then
			log_fail "scsi_debug module already installed"
		else
			log_must modprobe scsi_debug dev_size_mb=$devsize \
			    add_host=$hosts num_tgts=$tgts max_luns=$luns \
			    sector_size=$sector physblk_exp=$blkexp
			block_device_wait
			# Any non-zero status (no match or an egrep error)
			# means the debug device did not show up.
			if ! lsscsi | egrep scsi_debug > /dev/null; then
				log_fail "scsi_debug module install failed"
			fi
		fi
	fi
}
|
|
|
|
#
|
|
# Unload scsi_debug module, if needed.
|
|
#
|
|
function unload_scsi_debug
{
	# NOTE(review): presumably log_must_retry repeats the command (up
	# to the given count) while its output contains the given string;
	# confirm against log_must_retry's definition.
	typeset busy_msg="in use"
	typeset -i attempts=5

	log_must_retry "$busy_msg" $attempts modprobe -r scsi_debug
}
|
|
|
|
#
|
|
# Get scsi_debug device name.
|
|
# Returns basename of scsi_debug device (for example "sdb").
|
|
#
|
|
function get_debug_device
{
	# Up to ten attempts, one second apart.
	for i in {1..10} ; do
		# Sixth column of the first scsi_debug line in lsscsi,
		# reduced to its third '/'-separated component.
		val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3)

		# lsscsi can take time to settle; "-" means no device name
		# has been reported yet.
		[[ $val != "-" ]] && break
		sleep 1
	done
	echo "$val"
}
|
|
|
|
#
|
|
# Get actual devices used by the pool (i.e. linux sdb1 not sdb).
|
|
#
|
|
function get_pool_devices #testpool #devdir
{
	typeset testpool=$1
	typeset devdir=$2
	typeset devices=""

	if is_linux || is_freebsd; then
		# First column of every "zpool status -P" line that mentions
		# the device directory ...
		devices=$(zpool status -P $testpool | grep ${devdir} | \
		    awk '{print $1}')
		# ... with the directory prefix removed and the names joined
		# by spaces.
		devices=$(echo $devices | sed -e "s|${devdir}/||g" | \
		    tr '\n' ' ')
	fi
	echo $devices
}
|
|
|
|
#
|
|
# Write to standard out giving the level, device name, offset and length
|
|
# of all blocks in an input file. The offset and length are in units of
|
|
# 512 byte blocks. In the case of mirrored vdevs, only the first
|
|
# device is listed, as the levels, blocks and offsets will be the same
|
|
# on other devices. Note that this function only works with mirrored
|
|
# or non-redundant pools, not raidz.
|
|
#
|
|
# The output of this function can be used to introduce corruption at
|
|
# varying levels of indirection.
|
|
#
|
|
function list_file_blocks # input_file
{
	# $1 input_file: a regular file on a ZFS dataset
	# Prints one "level path offset length" line per block copy, with
	# offset and length in 512-byte units.
	typeset input_file=$1

	[[ -f $input_file ]] || log_fail "Couldn't find $input_file"

	typeset ds="$(zfs list -H -o name $input_file)"
	typeset pool="${ds%%/*}"
	typeset objnum="$(get_objnum $input_file)"

	#
	# Establish a mapping between vdev ids as shown in a DVA and the
	# pathnames they correspond to in ${VDEV_MAP[][]}.
	#
	# The vdev bits in a DVA refer to the top level vdev id.
	# ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev.
	#
	# The patterns below are anchored on leading whitespace, which is
	# what tells a top level vdev from a nested one.
	# NOTE(review): assumes "zdb -C" prints top level "children[N]"
	# entries indented by 12 spaces and their members (including nested
	# "children[N]" entries) by 16 spaces; confirm against real zdb
	# output. With the anchors collapsed to a single space the two
	# "children" patterns were identical and never matched indented
	# output.
	#
	eval $(zdb -C $pool | awk '
	    BEGIN { printf "typeset -a VDEV_MAP;" }
	    function subscript(s) {
	        # "[#]" is more convenient than the bare "#"
	        match(s, /\[[0-9]*\]/)
	        return substr(s, RSTART, RLENGTH)
	    }
	    # fewer than 16 leading spaces
	    id && !/^                / {
	        # left a top level vdev
	        id = 0
	    }
	    id && $1 ~ /^path:$/ {
	        # found a vdev path; save it in the map
	        printf "VDEV_MAP%s%s=%s;", id, child, $2
	    }
	    # exactly 12 leading spaces
	    /^            children/ {
	        # entering a top level vdev
	        id = subscript($0)
	        child = "[0]" # default in case there is no nested vdev
	        printf "typeset -a VDEV_MAP%s;", id
	    }
	    # exactly 16 leading spaces
	    /^                children/ {
	        # entering a nested vdev (e.g. child of a top level mirror)
	        child = subscript($0)
	    }
	')

	#
	# The awk below parses the output of zdb, printing out the level
	# of each block along with vdev id, offset and length. The last
	# two are converted to decimal in the while loop. 4M is added to
	# the offset to compensate for the first two labels and boot
	# block. Lastly, the offset and length are printed in units of
	# 512B blocks for ease of use with dd.
	#
	typeset level vdev path offset length
	if awk -n '' 2>/dev/null; then
		# gawk needs -n to decode hex
		AWK='awk -n'
	else
		AWK='awk'
	fi
	log_must zpool sync -f
	zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 '
	    /^$/ { looking = 0 }
	    looking {
	        level = $2
	        field = 3
	        while (split($field, dva, ":") == 3) {
	            # top level vdev id
	            vdev = int(dva[1])
	            # offset + 4M label/boot pad in 512B blocks
	            offset = (int("0x"dva[2]) + pad) / bs
	            # length in 512B blocks
	            len = int("0x"dva[3]) / bs

	            print level, vdev, offset, len

	            ++field
	        }
	    }
	    /^Indirect blocks:/ { looking = 1 }
	' | \
	while read level vdev offset length; do
		# Emit one line per device path recorded for this top level
		# vdev.
		for path in ${VDEV_MAP[$vdev][@]}; do
			echo "$level $path $offset $length"
		done
	done 2>/dev/null
}
|
|
|
|
function corrupt_blocks_at_level # input_file corrupt_level
{
	# $1 input_file: file whose blocks are to be overwritten
	# $2 corrupt_level: indirection level number (default 0); compared
	#    against the level column of list_file_blocks as "L<n>"
	typeset input_file=$1
	typeset corrupt_level="L${2:-0}"
	typeset level path offset length

	if [[ ! -f $input_file ]]; then
		log_fail "Couldn't find $input_file"
	fi

	if is_freebsd; then
		# Temporarily allow corrupting an inuse device.
		debugflags=$(sysctl -n kern.geom.debugflags)
		sysctl kern.geom.debugflags=16
	fi

	# Overwrite every listed block at the requested level with random
	# data; blocks at other levels are skipped.
	list_file_blocks $input_file | \
	    while read level path offset length; do
		[[ $level = $corrupt_level ]] || continue
		log_must dd if=/dev/urandom of=$path bs=512 \
		    count=$length seek=$offset conv=notrunc
	done

	if is_freebsd; then
		# Restore the saved debug flags.
		sysctl kern.geom.debugflags=$debugflags
	fi

	# This is necessary for pools made of loop devices.
	sync
}
|
|
|
|
function corrupt_label_checksum # label_number vdev_path
{
	# $1 label_number: index (0-3) into the offsets table below
	# $2 vdev_path: device or file to overwrite
	typeset label_no=$1
	typeset vdev=$2
	typeset label_size=$((256*1024))
	typeset vdev_size=$(stat_size ${vdev})

	# Byte offset of the 32 bytes to overwrite for each label: the
	# first two are measured from the start of the vdev, the last two
	# from its end.
	typeset -a offsets
	offsets[0]="$((128*1024 - 32))"
	offsets[1]="$(($label_size + (128*1024 - 32)))"
	offsets[2]="$(($vdev_size - $label_size - (128*1024 + 32)))"
	offsets[3]="$(($vdev_size - (128*1024 + 32)))"

	# Overwrite 32 bytes in place with random data.
	dd if=/dev/urandom of=${vdev} seek=${offsets[$label_no]} bs=1 \
	    count=32 conv=notrunc
}
|