mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-15 04:30:33 +03:00
5e3085e360
Add in SMART self-test results to zpool status|iostat -c. This works for both SAS and SATA drives. Also, add plumbing to allow the 'smart' script to take smartctl output from a directory of output text files instead of running it against the vdevs. Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #7178
243 lines
7.2 KiB
Bash
Executable File
243 lines
7.2 KiB
Bash
Executable File
#!/bin/sh
|
|
#
|
|
# Show SMART stats
|
|
#
|
|
|
|
helpstr="
|
|
smart: Show SMART temperature and error stats (specific to drive type)
|
|
smartx: Show SMART extended drive stats (specific to drive type).
|
|
temp: Show SMART drive temperature in celsius (all drives).
|
|
health: Show reported SMART status (all drives).
|
|
r_proc: Show SMART read GBytes processed over drive lifetime (SAS).
|
|
w_proc: Show SMART write GBytes processed over drive lifetime (SAS).
|
|
r_ucor: Show SMART read uncorrectable errors (SAS).
|
|
w_ucor: Show SMART write uncorrectable errors (SAS).
|
|
nonmed: Show SMART non-medium errors (SAS).
|
|
defect: Show SMART grown defect list (SAS).
|
|
hours_on: Show number of hours drive powered on (all drives).
|
|
realloc: Show SMART reallocated sectors count (ATA).
|
|
rep_ucor: Show SMART reported uncorrectable count (ATA).
|
|
cmd_to: Show SMART command timeout count (ATA).
|
|
pend_sec: Show SMART current pending sector count (ATA).
|
|
off_ucor: Show SMART offline uncorrectable errors (ATA).
|
|
ata_err: Show SMART ATA errors (ATA).
|
|
pwr_cyc: Show SMART power cycle count (ATA).
|
|
serial: Show disk serial number.
|
|
nvme_err: Show SMART NVMe errors (NVMe).
|
|
smart_test: Show SMART self-test results summary.
|
|
test_type: Show SMART self-test type (short, long... ).
|
|
test_status: Show SMART self-test status.
|
|
test_progress: Show SMART self-test percentage done.
|
|
test_ended: Show when the last SMART self-test ended (if supported).
|
|
"
|
|
|
|
# Hack for developer testing
|
|
#
|
|
# If you set $samples to a directory containing smartctl output text files,
|
|
# we will use them instead of running smartctl on the vdevs. This can be
|
|
# useful if you want to test a bunch of different smartctl outputs. Also, if
|
|
# $samples is set, and additional 'file' column is added to the zpool output
|
|
# showing the filename.
|
|
samples=
|
|
|
|
# get_filename_from_dir DIR
|
|
#
|
|
# Look in directory DIR and return a filename from it. The filename returned
|
|
# is chosen quasi-sequentially (based off our PID). This allows us to return
|
|
# a different filename every time this script is invoked (which we do for each
|
|
# vdev), without having to maintain state.
|
|
get_filename_from_dir()
|
|
{
|
|
dir=$1
|
|
pid="$$"
|
|
num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
|
|
mod=$((pid % num_files))
|
|
i=0
|
|
find "$dir" -type f -printf "%f\n" | while read -r file ; do
|
|
if [ "$mod" = "$i" ] ; then
|
|
echo "$file"
|
|
break
|
|
fi
|
|
i=$((i+1))
|
|
done
|
|
}
|
|
|
|
script=$(basename "$0")
|
|
|
|
if [ "$1" = "-h" ] ; then
|
|
echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
|
|
exit
|
|
fi
|
|
|
|
smartctl_path=$(which smartctl)
|
|
|
|
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
|
|
if [ -n "$samples" ] ; then
|
|
# cat a smartctl output text file instead of running smartctl
|
|
# on a vdev (only used for developer testing).
|
|
file=$(get_filename_from_dir $samples)
|
|
echo "file=$file"
|
|
raw_out=$(cat "$samples/$file")
|
|
else
|
|
raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
|
|
fi
|
|
|
|
# What kind of drive are we? Look for the right line in smartctl:
|
|
#
|
|
# SAS:
|
|
# Transport protocol: SAS
|
|
#
|
|
# SATA:
|
|
# ATA Version is: 8
|
|
#
|
|
# NVMe:
|
|
# SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
|
|
#
|
|
out=$(echo "$raw_out" | awk '
|
|
# SAS specific
|
|
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
|
|
/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
|
|
/Non-medium error count/{print "nonmed="$4}
|
|
/Elements in grown defect list/{print "defect="$6}
|
|
|
|
# SAS common
|
|
/SAS/{type="sas"}
|
|
/Drive Temperature:/{print "temp="$4}
|
|
# Status can be a long string, substitute spaces for '_'
|
|
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
|
|
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
|
|
/Serial number:/{print "serial="$3}
|
|
|
|
# SATA specific
|
|
/Reallocated_Sector_Ct/{print "realloc="$10}
|
|
/Reported_Uncorrect/{print "rep_ucor="$10}
|
|
/Command_Timeout/{print "cmd_to="$10}
|
|
/Current_Pending_Sector/{print "pend_sec="$10}
|
|
/Offline_Uncorrectable/{print "off_ucor="$10}
|
|
/ATA Error Count:/{print "ata_err="$4}
|
|
/Power_Cycle_Count/{print "pwr_cyc="$10}
|
|
|
|
# SATA common
|
|
/SATA/{type="sata"}
|
|
/Temperature_Celsius/{print "temp="$10}
|
|
/Airflow_Temperature_Cel/{print "temp="$10}
|
|
/Current Temperature:/{print "temp="$3}
|
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
|
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
|
|
/Serial Number:/{print "serial="$3}
|
|
|
|
# NVMe common
|
|
/NVMe/{type="nvme"}
|
|
/Temperature:/{print "temp="$2}
|
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
|
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
|
|
/Serial Number:/{print "serial="$3}
|
|
/Power Cycles:/{print "pwr_cyc="$3}
|
|
|
|
# NVMe specific
|
|
/Media and Data Integrity Errors:/{print "nvme_err="$6}
|
|
|
|
# SMART self-test info
|
|
/Self-test execution status:/{progress=tolower($4)} # SAS
|
|
/SMART Self-test log/{test_seen=1} # SAS
|
|
/SMART Extended Self-test Log/{test_seen=1} # SATA
|
|
/# 1/{
|
|
test_type=tolower($3"_"$4);
|
|
# Status could be one word ("Completed") or multiple ("Completed: read
|
|
# failure"). Look for the ":" to see if we need to grab more words.
|
|
|
|
if ($5 ~ ":")
|
|
status=tolower($5""$6"_"$7)
|
|
else
|
|
status=tolower($5)
|
|
if (status=="self")
|
|
status="running";
|
|
|
|
if (type == "sas") {
|
|
hours=int($(NF-4))
|
|
} else {
|
|
hours=int($(NF-1))
|
|
# SATA reports percent remaining, rather than percent done
|
|
# Convert it to percent done.
|
|
progress=(100-int($(NF-2)))"%"
|
|
}
|
|
# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
|
|
# 0. In those cases, set it to hours_on, so they will cancel out in
|
|
# the "hours_ago" calculation later on.
|
|
if (hours == 0)
|
|
hours=hours_on
|
|
|
|
if (test_seen) {
|
|
print "test="hours_on
|
|
print "test_type="test_type
|
|
print "test_status="status
|
|
print "test_progress="progress
|
|
}
|
|
# Not all drives report hours_on
|
|
if (hours_on && hours) {
|
|
total_hours_ago=(hours_on-hours)
|
|
days_ago=int(total_hours_ago/24)
|
|
hours_ago=(total_hours_ago % 24)
|
|
if (days_ago != 0)
|
|
ago_str=days_ago"d"
|
|
if (hours_ago !=0)
|
|
ago_str=ago_str""hours_ago"h"
|
|
print "test_ended="ago_str
|
|
}
|
|
}
|
|
|
|
END {print "type="type; ORS="\n"; print ""}
|
|
');
|
|
fi
|
|
type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
|
|
|
|
# If type is not set by now, either we don't have a block device
|
|
# or smartctl failed. Either way, default to ATA and set $out to
|
|
# nothing.
|
|
if [ -z "$type" ]; then
|
|
type="sata"
|
|
out=
|
|
fi
|
|
|
|
case $script in
|
|
smart)
|
|
# Print temperature plus common predictors of drive failure
|
|
if [ "$type" = "sas" ] ; then
|
|
scripts="temp|health|r_ucor|w_ucor"
|
|
elif [ "$type" = "sata" ] ; then
|
|
scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
|
|
elif [ "$type" = "nvme" ] ; then
|
|
scripts="temp|health|nvme_err"
|
|
fi
|
|
;;
|
|
smartx)
|
|
# Print some other interesting stats
|
|
if [ "$type" = "sas" ] ; then
|
|
scripts="hours_on|defect|nonmed|r_proc|w_proc"
|
|
elif [ "$type" = "sata" ] ; then
|
|
scripts="hours_on|pwr_cyc"
|
|
elif [ "$type" = "nvme" ] ; then
|
|
scripts="hours_on|pwr_cyc"
|
|
fi
|
|
;;
|
|
smart_test)
|
|
scripts="test_type|test_status|test_progress|test_ended"
|
|
;;
|
|
*)
|
|
scripts="$script"
|
|
esac
|
|
|
|
with_vals=$(echo "$out" | grep -E "$scripts")
|
|
if [ ! -z "$with_vals" ]; then
|
|
echo "$with_vals"
|
|
without_vals=$(echo "$scripts" | tr "|" "\n" |
|
|
grep -v -E "$(echo "$with_vals" |
|
|
awk -F "=" '{print $1}')" | awk '{print $0"="}')
|
|
else
|
|
without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}')
|
|
fi
|
|
|
|
if [ ! -z "$without_vals" ]; then
|
|
echo "$without_vals"
|
|
fi
|