From 5e3085e360161456fe2af697494c479de0ee2085 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 27 Feb 2018 09:31:27 -0800 Subject: [PATCH] Add SMART self-test results to zpool status -c Add in SMART self-test results to zpool status|iostat -c. This works for both SAS and SATA drives. Also, add plumbing to allow the 'smart' script to take smartctl output from a directory of output text files instead of running it against the vdevs. Reviewed-by: Giuseppe Di Natale Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #7178 --- cmd/zpool/Makefile.am | 14 +++- cmd/zpool/zpool.d/smart | 132 ++++++++++++++++++++++++++++---- cmd/zpool/zpool.d/smart_test | 1 + cmd/zpool/zpool.d/test_ended | 1 + cmd/zpool/zpool.d/test_progress | 1 + cmd/zpool/zpool.d/test_status | 1 + cmd/zpool/zpool.d/test_type | 1 + 7 files changed, 133 insertions(+), 18 deletions(-) create mode 120000 cmd/zpool/zpool.d/smart_test create mode 120000 cmd/zpool/zpool.d/test_ended create mode 120000 cmd/zpool/zpool.d/test_progress create mode 120000 cmd/zpool/zpool.d/test_status create mode 120000 cmd/zpool/zpool.d/test_type diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index c7b8b76e3..d07f8d616 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \ zpool.d/nvme_err \ zpool.d/pwr_cyc \ zpool.d/upath \ - zpool.d/vendor + zpool.d/vendor \ + zpool.d/smart_test \ + zpool.d/test_type \ + zpool.d/test_status \ + zpool.d/test_progress \ + zpool.d/test_ended zpoolconfdefaults = \ enc \ @@ -102,7 +107,12 @@ zpoolconfdefaults = \ nvme_err \ pwr_cyc \ upath \ - vendor + vendor \ + smart_test \ + test_type \ + test_status \ + test_progress \ + test_ended install-data-hook: $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart index 4bc3af39d..64b5f6e4e 100755 --- a/cmd/zpool/zpool.d/smart +++ b/cmd/zpool/zpool.d/smart @@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA). 
pwr_cyc: Show SMART power cycle count (ATA). serial: Show disk serial number. nvme_err: Show SMART NVMe errors (NVMe). +smart_test: Show SMART self-test results summary. +test_type: Show SMART self-test type (short, long... ). +test_status: Show SMART self-test status. +test_progress: Show SMART self-test percentage done. +test_ended: Show when the last SMART self-test ended (if supported). " +# Hack for developer testing +# +# If you set $samples to a directory containing smartctl output text files, +# we will use them instead of running smartctl on the vdevs. This can be +# useful if you want to test a bunch of different smartctl outputs. Also, if +# $samples is set, an additional 'file' column is added to the zpool output +# showing the filename. +samples= + +# get_filename_from_dir DIR +# +# Look in directory DIR and return a filename from it. The filename returned +# is chosen quasi-sequentially (based off our PID). This allows us to return +# a different filename every time this script is invoked (which we do for each +# vdev), without having to maintain state. +get_filename_from_dir() +{ + dir=$1 + pid="$$" + num_files=$(find "$dir" -maxdepth 1 -type f | wc -l) + mod=$((pid % num_files)) + i=0 + find "$dir" -maxdepth 1 -type f -printf "%f\n" | while read -r file ; do + if [ "$mod" = "$i" ] ; then + echo "$file" + break + fi + i=$((i+1)) + done +} + script=$(basename "$0") if [ "$1" = "-h" ] ; then @@ -35,8 +71,16 @@ fi smartctl_path=$(which smartctl) -if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then - raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") +if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then + if [ -n "$samples" ] ; then + # cat a smartctl output text file instead of running smartctl + # on a vdev (only used for developer testing). + file=$(get_filename_from_dir "$samples") + echo "file=$file" + raw_out=$(cat "$samples/$file") + else + raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") + fi # What kind of drive are we? 
Look for the right line in smartctl: # @@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then # NVMe: # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn) # - type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$') out=$(echo "$raw_out" | awk ' # SAS specific /read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8} @@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then /Elements in grown defect list/{print "defect="$6} # SAS common +/SAS/{type="sas"} /Drive Temperature:/{print "temp="$4} # Status can be a long string, substitute spaces for '_' /SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i} -/number of hours powered up/{print "hours_on="$7} +/number of hours powered up/{print "hours_on="$7; hours_on=int($7)} /Serial number:/{print "serial="$3} # SATA specific @@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then /Power_Cycle_Count/{print "pwr_cyc="$10} # SATA common +/SATA/{type="sata"} /Temperature_Celsius/{print "temp="$10} /Airflow_Temperature_Cel/{print "temp="$10} +/Current Temperature:/{print "temp="$3} /SMART overall-health self-assessment test result:/{print "health="$6} -/Power_On_Hours/{print "hours_on="$10} +/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)} /Serial Number:/{print "serial="$3} # NVMe common +/NVMe/{type="nvme"} /Temperature:/{print "temp="$2} /SMART overall-health self-assessment test result:/{print "health="$6} /Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4} @@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then # NVMe specific /Media and Data Integrity Errors:/{print "nvme_err="$6} -END {ORS="\n"; print ""} +# SMART self-test info +/Self-test execution status:/{progress=tolower($4)} # SAS +/SMART Self-test log/{test_seen=1} # SAS +/SMART Extended Self-test Log/{test_seen=1} # SATA +/# 1/{ + test_type=tolower($3"_"$4); + # Status could be one word ("Completed") or 
multiple ("Completed: read + # failure"). Look for the ":" to see if we need to grab more words. + + if ($5 ~ ":") + status=tolower($5""$6"_"$7) + else + status=tolower($5) + if (status=="self") + status="running"; + + if (type == "sas") { + hours=int($(NF-4)) + } else { + hours=int($(NF-1)) + # SATA reports percent remaining, rather than percent done + # Convert it to percent done. + progress=(100-int($(NF-2)))"%" + } + # When we int()-ify "hours", it converts stuff like "NOW" and "-" into + # 0. In those cases, set it to hours_on, so they will cancel out in + # the "hours_ago" calculation later on. + if (hours == 0) + hours=hours_on + + if (test_seen) { + print "test="hours_on + print "test_type="test_type + print "test_status="status + print "test_progress="progress + } + # Not all drives report hours_on + if (hours_on && hours) { + total_hours_ago=(hours_on-hours) + days_ago=int(total_hours_ago/24) + hours_ago=(total_hours_ago % 24) + if (days_ago != 0) + ago_str=days_ago"d" + if (hours_ago !=0) + ago_str=ago_str""hours_ago"h" + print "test_ended="ago_str + } +} + +END {print "type="type; ORS="\n"; print ""} '); fi +type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2) -# if type is not set by now, either we don't have a block device -# or smartctl failed. Either way, default to ATA and set out to -# nothing +# If type is not set by now, either we don't have a block device +# or smartctl failed. Either way, default to ATA and set $out to +# nothing. 
if [ -z "$type" ]; then - type="ATA" + type="sata" out= fi case $script in smart) # Print temperature plus common predictors of drive failure - if [ "$type" = "SAS" ] ; then + if [ "$type" = "sas" ] ; then scripts="temp|health|r_ucor|w_ucor" - elif [ "$type" = "ATA" ] ; then + elif [ "$type" = "sata" ] ; then scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor" - elif [ "$type" = "NVMe" ] ; then + elif [ "$type" = "nvme" ] ; then scripts="temp|health|nvme_err" fi ;; smartx) # Print some other interesting stats - if [ "$type" = "SAS" ] ; then + if [ "$type" = "sas" ] ; then scripts="hours_on|defect|nonmed|r_proc|w_proc" - elif [ "$type" = "ATA" ] ; then + elif [ "$type" = "sata" ] ; then scripts="hours_on|pwr_cyc" - elif [ "$type" = "NVMe" ] ; then + elif [ "$type" = "nvme" ] ; then scripts="hours_on|pwr_cyc" fi ;; +smart_test) + scripts="test_type|test_status|test_progress|test_ended" + ;; *) scripts="$script" esac diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test new file mode 120000 index 000000000..94f22861f --- /dev/null +++ b/cmd/zpool/zpool.d/smart_test @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended new file mode 120000 index 000000000..94f22861f --- /dev/null +++ b/cmd/zpool/zpool.d/test_ended @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress new file mode 120000 index 000000000..94f22861f --- /dev/null +++ b/cmd/zpool/zpool.d/test_progress @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status new file mode 120000 index 000000000..94f22861f --- /dev/null +++ b/cmd/zpool/zpool.d/test_status @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type new file mode 120000 index 000000000..94f22861f --- /dev/null +++ 
b/cmd/zpool/zpool.d/test_type @@ -0,0 +1 @@ +smart \ No newline at end of file