323 lines
9.8 KiB
Diff
323 lines
9.8 KiB
Diff
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||
|
From: Tony Hutter <hutter2@llnl.gov>
|
||
|
Date: Tue, 27 Feb 2018 09:31:27 -0800
|
||
|
Subject: [PATCH] Add SMART self-test results to zpool status -c
|
||
|
MIME-Version: 1.0
|
||
|
Content-Type: text/plain; charset=UTF-8
|
||
|
Content-Transfer-Encoding: 8bit
|
||
|
|
||
|
Add in SMART self-test results to zpool status|iostat -c. This
|
||
|
works for both SAS and SATA drives.
|
||
|
|
||
|
Also, add plumbing to allow the 'smart' script to take smartctl
|
||
|
output from a directory of output text files instead of running
|
||
|
it against the vdevs.
|
||
|
|
||
|
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
|
||
|
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||
|
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
|
||
|
Closes #7178
|
||
|
(cherry picked from commit 5e3085e360161456fe2af697494c479de0ee2085)
|
||
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
||
|
---
|
||
|
cmd/zpool/Makefile.am | 14 ++++-
|
||
|
cmd/zpool/zpool.d/smart | 132 +++++++++++++++++++++++++++++++++++-----
|
||
|
cmd/zpool/zpool.d/smart_test | 1 +
|
||
|
cmd/zpool/zpool.d/test_ended | 1 +
|
||
|
cmd/zpool/zpool.d/test_progress | 1 +
|
||
|
cmd/zpool/zpool.d/test_status | 1 +
|
||
|
cmd/zpool/zpool.d/test_type | 1 +
|
||
|
7 files changed, 133 insertions(+), 18 deletions(-)
|
||
|
create mode 120000 cmd/zpool/zpool.d/smart_test
|
||
|
create mode 120000 cmd/zpool/zpool.d/test_ended
|
||
|
create mode 120000 cmd/zpool/zpool.d/test_progress
|
||
|
create mode 120000 cmd/zpool/zpool.d/test_status
|
||
|
create mode 120000 cmd/zpool/zpool.d/test_type
|
||
|
|
||
|
diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
|
||
|
index c7b8b76e3..d07f8d616 100644
|
||
|
--- a/cmd/zpool/Makefile.am
|
||
|
+++ b/cmd/zpool/Makefile.am
|
||
|
@@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \
|
||
|
zpool.d/nvme_err \
|
||
|
zpool.d/pwr_cyc \
|
||
|
zpool.d/upath \
|
||
|
- zpool.d/vendor
|
||
|
+ zpool.d/vendor \
|
||
|
+ zpool.d/smart_test \
|
||
|
+ zpool.d/test_type \
|
||
|
+ zpool.d/test_status \
|
||
|
+ zpool.d/test_progress \
|
||
|
+ zpool.d/test_ended
|
||
|
|
||
|
zpoolconfdefaults = \
|
||
|
enc \
|
||
|
@@ -102,7 +107,12 @@ zpoolconfdefaults = \
|
||
|
nvme_err \
|
||
|
pwr_cyc \
|
||
|
upath \
|
||
|
- vendor
|
||
|
+ vendor \
|
||
|
+ smart_test \
|
||
|
+ test_type \
|
||
|
+ test_status \
|
||
|
+ test_progress \
|
||
|
+ test_ended
|
||
|
|
||
|
install-data-hook:
|
||
|
$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
|
||
|
diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart
|
||
|
index 4bc3af39d..64b5f6e4e 100755
|
||
|
--- a/cmd/zpool/zpool.d/smart
|
||
|
+++ b/cmd/zpool/zpool.d/smart
|
||
|
@@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA).
|
||
|
pwr_cyc: Show SMART power cycle count (ATA).
|
||
|
serial: Show disk serial number.
|
||
|
nvme_err: Show SMART NVMe errors (NVMe).
|
||
|
+smart_test: Show SMART self-test results summary.
|
||
|
+test_type: Show SMART self-test type (short, long... ).
|
||
|
+test_status: Show SMART self-test status.
|
||
|
+test_progress: Show SMART self-test percentage done.
|
||
|
+test_ended: Show when the last SMART self-test ended (if supported).
|
||
|
"
|
||
|
|
||
|
+# Hack for developer testing
|
||
|
+#
|
||
|
+# If you set $samples to a directory containing smartctl output text files,
|
||
|
+# we will use them instead of running smartctl on the vdevs. This can be
|
||
|
+# useful if you want to test a bunch of different smartctl outputs. Also, if
|
||
|
+# $samples is set, an additional 'file' column is added to the zpool output
|
||
|
+# showing the filename.
|
||
|
+samples=
|
||
|
+
|
||
|
+# get_filename_from_dir DIR
|
||
|
+#
|
||
|
+# Look in directory DIR and return a filename from it. The filename returned
|
||
|
+# is chosen quasi-sequentially (based off our PID). This allows us to return
|
||
|
+# a different filename every time this script is invoked (which we do for each
|
||
|
+# vdev), without having to maintain state.
|
||
|
+get_filename_from_dir()
|
||
|
+{
|
||
|
+ dir=$1
|
||
|
+ pid="$$"
|
||
|
+ num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
|
||
|
+ mod=$((pid % num_files))
|
||
|
+ i=0
|
||
|
+ find "$dir" -type f -printf "%f\n" | while read -r file ; do
|
||
|
+ if [ "$mod" = "$i" ] ; then
|
||
|
+ echo "$file"
|
||
|
+ break
|
||
|
+ fi
|
||
|
+ i=$((i+1))
|
||
|
+ done
|
||
|
+}
|
||
|
+
|
||
|
script=$(basename "$0")
|
||
|
|
||
|
if [ "$1" = "-h" ] ; then
|
||
|
@@ -35,8 +71,16 @@ fi
|
||
|
|
||
|
smartctl_path=$(which smartctl)
|
||
|
|
||
|
-if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
||
|
- raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
|
||
|
+if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
|
||
|
+ if [ -n "$samples" ] ; then
|
||
|
+ # cat a smartctl output text file instead of running smartctl
|
||
|
+ # on a vdev (only used for developer testing).
|
||
|
+ file=$(get_filename_from_dir $samples)
|
||
|
+ echo "file=$file"
|
||
|
+ raw_out=$(cat "$samples/$file")
|
||
|
+ else
|
||
|
+ raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
|
||
|
+ fi
|
||
|
|
||
|
# What kind of drive are we? Look for the right line in smartctl:
|
||
|
#
|
||
|
@@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
||
|
# NVMe:
|
||
|
# SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
|
||
|
#
|
||
|
- type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$')
|
||
|
out=$(echo "$raw_out" | awk '
|
||
|
# SAS specific
|
||
|
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
|
||
|
@@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
||
|
/Elements in grown defect list/{print "defect="$6}
|
||
|
|
||
|
# SAS common
|
||
|
+/SAS/{type="sas"}
|
||
|
/Drive Temperature:/{print "temp="$4}
|
||
|
# Status can be a long string, substitute spaces for '_'
|
||
|
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
|
||
|
-/number of hours powered up/{print "hours_on="$7}
|
||
|
+/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
|
||
|
/Serial number:/{print "serial="$3}
|
||
|
|
||
|
# SATA specific
|
||
|
@@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
||
|
/Power_Cycle_Count/{print "pwr_cyc="$10}
|
||
|
|
||
|
# SATA common
|
||
|
+/SATA/{type="sata"}
|
||
|
/Temperature_Celsius/{print "temp="$10}
|
||
|
/Airflow_Temperature_Cel/{print "temp="$10}
|
||
|
+/Current Temperature:/{print "temp="$3}
|
||
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
||
|
-/Power_On_Hours/{print "hours_on="$10}
|
||
|
+/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
|
||
|
/Serial Number:/{print "serial="$3}
|
||
|
|
||
|
# NVMe common
|
||
|
+/NVMe/{type="nvme"}
|
||
|
/Temperature:/{print "temp="$2}
|
||
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
||
|
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
|
||
|
@@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
||
|
# NVMe specific
|
||
|
/Media and Data Integrity Errors:/{print "nvme_err="$6}
|
||
|
|
||
|
-END {ORS="\n"; print ""}
|
||
|
+# SMART self-test info
|
||
|
+/Self-test execution status:/{progress=tolower($4)} # SAS
|
||
|
+/SMART Self-test log/{test_seen=1} # SAS
|
||
|
+/SMART Extended Self-test Log/{test_seen=1} # SATA
|
||
|
+/# 1/{
|
||
|
+ test_type=tolower($3"_"$4);
|
||
|
+ # Status could be one word ("Completed") or multiple ("Completed: read
|
||
|
+ # failure"). Look for the ":" to see if we need to grab more words.
|
||
|
+
|
||
|
+ if ($5 ~ ":")
|
||
|
+ status=tolower($5""$6"_"$7)
|
||
|
+ else
|
||
|
+ status=tolower($5)
|
||
|
+ if (status=="self")
|
||
|
+ status="running";
|
||
|
+
|
||
|
+ if (type == "sas") {
|
||
|
+ hours=int($(NF-4))
|
||
|
+ } else {
|
||
|
+ hours=int($(NF-1))
|
||
|
+ # SATA reports percent remaining, rather than percent done
|
||
|
+ # Convert it to percent done.
|
||
|
+ progress=(100-int($(NF-2)))"%"
|
||
|
+ }
|
||
|
+ # When we int()-ify "hours", it converts stuff like "NOW" and "-" into
|
||
|
+ # 0. In those cases, set it to hours_on, so they will cancel out in
|
||
|
+ # the "hours_ago" calculation later on.
|
||
|
+ if (hours == 0)
|
||
|
+ hours=hours_on
|
||
|
+
|
||
|
+ if (test_seen) {
|
||
|
+ print "test="hours_on
|
||
|
+ print "test_type="test_type
|
||
|
+ print "test_status="status
|
||
|
+ print "test_progress="progress
|
||
|
+ }
|
||
|
+ # Not all drives report hours_on
|
||
|
+ if (hours_on && hours) {
|
||
|
+ total_hours_ago=(hours_on-hours)
|
||
|
+ days_ago=int(total_hours_ago/24)
|
||
|
+ hours_ago=(total_hours_ago % 24)
|
||
|
+ if (days_ago != 0)
|
||
|
+ ago_str=days_ago"d"
|
||
|
+ if (hours_ago !=0)
|
||
|
+ ago_str=ago_str""hours_ago"h"
|
||
|
+ print "test_ended="ago_str
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+END {print "type="type; ORS="\n"; print ""}
|
||
|
');
|
||
|
fi
|
||
|
+type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
|
||
|
|
||
|
-# if type is not set by now, either we don't have a block device
|
||
|
-# or smartctl failed. Either way, default to ATA and set out to
|
||
|
-# nothing
|
||
|
+# If type is not set by now, either we don't have a block device
|
||
|
+# or smartctl failed. Either way, default to "sata" and set $out to
|
||
|
+# nothing.
|
||
|
if [ -z "$type" ]; then
|
||
|
- type="ATA"
|
||
|
+ type="sata"
|
||
|
out=
|
||
|
fi
|
||
|
|
||
|
case $script in
|
||
|
smart)
|
||
|
# Print temperature plus common predictors of drive failure
|
||
|
- if [ "$type" = "SAS" ] ; then
|
||
|
+ if [ "$type" = "sas" ] ; then
|
||
|
scripts="temp|health|r_ucor|w_ucor"
|
||
|
- elif [ "$type" = "ATA" ] ; then
|
||
|
+ elif [ "$type" = "sata" ] ; then
|
||
|
scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
|
||
|
- elif [ "$type" = "NVMe" ] ; then
|
||
|
+ elif [ "$type" = "nvme" ] ; then
|
||
|
scripts="temp|health|nvme_err"
|
||
|
fi
|
||
|
;;
|
||
|
smartx)
|
||
|
# Print some other interesting stats
|
||
|
- if [ "$type" = "SAS" ] ; then
|
||
|
+ if [ "$type" = "sas" ] ; then
|
||
|
scripts="hours_on|defect|nonmed|r_proc|w_proc"
|
||
|
- elif [ "$type" = "ATA" ] ; then
|
||
|
+ elif [ "$type" = "sata" ] ; then
|
||
|
scripts="hours_on|pwr_cyc"
|
||
|
- elif [ "$type" = "NVMe" ] ; then
|
||
|
+ elif [ "$type" = "nvme" ] ; then
|
||
|
scripts="hours_on|pwr_cyc"
|
||
|
fi
|
||
|
;;
|
||
|
+smart_test)
|
||
|
+ scripts="test_type|test_status|test_progress|test_ended"
|
||
|
+ ;;
|
||
|
*)
|
||
|
scripts="$script"
|
||
|
esac
|
||
|
diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test
|
||
|
new file mode 120000
|
||
|
index 000000000..94f22861f
|
||
|
--- /dev/null
|
||
|
+++ b/cmd/zpool/zpool.d/smart_test
|
||
|
@@ -0,0 +1 @@
|
||
|
+smart
|
||
|
\ No newline at end of file
|
||
|
diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended
|
||
|
new file mode 120000
|
||
|
index 000000000..94f22861f
|
||
|
--- /dev/null
|
||
|
+++ b/cmd/zpool/zpool.d/test_ended
|
||
|
@@ -0,0 +1 @@
|
||
|
+smart
|
||
|
\ No newline at end of file
|
||
|
diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress
|
||
|
new file mode 120000
|
||
|
index 000000000..94f22861f
|
||
|
--- /dev/null
|
||
|
+++ b/cmd/zpool/zpool.d/test_progress
|
||
|
@@ -0,0 +1 @@
|
||
|
+smart
|
||
|
\ No newline at end of file
|
||
|
diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status
|
||
|
new file mode 120000
|
||
|
index 000000000..94f22861f
|
||
|
--- /dev/null
|
||
|
+++ b/cmd/zpool/zpool.d/test_status
|
||
|
@@ -0,0 +1 @@
|
||
|
+smart
|
||
|
\ No newline at end of file
|
||
|
diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type
|
||
|
new file mode 120000
|
||
|
index 000000000..94f22861f
|
||
|
--- /dev/null
|
||
|
+++ b/cmd/zpool/zpool.d/test_type
|
||
|
@@ -0,0 +1 @@
|
||
|
+smart
|
||
|
\ No newline at end of file
|
||
|
--
|
||
|
2.14.2
|
||
|
|