75b07eca3e
by importing the upstream release as patches. Replace the user namespace patch with the version which has been applied upstream.
323 lines
9.8 KiB
Diff
323 lines
9.8 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Tony Hutter <hutter2@llnl.gov>
|
|
Date: Tue, 27 Feb 2018 09:31:27 -0800
|
|
Subject: [PATCH] Add SMART self-test results to zpool status -c
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Add in SMART self-test results to zpool status|iostat -c. This
|
|
works for both SAS and SATA drives.
|
|
|
|
Also, add plumbing to allow the 'smart' script to take smartctl
|
|
output from a directory of output text files instead of running
|
|
it against the vdevs.
|
|
|
|
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
|
|
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
|
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
|
|
Closes #7178
|
|
(cherry picked from commit 5e3085e360161456fe2af697494c479de0ee2085)
|
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
|
---
|
|
cmd/zpool/Makefile.am | 14 ++++-
|
|
cmd/zpool/zpool.d/smart | 132 +++++++++++++++++++++++++++++++++++-----
|
|
cmd/zpool/zpool.d/smart_test | 1 +
|
|
cmd/zpool/zpool.d/test_ended | 1 +
|
|
cmd/zpool/zpool.d/test_progress | 1 +
|
|
cmd/zpool/zpool.d/test_status | 1 +
|
|
cmd/zpool/zpool.d/test_type | 1 +
|
|
7 files changed, 133 insertions(+), 18 deletions(-)
|
|
create mode 120000 cmd/zpool/zpool.d/smart_test
|
|
create mode 120000 cmd/zpool/zpool.d/test_ended
|
|
create mode 120000 cmd/zpool/zpool.d/test_progress
|
|
create mode 120000 cmd/zpool/zpool.d/test_status
|
|
create mode 120000 cmd/zpool/zpool.d/test_type
|
|
|
|
diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
|
|
index c7b8b76e3..d07f8d616 100644
|
|
--- a/cmd/zpool/Makefile.am
|
|
+++ b/cmd/zpool/Makefile.am
|
|
@@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \
|
|
zpool.d/nvme_err \
|
|
zpool.d/pwr_cyc \
|
|
zpool.d/upath \
|
|
- zpool.d/vendor
|
|
+ zpool.d/vendor \
|
|
+ zpool.d/smart_test \
|
|
+ zpool.d/test_type \
|
|
+ zpool.d/test_status \
|
|
+ zpool.d/test_progress \
|
|
+ zpool.d/test_ended
|
|
|
|
zpoolconfdefaults = \
|
|
enc \
|
|
@@ -102,7 +107,12 @@ zpoolconfdefaults = \
|
|
nvme_err \
|
|
pwr_cyc \
|
|
upath \
|
|
- vendor
|
|
+ vendor \
|
|
+ smart_test \
|
|
+ test_type \
|
|
+ test_status \
|
|
+ test_progress \
|
|
+ test_ended
|
|
|
|
install-data-hook:
|
|
$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
|
|
diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart
|
|
index 4bc3af39d..64b5f6e4e 100755
|
|
--- a/cmd/zpool/zpool.d/smart
|
|
+++ b/cmd/zpool/zpool.d/smart
|
|
@@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA).
|
|
pwr_cyc: Show SMART power cycle count (ATA).
|
|
serial: Show disk serial number.
|
|
nvme_err: Show SMART NVMe errors (NVMe).
|
|
+smart_test: Show SMART self-test results summary.
|
|
+test_type: Show SMART self-test type (short, long... ).
|
|
+test_status: Show SMART self-test status.
|
|
+test_progress: Show SMART self-test percentage done.
|
|
+test_ended: Show when the last SMART self-test ended (if supported).
|
|
"
|
|
|
|
+# Hack for developer testing
|
|
+#
|
|
+# If you set $samples to a directory containing smartctl output text files,
|
|
+# we will use them instead of running smartctl on the vdevs. This can be
|
|
+# useful if you want to test a bunch of different smartctl outputs. Also, if
|
|
+# $samples is set, an additional 'file' column is added to the zpool output
|
|
+# showing the filename.
|
|
+samples=
|
|
+
|
|
+# get_filename_from_dir DIR
|
|
+#
|
|
+# Look in directory DIR and return a filename from it. The filename returned
|
|
+# is chosen quasi-sequentially (based off our PID). This allows us to return
|
|
+# a different filename every time this script is invoked (which we do for each
|
|
+# vdev), without having to maintain state.
|
|
+get_filename_from_dir()
|
|
+{
|
|
+ dir=$1
|
|
+ pid="$$"
|
|
+ num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
|
|
+ mod=$((pid % num_files))
|
|
+ i=0
|
|
+ find "$dir" -type f -printf "%f\n" | while read -r file ; do
|
|
+ if [ "$mod" = "$i" ] ; then
|
|
+ echo "$file"
|
|
+ break
|
|
+ fi
|
|
+ i=$((i+1))
|
|
+ done
|
|
+}
|
|
+
|
|
script=$(basename "$0")
|
|
|
|
if [ "$1" = "-h" ] ; then
|
|
@@ -35,8 +71,16 @@ fi
|
|
|
|
smartctl_path=$(which smartctl)
|
|
|
|
-if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
|
- raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
|
|
+if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
|
|
+ if [ -n "$samples" ] ; then
|
|
+ # cat a smartctl output text file instead of running smartctl
|
|
+ # on a vdev (only used for developer testing).
|
|
+ file=$(get_filename_from_dir $samples)
|
|
+ echo "file=$file"
|
|
+ raw_out=$(cat "$samples/$file")
|
|
+ else
|
|
+ raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
|
|
+ fi
|
|
|
|
# What kind of drive are we? Look for the right line in smartctl:
|
|
#
|
|
@@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
|
# NVMe:
|
|
# SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
|
|
#
|
|
- type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$')
|
|
out=$(echo "$raw_out" | awk '
|
|
# SAS specific
|
|
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
|
|
@@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
|
/Elements in grown defect list/{print "defect="$6}
|
|
|
|
# SAS common
|
|
+/SAS/{type="sas"}
|
|
/Drive Temperature:/{print "temp="$4}
|
|
# Status can be a long string, substitute spaces for '_'
|
|
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
|
|
-/number of hours powered up/{print "hours_on="$7}
|
|
+/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
|
|
/Serial number:/{print "serial="$3}
|
|
|
|
# SATA specific
|
|
@@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
|
/Power_Cycle_Count/{print "pwr_cyc="$10}
|
|
|
|
# SATA common
|
|
+/SATA/{type="sata"}
|
|
/Temperature_Celsius/{print "temp="$10}
|
|
/Airflow_Temperature_Cel/{print "temp="$10}
|
|
+/Current Temperature:/{print "temp="$3}
|
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
|
-/Power_On_Hours/{print "hours_on="$10}
|
|
+/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
|
|
/Serial Number:/{print "serial="$3}
|
|
|
|
# NVMe common
|
|
+/NVMe/{type="nvme"}
|
|
/Temperature:/{print "temp="$2}
|
|
/SMART overall-health self-assessment test result:/{print "health="$6}
|
|
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
|
|
@@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
|
|
# NVMe specific
|
|
/Media and Data Integrity Errors:/{print "nvme_err="$6}
|
|
|
|
-END {ORS="\n"; print ""}
|
|
+# SMART self-test info
|
|
+/Self-test execution status:/{progress=tolower($4)} # SAS
|
|
+/SMART Self-test log/{test_seen=1} # SAS
|
|
+/SMART Extended Self-test Log/{test_seen=1} # SATA
|
|
+/# 1/{
|
|
+ test_type=tolower($3"_"$4);
|
|
+ # Status could be one word ("Completed") or multiple ("Completed: read
|
|
+ # failure"). Look for the ":" to see if we need to grab more words.
|
|
+
|
|
+ if ($5 ~ ":")
|
|
+ status=tolower($5""$6"_"$7)
|
|
+ else
|
|
+ status=tolower($5)
|
|
+ if (status=="self")
|
|
+ status="running";
|
|
+
|
|
+ if (type == "sas") {
|
|
+ hours=int($(NF-4))
|
|
+ } else {
|
|
+ hours=int($(NF-1))
|
|
+ # SATA reports percent remaining, rather than percent done
|
|
+ # Convert it to percent done.
|
|
+ progress=(100-int($(NF-2)))"%"
|
|
+ }
|
|
+ # When we int()-ify "hours", it converts stuff like "NOW" and "-" into
|
|
+ # 0. In those cases, set it to hours_on, so they will cancel out in
|
|
+ # the "hours_ago" calculation later on.
|
|
+ if (hours == 0)
|
|
+ hours=hours_on
|
|
+
|
|
+ if (test_seen) {
|
|
+ print "test="hours_on
|
|
+ print "test_type="test_type
|
|
+ print "test_status="status
|
|
+ print "test_progress="progress
|
|
+ }
|
|
+ # Not all drives report hours_on
|
|
+ if (hours_on && hours) {
|
|
+ total_hours_ago=(hours_on-hours)
|
|
+ days_ago=int(total_hours_ago/24)
|
|
+ hours_ago=(total_hours_ago % 24)
|
|
+ if (days_ago != 0)
|
|
+ ago_str=days_ago"d"
|
|
+ if (hours_ago !=0)
|
|
+ ago_str=ago_str""hours_ago"h"
|
|
+ print "test_ended="ago_str
|
|
+ }
|
|
+}
|
|
+
|
|
+END {print "type="type; ORS="\n"; print ""}
|
|
');
|
|
fi
|
|
+type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
|
|
|
|
-# if type is not set by now, either we don't have a block device
|
|
-# or smartctl failed. Either way, default to ATA and set out to
|
|
-# nothing
|
|
+# If type is not set by now, either we don't have a block device
|
|
+# or smartctl failed. Either way, default to "sata" and set $out to
|
|
+# nothing.
|
|
if [ -z "$type" ]; then
|
|
- type="ATA"
|
|
+ type="sata"
|
|
out=
|
|
fi
|
|
|
|
case $script in
|
|
smart)
|
|
# Print temperature plus common predictors of drive failure
|
|
- if [ "$type" = "SAS" ] ; then
|
|
+ if [ "$type" = "sas" ] ; then
|
|
scripts="temp|health|r_ucor|w_ucor"
|
|
- elif [ "$type" = "ATA" ] ; then
|
|
+ elif [ "$type" = "sata" ] ; then
|
|
scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
|
|
- elif [ "$type" = "NVMe" ] ; then
|
|
+ elif [ "$type" = "nvme" ] ; then
|
|
scripts="temp|health|nvme_err"
|
|
fi
|
|
;;
|
|
smartx)
|
|
# Print some other interesting stats
|
|
- if [ "$type" = "SAS" ] ; then
|
|
+ if [ "$type" = "sas" ] ; then
|
|
scripts="hours_on|defect|nonmed|r_proc|w_proc"
|
|
- elif [ "$type" = "ATA" ] ; then
|
|
+ elif [ "$type" = "sata" ] ; then
|
|
scripts="hours_on|pwr_cyc"
|
|
- elif [ "$type" = "NVMe" ] ; then
|
|
+ elif [ "$type" = "nvme" ] ; then
|
|
scripts="hours_on|pwr_cyc"
|
|
fi
|
|
;;
|
|
+smart_test)
|
|
+ scripts="test_type|test_status|test_progress|test_ended"
|
|
+ ;;
|
|
*)
|
|
scripts="$script"
|
|
esac
|
|
diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test
|
|
new file mode 120000
|
|
index 000000000..94f22861f
|
|
--- /dev/null
|
|
+++ b/cmd/zpool/zpool.d/smart_test
|
|
@@ -0,0 +1 @@
|
|
+smart
|
|
\ No newline at end of file
|
|
diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended
|
|
new file mode 120000
|
|
index 000000000..94f22861f
|
|
--- /dev/null
|
|
+++ b/cmd/zpool/zpool.d/test_ended
|
|
@@ -0,0 +1 @@
|
|
+smart
|
|
\ No newline at end of file
|
|
diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress
|
|
new file mode 120000
|
|
index 000000000..94f22861f
|
|
--- /dev/null
|
|
+++ b/cmd/zpool/zpool.d/test_progress
|
|
@@ -0,0 +1 @@
|
|
+smart
|
|
\ No newline at end of file
|
|
diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status
|
|
new file mode 120000
|
|
index 000000000..94f22861f
|
|
--- /dev/null
|
|
+++ b/cmd/zpool/zpool.d/test_status
|
|
@@ -0,0 +1 @@
|
|
+smart
|
|
\ No newline at end of file
|
|
diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type
|
|
new file mode 120000
|
|
index 000000000..94f22861f
|
|
--- /dev/null
|
|
+++ b/cmd/zpool/zpool.d/test_type
|
|
@@ -0,0 +1 @@
|
|
+smart
|
|
\ No newline at end of file
|
|
--
|
|
2.14.2
|
|
|