mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	Add SMART self-test results to zpool status -c
Add in SMART self-test results to zpool status|iostat -c. This works for both SAS and SATA drives. Also, add plumbing to allow the 'smart' script to take smartctl output from a directory of output text files instead of running it against the vdevs. Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #7178
This commit is contained in:
		
							parent
							
								
									99920d823e
								
							
						
					
					
						commit
						5e3085e360
					
				@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \
 | 
			
		||||
	zpool.d/nvme_err \
 | 
			
		||||
	zpool.d/pwr_cyc \
 | 
			
		||||
	zpool.d/upath \
 | 
			
		||||
	zpool.d/vendor
 | 
			
		||||
	zpool.d/vendor \
 | 
			
		||||
	zpool.d/smart_test \
 | 
			
		||||
	zpool.d/test_type \
 | 
			
		||||
	zpool.d/test_status \
 | 
			
		||||
	zpool.d/test_progress \
 | 
			
		||||
	zpool.d/test_ended
 | 
			
		||||
 | 
			
		||||
zpoolconfdefaults = \
 | 
			
		||||
	enc \
 | 
			
		||||
@ -102,7 +107,12 @@ zpoolconfdefaults = \
 | 
			
		||||
	nvme_err \
 | 
			
		||||
	pwr_cyc \
 | 
			
		||||
	upath \
 | 
			
		||||
	vendor
 | 
			
		||||
	vendor \
 | 
			
		||||
	smart_test \
 | 
			
		||||
	test_type \
 | 
			
		||||
	test_status \
 | 
			
		||||
	test_progress \
 | 
			
		||||
	test_ended
 | 
			
		||||
 | 
			
		||||
install-data-hook:
 | 
			
		||||
	$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
 | 
			
		||||
 | 
			
		||||
@ -24,8 +24,44 @@ ata_err:	Show SMART ATA errors (ATA).
 | 
			
		||||
pwr_cyc:	Show SMART power cycle count (ATA).
 | 
			
		||||
serial:		Show disk serial number.
 | 
			
		||||
nvme_err:	Show SMART NVMe errors (NVMe).
 | 
			
		||||
smart_test:	Show SMART self-test results summary.
 | 
			
		||||
test_type:	Show SMART self-test type (short, long... ).
 | 
			
		||||
test_status:	Show SMART self-test status.
 | 
			
		||||
test_progress:	Show SMART self-test percentage done.
 | 
			
		||||
test_ended:	Show when the last SMART self-test ended (if supported).
 | 
			
		||||
"
 | 
			
		||||
 | 
			
		||||
# Hack for developer testing
 | 
			
		||||
#
 | 
			
		||||
# If you set $samples to a directory containing smartctl output text files,
 | 
			
		||||
# we will use them instead of running smartctl on the vdevs.  This can be
 | 
			
		||||
# useful if you want to test a bunch of different smartctl outputs.  Also, if
 | 
			
		||||
# $samples is set, an additional 'file' column is added to the zpool output
 | 
			
		||||
# showing the filename.
 | 
			
		||||
samples=
 | 
			
		||||
 | 
			
		||||
# get_filename_from_dir DIR
#
# Look in directory DIR and print the name of one regular file that sits
# directly inside it (subdirectories are not searched).  The file is chosen
# quasi-sequentially (based off our PID).  This allows us to return a
# different filename every time this script is invoked (which we do for each
# vdev), without having to maintain state.
#
# Prints nothing and returns 1 if DIR contains no regular files.
get_filename_from_dir()
{
	dir=$1
	pid="$$"
	num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)

	# With no files to choose from, the modulo below would divide by
	# zero, so bail out early instead.
	if [ "$num_files" -eq 0 ] ; then
		return 1
	fi

	mod=$((pid % num_files))
	i=0
	# Use the same -maxdepth as the count above so that $mod always
	# indexes into exactly the list that is being walked here.
	find "$dir" -maxdepth 1 -type f -printf "%f\n" | while read -r file ; do
		if [ "$mod" = "$i" ] ; then
			echo "$file"
			break
		fi
		i=$((i+1))
	done
}
 | 
			
		||||
 | 
			
		||||
script=$(basename "$0")
 | 
			
		||||
 | 
			
		||||
if [ "$1" = "-h" ] ; then
 | 
			
		||||
@ -35,8 +71,16 @@ fi
 | 
			
		||||
 | 
			
		||||
smartctl_path=$(which smartctl)
 | 
			
		||||
 | 
			
		||||
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
 | 
			
		||||
	raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
 | 
			
		||||
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
 | 
			
		||||
	if [ -n "$samples" ] ; then
 | 
			
		||||
		# cat a smartctl output text file instead of running smartctl
 | 
			
		||||
		# on a vdev (only used for developer testing).
 | 
			
		||||
		file=$(get_filename_from_dir $samples)
 | 
			
		||||
		echo "file=$file"
 | 
			
		||||
		raw_out=$(cat "$samples/$file")
 | 
			
		||||
	else
 | 
			
		||||
		raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
 | 
			
		||||
	fi
 | 
			
		||||
 | 
			
		||||
	# What kind of drive are we?  Look for the right line in smartctl:
 | 
			
		||||
	#
 | 
			
		||||
@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
 | 
			
		||||
	# NVMe:
 | 
			
		||||
	#       SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
 | 
			
		||||
	#
 | 
			
		||||
	type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$')
 | 
			
		||||
	out=$(echo "$raw_out" | awk '
 | 
			
		||||
# SAS specific
 | 
			
		||||
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
 | 
			
		||||
@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
 | 
			
		||||
/Elements in grown defect list/{print "defect="$6}
 | 
			
		||||
 | 
			
		||||
# SAS common
 | 
			
		||||
/SAS/{type="sas"}
 | 
			
		||||
/Drive Temperature:/{print "temp="$4}
 | 
			
		||||
# Status can be a long string, substitute spaces for '_'
 | 
			
		||||
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
 | 
			
		||||
/number of hours powered up/{print "hours_on="$7}
 | 
			
		||||
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
 | 
			
		||||
/Serial number:/{print "serial="$3}
 | 
			
		||||
 | 
			
		||||
# SATA specific
 | 
			
		||||
@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
 | 
			
		||||
/Power_Cycle_Count/{print "pwr_cyc="$10}
 | 
			
		||||
 | 
			
		||||
# SATA common
 | 
			
		||||
/SATA/{type="sata"}
 | 
			
		||||
/Temperature_Celsius/{print "temp="$10}
 | 
			
		||||
/Airflow_Temperature_Cel/{print "temp="$10}
 | 
			
		||||
/Current Temperature:/{print "temp="$3}
 | 
			
		||||
/SMART overall-health self-assessment test result:/{print "health="$6}
 | 
			
		||||
/Power_On_Hours/{print "hours_on="$10}
 | 
			
		||||
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
 | 
			
		||||
/Serial Number:/{print "serial="$3}
 | 
			
		||||
 | 
			
		||||
# NVMe common
 | 
			
		||||
/NVMe/{type="nvme"}
 | 
			
		||||
/Temperature:/{print "temp="$2}
 | 
			
		||||
/SMART overall-health self-assessment test result:/{print "health="$6}
 | 
			
		||||
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
 | 
			
		||||
@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
 | 
			
		||||
# NVMe specific
 | 
			
		||||
/Media and Data Integrity Errors:/{print "nvme_err="$6}
 | 
			
		||||
 | 
			
		||||
END {ORS="\n"; print ""}
 | 
			
		||||
# SMART self-test info
 | 
			
		||||
/Self-test execution status:/{progress=tolower($4)} # SAS
 | 
			
		||||
/SMART Self-test log/{test_seen=1} # SAS
 | 
			
		||||
/SMART Extended Self-test Log/{test_seen=1} # SATA
 | 
			
		||||
/# 1/{
 | 
			
		||||
	test_type=tolower($3"_"$4);
 | 
			
		||||
	# Status could be one word ("Completed") or multiple ("Completed: read
 | 
			
		||||
	# failure").  Look for the ":" to see if we need to grab more words.
 | 
			
		||||
 | 
			
		||||
	if ($5 ~ ":")
 | 
			
		||||
		status=tolower($5""$6"_"$7)
 | 
			
		||||
	else
 | 
			
		||||
		status=tolower($5)
 | 
			
		||||
	if (status=="self")
 | 
			
		||||
		status="running";
 | 
			
		||||
 | 
			
		||||
	if (type == "sas") {
 | 
			
		||||
		hours=int($(NF-4))
 | 
			
		||||
	} else {
 | 
			
		||||
		hours=int($(NF-1))
 | 
			
		||||
		# SATA reports percent remaining, rather than percent done
 | 
			
		||||
		# Convert it to percent done.
 | 
			
		||||
		progress=(100-int($(NF-2)))"%"
 | 
			
		||||
	}
 | 
			
		||||
	# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
 | 
			
		||||
	# 0.  In those cases, set it to hours_on, so they will cancel out in
 | 
			
		||||
	# the "hours_ago" calculation later on.
 | 
			
		||||
	if (hours == 0)
 | 
			
		||||
		hours=hours_on
 | 
			
		||||
 | 
			
		||||
	if (test_seen) {
 | 
			
		||||
		print "test="hours_on
 | 
			
		||||
		print "test_type="test_type
 | 
			
		||||
		print "test_status="status
 | 
			
		||||
		print "test_progress="progress
 | 
			
		||||
	}
 | 
			
		||||
	# Not all drives report hours_on
 | 
			
		||||
	if (hours_on && hours) {
 | 
			
		||||
		total_hours_ago=(hours_on-hours)
 | 
			
		||||
		days_ago=int(total_hours_ago/24)
 | 
			
		||||
		hours_ago=(total_hours_ago % 24)
 | 
			
		||||
		if (days_ago != 0)
 | 
			
		||||
			ago_str=days_ago"d"
 | 
			
		||||
		if (hours_ago !=0)
 | 
			
		||||
			ago_str=ago_str""hours_ago"h"
 | 
			
		||||
		print "test_ended="ago_str
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
END {print "type="type; ORS="\n"; print ""}
 | 
			
		||||
');
 | 
			
		||||
fi
 | 
			
		||||
type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
 | 
			
		||||
 | 
			
		||||
# if type is not set by now, either we don't have a block device
 | 
			
		||||
# or smartctl failed. Either way, default to ATA and set out to
 | 
			
		||||
# nothing
 | 
			
		||||
# If type is not set by now, either we don't have a block device
 | 
			
		||||
# or smartctl failed. Either way, default to ATA and set $out to
 | 
			
		||||
# nothing.
 | 
			
		||||
if [ -z "$type" ]; then
 | 
			
		||||
	type="ATA"
 | 
			
		||||
	type="sata"
 | 
			
		||||
	out=
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
case $script in
 | 
			
		||||
smart)
 | 
			
		||||
	# Print temperature plus common predictors of drive failure
 | 
			
		||||
	if [ "$type" = "SAS" ] ; then
 | 
			
		||||
	if [ "$type" = "sas" ] ; then
 | 
			
		||||
		scripts="temp|health|r_ucor|w_ucor"
 | 
			
		||||
	elif [ "$type" = "ATA" ] ; then
 | 
			
		||||
	elif [ "$type" = "sata" ] ; then
 | 
			
		||||
		scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
 | 
			
		||||
	elif [ "$type" = "NVMe" ] ; then
 | 
			
		||||
	elif [ "$type" = "nvme" ] ; then
 | 
			
		||||
		scripts="temp|health|nvme_err"
 | 
			
		||||
	fi
 | 
			
		||||
	;;
 | 
			
		||||
smartx)
 | 
			
		||||
	# Print some other interesting stats
 | 
			
		||||
	if [ "$type" = "SAS" ] ; then
 | 
			
		||||
	if [ "$type" = "sas" ] ; then
 | 
			
		||||
		scripts="hours_on|defect|nonmed|r_proc|w_proc"
 | 
			
		||||
	elif [ "$type" = "ATA" ] ; then
 | 
			
		||||
	elif [ "$type" = "sata" ] ; then
 | 
			
		||||
		scripts="hours_on|pwr_cyc"
 | 
			
		||||
	elif [ "$type" = "NVMe" ] ; then
 | 
			
		||||
	elif [ "$type" = "nvme" ] ; then
 | 
			
		||||
		scripts="hours_on|pwr_cyc"
 | 
			
		||||
	fi
 | 
			
		||||
	;;
 | 
			
		||||
smart_test)
 | 
			
		||||
	scripts="test_type|test_status|test_progress|test_ended"
 | 
			
		||||
	;;
 | 
			
		||||
*)
 | 
			
		||||
	scripts="$script"
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										1
									
								
								cmd/zpool/zpool.d/smart_test
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								cmd/zpool/zpool.d/smart_test
									
									
									
									
									
										Symbolic link
									
								
							@ -0,0 +1 @@
 | 
			
		||||
smart
 | 
			
		||||
							
								
								
									
										1
									
								
								cmd/zpool/zpool.d/test_ended
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								cmd/zpool/zpool.d/test_ended
									
									
									
									
									
										Symbolic link
									
								
							@ -0,0 +1 @@
 | 
			
		||||
smart
 | 
			
		||||
							
								
								
									
										1
									
								
								cmd/zpool/zpool.d/test_progress
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								cmd/zpool/zpool.d/test_progress
									
									
									
									
									
										Symbolic link
									
								
							@ -0,0 +1 @@
 | 
			
		||||
smart
 | 
			
		||||
							
								
								
									
										1
									
								
								cmd/zpool/zpool.d/test_status
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								cmd/zpool/zpool.d/test_status
									
									
									
									
									
										Symbolic link
									
								
							@ -0,0 +1 @@
 | 
			
		||||
smart
 | 
			
		||||
							
								
								
									
										1
									
								
								cmd/zpool/zpool.d/test_type
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								cmd/zpool/zpool.d/test_type
									
									
									
									
									
										Symbolic link
									
								
							@ -0,0 +1 @@
 | 
			
		||||
smart
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user