# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2015, 2021 by Delphix. All rights reserved.
# Copyright (c) 2016, Intel Corporation.
#

. "$STF_SUITE"/include/libtest.shlib

# Defaults common to all the tests in the regression group
export PERF_RUNTIME=${PERF_RUNTIME:-'180'}
export PERF_RANDSEED=${PERF_RANDSEED:-'1234'}
export PERF_COMPPERCENT=${PERF_COMPPERCENT:-'66'}
export PERF_COMPCHUNK=${PERF_COMPCHUNK:-'4096'}

# Default to JSON for fio output
export PERF_FIO_FORMAT=${PERF_FIO_FORMAT:-'json'}

# Default fs creation options
export PERF_FS_OPTS=${PERF_FS_OPTS:-'-o recsize=8k -o compress=lz4' \
    ' -o checksum=sha256 -o redundant_metadata=most'}

function get_sync_str
{
	typeset sync=$1
	typeset sync_str=''

	[[ $sync -eq 0 ]] && sync_str='async'
	[[ $sync -eq 1 ]] && sync_str='sync'
	echo $sync_str
}

function get_suffix
{
	typeset threads=$1
	typeset sync=$2
	typeset iosize=$3

	typeset sync_str=$(get_sync_str "$sync")
	typeset filesystems=$(get_nfilesystems)

	typeset suffix="$sync_str.$iosize-ios"
	suffix="$suffix.$threads-threads.$filesystems-filesystems"
	echo "$suffix"
}

function do_fio_run_impl
{
	typeset script=$1
	typeset do_recreate=$2
	typeset clear_cache=$3

	typeset threads=$4
	typeset threads_per_fs=$5
	typeset sync=$6
	typeset iosize=$7

	typeset sync_str=$(get_sync_str "$sync")
	log_note "Running with $threads $sync_str threads, $iosize ios"

	if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then
		log_must test "$do_recreate"
		verify_threads_per_fs "$threads" "$threads_per_fs"
	fi

	if $do_recreate; then
		recreate_perf_pool

		#
		# A value of zero for "threads_per_fs" is "special", and
		# means a single filesystem should be used, regardless
		# of the number of threads.
		#
		if [[ -n $threads_per_fs && $threads_per_fs -ne 0 ]]; then
			populate_perf_filesystems $((threads / threads_per_fs))
		else
			populate_perf_filesystems 1
		fi
	fi

	if $clear_cache; then
		# Clear the ARC
		log_must zinject -a
	fi

	if [[ -n $ZINJECT_DELAYS ]]; then
		apply_zinject_delays
	else
		log_note "No per-device commands to execute."
	fi

	#
	# Allow this to be overridden by the individual test case. This
	# can be used to run the FIO job against something other than
	# the default filesystem (e.g. against a clone).
	#
	export DIRECTORY=$(get_directory)
	log_note "DIRECTORY: $DIRECTORY"

	export RUNTIME=$PERF_RUNTIME
	export RANDSEED=$PERF_RANDSEED
	export COMPPERCENT=$PERF_COMPPERCENT
	export COMPCHUNK=$PERF_COMPCHUNK
	export FILESIZE=$((TOTAL_SIZE / threads))
	export NUMJOBS=$threads
	export SYNC_TYPE=$sync
	export BLOCKSIZE=$iosize
	sync

	# When running locally, we want to keep the default behavior of
	# DIRECT == 0, so only set it when we're running over NFS to
	# disable client cache for reads.
	if [[ $NFS -eq 1 ]]; then
		export DIRECT=1
		do_setup_nfs "$script"
	else
		export DIRECT=0
	fi

	# This will be part of the output filename.
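	# For example (hypothetical values), 16 sync threads doing 8k I/Os
	# against a single filesystem produce the suffix
	# "sync.8k-ios.16-threads.1-filesystems".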
	typeset suffix=$(get_suffix "$threads" "$sync" "$iosize")

	# Start the data collection
	do_collect_scripts "$suffix"

	# Define output file
	typeset logbase="$(get_perf_output_dir)/$(basename \
	    "$SUDO_COMMAND")"
	typeset outfile="$logbase.fio.$suffix"

	# Start the load
	if [[ $NFS -eq 1 ]]; then
		log_must ssh -t "$NFS_USER@$NFS_CLIENT" "
			fio --output-format=${PERF_FIO_FORMAT} \
			    --output /tmp/fio.out /tmp/test.fio
		"
		log_must scp "$NFS_USER@$NFS_CLIENT":/tmp/fio.out "$outfile"
		log_must ssh -t "$NFS_USER@$NFS_CLIENT" "sudo -S umount $NFS_MOUNT"
	else
		log_must fio --output-format="${PERF_FIO_FORMAT}" \
		    --output "$outfile" "$FIO_SCRIPTS/$script"
	fi
}

#
# This function will run fio in a loop, according to the .fio file passed
# in and a number of environment variables. The following variables can be
# set before launching zfstest to override the defaults.
#
# PERF_RUNTIME: The time in seconds each fio invocation should run.
# PERF_NTHREADS: A list of how many threads each fio invocation will use.
# PERF_NTHREADS_PER_FS: A list of how many threads to run per filesystem;
#    a value of 0 means all threads share a single filesystem.
# PERF_SYNC_TYPES: Whether to use (O_SYNC) or not. 1 is sync IO, 0 is async IO.
# PERF_IOSIZES: A list of blocksizes in which each fio invocation will do IO.
# PERF_COLLECT_SCRIPTS: A comma delimited list of 'command args, logfile_tag'
#    pairs that will be added to the scripts specified in each test.
#
function do_fio_run
{
	typeset script=$1
	typeset do_recreate=$2
	typeset clear_cache=$3
	typeset threads threads_per_fs sync iosize

	for threads in $PERF_NTHREADS; do
		for threads_per_fs in $PERF_NTHREADS_PER_FS; do
			for sync in $PERF_SYNC_TYPES; do
				for iosize in $PERF_IOSIZES; do
					do_fio_run_impl \
					    "$script" \
					    "$do_recreate" \
					    "$clear_cache" \
					    "$threads" \
					    "$threads_per_fs" \
					    "$sync" \
					    "$iosize"
				done
			done
		done
	done
}

#
# This function sets up the NFS mount on the client and makes sure all the
# correct permissions are in place.
#
function do_setup_nfs
{
	typeset script=$1
	zfs set sharenfs=on "$TESTFS"
	log_must chmod -R 777 /"$TESTFS"

	ssh -t "$NFS_USER@$NFS_CLIENT" "mkdir -m 777 -p $NFS_MOUNT"
	ssh -t "$NFS_USER@$NFS_CLIENT" "sudo -S umount $NFS_MOUNT"
	log_must ssh -t "$NFS_USER@$NFS_CLIENT" "
		sudo -S mount $NFS_OPTIONS $NFS_SERVER:/$TESTFS $NFS_MOUNT
	"

	#
	# The variables in the fio script are only available in our current
	# shell session, so we have to evaluate them here before copying
	# the resulting script over to the target machine.
	#
	export jobnum='$jobnum'
	while read line; do
		eval echo "$line"
	done < "$FIO_SCRIPTS/$script" > /tmp/test.fio
	log_must sed -i -e "s%directory.*%directory=$NFS_MOUNT%" /tmp/test.fio
	log_must scp /tmp/test.fio "$NFS_USER@$NFS_CLIENT":/tmp
	log_must rm /tmp/test.fio
}

#
# This function iterates through the value pairs in $PERF_COLLECT_SCRIPTS.
# The script at index N is launched in the background, with its output
# redirected to a logfile containing the tag specified at index N + 1.
#
function do_collect_scripts
{
	typeset suffix=$1

	[[ -n $collect_scripts ]] || log_fail "No data collection scripts."
	[[ -n $PERF_RUNTIME ]] || log_fail "No runtime specified."

	# Add in user supplied scripts and logfiles, if any.
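	# A minimal sketch of the expected format (hypothetical commands):
	#
	#   PERF_COLLECT_SCRIPTS="vmstat 1, vmstat, iostat -x 1, iostat"
	#
	# launches "vmstat 1" and "iostat -x 1" in the background, logging
	# to files tagged "vmstat" and "iostat" respectively.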
	typeset oIFS=$IFS
	IFS=','
	for item in $PERF_COLLECT_SCRIPTS; do
		collect_scripts+=($(echo "$item" | sed 's/^ *//g'))
	done
	IFS=$oIFS

	typeset idx=0
	while [[ $idx -lt "${#collect_scripts[@]}" ]]; do
		typeset logbase="$(get_perf_output_dir)/$(basename \
		    "$SUDO_COMMAND")"
		typeset outfile="$logbase.${collect_scripts[$idx + 1]}.$suffix"

		# Leave ${collect_scripts[$idx]} unquoted so the collection
		# command is split into the command name and its arguments.
		timeout "$PERF_RUNTIME" ${collect_scripts[$idx]} >"$outfile" 2>&1 &
		((idx += 2))
	done

	# Need to explicitly return 0 because timeout(1) will kill
	# a child process and cause us to return non-zero.
	return 0
}

# Find a place to deposit performance data collected while under load.
function get_perf_output_dir
{
	typeset dir="$PWD/perf_data"
	[[ -d $dir ]] || mkdir -p "$dir"

	echo "$dir"
}

function apply_zinject_delays
{
	typeset idx=0
	while [[ $idx -lt "${#ZINJECT_DELAYS[@]}" ]]; do
		[[ -n ${ZINJECT_DELAYS[$idx]} ]] || \
		    log_fail "No zinject delay found at index: $idx"

		for disk in $DISKS; do
			log_must zinject \
			    -d "$disk" -D "${ZINJECT_DELAYS[$idx]}" "$PERFPOOL"
		done

		((idx += 1))
	done
}

function clear_zinject_delays
{
	log_must zinject -c all
}

#
# Destroy and create the pool used for performance tests.
#
function recreate_perf_pool
{
	[[ -n $PERFPOOL ]] || log_fail "The \$PERFPOOL variable isn't set."

	#
	# In case there's been some "leaked" zinject delays, or if the
	# performance test injected some delays itself, we clear all
	# delays before attempting to destroy the pool. Each delay
	# places a hold on the pool, so the destroy will fail if there
	# are any outstanding delays.
	#
	clear_zinject_delays

	#
	# This function handles the case where the pool already exists,
	# and will destroy the previous pool and recreate a new pool.
	#
	create_pool "$PERFPOOL" "$DISKS"
}

function verify_threads_per_fs
{
	typeset threads=$1
	typeset threads_per_fs=$2

	log_must test -n "$threads"
	log_must test -n "$threads_per_fs"

	#
	# A value of "0" is treated as a "special value", and it is
	# interpreted to mean all threads will run using a single
	# filesystem.
	#
	[[ $threads_per_fs -eq 0 ]] && return

	#
	# The number of threads per filesystem must be a value greater
	# than or equal to zero; since we just verified the value isn't
	# 0 above, then it must be greater than zero here.
	#
	log_must test "$threads_per_fs" -ge 0

	#
	# This restriction can be lifted later if needed, but for now,
	# we restrict the number of threads per filesystem to a value
	# that evenly divides the thread count. This way, the threads
	# will be evenly distributed over all the filesystems.
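	#
	# For example (hypothetical values): threads=12 with
	# threads_per_fs=4 passes and yields 12 / 4 = 3 filesystems,
	# while threads_per_fs=5 would fail the check below.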
	#
	log_must test $((threads % threads_per_fs)) -eq 0
}

function populate_perf_filesystems
{
	typeset nfilesystems=${1:-1}

	export TESTFS=""
	for i in $(seq 1 "$nfilesystems"); do
		typeset dataset="$PERFPOOL/fs$i"
		create_dataset "$dataset" "$PERF_FS_OPTS"
		if [[ -z "$TESTFS" ]]; then
			TESTFS="$dataset"
		else
			TESTFS="$TESTFS $dataset"
		fi
	done
}

function get_nfilesystems
{
	typeset filesystems=($TESTFS)
	echo ${#filesystems[@]}
}

function get_directory
{
	typeset filesystems=($TESTFS)
	typeset directory=

	typeset idx=0
	while [[ $idx -lt "${#filesystems[@]}" ]]; do
		mountpoint=$(get_prop mountpoint "${filesystems[$idx]}")

		if [[ -n $directory ]]; then
			directory=$directory:$mountpoint
		else
			directory=$mountpoint
		fi

		((idx += 1))
	done

	echo "$directory"
}

function get_min_arc_size
{
	case "$UNAME" in
	Linux)
		awk '$1 == "c_min" { print $3 }' /proc/spl/kstat/zfs/arcstats
		;;
	FreeBSD)
		sysctl -n kstat.zfs.misc.arcstats.c_min
		;;
	*)
		dtrace -qn 'BEGIN {
		    printf("%u\n", `arc_stats.arcstat_c_min.value.ui64);
		    exit(0);
		}'
		;;
	esac || log_fail "get_min_arc_size failed"
}

function get_max_arc_size
{
	case "$UNAME" in
	Linux)
		awk '$1 == "c_max" { print $3 }' /proc/spl/kstat/zfs/arcstats
		;;
	FreeBSD)
		sysctl -n kstat.zfs.misc.arcstats.c_max
		;;
	*)
		dtrace -qn 'BEGIN {
		    printf("%u\n", `arc_stats.arcstat_c_max.value.ui64);
		    exit(0);
		}'
		;;
	esac || log_fail "get_max_arc_size failed"
}

function get_arc_target
{
	case "$UNAME" in
	Linux)
		awk '$1 == "c" { print $3 }' /proc/spl/kstat/zfs/arcstats
		;;
	FreeBSD)
		sysctl -n kstat.zfs.misc.arcstats.c
		;;
	*)
		dtrace -qn 'BEGIN {
		    printf("%u\n", `arc_stats.arcstat_c.value.ui64);
		    exit(0);
		}'
		;;
	esac || log_fail "get_arc_target failed"
}

function get_dbuf_cache_size
{
	typeset -l dbuf_cache_size dbuf_cache_shift

	if is_illumos; then
		dbuf_cache_size=$(dtrace -qn 'BEGIN {
		    printf("%u\n", `dbuf_cache_max_bytes);
		    exit(0);
		}')
	else
		dbuf_cache_shift=$(get_tunable DBUF_CACHE_SHIFT)
		dbuf_cache_size=$(($(get_arc_target) / 2**dbuf_cache_shift))
	fi || log_fail "get_dbuf_cache_size failed"

	echo "$dbuf_cache_size"
}

# Create a file with some information about how this system is configured.
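# The file is JSON; on Linux the first few fields might look like this
# (a sketch with hypothetical values):
#
#   {
#     "ncpus": "8",
#     "physmem": "16656101376",
#     "c_max": "8328050688",
#     "hostname": "perfhost",
#     ...
#   }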
function get_system_config
{
	typeset config=$PERF_DATA_DIR/$1

	echo "{" >>"$config"
	if is_linux; then
		echo "  \"ncpus\": \"$(lscpu | awk '/^CPU\(s\)/ {print $2; exit}')\"," >>"$config"
		echo "  \"physmem\": \"$(free -b | \
		    awk '$1 == "Mem:" { print $2 }')\"," >>"$config"
		echo "  \"c_max\": \"$(get_max_arc_size)\"," >>"$config"
		echo "  \"hostname\": \"$(uname -n)\"," >>"$config"
		echo "  \"kernel version\": \"$(uname -sr)\"," >>"$config"
	else
		dtrace -qn 'BEGIN{
		    printf("  \"ncpus\": %d,\n", `ncpus);
		    printf("  \"physmem\": %u,\n", `physmem * `_pagesize);
		    printf("  \"c_max\": %u,\n", `arc_stats.arcstat_c_max.value.ui64);
		    printf("  \"kmem_flags\": \"0x%x\",", `kmem_flags);
		    exit(0)}' >>"$config"
		echo "  \"hostname\": \"$(uname -n)\"," >>"$config"
		echo "  \"kernel version\": \"$(uname -v)\"," >>"$config"
	fi
	if is_linux; then
		lsblk -dino NAME,SIZE | awk 'BEGIN {
		    printf("  \"disks\": {\n"); first = 1}
		    {disk = $1} {size = $2;
		    if (first != 1) {printf(",\n")} else {first = 0}
		    printf("    \"%s\": \"%s\"", disk, size)}
		    END {printf("\n  },\n")}' >>"$config"

		zfs_tunables="/sys/module/zfs/parameters"

		printf "  \"tunables\": {\n" >>"$config"
		for tunable in \
		    zfs_arc_max \
		    zfs_arc_sys_free \
		    zfs_dirty_data_max \
		    zfs_flags \
		    zfs_prefetch_disable \
		    zfs_txg_timeout \
		    zfs_vdev_aggregation_limit \
		    zfs_vdev_async_read_max_active \
		    zfs_vdev_async_write_max_active \
		    zfs_vdev_sync_read_max_active \
		    zfs_vdev_sync_write_max_active \
		    zio_slow_io_ms
		do
			if [ "$tunable" != "zfs_arc_max" ]
			then
				printf ",\n" >>"$config"
			fi
			printf "    \"$tunable\": \"$(<$zfs_tunables/$tunable)\"" \
			    >>"$config"
		done
		printf "\n  }\n" >>"$config"
	else
		iostat -En | awk 'BEGIN {
		    printf("  \"disks\": {\n"); first = 1}
		    /^c/ {disk = $1}
		    /^Size: [^0]/ {size = $2;
		    if (first != 1) {printf(",\n")} else {first = 0}
		    printf("    \"%s\": \"%s\"", disk, size)}
		    END {printf("\n  },\n")}' >>"$config"

		sed -n 's/^set \(.*\)[ ]=[ ]\(.*\)/\1=\2/p' /etc/system | \
		    awk -F= 'BEGIN {printf("  \"system\": {\n"); first = 1}
		    {if (first != 1) {printf(",\n")} else {first = 0};
		    printf("    \"%s\": %s", $1, $2)}
		    END {printf("\n  }\n")}' >>"$config"
	fi
	echo "}" >>"$config"
}

#
# Return the list of LUNs used by the given pool, as a colon-delimited
# string. On illumos this looks like: ":sd3:sd4:sd1:sd2:"
#
function pool_to_lun_list
{
	typeset pool=$1
	typeset ctd ctds devname lun
	typeset lun_list=':'

	case "$UNAME" in
	Linux)
		ctds=$(zpool list -HLv "$pool" | \
		    awk '/sd[a-z]*|loop[0-9]*|dm-[0-9]*/ {print $1}')

		for ctd in $ctds; do
			lun_list="$lun_list$ctd:"
		done
		;;
	FreeBSD)
		lun_list+=$(zpool list -HLv "$pool" | \
		    awk '/a?da[0-9]+|md[0-9]+|mfid[0-9]+|nda[0-9]+|nvd[0-9]+|vtbd[0-9]+/ {
		        printf "%s:", $1 }')
		;;
	*)
		ctds=$(zpool list -v "$pool" |
		    awk '/c[0-9]*t[0-9a-fA-F]*d[0-9]*/ {print $1}')

		for ctd in $ctds; do
			# Get the device name as it appears in /etc/path_to_inst
			devname=$(readlink -f /dev/dsk/"${ctd}"s0 | sed -n \
			    's/\/devices\([^:]*\):.*/\1/p')
			# Add a string composed of the driver name and instance
			# number to the list for comparison with dev_statname.
			lun=$(sed 's/"//g' /etc/path_to_inst | awk -v dn="$devname" \
			    '$0 ~ dn {print $3$2}')
			lun_list="$lun_list$lun:"
		done
		;;
	esac
	echo "$lun_list"
}

function print_perf_settings
{
	echo "PERF_NTHREADS: $PERF_NTHREADS"
	echo "PERF_NTHREADS_PER_FS: $PERF_NTHREADS_PER_FS"
	echo "PERF_SYNC_TYPES: $PERF_SYNC_TYPES"
	echo "PERF_IOSIZES: $PERF_IOSIZES"
}

# Create a perf_data directory to hold performance statistics and
# configuration information.
export PERF_DATA_DIR=$(get_perf_output_dir)
[[ -f $PERF_DATA_DIR/config.json ]] || get_system_config config.json
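
#
# A typical perf test case sources this library and drives a run along
# these lines (a sketch with hypothetical values; the .fio job file must
# exist under $FIO_SCRIPTS):
#
#	export PERF_NTHREADS="8 16"
#	export PERF_NTHREADS_PER_FS="0"
#	export PERF_SYNC_TYPES="0 1"
#	export PERF_IOSIZES="8k 64k"
#	do_fio_run random_reads.fio true false
#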