mirror_zfs/cmd/zilstat.in
Rob Norris 7ddc1f737f
zil: add stats for commit failure/fallback (#16315)
There's no good way to tell when a ZIL commit fails and falls back to a
transaction sync, other than perhaps a throughput drop. This adds
counters so we can see when it happens and why.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
2024-07-25 16:53:59 -07:00

555 lines
16 KiB
Plaintext
Executable File

#!/usr/bin/env @PYTHON_SHEBANG@
#
# Print out statistics for all zil stats. This information is
# available through the zil kstat.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# This script must remain compatible with Python 3.6+.
#
import sys
import subprocess
import time
import copy
import os
import re
import signal
from collections import defaultdict
import argparse
from argparse import RawTextHelpFormatter
cols = {
# hdr: [size, scale, kstat name]
"time": [8, -1, "time"],
"pool": [12, -1, "pool"],
"ds": [12, -1, "dataset_name"],
"obj": [12, -1, "objset"],
"cc": [5, 1000, "zil_commit_count"],
"cwc": [5, 1000, "zil_commit_writer_count"],
"cec": [5, 1000, "zil_commit_error_count"],
"csc": [5, 1000, "zil_commit_stall_count"],
"cSc": [5, 1000, "zil_commit_suspend_count"],
"ic": [5, 1000, "zil_itx_count"],
"iic": [5, 1000, "zil_itx_indirect_count"],
"iib": [5, 1024, "zil_itx_indirect_bytes"],
"icc": [5, 1000, "zil_itx_copied_count"],
"icb": [5, 1024, "zil_itx_copied_bytes"],
"inc": [5, 1000, "zil_itx_needcopy_count"],
"inb": [5, 1024, "zil_itx_needcopy_bytes"],
"idc": [5, 1000, "icc+inc"],
"idb": [5, 1024, "icb+inb"],
"iwc": [5, 1000, "iic+idc"],
"iwb": [5, 1024, "iib+idb"],
"imnc": [6, 1000, "zil_itx_metaslab_normal_count"],
"imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"],
"imnw": [6, 1024, "zil_itx_metaslab_normal_write"],
"imna": [6, 1024, "zil_itx_metaslab_normal_alloc"],
"imsc": [6, 1000, "zil_itx_metaslab_slog_count"],
"imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"],
"imsw": [6, 1024, "zil_itx_metaslab_slog_write"],
"imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"],
"imc": [5, 1000, "imnc+imsc"],
"imb": [5, 1024, "imnb+imsb"],
"imw": [5, 1024, "imnw+imsw"],
"ima": [5, 1024, "imna+imsa"],
"se%": [3, 100, "imb/ima"],
"sen%": [4, 100, "imnb/imna"],
"ses%": [4, 100, "imsb/imsa"],
"te%": [3, 100, "imb/imw"],
"ten%": [4, 100, "imnb/imnw"],
"tes%": [4, 100, "imsb/imsw"],
}
hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]")
curr = {}
diff = {}
kstat = {}
ds_pairs = {}
pool_name = None
dataset_name = None
interval = 0
sep = " "
gFlag = True
dsFlag = False
def prettynum(sz, scale, num=0):
suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
index = 0
save = 0
if scale == -1:
return "%*s" % (sz, num)
# Rounding error, return 0
elif 0 < num < 1:
num = 0
while num > scale and index < 5:
save = num
num = num / scale
index += 1
if index == 0:
return "%*d" % (sz, num)
if (save / scale) < 10:
return "%*.1f%s" % (sz - 1, num, suffix[index])
else:
return "%*d%s" % (sz - 1, num, suffix[index])
def print_header():
global hdr
global sep
for col in hdr:
new_col = col
if interval > 0 and cols[col][1] > 100:
new_col += "/s"
sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep))
sys.stdout.write("\n")
def print_values(v):
global hdr
global sep
for col in hdr:
val = v[cols[col][2]]
if interval > 0 and cols[col][1] > 100:
val = v[cols[col][2]] // interval
sys.stdout.write("%s%s" % (
prettynum(cols[col][0], cols[col][1], val), sep))
sys.stdout.write("\n")
def print_dict(d):
for pool in d:
for objset in d[pool]:
print_values(d[pool][objset])
def detailed_usage():
sys.stderr.write("%s\n" % cmd)
sys.stderr.write("Field definitions are as follows:\n")
for key in cols:
sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
sys.stderr.write("\n")
sys.exit(0)
def init():
global pool_name
global dataset_name
global interval
global hdr
global curr
global gFlag
global sep
curr = dict()
parser = argparse.ArgumentParser(description='Program to print zilstats',
add_help=True,
formatter_class=RawTextHelpFormatter,
epilog="\nUsage Examples\n"\
"Note: Global zilstats is shown by default,"\
" if none of a|p|d option is not provided\n"\
"\tzilstat -a\n"\
'\tzilstat -v\n'\
'\tzilstat -p tank\n'\
'\tzilstat -d tank/d1,tank/d2,tank/zv1\n'\
'\tzilstat -i 1\n'\
'\tzilstat -s \"***\"\n'\
'\tzilstat -f zcwc,zimnb,zimsb\n')
parser.add_argument(
"-v", "--verbose",
action="store_true",
help="List field headers and definitions"
)
pool_grp = parser.add_mutually_exclusive_group()
pool_grp.add_argument(
"-a", "--all",
action="store_true",
dest="all",
help="Print all dataset stats"
)
pool_grp.add_argument(
"-p", "--pool",
type=str,
help="Print stats for all datasets of a speicfied pool"
)
pool_grp.add_argument(
"-d", "--dataset",
type=str,
help="Print given dataset(s) (Comma separated)"
)
parser.add_argument(
"-f", "--columns",
type=str,
help="Specify specific fields to print (see -v)"
)
parser.add_argument(
"-s", "--separator",
type=str,
help="Override default field separator with custom "
"character or string"
)
parser.add_argument(
"-i", "--interval",
type=int,
dest="interval",
help="Print stats between specified interval"
" (in seconds)"
)
parsed_args = parser.parse_args()
if parsed_args.verbose:
detailed_usage()
if parsed_args.all:
gFlag = False
if parsed_args.interval:
interval = parsed_args.interval
if parsed_args.pool:
pool_name = parsed_args.pool
gFlag = False
if parsed_args.dataset:
dataset_name = parsed_args.dataset
gFlag = False
if parsed_args.separator:
sep = parsed_args.separator
if gFlag:
hdr = ghdr
if parsed_args.columns:
hdr = parsed_args.columns.split(",")
invalid = []
for ele in hdr:
if ele not in cols:
invalid.append(ele)
if len(invalid) > 0:
sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
sys.exit(1)
if pool_name and dataset_name:
print ("Error: Can not filter both dataset and pool")
sys.exit(1)
def FileCheck(fname):
try:
return (open(fname))
except IOError:
print ("Unable to open zilstat proc file: " + fname)
sys.exit(1)
if sys.platform.startswith('freebsd'):
# Requires py-sysctl on FreeBSD
import sysctl
def kstat_update(pool = None, objid = None):
global kstat
kstat = {}
if not pool:
file = "kstat.zfs.misc.zil"
k = [ctl for ctl in sysctl.filter(file) \
if ctl.type != sysctl.CTLTYPE_NODE]
kstat_process_str(k, file, "GLOBAL", len(file + "."))
elif objid:
file = "kstat.zfs." + pool + ".dataset.objset-" + objid
k = [ctl for ctl in sysctl.filter(file) if ctl.type \
!= sysctl.CTLTYPE_NODE]
kstat_process_str(k, file, objid, len(file + "."))
else:
file = "kstat.zfs." + pool + ".dataset"
zil_start = len(file + ".")
obj_start = len("kstat.zfs." + pool + ".")
k = [ctl for ctl in sysctl.filter(file)
if ctl.type != sysctl.CTLTYPE_NODE]
for s in k:
if not s or (s.name.find("zil") == -1 and \
s.name.find("dataset_name") == -1):
continue
name, value = s.name, s.value
objid = re.findall(r'0x[0-9A-F]+', \
name[obj_start:], re.I)[0]
if objid not in kstat:
kstat[objid] = dict()
zil_start = len(file + ".objset-" + \
objid + ".")
kstat[objid][name[zil_start:]] = value \
if (name.find("dataset_name")) \
else int(value)
def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0):
global kstat
if not k:
print("Unable to process kstat for: " + file)
sys.exit(1)
kstat[objset] = dict()
for s in k:
if not s or (s.name.find("zil") == -1 and \
s.name.find("dataset_name") == -1):
continue
name, value = s.name, s.value
kstat[objset][name[zil_start:]] = value \
if (name.find("dataset_name")) else int(value)
elif sys.platform.startswith('linux'):
def kstat_update(pool = None, objid = None):
global kstat
kstat = {}
if not pool:
k = [line.strip() for line in \
FileCheck("/proc/spl/kstat/zfs/zil")]
kstat_process_str(k, "/proc/spl/kstat/zfs/zil")
elif objid:
file = "/proc/spl/kstat/zfs/" + pool + "/objset-" + objid
k = [line.strip() for line in FileCheck(file)]
kstat_process_str(k, file, objid)
else:
if not os.path.exists(f"/proc/spl/kstat/zfs/{pool}"):
print("Pool \"" + pool + "\" does not exist, Exitting")
sys.exit(1)
objsets = os.listdir(f'/proc/spl/kstat/zfs/{pool}')
for objid in objsets:
if objid.find("objset-") == -1:
continue
file = "/proc/spl/kstat/zfs/" + pool + "/" + objid
k = [line.strip() for line in FileCheck(file)]
kstat_process_str(k, file, objid.replace("objset-", ""))
def kstat_process_str(k, file, objset = "GLOBAL", zil_start = 0):
global kstat
if not k:
print("Unable to process kstat for: " + file)
sys.exit(1)
kstat[objset] = dict()
for s in k:
if not s or (s.find("zil") == -1 and \
s.find("dataset_name") == -1):
continue
name, unused, value = s.split()
kstat[objset][name] = value \
if (name == "dataset_name") else int(value)
def zil_process_kstat():
global curr, pool_name, dataset_name, dsFlag, ds_pairs
curr.clear()
if gFlag == True:
kstat_update()
zil_build_dict()
else:
if pool_name:
kstat_update(pool_name)
zil_build_dict(pool_name)
elif dataset_name:
if dsFlag == False:
dsFlag = True
datasets = dataset_name.split(',')
ds_pairs = defaultdict(list)
for ds in datasets:
try:
objid = subprocess.check_output(['zfs',
'list', '-Hpo', 'objsetid', ds], \
stderr=subprocess.DEVNULL) \
.decode('utf-8').strip()
except subprocess.CalledProcessError as e:
print("Command: \"zfs list -Hpo objset "\
+ str(ds) + "\" failed with error code:"\
+ str(e.returncode))
print("Please make sure that dataset \""\
+ str(ds) + "\" exists")
sys.exit(1)
if not objid:
continue
ds_pairs[ds.split('/')[0]]. \
append(hex(int(objid)))
for pool, objids in ds_pairs.items():
for objid in objids:
kstat_update(pool, objid)
zil_build_dict(pool)
else:
try:
pools = subprocess.check_output(['zpool', 'list', '-Hpo',\
'name']).decode('utf-8').split()
except subprocess.CalledProcessError as e:
print("Command: \"zpool list -Hpo name\" failed with error"\
"code: " + str(e.returncode))
sys.exit(1)
for pool in pools:
kstat_update(pool)
zil_build_dict(pool)
def calculate_diff():
global curr, diff
prev = copy.deepcopy(curr)
zil_process_kstat()
diff = copy.deepcopy(curr)
for pool in curr:
for objset in curr[pool]:
for key in curr[pool][objset]:
if not isinstance(diff[pool][objset][key], int):
continue
# If prev is NULL, this is the
# first time we are here
if not prev:
diff[pool][objset][key] = 0
else:
diff[pool][objset][key] \
= curr[pool][objset][key] \
- prev[pool][objset][key]
def zil_build_dict(pool = "GLOBAL"):
global kstat
for objset in kstat:
for key in kstat[objset]:
val = kstat[objset][key]
if pool not in curr:
curr[pool] = dict()
if objset not in curr[pool]:
curr[pool][objset] = dict()
curr[pool][objset][key] = val
def zil_extend_dict():
global diff
for pool in diff:
for objset in diff[pool]:
diff[pool][objset]["pool"] = pool
diff[pool][objset]["objset"] = objset
diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \
time.localtime())
diff[pool][objset]["icc+inc"] = \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["icb+inb"] = \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["iic+idc"] = \
diff[pool][objset]["zil_itx_indirect_count"] + \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["iib+idb"] = \
diff[pool][objset]["zil_itx_indirect_bytes"] + \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["imnc+imsc"] = \
diff[pool][objset]["zil_itx_metaslab_normal_count"] + \
diff[pool][objset]["zil_itx_metaslab_slog_count"]
diff[pool][objset]["imnb+imsb"] = \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"]
diff[pool][objset]["imnw+imsw"] = \
diff[pool][objset]["zil_itx_metaslab_normal_write"] + \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
diff[pool][objset]["imna+imsa"] = \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
if diff[pool][objset]["imna+imsa"] > 0:
diff[pool][objset]["imb/ima"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imna+imsa"]
else:
diff[pool][objset]["imb/ima"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imna"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"]
else:
diff[pool][objset]["imnb/imna"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsa"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
else:
diff[pool][objset]["imsb/imsa"] = 100
if diff[pool][objset]["imnw+imsw"] > 0:
diff[pool][objset]["imb/imw"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imnw+imsw"]
else:
diff[pool][objset]["imb/imw"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imnw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_write"]
else:
diff[pool][objset]["imnb/imnw"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
else:
diff[pool][objset]["imsb/imsw"] = 100
def sign_handler_epipe(sig, frame):
print("Caught EPIPE signal: " + str(frame))
print("Exitting...")
sys.exit(0)
def main():
global interval
global curr, diff
hprint = False
init()
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, sign_handler_epipe)
zil_process_kstat()
if not curr:
print ("Error: No stats to show")
sys.exit(0)
print_header()
if interval > 0:
time.sleep(interval)
while True:
calculate_diff()
if not diff:
print ("Error: No stats to show")
sys.exit(0)
zil_extend_dict()
print_dict(diff)
time.sleep(interval)
else:
diff = curr
zil_extend_dict()
print_dict(diff)
if __name__ == '__main__':
main()