Merge branch 'kstat'

This branch updates several of the zfs kstats to take advantage
of the improved raw kstat functionality.  In addition, two new
kstats and a script called dbufstat.py are introduced.

Updated+New Kstats
* dbufs        - Stats for all dbufs in the dbuf_hash
* <pool>/txgs  - Stats for the last N txgs synced to disk
* <pool>/reads - Stats for the last N reads issued by the ARC
* <pool>/dmu_tx_assign - Histogram of tx assign times

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2013-10-25 15:22:34 -07:00
commit 8eaf9f3543
27 changed files with 1653 additions and 326 deletions

View File

@ -1,2 +1,2 @@
SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios
SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat
SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat

View File

@ -106,7 +106,7 @@ opfile = None
sep = " " # Default separator is 2 spaces
version = "0.4"
l2exist = False
cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval "
cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval "
"[count]]\n")
cur = {}
d = {}
@ -136,10 +136,10 @@ def usage():
sys.stderr.write("\t -s : Override default field separator with custom "
"character or string\n")
sys.stderr.write("\nExamples:\n")
sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n")
sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n")
sys.stderr.write("\tarcstat -v\n")
sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n")
sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n")
sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n")
sys.stderr.write("\tarcstat.py -v\n")
sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n")
sys.stderr.write("\n")
sys.exit(1)

2
cmd/dbufstat/Makefile.am Normal file
View File

@ -0,0 +1,2 @@
bin_SCRIPTS = dbufstat.py
EXTRA_DIST = $(bin_SCRIPTS)

535
cmd/dbufstat/dbufstat.py Executable file
View File

@ -0,0 +1,535 @@
#!/usr/bin/python
#
# Print out statistics for all cached dmu buffers. This information
# is available through the dbufs kstat and may be post-processed as
# needed by the script.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (C) 2013 Lawrence Livermore National Security, LLC.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
#
import sys
import getopt
import errno
bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
"meta", "state", "dbholds", "list", "atype", "index", "flags", "count",
"asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize"]
bincompat = ["cached", "direct", "indirect", "bonus", "spill"]
dhdr = ["pool", "objset", "object", "dtype", "cached"]
dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
"indirect", "bonus", "spill"]
dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
"list", "atype", "index", "flags", "count", "asize", "access", "mru",
"gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds"]
thdr = ["pool", "objset", "dtype", "cached"]
txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
"bonus", "spill"]
tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
"dbholds", "list", "atype", "index", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp",
"aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds",
"blocks", "dsize"]
cols = {
# hdr: [size, scale, description]
"pool": [15, -1, "pool name"],
"objset": [6, -1, "dataset identification number"],
"object": [10, -1, "object number"],
"level": [5, -1, "indirection level of buffer"],
"blkid": [8, -1, "block number of buffer"],
"offset": [12, 1024, "offset in object of buffer"],
"dbsize": [7, 1024, "size of buffer"],
"meta": [4, -1, "is this buffer metadata?"],
"state": [5, -1, "state of buffer (read, cached, etc)"],
"dbholds": [7, 1000, "number of holds on buffer"],
"list": [4, -1, "which ARC list contains this buffer"],
"atype": [7, -1, "ARC header type (data or metadata)"],
"index": [5, -1, "buffer's index into its ARC list"],
"flags": [8, -1, "ARC read flags"],
"count": [5, -1, "ARC data count"],
"asize": [7, 1024, "size of this ARC buffer"],
"access": [10, -1, "time this ARC buffer was last accessed"],
"mru": [5, 1000, "hits while on the ARC's MRU list"],
"gmru": [5, 1000, "hits while on the ARC's MRU ghost list"],
"mfu": [5, 1000, "hits while on the ARC's MFU list"],
"gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"],
"l2": [5, 1000, "hits while on the L2ARC"],
"l2_dattr": [8, -1, "L2ARC disk address/offset"],
"l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"],
"l2_comp": [21, -1, "L2ARC compression algorithm for buffer"],
"aholds": [6, 1000, "number of holds on this ARC buffer"],
"dtype": [27, -1, "dnode type"],
"btype": [27, -1, "bonus buffer type"],
"data_bs": [7, 1024, "data block size"],
"meta_bs": [7, 1024, "metadata block size"],
"bsize": [6, 1024, "bonus buffer size"],
"lvls": [6, -1, "number of indirection levels"],
"dholds": [6, 1000, "number of holds on dnode"],
"blocks": [8, 1000, "number of allocated blocks"],
"dsize": [12, 1024, "size of dnode"],
"cached": [6, 1024, "bytes cached for all blocks"],
"direct": [6, 1024, "bytes cached for direct blocks"],
"indirect": [8, 1024, "bytes cached for indirect blocks"],
"bonus": [5, 1024, "bytes cached for bonus buffer"],
"spill": [5, 1024, "bytes cached for spill block"],
}
hdr = None
xhdr = None
sep = " " # Default separator is 2 spaces
cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] "
"[-s string]\n")
raw = 0
def print_incompat_helper(incompat):
    """Write the field names in *incompat* to stderr as an indented,
    comma-separated list, wrapping onto a new line after several fields.
    """
    cnt = 0
    for key in sorted(incompat):
        # Was "cnt is 0": identity comparison of ints only works by
        # accident of CPython's small-integer caching; use equality.
        if cnt == 0:
            sys.stderr.write("\t")
        elif cnt > 8:
            sys.stderr.write(",\n\t")
            cnt = 0
        else:
            sys.stderr.write(", ")

        sys.stderr.write("%s" % key)
        cnt += 1

    sys.stderr.write("\n\n")
def detailed_usage():
    """Print the usage banner, the per-mode incompatible field lists and
    the full field glossary to stderr, then exit with status 1.
    """
    err = sys.stderr.write
    err("%s\n" % cmd)

    err("Field definitions incompatible with '-b' option:\n")
    print_incompat_helper(bincompat)

    err("Field definitions incompatible with '-d' option:\n")
    print_incompat_helper(dincompat)

    err("Field definitions incompatible with '-t' option:\n")
    print_incompat_helper(tincompat)

    err("Field definitions are as follows:\n")
    for field in sorted(cols):
        err("%11s : %s\n" % (field, cols[field][2]))
    err("\n")

    sys.exit(1)
def usage():
    """Print the short usage summary and option descriptions to stderr,
    then exit with status 1.
    """
    sys.stderr.write("%s\n" % cmd)
    for text in (
        "\t -b : Print table of information for each dbuf\n",
        "\t -d : Print table of information for each dnode\n",
        "\t -h : Print this help message\n",
        "\t -r : Print raw values\n",
        "\t -t : Print table of information for each dnode type\n",
        "\t -v : List all possible field headers and definitions"
        "\n",
        "\t -x : Print extended stats\n",
        "\t -i : Redirect input from the specified file\n",
        "\t -f : Specify specific fields to print (see -v)\n",
        "\t -o : Redirect output to the specified file\n",
        "\t -s : Override default field separator with custom "
        "character or string\n",
        "\nExamples:\n",
        "\tdbufstat.py -d -o /tmp/d.log\n",
        "\tdbufstat.py -t -s \",\" -o /tmp/t.log\n",
        "\tdbufstat.py -v\n",
        "\tdbufstat.py -d -f pool,object,objset,dsize,cached\n",
        "\n",
    ):
        sys.stderr.write(text)
    sys.exit(1)
def prettynum(sz, scale, num=0):
    """Return *num* right-justified in a *sz*-character field, scaled down
    by *scale* with a K/M/G/... suffix.  A scale of -1, or raw mode, prints
    the value verbatim.
    """
    global raw

    suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
    idx = 0
    prev = 0

    # Raw mode and unscalable columns are printed as-is.
    if raw or scale == -1:
        return "%*s" % (sz, num)

    # A fractional value here is a rounding artifact; treat it as zero.
    if 0 < num < 1:
        num = 0

    # Repeatedly divide down, remembering the previous magnitude so the
    # precision of the final format can be chosen below.
    while num > scale and idx < 5:
        prev = num
        num = num / scale
        idx += 1

    if idx == 0:
        return "%*d" % (sz, num)

    if (prev / scale) < 10:
        return "%*.1f%s" % (sz - 1, num, suffix[idx])
    return "%*d%s" % (sz - 1, num, suffix[idx])
def print_values(v):
    """Write one data row for dict *v*, formatting each active header
    column with its configured width/scale and the global separator.
    """
    global hdr
    global sep

    try:
        for col in hdr:
            width, scale = cols[col][0], cols[col][1]
            sys.stdout.write(prettynum(width, scale, v[col]) + sep)
        sys.stdout.write("\n")
    except IOError as e:
        # A broken pipe (e.g. output piped into head) is a normal stop.
        if e.errno == errno.EPIPE:
            sys.exit(1)
def print_header():
    """Write the column-name header row, right-justifying each name in
    its configured field width.
    """
    global hdr
    global sep

    try:
        for col in hdr:
            width = cols[col][0]
            sys.stdout.write("%*s%s" % (width, col, sep))
        sys.stdout.write("\n")
    except IOError as e:
        # Exit quietly when the reader closes the output pipe.
        if e.errno == errno.EPIPE:
            sys.exit(1)
def get_typestring(t):
    """Map the DMU object-type number *t* to its DMU_OT_* symbolic name.

    In "-rr" mode (raw > 1), or for values outside the known table, the
    numeric value is returned as a string instead.
    """
    type_strings = ["DMU_OT_NONE",
                    # general:
                    "DMU_OT_OBJECT_DIRECTORY", "DMU_OT_OBJECT_ARRAY",
                    "DMU_OT_PACKED_NVLIST", "DMU_OT_PACKED_NVLIST_SIZE",
                    "DMU_OT_BPOBJ", "DMU_OT_BPOBJ_HDR",
                    # spa:
                    "DMU_OT_SPACE_MAP_HEADER", "DMU_OT_SPACE_MAP",
                    # zil:
                    "DMU_OT_INTENT_LOG",
                    # dmu:
                    "DMU_OT_DNODE", "DMU_OT_OBJSET",
                    # dsl:
                    "DMU_OT_DSL_DIR", "DMU_OT_DSL_DIR_CHILD_MAP",
                    "DMU_OT_DSL_DS_SNAP_MAP", "DMU_OT_DSL_PROPS",
                    "DMU_OT_DSL_DATASET",
                    # zpl:
                    "DMU_OT_ZNODE", "DMU_OT_OLDACL", "DMU_OT_PLAIN_FILE_CONTENTS",
                    "DMU_OT_DIRECTORY_CONTENTS", "DMU_OT_MASTER_NODE",
                    "DMU_OT_UNLINKED_SET",
                    # zvol:
                    "DMU_OT_ZVOL", "DMU_OT_ZVOL_PROP",
                    # other; for testing only!
                    "DMU_OT_PLAIN_OTHER", "DMU_OT_UINT64_OTHER", "DMU_OT_ZAP_OTHER",
                    # new object types:
                    "DMU_OT_ERROR_LOG", "DMU_OT_SPA_HISTORY",
                    "DMU_OT_SPA_HISTORY_OFFSETS", "DMU_OT_POOL_PROPS",
                    "DMU_OT_DSL_PERMS", "DMU_OT_ACL", "DMU_OT_SYSACL",
                    "DMU_OT_FUID", "DMU_OT_FUID_SIZE", "DMU_OT_NEXT_CLONES",
                    "DMU_OT_SCAN_QUEUE", "DMU_OT_USERGROUP_USED",
                    "DMU_OT_USERGROUP_QUOTA", "DMU_OT_USERREFS", "DMU_OT_DDT_ZAP",
                    "DMU_OT_DDT_STATS", "DMU_OT_SA", "DMU_OT_SA_MASTER_NODE",
                    "DMU_OT_SA_ATTR_REGISTRATION", "DMU_OT_SA_ATTR_LAYOUTS",
                    "DMU_OT_SCAN_XLATE", "DMU_OT_DEDUP", "DMU_OT_DEADLIST",
                    "DMU_OT_DEADLIST_HDR", "DMU_OT_DSL_CLONES",
                    "DMU_OT_BPOBJ_SUBOBJ"]

    # If "-rr" option is used, don't convert to string representation
    if raw > 1:
        return "%i" % t

    # Explicit bounds check: catching only IndexError would let a
    # negative value silently index from the end of the table and
    # return the wrong name.
    if 0 <= t < len(type_strings):
        return type_strings[t]
    return "%i" % t
def get_compstring(c):
    """Map the compression-algorithm number *c* to its ZIO_COMPRESS_*
    symbolic name.

    In "-rr" mode (raw > 1), or for values outside the known table, the
    numeric value is returned as a string instead.
    """
    comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON",
                    "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB",
                    "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1",
                    "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3",
                    "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5",
                    "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7",
                    "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9",
                    "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4",
                    "ZIO_COMPRESS_FUNCTION"]

    # If "-rr" option is used, don't convert to string representation
    if raw > 1:
        return "%i" % c

    # Explicit bounds check: a negative value would otherwise silently
    # index from the end of the table instead of raising IndexError.
    if 0 <= c < len(comp_strings):
        return comp_strings[c]
    return "%i" % c
def parse_line(line, labels):
    """Convert one whitespace-split kstat row (*line*) into a dict keyed
    by the active header columns.

    *labels* maps a kstat column name to its field index within *line*.
    """
    global hdr

    new = dict()
    val = None
    for col in hdr:
        # These are "special" fields computed in the update_dict
        # function, prevent KeyError exception on labels[col] for these.
        if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']:
            val = line[labels[col]]

        # NOTE(review): for the special fields above, 'val' is left over
        # from the previous column and is stored as a placeholder here;
        # update_dict() overwrites those entries with real totals.
        if col in ['pool', 'flags']:
            new[col] = str(val)
        elif col in ['dtype', 'btype']:
            # Object/bonus types are rendered as DMU_OT_* names.
            new[col] = get_typestring(int(val))
        elif col in ['l2_comp']:
            # Compression is rendered as a ZIO_COMPRESS_* name.
            new[col] = get_compstring(int(val))
        else:
            new[col] = int(val)

    return new
def update_dict(d, k, line, labels):
    """Accumulate one dbuf row into the nested dict d[pool][objset][key]
    (key taken from kstat column *k*), updating the cached / bonus /
    spill / direct / indirect byte totals.  Returns *d*.
    """
    pool = line[labels['pool']]
    objset = line[labels['objset']]
    key = line[labels[k]]

    dbsize = int(line[labels['dbsize']])
    blkid = int(line[labels['blkid']])
    level = int(line[labels['level']])

    entry = d.setdefault(pool, dict()).setdefault(objset, dict())
    if key not in entry:
        # First row for this key: seed it with the parsed columns and
        # zeroed aggregate counters.
        rec = parse_line(line, labels)
        for field in ('bonus', 'cached', 'direct', 'indirect', 'spill'):
            rec[field] = 0
        entry[key] = rec
    else:
        rec = entry[key]

    rec['cached'] += dbsize
    if blkid == -1:
        # The bonus buffer is reported with a block id of -1.
        rec['bonus'] += dbsize
    elif blkid == -2:
        # The spill block is reported with a block id of -2.
        rec['spill'] += dbsize
    elif level == 0:
        rec['direct'] += dbsize
    else:
        rec['indirect'] += dbsize

    return d
def print_dict(d):
    """Print the header row followed by one row per accumulated entry in
    the nested dict built by update_dict().
    """
    print_header()
    # Iterate the dicts directly rather than via the Python 2-only
    # iterkeys()/itervalues() methods so the script also runs under
    # Python 3; output order and content are unchanged.
    for pool in d:
        for objset in d[pool]:
            for v in d[pool][objset].values():
                print_values(v)
def _read_labels(filehandle):
    """Skip the two informational kstat header lines and return a dict
    mapping column label -> field index parsed from the third line.
    """
    for i in range(0, 2):
        next(filehandle)

    labels = dict()
    for i, v in enumerate(next(filehandle).split()):
        labels[v] = i
    return labels


def _build_dict(filehandle, key):
    """Aggregate every remaining buffer line in *filehandle* into a
    nested dict keyed on the kstat column named *key*.  Shared by the
    -d and -t modes, which previously duplicated this logic verbatim.
    """
    labels = _read_labels(filehandle)

    out = dict()
    for line in filehandle:
        update_dict(out, key, line.split(), labels)
    return out


def dnodes_build_dict(filehandle):
    """Aggregate dbuf statistics per dnode (object number)."""
    return _build_dict(filehandle, 'object')


def types_build_dict(filehandle):
    """Aggregate dbuf statistics per dnode type."""
    return _build_dict(filehandle, 'dtype')
def buffers_print_all(filehandle):
    """Print one formatted row for every dbuf line in *filehandle*."""
    # The first three lines are kstat header information; only the third
    # one matters — it carries the column labels and their positions.
    next(filehandle)
    next(filehandle)
    labels = {name: idx
              for idx, name in enumerate(next(filehandle).split())}

    print_header()

    # Every remaining line describes a single dbuf.
    for row in filehandle:
        print_values(parse_line(row.split(), labels))
def main():
    """Parse the command line, select the report mode and column set,
    wire up input/output redirection, and emit the requested report.
    """
    global hdr
    global sep
    global raw

    desired_cols = None
    bflag = False
    dflag = False
    hflag = False
    ifile = None
    ofile = None
    tflag = False
    vflag = False
    xflag = False

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "bdf:hi:o:rs:tvx",
            [
                "buffers",
                "dnodes",
                # Long options that take an argument need a trailing '='
                # for getopt; without it --columns/--infile/--outfile/
                # --seperator were rejected even though the short forms
                # worked.
                "columns=",
                "help",
                "infile=",
                "outfile=",
                # "seperator" is misspelled but kept as the public
                # option name for backward compatibility.
                "seperator=",
                "types",
                "verbose",
                "extended"
            ]
        )
    except getopt.error:
        usage()

    for opt, arg in opts:
        if opt in ('-b', '--buffers'):
            bflag = True
        if opt in ('-d', '--dnodes'):
            dflag = True
        if opt in ('-f', '--columns'):
            desired_cols = arg
        if opt in ('-h', '--help'):
            hflag = True
        if opt in ('-i', '--infile'):
            ifile = arg
        if opt in ('-o', '--outfile'):
            ofile = arg
        if opt in ('-r', '--raw'):
            raw += 1
        if opt in ('-s', '--seperator'):
            sep = arg
        if opt in ('-t', '--types'):
            tflag = True
        if opt in ('-v', '--verbose'):
            vflag = True
        if opt in ('-x', '--extended'):
            xflag = True

    if hflag or (xflag and desired_cols):
        usage()

    if vflag:
        detailed_usage()

    # Ensure at most only one of b, d, or t flags are set
    if (bflag and dflag) or (bflag and tflag) or (dflag and tflag):
        usage()

    if bflag:
        hdr = bxhdr if xflag else bhdr
    elif tflag:
        hdr = txhdr if xflag else thdr
    else:  # Even if dflag is False, it's the default if none set
        dflag = True
        hdr = dxhdr if xflag else dhdr

    if desired_cols:
        hdr = desired_cols.split(",")

        invalid = []
        incompat = []
        for ele in hdr:
            if ele not in cols:
                invalid.append(ele)
            elif ((bflag and bincompat and ele in bincompat) or
                  (dflag and dincompat and ele in dincompat) or
                  (tflag and tincompat and ele in tincompat)):
                incompat.append(ele)

        if len(invalid) > 0:
            sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
            usage()

        if len(incompat) > 0:
            sys.stderr.write("Incompatible field specified! -- %s\n" %
                             incompat)
            usage()

    if ofile:
        try:
            tmp = open(ofile, "w")
            sys.stdout = tmp
        except IOError:
            # The original passed ofile as a second write() argument
            # instead of %-formatting it, raising TypeError here.
            sys.stderr.write("Cannot open %s for writing\n" % ofile)
            sys.exit(1)

    if not ifile:
        ifile = '/proc/spl/kstat/zfs/dbufs'

    # "-" means read from stdin; compare by value, not identity.
    if ifile != "-":
        try:
            tmp = open(ifile, "r")
            sys.stdin = tmp
        except IOError:
            sys.stderr.write("Cannot open %s for reading\n" % ifile)
            sys.exit(1)

    if bflag:
        buffers_print_all(sys.stdin)

    if dflag:
        print_dict(dnodes_build_dict(sys.stdin))

    if tflag:
        print_dict(types_build_dict(sys.stdin))


if __name__ == '__main__':
    main()

View File

@ -104,6 +104,7 @@ AC_CONFIG_FILES([
cmd/zvol_id/Makefile
cmd/vdev_id/Makefile
cmd/arcstat/Makefile
cmd/dbufstat/Makefile
module/Makefile
module/avl/Makefile
module/nvpair/Makefile

View File

@ -92,6 +92,36 @@ typedef enum arc_space_type {
ARC_SPACE_NUMTYPES
} arc_space_type_t;
typedef enum arc_state_type {
ARC_STATE_ANON,
ARC_STATE_MRU,
ARC_STATE_MRU_GHOST,
ARC_STATE_MFU,
ARC_STATE_MFU_GHOST,
ARC_STATE_L2C_ONLY,
ARC_STATE_NUMTYPES
} arc_state_type_t;
typedef struct arc_buf_info {
arc_state_type_t abi_state_type;
arc_buf_contents_t abi_state_contents;
uint64_t abi_state_index;
uint32_t abi_flags;
uint32_t abi_datacnt;
uint64_t abi_size;
uint64_t abi_spa;
uint64_t abi_access;
uint32_t abi_mru_hits;
uint32_t abi_mru_ghost_hits;
uint32_t abi_mfu_hits;
uint32_t abi_mfu_ghost_hits;
uint32_t abi_l2arc_hits;
uint32_t abi_holds;
uint64_t abi_l2arc_dattr;
uint64_t abi_l2arc_asize;
enum zio_compress abi_l2arc_compress;
} arc_buf_info_t;
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
@ -101,6 +131,7 @@ void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);

View File

@ -282,6 +282,9 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
void dbuf_stats_init(dbuf_hash_table_t *hash);
void dbuf_stats_destroy(void);
#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode)
#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock)
#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))

View File

@ -660,6 +660,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
* If doi is NULL, just indicates whether the object exists.
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,

View File

@ -71,13 +71,6 @@ typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
} zfs_all_blkstats_t;
typedef struct txg_history {
kstat_txg_t th_kstat;
vdev_stat_t th_vs1;
vdev_stat_t th_vs2;
kmutex_t th_lock;
list_node_t th_link;
} txg_history_t;
typedef struct dsl_pool {
/* Immutable */
@ -89,8 +82,6 @@ typedef struct dsl_pool {
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_iput_taskq;
kstat_t *dp_txg_kstat;
kstat_t *dp_tx_assign_kstat;
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
@ -111,11 +102,6 @@ typedef struct dsl_pool {
uint64_t dp_mos_used_delta;
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;
uint64_t dp_txg_history_size;
list_t dp_txg_history;
uint64_t dp_tx_assign_size;
kstat_named_t *dp_tx_assign_buckets;
/* Has its own locking */
tx_state_t dp_tx;
@ -171,12 +157,6 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
void dsl_pool_rele(dsl_pool_t *dp, void *tag);
void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs);
txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg);
txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg);
void dsl_pool_txg_history_put(txg_history_t *th);
#ifdef __cplusplus
}
#endif

View File

@ -51,6 +51,8 @@ typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
typedef struct zbookmark zbookmark_t;
struct dsl_pool;
struct dsl_dataset;
@ -534,6 +536,41 @@ extern boolean_t spa_refcount_zero(spa_t *spa);
#define SCL_ALL ((1 << SCL_LOCKS) - 1)
#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO)
/* Historical pool statistics */
typedef struct spa_stats_history {
kmutex_t lock;
uint64_t count;
uint64_t size;
kstat_t *kstat;
void *private;
list_t list;
} spa_stats_history_t;
typedef struct spa_stats {
spa_stats_history_t read_history;
spa_stats_history_t txg_history;
spa_stats_history_t tx_assign_histogram;
} spa_stats_t;
typedef enum txg_state {
TXG_STATE_BIRTH = 0,
TXG_STATE_OPEN = 1,
TXG_STATE_QUIESCED = 2,
TXG_STATE_SYNCED = 3,
TXG_STATE_COMMITTED = 4,
} txg_state_t;
extern void spa_stats_init(spa_t *spa);
extern void spa_stats_destroy(spa_t *spa);
extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb,
uint32_t aflags);
extern void spa_txg_history_add(spa_t *spa, uint64_t txg);
extern int spa_txg_history_set(spa_t *spa, uint64_t txg,
txg_state_t completed_state, hrtime_t completed_time);
extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved);
extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);

View File

@ -236,6 +236,8 @@ struct spa {
uint64_t spa_deadman_calls; /* number of deadman calls */
uint64_t spa_sync_starttime; /* starting time fo spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
spa_stats_t spa_stats; /* assorted spa statistics */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.

View File

@ -210,6 +210,7 @@ typedef struct kthread {
} kthread_t;
#define curthread zk_thread_current()
#define getcomm() "unknown"
#define thread_exit zk_thread_exit
#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
zk_thread_create(stk, stksize, (thread_func_t)func, arg, \
@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int,
char *, char *, uchar_t, ulong_t, uchar_t);
extern void kstat_install(kstat_t *);
extern void kstat_delete(kstat_t *);
extern void kstat_set_raw_ops(kstat_t *ksp,
int (*headers)(char *buf, size_t size),
int (*data)(char *buf, size_t size, void *data),
void *(*addr)(kstat_t *ksp, loff_t index));
/*
* Kernel memory

View File

@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES];
* Therefore it must not change size or alignment between 32/64 bit
* compilation options.
*/
typedef struct zbookmark {
struct zbookmark {
uint64_t zb_objset;
uint64_t zb_object;
int64_t zb_level;
uint64_t zb_blkid;
} zbookmark_t;
char * zb_func;
};
#define SET_BOOKMARK(zb, objset, object, level, blkid) \
{ \
@ -269,6 +270,7 @@ typedef struct zbookmark {
(zb)->zb_object = object; \
(zb)->zb_level = level; \
(zb)->zb_blkid = blkid; \
(zb)->zb_func = FTAG; \
}
#define ZB_DESTROYED_OBJSET (-1ULL)

View File

@ -228,10 +228,8 @@ typedef struct kstat32 {
/* ks_ndata == 1 */
#define KSTAT_TYPE_TIMER 4 /* event timer */
/* ks_ndata >= 1 */
#define KSTAT_TYPE_TXG 5 /* txg statistics */
/* ks_ndata >= 0 */
#define KSTAT_NUM_TYPES 6
#define KSTAT_NUM_TYPES 5
/*
* kstat class
@ -700,29 +698,6 @@ typedef struct kstat_timer {
#define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data)
/*
* TXG statistics - bytes read/written and iops performed
*/
typedef enum kstat_txg_state {
TXG_STATE_OPEN = 1,
TXG_STATE_QUIESCING = 2,
TXG_STATE_SYNCING = 3,
TXG_STATE_COMMITTED = 4,
} kstat_txg_state_t;
typedef struct kstat_txg {
u_longlong_t txg; /* txg id */
kstat_txg_state_t state; /* txg state */
hrtime_t birth; /* birth time stamp */
u_longlong_t nread; /* number of bytes read */
u_longlong_t nwritten; /* number of bytes written */
uint_t reads; /* number of read operations */
uint_t writes; /* number of write operations */
hrtime_t open_time; /* open time */
hrtime_t quiesce_time; /* quiesce time */
hrtime_t sync_time; /* sync time */
} kstat_txg_t;
#if defined(_KERNEL)
#include <sys/t_lock.h>

View File

@ -25,6 +25,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/bpobj.c \
$(top_srcdir)/module/zfs/bptree.c \
$(top_srcdir)/module/zfs/dbuf.c \
$(top_srcdir)/module/zfs/dbuf_stats.c \
$(top_srcdir)/module/zfs/ddt.c \
$(top_srcdir)/module/zfs/ddt_zap.c \
$(top_srcdir)/module/zfs/dmu.c \
@ -62,6 +63,7 @@ libzpool_la_SOURCES = \
$(top_srcdir)/module/zfs/spa_errlog.c \
$(top_srcdir)/module/zfs/spa_history.c \
$(top_srcdir)/module/zfs/spa_misc.c \
$(top_srcdir)/module/zfs/spa_stats.c \
$(top_srcdir)/module/zfs/space_map.c \
$(top_srcdir)/module/zfs/txg.c \
$(top_srcdir)/module/zfs/uberblock.c \

View File

@ -240,6 +240,14 @@ void
kstat_delete(kstat_t *ksp)
{}
/*ARGSUSED*/
void
kstat_set_raw_ops(kstat_t *ksp,
int (*headers)(char *buf, size_t size),
int (*data)(char *buf, size_t size, void *data),
void *(*addr)(kstat_t *ksp, loff_t index))
{}
/*
* =========================================================================
* mutexes

View File

@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o
$(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
$(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
$(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
@ -44,6 +45,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o

View File

@ -232,6 +232,7 @@ typedef struct arc_state {
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
arc_state_type_t arcs_state;
} arc_state_t;
/* The 6 states: */
@ -534,6 +535,11 @@ struct arc_buf_hdr {
/* updated atomically */
clock_t b_arc_access;
uint32_t b_mru_hits;
uint32_t b_mru_ghost_hits;
uint32_t b_mfu_hits;
uint32_t b_mfu_ghost_hits;
uint32_t b_l2_hits;
/* self protecting */
refcount_t b_refcnt;
@ -709,7 +715,8 @@ struct l2arc_buf_hdr {
/* compression applied to buffer data */
enum zio_compress b_compress;
/* real alloc'd buffer size depending on b_compress applied */
int b_asize;
uint32_t b_asize;
uint32_t b_hits;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
};
@ -1137,6 +1144,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
return (cnt);
}
/*
* Returns detailed information about a specific arc buffer. When the
* state_index argument is set the function will calculate the arc header
* list position for its arc state. Since this requires a linear traversal
* callers are strongly encourage not to do this. However, it can be helpful
* for targeted analysis so the functionality is provided.
*/
void
arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
{
arc_buf_hdr_t *hdr = ab->b_hdr;
arc_state_t *state = hdr->b_state;
memset(abi, 0, sizeof(arc_buf_info_t));
abi->abi_flags = hdr->b_flags;
abi->abi_datacnt = hdr->b_datacnt;
abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
abi->abi_state_contents = hdr->b_type;
abi->abi_state_index = -1;
abi->abi_size = hdr->b_size;
abi->abi_access = hdr->b_arc_access;
abi->abi_mru_hits = hdr->b_mru_hits;
abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
abi->abi_mfu_hits = hdr->b_mfu_hits;
abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
abi->abi_holds = refcount_count(&hdr->b_refcnt);
if (hdr->b_l2hdr) {
abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
}
if (state && state_index && list_link_active(&hdr->b_arc_node)) {
list_t *list = &state->arcs_list[hdr->b_type];
arc_buf_hdr_t *h;
mutex_enter(&state->arcs_mtx);
for (h = list_head(list); h != NULL; h = list_next(list, h)) {
abi->abi_state_index++;
if (h == hdr)
break;
}
mutex_exit(&state->arcs_mtx);
}
}
/*
* Move the supplied buffer to the indicated state. The mutex
* for the buffer must be held by the caller.
@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
hdr->b_spa = spa_load_guid(spa);
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
hdr->b_mru_hits = 0;
hdr->b_mru_ghost_hits = 0;
hdr->b_mfu_hits = 0;
hdr->b_mfu_ghost_hits = 0;
hdr->b_l2_hits = 0;
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(list_link_active(&buf->b_arc_node));
} else {
buf->b_flags &= ~ARC_PREFETCH;
atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
}
buf->b_arc_access = now;
@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(arc_mfu, buf, hash_lock);
}
atomic_inc_32(&buf->b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
} else if (buf->b_state == arc_mru_ghost) {
arc_state_t *new_state;
@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
buf->b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, buf, hash_lock);
atomic_inc_32(&buf->b_mru_ghost_hits);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
} else if (buf->b_state == arc_mfu) {
/*
@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
ASSERT(refcount_count(&buf->b_refcnt) == 0);
ASSERT(list_link_active(&buf->b_arc_node));
}
atomic_inc_32(&buf->b_mfu_hits);
ARCSTAT_BUMP(arcstat_mfu_hits);
buf->b_arc_access = ddi_get_lbolt();
} else if (buf->b_state == arc_mfu_ghost) {
@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
arc_change_state(new_state, buf, hash_lock);
atomic_inc_32(&buf->b_mfu_ghost_hits);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
} else if (buf->b_state == arc_l2c_only) {
/*
@ -2943,6 +3008,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
kmutex_t *hash_lock;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
int rc = 0;
top:
hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@ -2976,10 +3042,10 @@ top:
hdr->b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
return (0);
goto out;
}
mutex_exit(hash_lock);
return (0);
goto out;
}
ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@ -3133,6 +3199,7 @@ top:
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
atomic_inc_32(&hdr->b_l2hdr->b_hits);
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_PUSHPAGE);
@ -3174,12 +3241,12 @@ top:
if (*arc_flags & ARC_NOWAIT) {
zio_nowait(rzio);
return (0);
goto out;
}
ASSERT(*arc_flags & ARC_WAIT);
if (zio_wait(rzio) == 0)
return (0);
goto out;
/* l2arc read error; goto zio_read() */
} else {
@ -3203,13 +3270,18 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
if (*arc_flags & ARC_WAIT) {
rc = zio_wait(rzio);
goto out;
}
ASSERT(*arc_flags & ARC_NOWAIT);
zio_nowait(rzio);
}
return (0);
out:
spa_read_history_add(spa, zb, *arc_flags);
return (rc);
}
arc_prune_t *
@ -3463,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
nhdr->b_mru_hits = 0;
nhdr->b_mru_ghost_hits = 0;
nhdr->b_mfu_hits = 0;
nhdr->b_mfu_ghost_hits = 0;
nhdr->b_l2_hits = 0;
nhdr->b_flags = flags & ARC_L2_WRITING;
nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
@ -3479,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag)
if (hdr->b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
hdr->b_mru_hits = 0;
hdr->b_mru_ghost_hits = 0;
hdr->b_mfu_hits = 0;
hdr->b_mfu_ghost_hits = 0;
hdr->b_l2_hits = 0;
if (hash_lock)
mutex_exit(hash_lock);
@ -3896,6 +3978,13 @@ arc_init(void)
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
arc_anon->arcs_state = ARC_STATE_ANON;
arc_mru->arcs_state = ARC_STATE_MRU;
arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
arc_mfu->arcs_state = ARC_STATE_MFU;
arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
buf_init();
arc_thread_exit = 0;
@ -4779,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
l2hdr->b_compress = ZIO_COMPRESS_OFF;
l2hdr->b_asize = ab->b_size;
l2hdr->b_tmp_cdata = ab->b_buf->b_data;
l2hdr->b_hits = 0;
buf_sz = ab->b_size;
ab->b_l2hdr = l2hdr;
@ -5311,6 +5401,7 @@ l2arc_stop(void)
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

View File

@ -317,6 +317,8 @@ retry:
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
dbuf_stats_init(h);
}
void
@ -325,6 +327,8 @@ dbuf_fini(void)
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
dbuf_stats_destroy();
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
#if defined(_KERNEL) && defined(HAVE_SPL)

230
module/zfs/dbuf_stats.c Normal file
View File

@ -0,0 +1,230 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
/*
 * Passed to arc_buf_info() for each dbuf below; when non-zero it
 * presumably enables calculating the index of the arc header within
 * its state list (abi_state_index).  Disabled (0) by default.
 */
int zfs_dbuf_state_index = 0;
/*
 * ==========================================================================
 * Dbuf Hash Read Routines
 * ==========================================================================
 */

/* Private state for the "dbufs" raw kstat. */
typedef struct dbuf_stats_t {
	kmutex_t lock;			/* serializes kstat access (ks_lock) */
	kstat_t *kstat;			/* installed "dbufs" kstat */
	dbuf_hash_table_t *hash;	/* dbuf hash table being reported */
	int idx;			/* bucket selected by the addr callback */
} dbuf_stats_t;

static dbuf_stats_t dbuf_stats_hash_table;
/*
 * Raw kstat callback: format the column header lines for the "dbufs"
 * kstat into 'buf'.  Always returns 0 (truncated output is simply cut
 * short).
 */
static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
	/*
	 * Let snprintf() bound and NUL-terminate the output itself.  The
	 * original wrote buf[snprintf(buf, size - 1, ...)] = '\0', which
	 * stores one byte past the end of the buffer whenever the header
	 * is truncated, because snprintf() returns the untruncated length.
	 */
	(void) snprintf(buf, size,
	    "%-88s | %-124s | %s\n"
	    "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
	    "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
	    "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
	    "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
	    "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
	    "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
	    "atype", "index", "flags", "count", "asize", "access", "mru",
	    "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp",
	    "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize",
	    "lvls", "dholds", "blocks", "dsize");

	return (0);
}
/*
 * Format one line of "dbufs" kstat output describing 'db'.  Returns the
 * number of bytes written into 'buf' (never more than size - 1) so the
 * caller can advance its cursor.
 */
int
__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
{
	arc_buf_info_t abi = { 0 };
	dmu_object_info_t doi = { 0 };
	dnode_t *dn = DB_DNODE(db);
	int nwritten;

	if (db->db_buf)
		arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);

	if (dn)
		__dmu_object_info_from_dnode(dn, &doi);

	nwritten = snprintf(buf, size,
	    "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
	    "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
	    "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
	    "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
	    /* dmu_buf_impl_t */
	    spa_name(dn->dn_objset->os_spa),
	    (u_longlong_t)dmu_objset_id(db->db_objset),
	    (longlong_t)db->db.db_object,
	    (longlong_t)db->db_level,
	    (longlong_t)db->db_blkid,
	    (u_longlong_t)db->db.db_offset,
	    (u_longlong_t)db->db.db_size,
	    !!dbuf_is_metadata(db),
	    db->db_state,
	    (ulong_t)refcount_count(&db->db_holds),
	    /* arc_buf_info_t */
	    abi.abi_state_type,
	    abi.abi_state_contents,
	    (longlong_t)abi.abi_state_index,
	    abi.abi_flags,
	    (ulong_t)abi.abi_datacnt,
	    (u_longlong_t)abi.abi_size,
	    (u_longlong_t)abi.abi_access,
	    (ulong_t)abi.abi_mru_hits,
	    (ulong_t)abi.abi_mru_ghost_hits,
	    (ulong_t)abi.abi_mfu_hits,
	    (ulong_t)abi.abi_mfu_ghost_hits,
	    (ulong_t)abi.abi_l2arc_hits,
	    (u_longlong_t)abi.abi_l2arc_dattr,
	    (u_longlong_t)abi.abi_l2arc_asize,
	    abi.abi_l2arc_compress,
	    (ulong_t)abi.abi_holds,
	    /* dmu_object_info_t */
	    doi.doi_type,
	    doi.doi_bonus_type,
	    (ulong_t)doi.doi_data_block_size,
	    (ulong_t)doi.doi_metadata_block_size,
	    (u_longlong_t)doi.doi_bonus_size,
	    (ulong_t)doi.doi_indirection,
	    (ulong_t)refcount_count(&dn->dn_holds),
	    (u_longlong_t)doi.doi_fill_count,
	    (u_longlong_t)doi.doi_max_offset);

	/*
	 * snprintf() returns the length the line *would* have had, which
	 * can exceed 'size' on truncation.  Clamp the return value so the
	 * caller (dbuf_stats_hash_table_data()) never advances its buffer
	 * cursor past the bytes actually written.  The original returned
	 * the untruncated length and also wrote a NUL terminator one byte
	 * past the end of a full buffer.
	 */
	if (nwritten < 0)
		nwritten = 0;
	else if ((size_t)nwritten >= size)
		nwritten = (size > 0) ? (int)(size - 1) : 0;

	return (nwritten);
}
/*
 * Raw kstat data callback: emit one line per dbuf on hash chain
 * 'dsh->idx' (selected by the addr callback) into the scratch buffer.
 * Returns 0 on success or ENOMEM to request a larger buffer.
 */
static int
dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
{
	dbuf_stats_t *dsh = (dbuf_stats_t *)data;
	dbuf_hash_table_t *h = dsh->hash;
	dmu_buf_impl_t *db;
	int length, error = 0;

	ASSERT3S(dsh->idx, >=, 0);
	ASSERT3S(dsh->idx, <=, h->hash_table_mask);
	memset(buf, 0, size);

	mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
	for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
		/*
		 * Returning ENOMEM will cause the data and header functions
		 * to be called with larger scratch buffers.
		 *
		 * NOTE(review): this check assumes a single formatted line
		 * never exceeds 512 bytes; 'length' below is not otherwise
		 * bounds-checked against 'size' -- confirm.
		 */
		if (size < 512) {
			error = ENOMEM;
			break;
		}

		/*
		 * Drop the hash chain lock while formatting this dbuf so
		 * only db_mtx is held across the snprintf work; the chain
		 * lock is retaken before following db_hash_next.
		 */
		mutex_enter(&db->db_mtx);
		mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));

		length = __dbuf_stats_hash_table_data(buf, size, db);
		buf += length;
		size -= length;

		mutex_exit(&db->db_mtx);
		mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
	}
	mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));

	return (error);
}
/*
 * Raw kstat address callback: select hash bucket 'n' for the subsequent
 * data callback.  Returns NULL once every bucket has been visited, which
 * terminates the walk.  The kstat framework holds dsh->lock (ks_lock)
 * for the duration of the walk.
 */
static void *
dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
{
	dbuf_stats_t *dsh = ksp->ks_private;

	ASSERT(MUTEX_HELD(&dsh->lock));

	if (n <= dsh->hash->hash_table_mask) {
		dsh->idx = n;
		return (dsh);
	}

	return (NULL);
}
/*
 * Create and install the virtual raw kstat "zfs/dbufs" which walks every
 * chain of the dbuf hash table.  ks_ndata is set to UINT32_MAX because
 * the entry count is unknown up front; the walk ends when the addr
 * callback returns NULL instead.
 */
static void
dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
{
	dbuf_stats_t *dsh = &dbuf_stats_hash_table;
	kstat_t *ksp;

	mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
	dsh->hash = hash;

	ksp = kstat_create("zfs", 0, "dbufs", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	dsh->kstat = ksp;

	/* kstat_create() may fail; the stat is simply unavailable then. */
	if (ksp) {
		ksp->ks_lock = &dsh->lock;
		ksp->ks_ndata = UINT32_MAX;
		ksp->ks_private = dsh;
		kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
		    dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
		kstat_install(ksp);
	}
}
static void
dbuf_stats_hash_table_destroy(void)
{
dbuf_stats_t *dsh = &dbuf_stats_hash_table;
kstat_t *ksp;
ksp = dsh->kstat;
if (ksp)
kstat_delete(ksp);
mutex_destroy(&dsh->lock);
}
/*
 * Register all dbuf-related kstats; called from dbuf_init() once the
 * hash table has been constructed.
 */
void
dbuf_stats_init(dbuf_hash_table_t *hash)
{
	dbuf_stats_hash_table_init(hash);
}
/*
 * Unregister all dbuf-related kstats; called from dbuf_fini().
 */
void
dbuf_stats_destroy(void)
{
	dbuf_stats_hash_table_destroy();
}
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Expose zfs_dbuf_state_index as a writable module tunable. */
module_param(zfs_dbuf_state_index, int, 0644);
MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index");
#endif

View File

@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
}
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
dnode_phys_t *dnp;
dnode_phys_t *dnp = dn->dn_phys;
int i;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
dnp = dn->dn_phys;
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_fill_count = 0;
for (i = 0; i < dnp->dn_nblkptr; i++)
doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
}
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
__dmu_object_info_from_dnode(dn, doi);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);

View File

@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
SET_BOOKMARK(czb, td->td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
uint32_t flags = ARC_WAIT;
@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
err = arc_read(NULL, td->td_spa, rootbp,
arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
if (err != 0)
return (err);
@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td, TQ_NOQUEUE))
pd->pd_exited = B_TRUE;
SET_BOOKMARK(czb, td->td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(td, NULL, rootbp, czb);
mutex_enter(&pd->pd_mtx);

View File

@ -1077,7 +1077,7 @@ dmu_tx_unassign(dmu_tx_t *tx)
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
hrtime_t before, after;
hrtime_t before;
int err;
ASSERT(tx->tx_txg == 0);
@ -1100,10 +1100,7 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
txg_rele_to_quiesce(&tx->tx_txgh);
after = gethrtime();
dsl_pool_tx_assign_add_usecs(tx->tx_pool,
(after - before) / NSEC_PER_USEC);
spa_tx_assign_add_nsecs(tx->tx_pool->dp_spa, gethrtime() - before);
return (0);
}

View File

@ -48,7 +48,6 @@
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
int zfs_txg_history = 60; /* statistics for the last N txgs */
unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
unsigned long zfs_write_limit_max = 0; /* max data payload per txg */
@ -59,200 +58,6 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0;
static void
dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata)
{
kstat_named_t *ks;
char name[KSTAT_STRLEN];
int i, data_size = ndata * sizeof(kstat_named_t);
(void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s",
spa_name(dp->dp_spa));
dp->dp_tx_assign_size = ndata;
if (data_size)
dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP);
else
dp->dp_tx_assign_buckets = NULL;
for (i = 0; i < dp->dp_tx_assign_size; i++) {
ks = &dp->dp_tx_assign_buckets[i];
ks->data_type = KSTAT_DATA_UINT64;
ks->value.ui64 = 0;
(void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i);
}
dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc",
KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
if (dp->dp_tx_assign_kstat) {
dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets;
dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size;
dp->dp_tx_assign_kstat->ks_data_size = data_size;
kstat_install(dp->dp_tx_assign_kstat);
}
}
static void
dsl_pool_tx_assign_destroy(dsl_pool_t *dp)
{
if (dp->dp_tx_assign_buckets)
kmem_free(dp->dp_tx_assign_buckets,
dp->dp_tx_assign_size * sizeof(kstat_named_t));
if (dp->dp_tx_assign_kstat)
kstat_delete(dp->dp_tx_assign_kstat);
}
void
dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs)
{
uint64_t idx = 0;
while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1))
idx++;
atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64);
}
static int
dsl_pool_txg_history_update(kstat_t *ksp, int rw)
{
dsl_pool_t *dp = ksp->ks_private;
txg_history_t *th;
int i = 0;
if (rw == KSTAT_WRITE)
return (EACCES);
if (ksp->ks_data)
kmem_free(ksp->ks_data, ksp->ks_data_size);
mutex_enter(&dp->dp_lock);
ksp->ks_ndata = dp->dp_txg_history_size;
ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t);
if (ksp->ks_data_size > 0)
ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE);
/* Traversed oldest to youngest for the most readable kstat output */
for (th = list_tail(&dp->dp_txg_history); th != NULL;
th = list_prev(&dp->dp_txg_history, th)) {
mutex_enter(&th->th_lock);
ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size);
memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t));
i += sizeof(kstat_txg_t);
mutex_exit(&th->th_lock);
}
mutex_exit(&dp->dp_lock);
return (0);
}
static void
dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg)
{
char name[KSTAT_STRLEN];
list_create(&dp->dp_txg_history, sizeof (txg_history_t),
offsetof(txg_history_t, th_link));
dsl_pool_txg_history_add(dp, txg);
(void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa));
dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc",
KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL);
if (dp->dp_txg_kstat) {
dp->dp_txg_kstat->ks_data = NULL;
dp->dp_txg_kstat->ks_private = dp;
dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update;
kstat_install(dp->dp_txg_kstat);
}
}
static void
dsl_pool_txg_history_destroy(dsl_pool_t *dp)
{
txg_history_t *th;
if (dp->dp_txg_kstat) {
if (dp->dp_txg_kstat->ks_data)
kmem_free(dp->dp_txg_kstat->ks_data,
dp->dp_txg_kstat->ks_data_size);
kstat_delete(dp->dp_txg_kstat);
}
mutex_enter(&dp->dp_lock);
while ((th = list_remove_head(&dp->dp_txg_history))) {
dp->dp_txg_history_size--;
mutex_destroy(&th->th_lock);
kmem_free(th, sizeof(txg_history_t));
}
ASSERT3U(dp->dp_txg_history_size, ==, 0);
list_destroy(&dp->dp_txg_history);
mutex_exit(&dp->dp_lock);
}
txg_history_t *
dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
{
txg_history_t *th, *rm;
th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE);
mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
th->th_kstat.txg = txg;
th->th_kstat.state = TXG_STATE_OPEN;
th->th_kstat.birth = gethrtime();
mutex_enter(&dp->dp_lock);
list_insert_head(&dp->dp_txg_history, th);
dp->dp_txg_history_size++;
while (dp->dp_txg_history_size > zfs_txg_history) {
dp->dp_txg_history_size--;
rm = list_remove_tail(&dp->dp_txg_history);
mutex_destroy(&rm->th_lock);
kmem_free(rm, sizeof(txg_history_t));
}
mutex_exit(&dp->dp_lock);
return (th);
}
/*
* Traversed youngest to oldest because lookups are only done for open
* or syncing txgs which are guaranteed to be at the head of the list.
* The txg_history_t structure will be returned locked.
*/
txg_history_t *
dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
{
txg_history_t *th;
mutex_enter(&dp->dp_lock);
for (th = list_head(&dp->dp_txg_history); th != NULL;
th = list_next(&dp->dp_txg_history, th)) {
if (th->th_kstat.txg == txg) {
mutex_enter(&th->th_lock);
break;
}
}
mutex_exit(&dp->dp_lock);
return (th);
}
void
dsl_pool_txg_history_put(txg_history_t *th)
{
mutex_exit(&th->th_lock);
}
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@ -295,9 +100,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
1, 4, 0);
dsl_pool_txg_history_init(dp, txg);
dsl_pool_tx_assign_init(dp, 32);
return (dp);
}
@ -438,8 +240,6 @@ dsl_pool_close(dsl_pool_t *dp)
arc_flush(dp->dp_spa);
txg_fini(dp);
dsl_scan_fini(dp);
dsl_pool_tx_assign_destroy(dp);
dsl_pool_txg_history_destroy(dp);
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
taskq_destroy(dp->dp_iput_taskq);
@ -1241,9 +1041,6 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
module_param(zfs_txg_synctime_ms, int, 0644);
MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");
module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
module_param(zfs_write_limit_min, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");

View File

@ -48,6 +48,7 @@
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/kstat.h>
#include "zfs_prop.h"
#include "zfeature_common.h"
@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL;
*/
int zfs_deadman_enabled = 1;
/*
* ==========================================================================
* SPA config locking
@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
spa_stats_init(spa);
avl_add(&spa_namespace_avl, spa);
@ -580,6 +581,7 @@ spa_remove(spa_t *spa)
refcount_destroy(&spa->spa_refcount);
spa_stats_destroy(spa);
spa_config_lock_destroy(spa);
for (t = 0; t < TXG_SIZE; t++)

636
module/zfs/spa_stats.c Normal file
View File

@ -0,0 +1,636 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
/*
 * Number of reads (per spa_t) to retain in the "reads" kstat history;
 * 0 (the default) disables collection.
 */
int zfs_read_history = 0;

/*
 * When non-zero, ARC cache hits are also recorded in the read history;
 * disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Number of txgs (per spa_t) to retain in the "txgs" kstat history;
 * 0 (the default) disables collection.
 */
int zfs_txg_history = 0;
/*
 * ==========================================================================
 * SPA Read History Routines
 * ==========================================================================
 */

/*
 * Read statistics - Information exported regarding each arc_read call
 */
typedef struct spa_read_history {
	uint64_t uid;		/* unique identifier */
	hrtime_t start;		/* gethrtime() when the entry was recorded */
	uint64_t objset;	/* read from this objset */
	uint64_t object;	/* read of this object number */
	uint64_t level;		/* block's indirection level */
	uint64_t blkid;		/* read of this block id */
	char origin[24];	/* read originated from here (zb_func) */
	uint32_t aflags;	/* ARC flags (cached, prefetch, etc.) */
	pid_t pid;		/* PID of task doing read */
	char comm[16];		/* process name of task doing read */
	list_node_t srh_link;	/* linkage into the read_history list */
} spa_read_history_t;
/*
 * Raw kstat callback: format the "reads" kstat column headers into
 * 'buf'.  Always returns 0.
 */
static int
spa_read_history_headers(char *buf, size_t size)
{
	/*
	 * Let snprintf() bound and NUL-terminate the line itself.  The
	 * original wrote buf[snprintf(buf, size - 1, ...)] = '\0', which
	 * stores one byte past the end of the buffer whenever the line is
	 * truncated (snprintf() returns the untruncated length).
	 */
	(void) snprintf(buf, size, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
	    "level", "blkid", "aflags", "origin", "pid", "process");

	return (0);
}
/*
 * Raw kstat callback: format one spa_read_history_t entry into 'buf'.
 * Always returns 0.
 */
static int
spa_read_history_data(char *buf, size_t size, void *data)
{
	spa_read_history_t *srh = (spa_read_history_t *)data;

	/*
	 * snprintf() NUL-terminates within 'size'; the original manually
	 * wrote buf[len] = '\0' which lands past the end of the buffer on
	 * truncation.  srh->start is cast to match the %llu conversion.
	 */
	(void) snprintf(buf, size, "%-8llu %-16llu 0x%-6llx "
	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
	    (u_longlong_t)srh->uid, (u_longlong_t)srh->start,
	    (longlong_t)srh->objset, (longlong_t)srh->object,
	    (longlong_t)srh->level, (longlong_t)srh->blkid,
	    srh->aflags, srh->origin, srh->pid, srh->comm);

	return (0);
}
/*
 * Calculate the address for the next spa_stats_history_t entry.  The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 * Entries are walked oldest (list tail) to newest so the kstat output
 * reads in chronological order; ssh->private carries the cursor between
 * calls.
 */
static void *
spa_read_history_addr(kstat_t *ksp, loff_t n)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (n == 0)
		ssh->private = list_tail(&ssh->list);
	else if (ssh->private)
		ssh->private = list_prev(&ssh->list, ssh->private);

	return (ssh->private);
}
/*
 * When the kstat is written discard all spa_read_history_t entries.  The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.  On
 * read, ks_ndata/ks_data_size are refreshed to the current history size
 * so the raw-ops walk visits every retained entry.
 *
 * NOTE(review): unlike spa_txg_history_update() there is no
 * ASSERT(MUTEX_HELD(&ssh->lock)) here -- presumably the same locking
 * applies; confirm.
 */
static int
spa_read_history_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;

	if (rw == KSTAT_WRITE) {
		spa_read_history_t *srh;

		while ((srh = list_remove_head(&ssh->list))) {
			ssh->size--;
			kmem_free(srh, sizeof(spa_read_history_t));
		}

		ASSERT3U(ssh->size, ==, 0);
	}

	ksp->ks_ndata = ssh->size;
	ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t);

	return (0);
}
/*
 * Create and install the per-pool "zfs/<pool>/reads" raw kstat.  The
 * history list itself is populated later by spa_read_history_add().
 */
static void
spa_read_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ssh->list, sizeof (spa_read_history_t),
	    offsetof(spa_read_history_t, srh_link));

	ssh->count = 0;
	ssh->size = 0;
	ssh->private = NULL;

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
	name[KSTAT_STRLEN-1] = '\0';

	ksp = kstat_create(name, 0, "reads", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	/* kstat_create() may fail; history is simply not exported then. */
	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_update = spa_read_history_update;
		kstat_set_raw_ops(ksp, spa_read_history_headers,
		    spa_read_history_data, spa_read_history_addr);
		kstat_install(ksp);
	}
}
/*
 * Tear down the per-pool read history: the kstat is deleted first, then
 * any remaining entries are drained under ssh->lock before the list and
 * lock themselves are destroyed.
 */
static void
spa_read_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	spa_read_history_t *srh;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_enter(&ssh->lock);
	while ((srh = list_remove_head(&ssh->list))) {
		ssh->size--;
		kmem_free(srh, sizeof(spa_read_history_t));
	}

	ASSERT3U(ssh->size, ==, 0);
	list_destroy(&ssh->list);
	mutex_exit(&ssh->lock);

	mutex_destroy(&ssh->lock);
}
/*
 * Record an arc_read() in the pool's read history.  Cheap no-op when the
 * feature is disabled and the list is already empty; cache hits are
 * skipped unless zfs_read_history_hits is set.  The entry is allocated
 * and filled before taking ssh->lock, then the list is trimmed back to
 * zfs_read_history entries while the lock is held (this also drains the
 * list after the tunable is set back to 0).
 */
void
spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	spa_read_history_t *srh, *rm;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb, !=, NULL);

	if (zfs_read_history == 0 && ssh->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
		return;

	srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE);
	strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin));
	strlcpy(srh->comm, getcomm(), sizeof(srh->comm));
	srh->start = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level = zb->zb_level;
	srh->blkid = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid = getpid();

	mutex_enter(&ssh->lock);

	srh->uid = ssh->count++;
	list_insert_head(&ssh->list, srh);
	ssh->size++;

	while (ssh->size > zfs_read_history) {
		ssh->size--;
		rm = list_remove_tail(&ssh->list);
		kmem_free(rm, sizeof(spa_read_history_t));
	}

	mutex_exit(&ssh->lock);
}
/*
 * ==========================================================================
 * SPA TXG History Routines
 * ==========================================================================
 */

/*
 * Txg statistics - Information exported regarding each txg sync
 */
typedef struct spa_txg_history {
	uint64_t txg;		/* txg id */
	txg_state_t state;	/* active txg state */
	uint64_t nread;		/* number of bytes read */
	uint64_t nwritten;	/* number of bytes written */
	uint64_t reads;		/* number of read operations */
	uint64_t writes;	/* number of write operations */
	uint64_t nreserved;	/* number of bytes reserved */
	hrtime_t times[TXG_STATE_COMMITTED]; /* per-state completion times */
	list_node_t sth_link;	/* linkage into the txg_history list */
} spa_txg_history_t;
/*
 * Raw kstat callback: format the "txgs" kstat column headers into 'buf'.
 * Always returns 0.
 */
static int
spa_txg_history_headers(char *buf, size_t size)
{
	/*
	 * Let snprintf() bound and NUL-terminate the line itself.  The
	 * original wrote buf[snprintf(buf, size - 1, ...)] = '\0', which
	 * stores one byte past the end of the buffer whenever the line is
	 * truncated (snprintf() returns the untruncated length).
	 */
	(void) snprintf(buf, size, "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state",
	    "nreserved", "nread", "nwritten", "reads", "writes",
	    "otime", "qtime", "stime");

	return (0);
}
static int
spa_txg_history_data(char *buf, size_t size, void *data)
{
spa_txg_history_t *sth = (spa_txg_history_t *)data;
uint64_t open = 0, quiesce = 0, sync = 0;
char state;
switch (sth->state) {
case TXG_STATE_BIRTH: state = 'B'; break;
case TXG_STATE_OPEN: state = 'O'; break;
case TXG_STATE_QUIESCED: state = 'Q'; break;
case TXG_STATE_SYNCED: state = 'S'; break;
case TXG_STATE_COMMITTED: state = 'C'; break;
default: state = '?'; break;
}
if (sth->times[TXG_STATE_OPEN])
open = sth->times[TXG_STATE_OPEN] -
sth->times[TXG_STATE_BIRTH];
if (sth->times[TXG_STATE_QUIESCED])
quiesce = sth->times[TXG_STATE_QUIESCED] -
sth->times[TXG_STATE_OPEN];
if (sth->times[TXG_STATE_SYNCED])
sync = sth->times[TXG_STATE_SYNCED] -
sth->times[TXG_STATE_QUIESCED];
size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu "
"%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n",
(longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
(u_longlong_t)sth->nreserved,
(u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
(u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
(u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync);
buf[size] = '\0';
return (0);
}
/*
 * Calculate the address for the next spa_stats_history_t entry.  The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 * Entries are walked oldest (list tail) to newest; ssh->private carries
 * the cursor between calls.
 */
static void *
spa_txg_history_addr(kstat_t *ksp, loff_t n)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (n == 0)
		ssh->private = list_tail(&ssh->list);
	else if (ssh->private)
		ssh->private = list_prev(&ssh->list, ssh->private);

	return (ssh->private);
}
/*
 * When the kstat is written discard all spa_txg_history_t entries.  The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.  On
 * read, ks_ndata/ks_data_size are refreshed so the raw-ops walk visits
 * every retained entry.
 */
static int
spa_txg_history_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (rw == KSTAT_WRITE) {
		spa_txg_history_t *sth;

		while ((sth = list_remove_head(&ssh->list))) {
			ssh->size--;
			kmem_free(sth, sizeof(spa_txg_history_t));
		}

		ASSERT3U(ssh->size, ==, 0);
	}

	ksp->ks_ndata = ssh->size;
	ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t);

	return (0);
}
/*
 * Create and install the per-pool "zfs/<pool>/txgs" raw kstat.  The
 * history list itself is populated later by spa_txg_history_add().
 */
static void
spa_txg_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ssh->list, sizeof (spa_txg_history_t),
	    offsetof(spa_txg_history_t, sth_link));

	ssh->count = 0;
	ssh->size = 0;
	ssh->private = NULL;

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
	name[KSTAT_STRLEN-1] = '\0';

	ksp = kstat_create(name, 0, "txgs", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	/* kstat_create() may fail; history is simply not exported then. */
	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_update = spa_txg_history_update;
		kstat_set_raw_ops(ksp, spa_txg_history_headers,
		    spa_txg_history_data, spa_txg_history_addr);
		kstat_install(ksp);
	}
}
/*
 * Tear down the per-pool txg history: the kstat is deleted first, then
 * any remaining entries are drained under ssh->lock before the list and
 * lock themselves are destroyed.
 */
static void
spa_txg_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_enter(&ssh->lock);
	while ((sth = list_remove_head(&ssh->list))) {
		ssh->size--;
		kmem_free(sth, sizeof(spa_txg_history_t));
	}

	ASSERT3U(ssh->size, ==, 0);
	list_destroy(&ssh->list);
	mutex_exit(&ssh->lock);

	mutex_destroy(&ssh->lock);
}
/*
 * Add a new txg to historical record.  The entry is created in the
 * TXG_STATE_OPEN state with its birth time stamped; the list is trimmed
 * back to zfs_txg_history entries under ssh->lock (which also drains
 * the list once the tunable is set back to 0).
 */
void
spa_txg_history_add(spa_t *spa, uint64_t txg)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth, *rm;

	if (zfs_txg_history == 0 && ssh->size == 0)
		return;

	sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE);
	sth->txg = txg;
	sth->state = TXG_STATE_OPEN;
	sth->times[TXG_STATE_BIRTH] = gethrtime();

	mutex_enter(&ssh->lock);

	list_insert_head(&ssh->list, sth);
	ssh->size++;

	while (ssh->size > zfs_txg_history) {
		ssh->size--;
		rm = list_remove_tail(&ssh->list);
		kmem_free(rm, sizeof(spa_txg_history_t));
	}

	mutex_exit(&ssh->lock);
}
/*
 * Set txg state completion time and increment current state.  Returns
 * ENOENT when the txg has already aged out of (or was never added to)
 * the history, or 0 on success / when history is disabled.
 */
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
    hrtime_t completed_time)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	/*
	 * Walk newest to oldest; active txgs are expected near the head.
	 */
	mutex_enter(&ssh->lock);
	for (sth = list_head(&ssh->list); sth != NULL;
	    sth = list_next(&ssh->list, sth)) {
		if (sth->txg == txg) {
			sth->times[completed_state] = completed_time;
			sth->state++;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}
/*
 * Set txg IO stats.  Returns ENOENT when the txg has aged out of the
 * history, or 0 on success / when history is disabled.
 */
int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&ssh->lock);
	for (sth = list_head(&ssh->list); sth != NULL;
	    sth = list_next(&ssh->list, sth)) {
		if (sth->txg == txg) {
			sth->nread = nread;
			sth->nwritten = nwritten;
			sth->reads = reads;
			sth->writes = writes;
			sth->nreserved = nreserved;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}
/*
 * ==========================================================================
 * SPA TX Assign Histogram Routines
 * ==========================================================================
 */

/*
 * Tx statistics - Information exported regarding dmu_tx_assign time.
 */

/*
 * When the kstat is written zero all buckets.  When the kstat is read
 * count the number of trailing buckets set to zero and update ks_ndata
 * such that they are not output.
 */
static int
spa_tx_assign_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	int i;

	if (rw == KSTAT_WRITE) {
		for (i = 0; i < ssh->count; i++)
			((kstat_named_t *)ssh->private)[i].value.ui64 = 0;
	}

	/* Find the index just past the last non-zero bucket. */
	for (i = ssh->count; i > 0; i--)
		if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0)
			break;

	ksp->ks_ndata = i;
	ksp->ks_data_size = i * sizeof(kstat_named_t);

	return (0);
}
/*
 * Create and install the per-pool "zfs/<pool>/dmu_tx_assign" named
 * kstat: a histogram of dmu_tx_assign() latencies with power-of-two
 * nanosecond buckets, populated by spa_tx_assign_add_nsecs().
 */
static void
spa_tx_assign_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	char name[KSTAT_STRLEN];
	kstat_named_t *ks;
	kstat_t *ksp;
	int i;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);

	ssh->count = 42; /* power of two buckets for 1ns to 2,199s */
	ssh->size = ssh->count * sizeof(kstat_named_t);
	ssh->private = kmem_alloc(ssh->size, KM_SLEEP);

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
	name[KSTAT_STRLEN-1] = '\0';

	/* Label each bucket with its upper bound, e.g. "1024 ns". */
	for (i = 0; i < ssh->count; i++) {
		ks = &((kstat_named_t *)ssh->private)[i];
		ks->data_type = KSTAT_DATA_UINT64;
		ks->value.ui64 = 0;
		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
		    (u_longlong_t)1 << i);
	}

	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	/* kstat_create() may fail; the histogram is still maintained. */
	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = ssh->private;
		ksp->ks_ndata = ssh->count;
		ksp->ks_data_size = ssh->size;
		ksp->ks_private = spa;
		ksp->ks_update = spa_tx_assign_update;
		kstat_install(ksp);
	}
}
/*
 * Tear down the dmu_tx_assign kstat and release its bucket array and
 * lock.  Safe when kstat_create() failed (ssh->kstat == NULL).
 */
static void
spa_tx_assign_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;

	if (ssh->kstat != NULL)
		kstat_delete(ssh->kstat);

	kmem_free(ssh->private, ssh->size);
	mutex_destroy(&ssh->lock);
}
/*
 * Record one dmu_tx_assign() call that took 'nsecs' nanoseconds by
 * bumping the histogram bucket whose power-of-two upper bound is the
 * smallest one >= nsecs; times beyond the last bucket are clamped into
 * it.  Lockless: buckets are updated with a 64-bit atomic increment.
 *
 * Fixes: the loop bound previously compared idx against ssh->size - 1,
 * but ssh->size is the allocation size in BYTES (count *
 * sizeof (kstat_named_t)), not the bucket count, so a long enough
 * assign time walked idx past the 42-entry array and the atomic
 * increment corrupted adjacent memory.  The shift is also widened to
 * 1ULL to avoid 32-bit int overflow (undefined behavior) for
 * idx >= 31, matching the (u_longlong_t)1 << i used when the bucket
 * labels were generated.
 */
void
spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	uint64_t idx = 0;

	while (((1ULL << idx) < nsecs) && (idx < ssh->count - 1))
		idx++;

	atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64);
}
/*
 * Create all of this pool's statistics kstats.  Order here is mirrored
 * in reverse by spa_stats_destroy().
 */
void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
}
/*
 * Destroy this pool's statistics kstats, in the reverse order of their
 * creation in spa_stats_init().
 */
void
spa_stats_destroy(spa_t *spa)
{
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Expose the history depths and filtering as writable module tunables. */
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
#endif

View File

@ -27,11 +27,11 @@
#include <sys/zfs_context.h>
#include <sys/txg_impl.h>
#include <sys/dmu_impl.h>
#include <sys/spa_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
#include <sys/spa_impl.h>
/*
* ZFS Transaction Groups
@ -351,8 +351,6 @@ txg_rele_to_sync(txg_handle_t *th)
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
hrtime_t start;
txg_history_t *th;
tx_state_t *tx = &dp->dp_tx;
int g = txg & TXG_MASK;
int c;
@ -366,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
@ -373,20 +374,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
for (c = 0; c < max_ncpus; c++)
mutex_exit(&tx->tx_cpu[c].tc_lock);
/*
* Measure how long the txg was open and replace the kstat.
*/
th = dsl_pool_txg_history_get(dp, txg);
th->th_kstat.open_time = gethrtime() - th->th_kstat.birth;
th->th_kstat.state = TXG_STATE_QUIESCING;
dsl_pool_txg_history_put(th);
dsl_pool_txg_history_add(dp, tx->tx_open_txg);
/*
* Quiesce the transaction group by waiting for everyone to txg_exit().
*/
start = gethrtime();
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
mutex_enter(&tc->tc_lock);
@ -395,12 +385,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
mutex_exit(&tc->tc_lock);
}
/*
* Measure how long the txg took to quiesce.
*/
th = dsl_pool_txg_history_get(dp, txg);
th->th_kstat.quiesce_time = gethrtime() - start;
dsl_pool_txg_history_put(th);
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
}
static void
@ -472,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp)
spa_t *spa = dp->dp_spa;
tx_state_t *tx = &dp->dp_tx;
callb_cpr_t cpr;
vdev_stat_t *vs1, *vs2;
uint64_t start, delta;
#ifdef _KERNEL
@ -485,10 +471,11 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_enter(tx, &cpr);
vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
start = delta = 0;
for (;;) {
hrtime_t hrstart;
txg_history_t *th;
uint64_t timer, timeout;
uint64_t txg;
@ -522,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp)
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
}
if (tx->tx_exiting)
if (tx->tx_exiting) {
kmem_free(vs2, sizeof(vdev_stat_t));
kmem_free(vs1, sizeof(vdev_stat_t));
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
}
vdev_get_stats(spa->spa_root_vdev, vs1);
/*
* Consume the quiesced txg which has been handed off to
@ -535,17 +527,11 @@ txg_sync_thread(dsl_pool_t *dp)
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
th = dsl_pool_txg_history_get(dp, txg);
th->th_kstat.state = TXG_STATE_SYNCING;
vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
dsl_pool_txg_history_put(th);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
start = ddi_get_lbolt();
hrstart = gethrtime();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
@ -559,22 +545,15 @@ txg_sync_thread(dsl_pool_t *dp)
*/
txg_dispatch_callbacks(dp, txg);
/*
* Measure the txg sync time determine the amount of I/O done.
*/
th = dsl_pool_txg_history_get(dp, txg);
vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
th->th_kstat.sync_time = gethrtime() - hrstart;
th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
th->th_vs1.vs_bytes[ZIO_TYPE_READ];
th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
th->th_vs1.vs_ops[ZIO_TYPE_READ];
th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
th->th_kstat.state = TXG_STATE_COMMITTED;
dsl_pool_txg_history_put(th);
vdev_get_stats(spa->spa_root_vdev, vs2);
spa_txg_history_set_io(spa, txg,
vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
dp->dp_space_towrite[txg & TXG_MASK] +
dp->dp_tempreserved[txg & TXG_MASK] / 2);
spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
}
}