diff --git a/cmd/Makefile.am b/cmd/Makefile.am index bad1af63e..4c3e4bbb2 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,2 +1,2 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios -SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat +SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py index e01dd8b4c..f703e33cd 100755 --- a/cmd/arcstat/arcstat.py +++ b/cmd/arcstat/arcstat.py @@ -106,7 +106,7 @@ opfile = None sep = " " # Default separator is 2 spaces version = "0.4" l2exist = False -cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} @@ -136,10 +136,10 @@ def usage(): sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") sys.stderr.write("\nExamples:\n") - sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -v\n") - sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat.py -v\n") + sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am new file mode 100644 index 000000000..0548b240c --- /dev/null +++ b/cmd/dbufstat/Makefile.am @@ -0,0 +1,2 @@ +bin_SCRIPTS = dbufstat.py +EXTRA_DIST = $(bin_SCRIPTS) diff --git a/cmd/dbufstat/dbufstat.py b/cmd/dbufstat/dbufstat.py new file mode 100755 index 000000000..80569b1d4 --- /dev/null +++ b/cmd/dbufstat/dbufstat.py @@ -0,0 +1,535 @@ +#!/usr/bin/python +# +# Print out statistics for all cached dmu buffers. 
This information +# is available through the dbufs kstat and may be post-processed as +# needed by the script. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (C) 2013 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
+# + +import sys +import getopt +import errno + +bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] +bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", + "meta", "state", "dbholds", "list", "atype", "index", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize"] +bincompat = ["cached", "direct", "indirect", "bonus", "spill"] + +dhdr = ["pool", "objset", "object", "dtype", "cached"] +dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", + "indirect", "bonus", "spill"] +dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", + "list", "atype", "index", "flags", "count", "asize", "access", "mru", + "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds"] + +thdr = ["pool", "objset", "dtype", "cached"] +txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", + "bonus", "spill"] +tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", + "dbholds", "list", "atype", "index", "flags", "count", "asize", "access", + "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", + "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", + "blocks", "dsize"] + +cols = { + # hdr: [size, scale, description] + "pool": [15, -1, "pool name"], + "objset": [6, -1, "dataset identification number"], + "object": [10, -1, "object number"], + "level": [5, -1, "indirection level of buffer"], + "blkid": [8, -1, "block number of buffer"], + "offset": [12, 1024, "offset in object of buffer"], + "dbsize": [7, 1024, "size of buffer"], + "meta": [4, -1, "is this buffer metadata?"], + "state": [5, -1, "state of buffer (read, cached, etc)"], + "dbholds": [7, 1000, "number of holds on buffer"], + "list": [4, -1, "which 
ARC list contains this buffer"], + "atype": [7, -1, "ARC header type (data or metadata)"], + "index": [5, -1, "buffer's index into its ARC list"], + "flags": [8, -1, "ARC read flags"], + "count": [5, -1, "ARC data count"], + "asize": [7, 1024, "size of this ARC buffer"], + "access": [10, -1, "time this ARC buffer was last accessed"], + "mru": [5, 1000, "hits while on the ARC's MRU list"], + "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], + "mfu": [5, 1000, "hits while on the ARC's MFU list"], + "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], + "l2": [5, 1000, "hits while on the L2ARC"], + "l2_dattr": [8, -1, "L2ARC disk address/offset"], + "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], + "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], + "aholds": [6, 1000, "number of holds on this ARC buffer"], + "dtype": [27, -1, "dnode type"], + "btype": [27, -1, "bonus buffer type"], + "data_bs": [7, 1024, "data block size"], + "meta_bs": [7, 1024, "metadata block size"], + "bsize": [6, 1024, "bonus buffer size"], + "lvls": [6, -1, "number of indirection levels"], + "dholds": [6, 1000, "number of holds on dnode"], + "blocks": [8, 1000, "number of allocated blocks"], + "dsize": [12, 1024, "size of dnode"], + "cached": [6, 1024, "bytes cached for all blocks"], + "direct": [6, 1024, "bytes cached for direct blocks"], + "indirect": [8, 1024, "bytes cached for indirect blocks"], + "bonus": [5, 1024, "bytes cached for bonus buffer"], + "spill": [5, 1024, "bytes cached for spill block"], +} + +hdr = None +xhdr = None +sep = " " # Default separator is 2 spaces +cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] " + "[-s string]\n") +raw = 0 + +def print_incompat_helper(incompat): + cnt = 0 + for key in sorted(incompat): + if cnt == 0: # value comparison; 'is' on an int literal tests identity + sys.stderr.write("\t") + elif cnt > 8: + sys.stderr.write(",\n\t") + cnt = 0 + else: + sys.stderr.write(", ") + + sys.stderr.write("%s" % key) + cnt += 1 + + 
sys.stderr.write("\n\n") + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + + sys.stderr.write("Field definitions incompatible with '-b' option:\n") + print_incompat_helper(bincompat) + + sys.stderr.write("Field definitions incompatible with '-d' option:\n") + print_incompat_helper(dincompat) + + sys.stderr.write("Field definitions incompatible with '-t' option:\n") + print_incompat_helper(tincompat) + + sys.stderr.write("Field definitions are as follows:\n") + for key in sorted(cols.keys()): + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(1) + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -b : Print table of information for each dbuf\n") + sys.stderr.write("\t -d : Print table of information for each dnode\n") + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -r : Print raw values\n") + sys.stderr.write("\t -t : Print table of information for each dnode type\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -i : Redirect input from the specified file\n") + sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tdbufstat.py -d -o /tmp/d.log\n") + sys.stderr.write("\tdbufstat.py -t -s \",\" -o /tmp/t.log\n") + sys.stderr.write("\tdbufstat.py -v\n") + sys.stderr.write("\tdbufstat.py -d -f pool,object,objset,dsize,cached\n") + sys.stderr.write("\n") + + sys.exit(1) + +def prettynum(sz, scale, num=0): + global raw + + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + if raw or scale == -1: + return "%*s" % (sz, num) + + # Rounding error, return 0 + elif num > 0 and num < 1: + num = 
0 + + while num > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if (save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + +def print_values(v): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%s%s" % ( + prettynum(cols[col][0], cols[col][1], v[col]), sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + +def print_header(): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + +def get_typestring(t): + type_strings = ["DMU_OT_NONE", + # general: + "DMU_OT_OBJECT_DIRECTORY", "DMU_OT_OBJECT_ARRAY", + "DMU_OT_PACKED_NVLIST", "DMU_OT_PACKED_NVLIST_SIZE", + "DMU_OT_BPOBJ", "DMU_OT_BPOBJ_HDR", + # spa: + "DMU_OT_SPACE_MAP_HEADER", "DMU_OT_SPACE_MAP", + # zil: + "DMU_OT_INTENT_LOG", + # dmu: + "DMU_OT_DNODE", "DMU_OT_OBJSET", + # dsl: + "DMU_OT_DSL_DIR", "DMU_OT_DSL_DIR_CHILD_MAP", + "DMU_OT_DSL_DS_SNAP_MAP", "DMU_OT_DSL_PROPS", + "DMU_OT_DSL_DATASET", + # zpl: + "DMU_OT_ZNODE", "DMU_OT_OLDACL", "DMU_OT_PLAIN_FILE_CONTENTS", + "DMU_OT_DIRECTORY_CONTENTS", "DMU_OT_MASTER_NODE", + "DMU_OT_UNLINKED_SET", + # zvol: + "DMU_OT_ZVOL", "DMU_OT_ZVOL_PROP", + # other; for testing only! 
+ "DMU_OT_PLAIN_OTHER", "DMU_OT_UINT64_OTHER", "DMU_OT_ZAP_OTHER", + # new object types: + "DMU_OT_ERROR_LOG", "DMU_OT_SPA_HISTORY", + "DMU_OT_SPA_HISTORY_OFFSETS", "DMU_OT_POOL_PROPS", + "DMU_OT_DSL_PERMS", "DMU_OT_ACL", "DMU_OT_SYSACL", + "DMU_OT_FUID", "DMU_OT_FUID_SIZE", "DMU_OT_NEXT_CLONES", + "DMU_OT_SCAN_QUEUE", "DMU_OT_USERGROUP_USED", + "DMU_OT_USERGROUP_QUOTA", "DMU_OT_USERREFS", "DMU_OT_DDT_ZAP", + "DMU_OT_DDT_STATS", "DMU_OT_SA", "DMU_OT_SA_MASTER_NODE", + "DMU_OT_SA_ATTR_REGISTRATION", "DMU_OT_SA_ATTR_LAYOUTS", + "DMU_OT_SCAN_XLATE", "DMU_OT_DEDUP", "DMU_OT_DEADLIST", + "DMU_OT_DEADLIST_HDR", "DMU_OT_DSL_CLONES", + "DMU_OT_BPOBJ_SUBOBJ"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % t + + try: + return type_strings[t]; + except IndexError: + return "%i" % t + +def get_compstring(c): + comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", + "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", + "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", + "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", + "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", + "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", + "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", + "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", + "ZIO_COMPRESS_FUNCTION"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % c + + try: + return comp_strings[c]; + except IndexError: + return "%i" % c + +def parse_line(line, labels): + global hdr + + new = dict() + val = None + for col in hdr: + # These are "special" fields computed in the update_dict + # function, prevent KeyError exception on labels[col] for these. 
+ if col in ['bonus', 'cached', 'direct', 'indirect', 'spill']: + continue # no kstat column for these; never reuse a stale val + val = line[labels[col]] + if col in ['pool', 'flags']: + new[col] = str(val) + elif col in ['dtype', 'btype']: + new[col] = get_typestring(int(val)) + elif col in ['l2_comp']: + new[col] = get_compstring(int(val)) + else: + new[col] = int(val) + + return new + +def update_dict(d, k, line, labels): + pool = line[labels['pool']] + objset = line[labels['objset']] + key = line[labels[k]] + + dbsize = int(line[labels['dbsize']]) + blkid = int(line[labels['blkid']]) + level = int(line[labels['level']]) + + if pool not in d: + d[pool] = dict() + + if objset not in d[pool]: + d[pool][objset] = dict() + + if key not in d[pool][objset]: + d[pool][objset][key] = parse_line(line, labels) + d[pool][objset][key]['bonus'] = 0 + d[pool][objset][key]['cached'] = 0 + d[pool][objset][key]['direct'] = 0 + d[pool][objset][key]['indirect'] = 0 + d[pool][objset][key]['spill'] = 0 + + d[pool][objset][key]['cached'] += dbsize + + if blkid == -1: + d[pool][objset][key]['bonus'] += dbsize + elif blkid == -2: + d[pool][objset][key]['spill'] += dbsize + else: + if level == 0: + d[pool][objset][key]['direct'] += dbsize + else: + d[pool][objset][key]['indirect'] += dbsize + + return d + +def print_dict(d): + print_header() + for pool in d: # plain dict iteration works on Python 2 and 3 + for objset in d[pool]: + for v in d[pool][objset].values(): + print_values(v) + +def dnodes_build_dict(filehandle): + labels = dict() + dnodes = dict() + + # First 3 lines are header information, skip the first two + for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(dnodes, 'object', line.split(), labels) + + return dnodes + +def types_build_dict(filehandle): + labels = dict() + types = dict() + + # First 3 lines are header information, skip the first two + 
for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(types, 'dtype', line.split(), labels) + + return types + +def buffers_print_all(filehandle): + labels = dict() + + # First 3 lines are header information, skip the first two + for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + print_header() + + # The rest of the file is buffer information + for line in filehandle: + print_values(parse_line(line.split(), labels)) + +def main(): + global hdr + global sep + global raw + + desired_cols = None + bflag = False + dflag = False + hflag = False + ifile = None + ofile = None + tflag = False + vflag = False + xflag = False + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "bdf:hi:o:rs:tvx", + [ + "buffers", + "dnodes", + "columns=", # trailing '=': long option takes an argument + "help", + "infile=", + "outfile=", + "raw", "seperator=", + "types", + "verbose", + "extended" + ] + ) + except getopt.error: + usage() + + for opt, arg in opts: + if opt in ('-b', '--buffers'): + bflag = True + if opt in ('-d', '--dnodes'): + dflag = True + if opt in ('-f', '--columns'): + desired_cols = arg + if opt in ('-h', '--help'): + hflag = True + if opt in ('-i', '--infile'): + ifile = arg + if opt in ('-o', '--outfile'): + ofile = arg + if opt in ('-r', '--raw'): + raw += 1 + if opt in ('-s', '--seperator'): + sep = arg + if opt in ('-t', '--types'): + tflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-x', '--extended'): + xflag = True + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + # Ensure at most only one of b, d, or t flags are set + if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): + usage() + + if bflag: + hdr = bxhdr if xflag else 
bhdr + elif tflag: + hdr = txhdr if xflag else thdr + else: # Even if dflag is False, it's the default if none set + dflag = True + hdr = dxhdr if xflag else dhdr + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif ((bflag and bincompat and ele in bincompat) or + (dflag and dincompat and ele in dincompat) or + (tflag and tincompat and ele in tincompat)): + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! -- %s\n" % incompat) + usage() + + if ofile: + try: + tmp = open(ofile, "w") + sys.stdout = tmp + + except (IOError, OSError): + sys.stderr.write("Cannot open %s for writing\n" % ofile) + sys.exit(1) + + if not ifile: + ifile = '/proc/spl/kstat/zfs/dbufs' + + if ifile != "-": # compare by value, not object identity + try: + tmp = open(ifile, "r") + sys.stdin = tmp + except (IOError, OSError): + sys.stderr.write("Cannot open %s for reading\n" % ifile) + sys.exit(1) + + if bflag: + buffers_print_all(sys.stdin) + + if dflag: + print_dict(dnodes_build_dict(sys.stdin)) + + if tflag: + print_dict(types_build_dict(sys.stdin)) + +if __name__ == '__main__': + main() diff --git a/configure.ac b/configure.ac index 08b807e3f..7c4809426 100644 --- a/configure.ac +++ b/configure.ac @@ -104,6 +104,7 @@ AC_CONFIG_FILES([ cmd/zvol_id/Makefile cmd/vdev_id/Makefile cmd/arcstat/Makefile + cmd/dbufstat/Makefile module/Makefile module/avl/Makefile module/nvpair/Makefile diff --git a/include/sys/arc.h b/include/sys/arc.h index 8c10d947c..efafb551a 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -92,6 +92,36 @@ typedef enum arc_space_type { ARC_SPACE_NUMTYPES } arc_space_type_t; +typedef enum arc_state_type { + ARC_STATE_ANON, + ARC_STATE_MRU, + ARC_STATE_MRU_GHOST, + ARC_STATE_MFU, + ARC_STATE_MFU_GHOST, + ARC_STATE_L2C_ONLY, + ARC_STATE_NUMTYPES +} arc_state_type_t; + +typedef struct arc_buf_info { + 
arc_state_type_t abi_state_type; + arc_buf_contents_t abi_state_contents; + uint64_t abi_state_index; + uint32_t abi_flags; + uint32_t abi_datacnt; + uint64_t abi_size; + uint64_t abi_spa; + uint64_t abi_access; + uint32_t abi_mru_hits; + uint32_t abi_mru_ghost_hits; + uint32_t abi_mfu_hits; + uint32_t abi_mfu_ghost_hits; + uint32_t abi_l2arc_hits; + uint32_t abi_holds; + uint64_t abi_l2arc_dattr; + uint64_t abi_l2arc_asize; + enum zio_compress abi_l2arc_compress; +} arc_buf_info_t; + void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, @@ -101,6 +131,7 @@ void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 8cd1fde01..85e967d0f 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -282,6 +282,9 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); +void dbuf_stats_init(dbuf_hash_table_t *hash); +void dbuf_stats_destroy(void); + #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index b0db7604d..fd1460d5c 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -660,6 +660,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; * If doi is NULL, just indicates whether the object exists. 
*/ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 51b588e6a..25abd9c03 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -71,13 +71,6 @@ typedef struct zfs_all_blkstats { zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; } zfs_all_blkstats_t; -typedef struct txg_history { - kstat_txg_t th_kstat; - vdev_stat_t th_vs1; - vdev_stat_t th_vs2; - kmutex_t th_lock; - list_node_t th_link; -} txg_history_t; typedef struct dsl_pool { /* Immutable */ @@ -89,8 +82,6 @@ typedef struct dsl_pool { struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_iput_taskq; - kstat_t *dp_txg_kstat; - kstat_t *dp_tx_assign_kstat; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; @@ -111,11 +102,6 @@ typedef struct dsl_pool { uint64_t dp_mos_used_delta; uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; - uint64_t dp_txg_history_size; - list_t dp_txg_history; - uint64_t dp_tx_assign_size; - kstat_named_t *dp_tx_assign_buckets; - /* Has its own locking */ tx_state_t dp_tx; @@ -171,12 +157,6 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); void dsl_pool_rele(dsl_pool_t *dp, void *tag); -void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs); - -txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); -txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_txg_history_put(txg_history_t *th); - #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 
401ae8343..cb3ce11bc 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -51,6 +51,8 @@ typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark zbookmark_t; + struct dsl_pool; struct dsl_dataset; @@ -534,6 +536,41 @@ extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) +/* Historical pool statistics */ +typedef struct spa_stats_history { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *private; + list_t list; +} spa_stats_history_t; + +typedef struct spa_stats { + spa_stats_history_t read_history; + spa_stats_history_t txg_history; + spa_stats_history_t tx_assign_histogram; +} spa_stats_t; + +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_SYNCED = 3, + TXG_STATE_COMMITTED = 4, +} txg_state_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, + uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved); +extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); + /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 47dfe432e..107d14a76 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -236,6 +236,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ 
uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + spa_stats_t spa_stats; /* assorted spa statistics */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a126c058e..dfd11d9f1 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -210,6 +210,7 @@ typedef struct kthread { } kthread_t; #define curthread zk_thread_current() +#define getcomm() "unknown" #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ @@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int, char *, char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); +extern void kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)); /* * Kernel memory diff --git a/include/sys/zio.h b/include/sys/zio.h index 189966bef..f5a128e0b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES]; * Therefore it must not change size or alignment between 32/64 bit * compilation options. 
*/ -typedef struct zbookmark { +struct zbookmark { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; -} zbookmark_t; + char * zb_func; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -269,6 +270,7 @@ typedef struct zbookmark { (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ + (zb)->zb_func = FTAG; \ } #define ZB_DESTROYED_OBJSET (-1ULL) diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h index 6bd2ec878..fcd3ed98b 100644 --- a/lib/libspl/include/sys/kstat.h +++ b/lib/libspl/include/sys/kstat.h @@ -228,10 +228,8 @@ typedef struct kstat32 { /* ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer */ /* ks_ndata >= 1 */ -#define KSTAT_TYPE_TXG 5 /* txg statistics */ - /* ks_ndata >= 0 */ -#define KSTAT_NUM_TYPES 6 +#define KSTAT_NUM_TYPES 5 /* * kstat class @@ -700,29 +698,6 @@ typedef struct kstat_timer { #define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data) -/* - * TXG statistics - bytes read/written and iops performed - */ -typedef enum kstat_txg_state { - TXG_STATE_OPEN = 1, - TXG_STATE_QUIESCING = 2, - TXG_STATE_SYNCING = 3, - TXG_STATE_COMMITTED = 4, -} kstat_txg_state_t; - -typedef struct kstat_txg { - u_longlong_t txg; /* txg id */ - kstat_txg_state_t state; /* txg state */ - hrtime_t birth; /* birth time stamp */ - u_longlong_t nread; /* number of bytes read */ - u_longlong_t nwritten; /* number of bytes written */ - uint_t reads; /* number of read operations */ - uint_t writes; /* number of write operations */ - hrtime_t open_time; /* open time */ - hrtime_t quiesce_time; /* quiesce time */ - hrtime_t sync_time; /* sync time */ -} kstat_txg_t; - #if defined(_KERNEL) #include diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index da81659d2..e4189a378 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -25,6 +25,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/bpobj.c \ 
$(top_srcdir)/module/zfs/bptree.c \ $(top_srcdir)/module/zfs/dbuf.c \ + $(top_srcdir)/module/zfs/dbuf_stats.c \ $(top_srcdir)/module/zfs/ddt.c \ $(top_srcdir)/module/zfs/ddt_zap.c \ $(top_srcdir)/module/zfs/dmu.c \ @@ -62,6 +63,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/spa_errlog.c \ $(top_srcdir)/module/zfs/spa_history.c \ $(top_srcdir)/module/zfs/spa_misc.c \ + $(top_srcdir)/module/zfs/spa_stats.c \ $(top_srcdir)/module/zfs/space_map.c \ $(top_srcdir)/module/zfs/txg.c \ $(top_srcdir)/module/zfs/uberblock.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e4d645cf7..57a3739e2 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -240,6 +240,14 @@ void kstat_delete(kstat_t *ksp) {} +/*ARGSUSED*/ +void +kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{} + /* * ========================================================================= * mutexes diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index e71228454..5552436ad 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o @@ -44,6 +45,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o +$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o 
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1298c5b91..9098988fd 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -232,6 +232,7 @@ typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; + arc_state_type_t arcs_state; } arc_state_t; /* The 6 states: */ @@ -534,6 +535,11 @@ struct arc_buf_hdr { /* updated atomically */ clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; /* self protecting */ refcount_t b_refcnt; @@ -709,7 +715,8 @@ struct l2arc_buf_hdr { /* compression applied to buffer data */ enum zio_compress b_compress; /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; + uint32_t b_asize; + uint32_t b_hits; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; }; @@ -1137,6 +1144,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) return (cnt); } +/* + * Returns detailed information about a specific arc buffer. When the + * state_index argument is set the function will calculate the arc header + * list position for its arc state. Since this requires a linear traversal + * callers are strongly encourage not to do this. However, it can be helpful + * for targeted analysis so the functionality is provided. + */ +void +arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) +{ + arc_buf_hdr_t *hdr = ab->b_hdr; + arc_state_t *state = hdr->b_state; + + memset(abi, 0, sizeof(arc_buf_info_t)); + abi->abi_flags = hdr->b_flags; + abi->abi_datacnt = hdr->b_datacnt; + abi->abi_state_type = state ? 
state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = hdr->b_type; + abi->abi_state_index = -1; + abi->abi_size = hdr->b_size; + abi->abi_access = hdr->b_arc_access; + abi->abi_mru_hits = hdr->b_mru_hits; + abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&hdr->b_refcnt); + + if (hdr->b_l2hdr) { + abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; + abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; + abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; + abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; + } + + if (state && state_index && list_link_active(&hdr->b_arc_node)) { + list_t *list = &state->arcs_list[hdr->b_type]; + arc_buf_hdr_t *h; + + mutex_enter(&state->arcs_mtx); + for (h = list_head(list); h != NULL; h = list_next(list, h)) { + abi->abi_state_index++; + if (h == hdr) + break; + } + mutex_exit(&state->arcs_mtx); + } +} + /* * Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. 
@@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } else { buf->b_flags &= ~ARC_PREFETCH; + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } buf->b_arc_access = now; @@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (buf->b_state == arc_mru_ghost) { arc_state_t *new_state; @@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (buf->b_state == arc_mfu) { /* @@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); } + atomic_inc_32(&buf->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { @@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (buf->b_state == arc_l2c_only) { /* @@ -2943,6 +3008,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = 
spa_load_guid(spa); + int rc = 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2976,10 +3042,10 @@ top: hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); - return (0); + goto out; } mutex_exit(hash_lock); - return (0); + goto out; } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -3133,6 +3199,7 @@ top: DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); + atomic_inc_32(&hdr->b_l2hdr->b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_PUSHPAGE); @@ -3174,12 +3241,12 @@ top: if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); - return (0); + goto out; } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) - return (0); + goto out; /* l2arc read error; goto zio_read() */ } else { @@ -3203,13 +3270,18 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) - return (zio_wait(rzio)); + if (*arc_flags & ARC_WAIT) { + rc = zio_wait(rzio); + goto out; + } ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } - return (0); + +out: + spa_read_history_add(spa, zb, *arc_flags); + return (rc); } arc_prune_t * @@ -3463,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; + nhdr->b_mru_hits = 0; + nhdr->b_mru_ghost_hits = 0; + nhdr->b_mfu_hits = 0; + nhdr->b_mfu_ghost_hits = 0; + nhdr->b_l2_hits = 0; nhdr->b_flags = flags & ARC_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; @@ -3479,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; if (hash_lock) mutex_exit(hash_lock); @@ -3896,6 +3978,13 @@ arc_init(void) list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), 
offsetof(arc_buf_hdr_t, b_arc_node)); + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + buf_init(); arc_thread_exit = 0; @@ -4779,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_compress = ZIO_COMPRESS_OFF; l2hdr->b_asize = ab->b_size; l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_hits = 0; buf_sz = ab->b_size; ab->b_l2hdr = l2hdr; @@ -5311,6 +5401,7 @@ l2arc_stop(void) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d655d6621..44e9419b7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -317,6 +317,8 @@ retry: for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + dbuf_stats_init(h); } void @@ -325,6 +327,8 @@ dbuf_fini(void) dbuf_hash_table_t *h = &dbuf_hash_table; int i; + dbuf_stats_destroy(); + for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c new file mode 100644 index 000000000..ef760eaba --- /dev/null +++ b/module/zfs/dbuf_stats.c @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +/* + * Calculate the index of the arc header for the state, disabled by default. + */ +int zfs_dbuf_state_index = 0; + +/* + * ========================================================================== + * Dbuf Hash Read Routines + * ========================================================================== + */ +typedef struct dbuf_stats_t { + kmutex_t lock; + kstat_t *kstat; + dbuf_hash_table_t *hash; + int idx; +} dbuf_stats_t; + +static dbuf_stats_t dbuf_stats_hash_table; + +static int +dbuf_stats_hash_table_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, + "%-88s | %-124s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " + "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " + "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", + "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", + "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", + "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", + "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", + "dtype", "btype", "data_bs", "meta_bs", "bsize", + "lvls", "dholds", "blocks", "dsize"); + buf[size] = '\0'; + + return (0); +} + +int +__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) +{ + arc_buf_info_t abi = { 0 }; + dmu_object_info_t doi = { 0 }; + dnode_t *dn = DB_DNODE(db); + + if (db->db_buf) + arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); + + 
if (dn) + __dmu_object_info_from_dnode(dn, &doi); + + size = snprintf(buf, size - 1, + "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " + "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " + "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", + /* dmu_buf_impl_t */ + spa_name(dn->dn_objset->os_spa), + (u_longlong_t)dmu_objset_id(db->db_objset), + (longlong_t)db->db.db_object, + (longlong_t)db->db_level, + (longlong_t)db->db_blkid, + (u_longlong_t)db->db.db_offset, + (u_longlong_t)db->db.db_size, + !!dbuf_is_metadata(db), + db->db_state, + (ulong_t)refcount_count(&db->db_holds), + /* arc_buf_info_t */ + abi.abi_state_type, + abi.abi_state_contents, + (longlong_t)abi.abi_state_index, + abi.abi_flags, + (ulong_t)abi.abi_datacnt, + (u_longlong_t)abi.abi_size, + (u_longlong_t)abi.abi_access, + (ulong_t)abi.abi_mru_hits, + (ulong_t)abi.abi_mru_ghost_hits, + (ulong_t)abi.abi_mfu_hits, + (ulong_t)abi.abi_mfu_ghost_hits, + (ulong_t)abi.abi_l2arc_hits, + (u_longlong_t)abi.abi_l2arc_dattr, + (u_longlong_t)abi.abi_l2arc_asize, + abi.abi_l2arc_compress, + (ulong_t)abi.abi_holds, + /* dmu_object_info_t */ + doi.doi_type, + doi.doi_bonus_type, + (ulong_t)doi.doi_data_block_size, + (ulong_t)doi.doi_metadata_block_size, + (u_longlong_t)doi.doi_bonus_size, + (ulong_t)doi.doi_indirection, + (ulong_t)refcount_count(&dn->dn_holds), + (u_longlong_t)doi.doi_fill_count, + (u_longlong_t)doi.doi_max_offset); + buf[size] = '\0'; + + return (size); +} + +static int +dbuf_stats_hash_table_data(char *buf, size_t size, void *data) +{ + dbuf_stats_t *dsh = (dbuf_stats_t *)data; + dbuf_hash_table_t *h = dsh->hash; + dmu_buf_impl_t *db; + int length, error = 0; + + ASSERT3S(dsh->idx, >=, 0); + ASSERT3S(dsh->idx, <=, h->hash_table_mask); + memset(buf, 0, size); + + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { + /* + * Returning ENOMEM will 
cause the data and header functions + * to be called with larger scratch buffers. + */ + if (size < 512) { + error = ENOMEM; + break; + } + + mutex_enter(&db->db_mtx); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + length = __dbuf_stats_hash_table_data(buf, size, db); + buf += length; + size -= length; + + mutex_exit(&db->db_mtx); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + } + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + return (error); +} + +static void * +dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) +{ + dbuf_stats_t *dsh = ksp->ks_private; + + ASSERT(MUTEX_HELD(&dsh->lock)); + + if (n <= dsh->hash->hash_table_mask) { + dsh->idx = n; + return (dsh); + } + + return (NULL); +} + +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); + dsh->hash = hash; + + ksp = kstat_create("zfs", 0, "dbufs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + dsh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &dsh->lock; + ksp->ks_ndata = UINT32_MAX; + ksp->ks_private = dsh; + kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, + dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); + kstat_install(ksp); + } +} + +static void +dbuf_stats_hash_table_destroy(void) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + ksp = dsh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&dsh->lock); +} + +void +dbuf_stats_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_hash_table_init(hash); +} + +void +dbuf_stats_destroy(void) +{ + dbuf_stats_hash_table_destroy(); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_dbuf_state_index, int, 0644); +MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); +#endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cbf4790b1..4ec9cb46a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t 
object, boolean_t hole, uint64_t *off) } void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { - dnode_phys_t *dnp; + dnode_phys_t *dnp = dn->dn_phys; int i; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - dnp = dn->dn_phys; - doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; @@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 32b3e50fc..1827ffc74 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); + SET_BOOKMARK(czb, td->td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_WAIT; @@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); if (err != 0) return (err); @@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td, TQ_NOQUEUE)) pd->pd_exited = B_TRUE; - SET_BOOKMARK(czb, td->td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(td, NULL, rootbp, czb); mutex_enter(&pd->pd_mtx); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3e46a02f8..7b6223956 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1077,7 +1077,7 @@ dmu_tx_unassign(dmu_tx_t *tx) int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { - hrtime_t before, after; + hrtime_t before; int err; ASSERT(tx->tx_txg == 0); @@ -1100,10 +1100,7 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); - after = gethrtime(); - - dsl_pool_tx_assign_add_usecs(tx->tx_pool, - (after - before) / NSEC_PER_USEC); + spa_tx_assign_add_nsecs(tx->tx_pool->dp_spa, gethrtime() - before); return (0); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index da4a9df85..ffef9be75 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -48,7 +48,6 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ -int zfs_txg_history = 60; /* statistics for the last N txgs */ unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ @@ -59,200 +58,6 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static void 
-dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata) -{ - kstat_named_t *ks; - char name[KSTAT_STRLEN]; - int i, data_size = ndata * sizeof(kstat_named_t); - - (void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s", - spa_name(dp->dp_spa)); - - dp->dp_tx_assign_size = ndata; - - if (data_size) - dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP); - else - dp->dp_tx_assign_buckets = NULL; - - for (i = 0; i < dp->dp_tx_assign_size; i++) { - ks = &dp->dp_tx_assign_buckets[i]; - ks->data_type = KSTAT_DATA_UINT64; - ks->value.ui64 = 0; - (void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i); - } - - dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc", - KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); - - if (dp->dp_tx_assign_kstat) { - dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets; - dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size; - dp->dp_tx_assign_kstat->ks_data_size = data_size; - kstat_install(dp->dp_tx_assign_kstat); - } -} - -static void -dsl_pool_tx_assign_destroy(dsl_pool_t *dp) -{ - if (dp->dp_tx_assign_buckets) - kmem_free(dp->dp_tx_assign_buckets, - dp->dp_tx_assign_size * sizeof(kstat_named_t)); - - if (dp->dp_tx_assign_kstat) - kstat_delete(dp->dp_tx_assign_kstat); -} - -void -dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs) -{ - uint64_t idx = 0; - - while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1)) - idx++; - - atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64); -} - -static int -dsl_pool_txg_history_update(kstat_t *ksp, int rw) -{ - dsl_pool_t *dp = ksp->ks_private; - txg_history_t *th; - int i = 0; - - if (rw == KSTAT_WRITE) - return (EACCES); - - if (ksp->ks_data) - kmem_free(ksp->ks_data, ksp->ks_data_size); - - mutex_enter(&dp->dp_lock); - - ksp->ks_ndata = dp->dp_txg_history_size; - ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); - if (ksp->ks_data_size > 0) - ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); - - /* Traversed oldest to youngest 
for the most readable kstat output */ - for (th = list_tail(&dp->dp_txg_history); th != NULL; - th = list_prev(&dp->dp_txg_history, th)) { - mutex_enter(&th->th_lock); - ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); - memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); - i += sizeof(kstat_txg_t); - mutex_exit(&th->th_lock); - } - - mutex_exit(&dp->dp_lock); - - return (0); -} - -static void -dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) -{ - char name[KSTAT_STRLEN]; - - list_create(&dp->dp_txg_history, sizeof (txg_history_t), - offsetof(txg_history_t, th_link)); - dsl_pool_txg_history_add(dp, txg); - - (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); - dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", - KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); - if (dp->dp_txg_kstat) { - dp->dp_txg_kstat->ks_data = NULL; - dp->dp_txg_kstat->ks_private = dp; - dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; - kstat_install(dp->dp_txg_kstat); - } -} - -static void -dsl_pool_txg_history_destroy(dsl_pool_t *dp) -{ - txg_history_t *th; - - if (dp->dp_txg_kstat) { - if (dp->dp_txg_kstat->ks_data) - kmem_free(dp->dp_txg_kstat->ks_data, - dp->dp_txg_kstat->ks_data_size); - - kstat_delete(dp->dp_txg_kstat); - } - - mutex_enter(&dp->dp_lock); - while ((th = list_remove_head(&dp->dp_txg_history))) { - dp->dp_txg_history_size--; - mutex_destroy(&th->th_lock); - kmem_free(th, sizeof(txg_history_t)); - } - - ASSERT3U(dp->dp_txg_history_size, ==, 0); - list_destroy(&dp->dp_txg_history); - mutex_exit(&dp->dp_lock); -} - -txg_history_t * -dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) -{ - txg_history_t *th, *rm; - - th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE); - mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); - th->th_kstat.txg = txg; - th->th_kstat.state = TXG_STATE_OPEN; - th->th_kstat.birth = gethrtime(); - - mutex_enter(&dp->dp_lock); - - list_insert_head(&dp->dp_txg_history, th); - 
dp->dp_txg_history_size++; - - while (dp->dp_txg_history_size > zfs_txg_history) { - dp->dp_txg_history_size--; - rm = list_remove_tail(&dp->dp_txg_history); - mutex_destroy(&rm->th_lock); - kmem_free(rm, sizeof(txg_history_t)); - } - - mutex_exit(&dp->dp_lock); - - return (th); -} - -/* - * Traversed youngest to oldest because lookups are only done for open - * or syncing txgs which are guaranteed to be at the head of the list. - * The txg_history_t structure will be returned locked. - */ -txg_history_t * -dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) -{ - txg_history_t *th; - - mutex_enter(&dp->dp_lock); - for (th = list_head(&dp->dp_txg_history); th != NULL; - th = list_next(&dp->dp_txg_history, th)) { - if (th->th_kstat.txg == txg) { - mutex_enter(&th->th_lock); - break; - } - } - mutex_exit(&dp->dp_lock); - - return (th); -} - -void -dsl_pool_txg_history_put(txg_history_t *th) -{ - mutex_exit(&th->th_lock); -} - int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { @@ -295,9 +100,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri, 1, 4, 0); - dsl_pool_txg_history_init(dp, txg); - dsl_pool_tx_assign_init(dp, 32); - return (dp); } @@ -438,8 +240,6 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); - dsl_pool_tx_assign_destroy(dp); - dsl_pool_txg_history_destroy(dp); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); @@ -1241,9 +1041,6 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); module_param(zfs_txg_synctime_ms, int, 0644); MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); -module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); - module_param(zfs_write_limit_min, ulong, 0444); MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); diff --git 
a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ba6e8963..a0b4797c9 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "zfs_prop.h" #include "zfeature_common.h" @@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL; */ int zfs_deadman_enabled = 1; - /* * ========================================================================== * SPA config locking @@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); + spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); @@ -580,6 +581,7 @@ spa_remove(spa_t *spa) refcount_destroy(&spa->spa_refcount); + spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c new file mode 100644 index 000000000..789e8c3e6 --- /dev/null +++ b/module/zfs/spa_stats.c @@ -0,0 +1,636 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include + +/* + * Keeps stats on last N reads per spa_t, disabled by default. 
+ */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * Keeps stats on the last N txgs, disabled by default. + */ +int zfs_txg_history = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + uint64_t uid; /* unique identifier */ + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + list_node_t srh_link; +} spa_read_history_t; + +static int +spa_read_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_read_history_data(char *buf, size_t size, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->uid, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. 
+ */ +static void * +spa_read_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_read_history_t entries. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_read_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + if (rw == KSTAT_WRITE) { + spa_read_history_t *srh; + + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t); + + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_read_history_t), + offsetof(spa_read_history_t, srh_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "reads", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_read_history_update; + kstat_set_raw_ops(ksp, spa_read_history_headers, + spa_read_history_data, spa_read_history_addr); + kstat_install(ksp); + } +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh; + kstat_t *ksp; + + ksp = 
ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh, *rm; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && ssh->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + return; + + srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE); + strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin)); + strlcpy(srh->comm, getcomm(), sizeof(srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&ssh->lock); + + srh->uid = ssh->count++; + list_insert_head(&ssh->list, srh); + ssh->size++; + + while (ssh->size > zfs_read_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_read_history_t)); + } + + mutex_exit(&ssh->lock); +} + +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t nreserved; /* number of bytes reserved */ + hrtime_t times[TXG_STATE_COMMITTED]; /* 
completion times */ + list_node_t sth_link; +} spa_txg_history_t; + +static int +spa_txg_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state", + "nreserved", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "stime"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_txg_history_data(char *buf, size_t size, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_QUIESCED]; + + size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->nreserved, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. 
+ */ +static void * +spa_txg_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_txg_history_t entries. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_txg_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (rw == KSTAT_WRITE) { + spa_txg_history_t *sth; + + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t); + + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_txg_history_t), + offsetof(spa_txg_history_t, sth_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "txgs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_txg_history_update; + kstat_set_raw_ops(ksp, spa_txg_history_headers, + spa_txg_history_data, spa_txg_history_addr); + kstat_install(ksp); + } +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + kstat_t 
*ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +/* + * Add a new txg to historical record. + */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth, *rm; + + if (zfs_txg_history == 0 && ssh->size == 0) + return; + + sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = gethrtime(); + + mutex_enter(&ssh->lock); + + list_insert_head(&ssh->list, sth); + ssh->size++; + + while (ssh->size > zfs_txg_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_txg_history_t)); + } + + mutex_exit(&ssh->lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + +/* + * Set txg IO stats. 
+ */
+int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
+{
+	spa_stats_history_t *hist = &spa->spa_stats.txg_history;
+	spa_txg_history_t *entry;
+	int error = ENOENT;
+
+	/* Reports success without touching the list when history is off. */
+	if (zfs_txg_history == 0)
+		return (0);
+
+	mutex_enter(&hist->lock);
+
+	/* Walk from the head until the matching txg, or the end, is hit. */
+	entry = list_head(&hist->list);
+	while (entry != NULL && entry->txg != txg)
+		entry = list_next(&hist->list, entry);
+
+	/* ENOENT is returned when no entry carries this txg. */
+	if (entry != NULL) {
+		entry->nread = nread;
+		entry->nwritten = nwritten;
+		entry->reads = reads;
+		entry->writes = writes;
+		entry->nreserved = nreserved;
+		error = 0;
+	}
+
+	mutex_exit(&hist->lock);
+
+	return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written zero all buckets. When the kstat is read
+ * count the number of trailing buckets set to zero and update ks_ndata
+ * such that they are not output.
+ */ +static int +spa_tx_assign_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + int i; + + if (rw == KSTAT_WRITE) { + for (i = 0; i < ssh->count; i++) + ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; + } + + for (i = ssh->count; i > 0; i--) + if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) + break; + + ksp->ks_ndata = i; + ksp->ks_data_size = i * sizeof(kstat_named_t); + + return (0); +} + +static void +spa_tx_assign_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + char name[KSTAT_STRLEN]; + kstat_named_t *ks; + kstat_t *ksp; + int i; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + + ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ + ssh->size = ssh->count * sizeof(kstat_named_t); + ssh->private = kmem_alloc(ssh->size, KM_SLEEP); + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + for (i = 0; i < ssh->count; i++) { + ks = &((kstat_named_t *)ssh->private)[i]; + ks->data_type = KSTAT_DATA_UINT64; + ks->value.ui64 = 0; + (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", + (u_longlong_t)1 << i); + } + + ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", + KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = ssh->private; + ksp->ks_ndata = ssh->count; + ksp->ks_data_size = ssh->size; + ksp->ks_private = spa; + ksp->ks_update = spa_tx_assign_update; + kstat_install(ksp); + } +} + +static void +spa_tx_assign_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + kmem_free(ssh->private, ssh->size); + mutex_destroy(&ssh->lock); +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + uint64_t idx = 0; + + while (((1 
<< idx) < nsecs) && (idx < ssh->size - 1)) + idx++; + + atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); + spa_txg_history_init(spa); + spa_tx_assign_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_tx_assign_destroy(spa); + spa_txg_history_destroy(spa); + spa_read_history_destroy(spa); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); +#endif diff --git a/module/zfs/txg.c b/module/zfs/txg.c index b3e537f45..7a3da8647 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -27,11 +27,11 @@ #include #include #include +#include #include #include #include #include -#include /* * ZFS Transaction Groups @@ -351,8 +351,6 @@ txg_rele_to_sync(txg_handle_t *th) static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { - hrtime_t start; - txg_history_t *th; tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; @@ -366,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime()); + spa_txg_history_add(dp->dp_spa, tx->tx_open_txg); + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. @@ -373,20 +374,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) for (c = 0; c < max_ncpus; c++) mutex_exit(&tx->tx_cpu[c].tc_lock); - /* - * Measure how long the txg was open and replace the kstat. 
- */ - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; - th->th_kstat.state = TXG_STATE_QUIESCING; - dsl_pool_txg_history_put(th); - dsl_pool_txg_history_add(dp, tx->tx_open_txg); - /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ - start = gethrtime(); - for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); @@ -395,12 +385,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) mutex_exit(&tc->tc_lock); } - /* - * Measure how long the txg took to quiesce. - */ - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.quiesce_time = gethrtime() - start; - dsl_pool_txg_history_put(th); + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); } static void @@ -472,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp) spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + vdev_stat_t *vs1, *vs2; uint64_t start, delta; #ifdef _KERNEL @@ -485,10 +471,11 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_enter(tx, &cpr); + vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + start = delta = 0; for (;;) { - hrtime_t hrstart; - txg_history_t *th; uint64_t timer, timeout; uint64_t txg; @@ -522,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } - if (tx->tx_exiting) + if (tx->tx_exiting) { + kmem_free(vs2, sizeof(vdev_stat_t)); + kmem_free(vs1, sizeof(vdev_stat_t)); txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + } + + vdev_get_stats(spa->spa_root_vdev, vs1); /* * Consume the quiesced txg which has been handed off to @@ -535,17 +527,11 @@ txg_sync_thread(dsl_pool_t *dp) tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.state = TXG_STATE_SYNCING; - vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); - dsl_pool_txg_history_put(th); - dprintf("txg=%llu quiesce_txg=%llu 
sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); start = ddi_get_lbolt(); - hrstart = gethrtime(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; @@ -559,22 +545,15 @@ txg_sync_thread(dsl_pool_t *dp) */ txg_dispatch_callbacks(dp, txg); - /* - * Measure the txg sync time determine the amount of I/O done. - */ - th = dsl_pool_txg_history_get(dp, txg); - vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); - th->th_kstat.sync_time = gethrtime() - hrstart; - th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - - th->th_vs1.vs_bytes[ZIO_TYPE_READ]; - th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - - th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; - th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - - th->th_vs1.vs_ops[ZIO_TYPE_READ]; - th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - - th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; - th->th_kstat.state = TXG_STATE_COMMITTED; - dsl_pool_txg_history_put(th); + vdev_get_stats(spa->spa_root_vdev, vs2); + spa_txg_history_set_io(spa, txg, + vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], + vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], + vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], + vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], + dp->dp_space_towrite[txg & TXG_MASK] + + dp->dp_tempreserved[txg & TXG_MASK] / 2); + spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); } }