From bfcbec6f5d8aad60365eaeacff21df92c04c26df Mon Sep 17 00:00:00 2001 From: Robert Novak Date: Fri, 26 Jun 2020 15:09:20 -0700 Subject: [PATCH] Add block histogram to zdb The block histogram tracks the changes to psize, lsize and asize both in the count of the number of blocks (by blocksize) and the total length of all of the blocks for that blocksize. It also keeps a running total of the cumulative size of all of the blocks up to each size to help determine the size of caching SSDs to be added to zfs hardware deployments. The block histogram counts and lengths are summarized in bins which are powers of two. Even rows with counts of zero are printed. This change is accessed by specifying one of two options: zdb -bbb pool zdb -Pbbb pool The first version prints the table in fixed size columns. The second prints in "parseable" output that can be placed into a CSV file. Fixed Column, nicenum output sample: block psize lsize asize size Count Length Cum. Count Length Cum. Count Length Cum. 512: 3.50K 1.75M 1.75M 3.43K 1.71M 1.71M 3.41K 1.71M 1.71M 1K: 3.65K 3.67M 5.43M 3.43K 3.44M 5.15M 3.50K 3.51M 5.22M 2K: 3.45K 6.92M 12.3M 3.41K 6.83M 12.0M 3.59K 7.26M 12.5M 4K: 3.44K 13.8M 26.1M 3.43K 13.7M 25.7M 3.49K 14.1M 26.6M 8K: 3.42K 27.3M 53.5M 3.41K 27.3M 53.0M 3.44K 27.6M 54.2M 16K: 3.43K 54.9M 108M 3.50K 56.1M 109M 3.42K 54.7M 109M 32K: 3.44K 110M 219M 3.41K 109M 218M 3.43K 110M 219M 64K: 3.41K 218M 437M 3.41K 218M 437M 3.44K 221M 439M 128K: 3.41K 437M 874M 3.70K 474M 911M 3.41K 437M 876M 256K: 3.41K 874M 1.71G 3.41K 874M 1.74G 3.41K 874M 1.71G 512K: 3.41K 1.71G 3.41G 3.41K 1.71G 3.45G 3.41K 1.71G 3.42G 1M: 3.41K 3.41G 6.82G 3.41K 3.41G 6.86G 3.41K 3.41G 6.83G 2M: 0 0 6.82G 0 0 6.86G 0 0 6.83G 4M: 0 0 6.82G 0 0 6.86G 0 0 6.83G 8M: 0 0 6.82G 0 0 6.86G 0 0 6.83G 16M: 0 0 6.82G 0 0 6.86G 0 0 6.83G Reviewed-by: Brian Behlendorf Signed-off-by: Robert E. 
Novak Closes: #9158 Closes #10315 --- cmd/zdb/zdb.c | 203 +++++++++++++ tests/runfiles/common.run | 5 +- .../tests/functional/cli_root/zdb/Makefile.am | 1 + .../cli_root/zdb/zdb_block_size_histogram.ksh | 272 ++++++++++++++++++ 4 files changed, 479 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 763a086ac..a329e4a83 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4161,6 +4161,7 @@ static const char *zdb_ot_extname[] = { }; #define ZB_TOTAL DN_MAX_LEVELS +#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; @@ -4168,6 +4169,15 @@ typedef struct zdb_cb { uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_total; + uint64_t zcb_lsize_total; + uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; @@ -4191,6 +4201,172 @@ same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) return ((off1 >> ms_shift) == (off2 >> ms_shift)); } +/* + * Used to simplify reporting of the histogram data. + */ +typedef struct one_histo { + const char *name; + uint64_t *count; + uint64_t *len; + uint64_t cumulative; +} one_histo_t; + +/* + * The number of separate histograms processed for psize, lsize and asize. + */ +#define NUM_HISTO 3 + +/* + * This routine will create a fixed column size output of three different + * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M + * the count, length and cumulative length of the psize, lsize and + * asize blocks. 
+ * + * All three types of blocks are listed on a single line + * + * By default the table is printed in nicenumber format (e.g. 123K) but + * if the '-P' parameter is specified then the full raw number (parseable) + * is printed out. + */ +static void +dump_size_histograms(zdb_cb_t *zcb) +{ + /* + * A temporary buffer that allows us to convert a number into + * a string using zdb_nicenumber to allow either raw or human + * readable numbers to be output. + */ + char numbuf[32]; + + /* + * Define titles which are used in the headers of the tables + * printed by this routine. + */ + const char blocksize_title1[] = "block"; + const char blocksize_title2[] = "size"; + const char count_title[] = "Count"; + const char length_title[] = "Size"; + const char cumulative_title[] = "Cum."; + + /* + * Setup the histogram arrays (psize, lsize, and asize). + */ + one_histo_t parm_histo[NUM_HISTO]; + + parm_histo[0].name = "psize"; + parm_histo[0].count = zcb->zcb_psize_count; + parm_histo[0].len = zcb->zcb_psize_len; + parm_histo[0].cumulative = 0; + + parm_histo[1].name = "lsize"; + parm_histo[1].count = zcb->zcb_lsize_count; + parm_histo[1].len = zcb->zcb_lsize_len; + parm_histo[1].cumulative = 0; + + parm_histo[2].name = "asize"; + parm_histo[2].count = zcb->zcb_asize_count; + parm_histo[2].len = zcb->zcb_asize_len; + parm_histo[2].cumulative = 0; + + + (void) printf("\nBlock Size Histogram\n"); + /* + * Print the first line titles + */ + if (dump_opt['P']) + (void) printf("\n%s\t", blocksize_title1); + else + (void) printf("\n%7s ", blocksize_title1); + + for (int j = 0; j < NUM_HISTO; j++) { + if (dump_opt['P']) { + if (j < NUM_HISTO - 1) { + (void) printf("%s\t\t\t", parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf(" %s", parm_histo[j].name); + } + } else { + if (j < NUM_HISTO - 1) { + /* Left aligned strings in the output */ + (void) printf("%-7s ", + parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) 
printf("%s", parm_histo[j].name); + } + } + } + (void) printf("\n"); + + /* + * Print the second line titles + */ + if (dump_opt['P']) { + (void) printf("%s\t", blocksize_title2); + } else { + (void) printf("%7s ", blocksize_title2); + } + + for (int i = 0; i < NUM_HISTO; i++) { + if (dump_opt['P']) { + (void) printf("%s\t%s\t%s\t", + count_title, length_title, cumulative_title); + } else { + (void) printf("%7s%7s%7s", + count_title, length_title, cumulative_title); + } + } + (void) printf("\n"); + + /* + * Print the rows + */ + for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { + + /* + * Print the first column showing the blocksize + */ + zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); + + if (dump_opt['P']) { + (void) printf("%s", numbuf); + } else { + (void) printf("%7s:", numbuf); + } + + /* + * Print the remaining set of 3 columns per size: + * for psize, lsize and asize + */ + for (int j = 0; j < NUM_HISTO; j++) { + parm_histo[j].cumulative += parm_histo[j].len[i]; + + zdb_nicenum(parm_histo[j].count[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].len[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].cumulative, + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + } + (void) printf("\n"); + } +} + static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) @@ -4284,6 +4460,28 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, [BPE_GET_PSIZE(bp)]++; return; } + /* + * The binning histogram bins by powers of two up to + * SPA_MAXBLOCKSIZE rather than creating bins for + * every possible blocksize found in the pool. 
+ */ + int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + + zcb->zcb_psize_count[bin]++; + zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); + zcb->zcb_psize_total += BP_GET_PSIZE(bp); + + bin = highbit64(BP_GET_LSIZE(bp)) - 1; + + zcb->zcb_lsize_count[bin]++; + zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); + zcb->zcb_lsize_total += BP_GET_LSIZE(bp); + + bin = highbit64(BP_GET_ASIZE(bp)) - 1; + + zcb->zcb_asize_count[bin]++; + zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); + zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (dump_opt['L']) return; @@ -5645,6 +5843,11 @@ dump_block_stats(spa_t *spa) } } } + + /* Output a table summarizing block sizes in the pool */ + if (dump_opt['b'] >= 2) { + dump_size_histograms(&zcb); + } } (void) printf("\n"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index d8c109eb7..765ffea8a 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -104,8 +104,9 @@ tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', - 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_checksum', 'zdb_decompress', - 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_display_block', + 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', + 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', + 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id'] pre = post = diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am index e332a91a8..3cf13f3ae 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am @@ -7,6 +7,7 @@ dist_pkgdata_SCRIPTS = \ zdb_006_pos.ksh \ zdb_args_neg.ksh \ zdb_args_pos.ksh \ + zdb_block_size_histogram.ksh \ zdb_checksum.ksh \ zdb_decompress.ksh \ zdb_object_range_neg.ksh \ diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh new file mode 100755 index 000000000..0c949f983 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -0,0 +1,272 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security LLC. + +. $STF_SUITE/include/libtest.shlib + + +# +# DESCRIPTION: +# Create a pool and populate it with files of various +# recordsizes +# +# STRATEGY: +# 1. Create pool +# 2. Populate it +# 3. Run zdb -Pbbb on pool +# 4. Verify variance on blocksizes +# +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +SPA_MAXBLOCKSHIFT=24 + +function histo_populate_test_pool +{ + if [ $# -ne 1 ]; then + log_note "histo_populate_test_pool: insufficient parameters" + log_fail "hptp: 1 requested $# received" + fi + typeset pool=$1 + + set -A recordsizes + typeset -i min_rsbits=9 #512 + typeset -i max_rsbits=SPA_MAXBLOCKSHIFT #16 MiB + typeset -i sum_filesizes=0 + re_number='^[0-9]+$' + + let histo_pool_size=$(get_pool_prop size ${pool}) + if [[ ! ${histo_pool_size} =~ ${re_number} ]]; then + log_fail "histo_pool_size is not numeric ${histo_pool_size}" + fi + let max_pool_record_size=$(get_prop recordsize ${pool}) + if [[ ! 
${max_pool_record_size} =~ ${re_number} ]]; then + log_fail "hptp: max_pool_record_size is not numeric ${max_pool_record_size}" + fi + + sum_filesizes=$(echo "2^21"|bc) + ((min_pool_size=12*sum_filesizes)) + if [ ${histo_pool_size} -lt ${min_pool_size} ]; then + log_note "hptp: Your pool size ${histo_pool_size}" + log_fail "hptp: is less than minimum ${min_pool_size}" + fi + this_ri=min_rsbits + filenum=0 + total_count=0 + ################### + # generate 10% + 20% + 30% + 31% = 91% of the filespace + # attempting to use 100% will lead to no space left on device + # Heuristic testing showed that 91% was the practical upper + # bound on the default 4G zpool (mirrored) that is used in + # testing. + # + # In order to expedite testing, we will only fill 2G (of 4G) + # of the test pool. You may want to modify this for + # standalone testing. + # + # In filling only 50% of the pool, we create one object on + # each "pass" below to achieve multiple objects per record + # size. Creating one file per object would lead to + # excessive file creation time. + ################### + # for pass in 10 20 30 31 # 91% + for pass in 20 20 10 # 50% + do + ((thiscount=(((histo_pool_size*pass)/100)/sum_filesizes))) + + ((total_count+=thiscount)) + for rb in $(seq ${min_rsbits} ${max_rsbits}) + do + this_rs=$(echo "2^${rb}" | bc) + if [ ${this_rs} -gt ${max_pool_record_size} ]; then + continue + fi + + if [ ! -d /${pool}/B_${this_rs} ]; then + zfs create ${pool}/B_${this_rs} + zfs set recordsize=${this_rs} \ + ${pool}/B_${this_rs} + fi + #################### + # Create the files in the devices and datasets + # of the right size. 
The files are filled + # with random data to defeat the compression + # + # Note that the dd output is suppressed unless + # there are errors + #################### + + dd if=/dev/urandom \ + of=/${pool}/B_${this_rs}/file_${filenum} \ + bs=${this_rs} count=${thiscount} \ + iflag=fullblock 2>&1 | \ + egrep -v -e "records in" -e "records out" \ + -e "bytes.*copied" + ((filenum+=1)) + done + done + + #################### + # Testing showed that on some devices, unless the pool is + # synchronized, that the block counts will be below the + # anticipated sizes since not all of the blocks will be flushed + # to the device. This 'sync' command prevents that from + # happening. + #################### + log_must zpool sync ${pool} +} +function histo_check_test_pool +{ + if [ $# -ne 1 ]; then + log_note "histo_check_test_pool: insufficient parameters" + log_fail "hctp: 1 requested $# received" + fi + typeset pool=$1 + + set -A recordsizes + set -A recordcounts + typeset -i rb + typeset -i min_rsbits=9 #512 + typeset -i max_rsbits=SPA_MAXBLOCKSHIFT+1 + typeset -i this_rs + typeset -i this_ri + typeset -i sum_filesizes=0 + typeset dumped + typeset stripped + + let histo_check_pool_size=$(get_pool_prop size ${pool}) + if [[ ! ${histo_check_pool_size} =~ ${re_number} ]]; then + log_fail "histo_check_pool_size is not numeric ${histo_check_pool_size}" + fi + let max_pool_record_size=$(get_prop recordsize ${pool}) + if [[ ! 
${max_pool_record_size} =~ ${re_number} ]]; then + log_fail "hctp: max_pool_record_size is not numeric ${max_pool_record_size}" + fi + + dumped="${TEST_BASE_DIR}/${pool}_dump.txt" + stripped="${TEST_BASE_DIR}/${pool}_stripped.txt" + + zdb -Pbbb ${pool} | \ + tee ${dumped} | \ + sed -e '1,/^block[ ][ ]*psize[ ][ ]*lsize.*$/d' \ + -e '/^size[ ]*Count/d' -e '/^$/,$d' \ + > ${stripped} + + sum_filesizes=$(echo "2^21"|bc) + + ################### + # generate 10% + 20% + 30% + 31% = 91% of the filespace + # attempting to use 100% will lead to no space left on device + # attempting to use 100% will lead to no space left on device + # Heuristic testing showed that 91% was the practical upper + # bound on the default 4G zpool (mirrored) that is used in + # testing. + # + # In order to expedite testing, we will only fill 2G (of 4G) + # of the test pool. You may want to modify this for + # standalone testing. + # + # In filling only 50% of the pool, we create one object on + # each "pass" below to achieve multiple objects per record + # size. Creating one file per object would lead to + # excessive file creation time. + ################### + # for pass in 10 20 30 31 # 91% + for pass in 20 20 10 # 50% + do + ((thiscount=(((histo_check_pool_size*pass)/100)/sum_filesizes))) + + for rb in $(seq ${min_rsbits} ${max_rsbits}) + do + blksize=$(echo "2^$rb"|bc) + if [ $blksize -le $max_pool_record_size ]; then + ((recordcounts[$blksize]+=thiscount)) + fi + done + done + + ################### + # compare the above computed counts for blocks against + # lsize count. Since some devices have a minimum hardware + # blocksize > 512, we cannot compare against the asize count. + # E.G., if the HWBlocksize = 4096, then the asize counts for + # 512, 1024 and 2048 will be zero and rolled up into the + # 4096 blocksize count for asize. For verification we stick + # to just lsize counts. + # + # The max_variance is hard-coded here at 10%. 
testing so far + # has shown this to be in the range of 2%-8% so we leave a + # generous allowance... This might need changes in the future + ################### + let max_variance=10 + let fail_value=0 + let error_count=0 + log_note "Comparisons for ${pool}" + log_note "Bsize is the blocksize, Count is predicted value" + log_note "Bsize\tCount\tpsize\tlsize\tasize" + while read -r blksize pc pl pm lc ll lm ac al am + do + if [ $blksize -gt $max_pool_record_size ]; then + continue + fi + log_note \ + "$blksize\t${recordcounts[${blksize}]}\t$pc\t$lc\t$ac" + + ################### + # get the computed record count and compute the + # difference percentage in integer arithmetic + ################### + rc=${recordcounts[${blksize}]} + ((rclc=(rc-lc)<0?lc-rc:rc-lc)) # absolute value + ((dp=(rclc*100)/rc)) + + ################### + # Check against the allowed variance + ################### + if [ $dp -gt ${max_variance} ]; then + log_note \ + "Expected variance < ${max_variance}% observed ${dp}%" + ((error_count++)) + if [ ${dp} -gt ${fail_value} ]; then + fail_value=${dp} + fi + fi + done < ${stripped} + if [ ${fail_value} -gt 0 ]; then + if [ ${error_count} -eq 1 ]; then + log_note "hctp: There was ${error_count} error" + else + log_note "hctp:There were a total of ${error_count} errors" + fi + log_fail \ + "hctp: Max variance of ${max_variance}% exceeded, saw ${fail_value}%" + fi +} + +log_assert "Verify zdb -Pbbb (block histogram) works as expected" +log_onexit cleanup +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS + +histo_populate_test_pool $TESTPOOL + +histo_check_test_pool $TESTPOOL + +log_pass "Histogram for zdb"