Add block histogram to zdb

The block histogram tracks the changes to psize, lsize and asize
both in the count of the number of blocks (by blocksize) and the
total length of all of the blocks for that blocksize.  It also
keeps a running total of the cumulative size of all of the blocks
up to each size to help determine the size of caching SSDs to be
added to zfs hardware deployments.

The block histogram counts and lengths are summarized in bins
which are powers of two. Even rows with counts of zero are printed.

This change is accessed by specifying one of two options:

zdb -bbb pool
zdb -Pbbb pool

The first version prints the table in fixed size columns.
The second prints in "parseable" output that can be placed into
a CSV file.

Fixed Column, nicenum output sample:
  block   psize                lsize                asize
   size   Count Length   Cum.  Count Length   Cum.  Count Length   Cum.
    512:  3.50K  1.75M  1.75M  3.43K  1.71M  1.71M  3.41K  1.71M  1.71M
     1K:  3.65K  3.67M  5.43M  3.43K  3.44M  5.15M  3.50K  3.51M  5.22M
     2K:  3.45K  6.92M  12.3M  3.41K  6.83M  12.0M  3.59K  7.26M  12.5M
     4K:  3.44K  13.8M  26.1M  3.43K  13.7M  25.7M  3.49K  14.1M  26.6M
     8K:  3.42K  27.3M  53.5M  3.41K  27.3M  53.0M  3.44K  27.6M  54.2M
    16K:  3.43K  54.9M   108M  3.50K  56.1M   109M  3.42K  54.7M   109M
    32K:  3.44K   110M   219M  3.41K   109M   218M  3.43K   110M   219M
    64K:  3.41K   218M   437M  3.41K   218M   437M  3.44K   221M   439M
   128K:  3.41K   437M   874M  3.70K   474M   911M  3.41K   437M   876M
   256K:  3.41K   874M  1.71G  3.41K   874M  1.74G  3.41K   874M  1.71G
   512K:  3.41K  1.71G  3.41G  3.41K  1.71G  3.45G  3.41K  1.71G  3.42G
     1M:  3.41K  3.41G  6.82G  3.41K  3.41G  6.86G  3.41K  3.41G  6.83G
     2M:      0      0  6.82G      0      0  6.86G      0      0  6.83G
     4M:      0      0  6.82G      0      0  6.86G      0      0  6.83G
     8M:      0      0  6.82G      0      0  6.86G      0      0  6.83G
    16M:      0      0  6.82G      0      0  6.86G      0      0  6.83G

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Robert E. Novak <novak5@llnl.gov>
Closes: #9158 
Closes #10315
This commit is contained in:
Robert Novak 2020-06-26 15:09:20 -07:00 committed by GitHub
parent 6b99fc0620
commit bfcbec6f5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 479 additions and 2 deletions

View File

@ -4161,6 +4161,7 @@ static const char *zdb_ot_extname[] = {
}; };
#define ZB_TOTAL DN_MAX_LEVELS #define ZB_TOTAL DN_MAX_LEVELS
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
typedef struct zdb_cb { typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
@ -4168,6 +4169,15 @@ typedef struct zdb_cb {
uint64_t zcb_checkpoint_size; uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize; uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks; uint64_t zcb_dedup_blocks;
uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
uint64_t zcb_psize_len[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_len[SPA_MAX_FOR_16M];
uint64_t zcb_asize_len[SPA_MAX_FOR_16M];
uint64_t zcb_psize_total;
uint64_t zcb_lsize_total;
uint64_t zcb_asize_total;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
[BPE_PAYLOAD_SIZE + 1]; [BPE_PAYLOAD_SIZE + 1];
@ -4191,6 +4201,172 @@ same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
return ((off1 >> ms_shift) == (off2 >> ms_shift)); return ((off1 >> ms_shift) == (off2 >> ms_shift));
} }
/*
* Used to simplify reporting of the histogram data.
*/
typedef struct one_histo {
char *name;
uint64_t *count;
uint64_t *len;
uint64_t cumulative;
} one_histo_t;
/*
* The number of separate histograms processed for psize, lsize and asize.
*/
#define NUM_HISTO 3
/*
* This routine will create a fixed column size output of three different
* histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
* the count, length and cumulative length of the psize, lsize and
* asize blocks.
*
* All three types of blocks are listed on a single line
*
* By default the table is printed in nicenumber format (e.g. 123K) but
* if the '-P' parameter is specified then the full raw number (parseable)
* is printed out.
*/
static void
dump_size_histograms(zdb_cb_t *zcb)
{
/*
* A temporary buffer that allows us to convert a number into
* a string using zdb_nicenumber to allow either raw or human
* readable numbers to be output.
*/
char numbuf[32];
/*
* Define titles which are used in the headers of the tables
* printed by this routine.
*/
const char blocksize_title1[] = "block";
const char blocksize_title2[] = "size";
const char count_title[] = "Count";
const char length_title[] = "Size";
const char cumulative_title[] = "Cum.";
/*
* Setup the histogram arrays (psize, lsize, and asize).
*/
one_histo_t parm_histo[NUM_HISTO];
parm_histo[0].name = "psize";
parm_histo[0].count = zcb->zcb_psize_count;
parm_histo[0].len = zcb->zcb_psize_len;
parm_histo[0].cumulative = 0;
parm_histo[1].name = "lsize";
parm_histo[1].count = zcb->zcb_lsize_count;
parm_histo[1].len = zcb->zcb_lsize_len;
parm_histo[1].cumulative = 0;
parm_histo[2].name = "asize";
parm_histo[2].count = zcb->zcb_asize_count;
parm_histo[2].len = zcb->zcb_asize_len;
parm_histo[2].cumulative = 0;
(void) printf("\nBlock Size Histogram\n");
/*
* Print the first line titles
*/
if (dump_opt['P'])
(void) printf("\n%s\t", blocksize_title1);
else
(void) printf("\n%7s ", blocksize_title1);
for (int j = 0; j < NUM_HISTO; j++) {
if (dump_opt['P']) {
if (j < NUM_HISTO - 1) {
(void) printf("%s\t\t\t", parm_histo[j].name);
} else {
/* Don't print trailing spaces */
(void) printf(" %s", parm_histo[j].name);
}
} else {
if (j < NUM_HISTO - 1) {
/* Left aligned strings in the output */
(void) printf("%-7s ",
parm_histo[j].name);
} else {
/* Don't print trailing spaces */
(void) printf("%s", parm_histo[j].name);
}
}
}
(void) printf("\n");
/*
* Print the second line titles
*/
if (dump_opt['P']) {
(void) printf("%s\t", blocksize_title2);
} else {
(void) printf("%7s ", blocksize_title2);
}
for (int i = 0; i < NUM_HISTO; i++) {
if (dump_opt['P']) {
(void) printf("%s\t%s\t%s\t",
count_title, length_title, cumulative_title);
} else {
(void) printf("%7s%7s%7s",
count_title, length_title, cumulative_title);
}
}
(void) printf("\n");
/*
* Print the rows
*/
for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
/*
* Print the first column showing the blocksize
*/
zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
if (dump_opt['P']) {
printf("%s", numbuf);
} else {
printf("%7s:", numbuf);
}
/*
* Print the remaining set of 3 columns per size:
* for psize, lsize and asize
*/
for (int j = 0; j < NUM_HISTO; j++) {
parm_histo[j].cumulative += parm_histo[j].len[i];
zdb_nicenum(parm_histo[j].count[i],
numbuf, sizeof (numbuf));
if (dump_opt['P'])
(void) printf("\t%s", numbuf);
else
(void) printf("%7s", numbuf);
zdb_nicenum(parm_histo[j].len[i],
numbuf, sizeof (numbuf));
if (dump_opt['P'])
(void) printf("\t%s", numbuf);
else
(void) printf("%7s", numbuf);
zdb_nicenum(parm_histo[j].cumulative,
numbuf, sizeof (numbuf));
if (dump_opt['P'])
(void) printf("\t%s", numbuf);
else
(void) printf("%7s", numbuf);
}
(void) printf("\n");
}
}
static void static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type) dmu_object_type_t type)
@ -4284,6 +4460,28 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
[BPE_GET_PSIZE(bp)]++; [BPE_GET_PSIZE(bp)]++;
return; return;
} }
/*
* The binning histogram bins by powers of two up to
* SPA_MAXBLOCKSIZE rather than creating bins for
* every possible blocksize found in the pool.
*/
int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
zcb->zcb_psize_count[bin]++;
zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
zcb->zcb_psize_total += BP_GET_PSIZE(bp);
bin = highbit64(BP_GET_LSIZE(bp)) - 1;
zcb->zcb_lsize_count[bin]++;
zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
bin = highbit64(BP_GET_ASIZE(bp)) - 1;
zcb->zcb_asize_count[bin]++;
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (dump_opt['L']) if (dump_opt['L'])
return; return;
@ -5645,6 +5843,11 @@ dump_block_stats(spa_t *spa)
} }
} }
} }
/* Output a table summarizing block sizes in the pool */
if (dump_opt['b'] >= 2) {
dump_size_histograms(&zcb);
}
} }
(void) printf("\n"); (void) printf("\n");

View File

@ -104,8 +104,9 @@ tags = ['functional', 'clean_mirror']
[tests/functional/cli_root/zdb] [tests/functional/cli_root/zdb]
tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_checksum', 'zdb_decompress', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_display_block', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos',
'zdb_objset_id'] 'zdb_objset_id']
pre = pre =
post = post =

View File

@ -7,6 +7,7 @@ dist_pkgdata_SCRIPTS = \
zdb_006_pos.ksh \ zdb_006_pos.ksh \
zdb_args_neg.ksh \ zdb_args_neg.ksh \
zdb_args_pos.ksh \ zdb_args_pos.ksh \
zdb_block_size_histogram.ksh \
zdb_checksum.ksh \ zdb_checksum.ksh \
zdb_decompress.ksh \ zdb_decompress.ksh \
zdb_object_range_neg.ksh \ zdb_object_range_neg.ksh \

View File

@ -0,0 +1,272 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2017 by Delphix. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security LLC.
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Create a pool and populate it with files of various
# recordsizes
#
# STRATEGY:
# 1. Create pool
# 2. Populate it
# 3. Run zdb -Pbbb on pool
# 4. Verify variance on blocksizes
#
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
SPA_MAXBLOCKSHIFT=24
function histo_populate_test_pool
{
if [ $# -ne 1 ]; then
log_note "histo_populate_test_pool: insufficient parameters"
log_fail "hptp: 1 requested $# received"
fi
typeset pool=$1
set -A recordsizes
typeset -i min_rsbits=9 #512
typeset -i max_rsbits=SPA_MAXBLOCKSHIFT #16 MiB
typeset -i sum_filesizes=0
re_number='^[0-9]+$'
let histo_pool_size=$(get_pool_prop size ${pool})
if [[ ! ${histo_pool_size} =~ ${re_number} ]]; then
log_fail "histo_pool_size is not numeric ${pool_size}"
fi
let max_pool_record_size=$(get_prop recordsize ${pool})
if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then
log_fail "hptp: max_pool_record_size is not numeric ${max_pool_record_size}"
fi
sum_filesizes=$(echo "2^21"|bc)
((min_pool_size=12*sum_filesizes))
if [ ${histo_pool_size} -lt ${min_pool_size} ]; then
log_note "hptp: Your pool size ${histo_pool_size}"
log_fail "hptp: is less than minimum ${min_pool_size}"
fi
this_ri=min_rsbits
file_num=0
total_count=0
###################
# generate 10% + 20% + 30% + 31% = 91% of the filespace
# attempting to use 100% will lead to no space left on device
# Heuristic testing showed that 91% was the practical upper
# bound on the default 4G zpool (mirrored) that is used in
# testing.
#
# In order to expedite testing, we will only fill 2G (of 4G)
# of the test pool. You may want to modify this for
# standalone testing.
#
# In filling only 50% of the pool, we create one object on
# each "pass" below to achieve multiple objects per record
# size. Creating one file per object would lead to
# excessive file creation time.
###################
# for pass in 10 20 30 31 # 91%
for pass in 20 20 10 # 50%
do
((thiscount=(((histo_pool_size*pass)/100)/sum_filesizes)))
((total_count+=thiscount))
for rb in $(seq ${min_rsbits} ${max_rsbits})
do
this_rs=$(echo "2^${rb}" | bc)
if [ ${this_rs} -gt ${max_pool_record_size} ]; then
continue
fi
if [ ! -d /${pool}/B_${this_rs} ]; then
zfs create ${pool}/B_${this_rs}
zfs set recordsize=${this_rs} \
${pool}/B_${this_rs}
fi
####################
# Create the files in the devices and datasets
# of the right size. The files are filled
# with random data to defeat the compression
#
# Note that the dd output is suppressed unless
# there are errors
####################
dd if=/dev/urandom \
of=/${pool}/B_${this_rs}/file_${filenum} \
bs=${this_rs} count=${thiscount} \
iflag=fullblock 2>&1 | \
egrep -v -e "records in" -e "records out" \
-e "bytes.*copied"
((filenum+=1))
done
done
####################
# Testing showed that on some devices, unless the pool is
# synchronized, that the block counts will be below the
# anticipated sizes since not all of the blocks will be flushed
# to the device. This 'sync' command prevents that from
# happening.
####################
log_must zpool sync ${pool}
}
function histo_check_test_pool
{
if [ $# -ne 1 ]; then
log_note "histo_check_test_pool: insufficient parameters"
log_fail "hctp: 1 requested $# received"
fi
typeset pool=$1
set -A recordsizes
set -A recordcounts
typeset -i rb
typeset -i min_rsbits=9 #512
typeset -i max_rsbits=SPA_MAXBLOCKSHIFT+1
typeset -i this_rs
typeset -i this_ri
typeset -i sum_filesizes=0
typeset dumped
typeset stripped
let histo_check_pool_size=$(get_pool_prop size ${pool})
if [[ ! ${histo_check_pool_size} =~ ${re_number} ]]; then
log_fail "histo_check_pool_size is not numeric ${histo_check_pool_size}"
fi
let max_pool_record_size=$(get_prop recordsize ${pool})
if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then
log_fail "hctp: max_pool_record_size is not numeric ${max_pool_record_size}"
fi
dumped="${TEST_BASE_DIR}/${pool}_dump.txt"
stripped="${TEST_BASE_DIR}/${pool}_stripped.txt"
zdb -Pbbb ${pool} | \
tee ${dumped} | \
sed -e '1,/^block[ ][ ]*psize[ ][ ]*lsize.*$/d' \
-e '/^size[ ]*Count/d' -e '/^$/,$d' \
> ${stripped}
sum_filesizes=$(echo "2^21"|bc)
###################
# generate 10% + 20% + 30% + 31% = 91% of the filespace
# attempting to use 100% will lead to no space left on device
# attempting to use 100% will lead to no space left on device
# Heuristic testing showed that 91% was the practical upper
# bound on the default 4G zpool (mirrored) that is used in
# testing.
#
# In order to expedite testing, we will only fill 2G (of 4G)
# of the test pool. You may want to modify this for
# standalone testing.
#
# In filling only 50% of the pool, we create one object on
# each "pass" below to achieve multiple objects per record
# size. Creating one file per object would lead to
# excessive file creation time.
###################
# for pass in 10 20 30 31 # 91%
for pass in 20 20 10 # 50%
do
((thiscount=(((histo_check_pool_size*pass)/100)/sum_filesizes)))
for rb in $(seq ${min_rsbits} ${max_rsbits})
do
blksize=$(echo "2^$rb"|bc)
if [ $blksize -le $max_pool_record_size ]; then
((recordcounts[$blksize]+=thiscount))
fi
done
done
###################
# compare the above computed counts for blocks against
# lsize count. Since some devices have a minimum hardware
# blocksize > 512, we cannot compare against the asize count.
# E.G., if the HWBlocksize = 4096, then the asize counts for
# 512, 1024 and 2048 will be zero and rolled up into the
# 4096 blocksize count for asize. For verification we stick
# to just lsize counts.
#
# The max_variance is hard-coded here at 10%. testing so far
# has shown this to be in the range of 2%-8% so we leave a
# generous allowance... This might need changes in the future
###################
let max_variance=10
let fail_value=0
let error_count=0
log_note "Comparisons for ${pool}"
log_note "Bsize is the blocksize, Count is predicted value"
log_note "Bsize\tCount\tpsize\tlsize\tasize"
while read -r blksize pc pl pm lc ll lm ac al am
do
if [ $blksize -gt $max_pool_record_size ]; then
continue
fi
log_note \
"$blksize\t${recordcounts[${blksize}]}\t$pc\t$lc\t$ac"
###################
# get the computer record count and compute the
# difference percentage in integer arithmetic
###################
rc=${recordcounts[${blksize}]}
((rclc=(rc-lc)<0?lc-rc:rc-lc)) # absolute value
((dp=(rclc*100)/rc))
###################
# Check against the allowed variance
###################
if [ $dp -gt ${max_variance} ]; then
log_note \
"Expected variance < ${max_variance}% observed ${dp}%"
if [ ${dp} -gt ${fail_value} ]; then
fail_value=${dp}
((error_count++))
fi
fi
done < ${stripped}
if [ ${fail_value} -gt 0 ]; then
if [ ${error_count} -eq 1 ]; then
log_note "hctp: There was ${error_count} error"
else
log_note "hctp:There were a total of ${error_count} errors"
fi
log_fail \
"hctp: Max variance of ${max_variance}% exceeded, saw ${fail_value}%"
fi
}
# Main test flow: build a mirrored pool, populate it with files at
# every supported recordsize, then verify zdb's block size histogram.
log_assert "Verify zdb -Pbbb (block histogram) works as expected"
log_onexit cleanup
# Requires global (pool-creating) privileges and at least two disks
# for the mirror.
verify_runnable "global"
verify_disk_count "$DISKS" 2
default_mirror_setup_noexit $DISKS
histo_populate_test_pool $TESTPOOL
histo_check_test_pool $TESTPOOL
log_pass "Histogram for zdb"