683 lines
17 KiB
Plaintext
683 lines
17 KiB
Plaintext
perf.data format
|
|
|
|
Up to date as of v4.7
|
|
|
|
This document describes the on-disk perf.data format, generated by perf record
|
|
or perf inject and consumed by the other perf tools.
|
|
|
|
On a high level perf.data contains the events generated by the PMUs, plus metadata.
|
|
|
|
All fields are in native-endian of the machine that generated the perf.data.
|
|
|
|
When perf is writing to a pipe it uses a special version of the file
|
|
format that does not rely on seeking to adjust data offsets. This
|
|
format is described in the "Pipe-mode data" section. The pipe data version can be
|
|
augmented with additional events using perf inject.
|
|
|
|
The file starts with a perf_header:
|
|
|
|
struct perf_header {
|
|
char magic[8]; /* PERFILE2 */
|
|
uint64_t size; /* size of the header */
|
|
uint64_t attr_size; /* size of an attribute in attrs */
|
|
struct perf_file_section attrs;
|
|
struct perf_file_section data;
|
|
struct perf_file_section event_types;
|
|
uint64_t flags;
|
|
uint64_t flags1[3];
|
|
};
|
|
|
|
The magic number identifies the perf file and the version. Current perf versions
|
|
use PERFILE2. Old perf versions generated a version 1 format (PERFFILE). Version 1
|
|
is not described here. The magic number also identifies the endian. When the
|
|
magic value is 64bit byte swapped compared to the native value, the file is in non-native
|
|
endian.
|
|
|
|
A perf_file_section contains a pointer to another section of the perf file.
|
|
The header contains three such pointers: for attributes, data and event types.
|
|
|
|
struct perf_file_section {
|
|
uint64_t offset; /* offset from start of file */
|
|
uint64_t size; /* size of the section */
|
|
};
|
|
|
|
Flags section:
|
|
|
|
For each of the optional features a perf_file_section is placed after the data
|
|
section if the feature bit is set in the perf_header flags bitset. The
|
|
respective perf_file_section points to the data of the additional header and
|
|
defines its size.
|
|
|
|
Some headers consist of strings, which are defined like this:
|
|
|
|
struct perf_header_string {
|
|
uint32_t len;
|
|
char string[len]; /* zero terminated */
|
|
};
|
|
|
|
Some headers consist of a sequence of strings, which starts with a count of the strings that follow:
|
|
|
|
struct perf_header_string_list {
|
|
uint32_t nr;
|
|
struct perf_header_string strings[nr]; /* variable length records */
|
|
};
|
|
|
|
The bits are the flags bits in a 256 bit bitmap starting with
|
|
flags. These define the valid bits:
|
|
|
|
HEADER_RESERVED = 0, /* always cleared */
|
|
HEADER_FIRST_FEATURE = 1,
|
|
HEADER_TRACING_DATA = 1,
|
|
|
|
Describe me.
|
|
|
|
HEADER_BUILD_ID = 2,
|
|
|
|
The header consists of a sequence of build_id_event records. The size of each record
|
|
is defined by header.size (see perf_event.h). Each event defines an ELF build id
|
|
for an executable file name for a pid. An ELF build id is a unique identifier
|
|
assigned by the linker to an executable.
|
|
|
|
struct build_id_event {
|
|
struct perf_event_header header;
|
|
pid_t pid;
|
|
uint8_t build_id[24];
|
|
char filename[header.size - offsetof(struct build_id_event, filename)];
|
|
};
|
|
|
|
HEADER_HOSTNAME = 3,
|
|
|
|
A perf_header_string with the hostname where the data was collected
|
|
(uname -n)
|
|
|
|
HEADER_OSRELEASE = 4,
|
|
|
|
A perf_header_string with the os release where the data was collected
|
|
(uname -r)
|
|
|
|
HEADER_VERSION = 5,
|
|
|
|
A perf_header_string with the perf user tool version where the
|
|
data was collected. This is the same as the version of the source tree
|
|
the perf tool was built from.
|
|
|
|
HEADER_ARCH = 6,
|
|
|
|
A perf_header_string with the CPU architecture (uname -m)
|
|
|
|
HEADER_NRCPUS = 7,
|
|
|
|
A structure defining the number of CPUs.
|
|
|
|
struct nr_cpus {
|
|
uint32_t nr_cpus_available; /* CPUs not yet onlined */
|
|
uint32_t nr_cpus_online;
|
|
};
|
|
|
|
HEADER_CPUDESC = 8,
|
|
|
|
A perf_header_string with description of the CPU. On x86 this is the model name
|
|
in /proc/cpuinfo
|
|
|
|
HEADER_CPUID = 9,
|
|
|
|
A perf_header_string with the exact CPU type. On x86 this is
|
|
vendor,family,model,stepping. For example: GenuineIntel,6,69,1
|
|
|
|
HEADER_TOTAL_MEM = 10,
|
|
|
|
A uint64_t with the total memory in kilobytes.
|
|
|
|
HEADER_CMDLINE = 11,
|
|
|
|
A perf_header_string_list with the perf arg-vector used to collect the data.
|
|
|
|
HEADER_EVENT_DESC = 12,
|
|
|
|
Another description of the perf_event_attrs, more detailed than header.attrs
|
|
including IDs and names. See perf_event.h or the man page for a description
|
|
of a struct perf_event_attr.
|
|
|
|
struct {
|
|
uint32_t nr; /* number of events */
|
|
uint32_t attr_size; /* size of each perf_event_attr */
|
|
struct {
|
|
struct perf_event_attr attr; /* size of attr_size */
|
|
uint32_t nr_ids;
|
|
struct perf_header_string event_string;
|
|
uint64_t ids[nr_ids];
|
|
} events[nr]; /* Variable length records */
|
|
};
|
|
|
|
HEADER_CPU_TOPOLOGY = 13,
|
|
|
|
struct {
|
|
/*
|
|
* First revision of HEADER_CPU_TOPOLOGY
|
|
*
|
|
* See 'struct perf_header_string_list' definition earlier
|
|
* in this file.
|
|
*/
|
|
|
|
struct perf_header_string_list cores; /* Variable length */
|
|
struct perf_header_string_list threads; /* Variable length */
|
|
|
|
/*
|
|
* Second revision of HEADER_CPU_TOPOLOGY, older tools
|
|
* will not consider what comes next
|
|
*/
|
|
|
|
struct {
|
|
uint32_t core_id;
|
|
uint32_t socket_id;
|
|
} cpus[nr]; /* Variable length records */
|
|
/* 'nr' comes from previously processed HEADER_NRCPUS's nr_cpus_available */
|
|
|
|
/*
|
|
* Third revision of HEADER_CPU_TOPOLOGY, older tools
|
|
* will not consider what comes next
|
|
*/
|
|
|
|
struct perf_header_string_list dies; /* Variable length */
|
|
uint32_t die_id[nr_cpus_avail]; /* from previously processed HEADER_NRCPUS, VLA */
|
|
};
|
|
|
|
Example:
|
|
sibling sockets : 0-8
|
|
sibling dies : 0-3
|
|
sibling dies : 4-7
|
|
sibling threads : 0-1
|
|
sibling threads : 2-3
|
|
sibling threads : 4-5
|
|
sibling threads : 6-7
|
|
|
|
HEADER_NUMA_TOPOLOGY = 14,
|
|
|
|
A list of NUMA node descriptions
|
|
|
|
struct {
|
|
uint32_t nr;
|
|
struct {
|
|
uint32_t nodenr;
|
|
uint64_t mem_total;
|
|
uint64_t mem_free;
|
|
struct perf_header_string cpus;
|
|
} nodes[nr]; /* Variable length records */
|
|
};
|
|
|
|
HEADER_BRANCH_STACK = 15,
|
|
|
|
Not implemented in perf.
|
|
|
|
HEADER_PMU_MAPPINGS = 16,
|
|
|
|
A list of PMU structures, defining the different PMUs supported by perf.
|
|
|
|
struct {
|
|
uint32_t nr;
|
|
struct pmu {
|
|
uint32_t pmu_type;
|
|
struct perf_header_string pmu_name;
|
|
} [nr]; /* Variable length records */
|
|
};
|
|
|
|
HEADER_GROUP_DESC = 17,
|
|
|
|
Description of counter groups ({...} in perf syntax)
|
|
|
|
struct {
|
|
uint32_t nr;
|
|
struct {
|
|
struct perf_header_string string;
|
|
uint32_t leader_idx;
|
|
uint32_t nr_members;
|
|
} [nr]; /* Variable length records */
|
|
};
|
|
|
|
HEADER_AUXTRACE = 18,
|
|
|
|
Define additional auxtrace areas in the perf.data. auxtrace is used to store
|
|
undecoded hardware tracing information, such as Intel Processor Trace data.
|
|
|
|
/**
|
|
* struct auxtrace_index_entry - indexes an AUX area tracing event within a
|
|
* perf.data file.
|
|
* @file_offset: offset within the perf.data file
|
|
* @sz: size of the event
|
|
*/
|
|
struct auxtrace_index_entry {
|
|
u64 file_offset;
|
|
u64 sz;
|
|
};
|
|
|
|
#define PERF_AUXTRACE_INDEX_ENTRY_COUNT 256
|
|
|
|
/**
|
|
* struct auxtrace_index - index of AUX area tracing events within a perf.data
|
|
* file.
|
|
* @list: linking a number of arrays of entries
|
|
* @nr: number of entries
|
|
* @entries: array of entries
|
|
*/
|
|
struct auxtrace_index {
|
|
struct list_head list;
|
|
size_t nr;
|
|
struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
|
|
};
|
|
|
|
HEADER_STAT = 19,
|
|
|
|
This is merely a flag signifying that the data section contains data
|
|
recorded from perf stat record.
|
|
|
|
HEADER_CACHE = 20,
|
|
|
|
Description of the cache hierarchy. Based on the Linux sysfs format
|
|
in /sys/devices/system/cpu/cpu*/cache/
|
|
|
|
u32 version Currently always 1
|
|
u32 number_of_cache_levels
|
|
|
|
struct {
|
|
u32 level;
|
|
u32 line_size;
|
|
u32 sets;
|
|
u32 ways;
|
|
struct perf_header_string type;
|
|
struct perf_header_string size;
|
|
struct perf_header_string map;
|
|
}[number_of_cache_levels];
|
|
|
|
HEADER_SAMPLE_TIME = 21,
|
|
|
|
Two uint64_t for the time of first sample and the time of last sample.
|
|
|
|
HEADER_MEM_TOPOLOGY = 22,
|
|
|
|
Physical memory map and its node assignments.
|
|
|
|
The format of data in MEM_TOPOLOGY is as follows:
|
|
|
|
u64 version; // Currently 1
|
|
u64 block_size_bytes; // /sys/devices/system/memory/block_size_bytes
|
|
u64 count; // number of nodes
|
|
|
|
struct memory_node {
|
|
u64 node_id; // node index
|
|
u64 size; // size of bitmap
|
|
struct bitmap {
|
|
/* size of bitmap again */
|
|
u64 bitmapsize;
|
|
/* bitmap of memory indexes that belongs to node */
|
|
/* /sys/devices/system/node/node<NODE>/memory<INDEX> */
|
|
u64 entries[(bitmapsize/64)+1];
|
|
}
|
|
}[count];
|
|
|
|
The MEM_TOPOLOGY can be displayed with following command:
|
|
|
|
$ perf report --header-only -I
|
|
...
|
|
# memory nodes (nr 1, block size 0x8000000):
|
|
# 0 [7G]: 0-23,32-69
|
|
|
|
HEADER_CLOCKID = 23,
|
|
|
|
One uint64_t for the clockid frequency, specified, for instance, via 'perf
|
|
record -k' (see clock_gettime()), to enable timestamps derived metrics
|
|
conversion into wall clock time on the reporting stage.
|
|
|
|
HEADER_DIR_FORMAT = 24,
|
|
|
|
The data files layout is described by HEADER_DIR_FORMAT feature. Currently it
|
|
holds only version number (1):
|
|
|
|
uint64_t version;
|
|
|
|
The current version value (1) means that data files:
|
|
|
|
- Follow the 'data.*' name format.
|
|
|
|
- Contain raw events data in standard perf format as read from kernel (and need
|
|
to be sorted)
|
|
|
|
Future versions are expected to describe different data files layout according
|
|
to special needs.
|
|
|
|
HEADER_BPF_PROG_INFO = 25,
|
|
|
|
struct perf_bpil, which contains detailed information about
|
|
a BPF program, including type, id, tag, jited/xlated instructions, etc.
|
|
|
|
HEADER_BPF_BTF = 26,
|
|
|
|
Contains BPF Type Format (BTF). For more information about BTF, please
|
|
refer to Documentation/bpf/btf.rst.
|
|
|
|
struct {
|
|
u32 id;
|
|
u32 data_size;
|
|
char data[];
|
|
};
|
|
|
|
HEADER_COMPRESSED = 27,
|
|
|
|
struct {
|
|
u32 version;
|
|
u32 type;
|
|
u32 level;
|
|
u32 ratio;
|
|
u32 mmap_len;
|
|
};
|
|
|
|
Indicates that trace contains records of PERF_RECORD_COMPRESSED type
|
|
that have perf_events records in compressed form.
|
|
|
|
HEADER_CPU_PMU_CAPS = 28,
|
|
|
|
A list of cpu PMU capabilities. The format of data is as below.
|
|
|
|
struct {
|
|
u32 nr_cpu_pmu_caps;
|
|
{
|
|
char name[];
|
|
char value[];
|
|
} [nr_cpu_pmu_caps]
|
|
};
|
|
|
|
|
|
Example:
|
|
cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake
|
|
|
|
HEADER_CLOCK_DATA = 29,
|
|
|
|
Contains clock id and its reference time together with wall clock
|
|
time taken at the 'same time', both values are in nanoseconds.
|
|
The format of data is as below.
|
|
|
|
struct {
|
|
u32 version; /* version = 1 */
|
|
u32 clockid;
|
|
u64 wall_clock_ns;
|
|
u64 clockid_time_ns;
|
|
};
|
|
|
|
HEADER_HYBRID_TOPOLOGY = 30,
|
|
|
|
Indicate the hybrid CPUs. The format of data is as below.
|
|
|
|
struct {
|
|
u32 nr;
|
|
struct {
|
|
char pmu_name[];
|
|
char cpus[];
|
|
} [nr]; /* Variable length records */
|
|
};
|
|
|
|
Example:
|
|
hybrid cpu system:
|
|
cpu_core cpu list : 0-15
|
|
cpu_atom cpu list : 16-23
|
|
|
|
HEADER_PMU_CAPS = 31,
|
|
|
|
List of pmu capabilities (except cpu pmu which is already
|
|
covered by HEADER_CPU_PMU_CAPS). Note that hybrid cpu pmu
|
|
capabilities are also stored here.
|
|
|
|
struct {
|
|
u32 nr_pmu;
|
|
struct {
|
|
u32 nr_caps;
|
|
{
|
|
char name[];
|
|
char value[];
|
|
} [nr_caps];
|
|
char pmu_name[];
|
|
} [nr_pmu];
|
|
};
|
|
|
|
other bits are reserved and should be ignored for now
|
|
HEADER_FEAT_BITS = 256,
|
|
|
|
Attributes
|
|
|
|
This is an array of perf_event_attrs, each attr_size bytes long, which defines
|
|
each event collected. See perf_event.h or the man page for a detailed
|
|
description.
|
|
|
|
Data
|
|
|
|
This section is the bulk of the file. It consists of a stream of perf_events
|
|
describing events. This matches the format generated by the kernel.
|
|
See perf_event.h or the manpage for a detailed description.
|
|
|
|
Some notes on parsing:
|
|
|
|
Ordering
|
|
|
|
The events are not necessarily in time stamp order, as they can be
|
|
collected in parallel on different CPUs. If the events should be
|
|
processed in time order they need to be sorted first. It is possible
|
|
to only do a partial sort using the FINISHED_ROUND event header (see
|
|
below). perf record guarantees that there is no reordering over a
|
|
FINISHED_ROUND.
|
|
|
|
ID vs IDENTIFIER
|
|
|
|
When the event stream contains multiple events each event is identified
|
|
by an ID. This can be either through the PERF_SAMPLE_ID or the
|
|
PERF_SAMPLE_IDENTIFIER header. The PERF_SAMPLE_IDENTIFIER header is
|
|
at a fixed offset from the event header, which allows reliable
|
|
parsing of the header. Relying on ID may be ambiguous.
|
|
IDENTIFIER is only supported by newer Linux kernels.
|
|
|
|
Perf record specific events:
|
|
|
|
In addition to the kernel generated event types perf record adds its
|
|
own event types (in addition it also synthesizes some kernel events,
|
|
for example MMAP events)
|
|
|
|
PERF_RECORD_USER_TYPE_START = 64,
|
|
PERF_RECORD_HEADER_ATTR = 64,
|
|
|
|
struct attr_event {
|
|
struct perf_event_header header;
|
|
struct perf_event_attr attr;
|
|
uint64_t id[];
|
|
};
|
|
|
|
PERF_RECORD_HEADER_EVENT_TYPE = 65, /* deprecated */
|
|
|
|
#define MAX_EVENT_NAME 64
|
|
|
|
struct perf_trace_event_type {
|
|
uint64_t event_id;
|
|
char name[MAX_EVENT_NAME];
|
|
};
|
|
|
|
struct event_type_event {
|
|
struct perf_event_header header;
|
|
struct perf_trace_event_type event_type;
|
|
};
|
|
|
|
|
|
PERF_RECORD_HEADER_TRACING_DATA = 66,
|
|
|
|
Describe me
|
|
|
|
struct tracing_data_event {
|
|
struct perf_event_header header;
|
|
uint32_t size;
|
|
};
|
|
|
|
PERF_RECORD_HEADER_BUILD_ID = 67,
|
|
|
|
Defines an ELF build ID for a referenced executable.
|
|
|
|
struct build_id_event; /* See above */
|
|
|
|
PERF_RECORD_FINISHED_ROUND = 68,
|
|
|
|
No event reordering over this header. No payload.
|
|
|
|
PERF_RECORD_ID_INDEX = 69,
|
|
|
|
Map event ids to CPUs and TIDs.
|
|
|
|
struct id_index_entry {
|
|
uint64_t id;
|
|
uint64_t idx;
|
|
uint64_t cpu;
|
|
uint64_t tid;
|
|
};
|
|
|
|
struct id_index_event {
|
|
struct perf_event_header header;
|
|
uint64_t nr;
|
|
struct id_index_entry entries[nr];
|
|
};
|
|
|
|
PERF_RECORD_AUXTRACE_INFO = 70,
|
|
|
|
Auxtrace type specific information. Describe me
|
|
|
|
struct auxtrace_info_event {
|
|
struct perf_event_header header;
|
|
uint32_t type;
|
|
uint32_t reserved__; /* For alignment */
|
|
uint64_t priv[];
|
|
};
|
|
|
|
PERF_RECORD_AUXTRACE = 71,
|
|
|
|
Defines auxtrace data. Followed by the actual data. The contents of
|
|
the auxtrace data is dependent on the event and the CPU. For example
|
|
for Intel Processor Trace it contains Processor Trace data generated
|
|
by the CPU.
|
|
|
|
struct auxtrace_event {
|
|
struct perf_event_header header;
|
|
uint64_t size;
|
|
uint64_t offset;
|
|
uint64_t reference;
|
|
uint32_t idx;
|
|
uint32_t tid;
|
|
uint32_t cpu;
|
|
uint32_t reserved__; /* For alignment */
|
|
};
|
|
|
|
struct aux_event {
|
|
struct perf_event_header header;
|
|
uint64_t aux_offset;
|
|
uint64_t aux_size;
|
|
uint64_t flags;
|
|
};
|
|
|
|
PERF_RECORD_AUXTRACE_ERROR = 72,
|
|
|
|
Describes an error in hardware tracing
|
|
|
|
enum auxtrace_error_type {
|
|
PERF_AUXTRACE_ERROR_ITRACE = 1,
|
|
PERF_AUXTRACE_ERROR_MAX
|
|
};
|
|
|
|
#define MAX_AUXTRACE_ERROR_MSG 64
|
|
|
|
struct auxtrace_error_event {
|
|
struct perf_event_header header;
|
|
uint32_t type;
|
|
uint32_t code;
|
|
uint32_t cpu;
|
|
uint32_t pid;
|
|
uint32_t tid;
|
|
uint32_t reserved__; /* For alignment */
|
|
uint64_t ip;
|
|
char msg[MAX_AUXTRACE_ERROR_MSG];
|
|
};
|
|
|
|
PERF_RECORD_HEADER_FEATURE = 80,
|
|
|
|
Describes a header feature. These are records used in pipe-mode that
|
|
contain information that otherwise would be in perf.data file's header.
|
|
|
|
PERF_RECORD_COMPRESSED = 81,
|
|
|
|
struct compressed_event {
|
|
struct perf_event_header header;
|
|
char data[];
|
|
};
|
|
|
|
PERF_RECORD_FINISHED_INIT = 82,
|
|
|
|
Marks the end of records for the system, pre-existing threads in system wide
|
|
sessions, etc. Those are the ones prefixed PERF_RECORD_USER_*.
|
|
|
|
This is used, for instance, to 'perf inject' events after init and before
|
|
regular events, those emitted by the kernel, to support combining guest and
|
|
host records.
|
|
|
|
|
|
For PERF_RECORD_COMPRESSED, the header is followed by a compressed data frame that can be decompressed
|
|
into an array of perf trace records. The size of the entire compressed event
|
|
record including the header is limited by the max value of header.size.
|
|
|
|
Event types
|
|
|
|
Define the event attributes with their IDs.
|
|
|
|
An array bound by the perf_file_section size.
|
|
|
|
struct {
|
|
struct perf_event_attr attr; /* Size defined by header.attr_size */
|
|
struct perf_file_section ids;
|
|
}
|
|
|
|
ids points to an array of uint64_t defining the ids for event attr attr.
|
|
|
|
Pipe-mode data
|
|
|
|
Pipe-mode avoids seeks in the file by removing the perf_file_section and flags
|
|
from the struct perf_header. The trimmed header is:
|
|
|
|
struct perf_pipe_file_header {
|
|
u64 magic;
|
|
u64 size;
|
|
};
|
|
|
|
The information about attrs, data, and event_types is instead in the
|
|
synthesized events PERF_RECORD_HEADER_ATTR, PERF_RECORD_HEADER_TRACING_DATA,
|
|
PERF_RECORD_HEADER_EVENT_TYPE, and PERF_RECORD_HEADER_FEATURE
|
|
that are generated by perf record in pipe-mode.
|
|
|
|
|
|
References:
|
|
|
|
include/uapi/linux/perf_event.h
|
|
|
|
This is the canonical description of the kernel generated perf_events
|
|
and the perf_event_attrs.
|
|
|
|
perf_events manpage
|
|
|
|
A manpage describing perf_event and perf_event_attr is here:
|
|
http://web.eece.maine.edu/~vweaver/projects/perf_events/programming.html
|
|
This tends to be slightly behind the kernel include, but has better
|
|
descriptions. A (typically older) version of the man page may be
|
|
included with the standard Linux man pages, available with "man
|
|
perf_events"
|
|
|
|
pmu-tools
|
|
|
|
https://github.com/andikleen/pmu-tools/tree/master/parser
|
|
|
|
A definition of the perf.data format in python "construct" format is available
|
|
in pmu-tools parser. This allows reading perf.data from Python and dumping it.
|
|
|
|
quipper
|
|
|
|
The quipper C++ parser is available at
|
|
http://github.com/google/perf_data_converter/tree/master/src/quipper
|
|
|