2014-12-13 05:07:39 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2014-12-13 05:07:39 +03:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/list.h>
|
|
|
|
|
2019-07-08 21:20:53 +03:00
|
|
|
#if defined(_KERNEL)
|
|
|
|
#if defined(HAVE_DECLARE_EVENT_CLASS)
|
2014-12-13 05:07:39 +03:00
|
|
|
|
|
|
|
#undef TRACE_SYSTEM
|
|
|
|
#define TRACE_SYSTEM zfs
|
|
|
|
|
2015-07-28 15:42:14 +03:00
|
|
|
#undef TRACE_SYSTEM_VAR
|
|
|
|
#define TRACE_SYSTEM_VAR zfs_arc
|
|
|
|
|
2014-12-13 05:07:39 +03:00
|
|
|
#if !defined(_TRACE_ARC_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
|
|
#define _TRACE_ARC_H
|
|
|
|
|
|
|
|
#include <linux/tracepoint.h>
|
|
|
|
#include <sys/types.h>
|
2016-05-23 20:41:29 +03:00
|
|
|
#include <sys/trace_common.h> /* For ZIO macros */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic support for one argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE1(...,
|
|
|
|
* arc_buf_hdr_t *, ...);
|
|
|
|
*/
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
|
|
|
|
TP_PROTO(arc_buf_hdr_t *ab),
|
|
|
|
TP_ARGS(ab),
|
|
|
|
TP_STRUCT__entry(
|
|
|
|
__array(uint64_t, hdr_dva_word, 2)
|
|
|
|
__field(uint64_t, hdr_birth)
|
|
|
|
__field(uint32_t, hdr_flags)
|
|
|
|
__field(arc_buf_contents_t, hdr_type)
|
2016-06-02 07:04:53 +03:00
|
|
|
__field(uint16_t, hdr_psize)
|
|
|
|
__field(uint16_t, hdr_lsize)
|
2014-12-13 05:07:39 +03:00
|
|
|
__field(uint64_t, hdr_spa)
|
|
|
|
__field(arc_state_type_t, hdr_state_type)
|
|
|
|
__field(clock_t, hdr_access)
|
|
|
|
__field(uint32_t, hdr_mru_hits)
|
|
|
|
__field(uint32_t, hdr_mru_ghost_hits)
|
|
|
|
__field(uint32_t, hdr_mfu_hits)
|
|
|
|
__field(uint32_t, hdr_mfu_ghost_hits)
|
|
|
|
__field(uint32_t, hdr_l2_hits)
|
|
|
|
__field(int64_t, hdr_refcount)
|
|
|
|
),
|
|
|
|
TP_fast_assign(
|
|
|
|
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
|
|
|
|
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
|
|
|
|
__entry->hdr_birth = ab->b_birth;
|
|
|
|
__entry->hdr_flags = ab->b_flags;
|
2016-06-02 07:04:53 +03:00
|
|
|
__entry->hdr_psize = ab->b_psize;
|
|
|
|
__entry->hdr_lsize = ab->b_lsize;
|
2014-12-13 05:07:39 +03:00
|
|
|
__entry->hdr_spa = ab->b_spa;
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
|
|
|
|
__entry->hdr_access = ab->b_l1hdr.b_arc_access;
|
|
|
|
__entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
|
|
|
|
__entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
|
|
|
|
__entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
|
|
|
|
__entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
|
2021-08-17 18:50:31 +03:00
|
|
|
__entry->hdr_l2_hits = ab->b_l2hdr.b_hits;
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
|
2014-12-13 05:07:39 +03:00
|
|
|
),
|
2014-12-30 06:12:23 +03:00
|
|
|
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
|
2023-10-06 18:56:17 +03:00
|
|
|
"flags 0x%x type %u psize %u lsize %u spa %llu "
|
2014-12-13 05:07:39 +03:00
|
|
|
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
|
|
|
|
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
|
|
|
|
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_birth, __entry->hdr_flags,
|
2023-10-06 18:56:17 +03:00
|
|
|
__entry->hdr_type, __entry->hdr_psize,
|
2016-06-02 07:04:53 +03:00
|
|
|
__entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type,
|
2014-12-13 05:07:39 +03:00
|
|
|
__entry->hdr_access, __entry->hdr_mru_hits,
|
|
|
|
__entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits,
|
|
|
|
__entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits,
|
|
|
|
__entry->hdr_refcount)
|
|
|
|
);
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
2017-01-24 19:50:15 +03:00
|
|
|
#define DEFINE_ARC_BUF_HDR_EVENT(name) \
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \
|
2022-01-21 19:07:15 +03:00
|
|
|
TP_PROTO(arc_buf_hdr_t *ab), \
|
|
|
|
TP_ARGS(ab))
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit);
|
2022-12-22 23:10:24 +03:00
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit);
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
|
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
|
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
|
|
|
|
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
|
Implement uncached prefetch
Previously the primarycache property was handled only in the dbuf
layer. Since the speculative prefetcher is implemented in the ARC,
it had to be disabled for uncacheable buffers.
This change gives the ARC knowledge about uncacheable buffers
via arc_read() and arc_write(). So when remove_reference() drops
the last reference on the ARC header, it can either immediately destroy
it, or if it is marked as prefetch, put it into a new arc_uncached state.
That state is scanned every second, evicting stale buffers that were
not demand read.
This change also tracks dbufs that were read from the beginning,
but not to the end. It is assumed that such buffers may receive further
reads, and so they are stored in dbuf cache. If a following
read reaches the end of the buffer, it is immediately evicted.
Otherwise it will follow regular dbuf cache eviction. Since the dbuf
layer does not know actual file sizes, this logic is not applied to
the final buffer of a dnode.
Since uncacheable buffers should no longer stay in the ARC for long,
this patch also tries to optimize I/O by allocating ARC physical
buffers as linear to allow buffer sharing.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #14243
2023-01-05 03:29:54 +03:00
|
|
|
/* Further single-header events; all take one arc_buf_hdr_t pointer. */
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic support for two argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE2(...,
|
|
|
|
* vdev_t *, ...,
|
|
|
|
* zio_t *, ...);
|
|
|
|
*/
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
DECLARE_EVENT_CLASS(zfs_l2arc_rw_class,
|
|
|
|
TP_PROTO(vdev_t *vd, zio_t *zio),
|
|
|
|
TP_ARGS(vd, zio),
|
|
|
|
TP_STRUCT__entry(
|
|
|
|
__field(uint64_t, vdev_id)
|
|
|
|
__field(uint64_t, vdev_guid)
|
|
|
|
__field(uint64_t, vdev_state)
|
|
|
|
ZIO_TP_STRUCT_ENTRY
|
|
|
|
),
|
|
|
|
TP_fast_assign(
|
|
|
|
__entry->vdev_id = vd->vdev_id;
|
|
|
|
__entry->vdev_guid = vd->vdev_guid;
|
|
|
|
__entry->vdev_state = vd->vdev_state;
|
|
|
|
ZIO_TP_FAST_ASSIGN
|
|
|
|
),
|
|
|
|
TP_printk("vdev { id %llu guid %llu state %llu } "
|
|
|
|
ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid,
|
|
|
|
__entry->vdev_state, ZIO_TP_PRINTK_ARGS)
|
|
|
|
);
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
2017-01-24 19:50:15 +03:00
|
|
|
#define DEFINE_L2ARC_RW_EVENT(name) \
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_EVENT(zfs_l2arc_rw_class, name, \
|
2022-01-21 19:07:15 +03:00
|
|
|
TP_PROTO(vdev_t *vd, zio_t *zio), \
|
|
|
|
TP_ARGS(vd, zio))
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_L2ARC_RW_EVENT(zfs_l2arc__read);
|
|
|
|
DEFINE_L2ARC_RW_EVENT(zfs_l2arc__write);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic support for two argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE2(...,
|
|
|
|
* zio_t *, ...,
|
|
|
|
* l2arc_write_callback_t *, ...);
|
|
|
|
*/
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class,
|
|
|
|
TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb),
|
|
|
|
TP_ARGS(zio, cb),
|
|
|
|
TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY),
|
|
|
|
TP_fast_assign(ZIO_TP_FAST_ASSIGN),
|
|
|
|
TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS)
|
|
|
|
);
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
2017-01-24 19:50:15 +03:00
|
|
|
#define DEFINE_L2ARC_IODONE_EVENT(name) \
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_EVENT(zfs_l2arc_iodone_class, name, \
|
2022-01-21 19:07:15 +03:00
|
|
|
TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), \
|
|
|
|
TP_ARGS(zio, cb))
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_L2ARC_IODONE_EVENT(zfs_l2arc__iodone);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic support for four argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE4(...,
|
|
|
|
* arc_buf_hdr_t *, ...,
|
|
|
|
* const blkptr_t *,
|
|
|
|
* uint64_t,
|
|
|
|
* const zbookmark_phys_t *);
|
|
|
|
*/
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
DECLARE_EVENT_CLASS(zfs_arc_miss_class,
|
|
|
|
TP_PROTO(arc_buf_hdr_t *hdr,
|
|
|
|
const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb),
|
|
|
|
TP_ARGS(hdr, bp, size, zb),
|
|
|
|
TP_STRUCT__entry(
|
|
|
|
__array(uint64_t, hdr_dva_word, 2)
|
|
|
|
__field(uint64_t, hdr_birth)
|
|
|
|
__field(uint32_t, hdr_flags)
|
|
|
|
__field(arc_buf_contents_t, hdr_type)
|
2016-06-02 07:04:53 +03:00
|
|
|
__field(uint16_t, hdr_psize)
|
|
|
|
__field(uint16_t, hdr_lsize)
|
2014-12-13 05:07:39 +03:00
|
|
|
__field(uint64_t, hdr_spa)
|
|
|
|
__field(arc_state_type_t, hdr_state_type)
|
|
|
|
__field(clock_t, hdr_access)
|
|
|
|
__field(uint32_t, hdr_mru_hits)
|
|
|
|
__field(uint32_t, hdr_mru_ghost_hits)
|
|
|
|
__field(uint32_t, hdr_mfu_hits)
|
|
|
|
__field(uint32_t, hdr_mfu_ghost_hits)
|
|
|
|
__field(uint32_t, hdr_l2_hits)
|
|
|
|
__field(int64_t, hdr_refcount)
|
|
|
|
|
|
|
|
__array(uint64_t, bp_dva0, 2)
|
|
|
|
__array(uint64_t, bp_dva1, 2)
|
|
|
|
__array(uint64_t, bp_dva2, 2)
|
|
|
|
__array(uint64_t, bp_cksum, 4)
|
|
|
|
|
|
|
|
__field(uint64_t, bp_lsize)
|
|
|
|
|
|
|
|
__field(uint64_t, zb_objset)
|
|
|
|
__field(uint64_t, zb_object)
|
|
|
|
__field(int64_t, zb_level)
|
|
|
|
__field(uint64_t, zb_blkid)
|
|
|
|
),
|
|
|
|
TP_fast_assign(
|
|
|
|
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
|
|
|
|
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
|
|
|
|
__entry->hdr_birth = hdr->b_birth;
|
|
|
|
__entry->hdr_flags = hdr->b_flags;
|
2016-06-02 07:04:53 +03:00
|
|
|
__entry->hdr_psize = hdr->b_psize;
|
|
|
|
__entry->hdr_lsize = hdr->b_lsize;
|
2014-12-13 05:07:39 +03:00
|
|
|
__entry->hdr_spa = hdr->b_spa;
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
|
|
|
|
__entry->hdr_access = hdr->b_l1hdr.b_arc_access;
|
|
|
|
__entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
|
|
|
|
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
|
|
|
|
__entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
|
|
|
|
__entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
|
2021-08-17 18:50:31 +03:00
|
|
|
__entry->hdr_l2_hits = hdr->b_l2hdr.b_hits;
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
|
2014-12-13 05:07:39 +03:00
|
|
|
|
|
|
|
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
|
|
|
|
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
|
|
|
|
__entry->bp_dva1[0] = bp->blk_dva[1].dva_word[0];
|
|
|
|
__entry->bp_dva1[1] = bp->blk_dva[1].dva_word[1];
|
|
|
|
__entry->bp_dva2[0] = bp->blk_dva[2].dva_word[0];
|
|
|
|
__entry->bp_dva2[1] = bp->blk_dva[2].dva_word[1];
|
|
|
|
__entry->bp_cksum[0] = bp->blk_cksum.zc_word[0];
|
|
|
|
__entry->bp_cksum[1] = bp->blk_cksum.zc_word[1];
|
|
|
|
__entry->bp_cksum[2] = bp->blk_cksum.zc_word[2];
|
|
|
|
__entry->bp_cksum[3] = bp->blk_cksum.zc_word[3];
|
|
|
|
|
|
|
|
__entry->bp_lsize = size;
|
|
|
|
|
|
|
|
__entry->zb_objset = zb->zb_objset;
|
|
|
|
__entry->zb_object = zb->zb_object;
|
|
|
|
__entry->zb_level = zb->zb_level;
|
|
|
|
__entry->zb_blkid = zb->zb_blkid;
|
|
|
|
),
|
2014-12-30 06:12:23 +03:00
|
|
|
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
|
2023-10-06 18:56:17 +03:00
|
|
|
"flags 0x%x psize %u lsize %u spa %llu state_type %u "
|
2014-12-13 05:07:39 +03:00
|
|
|
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
|
|
|
|
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
|
|
|
|
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
|
|
|
|
"0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx "
|
|
|
|
"lsize %llu } zb { objset %llu object %llu level %lli "
|
|
|
|
"blkid %llu }",
|
|
|
|
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
|
2014-12-30 06:12:23 +03:00
|
|
|
__entry->hdr_birth, __entry->hdr_flags,
|
2023-10-06 18:56:17 +03:00
|
|
|
__entry->hdr_psize, __entry->hdr_lsize,
|
2014-12-13 05:07:39 +03:00
|
|
|
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
|
|
|
|
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
|
|
|
|
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,
|
|
|
|
__entry->hdr_l2_hits, __entry->hdr_refcount,
|
|
|
|
__entry->bp_dva0[0], __entry->bp_dva0[1],
|
|
|
|
__entry->bp_dva1[0], __entry->bp_dva1[1],
|
|
|
|
__entry->bp_dva2[0], __entry->bp_dva2[1],
|
|
|
|
__entry->bp_cksum[0], __entry->bp_cksum[1],
|
|
|
|
__entry->bp_cksum[2], __entry->bp_cksum[3],
|
|
|
|
__entry->bp_lsize, __entry->zb_objset, __entry->zb_object,
|
|
|
|
__entry->zb_level, __entry->zb_blkid)
|
|
|
|
);
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
2017-01-24 19:50:15 +03:00
|
|
|
#define DEFINE_ARC_MISS_EVENT(name) \
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_EVENT(zfs_arc_miss_class, name, \
|
2022-01-21 19:07:15 +03:00
|
|
|
TP_PROTO(arc_buf_hdr_t *hdr, \
|
|
|
|
const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), \
|
|
|
|
TP_ARGS(hdr, bp, size, zb))
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_ARC_MISS_EVENT(zfs_arc__miss);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generic support for four argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE4(...,
|
|
|
|
* l2arc_dev_t *, ...,
|
|
|
|
* list_t *, ...,
|
|
|
|
* uint64_t, ...,
|
|
|
|
* boolean_t, ...);
|
|
|
|
*/
|
2016-12-12 21:46:26 +03:00
|
|
|
/* BEGIN CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
DECLARE_EVENT_CLASS(zfs_l2arc_evict_class,
|
|
|
|
TP_PROTO(l2arc_dev_t *dev,
|
|
|
|
list_t *buflist, uint64_t taddr, boolean_t all),
|
|
|
|
TP_ARGS(dev, buflist, taddr, all),
|
|
|
|
TP_STRUCT__entry(
|
|
|
|
__field(uint64_t, vdev_id)
|
|
|
|
__field(uint64_t, vdev_guid)
|
|
|
|
__field(uint64_t, vdev_state)
|
|
|
|
|
|
|
|
__field(uint64_t, l2ad_hand)
|
|
|
|
__field(uint64_t, l2ad_start)
|
|
|
|
__field(uint64_t, l2ad_end)
|
|
|
|
__field(boolean_t, l2ad_first)
|
|
|
|
__field(boolean_t, l2ad_writing)
|
|
|
|
|
|
|
|
__field(uint64_t, taddr)
|
|
|
|
__field(boolean_t, all)
|
|
|
|
),
|
|
|
|
TP_fast_assign(
|
|
|
|
__entry->vdev_id = dev->l2ad_vdev->vdev_id;
|
|
|
|
__entry->vdev_guid = dev->l2ad_vdev->vdev_guid;
|
|
|
|
__entry->vdev_state = dev->l2ad_vdev->vdev_state;
|
|
|
|
|
|
|
|
__entry->l2ad_hand = dev->l2ad_hand;
|
|
|
|
__entry->l2ad_start = dev->l2ad_start;
|
|
|
|
__entry->l2ad_end = dev->l2ad_end;
|
|
|
|
__entry->l2ad_first = dev->l2ad_first;
|
|
|
|
__entry->l2ad_writing = dev->l2ad_writing;
|
|
|
|
|
|
|
|
__entry->taddr = taddr;
|
|
|
|
__entry->all = all;
|
|
|
|
),
|
|
|
|
TP_printk("l2ad { vdev { id %llu guid %llu state %llu } "
|
2015-06-27 06:15:27 +03:00
|
|
|
"hand %llu start %llu end %llu "
|
2014-12-13 05:07:39 +03:00
|
|
|
"first %d writing %d } taddr %llu all %d",
|
|
|
|
__entry->vdev_id, __entry->vdev_guid, __entry->vdev_state,
|
|
|
|
__entry->l2ad_hand, __entry->l2ad_start,
|
2015-06-27 06:15:27 +03:00
|
|
|
__entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing,
|
2014-12-13 05:07:39 +03:00
|
|
|
__entry->taddr, __entry->all)
|
|
|
|
);
|
2016-12-12 21:46:26 +03:00
|
|
|
/* END CSTYLED */
|
2014-12-13 05:07:39 +03:00
|
|
|
|
2017-01-24 19:50:15 +03:00
|
|
|
#define DEFINE_L2ARC_EVICT_EVENT(name) \
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_EVENT(zfs_l2arc_evict_class, name, \
|
2022-01-21 19:07:15 +03:00
|
|
|
TP_PROTO(l2arc_dev_t *dev, list_t *buflist, uint64_t taddr, boolean_t all),\
|
|
|
|
TP_ARGS(dev, buflist, taddr, all))
|
2014-12-13 05:07:39 +03:00
|
|
|
DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict);
|
|
|
|
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 07:10:52 +03:00
|
|
|
/*
|
|
|
|
* Generic support for three argument tracepoints of the form:
|
|
|
|
*
|
|
|
|
* DTRACE_PROBE3(...,
|
|
|
|
* uint64_t, ...,
|
|
|
|
* uint64_t, ...,
|
|
|
|
* uint64_t, ...);
|
|
|
|
*/
|
|
|
|
/* BEGIN CSTYLED */
|
|
|
|
DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class,
|
|
|
|
TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count),
|
|
|
|
TP_ARGS(amount, arc_evict_count, aew_count),
|
|
|
|
TP_STRUCT__entry(
|
|
|
|
__field(uint64_t, amount)
|
|
|
|
__field(uint64_t, arc_evict_count)
|
|
|
|
__field(uint64_t, aew_count)
|
|
|
|
),
|
|
|
|
TP_fast_assign(
|
|
|
|
__entry->amount = amount;
|
|
|
|
__entry->arc_evict_count = arc_evict_count;
|
|
|
|
__entry->aew_count = aew_count;
|
|
|
|
),
|
|
|
|
TP_printk("amount %llu arc_evict_count %llu aew_count %llu",
|
|
|
|
__entry->amount, __entry->arc_evict_count, __entry->aew_count)
|
|
|
|
);
|
|
|
|
/* END CSTYLED */
|
|
|
|
|
|
|
|
/*
 * Instantiate one event of zfs_arc_wait_for_eviction_class.  The prototype
 * must match the class: three uint64_t values.
 */
#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \
DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \
    TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \
    TP_ARGS(amount, arc_evict_count, aew_count))
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 07:10:52 +03:00
|
|
|
/* Event for the arc__wait__for__eviction probe (three uint64_t values). */
DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction);
|
|
|
|
|
2014-12-13 05:07:39 +03:00
|
|
|
#endif /* _TRACE_ARC_H */
|
|
|
|
|
|
|
|
#undef TRACE_INCLUDE_PATH
|
|
|
|
#undef TRACE_INCLUDE_FILE
|
|
|
|
#define TRACE_INCLUDE_PATH sys
|
|
|
|
#define TRACE_INCLUDE_FILE trace_arc
|
|
|
|
#include <trace/define_trace.h>
|
|
|
|
|
2019-07-08 21:20:53 +03:00
|
|
|
#else
|
|
|
|
|
|
|
|
/*
 * Without HAVE_DECLARE_EVENT_CLASS the probes are declared through the
 * DEFINE_DTRACE_PROBE<n> macros instead of kernel tracepoint classes.
 * NOTE(review): <n> appears to be the probe's argument count; confirm
 * against the macro definitions.
 */
DEFINE_DTRACE_PROBE1(arc__hit);
DEFINE_DTRACE_PROBE1(arc__iohit);
DEFINE_DTRACE_PROBE1(arc__evict);
DEFINE_DTRACE_PROBE1(arc__delete);
DEFINE_DTRACE_PROBE1(new_state__mru);
DEFINE_DTRACE_PROBE1(new_state__mfu);
|
Implement uncached prefetch
Previously the primarycache property was handled only in the dbuf
layer. Since the speculative prefetcher is implemented in the ARC,
it had to be disabled for uncacheable buffers.
This change gives the ARC knowledge about uncacheable buffers
via arc_read() and arc_write(). So when remove_reference() drops
the last reference on the ARC header, it can either immediately destroy
it, or if it is marked as prefetch, put it into a new arc_uncached state.
That state is scanned every second, evicting stale buffers that were
not demand read.
This change also tracks dbufs that were read from the beginning,
but not to the end. It is assumed that such buffers may receive further
reads, and so they are stored in dbuf cache. If a following
read reaches the end of the buffer, it is immediately evicted.
Otherwise it will follow regular dbuf cache eviction. Since the dbuf
layer does not know actual file sizes, this logic is not applied to
the final buffer of a dnode.
Since uncacheable buffers should no longer stay in the ARC for long,
this patch also tries to optimize I/O by allocating ARC physical
buffers as linear to allow buffer sharing.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #14243
2023-01-05 03:29:54 +03:00
|
|
|
/* Remaining one-argument probes, then the two-argument L2ARC probes. */
DEFINE_DTRACE_PROBE1(new_state__uncached);
DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
DEFINE_DTRACE_PROBE1(l2arc__hit);
DEFINE_DTRACE_PROBE1(l2arc__miss);
DEFINE_DTRACE_PROBE2(l2arc__read);
DEFINE_DTRACE_PROBE2(l2arc__write);
DEFINE_DTRACE_PROBE2(l2arc__iodone);
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 07:10:52 +03:00
|
|
|
/* Three- and four-argument probes. */
DEFINE_DTRACE_PROBE3(arc__wait__for__eviction);
DEFINE_DTRACE_PROBE4(arc__miss);
DEFINE_DTRACE_PROBE4(l2arc__evict);
|
|
|
|
|
|
|
|
#endif /* HAVE_DECLARE_EVENT_CLASS */
|
|
|
|
#endif /* _KERNEL */
|