2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-07 02:46:55 +04:00
|
|
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
2013-08-02 00:02:10 +04:00
|
|
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _SYS_ARC_H
|
|
|
|
#define _SYS_ARC_H
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
#include <sys/spa.h>
|
2011-12-23 00:20:43 +04:00
|
|
|
#include <sys/refcount.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
 * Forward declarations of the ARC types referenced by this header.
 * struct arc_buf and struct arc_prune are laid out further down;
 * this header does not define struct arc_buf_hdr, so code that only
 * includes this file can hold arc_buf_hdr_t pointers but cannot look
 * inside them.
 */
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef struct arc_prune arc_prune_t;
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Callback signatures used by the interfaces declared in this header:
 *
 *	arc_done_func_t		type of the `done' argument of arc_read()
 *				and of the `ready'/`done' arguments of
 *				arc_write().
 *	arc_prune_func_t	type of the callback registered with
 *				arc_add_prune_callback().
 *	arc_evict_func_t	type of the callback installed with
 *				arc_set_callback().
 *
 * Each callback takes an opaque `private' pointer; presumably this is the
 * value the caller supplied next to the callback -- confirm in arc.c.
 *
 * NOTE(review): `private' is a C++ keyword.  This header carries an
 * extern "C" guard, yet a C++ translation unit would still fail to parse
 * these parameter names.  Consider renaming them (e.g. `priv') if C++
 * consumers are actually expected.
 */
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
typedef void arc_prune_func_t(int64_t bytes, void *private);
typedef int arc_evict_func_t(void *private);
|
|
|
|
|
|
|
|
/* generic arc_done_func_t's which you can use */
|
|
|
|
arc_done_func_t arc_bcopy_func;
|
|
|
|
arc_done_func_t arc_getbuf_func;
|
|
|
|
|
2011-12-23 00:20:43 +04:00
|
|
|
/* generic arc_prune_func_t wrapper for callbacks */
|
|
|
|
struct arc_prune {
|
|
|
|
arc_prune_func_t *p_pfunc;
|
|
|
|
void *p_private;
|
|
|
|
list_node_t p_node;
|
|
|
|
refcount_t p_refcnt;
|
|
|
|
};
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
struct arc_buf {
|
|
|
|
arc_buf_hdr_t *b_hdr;
|
|
|
|
arc_buf_t *b_next;
|
2010-05-29 00:45:14 +04:00
|
|
|
kmutex_t b_evict_lock;
|
2008-11-20 23:01:55 +03:00
|
|
|
void *b_data;
|
|
|
|
arc_evict_func_t *b_efunc;
|
|
|
|
void *b_private;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Classifies what a buffer holds.  This is the `type' argument of
 * arc_buf_alloc().
 */
typedef enum arc_buf_contents {
	ARC_BUFC_DATA,				/* buffer contains data */
	ARC_BUFC_METADATA,			/* buffer contains metadata */
	ARC_BUFC_NUMTYPES			/* number of types above; keep last */
} arc_buf_contents_t;
|
|
|
|
/*
 * These are the flags we pass into calls to the arc
 *
 * Each flag is a distinct single bit so that flags can be OR-ed together.
 * Numbering starts at bit 1; bit 0 is not assigned by this header.
 */
#define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */
#define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */
#define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
#define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
#define	ARC_L2CACHE	(1 << 5)	/* cache in L2ARC */
#define	ARC_L2COMPRESS	(1 << 6)	/* compress in L2ARC */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
/*
 * The following breakdowns of arc_size exist for kstat only.
 *
 * A value of this type is the `type' argument of arc_space_consume()
 * and arc_space_return().
 */
typedef enum arc_space_type {
	ARC_SPACE_DATA,
	ARC_SPACE_HDRS,
	ARC_SPACE_L2HDRS,
	ARC_SPACE_OTHER,
	ARC_SPACE_NUMTYPES		/* number of categories above; keep last */
} arc_space_type_t;
|
|
|
|
|
|
|
|
/*
 * Space accounting by arc_space_type_t category.  Judging by the names,
 * arc_space_consume() charges `space' to `type' and arc_space_return()
 * gives it back; the units of `space' (presumably bytes) are not stated
 * in this header -- confirm in arc.c.
 */
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
|
2008-11-20 23:01:55 +03:00
|
|
|
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
|
|
|
|
arc_buf_contents_t type);
|
2009-07-03 02:44:48 +04:00
|
|
|
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
|
|
|
|
void arc_return_buf(arc_buf_t *buf, void *tag);
|
2010-05-29 00:45:14 +04:00
|
|
|
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
|
2008-11-20 23:01:55 +03:00
|
|
|
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
|
|
|
|
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
|
|
|
|
int arc_buf_size(arc_buf_t *buf);
|
|
|
|
void arc_release(arc_buf_t *buf, void *tag);
|
|
|
|
int arc_released(arc_buf_t *buf);
|
|
|
|
int arc_has_callback(arc_buf_t *buf);
|
|
|
|
void arc_buf_freeze(arc_buf_t *buf);
|
|
|
|
void arc_buf_thaw(arc_buf_t *buf);
|
2012-12-22 02:57:09 +04:00
|
|
|
boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
|
2008-11-20 23:01:55 +03:00
|
|
|
/* Declared only when ZFS_DEBUG is defined; do not call it unconditionally. */
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
|
2008-11-20 23:01:55 +03:00
|
|
|
arc_done_func_t *done, void *private, int priority, int flags,
|
2008-12-03 23:09:06 +03:00
|
|
|
uint32_t *arc_flags, const zbookmark_t *zb);
|
2010-05-29 00:45:14 +04:00
|
|
|
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
|
2013-08-02 00:02:10 +04:00
|
|
|
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
|
|
|
|
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
|
|
|
|
void *private, int priority, int zio_flags, const zbookmark_t *zb);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-12-23 00:20:43 +04:00
|
|
|
/*
 * Prune callback registration.  arc_add_prune_callback() returns an
 * arc_prune_t handle for the (func, private) pair; that handle is the
 * argument of arc_remove_prune_callback().
 */
arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
void arc_remove_prune_callback(arc_prune_t *p);
|
Illumos #3805 arc shouldn't cache freed blocks
3805 arc shouldn't cache freed blocks
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Reviewed by: Will Andrews <will@firepipe.net>
Approved by: Dan McDonald <danmcd@nexenta.com>
References:
illumos/illumos-gate@6e6d5868f52089b9026785bd90257a3d3f6e5ee2
https://www.illumos.org/issues/3805
ZFS should proactively evict freed blocks from the cache.
On dcenter, we saw that we were caching ~256GB of metadata, while the
pool only had <4GB of metadata on disk. We were wasting about half the
system's RAM (252GB) on blocks that have been freed.
Even though these freed blocks will never be used again, and thus will
eventually be evicted, this causes us to use memory inefficiently for 2
reasons:
1. A block that is freed has no chance of being accessed again, but will
be kept in memory preferentially to a block that was accessed before it
(and is thus older) but has not been freed and thus has at least some
chance of being accessed again.
2. We partition the ARC into several buckets:
user data that has been accessed only once (MRU)
metadata that has been accessed only once (MRU)
user data that has been accessed more than once (MFU)
metadata that has been accessed more than once (MFU)
The user data vs metadata split is somewhat arbitrary, and the primary
control on how much memory is used to cache data vs metadata is to
simply try to keep the proportion the same as it has been in the past
(each bucket "evicts against" itself). The secondary control is to
evict data before evicting metadata.
Because of this bucketing, we may end up with one bucket mostly
containing freed blocks that are very old, while another bucket has more
recently accessed, still-allocated blocks. Data in the useful bucket
(with still-allocated blocks) may be evicted in preference to data in
the useless bucket (with old, freed blocks).
On dcenter, we saw that the MFU metadata bucket was 230MB, while the MFU
data bucket was 27GB and the MRU metadata bucket was 256GB. However,
the vast majority of data in the MRU metadata bucket (256GB) was freed
blocks, and thus useless. Meanwhile, the MFU metadata bucket (230MB)
was constantly evicting useful blocks that will be soon needed.
The problem of cache segmentation is a larger problem that needs more
investigation. However, if we stop caching freed blocks, it should
reduce the impact of this more fundamental issue.
Ported-by: Richard Yao <ryao@cs.stonybrook.edu>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1503
2013-06-07 02:46:55 +04:00
|
|
|
/*
 * Introduced for Illumos #3805 ("arc shouldn't cache freed blocks"), so
 * that the ARC can proactively evict freed blocks rather than keep them
 * cached.  Presumably called when the block described by `bp' is freed;
 * confirm the calling contract in arc.c.
 */
void arc_freed(spa_t *spa, const blkptr_t *bp);
|
2011-12-23 00:20:43 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
 * Eviction callback interfaces.  arc_set_callback() takes an
 * arc_evict_func_t and an opaque `private' pointer for `buf'; presumably
 * these end up in the buffer's b_efunc/b_private fields -- confirm in
 * arc.c.  The meaning of arc_buf_evict()'s int return value is not
 * stated in this header.
 */
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
int arc_buf_evict(arc_buf_t *buf);
|
|
|
|
|
2011-12-23 00:20:43 +04:00
|
|
|
void arc_adjust_meta(int64_t adjustment, boolean_t may_prune);
|
2008-11-20 23:01:55 +03:00
|
|
|
void arc_flush(spa_t *spa);
|
|
|
|
void arc_tempreserve_clear(uint64_t reserve);
|
|
|
|
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
|
|
|
|
|
|
|
|
/*
 * ARC set-up and tear-down entry points (judging by the names); neither
 * takes arguments nor returns a value.
 */
void arc_init(void);
void arc_fini(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Level 2 ARC
|
|
|
|
*/
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
|
2008-11-20 23:01:55 +03:00
|
|
|
void l2arc_remove_vdev(vdev_t *vd);
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t l2arc_vdev_present(vdev_t *vd);
|
2008-11-20 23:01:55 +03:00
|
|
|
void l2arc_init(void);
|
|
|
|
void l2arc_fini(void);
|
2008-12-03 23:09:06 +03:00
|
|
|
void l2arc_start(void);
|
|
|
|
void l2arc_stop(void);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2012-10-11 07:57:45 +04:00
|
|
|
/* Global tunings */
|
|
|
|
extern int zfs_write_limit_shift;
|
|
|
|
extern unsigned long zfs_write_limit_max;
|
|
|
|
extern kmutex_t zfs_write_limit_lock;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_ARC_H */
|