From d3c2ae1c0806b183a315e3d43cc8018cfdca79b5 Mon Sep 17 00:00:00 2001
From: George Wilson
Date: Thu, 2 Jun 2016 00:04:53 -0400
Subject: [PATCH] OpenZFS 6950 - ARC should cache compressed data

Authored by: George Wilson
Reviewed by: Prakash Surya
Reviewed by: Dan Kimmel
Reviewed by: Matt Ahrens
Reviewed by: Paul Dagnelie
Reviewed by: Tom Caputi
Reviewed by: Brian Behlendorf
Ported by: David Quigley

This review covers the reading and writing of compressed arc headers, sharing
data between the arc_hdr_t and the arc_buf_t, and the implementation of a new
dbuf cache to keep frequently accessed data uncompressed.

I've added a new member to the l1 arc hdr called b_pdata. The b_pdata always
hangs off the arc_buf_hdr_t (if an L1 hdr is in use) and points to the
physical block for that DVA. The physical block may or may not be compressed.
If compressed arc is enabled and the block on disk is compressed, then the
b_pdata will match the block on disk and remain compressed in memory. If the
block on disk is not compressed, then neither will the b_pdata be. Lastly, if
compressed arc is disabled, then b_pdata will always be an uncompressed
version of the on-disk block.

Typically the arc will cache only the arc_buf_hdr_t and will aggressively
evict any arc_buf_t's that are no longer referenced. This means that the arc
will primarily have compressed blocks, as the arc_buf_t's are considered
overhead and are always uncompressed. When a consumer reads a block, we first
look to see if the arc_buf_hdr_t is cached. If the hdr is cached, then we
allocate a new arc_buf_t and decompress the b_pdata contents into the
arc_buf_t's b_data. If the hdr already has an arc_buf_t, then we will
allocate an additional arc_buf_t and bcopy the uncompressed contents from the
first arc_buf_t to the new one.

Writing to the compressed arc requires that we first discard the b_pdata
since the physical block is about to be rewritten. The new data contents will
be passed in via an arc_buf_t (uncompressed), and during the I/O pipeline
stages we will copy the physical block contents to a newly allocated b_pdata.

When an l2arc is in use, it will also take advantage of the b_pdata. Now the
l2arc will always write the contents of b_pdata to the l2arc. This means that
when compressed arc is enabled, the l2arc blocks are identical to those
stored in the main data pool. This provides a significant advantage since we
can leverage the bp's checksum when reading from the l2arc to determine if
the contents are valid. If the compressed arc is disabled, then we must first
transform the read block to look like the physical block in the main data
pool before comparing the checksum and determining its validity.
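[Editor's note: the following is an illustrative sketch of the read path
described above, not part of the patch. The types and helpers
(sketch_hdr_t, sketch_buf_t, sketch_decompress, sketch_read_cached) are
simplified stand-ins; only the b_pdata/b_buf fields mirror the fields
discussed in the commit message, and the real code uses arc_alloc_buf(),
zio_decompress_data(), and the hdr locking omitted here.]

/*
 * Illustrative sketch only -- simplified types, not the actual arc.c code.
 * Shows the read-path choice described above: bcopy from an existing
 * uncompressed arc_buf_t if one is present, otherwise materialize a new
 * uncompressed buffer from the cached physical block (b_pdata).
 */
#include <stdlib.h>
#include <string.h>

typedef struct sketch_buf {
	struct sketch_buf *b_next;	/* chain of consumers' buffers */
	void *b_data;			/* always uncompressed */
} sketch_buf_t;

typedef struct sketch_hdr {
	void *b_pdata;			/* on-disk (possibly compressed) copy */
	size_t b_psize;			/* physical (compressed) size */
	size_t b_lsize;			/* logical (uncompressed) size */
	int b_compressed;		/* nonzero if b_pdata is compressed */
	sketch_buf_t *b_buf;		/* head of the arc_buf_t chain */
} sketch_hdr_t;

/*
 * Stand-in for the real decompression routine (zio_decompress_data in the
 * patch); this stub only models the trivial case so the sketch compiles.
 */
static int
sketch_decompress(const void *src, size_t psize, void *dst, size_t lsize)
{
	(void) psize;
	memcpy(dst, src, lsize);
	return (0);
}

/*
 * Hand an uncompressed buffer to a consumer for a header that is already
 * cached.
 */
static sketch_buf_t *
sketch_read_cached(sketch_hdr_t *hdr)
{
	sketch_buf_t *buf = calloc(1, sizeof (*buf));

	if (buf == NULL)
		return (NULL);
	buf->b_data = malloc(hdr->b_lsize);
	if (buf->b_data == NULL) {
		free(buf);
		return (NULL);
	}

	if (hdr->b_buf != NULL) {
		/* A buffer already holds uncompressed data; bcopy it. */
		memcpy(buf->b_data, hdr->b_buf->b_data, hdr->b_lsize);
	} else if (hdr->b_compressed) {
		/* Decompress the cached physical block into b_data. */
		if (sketch_decompress(hdr->b_pdata, hdr->b_psize,
		    buf->b_data, hdr->b_lsize) != 0) {
			free(buf->b_data);
			free(buf);
			return (NULL);
		}
	} else {
		/* b_pdata is already uncompressed; a plain copy suffices. */
		memcpy(buf->b_data, hdr->b_pdata, hdr->b_lsize);
	}

	/* Link the new buffer onto the header's list of active buffers. */
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	return (buf);
}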
OpenZFS-issue: https://www.illumos.org/issues/6950 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7fc10f0 Issue #5078 --- cmd/zdb/zdb.c | 2 +- cmd/ztest/ztest.c | 7 + include/sys/arc.h | 93 +- include/sys/arc_impl.h | 49 +- include/sys/dbuf.h | 13 +- include/sys/refcount.h | 2 + include/sys/spa.h | 8 +- include/sys/trace_arc.h | 30 +- include/sys/trace_dbuf.h | 14 + include/sys/zio.h | 4 + include/sys/zio_checksum.h | 4 + man/man5/zfs-module-parameters.5 | 14 - module/zfs/arc.c | 3520 ++++++++++++++++-------------- module/zfs/dbuf.c | 662 ++++-- module/zfs/dbuf_stats.c | 2 +- module/zfs/dmu.c | 7 +- module/zfs/dmu_diff.c | 4 +- module/zfs/dmu_objset.c | 15 +- module/zfs/dmu_send.c | 8 +- module/zfs/dmu_traverse.c | 4 +- module/zfs/dnode.c | 2 +- module/zfs/dnode_sync.c | 4 +- module/zfs/dsl_scan.c | 6 +- module/zfs/refcount.c | 24 + module/zfs/zil.c | 6 +- module/zfs/zio.c | 6 +- module/zfs/zio_checksum.c | 57 +- 27 files changed, 2520 insertions(+), 2047 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 6a076e65d..611e92294 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1297,7 +1297,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, } if (!err) ASSERT3U(fill, ==, BP_GET_FILL(bp)); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (err); diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 852bf00a6..1b77b6cee 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -192,6 +192,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5880,6 +5881,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } thread_exit(); diff --git a/include/sys/arc.h b/include/sys/arc.h index d8a85e830..13788a9b6 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -44,12 +44,24 @@ extern "C" { */ #define ARC_EVICT_ALL -1ULL +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); typedef void arc_prune_func_t(int64_t bytes, void *private); -typedef int arc_evict_func_t(void *private); /* Shared module parameters */ extern int zfs_arc_average_blocksize; @@ -58,6 +70,8 @@ extern int zfs_arc_average_blocksize; arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; +extern int zfs_arc_num_sublists_per_state; + /* generic arc_prune_func_t wrapper for callbacks */ struct arc_prune { arc_prune_func_t *p_pfunc; @@ -77,37 +91,54 @@ typedef enum arc_flags /* * Public flags that can be passed into the ARC by external consumers. 
*/ - ARC_FLAG_NONE = 1 << 0, /* No flags set */ - ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ - ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ - ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ - ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ - ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ - ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, - ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, + ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 18, + ARC_FLAG_BUFC_METADATA = 1 << 14, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 19, - ARC_FLAG_HAS_L2HDR = 1 << 20, + ARC_FLAG_HAS_L1HDR = 1 << 15, + ARC_FLAG_HAS_L2HDR = 1 << 16, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 17, + ARC_FLAG_SHARED_DATA = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 } arc_flags_t; @@ -116,11 +147,10 @@ struct arc_buf { arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; - arc_evict_func_t *b_efunc; - void *b_private; }; typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -154,7 +184,7 @@ typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; uint32_t abi_flags; - uint32_t abi_datacnt; + uint32_t abi_bufcnt; uint64_t abi_size; uint64_t abi_spa; uint64_t abi_access; @@ -171,13 +201,12 @@ typedef struct arc_buf_info { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); -arc_buf_t *arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, +arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, uint64_t size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); -void arc_buf_add_ref(arc_buf_t *buf, void *tag); -boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); @@ -185,7 +214,6 @@ int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); -boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif @@ -194,8 +222,7 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *child_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, @@ -205,13 +232,11 @@ arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); -void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); -boolean_t arc_clear_callback(arc_buf_t *buf); - void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 5c57c3157..c23187d6a 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,7 +74,7 @@ typedef struct arc_state { /* * total amount of evictable data in this state */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
@@ -140,11 +140,13 @@ struct arc_write_callback { */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; arc_buf_t *b_buf; - uint32_t b_datacnt; + uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; + uint8_t b_byteswap; /* protected by arc state mutex */ @@ -163,8 +165,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; + void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev { @@ -185,10 +186,7 @@ typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ - /* real alloc'd buffer size depending on b_compress applied */ uint32_t b_hits; - int32_t b_asize; - uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -202,20 +200,37 @@ struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; - /* - * Even though this checksum is only set/verified when a buffer is in - * the L1 cache, it needs to be in the set of common fields because it - * must be preserved from the time before a buffer is written out to - * L2ARC until after it is read back in. - */ - zio_cksum_t *b_freeze_cksum; + arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; - /* immutable */ - int32_t b_size; - uint64_t b_spa; + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 079358911..bf546db6f 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -228,6 +229,11 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; + /* + * Link in dbuf_cache. + */ + multilist_node_t db_cache_link; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. 
*/ @@ -303,8 +309,7 @@ void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); -void dbuf_clear(dmu_buf_impl_t *db); -void dbuf_evict(dmu_buf_impl_t *db); +void dbuf_destroy(dmu_buf_impl_t *db); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); @@ -342,10 +347,6 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2COMPRESSIBLE(_db) \ - ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ - (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) - #ifdef ZFS_DEBUG /* diff --git a/include/sys/refcount.h b/include/sys/refcount.h index d3f90fdc6..ac82a4106 100644 --- a/include/sys/refcount.h +++ b/include/sys/refcount.h @@ -70,6 +70,7 @@ int64_t refcount_remove(refcount_t *rc, void *holder_tag); int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); void refcount_transfer(refcount_t *dst, refcount_t *src); +void refcount_transfer_ownership(refcount_t *, void *, void *); void refcount_init(void); void refcount_fini(void); @@ -97,6 +98,7 @@ typedef struct refcount { atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } +#define refcount_transfer_ownership(rc, current_holder, new_holder) #define refcount_init() #define refcount_fini() diff --git a/include/sys/spa.h b/include/sys/spa.h index fead2d9de..320c58402 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -135,6 +135,8 @@ _NOTE(CONSTCOND) } while (0) #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ +#define SPA_COMPRESSBITS 7 + /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. 
@@ -363,8 +365,10 @@ _NOTE(CONSTCOND) } while (0) 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 0fca639e1..0384a44ab 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -50,9 +50,10 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_datacnt) + __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) - __field(uint64_t, hdr_size) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) @@ -68,8 +69,9 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; - __entry->hdr_size = ab->b_size; + __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; + __entry->hdr_psize = ab->b_psize; + __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; __entry->hdr_access = ab->b_l1hdr.b_arc_access; @@ -81,13 +83,13 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x datacnt %u type %u size %llu spa %llu " + "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, - __entry->hdr_spa, __entry->hdr_state_type, + __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, @@ -185,9 +187,10 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) - __field(uint32_t, hdr_datacnt) + __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) - __field(uint64_t, hdr_size) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) @@ -215,8 +218,9 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; - __entry->hdr_size = hdr->b_size; + __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; + __entry->hdr_psize = hdr->b_psize; + __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; __entry->hdr_access = hdr->b_l1hdr.b_arc_access; @@ -246,7 +250,7 @@ 
DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " - "flags 0x%x datacnt %u size %llu spa %llu state_type %u " + "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -255,7 +259,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_size, + __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/include/sys/trace_dbuf.h b/include/sys/trace_dbuf.h index aca6d6531..76274d152 100644 --- a/include/sys/trace_dbuf.h +++ b/include/sys/trace_dbuf.h @@ -92,6 +92,20 @@ DEFINE_EVENT(zfs_dbuf_class, name, \ TP_ARGS(db, zio)) DEFINE_DBUF_EVENT(zfs_blocked__read); +DECLARE_EVENT_CLASS(zfs_dbuf_evict_one_class, + TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), + TP_ARGS(db, mls), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk(DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS) +); + +#define DEFINE_DBUF_EVICT_ONE_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_evict_one_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), \ + TP_ARGS(db, mls)) +DEFINE_DBUF_EVICT_ONE_EVENT(zfs_dbuf__evict__one); + #endif /* _TRACE_DBUF_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/sys/zio.h b/include/sys/zio.h index 42e85e666..7388eb72b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -527,6 +527,10 @@ extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void *zio_buf_alloc_flags(size_t size, int flags); +extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9fcfd521f..04573ba54 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -64,8 +64,12 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; */ extern zio_checksum_func_t zio_checksum_SHA256; +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); +extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index aa2d06d6a..3c652277b 100755 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -96,20 +96,6 @@ successfully compressed before writing. A value of 100 disables this feature. Default value: \fB200\fR. .RE -.sp -.ne 2 -.na -\fBl2arc_max_block_size\fR (ulong) -.ad -.RS 12n -The maximum block size which may be written to an L2ARC device, after -compression and other factors. 
This setting is used to prevent a small -number of large blocks from pushing a larger number of small blocks out -of the cache. -.sp -Default value: \fB16,777,216\fR. -.RE - .sp .ne 2 .na diff --git a/module/zfs/arc.c b/module/zfs/arc.c index eee4973b2..43f0bfa4a 100755 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* @@ -128,9 +128,134 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. + * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. 
+ * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + #include #include +#include #include +#include #include #include #include @@ -162,10 +287,6 @@ static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; -static kmutex_t arc_user_evicts_lock; -static kcondvar_t arc_user_evicts_cv; -static boolean_t arc_user_evicts_thread_exit; - /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower @@ -224,6 +345,11 @@ static int arc_dead; */ static boolean_t arc_warm; +/* + * log2 fraction of the zio arena to keep free. + */ +int arc_zio_arena_free_shift = 2; + /* * These tunables are for performance analysis. 
*/ @@ -236,9 +362,10 @@ unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; -int zfs_disable_dup_eviction = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +int zfs_compressed_arc_enabled = B_TRUE; + /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. @@ -318,6 +445,26 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually @@ -463,25 +610,17 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_writes_skip_toobig; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_l2_compress_successes; - kstat_named_t arcstat_l2_compress_zeros; - kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_memory_throttle_count; - kstat_named_t arcstat_duplicate_buffers; - kstat_named_t arcstat_duplicate_buffers_size; - kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; @@ -532,6 +671,9 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, @@ -563,25 +705,17 @@ static arc_stats_t arc_stats = { { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, - { "l2_writes_skip_toobig", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", 
KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "l2_compress_successes", KSTAT_DATA_UINT64 }, - { "l2_compress_zeros", KSTAT_DATA_UINT64 }, - { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, - { "duplicate_buffers", KSTAT_DATA_UINT64 }, - { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, @@ -658,7 +792,7 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_no_grow ARCSTAT(arcstat_no_grow) +#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ @@ -672,14 +806,16 @@ static arc_state_t *arc_l2c_only; #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ -#define L2ARC_IS_VALID_COMPRESS(_c_) \ - ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static taskq_t *arc_prune_taskq; -static arc_buf_t *arc_eviction_list; -static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -689,25 +825,35 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) -#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ - ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ - ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 
(highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) + /* * Other sizes */ @@ -753,7 +899,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ -#define L2ARC_MAX_BLOCK_SIZE (16 * 1024 * 1024) /* max compress size */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -763,17 +908,6 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ - -/* - * Used to distinguish headers that are being process by - * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk - * address. This can happen when the header is added to the l2arc's list - * of buffers to write in the first stage of l2arc_write_buffers(), but - * has not yet been written out which happens in the second stage of - * l2arc_write_buffers(). - */ -#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) - #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -782,11 +916,9 @@ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -unsigned long l2arc_max_block_size = L2ARC_MAX_BLOCK_SIZE; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_nocompress = B_FALSE; /* don't compress bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ @@ -803,19 +935,17 @@ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ - enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -823,7 +953,10 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void arc_get_data_buf(arc_buf_t *); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); @@ -832,14 +965,12 @@ static void arc_prune_async(int64_t); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t 
arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); -static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *); - static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -857,14 +988,14 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) @@ -886,7 +1017,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, hdr)) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } @@ -924,13 +1055,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { - if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -958,12 +1089,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { - ASSERT(fhdr != NULL); + ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; - hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -1060,7 +1191,7 @@ hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); @@ -1074,7 +1205,7 @@ hdr_l2only_dest(void *vbuf, void *unused) { ASSERTV(arc_buf_hdr_t *hdr = vbuf); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } @@ -1156,159 +1287,140 @@ retry: } } -/* - * Transition between the two allocation states for the arc_buf_hdr struct. - * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without - * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller - * version is used when a cache buffer is only in the L2ARC in order to reduce - * memory usage. 
- */ -static arc_buf_hdr_t * -arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) { - arc_buf_hdr_t *nhdr; - l2arc_dev_t *dev; - - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || - (old == hdr_l2only_cache && new == hdr_full_cache)); - - dev = hdr->b_l2hdr.b_dev; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); - - ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); - buf_hash_remove(hdr); - - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - - if (new == hdr_full_cache) { - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - /* - * arc_access and arc_change_state need to be aware that a - * header has just come out of L2ARC, so we set its state to - * l2c_only even though it's about to change. - */ - nhdr->b_l1hdr.b_state = arc_l2c_only; - - /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - ASSERT(hdr->b_l1hdr.b_buf == NULL); - ASSERT0(hdr->b_l1hdr.b_datacnt); - - /* - * If we've reached here, We must have been called from - * arc_evict_hdr(), as such we should have already been - * removed from any ghost list we were previously on - * (which protects us from racing with arc_evict_state), - * thus no locking is needed during this check. - */ - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - - /* - * A buffer must not be moved into the arc_l2c_only - * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field - * might try to be accessed, even though it was removed. - */ - VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - - nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; - } - /* - * The header has been reallocated so we need to re-insert it into any - * lists it was on. - */ - (void) buf_hash_insert(nhdr, NULL); - - ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); - - mutex_enter(&dev->l2ad_mtx); - - /* - * We must place the realloc'ed header back into the list at - * the same spot. Otherwise, if it's placed earlier in the list, - * l2arc_write_buffers() could find it during the function's - * write phase, and try to write it out to the l2arc. - */ - list_insert_after(&dev->l2ad_buflist, hdr, nhdr); - list_remove(&dev->l2ad_buflist, hdr); - - mutex_exit(&dev->l2ad_mtx); - - /* - * Since we're using the pointer address as the tag when - * incrementing and decrementing the l2ad_alloc refcount, we - * must remove the old pointer (that we're about to destroy) and - * add the new pointer to the refcount. Otherwise we'd remove - * the wrong pointer address when calling arc_hdr_destroy() later. 
- */ - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - - (void) refcount_add_many(&dev->l2ad_alloc, - nhdr->b_l2hdr.b_asize, nhdr); - - buf_discard_identity(hdr); - hdr->b_freeze_cksum = NULL; - kmem_cache_free(old, hdr); - - return (nhdr); + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); } - -#define ARC_MINTIME (hz>>4) /* 62 ms */ +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); - return (equal); + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. + * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + uint64_t lsize; + uint64_t csize; + void *cbuf; + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + + cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + lsize = HDR_GET_LSIZE(hdr); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. 
Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. + */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; + ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -1326,7 +1438,7 @@ arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, + ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), PROT_READ | PROT_WRITE)); } #endif @@ -1338,18 +1450,22 @@ arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); + ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), + PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { + arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { - return (ARC_BUFC_METADATA); + type = ARC_BUFC_METADATA; } else { - return (ARC_BUFC_DATA); + type = ARC_BUFC_DATA; } + VERIFY3U(hdr->b_type, ==, type); + return (type); } static uint32_t @@ -1371,50 +1487,245 @@ arc_bufc_to_flags(arc_buf_contents_t type) void arc_buf_thaw(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_l1hdr.b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (HDR_IO_IN_PROGRESS(buf->b_hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) 
panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; - } - - mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); - + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_l1hdr.b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); mutex_exit(hash_lock); } +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. + * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. 
+ */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + arc_buf_t *buf; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + arc_buf_t *buf; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. 
When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(hash_lock)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } state = hdr->b_l1hdr.b_state; @@ -1422,27 +1733,20 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { - arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_remove(list, hdr); - - if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { @@ -1459,15 +1763,9 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - arc_buf_contents_t type = arc_buf_type(hdr); - multilist_t *list = &state->arcs_list[type]; - uint64_t *size = &state->arcs_lsize[type]; - - multilist_insert(list, hdr); - - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - atomic_add_64(size, hdr->b_size * - hdr->b_l1hdr.b_datacnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } @@ -1502,7 +1800,7 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -1513,14 +1811,12 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; - abi->abi_l2arc_asize = l2hdr->b_asize; - abi->abi_l2arc_compress = l2hdr->b_compress; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? 
state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_size = hdr->b_size; + abi->abi_size = arc_hdr_size(hdr); } /* @@ -1533,8 +1829,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, { arc_state_t *old_state; int64_t refcnt; - uint32_t datacnt; - uint64_t from_delta, to_delta; + uint32_t bufcnt; + boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* @@ -1547,20 +1843,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); - datacnt = hdr->b_l1hdr.b_datacnt; + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; - datacnt = 0; + bufcnt = 0; + update_old = B_FALSE; } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || datacnt > 0); - ASSERT(!GHOST_STATE(new_state) || datacnt == 0); - ASSERT(old_state != arc_anon || datacnt <= 1); - - from_delta = to_delta = datacnt * hdr->b_size; + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -1568,26 +1864,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - uint64_t *size = &old_state->arcs_lsize[buftype]; - ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - /* - * If prefetching out of the ghost cache, - * we will have a non-zero datacnt. - */ - if (GHOST_STATE(old_state) && datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(hdr->b_l1hdr.b_buf == NULL); - from_delta = hdr->b_size; + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { - uint64_t *size = &new_state->arcs_lsize[buftype]; - /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or @@ -1597,39 +1884,39 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); - ASSERT(hdr->b_l1hdr.b_buf == NULL); - to_delta = hdr->b_size; + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; } - atomic_add_64(size, to_delta); + arc_evictable_space_increment(hdr, new_state); } } - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) { + if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(datacnt); + ASSERT0(bufcnt); /* - * We moving a header to a ghost state, we first + * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a - * datacnt of zero, and no arc buffer to use for + * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. 
*/ (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1638,35 +1925,54 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&new_state->arcs_size, - hdr->b_size, buf); + HDR_GET_LSIZE(hdr), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); } } } - if (from_delta && old_state != arc_l2c_only) { + if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + /* * When moving a header off of a ghost state, - * there's the possibility for datacnt to be - * non-zero. This is because we first add the - * arc buffer to the header prior to changing - * the header's state. Since we used the header - * for the reference when putting the header on - * the ghost state, we must balance that and use - * the header when removing off the ghost state - * (even though datacnt is non zero). + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. */ - IMPLY(datacnt == 0, new_state == arc_anon || - new_state == arc_l2c_only); - (void) refcount_remove_many(&old_state->arcs_size, - hdr->b_size, hdr); + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { arc_buf_t *buf; - ASSERT3U(datacnt, !=, 0); + uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -1675,9 +1981,29 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many( - &old_state->arcs_size, hdr->b_size, buf); + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } @@ -1771,18 +2097,23 @@ arc_space_return(uint64_t space, arc_space_type_t type) atomic_add_64(&arc_size, -space); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) +/* + * Allocate an initial buffer for this hdr, subsequent buffers will + * use arc_buf_clone(). 
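As a rough sketch of the arcs_size accounting that arc_change_state() performs above, the following model (simplified stand-in types, with arc_hdr_size() treated as simply the b_pdata size) computes the charge for a non-ghost state; ghost states instead charge a single HDR_GET_LSIZE() against the header itself:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Simplified stand-ins for illustration only. */
struct model_buf {
	struct model_buf *next;
	bool shared;		/* buffer shares its data with the hdr */
};

struct model_hdr {
	uint64_t lsize;		/* logical (uncompressed) size */
	uint64_t pdata_size;	/* size of b_pdata, if allocated */
	bool has_pdata;
	struct model_buf *bufs;
};

/*
 * The hdr owns the charge for its b_pdata, each unshared arc_buf_t adds one
 * logical-size copy, and a shared buffer is skipped because its bytes are
 * already accounted to the hdr.
 */
static uint64_t
model_state_charge(const struct model_hdr *hdr)
{
	uint64_t charge = hdr->has_pdata ? hdr->pdata_size : 0;
	const struct model_buf *buf;

	for (buf = hdr->bufs; buf != NULL; buf = buf->next)
		if (!buf->shared)
			charge += hdr->lsize;
	return (charge);
}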
+ */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; - VERIFY3U(size, <=, spa_maxblocksize(spa)); - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - ASSERT3P(hdr->b_freeze_cksum, ==, NULL); - hdr->b_size = size; - hdr->b_spa = spa_load_guid(spa); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; @@ -1792,23 +2123,64 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; buf->b_next = NULL; - hdr->b_flags = arc_bufc_to_flags(type); - hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC write) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing it's + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. + */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_state = arc_anon; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_datacnt = 1; - hdr->b_l1hdr.b_tmp_cdata = NULL; + hdr->b_l1hdr.b_bufcnt += 1; - arc_get_data_buf(buf); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + return (buf); +} +/* + * Used when allocating additional buffers. 
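The share test inside arc_buf_alloc_impl() above boils down to three conditions; here is that predicate in isolation, as a hedged sketch with simplified stand-in fields:

#include <stdbool.h>

/* Simplified stand-ins for the real header flags. */
struct model_hdr {
	bool needs_byteswap;	/* b_byteswap != DMU_BSWAP_NUMFUNCS */
	bool compressed;	/* HDR_GET_COMPRESS() != ZIO_COMPRESS_OFF */
	bool l2_writing;	/* HDR_L2_WRITING() */
};

/*
 * The buf may point directly at the hdr's b_pdata only when no
 * transformation of that data is required and the l2arc is not in the
 * middle of writing it out; otherwise a private copy must be allocated.
 */
static bool
model_can_share(const struct model_hdr *hdr)
{
	return (!hdr->needs_byteswap && !hdr->compressed && !hdr->l2_writing);
}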
+ */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -1825,7 +2197,7 @@ arc_loan_buf(spa_t *spa, uint64_t size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -1839,12 +2211,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ @@ -1853,171 +2225,102 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); + ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; - atomic_add_64(&arc_loaned_bytes, hdr->b_size); -} - -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) -{ - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_l1hdr.b_buf; - hdr->b_l1hdr.b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - - /* - * This buffer already exists in the arc so create a duplicate - * copy for the caller. If the buffer is associated with user data - * then track the size and number of duplicates. These stats will be - * updated as duplicate buffers are created and destroyed. - */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMP(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); - } - hdr->b_l1hdr.b_datacnt += 1; - return (buf); -} - -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. 
- */ - mutex_enter(&buf->b_evict_lock); - if (buf->b_data == NULL) { - mutex_exit(&buf->b_evict_lock); - return; - } - hash_lock = HDR_LOCK(buf->b_hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - mutex_exit(&buf->b_evict_lock); - - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - l2arc_data_free_t *df; + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; - df->l2df_func = free_func; + df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); + } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); +} + /* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. */ static void -arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_hdr_t *hdr = buf->b_hdr; + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(buf->b_data, hdr->b_size); - } + /* + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. + */ + refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. 
+ */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - ASSERT(HDR_HAS_L2HDR(hdr)); - ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* - * The b_tmp_cdata field is linked off of the b_l1hdr, so if - * that doesn't exist, the header is in the arc_l2c_only state, - * and there isn't anything to free (it's already been freed). + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. */ - if (!HDR_HAS_L1HDR(hdr)) - return; + refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; /* - * The header isn't being written to the l2arc device, thus it - * shouldn't have a b_tmp_cdata to free. + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. */ - if (!HDR_L2_WRITING(hdr)) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - - /* - * The header does not have compression enabled. This can be due - * to the buffer not being compressible, or because we're - * freeing the buffer before the second phase of - * l2arc_write_buffer() has started (which does the compression - * step). In either case, b_tmp_cdata does not point to a - * separately compressed buffer, so there's nothing to free (it - * points to the same buffer as the arc_buf_t's b_data field). - */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { - hdr->b_l1hdr.b_tmp_cdata = NULL; - return; - } - - /* - * There's nothing to free since the buffer was all zero's and - * compressed to a zero length buffer. - */ - if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - return; - } - - ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); - - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size, zio_data_buf_free); - - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - hdr->b_l1hdr.b_tmp_cdata = NULL; + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* @@ -2025,54 +2328,42 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t remove) +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_buf_t *lastbuf = NULL; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); - /* free up data associated with the buf */ + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ if (buf->b_data != NULL) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
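To make the kstat movement performed by arc_share_buf()/arc_unshare_buf() above easier to follow, here is a small model of the three counters involved, with plain integers standing in for the arcstat kstats:

#include <stdint.h>

/* Simplified counters standing in for the arcstats touched above. */
struct model_arcstats {
	int64_t compressed_size;	/* bytes held in b_pdata buffers */
	int64_t uncompressed_size;	/* logical bytes those represent */
	int64_t overhead_size;		/* bytes in unshared arc_buf_t copies */
};

/*
 * When a buf starts sharing with the hdr, its bytes stop being "overhead"
 * and are counted as hdr-owned data; unsharing reverses the movement.
 */
static void
model_share(struct model_arcstats *st, uint64_t hdr_size, uint64_t lsize)
{
	st->compressed_size += hdr_size;
	st->uncompressed_size += lsize;
	st->overhead_size -= lsize;
}

static void
model_unshare(struct model_arcstats *st, uint64_t hdr_size, uint64_t lsize)
{
	st->compressed_size -= hdr_size;
	st->uncompressed_size -= lsize;
	st->overhead_size += lsize;
}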
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); } - - /* protected by hash lock, if in the hash table */ - if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; - - ASSERT(refcount_is_zero( - &buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(state != arc_anon && state != arc_l2c_only); - - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); - } - - (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; - /* - * If we're destroying a duplicate buffer make sure - * that the appropriate statistics are updated. - */ - if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && - HDR_ISTYPE_DATA(buf->b_hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); - } - ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); - buf->b_hdr->b_l1hdr.b_datacnt -= 1; + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ @@ -2080,68 +2371,269 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; - bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; - buf->b_next = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; - ASSERT(buf->b_efunc == NULL); + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); + + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
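The unlink walk in arc_buf_destroy_impl() above both removes the dying buf and remembers which buffer ends up last, since that survivor is the candidate for re-sharing the hdr's b_pdata. A standalone sketch of that walk, with a bare singly linked list standing in for the b_buf chain:

#include <stddef.h>

/* Simplified stand-in for the hdr's arc_buf_t list. */
struct model_buf {
	struct model_buf *next;
};

/*
 * Unlink 'victim' from the list rooted at *head and return whichever buffer
 * is last after the removal (NULL if the list is now empty).
 */
static struct model_buf *
model_unlink(struct model_buf **head, struct model_buf *victim)
{
	struct model_buf **bufp = head;
	struct model_buf *last = NULL;

	while (*bufp != NULL) {
		if (*bufp == victim)
			*bufp = victim->next;
		if (*bufp != NULL) {
			last = *bufp;
			bufp = &(*bufp)->next;
		}
	}
	victim->next = NULL;
	return (last);
}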
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } +static void +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. + */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
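The full-versus-L2-only split that arc_hdr_realloc() converts between can be pictured with a toy layout like the one below; the field names are illustrative only, but the shared prefix (which is why the bcopy() of HDR_L2ONLY_SIZE bytes above works) and the droppable L1 tail are the point:

#include <stdint.h>
#include <stdio.h>

struct model_l1hdr {			/* dropped when only the l2arc has it */
	void		*pdata;		/* cached copy of the block */
	void		*buf_list;	/* uncompressed arc_buf_t's */
	uint64_t	refcnt;
	uint32_t	bufcnt;
};

struct model_hdr_l2only {		/* what hdr_l2only_cache would hold */
	uint64_t	spa;
	uint16_t	psize;
	uint16_t	lsize;
	uint64_t	l2_daddr;	/* location on the cache device */
};

struct model_hdr_full {			/* what hdr_full_cache would hold */
	struct model_hdr_l2only	common;	/* shared prefix */
	struct model_l1hdr	l1;
};

int
main(void)
{
	printf("l2only: %zu bytes, full: %zu bytes\n",
	    sizeof (struct model_hdr_l2only), sizeof (struct model_hdr_full));
	return (0);
}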
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. + */ + + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); + + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. 
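A minimal usage sketch of the allocate/free pair being introduced here; 'spa' and 'my_tag' are hypothetical caller-supplied values, error handling is omitted, and the usual ZFS headers are assumed:

static void
example_consumer(spa_t *spa, void *my_tag)
{
	/* 4K data buffer, always allocated uncompressed and returned thawed. */
	arc_buf_t *buf = arc_alloc_buf(spa, 4096, my_tag, ARC_BUFC_DATA);

	/* ... fill buf->b_data with up to arc_buf_size(buf) bytes ... */

	/* Drop the reference held by 'my_tag' and tear the buffer down. */
	arc_buf_destroy(buf, my_tag);
}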
+ */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); +} + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); - /* - * We don't want to leak the b_tmp_cdata buffer that was - * allocated in l2arc_write_buffers() - */ - arc_buf_l2_cdata_free(hdr); + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - /* - * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then - * this header is being processed by l2arc_write_buffers() (i.e. - * it's in the first stage of l2arc_write_buffers()). - * Re-affirming that truth here, just to serve as a reminder. If - * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or - * may not have its HDR_L2_WRITING flag set. (the write may have - * completed, in which case HDR_L2_WRITING will be false and the - * b_daddr field will point to the address of the buffer on disk). - */ - IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); - /* - * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with - * l2arc_write_buffers(). Since we've just removed this header - * from the l2arc buffer list, this header will never reach the - * second stage of l2arc_write_buffers(), which increments the - * accounting stats for this header. Thus, we must be careful - * not to decrement them for this header either. 
- */ - if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - - vdev_space_update(dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - - (void) refcount_remove_many(&dev->l2ad_alloc, - l2hdr->b_asize, hdr); - } - - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void @@ -2149,13 +2641,16 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_datacnt > 0); + hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -2179,33 +2674,14 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) - buf_discard_identity(hdr); - - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } - if (HDR_HAS_L1HDR(hdr)) { - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; + arc_cksum_free(hdr); - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); - } + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); } } @@ -2220,133 +2696,35 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } void -arc_buf_free(arc_buf_t *buf, void *tag) -{ - arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_l1hdr.b_state != arc_anon; - - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); - - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); - - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; - /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. 
- */ - mutex_enter(&arc_user_evicts_lock); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_user_evicts_lock); - if (destroy_hdr) - arc_hdr_destroy(hdr); - } else { - if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, TRUE); - else - arc_hdr_destroy(hdr); - } -} - -boolean_t -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; } mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT(buf->b_data != NULL); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_l1hdr.b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - } - ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || - refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); - return (no_callback); } uint64_t arc_buf_size(arc_buf_t *buf) { - return (buf->b_hdr->b_size); -} - -/* - * Called from the DMU to determine if the current buffer should be - * evicted. In order to ensure proper locking, the eviction must be initiated - * from the DMU. Return true if the buffer is associated with user data and - * duplicate buffers still exist. - */ -boolean_t -arc_buf_eviction_needed(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - boolean_t evict_needed = B_FALSE; - - if (zfs_disable_dup_eviction) - return (B_FALSE); - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(); let that function - * perform the eviction. - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We have already been added to the arc eviction list; - * recommend eviction. - */ - ASSERT3P(hdr, ==, &arc_eviction_hdr); - mutex_exit(&buf->b_evict_lock); - return (B_TRUE); - } - - if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) - evict_needed = B_TRUE; - - mutex_exit(&buf->b_evict_lock); - return (evict_needed); + return (HDR_GET_LSIZE(buf->b_hdr)); } /* @@ -2373,11 +2751,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_tmp_cdata field) during its write phase. + * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. 
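The ghost-header eviction policy spelled out in the comment above (wait while an l2arc write still needs the L1 portion, otherwise demote to the L2-only form or destroy) can be summarized with a small standalone decision function; the names are stand-ins, not the real code:

#include <stdbool.h>

enum model_action { MODEL_SKIP, MODEL_DEMOTE_TO_L2ONLY, MODEL_DESTROY };

struct model_hdr {
	bool l2_writing;	/* HDR_L2_WRITING(): l2arc write in flight */
	bool has_l2hdr;		/* block also lives on the cache device */
};

/*
 * Ghost headers carry no data, but their L1 portion may still be needed by
 * an in-flight l2arc write, so eviction must wait; otherwise the header is
 * either demoted to the small L2-only form or destroyed outright.
 */
static enum model_action
model_evict_ghost(const struct model_hdr *hdr)
{
	if (hdr->l2_writing)
		return (MODEL_SKIP);
	if (hdr->has_l2hdr)
		return (MODEL_DEMOTE_TO_L2ONLY);
	return (MODEL_DESTROY);
}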
@@ -2388,11 +2766,13 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ARCSTAT_BUMP(arcstat_deleted); - bytes_evicted += hdr->b_size; + bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -2405,6 +2785,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } @@ -2424,7 +2805,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { @@ -2432,37 +2812,39 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) break; } if (buf->b_data != NULL) - bytes_evicted += hdr->b_size; - if (buf->b_efunc != NULL) { - mutex_enter(&arc_user_evicts_lock); - arc_buf_destroy(buf, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - cv_signal(&arc_user_evicts_cv); - mutex_exit(&arc_user_evicts_lock); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) - ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); - else - ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); + } else { + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); + } } - if (hdr->b_l1hdr.b_datacnt == 0) { + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. + * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). + */ + arc_hdr_free_pdata(hdr); + arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } @@ -2718,12 +3100,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * - * When 'retry' is set to FALSE, the function will make a single pass + * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. 
* - * When 'retry' is set to TRUE, the function will continually retry the + * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might @@ -2735,7 +3117,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, { uint64_t evicted = 0; - while (state->arcs_lsize[type] != 0) { + while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) @@ -2806,8 +3188,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, { int64_t delta; - if (bytes > 0 && state->arcs_lsize[type] > 0) { - delta = MIN(state->arcs_lsize[type], bytes); + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } @@ -2834,8 +3216,8 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, static uint64_t arc_adjust_meta_balanced(void) { - int64_t adjustmnt, delta, prune = 0; - uint64_t total_evicted = 0; + int64_t delta, prune = 0; + uint64_t adjustmnt, total_evicted = 0; arc_buf_contents_t type = ARC_BUFC_DATA; int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); @@ -2850,8 +3232,9 @@ restart: */ adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { - delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); + if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&arc_mru->arcs_esize[type]), + adjustmnt); total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } @@ -2866,23 +3249,26 @@ restart: * simply decrement the amount of data evicted from the MRU. 
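The fall-back described above (take what can be evicted from the MRU, then carry the remaining overage into the MFU) has roughly the following shape; this is a hedged skeleton with plain integers standing in for the arcs_esize refcounts, not the full four-state loop:

#include <stdint.h>

static uint64_t
model_evict_capped(uint64_t evictable, int64_t wanted)
{
	if (wanted <= 0 || evictable == 0)
		return (0);
	return ((uint64_t)wanted < evictable ? (uint64_t)wanted : evictable);
}

/* Satisfy as much of the metadata overage as possible, state by state. */
static uint64_t
model_adjust_meta(int64_t overage, uint64_t mru_evictable,
    uint64_t mfu_evictable)
{
	uint64_t total = 0;
	uint64_t delta;

	delta = model_evict_capped(mru_evictable, overage);
	total += delta;
	overage -= (int64_t)delta;

	/* Whatever is still outstanding falls through to the next state. */
	total += model_evict_capped(mfu_evictable, overage);
	return (total);
}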
*/ - if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { - delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); + if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]), + adjustmnt); total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); } adjustmnt = arc_meta_used - arc_meta_limit; - if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + if (adjustmnt > 0 && + refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[type]); + refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { + if (adjustmnt > 0 && + refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[type]); + refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); } @@ -3178,36 +3564,13 @@ arc_adjust(void) return (total_evicted); } -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_user_evicts_lock); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = NULL; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_user_evicts_lock); - - if (buf->b_efunc != NULL) - VERIFY0(buf->b_efunc(buf->b_private)); - - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); -} - void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* - * If retry is TRUE, a spa must not be specified since we have + * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ @@ -3227,9 +3590,6 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - - arc_do_user_evicts(); - ASSERT(spa || arc_eviction_list == NULL); } void @@ -3372,15 +3732,15 @@ arc_available_memory(void) /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains - * above about 1/16th free. + * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. * - * Note: The 1/16th arena free requirement was put in place - * to aggressively evict memory from the arc in order to avoid - * memory fragmentation issues. + * Note that reducing the arc_zio_arena_free_shift keeps more virtual + * memory (in the zio_arena) free, which can avoid memory + * fragmentation issues. */ if (zio_arena != NULL) { - n = vmem_size(zio_arena, VMEM_FREE) - - (vmem_size(zio_arena, VMEM_ALLOC) >> 4); + n = vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, + VMEM_ALLOC) >> arc_zio_arena_free_shift); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; @@ -3400,7 +3760,7 @@ arc_available_memory(void) /* * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of TRUE indicates that the system + * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. 
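The zio_arena headroom test in arc_available_memory() above reduces to one expression, isolated here as a sketch; per the comment, a shift of 2 corresponds to keeping roughly a quarter of the arena free, and a negative result means the arena has dipped below that reserve and FMR_ZIO_ARENA pressure is reported:

#include <stdint.h>

static int64_t
model_zio_arena_headroom(uint64_t vmem_free, uint64_t vmem_alloc,
    unsigned int free_shift)
{
	/* Keep roughly 1/(2^free_shift) of the arena free. */
	return ((int64_t)vmem_free - (int64_t)(vmem_alloc >> free_shift));
}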
*/ static boolean_t @@ -3489,6 +3849,21 @@ arc_reclaim_thread(void) arc_tuning_update(); + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ +#ifndef __linux__ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); +#endif mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { @@ -3558,60 +3933,13 @@ arc_reclaim_thread(void) } } - arc_reclaim_thread_exit = FALSE; + arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ spl_fstrans_unmark(cookie); thread_exit(); } -static void -arc_user_evicts_thread(void) -{ - fstrans_cookie_t cookie = spl_fstrans_mark(); - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); - - mutex_enter(&arc_user_evicts_lock); - while (!arc_user_evicts_thread_exit) { - mutex_exit(&arc_user_evicts_lock); - - arc_do_user_evicts(); - - /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. - */ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); - - mutex_enter(&arc_user_evicts_lock); - - /* - * Block until signaled, or after one second (we need to - * call the arc's kstat update function regularly). 
- */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig(&arc_user_evicts_cv, - &arc_user_evicts_lock, ddi_get_lbolt() + hz); - CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); - } - - arc_user_evicts_thread_exit = FALSE; - cv_broadcast(&arc_user_evicts_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ - spl_fstrans_unmark(cookie); - thread_exit(); -} - #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the @@ -3661,15 +3989,15 @@ arc_user_evicts_thread(void) static uint64_t arc_evictable_memory(void) { uint64_t arc_clean = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; + refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); uint64_t ghost_clean = - arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] + - arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]; + refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]) + + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]) + + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); if (arc_dirty >= arc_c_min) @@ -3845,18 +4173,17 @@ arc_is_overflowing(void) } /* - * The buffer, supplied as the first argument, needs a data block. If we - * are hitting the hard limit for the cache size, we must sleep, waiting - * for the eviction thread to catch up. If we're past the target size - * but below the hard limit, we'll only signal the reclaim thread and - * continue on. + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. */ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = arc_buf_type(buf->b_hdr); + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -3896,12 +4223,13 @@ arc_get_data_buf(arc_buf_t *buf) mutex_exit(&arc_reclaim_lock); } + VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); + datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); + datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -3909,11 +4237,9 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
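Since arc_get_data_buf() above now hands back the allocation (so the caller can attach it to either a hdr's b_pdata or an arc_buf_t's b_data), its core is easy to model; in this sketch malloc() stands in for the separate zio_buf_alloc()/zio_data_buf_alloc() caches and plain counters for arc_space_consume():

#include <stdint.h>
#include <stdlib.h>

static uint64_t model_space_meta;	/* stands in for ARC_SPACE_META */
static uint64_t model_space_data;	/* stands in for ARC_SPACE_DATA */

typedef enum { MODEL_BUFC_DATA, MODEL_BUFC_METADATA } model_bufc_t;

static void *
model_get_data_buf(size_t size, model_bufc_t type)
{
	void *datap = malloc(size);

	if (datap == NULL)
		return (NULL);
	if (type == MODEL_BUFC_METADATA)
		model_space_meta += size;
	else
		model_space_data += size;
	return (datap);
}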
*/ - if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; - arc_state_t *state = hdr->b_l1hdr.b_state; + if (!GHOST_STATE(state)) { - (void) refcount_add_many(&state->arcs_size, size, buf); + (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is @@ -3926,9 +4252,10 @@ arc_get_data_buf(arc_buf_t *buf) */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], - size); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p @@ -3938,6 +4265,37 @@ arc_get_data_buf(arc_buf_t *buf) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } + return (datap); +} + +/* + * Free the arc data buffer. + */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -3981,7 +4339,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } @@ -4018,7 +4376,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4091,8 +4449,8 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg)); + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -4101,7 +4459,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg)); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; @@ -4109,18 +4467,30 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) } } +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + } +} + static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - 
arc_buf_t *abuf; /* buffer we're assigning to callback */ + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -4139,34 +4509,34 @@ arc_read_done(zio_t *zio) ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); - found = buf_hash_find(hdr->b_spa, zio->io_bp, - &hash_lock); + found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && - hash_lock == NULL) || - (found == hdr && + ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); } - hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } + } + + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - hdr->b_flags &= ~ARC_FLAG_L2CACHE; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - dmu_object_byteswap_t bswap = - DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); - if (BP_GET_LEVEL(zio->io_bp) > 0) - byteswap_uint64_array(buf->b_data, hdr->b_size); - else - dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); - } - - arc_cksum_compute(buf, B_FALSE); - arc_buf_watch(buf); + ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { @@ -4180,31 +4550,50 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. + */ if (abuf == NULL) { - ARCSTAT_BUMP(arcstat_duplicate_reads); - abuf = arc_buf_clone(buf); + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); } - acb->acb_buf = abuf; - abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt == 1); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. 
+ */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error != 0) { - hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -4274,7 +4663,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; - arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -4292,8 +4680,8 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { @@ -4325,7 +4713,8 @@ top: ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { @@ -4346,10 +4735,9 @@ top: acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); + ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; } @@ -4372,34 +4760,36 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); - hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); } - add_reference(hdr, hash_lock, private); + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ buf = hdr->b_l1hdr.b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - hdr->b_flags |= ARC_FLAG_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), @@ -4409,20 +4799,19 @@ top: if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; - enum zio_compress b_compress = ZIO_COMPRESS_OFF; - int32_t b_asize = 0; + uint64_t size; /* * Gracefully handle a damaged logical block size as a * checksum error. 
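The hit paths above decompress the hdr's physical copy at most once per cached block: the first reader with a callback gets a freshly decompressed buffer, and every later reader gets a byte copy of it. A standalone sketch of that behaviour, with sketch_hdr and sketch_decompress() as assumed stand-ins for arc_buf_hdr_t and arc_decompress():

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct sketch_hdr {
	void	*pdata;		/* physical copy, possibly compressed */
	uint64_t psize;
	uint64_t lsize;
	void	*first_buf;	/* first uncompressed copy handed out */
};

/* assumed decompressor; returns 0 on success */
extern int sketch_decompress(const void *src, uint64_t psize,
    void *dst, uint64_t lsize);

static void *
sketch_read_cached(struct sketch_hdr *hdr)
{
	void *buf = malloc(hdr->lsize);

	if (buf == NULL)
		return (NULL);
	if (hdr->first_buf == NULL) {
		/* first reader: pay for one decompression of pdata */
		if (sketch_decompress(hdr->pdata, hdr->psize, buf,
		    hdr->lsize) != 0) {
			free(buf);
			return (NULL);
		}
		hdr->first_buf = buf;
	} else {
		/* later readers: copy the already-uncompressed buffer */
		memcpy(buf, hdr->first_buf, hdr->lsize);
	}
	return (buf);
}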
*/ - if (size > spa_maxblocksize(spa)) { - ASSERT3P(buf, ==, NULL); + if (lsize > spa_maxblocksize(spa)) { rc = SET_ERROR(ECKSUM); goto out; } @@ -4431,8 +4820,9 @@ top: /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); @@ -4442,26 +4832,9 @@ top: /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); - (void) arc_buf_remove_ref(buf, private); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - - /* - * If there is a callback, we pass our reference to - * it; otherwise we remove our reference. - */ - if (done == NULL) { - (void) remove_reference(hdr, hash_lock, - private); - } - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only @@ -4473,53 +4846,60 @@ top: hdr_full_cache); } + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* - * If there is a callback, we pass a reference to it. + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). */ - if (done != NULL) - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_FLAG_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREFETCH; - if (*arc_flags & ARC_FLAG_L2CACHE) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (*arc_flags & ARC_FLAG_L2COMPRESS) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_l1hdr.b_buf = buf; - ASSERT0(hdr->b_l1hdr.b_datacnt); - hdr->b_l1hdr.b_datacnt = 1; - arc_get_data_buf(buf); arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); + + /* + * If compression is enabled on the hdr, then will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. 
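On a miss the hdr is sized from the block pointer, and the comment above describes the follow-on choice: a compressed hdr keeps the on-disk bytes and reads them with ZIO_FLAG_RAW, while an uncompressed hdr holds the logical bytes. A hedged sketch of that decision, using sketch_* names rather than the patch's own types:

#include <stdint.h>
#include <stdbool.h>

enum sketch_compress { SKETCH_COMPRESS_OFF, SKETCH_COMPRESS_LZ4 };

struct sketch_read_plan {
	uint64_t alloc_size;	/* size of the hdr's data block */
	bool	 raw;		/* issue the I/O without decompression */
};

static struct sketch_read_plan
sketch_plan_read(enum sketch_compress bp_compress, uint64_t bp_psize,
    uint64_t bp_lsize, bool compressed_arc_enabled)
{
	struct sketch_read_plan p;

	if (compressed_arc_enabled && bp_compress != SKETCH_COMPRESS_OFF) {
		/* keep the on-disk (compressed) bytes in memory */
		p.alloc_size = bp_psize;
		p.raw = true;
	} else {
		/* pipeline decompresses; cache the logical bytes */
		p.alloc_size = bp_lsize;
		p.raw = false;
	}
	return (p);
}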
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; } + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = hdr->b_l2hdr.b_compress; - b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -4528,6 +4908,11 @@ top: vd = NULL; } + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + if (hash_lock != NULL) mutex_exit(hash_lock); @@ -4535,19 +4920,15 @@ top: * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ - ASSERT3U(hdr->b_size, ==, size); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_phys_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - if (priority == ZIO_PRIORITY_ASYNC_READ) - hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; - else - hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: @@ -4569,15 +4950,13 @@ top: cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; - cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && - addr + size < vd->vdev_psize - + addr + lsize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* @@ -4586,26 +4965,20 @@ top: * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
*/ - if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, - l2arc_read_done, cb, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); - } else { - rzio = zio_read_phys(pio, vd, addr, - b_asize, buf->b_data, - ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY, B_FALSE); - } + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + size, hdr->b_l1hdr.b_pdata, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -4635,8 +5008,8 @@ top: } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); @@ -4689,20 +5062,6 @@ arc_remove_prune_callback(arc_prune_t *p) kmem_free(p, sizeof (*p)); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || - func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* * Notify the arc that a block was freed, and thus will never be used again. */ @@ -4718,87 +5077,40 @@ arc_freed(spa_t *spa, const blkptr_t *bp) hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; - if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - mutex_exit(hash_lock); - arc_release(buf, FTAG); - (void) arc_buf_remove_ref(buf, FTAG); + /* + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O. + * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); + mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } -/* - * Clear the user eviction callback set by arc_set_callback(), first calling - * it if it exists. Because the presence of a callback keeps an arc_buf cached - * clearing the callback may result in the arc_buf being destroyed. However, - * it will not result in the *last* arc_buf being destroyed, hence the data - * will remain cached in the ARC. We make a copy of the arc buffer here so - * that we can process the callback without holding any locks. - * - * It's possible that the callback is already in the process of being cleared - * by another thread. In this case we can not clear the callback. - * - * Returns B_TRUE if the callback was successfully called and cleared. - */ -boolean_t -arc_clear_callback(arc_buf_t *buf) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - arc_evict_func_t *efunc = buf->b_efunc; - void *private = buf->b_private; - - mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - mutex_exit(&buf->b_evict_lock); - return (B_FALSE); - } else if (buf->b_data == NULL) { - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - mutex_exit(&buf->b_evict_lock); - VERIFY0(efunc(private)); - return (B_TRUE); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - - ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, - hdr->b_l1hdr.b_datacnt); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); - - buf->b_efunc = NULL; - buf->b_private = NULL; - - if (hdr->b_l1hdr.b_datacnt > 1) { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, TRUE); - } else { - ASSERT(buf == hdr->b_l1hdr.b_buf); - hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; - mutex_exit(&buf->b_evict_lock); - } - - mutex_exit(hash_lock); - VERIFY0(efunc(private)); - return (B_TRUE); -} - /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. @@ -4832,16 +5144,19 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(buf->b_efunc, ==, NULL); - ASSERT3P(buf->b_private, ==, NULL); - hdr->b_l1hdr.b_arc_access = 0; + + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); return; @@ -4882,79 +5197,116 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? 
*/ - if (hdr->b_l1hdr.b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - uint32_t flags = hdr->b_flags; + arc_buf_t *lastbuf = NULL; + VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + /* * Pull the data off of this hdr and attach it to - * a new anonymous hdr. + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_l1hdr.b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; - buf->b_next = NULL; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } - ASSERT3P(state, !=, arc_l2c_only); - - (void) refcount_remove_many( - &state->arcs_size, hdr->b_size, buf); - - if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - uint64_t *size; - - ASSERT3P(state, !=, arc_l2c_only); - size = &state->arcs_lsize[type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); /* - * We're releasing a duplicate user data buffer, update - * our statistics accordingly. + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. */ - if (HDR_ISTYPE_DATA(hdr)) { - ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); - ARCSTAT_INCR(arcstat_duplicate_buffers_size, - -hdr->b_size); + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. + */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); } - hdr->b_l1hdr.b_datacnt -= 1; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(state, !=, arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); + + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + ASSERT3P(state, !=, arc_l2c_only); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); + } + + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
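When a buffer is released from a header that still holds other buffers, the loop above unlinks it while remembering the last surviving buffer, and any data-block sharing is handed off to that survivor. A simplified, self-contained sketch of the same hand-off; sk_buf, sk_hdr and the boolean shared flag are hypothetical stand-ins for arc_buf_t, arc_buf_hdr_t and arc_unshare_buf()/arc_share_buf():

#include <stddef.h>
#include <stdbool.h>

struct sk_buf {
	struct sk_buf	*next;
	bool		 shared;	/* aliases the hdr's data block */
};

struct sk_hdr {
	struct sk_buf	*bufs;		/* singly linked list of buffers */
};

static void
sk_release_buf(struct sk_hdr *hdr, struct sk_buf *buf)
{
	struct sk_buf **bufp = &hdr->bufs;
	struct sk_buf *lastbuf = NULL;

	/* unlink buf and remember the last buffer left on the list */
	while (*bufp != NULL) {
		if (*bufp == buf)
			*bufp = buf->next;
		if (*bufp != NULL) {
			lastbuf = *bufp;
			bufp = &(*bufp)->next;
		}
	}
	buf->next = NULL;

	/* move the sharing role from the released buf to the survivor */
	if (buf->shared && lastbuf != NULL) {
		buf->shared = false;	/* stands in for arc_unshare_buf() */
		lastbuf->shared = true;	/* stands in for arc_share_buf()  */
	}
}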
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_bufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; nhdr->b_l1hdr.b_mfu_ghost_hits = 0; nhdr->b_l1hdr.b_l2_hits = 0; - nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_flags |= arc_bufc_to_flags(type); - nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; - - nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_datacnt = 1; - nhdr->b_l1hdr.b_state = arc_anon; - nhdr->b_l1hdr.b_arc_access = 0; - nhdr->b_l1hdr.b_tmp_cdata = NULL; - nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); - (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -4973,8 +5325,6 @@ arc_release(arc_buf_t *buf, void *tag) buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; } int @@ -5008,28 +5358,83 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + enum zio_compress compress; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); - callback->awcb_ready(zio, buf, callback->awcb_private); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. - * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback as invoked. */ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_l1hdr.b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); + arc_buf_unwatch(buf); + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } } - mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. 
Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); } static void @@ -5060,9 +5465,11 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { @@ -5070,7 +5477,7 @@ arc_write_done(zio_t *zio) hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* @@ -5079,7 +5486,7 @@ arc_write_done(zio_t *zio) * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -5113,19 +5520,19 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -5136,7 +5543,7 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, @@ -5146,16 +5553,14 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_l1hdr.b_acb == NULL); - ASSERT(hdr->b_l1hdr.b_datacnt > 0); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_FLAG_L2CACHE; - if (l2arc_compress) - hdr->b_flags |= ARC_FLAG_L2COMPRESS; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 
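arc_write_ready() above either copies the pipeline's compressed output into a private b_pdata block or, for uncompressed blocks, shares the caller's buffer with the hdr. The following standalone sketch mirrors that decision with assumed sk_* types and malloc/memcpy in place of the kernel allocators:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

struct sk_whdr {
	void	*pdata;		/* what the ARC will keep cached */
	bool	 shared;	/* pdata aliases the caller's buffer */
};

static int
sk_write_ready(struct sk_whdr *hdr, bool bp_compressed,
    const void *io_data, uint64_t psize, void *caller_buf)
{
	if (bp_compressed) {
		/* keep a private copy of the on-disk (compressed) bytes */
		hdr->pdata = malloc(psize);
		if (hdr->pdata == NULL)
			return (-1);
		memcpy(hdr->pdata, io_data, psize);
		hdr->shared = false;
	} else {
		/* uncompressed: share the caller's logical buffer */
		hdr->pdata = caller_buf;
		hdr->shared = true;
	}
	return (0);
}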
callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5164,7 +5569,30 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, + /* + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). + */ + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -5191,7 +5619,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) last_txg = txg; page_load = 0; } - /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to @@ -5270,12 +5697,14 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } @@ -5288,8 +5717,10 @@ arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); - evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; - evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int @@ -5342,7 +5773,7 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ - ASSERT(!BUF_EMPTY(hdr)); + ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given @@ -5441,92 +5872,15 @@ arc_tuning_update(void) } -void -arc_init(void) +static void +arc_state_init(void) { - /* - * allmem is "all memory that we could possibly use". 
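Before the write is issued, arc_write() above discards the hdr's stale physical copy: a shared copy is handed back to the caller's buffer, a private copy is freed. A self-contained sketch of that cleanup, with sk_owner as a hypothetical stand-in for the ownership state the real header tracks:

#include <stdlib.h>
#include <stdbool.h>

struct sk_owner {
	void	*pdata;		/* hdr's cached physical copy, if any */
	bool	 shared;	/* pdata aliases the caller's buffer  */
};

static void
sk_discard_stale_pdata(struct sk_owner *o)
{
	if (o->pdata == NULL)
		return;
	if (o->shared) {
		/* caller's buffer takes sole ownership; nothing to free */
		o->shared = false;
	} else {
		/* stands in for arc_hdr_free_pdata() */
		free(o->pdata);
	}
	o->pdata = NULL;
}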
- */ -#ifdef _KERNEL - uint64_t allmem = ptob(physmem); -#else - uint64_t allmem = (physmem * PAGESIZE) / 2; -#endif - uint64_t percent; - - mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); - cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); - - mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); - - /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; - -#ifdef _KERNEL - /* - * Register a shrinker to support synchronous (direct) memory - * reclaim from the arc. This is done to prevent kswapd from - * swapping out pages when it is preferable to shrink the arc. - */ - spl_register_shrinker(&arc_shrinker); - - /* Set to 1/64 of all memory or a minimum of 512K */ - arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024)); - arc_need_free = 0; -#endif - - /* Set max to 1/2 of all memory */ - arc_c_max = allmem / 2; - - /* - * In userland, there's only the memory pressure that we artificially - * create (see arc_available_memory()). Don't let arc_c get too - * small, because it can cause transactions to be larger than - * arc_c, causing arc_tempreserve_space() to fail. - */ -#ifndef _KERNEL - arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); -#else - arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT; -#endif - - arc_c = arc_c_max; - arc_p = (arc_c >> 1); - - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; - /* Initialize maximum observed usage to zero */ - arc_meta_max = 0; - /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. - */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); - percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_limit = (percent * arc_meta_limit) / 100; - - /* Apply user specified tunings */ - arc_tuning_update(); - - if (zfs_arc_num_sublists_per_state < 1) - zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); - - /* if kmem_flags are set, lets try to use less memory */ - if (kmem_debugging()) - arc_c = arc_c / 2; - if (arc_c < arc_c_min) - arc_c = arc_c_min; - arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; - arc_size = 0; multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), @@ -5569,12 +5923,18 @@ arc_init(void) offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); - arc_anon->arcs_state = ARC_STATE_ANON; - arc_mru->arcs_state = ARC_STATE_MRU; - arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; - arc_mfu->arcs_state = ARC_STATE_MFU; - arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; - arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + 
refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_anon->arcs_size); refcount_create(&arc_mru->arcs_size); @@ -5583,19 +5943,142 @@ arc_init(void) refcount_create(&arc_mfu_ghost->arcs_size); refcount_create(&arc_l2c_only->arcs_size); + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); +} + +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + +void +arc_init(void) +{ + /* + * allmem is "all memory that we could possibly use". + */ +#ifdef _KERNEL + uint64_t allmem = ptob(physmem); +#else + uint64_t allmem = (physmem * PAGESIZE) / 2; +#endif + uint64_t percent; + + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + +#ifdef _KERNEL + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); + + /* Set to 1/64 of all memory or a minimum of 512K */ + arc_sys_free = MAX(ptob(physmem / 64), (512 * 1024)); + arc_need_free = 0; +#endif + + /* Set max to 1/2 of all memory */ + arc_c_max = allmem / 2; + + /* + * In userland, there's only the memory pressure that we artificially + * create (see arc_available_memory()). Don't let arc_c get too + * small, because it can cause transactions to be larger than + * arc_c, causing arc_tempreserve_space() to fail. 
+ */ +#ifndef _KERNEL + arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); +#else + arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT; +#endif + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + arc_size = 0; + + /* Set min to 1/2 of arc_c_min */ + arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; + /* Initialize maximum observed usage to zero */ + arc_meta_max = 0; + /* + * Set arc_meta_limit to a percent of arc_c_max with a floor of + * arc_meta_min, and a ceiling of arc_c_max. + */ + percent = MIN(zfs_arc_meta_limit_percent, 100); + arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); + percent = MIN(zfs_arc_dnode_limit_percent, 100); + arc_dnode_limit = (percent * arc_meta_limit) / 100; + + /* Apply user specified tunings */ + arc_tuning_update(); + + if (zfs_arc_num_sublists_per_state < 1) + zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + arc_state_init(); buf_init(); - arc_reclaim_thread_exit = FALSE; - arc_user_evicts_thread_exit = FALSE; list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); - arc_eviction_list = NULL; mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_reclaim_thread_exit = B_FALSE; + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -5608,10 +6091,7 @@ arc_init(void) (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, defclsyspri); - (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, defclsyspri); - - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; /* @@ -5644,10 +6124,10 @@ arc_fini(void) #endif /* _KERNEL */ mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = TRUE; + arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to - * FALSE when it is finished exiting; we're waiting for that. + * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); @@ -5655,22 +6135,10 @@ arc_fini(void) } mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_user_evicts_lock); - arc_user_evicts_thread_exit = TRUE; - /* - * The user evicts thread will set arc_user_evicts_thread_exit - * to FALSE when it is finished exiting; we're waiting for that. 
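A worked example of the sizing arithmetic above, under assumed inputs (16 GiB of usable memory, a 16 MiB maximum block size, and a metadata limit of 75 percent); the real values come from the module tunables and arc_tuning_update():

#include <stdint.h>
#include <stdio.h>

#define	SK_MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	/* assumed inputs: 16 GiB of memory, 16 MiB max block, 75 percent */
	uint64_t allmem = 16ULL << 30;
	uint64_t spa_maxblockshift = 24;
	uint64_t meta_limit_percent = 75;

	uint64_t arc_c_max = allmem / 2;			/* 8 GiB  */
	uint64_t arc_c_min = 2ULL << spa_maxblockshift;		/* 32 MiB */
	uint64_t arc_meta_min = 1ULL << spa_maxblockshift;	/* 16 MiB */
	uint64_t arc_meta_limit = SK_MAX(arc_meta_min,
	    (meta_limit_percent * arc_c_max) / 100);		/* 6 GiB  */

	printf("c_max=%llu c_min=%llu meta_limit=%llu\n",
	    (unsigned long long)arc_c_max,
	    (unsigned long long)arc_c_min,
	    (unsigned long long)arc_meta_limit);
	return (0);
}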
- */ - while (arc_user_evicts_thread_exit) { - cv_signal(&arc_user_evicts_cv); - cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); - } - mutex_exit(&arc_user_evicts_lock); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - /* Use TRUE to ensure *all* buffers are evicted */ - arc_flush(NULL, TRUE); - - arc_dead = TRUE; + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -5695,27 +6163,7 @@ arc_fini(void) cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); - mutex_destroy(&arc_user_evicts_lock); - cv_destroy(&arc_user_evicts_cv); - - refcount_destroy(&arc_anon->arcs_size); - refcount_destroy(&arc_mru->arcs_size); - refcount_destroy(&arc_mru_ghost->arcs_size); - refcount_destroy(&arc_mfu->arcs_size); - refcount_destroy(&arc_mfu_ghost->arcs_size); - refcount_destroy(&arc_l2c_only->arcs_size); - - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); - multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); - + arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); @@ -5845,7 +6293,6 @@ arc_fini(void) * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers - * l2arc_nocompress skip compressing buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this @@ -6005,9 +6452,13 @@ l2arc_do_free_on_write(void) for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6030,13 +6481,13 @@ l2arc_write_done(zio_t *zio) int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); + ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -6094,45 +6545,30 @@ top: */ ASSERT(HDR_HAS_L1HDR(hdr)); - /* - * We may have allocated a buffer for L2ARC compression, - * we must release it to avoid leaking this data. - */ - l2arc_release_cdata_buf(hdr); - /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. 
*/ - if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) { - list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; - hdr->b_flags &= ~ARC_FLAG_L2CACHE; - - ARCSTAT_BUMP(arcstat_l2_writes_skip_toobig); - - (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); - } else if (zio->io_error != 0) { + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); - ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); - bytes_dropped += hdr->b_l2hdr.b_asize; + bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, - hdr->b_l2hdr.b_asize, hdr); + arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } @@ -6159,43 +6595,36 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); - hash_lock = HDR_LOCK(buf->b_hdr); + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); - hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - /* - * If the buffer was compressed, decompress it first. - */ - if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) - l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); - ASSERT(zio->io_data != NULL); - ASSERT3U(zio->io_size, ==, hdr->b_size); - ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -6208,7 +6637,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -6221,9 +6650,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, hdr->b_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -6373,12 +6803,11 @@ top: */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - hdr->b_flags |= ARC_FLAG_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } @@ -6399,37 +6828,23 @@ top: * the delta by which the device hand has changed due to alignment). */ static uint64_t -l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - boolean_t *headroom_boost) +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, buf_compress_minsz, - stats_size; - void *buf_data; + uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; - const boolean_t do_headroom_boost = *headroom_boost; - ASSERT(dev->l2ad_vdev != NULL); - - /* Lower the flag now, we might want to raise it again later. */ - *headroom_boost = B_FALSE; + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = write_asize = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); - head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; - head->b_flags |= ARC_FLAG_HAS_L2HDR; - - /* - * We will want to try to compress buffers that are at least 2x the - * device sector size. - */ - buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. 
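l2arc_read_done() above validates the bytes read from the cache device against the block pointer's checksum and, if the copy is unusable, falls back to a fresh read of the original block. A control-flow sketch with assumed helper names in place of arc_cksum_is_equal() and zio_read():

#include <stdbool.h>

/* assumed helpers standing in for the real checksum and read routines */
extern bool sk_cksum_matches_bp(const void *data, const void *bp);
extern void sk_reissue_read_from_pool(const void *bp);
extern void sk_deliver_to_arc(const void *data);

static void
sk_l2arc_read_done(const void *data, const void *bp, int io_error)
{
	if (io_error == 0 && sk_cksum_matches_bp(data, bp)) {
		sk_deliver_to_arc(data);	/* like arc_read_done() */
	} else {
		/* L2ARC copy unusable: retry from the main pool */
		sk_reissue_read_from_pool(bp);
	}
}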
@@ -6450,13 +6865,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; - if (do_headroom_boost) + if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; - uint64_t buf_sz; - uint64_t buf_a_sz; + uint64_t asize, size; + void *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -6471,7 +6886,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += hdr->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. @@ -6485,15 +6900,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - /* - * Assume that the buffer is not going to be compressed - * and could take more space on disk because of a larger - * disk block size. - */ - buf_sz = hdr->b_size; - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - - if ((write_asize + buf_a_sz) > target_sz) { + if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -6517,63 +6924,76 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ZIO_FLAG_CANFAIL); } - /* - * Create and add a new L2ARC header. - */ hdr->b_l2hdr.b_dev = dev; - hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* - * Temporarily stash the data buffer in b_tmp_cdata. - * The subsequent write step will pick it up from - * there. This is because can't access b_l1hdr.b_buf - * without holding the hash_lock, which we in turn - * can't access without holding the ARC list locks - * (which we want to avoid during compression/writing) - */ - hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; - hdr->b_l2hdr.b_asize = hdr->b_size; hdr->b_l2hdr.b_hits = 0; - hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; - /* - * Explicitly set the b_daddr field to a known - * value which means "invalid address". This - * enables us to differentiate which stage of - * l2arc_write_buffers() the particular header - * is in (e.g. this loop, or the one below). - * ARC_FLAG_L2_WRITING is not enough to make - * this distinction, and we need to know in - * order to do proper l2arc vdev accounting in - * arc_release() and arc_hdr_destroy(). - * - * Note, we can't use a new flag to distinguish - * the two stages because we don't hold the - * header's hash_lock below, in the second stage - * of this function. Thus, we can't simply - * change the b_flags field to denote that the - * IO has been sent. We can change the b_daddr - * field of the L2 portion, though, since we'll - * be holding the l2ad_mtx; which is why we're - * using it to denote the header's state change. - */ - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
*/ - arc_cksum_verify(hdr->b_l1hdr.b_buf); - arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + size = arc_hdr_size(hdr); + + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); + + /* + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. + */ + if (!HDR_SHARED_DATA(hdr)) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(size); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(size); + } + + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + l2arc_free_data_on_write(to_write, size, type); + } + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, size, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + write_sz += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + + write_asize += size; + /* + * Keep the clock hand suitably device-aligned. + */ + asize = vdev_psize_to_asize(dev->l2ad_vdev, size); + write_psize += asize; + dev->l2ad_hand += asize; mutex_exit(hash_lock); - write_sz += buf_sz; - write_asize += buf_a_sz; + (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); @@ -6590,114 +7010,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (0); } - mutex_enter(&dev->l2ad_mtx); - - /* - * Note that elsewhere in this file arcstat_l2_asize - * and the used space on l2ad_vdev are updated using b_asize, - * which is not necessarily rounded up to the device block size. - * Too keep accounting consistent we do the same here as well: - * stats_size accumulates the sum of b_asize of the written buffers, - * while write_asize accumulates the sum of b_asize rounded up - * to the device block size. - * The latter sum is used only to validate the corectness of the code. - */ - stats_size = 0; - write_asize = 0; - - /* - * Now start writing the buffers. We're starting at the write head - * and work backwards, retracing the course of the buffer selector - * loop above. - */ - for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; - hdr = list_prev(&dev->l2ad_buflist, hdr)) { - uint64_t buf_sz; - - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ - ASSERT(HDR_HAS_L1HDR(hdr)); - - /* - * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_FLAG_L2_WRITING in the previous step, but we must - * take care to only access its L2 cache parameters. In - * particular, hdr->l1hdr.b_buf may be invalid by now due to - * ARC eviction. - */ - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - - if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) && - hdr->b_l2hdr.b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(hdr)) { - /* - * If compression succeeded, enable headroom - * boost on the next scan cycle. 
- */ - *headroom_boost = B_TRUE; - } - } - - /* - * Pick up the buffer data we had previously stashed away - * (and now potentially also compressed). - */ - buf_data = hdr->b_l1hdr.b_tmp_cdata; - buf_sz = hdr->b_l2hdr.b_asize; - - /* - * We need to do this regardless if buf_sz is zero or - * not, otherwise, when this l2hdr is evicted we'll - * remove a reference that was never added. - */ - (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); - - /* Compression may have squashed the buffer to zero length. */ - if (buf_sz != 0) { - uint64_t buf_a_sz; - - /* - * Buffers which are larger than l2arc_max_block_size - * after compression are skipped and removed from L2 - * eligibility. - */ - if (buf_sz > l2arc_max_block_size) { - hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; - continue; - } - - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - (void) zio_nowait(wzio); - - stats_size += buf_sz; - - /* - * Keep the clock hand suitably device-aligned. - */ - buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); - write_asize += buf_a_sz; - dev->l2ad_hand += buf_a_sz; - } - } - - mutex_exit(&dev->l2ad_mtx); - ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - ARCSTAT_INCR(arcstat_l2_asize, stats_size); - vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. @@ -6715,186 +7033,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, return (write_asize); } -/* - * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its - * size in l2hdr->b_asize. This routine tries to compress the data and - * depending on the compression result there are three possible outcomes: - * *) The buffer was incompressible. The original l2hdr contents were left - * untouched and are ready for writing to an L2 device. - * *) The buffer was all-zeros, so there is no need to write it to an L2 - * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is - * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. - * *) Compression succeeded and b_tmp_cdata was replaced with a temporary - * data buffer which holds the compressed data to be written, and b_asize - * tells us how much data there is. b_compress is set to the appropriate - * compression algorithm. Once writing is done, invoke - * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. - * - * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the - * buffer was incompressible). 
- */ -static boolean_t -l2arc_compress_buf(arc_buf_hdr_t *hdr) -{ - void *cdata; - size_t csize, len, rounded; - l2arc_buf_hdr_t *l2hdr; - - ASSERT(HDR_HAS_L2HDR(hdr)); - - l2hdr = &hdr->b_l2hdr; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - - len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - ASSERT3P(cdata, !=, NULL); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, - cdata, l2hdr->b_asize); - - rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } - - if (csize == 0) { - /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; - l2hdr->b_asize = 0; - hdr->b_l1hdr.b_tmp_cdata = NULL; - ARCSTAT_BUMP(arcstat_l2_compress_zeros); - return (B_TRUE); - } else if (csize > 0 && csize < len) { - /* - * Compression succeeded, we'll keep the cdata around for - * writing and release it afterwards. - */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; - l2hdr->b_asize = csize; - hdr->b_l1hdr.b_tmp_cdata = cdata; - ARCSTAT_BUMP(arcstat_l2_compress_successes); - return (B_TRUE); - } else { - /* - * Compression failed, release the compressed buffer. - * l2hdr will be left unmodified. - */ - zio_data_buf_free(cdata, len); - ARCSTAT_BUMP(arcstat_l2_compress_failures); - return (B_FALSE); - } -} - -/* - * Decompresses a zio read back from an l2arc device. On success, the - * underlying zio's io_data buffer is overwritten by the uncompressed - * version. On decompression error (corrupt compressed stream), the - * zio->io_error value is set to signal an I/O error. - * - * Please note that the compressed data stream is not checksummed, so - * if the underlying device is experiencing data corruption, we may feed - * corrupt data to the decompressor, so the decompressor needs to be - * able to handle this situation (LZ4 does). - */ -static void -l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) -{ - uint64_t csize; - void *cdata; - - ASSERT(L2ARC_IS_VALID_COMPRESS(c)); - - if (zio->io_error != 0) { - /* - * An io error has occured, just restore the original io - * size in preparation for a main pool read. - */ - zio->io_orig_size = zio->io_size = hdr->b_size; - return; - } - - if (c == ZIO_COMPRESS_EMPTY) { - /* - * An empty buffer results in a null zio, which means we - * need to fill its io_data after we're done restoring the - * buffer's contents. - */ - ASSERT(hdr->b_l1hdr.b_buf != NULL); - bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; - } else { - ASSERT(zio->io_data != NULL); - /* - * We copy the compressed data from the start of the arc buffer - * (the zio_read will have pulled in only what we need, the - * rest is garbage which we will overwrite at decompression) - * and then decompress back to the ARC data buffer. This way we - * can minimize copying by simply decompressing back over the - * original compressed data (rather than decompressing to an - * aux buffer and then copying back the uncompressed buffer, - * which is likely to be much larger). - */ - csize = zio->io_size; - cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, - hdr->b_size) != 0) - zio->io_error = EIO; - zio_data_buf_free(cdata, csize); - } - - /* Restore the expected uncompressed IO size. 
*/ - zio->io_orig_size = zio->io_size = hdr->b_size; -} - -/* - * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. - * This buffer serves as a temporary holder of compressed data while - * the buffer entry is being written to an l2arc device. Once that is - * done, we can dispose of it. - */ -static void -l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) -{ - enum zio_compress comp; - - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(HDR_HAS_L2HDR(hdr)); - comp = hdr->b_l2hdr.b_compress; - ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - - if (comp == ZIO_COMPRESS_OFF) { - /* - * In this case, b_tmp_cdata points to the same buffer - * as the arc_buf_t's b_data field. We don't want to - * free it, since the arc_buf_t will handle that. - */ - hdr->b_l1hdr.b_tmp_cdata = NULL; - } else if (comp == ZIO_COMPRESS_EMPTY) { - /* - * In this case, b_tmp_cdata was compressed to an empty - * buffer, thus there's nothing to free and b_tmp_cdata - * should have been set to NULL in l2arc_write_buffers(). - */ - ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); - } else { - /* - * If the data was compressed, then we've allocated a - * temporary buffer for it, so now we need to release it. - */ - ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); - zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, - hdr->b_size); - hdr->b_l1hdr.b_tmp_cdata = NULL; - } - -} - /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -6907,7 +7045,6 @@ l2arc_feed_thread(void) spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); - boolean_t headroom_boost = B_FALSE; fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -6947,7 +7084,7 @@ l2arc_feed_thread(void) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to @@ -6980,7 +7117,7 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); + wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. 
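/*
 * A small worked example of the size accounting in the rewritten
 * l2arc_write_buffers() above. The identifiers are the ones used in the
 * hunks above; the concrete numbers and the 4 KiB-sector (ashift=12)
 * cache device are illustrative assumptions, not values from this patch.
 *
 * Hypothetical header: 128 KiB logical block whose b_pdata is held
 * compressed at 34 KiB:
 *
 *	size  = arc_hdr_size(hdr)                         = 34 KiB
 *	asize = vdev_psize_to_asize(dev->l2ad_vdev, size) = 36 KiB
 *
 *	write_sz    += HDR_GET_LSIZE(hdr)  ->  +128 KiB  (feeds arcstat_l2_size)
 *	write_asize += size                ->   +34 KiB  (feeds arcstat_l2_write_bytes,
 *	                                                  arcstat_l2_asize, and the
 *	                                                  function's return value)
 *	write_psize += asize               ->   +36 KiB
 *	dev->l2ad_hand += asize            ->   +36 KiB  (keeps the hand device-aligned)
 */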
@@ -7075,7 +7212,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list @@ -7164,7 +7301,6 @@ l2arc_stop(void) EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); -EXPORT_SYMBOL(arc_buf_remove_ref); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); @@ -7211,12 +7347,12 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); module_param(zfs_arc_p_min_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); -module_param(zfs_disable_dup_eviction, int, 0644); -MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction"); - module_param(zfs_arc_average_blocksize, int, 0444); MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size"); +module_param(zfs_compressed_arc_enabled, int, 0644); +MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers"); + module_param(zfs_arc_min_prefetch_lifespan, int, 0644); MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); @@ -7236,9 +7372,6 @@ MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); module_param(l2arc_headroom_boost, ulong, 0644); MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier"); -module_param(l2arc_max_block_size, ulong, 0644); -MODULE_PARM_DESC(l2arc_max_block_size, "Skip L2ARC buffers larger than N"); - module_param(l2arc_feed_secs, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); @@ -7248,9 +7381,6 @@ MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); module_param(l2arc_noprefetch, int, 0644); MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); -module_param(l2arc_nocompress, int, 0644); -MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers"); - module_param(l2arc_feed_again, int, 0644); MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d9b414564..c334c8088 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -45,6 +45,7 @@ #include #include #include +#include struct dbuf_hold_impl_data { /* Function arguments */ @@ -71,13 +72,13 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); +uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. A nonzero value indicates a potential performance problem. */ uint64_t zfs_free_range_recv_miss; -static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -89,9 +90,76 @@ extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs are added to the dbuf cache once the last hold is released. If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +unsigned long dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. */ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e. max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. If the cache size continues to grow and hits the high + * water mark, then callers adding elements to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size.
+ */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) @@ -101,7 +169,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag) mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); + multilist_link_init(&db->db_cache_link); return (0); } @@ -113,6 +183,7 @@ dbuf_dest(void *vdb, void *unused) dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -142,8 +213,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -158,7 +227,7 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) uint64_t idx; dmu_buf_impl_t *db; - hv = DBUF_HASH(os, obj, level, blkid); + hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); @@ -211,7 +280,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) dmu_buf_impl_t *dbf; blkid = db->db_blkid; - hv = DBUF_HASH(os, obj, level, blkid); + hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); @@ -245,7 +314,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; - hv = DBUF_HASH(db->db_objset, db->db.db_object, + hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; @@ -269,8 +338,6 @@ dbuf_hash_remove(dmu_buf_impl_t *db) atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t dbuf_do_evict; - typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING @@ -358,17 +425,186 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } -void -dbuf_evict(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); - dbuf_clear(db); - dbuf_destroy(db); +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) +{ + dmu_buf_impl_t *db = obj; + + /* + * The assumption here, is the hash value for a given + * dmu_buf_impl_t will remain constant throughout it's lifetime + * (i.e. it's objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
+ */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); } +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + dmu_buf_impl_t *db; + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached it's maximum size, dbufs are removed + * and destroyed. The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. + */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the callers context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. 
This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; + + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } +} + + + void dbuf_init(void) { @@ -403,7 +639,7 @@ retry: goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); @@ -412,11 +648,31 @@ retry: dbuf_stats_init(h); + /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -438,8 +694,23 @@ dbuf_fini(void) #else kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -598,7 +869,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - db->db_buf = NULL; + ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; @@ -613,8 +884,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -625,6 +894,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid 
!= DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; @@ -636,6 +906,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } @@ -704,7 +975,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -759,7 +1030,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); @@ -797,8 +1068,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -918,7 +1187,7 @@ dbuf_noread(dmu_buf_impl_t *db) ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); @@ -976,9 +1245,10 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; - dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { + db->db_buf = NULL; dbuf_clear_data(db); } } @@ -1102,7 +1372,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -1210,7 +1480,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -1221,7 +1491,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db)); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -1628,7 +1898,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); + arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1637,12 +1907,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); 
return (B_TRUE); } @@ -1807,7 +2073,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db)); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1825,10 +2091,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db)); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1840,61 +2106,64 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_clear_callback() - * ARC: dbuf_do_evict()->dbuf_destroy() - * - * This routine will dissociate the dbuf from the arc, by calling - * arc_clear_callback(), but will not evict the data from the ARC. - */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; - boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DMU_BONUS_BLKID) { - int slots = DB_DNODE(db)->dn_num_slots; - int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); - zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_BONUS); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; - if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), @@ -1905,15 +2174,25 @@ dbuf_clear(dmu_buf_impl_t *db) */ dnode_rele(dn, db); db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_clear_callback(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); /* * If this dbuf is referenced from an indirect dbuf, @@ -2035,7 +2314,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -2084,7 +2363,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } @@ -2109,76 +2388,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, return (db); } -static int -dbuf_do_evict(void *private) -{ - dmu_buf_impl_t *db = private; - - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); - - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); -} - -static void -dbuf_destroy(dmu_buf_impl_t *db) -{ - ASSERT(refcount_is_zero(&db->db_holds)); - - if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode_handle != NULL) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - mutex_enter(&dn->dn_dbufs_mtx); - avl_remove(&dn->dn_dbufs, db); - atomic_dec_32(&dn->dn_dbufs_count); - mutex_exit(&dn->dn_dbufs_mtx); - DB_DNODE_EXIT(db); - /* - * Decrementing the dbuf count means that the hold - * corresponding to the removed dbuf is no longer - * discounted in dnode_move(), so the dnode cannot be - * moved until after we release the hold. 
- */ - dnode_rele(dn, db); - db->db_dnode_handle = NULL; - } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; - - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); - - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); -} - typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ @@ -2218,10 +2433,37 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; @@ -2250,7 +2492,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } - (void) arc_buf_remove_ref(abuf, private); + + arc_buf_destroy(abuf, private); } /* @@ -2348,6 +2591,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; @@ -2397,7 +2641,7 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); *(dh->dh_dbp) = NULL; -top: + /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); @@ -2433,18 +2677,8 @@ top: return (SET_ERROR(ENOENT)); } - if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { - arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); - if (dh->dh_db->db_buf->b_data == NULL) { - dbuf_clear(dh->dh_db); - if (dh->dh_parent) { - dbuf_rele(dh->dh_parent, NULL); - dh->dh_parent = NULL; - } - goto top; - } + if 
(dh->dh_db->db_buf != NULL) ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); - } ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); @@ -2463,13 +2697,19 @@ top: dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db); dbuf_set_data(dh->dh_db, - arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, + arc_alloc_buf(dh->dh_dn->dn_objset->os_spa, dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); bcopy(dh->dh_dr->dt.dl.dr_data->b_data, dh->dh_db->db.db_data, dh->dh_db->db.db_size); } } + if (multilist_link_active(&dh->dh_db->db_cache_link)) { + ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); + multilist_remove(&dbuf_cache, dh->dh_db); + (void) refcount_remove_many(&dbuf_cache_size, + dh->dh_db->db.db_size, dh->dh_db); + } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); DBUF_VERIFY(dh->dh_db); mutex_exit(&dh->dh_db->db_mtx); @@ -2595,7 +2835,8 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { - VERIFY(refcount_add(&db->db_holds, tag) > 1); + int64_t holds = refcount_add(&db->db_holds, tag); + VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref @@ -2666,8 +2907,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) @@ -2712,55 +2955,44 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); - VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(!arc_buf_remove_ref(db->db_buf, db)); + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); - /* - * A dbuf will be eligible for eviction if either the - * 'primarycache' property is set or a duplicate - * copy of this buffer is already cached in the arc. - * - * In the case of the 'primarycache' a buffer - * is considered for eviction if it matches the - * criteria set in the property. - * - * To decide if our buffer is considered a - * duplicate, we must call into the arc to determine - * if multiple buffers are referencing the same - * block on-disk. If so, then we simply evict - * ourselves. 
- */ - if (!DBUF_IS_CACHEABLE(db)) { - if (db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - spa_t *spa = - dmu_objset_spa(db->db_objset); - blkptr_t bp = *db->db_blkptr; - dbuf_clear(db); - arc_freed(spa, &bp); - } else { - dbuf_clear(db); - } - } else if (db->db_pending_evict || - arc_buf_eviction_needed(db->db_buf)) { - dbuf_clear(db); - } else { - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; } + + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); + mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); + } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } + } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -3099,7 +3331,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -3370,10 +3602,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db)); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; @@ -3389,8 +3618,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); @@ -3566,17 +3793,17 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, - children_ready_cb, - dbuf_write_physdone, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + &zp, dbuf_write_ready, + children_ready_cb, dbuf_write_physdone, + dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); } } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); -EXPORT_SYMBOL(dbuf_evict); +EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); @@ -3591,7 +3818,6 @@ EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); -EXPORT_SYMBOL(dbuf_clear); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); @@ -3609,4 +3835,24 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); + + +module_param(dbuf_cache_max_bytes, ulong, 0644); +MODULE_PARM_DESC(dbuf_cache_max_bytes, + "Maximum size in bytes of the dbuf cache."); + +module_param(dbuf_cache_hiwater_pct, uint, 0644); +MODULE_PARM_DESC(dbuf_cache_hiwater_pct, + 
"Percentage over dbuf_cache_max_bytes when dbufs \ + much be evicted directly."); + +module_param(dbuf_cache_lowater_pct, uint, 0644); +MODULE_PARM_DESC(dbuf_cache_lowater_pct, + "Percentage below dbuf_cache_max_bytes \ + when the evict thread stop evicting dbufs."); + +module_param(dbuf_cache_max_shift, int, 0644); +MODULE_PARM_DESC(dbuf_cache_max_shift, + "Cap the size of the dbuf cache to log2 fraction of arc size."); + #endif diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 6f39f80e5..ae8ba8682 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -95,7 +95,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) abi.abi_state_type, abi.abi_state_contents, abi.abi_flags, - (ulong_t)abi.abi_datacnt, + (ulong_t)abi.abi_bufcnt, (u_longlong_t)abi.abi_size, (u_longlong_t)abi.abi_access, (ulong_t)abi.abi_mru_hits, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 2d12f8923..542adb650 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1337,7 +1337,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG)); + arc_buf_destroy(buf, FTAG); } /* @@ -1681,8 +1681,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, - NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -2040,11 +2039,11 @@ dmu_init(void) xuio_stat_init(); dmu_objset_init(); dnode_init(); - dbuf_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); + dbuf_init(); } void diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 7665d1ca5..982b96132 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -146,7 +146,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (err) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); if (err) return (err); /* Don't care about the data blocks */ diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 22ca84d96..ac98ab6f2 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -358,8 +358,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; - if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -376,14 +374,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* Increase the blocksize if we are permitted. 
*/ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -392,7 +389,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -475,8 +472,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } @@ -787,7 +783,7 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in @@ -1183,7 +1179,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 21007a9d1..587a29fd4 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -647,7 +647,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; @@ -659,7 +659,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; @@ -684,7 +684,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) if (zfs_send_corrupt_data) { uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ - abuf = arc_buf_alloc(spa, blksz, &abuf, + abuf = arc_alloc_buf(spa, blksz, &abuf, ARC_BUFC_DATA); for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; @@ -713,7 +713,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 4c9459412..65f82cbcd 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -386,7 +386,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + 
arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) @@ -610,7 +610,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, osp = buf->b_data; traverse_zil(td, &osp->os_zil_header); - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index abc004bd4..a54db9511 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -502,7 +502,7 @@ dnode_destroy(dnode_t *dn) } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 54066e2e3..b19f50af9 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -421,7 +421,7 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); - dbuf_clear(db); + dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); avl_remove(&dn->dn_dbufs, db_marker); @@ -445,7 +445,7 @@ dnode_evict_bonus(dnode_t *dn) if (dn->dn_bonus != NULL) { if (refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } else { dn->dn_bonus->db_pending_evict = TRUE; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 7389b4b1d..41b3ce79b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -692,7 +692,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; @@ -722,7 +722,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, cdnp, zb->zb_blkid * epb + i, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; @@ -754,7 +754,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); } return (0); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 4c460a200..1903c5954 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -227,4 +227,28 @@ refcount_transfer(refcount_t *dst, refcount_t *src) list_destroy(&removed); } +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} #endif /* ZFS_DEBUG */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index c538251c3..760f0a891 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -260,7 +260,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } return (error); @@ -297,7 +297,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); - (void) arc_buf_remove_ref(abuf, &abuf); + arc_buf_destroy(abuf, &abuf); } return (error); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3dd8cffe9..545a43d81 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -301,7 +301,7 @@ zio_data_buf_free(void *buf, size_t size) * Push and pop I/O transform buffers * ========================================================================== */ -static void +void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { @@ -319,7 +319,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -static void +void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; @@ -2390,7 +2390,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); - VERIFY(arc_buf_remove_ref(abuf, &abuf)); + arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3a5c73a6a..b05e787dc 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -187,25 +187,19 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, } int -zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, + void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { - blkptr_t *bp = zio->io_bp; - uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : - (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); - int byteswap; - int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); - uint64_t offset = zio->io_offset; - void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t actual_cksum, expected_cksum, verifier; + zio_cksum_t actual_cksum, expected_cksum; + int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); if (ci->ci_eck) { zio_eck_t *eck; + zio_cksum_t verifier; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -244,32 +238,51 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) ci->ci_func[byteswap](data, size, &actual_cksum); eck->zec_cksum = expected_cksum; - if (byteswap) + if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); + } } else { - ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](data, size, &actual_cksum); } - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; - info->zbc_checksum_name = ci->ci_name; - info->zbc_byteswapped = byteswap; - info->zbc_injected = 0; - info->zbc_has_cksum = 1; + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); - if (zio_injection_enabled && !zio->io_error && + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + void *data = zio->io_data; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } - - return (0); + return (error); }
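The zio_checksum_error_impl() factored out above makes it possible to verify a buffer against a bp without having a zio in hand. A minimal sketch of such a caller follows; the wrapper name verify_buf_against_bp() is hypothetical, and only the zio_checksum_error_impl() signature and the bp macros come from this patch:

static int
verify_buf_against_bp(spa_t *spa, blkptr_t *bp, void *data, uint64_t offset)
{
	/* Same checksum/size selection as zio_checksum_error() above. */
	enum zio_checksum checksum = BP_IS_GANG(bp) ?
	    ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
	uint64_t size = BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp);

	/* The new implementation accepts a NULL zio_bad_cksum_t. */
	return (zio_checksum_error_impl(spa, bp, checksum, data, size,
	    offset, NULL));
}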