mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-25 18:59:33 +03:00
brt: lift internal definitions into _impl header
So that zdb (and others!) can get at the BRT on-disk structures. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Kay Pedersen <mail@mkwg.de> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <robn@despairlabs.com> Closes #15541
This commit is contained in:
parent
41c4599cba
commit
d702f86eaf
@ -33,6 +33,7 @@ COMMON_H = \
|
|||||||
sys/bqueue.h \
|
sys/bqueue.h \
|
||||||
sys/btree.h \
|
sys/btree.h \
|
||||||
sys/brt.h \
|
sys/brt.h \
|
||||||
|
sys/brt_impl.h \
|
||||||
sys/dataset_kstats.h \
|
sys/dataset_kstats.h \
|
||||||
sys/dbuf.h \
|
sys/dbuf.h \
|
||||||
sys/ddt.h \
|
sys/ddt.h \
|
||||||
|
199
include/sys/brt_impl.h
Normal file
199
include/sys/brt_impl.h
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the terms of the
|
||||||
|
* Common Development and Distribution License (the "License").
|
||||||
|
* You may not use this file except in compliance with the License.
|
||||||
|
*
|
||||||
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
* or https://opensource.org/licenses/CDDL-1.0.
|
||||||
|
* See the License for the specific language governing permissions
|
||||||
|
* and limitations under the License.
|
||||||
|
*
|
||||||
|
* When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
* If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _SYS_BRT_IMPL_H
|
||||||
|
#define _SYS_BRT_IMPL_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BRT - Block Reference Table.
|
||||||
|
*/
|
||||||
|
#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We divide each VDEV into 16MB chunks. Each chunk is represented in memory
|
||||||
|
* by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
|
||||||
|
* Each element in this array represents how many BRT entries do we have in this
|
||||||
|
* chunk of storage. We always load this entire array into memory and update as
|
||||||
|
* needed. By having it in memory we can quickly tell (during zio_free()) if
|
||||||
|
* there are any BRT entries that we might need to update.
|
||||||
|
*
|
||||||
|
* This value cannot be larger than 16MB, at least as long as we support
|
||||||
|
* 512 byte block sizes. With 512 byte block size we can have exactly
|
||||||
|
* 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
|
||||||
|
* many for a 16bit counter.
|
||||||
|
*/
|
||||||
|
#define BRT_RANGESIZE (16 * 1024 * 1024)
|
||||||
|
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
|
||||||
|
"BRT_RANGESIZE is too large.");
|
||||||
|
/*
|
||||||
|
* We don't want to update the whole structure every time. Maintain bitmap
|
||||||
|
* of dirty blocks within the regions, so that a single bit represents a
|
||||||
|
* block size of entcounts. For example if we have a 1PB vdev then all
|
||||||
|
* entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
|
||||||
|
* 128MB array of entcounts into 32kB disk blocks, as we don't want to update
|
||||||
|
* the whole 128MB on disk when we have updated only a single entcount.
|
||||||
|
* We maintain a bitmap where each 32kB disk block within 128MB entcounts array
|
||||||
|
* is represented by a single bit. This gives us 4096 bits. A set bit in the
|
||||||
|
* bitmap means that we had a change in at least one of the 16384 entcounts
|
||||||
|
* that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
|
||||||
|
*/
|
||||||
|
#define BRT_BLOCKSIZE (32 * 1024)
|
||||||
|
#define BRT_RANGESIZE_TO_NBLOCKS(size) \
|
||||||
|
(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
|
||||||
|
|
||||||
|
#define BRT_LITTLE_ENDIAN 0
|
||||||
|
#define BRT_BIG_ENDIAN 1
|
||||||
|
#ifdef _ZFS_LITTLE_ENDIAN
|
||||||
|
#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
|
||||||
|
#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
|
||||||
|
#else
|
||||||
|
#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
|
||||||
|
#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct brt_vdev_phys {
|
||||||
|
uint64_t bvp_mos_entries;
|
||||||
|
uint64_t bvp_size;
|
||||||
|
uint64_t bvp_byteorder;
|
||||||
|
uint64_t bvp_totalcount;
|
||||||
|
uint64_t bvp_rangesize;
|
||||||
|
uint64_t bvp_usedspace;
|
||||||
|
uint64_t bvp_savedspace;
|
||||||
|
} brt_vdev_phys_t;
|
||||||
|
|
||||||
|
typedef struct brt_vdev {
|
||||||
|
/*
|
||||||
|
* VDEV id.
|
||||||
|
*/
|
||||||
|
uint64_t bv_vdevid;
|
||||||
|
/*
|
||||||
|
* Is the structure initiated?
|
||||||
|
* (bv_entcount and bv_bitmap are allocated?)
|
||||||
|
*/
|
||||||
|
boolean_t bv_initiated;
|
||||||
|
/*
|
||||||
|
* Object number in the MOS for the entcount array and brt_vdev_phys.
|
||||||
|
*/
|
||||||
|
uint64_t bv_mos_brtvdev;
|
||||||
|
/*
|
||||||
|
* Object number in the MOS for the entries table.
|
||||||
|
*/
|
||||||
|
uint64_t bv_mos_entries;
|
||||||
|
/*
|
||||||
|
* Entries to sync.
|
||||||
|
*/
|
||||||
|
avl_tree_t bv_tree;
|
||||||
|
/*
|
||||||
|
* Does the bv_entcount[] array need byte swapping?
|
||||||
|
*/
|
||||||
|
boolean_t bv_need_byteswap;
|
||||||
|
/*
|
||||||
|
* Number of entries in the bv_entcount[] array.
|
||||||
|
*/
|
||||||
|
uint64_t bv_size;
|
||||||
|
/*
|
||||||
|
* This is the array with BRT entry count per BRT_RANGESIZE.
|
||||||
|
*/
|
||||||
|
uint16_t *bv_entcount;
|
||||||
|
/*
|
||||||
|
* Sum of all bv_entcount[]s.
|
||||||
|
*/
|
||||||
|
uint64_t bv_totalcount;
|
||||||
|
/*
|
||||||
|
* Space on disk occupied by cloned blocks (without compression).
|
||||||
|
*/
|
||||||
|
uint64_t bv_usedspace;
|
||||||
|
/*
|
||||||
|
* How much additional space would be occupied without block cloning.
|
||||||
|
*/
|
||||||
|
uint64_t bv_savedspace;
|
||||||
|
/*
|
||||||
|
* brt_vdev_phys needs updating on disk.
|
||||||
|
*/
|
||||||
|
boolean_t bv_meta_dirty;
|
||||||
|
/*
|
||||||
|
* bv_entcount[] needs updating on disk.
|
||||||
|
*/
|
||||||
|
boolean_t bv_entcount_dirty;
|
||||||
|
/*
|
||||||
|
* bv_entcount[] potentially can be a bit too big to synchronize it all
|
||||||
|
* when we just changed few entcounts. The fields below allow us to
|
||||||
|
* track updates to bv_entcount[] array since the last sync.
|
||||||
|
* A single bit in the bv_bitmap represents as many entcounts as can
|
||||||
|
* fit into a single BRT_BLOCKSIZE.
|
||||||
|
* For example we have 65536 entcounts in the bv_entcount array
|
||||||
|
* (so the whole array is 128kB). We updated bv_entcount[2] and
|
||||||
|
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
|
||||||
|
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
|
||||||
|
*/
|
||||||
|
ulong_t *bv_bitmap;
|
||||||
|
uint64_t bv_nblocks;
|
||||||
|
} brt_vdev_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In-core brt
|
||||||
|
*/
|
||||||
|
typedef struct brt {
|
||||||
|
krwlock_t brt_lock;
|
||||||
|
spa_t *brt_spa;
|
||||||
|
#define brt_mos brt_spa->spa_meta_objset
|
||||||
|
uint64_t brt_rangesize;
|
||||||
|
uint64_t brt_usedspace;
|
||||||
|
uint64_t brt_savedspace;
|
||||||
|
avl_tree_t brt_pending_tree[TXG_SIZE];
|
||||||
|
kmutex_t brt_pending_lock[TXG_SIZE];
|
||||||
|
/* Sum of all entries across all bv_trees. */
|
||||||
|
uint64_t brt_nentries;
|
||||||
|
brt_vdev_t *brt_vdevs;
|
||||||
|
uint64_t brt_nvdevs;
|
||||||
|
} brt_t;
|
||||||
|
|
||||||
|
/* Size of bre_offset / sizeof (uint64_t). */
|
||||||
|
#define BRT_KEY_WORDS (1)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In-core brt entry.
|
||||||
|
* On-disk we use bre_offset as the key and bre_refcount as the value.
|
||||||
|
*/
|
||||||
|
typedef struct brt_entry {
|
||||||
|
uint64_t bre_offset;
|
||||||
|
uint64_t bre_refcount;
|
||||||
|
avl_node_t bre_node;
|
||||||
|
} brt_entry_t;
|
||||||
|
|
||||||
|
typedef struct brt_pending_entry {
|
||||||
|
blkptr_t bpe_bp;
|
||||||
|
int bpe_count;
|
||||||
|
avl_node_t bpe_node;
|
||||||
|
} brt_pending_entry_t;
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _SYS_BRT_IMPL_H */
|
164
module/zfs/brt.c
164
module/zfs/brt.c
@ -28,6 +28,7 @@
|
|||||||
#include <sys/spa_impl.h>
|
#include <sys/spa_impl.h>
|
||||||
#include <sys/zio.h>
|
#include <sys/zio.h>
|
||||||
#include <sys/brt.h>
|
#include <sys/brt.h>
|
||||||
|
#include <sys/brt_impl.h>
|
||||||
#include <sys/ddt.h>
|
#include <sys/ddt.h>
|
||||||
#include <sys/bitmap.h>
|
#include <sys/bitmap.h>
|
||||||
#include <sys/zap.h>
|
#include <sys/zap.h>
|
||||||
@ -243,169 +244,6 @@
|
|||||||
* a chance to clean this up on dataset destroy (see zil_free_clone_range()).
|
* a chance to clean this up on dataset destroy (see zil_free_clone_range()).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
|
||||||
* BRT - Block Reference Table.
|
|
||||||
*/
|
|
||||||
#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We divide each VDEV into 16MB chunks. Each chunk is represented in memory
|
|
||||||
* by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
|
|
||||||
* Each element in this array represents how many BRT entries do we have in this
|
|
||||||
* chunk of storage. We always load this entire array into memory and update as
|
|
||||||
* needed. By having it in memory we can quickly tell (during zio_free()) if
|
|
||||||
* there are any BRT entries that we might need to update.
|
|
||||||
*
|
|
||||||
* This value cannot be larger than 16MB, at least as long as we support
|
|
||||||
* 512 byte block sizes. With 512 byte block size we can have exactly
|
|
||||||
* 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
|
|
||||||
* many for a 16bit counter.
|
|
||||||
*/
|
|
||||||
#define BRT_RANGESIZE (16 * 1024 * 1024)
|
|
||||||
_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
|
|
||||||
"BRT_RANGESIZE is too large.");
|
|
||||||
/*
|
|
||||||
* We don't want to update the whole structure every time. Maintain bitmap
|
|
||||||
* of dirty blocks within the regions, so that a single bit represents a
|
|
||||||
* block size of entcounts. For example if we have a 1PB vdev then all
|
|
||||||
* entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
|
|
||||||
* 128MB array of entcounts into 32kB disk blocks, as we don't want to update
|
|
||||||
* the whole 128MB on disk when we have updated only a single entcount.
|
|
||||||
* We maintain a bitmap where each 32kB disk block within 128MB entcounts array
|
|
||||||
* is represented by a single bit. This gives us 4096 bits. A set bit in the
|
|
||||||
* bitmap means that we had a change in at least one of the 16384 entcounts
|
|
||||||
* that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
|
|
||||||
*/
|
|
||||||
#define BRT_BLOCKSIZE (32 * 1024)
|
|
||||||
#define BRT_RANGESIZE_TO_NBLOCKS(size) \
|
|
||||||
(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
|
|
||||||
|
|
||||||
#define BRT_LITTLE_ENDIAN 0
|
|
||||||
#define BRT_BIG_ENDIAN 1
|
|
||||||
#ifdef _ZFS_LITTLE_ENDIAN
|
|
||||||
#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
|
|
||||||
#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN
|
|
||||||
#else
|
|
||||||
#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN
|
|
||||||
#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct brt_vdev_phys {
|
|
||||||
uint64_t bvp_mos_entries;
|
|
||||||
uint64_t bvp_size;
|
|
||||||
uint64_t bvp_byteorder;
|
|
||||||
uint64_t bvp_totalcount;
|
|
||||||
uint64_t bvp_rangesize;
|
|
||||||
uint64_t bvp_usedspace;
|
|
||||||
uint64_t bvp_savedspace;
|
|
||||||
} brt_vdev_phys_t;
|
|
||||||
|
|
||||||
typedef struct brt_vdev {
|
|
||||||
/*
|
|
||||||
* VDEV id.
|
|
||||||
*/
|
|
||||||
uint64_t bv_vdevid;
|
|
||||||
/*
|
|
||||||
* Is the structure initiated?
|
|
||||||
* (bv_entcount and bv_bitmap are allocated?)
|
|
||||||
*/
|
|
||||||
boolean_t bv_initiated;
|
|
||||||
/*
|
|
||||||
* Object number in the MOS for the entcount array and brt_vdev_phys.
|
|
||||||
*/
|
|
||||||
uint64_t bv_mos_brtvdev;
|
|
||||||
/*
|
|
||||||
* Object number in the MOS for the entries table.
|
|
||||||
*/
|
|
||||||
uint64_t bv_mos_entries;
|
|
||||||
/*
|
|
||||||
* Entries to sync.
|
|
||||||
*/
|
|
||||||
avl_tree_t bv_tree;
|
|
||||||
/*
|
|
||||||
* Does the bv_entcount[] array need byte swapping?
|
|
||||||
*/
|
|
||||||
boolean_t bv_need_byteswap;
|
|
||||||
/*
|
|
||||||
* Number of entries in the bv_entcount[] array.
|
|
||||||
*/
|
|
||||||
uint64_t bv_size;
|
|
||||||
/*
|
|
||||||
* This is the array with BRT entry count per BRT_RANGESIZE.
|
|
||||||
*/
|
|
||||||
uint16_t *bv_entcount;
|
|
||||||
/*
|
|
||||||
* Sum of all bv_entcount[]s.
|
|
||||||
*/
|
|
||||||
uint64_t bv_totalcount;
|
|
||||||
/*
|
|
||||||
* Space on disk occupied by cloned blocks (without compression).
|
|
||||||
*/
|
|
||||||
uint64_t bv_usedspace;
|
|
||||||
/*
|
|
||||||
* How much additional space would be occupied without block cloning.
|
|
||||||
*/
|
|
||||||
uint64_t bv_savedspace;
|
|
||||||
/*
|
|
||||||
* brt_vdev_phys needs updating on disk.
|
|
||||||
*/
|
|
||||||
boolean_t bv_meta_dirty;
|
|
||||||
/*
|
|
||||||
* bv_entcount[] needs updating on disk.
|
|
||||||
*/
|
|
||||||
boolean_t bv_entcount_dirty;
|
|
||||||
/*
|
|
||||||
* bv_entcount[] potentially can be a bit too big to synchronize it all
|
|
||||||
* when we just changed few entcounts. The fields below allow us to
|
|
||||||
* track updates to bv_entcount[] array since the last sync.
|
|
||||||
* A single bit in the bv_bitmap represents as many entcounts as can
|
|
||||||
* fit into a single BRT_BLOCKSIZE.
|
|
||||||
* For example we have 65536 entcounts in the bv_entcount array
|
|
||||||
* (so the whole array is 128kB). We updated bv_entcount[2] and
|
|
||||||
* bv_entcount[5]. In that case only first bit in the bv_bitmap will
|
|
||||||
* be set and we will write only first BRT_BLOCKSIZE out of 128kB.
|
|
||||||
*/
|
|
||||||
ulong_t *bv_bitmap;
|
|
||||||
uint64_t bv_nblocks;
|
|
||||||
} brt_vdev_t;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* In-core brt
|
|
||||||
*/
|
|
||||||
typedef struct brt {
|
|
||||||
krwlock_t brt_lock;
|
|
||||||
spa_t *brt_spa;
|
|
||||||
#define brt_mos brt_spa->spa_meta_objset
|
|
||||||
uint64_t brt_rangesize;
|
|
||||||
uint64_t brt_usedspace;
|
|
||||||
uint64_t brt_savedspace;
|
|
||||||
avl_tree_t brt_pending_tree[TXG_SIZE];
|
|
||||||
kmutex_t brt_pending_lock[TXG_SIZE];
|
|
||||||
/* Sum of all entries across all bv_trees. */
|
|
||||||
uint64_t brt_nentries;
|
|
||||||
brt_vdev_t *brt_vdevs;
|
|
||||||
uint64_t brt_nvdevs;
|
|
||||||
} brt_t;
|
|
||||||
|
|
||||||
/* Size of bre_offset / sizeof (uint64_t). */
|
|
||||||
#define BRT_KEY_WORDS (1)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* In-core brt entry.
|
|
||||||
* On-disk we use bre_offset as the key and bre_refcount as the value.
|
|
||||||
*/
|
|
||||||
typedef struct brt_entry {
|
|
||||||
uint64_t bre_offset;
|
|
||||||
uint64_t bre_refcount;
|
|
||||||
avl_node_t bre_node;
|
|
||||||
} brt_entry_t;
|
|
||||||
|
|
||||||
typedef struct brt_pending_entry {
|
|
||||||
blkptr_t bpe_bp;
|
|
||||||
int bpe_count;
|
|
||||||
avl_node_t bpe_node;
|
|
||||||
} brt_pending_entry_t;
|
|
||||||
|
|
||||||
static kmem_cache_t *brt_entry_cache;
|
static kmem_cache_t *brt_entry_cache;
|
||||||
static kmem_cache_t *brt_pending_entry_cache;
|
static kmem_cache_t *brt_pending_entry_cache;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user