Allow physical rewrite without logical

During regular block writes ZFS sets both logical and physical
birth times equal to the current TXG.  During dedup and block
cloning logical birth time is still set to the current TXG, but
physical may be copied from the original block that was used.
This represents the fact that logically user data has changed,
but physically it is the same old block.

But block rewrite introduces a new situation, where a block is not
changed logically, but is stored in a different place of the pool.
From ARC, scrub and some other perspectives this is a new block,
but for example for user applications or incremental replication
it is not.  A somewhat similar thing happens during the remap phase of
device removal, but in that case space blocks are still accounted
as allocated at their logical birth times.

This patch introduces a new "rewrite" flag in the block pointer
structure, allowing to differentiate physical rewrite (when the
block is actually reallocated at the physical birth time) from
the device removal case (when the logical birth time is used).

The new functionality is not used at this point, and the only
expected change is that error log is now kept in terms of
physical birth times, rather than logical, since if a block with
logged error was somehow rewritten, then the previous error does
not matter any more.

This change also introduces a new TRAVERSE_LOGICAL flag to the
traverse code, allowing zfs send, redact and diff to work in
context of logical birth times, ignoring physical-only rewrites.
It also changes nothing at this point due to lack of those writes,
but they will come in a following patch.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17565
This commit is contained in:
Alexander Motin
2025-07-17 12:50:54 -04:00
committed by Brian Behlendorf
parent 894edd084e
commit 4ae8bf406b
29 changed files with 205 additions and 144 deletions
+7
View File
@@ -59,6 +59,13 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
#define TRAVERSE_NO_DECRYPT (1<<5)
/*
* Always use logical birth time for birth time comparisons. This is useful
* for operations that care about user data changes rather than physical
* block rewrites (e.g., incremental replication).
*/
#define TRAVERSE_LOGICAL (1<<6)
/* Special traverse error return value to indicate skipping of children */
#define TRAVERSE_VISIT_NO_CHILDREN -1
+50 -20
View File
@@ -140,7 +140,7 @@ typedef struct zio_cksum_salt {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* 7 |R| padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -175,6 +175,7 @@ typedef struct zio_cksum_salt {
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* R rewrite (reallocated/rewritten at phys birth TXG)
* phys birth txg when dva[0] was written; zero if same as logical birth txg
* note that typically all the dva's would be written in this
* txg, but they could be different if they were moved by
@@ -204,7 +205,7 @@ typedef struct zio_cksum_salt {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* 7 |R| padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -373,7 +374,8 @@ typedef enum bp_embedded_type {
typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
uint64_t blk_pad[2]; /* Extra space for the future */
uint64_t blk_prop2; /* additional properties */
uint64_t blk_pad; /* Extra space for the future */
uint64_t blk_birth_word[2];
uint64_t blk_fill; /* fill count */
zio_cksum_t blk_cksum; /* 256-bit checksum */
@@ -476,32 +478,51 @@ typedef struct blkptr {
#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)
/*
* Block birth time macros for different use cases:
* - BP_GET_LOGICAL_BIRTH(): When the block was logically modified by user.
* To be used with a focus on user data, like incremental replication.
* - BP_GET_PHYSICAL_BIRTH(): When the block was physically written to disks.
* For regular writes is equal to logical birth. For dedup and block cloning
* can be smaller than logical birth. For remapped and rewritten blocks can
* be bigger. To be used with focus on physical disk content: ARC, DDT, scrub.
* - BP_GET_RAW_PHYSICAL_BIRTH(): Raw physical birth value. Zero if equal
* to logical birth. Should only be used for BP copying and debugging.
* - BP_GET_BIRTH(): When the block was allocated, which is a physical birth
* for rewritten blocks (rewrite flag set) or logical birth otherwise.
*/
#define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1]
#define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x))
#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0]
#define BP_GET_RAW_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0]
#define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x))
#define BP_GET_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \
#define BP_GET_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
BP_GET_RAW_PHYSICAL_BIRTH(bp) ? BP_GET_RAW_PHYSICAL_BIRTH(bp) : \
BP_GET_LOGICAL_BIRTH(bp))
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BP_SET_LOGICAL_BIRTH(bp, logical); \
BP_SET_PHYSICAL_BIRTH(bp, \
((logical) == (physical) ? 0 : (physical))); \
#define BP_GET_BIRTH(bp) \
((BP_IS_EMBEDDED(bp) || !BP_GET_REWRITE(bp)) ? \
BP_GET_LOGICAL_BIRTH(bp) : BP_GET_PHYSICAL_BIRTH(bp))
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BP_SET_LOGICAL_BIRTH(bp, logical); \
BP_SET_PHYSICAL_BIRTH(bp, \
((logical) == (physical) ? 0 : (physical))); \
}
#define BP_GET_FILL(bp) \
((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
(BP_IS_EMBEDDED(bp) ? 1 : \
BP_IS_ENCRYPTED(bp) ? BF64_GET((bp)->blk_fill, 0, 32) : \
(bp)->blk_fill)
#define BP_SET_FILL(bp, fill) \
{ \
if (BP_IS_ENCRYPTED(bp)) \
ASSERT(!BP_IS_EMBEDDED(bp)); \
if (BP_IS_ENCRYPTED(bp)) \
BF64_SET((bp)->blk_fill, 0, 32, fill); \
else \
(bp)->blk_fill = fill; \
@@ -516,6 +537,15 @@ typedef struct blkptr {
BF64_SET((bp)->blk_fill, 32, 32, iv2); \
}
/*
* Read the rewrite (R) flag of a block pointer.  Evaluates to 0 for
* embedded block pointers; otherwise evaluates to the single-bit field
* extracted by BF64_GET(blk_prop2, 63, 1).
*/
#define BP_GET_REWRITE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : BF64_GET((bp)->blk_prop2, 63, 1))
/*
* Store x into the rewrite (R) flag of a block pointer, i.e. the
* single-bit field written by BF64_SET(blk_prop2, 63, 1, x).  Asserts
* that the block pointer is not embedded.
*/
#define BP_SET_REWRITE(bp, x) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
BF64_SET((bp)->blk_prop2, 63, 1, x); \
}
#define BP_IS_METADATA(bp) \
(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
@@ -545,7 +575,7 @@ typedef struct blkptr {
(dva1)->dva_word[0] == (dva2)->dva_word[0])
#define BP_EQUAL(bp1, bp2) \
(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \
(BP_GET_PHYSICAL_BIRTH(bp1) == BP_GET_PHYSICAL_BIRTH(bp2) && \
BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
@@ -588,8 +618,8 @@ typedef struct blkptr {
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
(bp)->blk_prop2 = 0; \
(bp)->blk_pad = 0; \
(bp)->blk_birth_word[0] = 0; \
(bp)->blk_birth_word[1] = 0; \
(bp)->blk_fill = 0; \
@@ -696,7 +726,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_BIRTH(bp), \
(u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \