Fix read corruption after block clone after truncate

When copy_file_range overwrites a recent truncation, subsequent reads
can incorrectly determine that it is read hole instead of reading the
cloned blocks.

This can happen when the following conditions are met:
- Truncate adds blkid to dn_free_ranges
- A new TXG is created
- copy_file_range calls dmu_brt_clone which override the block pointer
  and set DB_NOFILL
- Subsequent read, given DB_NOFILL, hits dbuf_read_impl and
  dbuf_read_hole
- dbuf_read_hole calls dnode_block_freed, which returns TRUE because the
  truncated blkids are still in dn_free_ranges

This will not happen if the clone and truncate are in the same TXG,
because the block clone would update the current TXG's dn_free_ranges,
which is why this bug only triggers under high IO load (such as
compilation).

Fix this by skipping the dnode_block_freed call if the block is
overridden. The fix shouldn't cause an issue when the cloned block is
subsequently freed in later TXGs, as dbuf_undirty would remove the
override.

This requires a dedicated test program as it is much harder to trigger
with scripts (this needs to generate a lot of I/O in short period of
time for the bug to trigger reliably).

Assisted-by: Gemini:gemini-3.1-pro
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Gary Guo <gary@kernel.org>
Closes #18412
Closes #18421
This commit is contained in:
Gary Guo
2026-04-15 22:51:53 +01:00
committed by Tony Hutter
parent b2602a400a
commit e7524594a9
9 changed files with 161 additions and 2 deletions
+5 -1
View File
@@ -1481,8 +1481,12 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*
* If bp != db->db_blkptr, it means that it was overridden (by a block
* clone or direct I/O write). We cannot rely on dnode_block_freed as
* the range can be freed in an earlier TXG but overridden in later.
*/
if (!is_hole && db->db_level == 0)
if (!is_hole && db->db_level == 0 && bp == db->db_blkptr)
is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
if (is_hole) {
+2 -1
View File
@@ -83,7 +83,8 @@ tests = ['block_cloning_clone_mmap_cached',
'block_cloning_replay', 'block_cloning_replay_encrypted',
'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
'block_cloning_rlimit_fsize', 'block_cloning_large_offset',
'block_cloning_after_device_removal']
'block_cloning_after_device_removal',
'block_cloning_after_trunc']
tags = ['functional', 'block_cloning']
[tests/functional/bootfs]
+1
View File
@@ -319,6 +319,7 @@ elif sys.platform.startswith('linux'):
'bclone/bclone_samefs_data': ['SKIP', cfr_reason],
'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason],
'bclone/bclone_samefs_hole': ['SKIP', cfr_reason],
'block_cloning/block_cloning_after_trunc': ['SKIP', cfr_reason],
'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
'block_cloning/block_cloning_clone_mmap_write':
['SKIP', cfr_reason],
+1
View File
@@ -2,6 +2,7 @@
/btree_test
/chg_usr_exec
/clonefile
/clone_after_trunc
/clone_mmap_cached
/clone_mmap_write
/crypto_test
+2
View File
@@ -35,6 +35,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/crypto_test
%C%_crypto_test_SOURCES = %D%/crypto_test.c
%C%_crypto_test_LDADD = libzpool.la
scripts_zfs_tests_bin_PROGRAMS += %D%/clone_after_trunc
%C%_clone_after_trunc_LDADD = -lpthread
if WANT_DEVNAME2DEVID
scripts_zfs_tests_bin_PROGRAMS += %D%/devname2devid
+117
View File
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: CDDL-1.0
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#if defined(_GNU_SOURCE) && defined(__linux__)
_Static_assert(sizeof (loff_t) == sizeof (off_t),
"loff_t and off_t must be the same size");
#endif
ssize_t
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
__attribute__((weak));
#define FILE_SIZE (1024 * 1024)
#define RECORD_SIZE (128 * 1024)
#define NUM_THREADS 64
const char *dir;
volatile int failed;
static void *
run_test(void *arg)
{
int thread_id = (int)(long)arg;
char src_path[PATH_MAX], dst_path[PATH_MAX];
snprintf(src_path, PATH_MAX, "%s/src-%d", dir, thread_id);
snprintf(dst_path, PATH_MAX, "%s/dst-%d", dir, thread_id);
unsigned char *write_buf = malloc(FILE_SIZE);
unsigned char *read_buf = malloc(FILE_SIZE);
// Write out expected data.
memset(write_buf, 0xAA, FILE_SIZE);
int src = open(src_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (write(src, write_buf, FILE_SIZE) != FILE_SIZE)
perror("write");
close(src);
// Create destination file so we exercise O_TRUNC.
int dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (write(dst, write_buf, FILE_SIZE) != FILE_SIZE)
perror("write");
fsync(dst);
close(dst);
// Open file with O_TRUNC and perform copy.
src = open(src_path, O_RDONLY);
dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
off_t off_in = 0, off_out = 0;
ssize_t ret =
copy_file_range(src, &off_in, dst, &off_out, FILE_SIZE, 0);
if (ret != FILE_SIZE)
perror("copy_file_range");
close(src);
close(dst);
// Read back
dst = open(dst_path, O_RDONLY);
if (read(dst, read_buf, FILE_SIZE) != FILE_SIZE)
perror("read");
close(dst);
// Bug check
if (memcmp(write_buf, read_buf, FILE_SIZE) != 0) {
failed = 1;
fprintf(stderr, "[%d]: FAIL\n", thread_id);
int all_zeros = 1;
for (int i = 0; i < RECORD_SIZE; i++) {
if (read_buf[i] != 0) {
all_zeros = 0;
break;
}
}
if (all_zeros) {
fprintf(stderr, "[%d]: ALL ZERO\n", thread_id);
}
}
unlink(src_path);
unlink(dst_path);
free(write_buf);
free(read_buf);
return (NULL);
}
int
main(int argc, const char **argv)
{
if (argc < 2) {
fprintf(stderr, "usage: %s <dir>\n", argv[0]);
return (1);
}
dir = argv[1];
pthread_t threads[NUM_THREADS];
for (int i = 0; i < NUM_THREADS; i++) {
pthread_create(&threads[i], NULL, run_test, (void *)(long)i);
}
for (int i = 0; i < NUM_THREADS; i++) {
pthread_join(threads[i], NULL);
}
return (failed);
}
+1
View File
@@ -186,6 +186,7 @@ export ZFSTEST_FILES_COMMON='badsend
btree_test
chg_usr_exec
clonefile
clone_after_trunc
clone_mmap_cached
clone_mmap_write
crypto_test
+1
View File
@@ -493,6 +493,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/block_cloning/block_cloning_rlimit_fsize.ksh \
functional/block_cloning/block_cloning_large_offset.ksh \
functional/block_cloning/block_cloning_after_device_removal.ksh \
functional/block_cloning/block_cloning_after_trunc.ksh \
functional/bootfs/bootfs_001_pos.ksh \
functional/bootfs/bootfs_002_neg.ksh \
functional/bootfs/bootfs_003_pos.ksh \
@@ -0,0 +1,31 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
#
# DESCRIPTION:
# When a block is truncated and then cloned to, a read data corruption can occur.
# This is a regression test for #18412.
#
verify_runnable "global"
claim="No read data corruption when cloning blocks after a truncate"
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
# Run for a few times to increase the likelihood of bug triggering.
for i in {0..50}; do
log_must clone_after_trunc /$TESTPOOL/
done
log_pass $claim