mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 19:04:45 +03:00
Fix read corruption after block clone after truncate
When copy_file_range overwrites a recent truncation, subsequent reads can incorrectly determine that it is read hole instead of reading the cloned blocks. This can happen when the following conditions are met: - Truncate adds blkid to dn_free_ranges - A new TXG is created - copy_file_range calls dmu_brt_clone which override the block pointer and set DB_NOFILL - Subsequent read, given DB_NOFILL, hits dbuf_read_impl and dbuf_read_hole - dbuf_read_hole calls dnode_block_freed, which returns TRUE because the truncated blkids are still in dn_free_ranges This will not happen if the clone and truncate are in the same TXG, because the block clone would update the current TXG's dn_free_ranges, which is why this bug only triggers under high IO load (such as compilation). Fix this by skipping the dnode_block_freed call if the block is overridden. The fix shouldn't cause an issue when the cloned block is subsequently freed in later TXGs, as dbuf_undirty would remove the override. This requires a dedicated test program as it is much harder to trigger with scripts (this needs to generate a lot of I/O in short period of time for the bug to trigger reliably). Assisted-by: Gemini:gemini-3.1-pro Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Gary Guo <gary@kernel.org> Closes #18412 Closes #18421
This commit is contained in:
+5
-1
@@ -1481,8 +1481,12 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
||||
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
|
||||
* processes the delete record and clears the bp while we are waiting
|
||||
* for the dn_mtx (resulting in a "no" from block_freed).
|
||||
*
|
||||
* If bp != db->db_blkptr, it means that it was overridden (by a block
|
||||
* clone or direct I/O write). We cannot rely on dnode_block_freed as
|
||||
* the range can be freed in an earlier TXG but overridden in later.
|
||||
*/
|
||||
if (!is_hole && db->db_level == 0)
|
||||
if (!is_hole && db->db_level == 0 && bp == db->db_blkptr)
|
||||
is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
|
||||
|
||||
if (is_hole) {
|
||||
|
||||
@@ -83,7 +83,8 @@ tests = ['block_cloning_clone_mmap_cached',
|
||||
'block_cloning_replay', 'block_cloning_replay_encrypted',
|
||||
'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
|
||||
'block_cloning_rlimit_fsize', 'block_cloning_large_offset',
|
||||
'block_cloning_after_device_removal']
|
||||
'block_cloning_after_device_removal',
|
||||
'block_cloning_after_trunc']
|
||||
tags = ['functional', 'block_cloning']
|
||||
|
||||
[tests/functional/bootfs]
|
||||
|
||||
@@ -319,6 +319,7 @@ elif sys.platform.startswith('linux'):
|
||||
'bclone/bclone_samefs_data': ['SKIP', cfr_reason],
|
||||
'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason],
|
||||
'bclone/bclone_samefs_hole': ['SKIP', cfr_reason],
|
||||
'block_cloning/block_cloning_after_trunc': ['SKIP', cfr_reason],
|
||||
'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
|
||||
'block_cloning/block_cloning_clone_mmap_write':
|
||||
['SKIP', cfr_reason],
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
/btree_test
|
||||
/chg_usr_exec
|
||||
/clonefile
|
||||
/clone_after_trunc
|
||||
/clone_mmap_cached
|
||||
/clone_mmap_write
|
||||
/crypto_test
|
||||
|
||||
@@ -35,6 +35,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/crypto_test
|
||||
%C%_crypto_test_SOURCES = %D%/crypto_test.c
|
||||
%C%_crypto_test_LDADD = libzpool.la
|
||||
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/clone_after_trunc
|
||||
%C%_clone_after_trunc_LDADD = -lpthread
|
||||
|
||||
if WANT_DEVNAME2DEVID
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/devname2devid
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
// SPDX-License-Identifier: CDDL-1.0
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#if defined(_GNU_SOURCE) && defined(__linux__)
|
||||
_Static_assert(sizeof (loff_t) == sizeof (off_t),
|
||||
"loff_t and off_t must be the same size");
|
||||
#endif
|
||||
|
||||
ssize_t
|
||||
copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int)
|
||||
__attribute__((weak));
|
||||
|
||||
#define FILE_SIZE (1024 * 1024)
|
||||
#define RECORD_SIZE (128 * 1024)
|
||||
#define NUM_THREADS 64
|
||||
|
||||
const char *dir;
|
||||
volatile int failed;
|
||||
|
||||
static void *
|
||||
run_test(void *arg)
|
||||
{
|
||||
int thread_id = (int)(long)arg;
|
||||
|
||||
char src_path[PATH_MAX], dst_path[PATH_MAX];
|
||||
snprintf(src_path, PATH_MAX, "%s/src-%d", dir, thread_id);
|
||||
snprintf(dst_path, PATH_MAX, "%s/dst-%d", dir, thread_id);
|
||||
|
||||
unsigned char *write_buf = malloc(FILE_SIZE);
|
||||
unsigned char *read_buf = malloc(FILE_SIZE);
|
||||
|
||||
// Write out expected data.
|
||||
memset(write_buf, 0xAA, FILE_SIZE);
|
||||
int src = open(src_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
if (write(src, write_buf, FILE_SIZE) != FILE_SIZE)
|
||||
perror("write");
|
||||
close(src);
|
||||
|
||||
// Create destination file so we exercise O_TRUNC.
|
||||
int dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
if (write(dst, write_buf, FILE_SIZE) != FILE_SIZE)
|
||||
perror("write");
|
||||
fsync(dst);
|
||||
close(dst);
|
||||
|
||||
// Open file with O_TRUNC and perform copy.
|
||||
src = open(src_path, O_RDONLY);
|
||||
dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
|
||||
off_t off_in = 0, off_out = 0;
|
||||
ssize_t ret =
|
||||
copy_file_range(src, &off_in, dst, &off_out, FILE_SIZE, 0);
|
||||
if (ret != FILE_SIZE)
|
||||
perror("copy_file_range");
|
||||
close(src);
|
||||
close(dst);
|
||||
|
||||
// Read back
|
||||
dst = open(dst_path, O_RDONLY);
|
||||
if (read(dst, read_buf, FILE_SIZE) != FILE_SIZE)
|
||||
perror("read");
|
||||
close(dst);
|
||||
|
||||
// Bug check
|
||||
if (memcmp(write_buf, read_buf, FILE_SIZE) != 0) {
|
||||
failed = 1;
|
||||
fprintf(stderr, "[%d]: FAIL\n", thread_id);
|
||||
|
||||
int all_zeros = 1;
|
||||
for (int i = 0; i < RECORD_SIZE; i++) {
|
||||
if (read_buf[i] != 0) {
|
||||
all_zeros = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (all_zeros) {
|
||||
fprintf(stderr, "[%d]: ALL ZERO\n", thread_id);
|
||||
}
|
||||
}
|
||||
|
||||
unlink(src_path);
|
||||
unlink(dst_path);
|
||||
free(write_buf);
|
||||
free(read_buf);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, const char **argv)
|
||||
{
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "usage: %s <dir>\n", argv[0]);
|
||||
return (1);
|
||||
}
|
||||
dir = argv[1];
|
||||
|
||||
pthread_t threads[NUM_THREADS];
|
||||
|
||||
for (int i = 0; i < NUM_THREADS; i++) {
|
||||
pthread_create(&threads[i], NULL, run_test, (void *)(long)i);
|
||||
}
|
||||
for (int i = 0; i < NUM_THREADS; i++) {
|
||||
pthread_join(threads[i], NULL);
|
||||
}
|
||||
|
||||
return (failed);
|
||||
}
|
||||
@@ -186,6 +186,7 @@ export ZFSTEST_FILES_COMMON='badsend
|
||||
btree_test
|
||||
chg_usr_exec
|
||||
clonefile
|
||||
clone_after_trunc
|
||||
clone_mmap_cached
|
||||
clone_mmap_write
|
||||
crypto_test
|
||||
|
||||
@@ -493,6 +493,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/block_cloning/block_cloning_rlimit_fsize.ksh \
|
||||
functional/block_cloning/block_cloning_large_offset.ksh \
|
||||
functional/block_cloning/block_cloning_after_device_removal.ksh \
|
||||
functional/block_cloning/block_cloning_after_trunc.ksh \
|
||||
functional/bootfs/bootfs_001_pos.ksh \
|
||||
functional/bootfs/bootfs_002_neg.ksh \
|
||||
functional/bootfs/bootfs_003_pos.ksh \
|
||||
|
||||
+31
@@ -0,0 +1,31 @@
|
||||
#!/bin/ksh -p
|
||||
# SPDX-License-Identifier: CDDL-1.0
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# When a block is truncated and then cloned to, a read data corruption can occur.
|
||||
# This is a regression test for #18412.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
claim="No read data corruption when cloning blocks after a truncate"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
|
||||
|
||||
# Run for a few times to increase the likelihood of bug triggering.
|
||||
for i in {0..50}; do
|
||||
log_must clone_after_trunc /$TESTPOOL/
|
||||
done
|
||||
|
||||
log_pass $claim
|
||||
Reference in New Issue
Block a user