mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-26 09:54:22 +03:00
Fix buffered/direct/mmap I/O race
When a page is faulted in for memory mapped I/O the page lock may be dropped before it has been read and marked up to date. If a buffered read encounters such a page in mappedread() it must wait until the page has been updated. Failure to do so will result in a panic on debug builds and incorrect data on production builds. The critical part of this change is in mappedread() where pages which are not up to date are now handled. Additionally, it includes the following simplifications. - zfs_getpage() and zfs_fillpage() could be passed an array of pages. This could be more efficient if it was used but in practice only a single page was ever provided. These interfaces were simplified to acknowledge that. - update_pages() was modified to correctly set the PG_error bit on a page when it cannot be read by dmu_read(). - Setting PG_error and PG_uptodate was moved to zfs_fillpage() from zpl_readpage_common(). This is consistent with the handling in update_pages() and mappedread(). - Minor additional refactoring to comments and variable declarations to improve readability. - Add a test case to exercise concurrent buffered, direct, and mmap IO to the same file. - Reduce the mmap_sync test case default run time. Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13608 Closes #14498
This commit is contained in:
parent
7cb67d627c
commit
89cd2197b9
@ -72,7 +72,7 @@ extern void zfs_inactive(struct inode *ip);
|
||||
extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
|
||||
offset_t offset, cred_t *cr);
|
||||
extern int zfs_fid(struct inode *ip, fid_t *fidp);
|
||||
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
|
||||
extern int zfs_getpage(struct inode *ip, struct page *pp);
|
||||
extern int zfs_putpage(struct inode *ip, struct page *pp,
|
||||
struct writeback_control *wbc, boolean_t for_sync);
|
||||
extern int zfs_dirty_inode(struct inode *ip, int flags);
|
||||
|
@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
|
||||
static int zfs_fillpage(struct inode *ip, struct page *pp);
|
||||
|
||||
/*
|
||||
* When a file is memory mapped, we must keep the IO data synchronized
|
||||
* between the DMU cache and the memory mapped pages. What this means:
|
||||
*
|
||||
* On Write: If we find a memory mapped page, we write to *both*
|
||||
* the page and the dmu buffer.
|
||||
* between the DMU cache and the memory mapped pages. Update all mapped
|
||||
* pages with the contents of the coresponding dmu buffer.
|
||||
*/
|
||||
void
|
||||
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
|
||||
{
|
||||
struct inode *ip = ZTOI(zp);
|
||||
struct address_space *mp = ip->i_mapping;
|
||||
struct page *pp;
|
||||
uint64_t nbytes;
|
||||
int64_t off;
|
||||
void *pb;
|
||||
struct address_space *mp = ZTOI(zp)->i_mapping;
|
||||
int64_t off = start & (PAGE_SIZE - 1);
|
||||
|
||||
off = start & (PAGE_SIZE-1);
|
||||
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
|
||||
nbytes = MIN(PAGE_SIZE - off, len);
|
||||
uint64_t nbytes = MIN(PAGE_SIZE - off, len);
|
||||
|
||||
pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
||||
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
||||
if (pp) {
|
||||
if (mapping_writably_mapped(mp))
|
||||
flush_dcache_page(pp);
|
||||
|
||||
pb = kmap(pp);
|
||||
(void) dmu_read(os, zp->z_id, start + off, nbytes,
|
||||
pb + off, DMU_READ_PREFETCH);
|
||||
void *pb = kmap(pp);
|
||||
int error = dmu_read(os, zp->z_id, start + off,
|
||||
nbytes, pb + off, DMU_READ_PREFETCH);
|
||||
kunmap(pp);
|
||||
|
||||
if (mapping_writably_mapped(mp))
|
||||
flush_dcache_page(pp);
|
||||
if (error) {
|
||||
SetPageError(pp);
|
||||
ClearPageUptodate(pp);
|
||||
} else {
|
||||
ClearPageError(pp);
|
||||
SetPageUptodate(pp);
|
||||
|
||||
if (mapping_writably_mapped(mp))
|
||||
flush_dcache_page(pp);
|
||||
|
||||
mark_page_accessed(pp);
|
||||
}
|
||||
|
||||
mark_page_accessed(pp);
|
||||
SetPageUptodate(pp);
|
||||
ClearPageError(pp);
|
||||
unlock_page(pp);
|
||||
put_page(pp);
|
||||
}
|
||||
@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
|
||||
}
|
||||
|
||||
/*
|
||||
* When a file is memory mapped, we must keep the IO data synchronized
|
||||
* between the DMU cache and the memory mapped pages. What this means:
|
||||
*
|
||||
* On Read: We "read" preferentially from memory mapped pages,
|
||||
* else we default from the dmu buffer.
|
||||
*
|
||||
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
|
||||
* the file is memory mapped.
|
||||
* When a file is memory mapped, we must keep the I/O data synchronized
|
||||
* between the DMU cache and the memory mapped pages. Preferentially read
|
||||
* from memory mapped pages, otherwise fallback to reading through the dmu.
|
||||
*/
|
||||
int
|
||||
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
{
|
||||
struct inode *ip = ZTOI(zp);
|
||||
struct address_space *mp = ip->i_mapping;
|
||||
struct page *pp;
|
||||
int64_t start, off;
|
||||
uint64_t bytes;
|
||||
int64_t start = uio->uio_loffset;
|
||||
int64_t off = start & (PAGE_SIZE - 1);
|
||||
int len = nbytes;
|
||||
int error = 0;
|
||||
void *pb;
|
||||
|
||||
start = uio->uio_loffset;
|
||||
off = start & (PAGE_SIZE-1);
|
||||
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
|
||||
bytes = MIN(PAGE_SIZE - off, len);
|
||||
uint64_t bytes = MIN(PAGE_SIZE - off, len);
|
||||
|
||||
pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
||||
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
||||
if (pp) {
|
||||
ASSERT(PageUptodate(pp));
|
||||
/*
|
||||
* If filemap_fault() retries there exists a window
|
||||
* where the page will be unlocked and not up to date.
|
||||
* In this case we must try and fill the page.
|
||||
*/
|
||||
if (unlikely(!PageUptodate(pp))) {
|
||||
error = zfs_fillpage(ip, pp);
|
||||
if (error) {
|
||||
unlock_page(pp);
|
||||
put_page(pp);
|
||||
return (error);
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT(PageUptodate(pp) || PageDirty(pp));
|
||||
|
||||
unlock_page(pp);
|
||||
|
||||
pb = kmap(pp);
|
||||
void *pb = kmap(pp);
|
||||
error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
|
||||
kunmap(pp);
|
||||
|
||||
@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
||||
|
||||
len -= bytes;
|
||||
off = 0;
|
||||
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
|
||||
return (error);
|
||||
}
|
||||
#endif /* _KERNEL */
|
||||
@ -3959,55 +3970,41 @@ zfs_inactive(struct inode *ip)
|
||||
* Fill pages with data from the disk.
|
||||
*/
|
||||
static int
|
||||
zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
|
||||
zfs_fillpage(struct inode *ip, struct page *pp)
|
||||
{
|
||||
znode_t *zp = ITOZ(ip);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
objset_t *os;
|
||||
struct page *cur_pp;
|
||||
u_offset_t io_off, total;
|
||||
size_t io_len;
|
||||
loff_t i_size;
|
||||
unsigned page_idx;
|
||||
int err;
|
||||
|
||||
os = zfsvfs->z_os;
|
||||
io_len = nr_pages << PAGE_SHIFT;
|
||||
i_size = i_size_read(ip);
|
||||
io_off = page_offset(pl[0]);
|
||||
loff_t i_size = i_size_read(ip);
|
||||
u_offset_t io_off = page_offset(pp);
|
||||
size_t io_len = PAGE_SIZE;
|
||||
|
||||
if (io_off + io_len > i_size)
|
||||
io_len = i_size - io_off;
|
||||
|
||||
/*
|
||||
* Iterate over list of pages and read each page individually.
|
||||
*/
|
||||
page_idx = 0;
|
||||
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
|
||||
caddr_t va;
|
||||
void *va = kmap(pp);
|
||||
int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
|
||||
PAGE_SIZE, va, DMU_READ_PREFETCH);
|
||||
kunmap(pp);
|
||||
|
||||
cur_pp = pl[page_idx++];
|
||||
va = kmap(cur_pp);
|
||||
err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
|
||||
DMU_READ_PREFETCH);
|
||||
kunmap(cur_pp);
|
||||
if (err) {
|
||||
/* convert checksum errors into IO errors */
|
||||
if (err == ECKSUM)
|
||||
err = SET_ERROR(EIO);
|
||||
return (err);
|
||||
}
|
||||
if (error) {
|
||||
/* convert checksum errors into IO errors */
|
||||
if (error == ECKSUM)
|
||||
error = SET_ERROR(EIO);
|
||||
|
||||
SetPageError(pp);
|
||||
ClearPageUptodate(pp);
|
||||
} else {
|
||||
ClearPageError(pp);
|
||||
SetPageUptodate(pp);
|
||||
}
|
||||
|
||||
return (0);
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Uses zfs_fillpage to read data from the file and fill the pages.
|
||||
* Uses zfs_fillpage to read data from the file and fill the page.
|
||||
*
|
||||
* IN: ip - inode of file to get data from.
|
||||
* pl - list of pages to read
|
||||
* nr_pages - number of pages to read
|
||||
* pp - page to read
|
||||
*
|
||||
* RETURN: 0 on success, error code on failure.
|
||||
*
|
||||
@ -4015,24 +4012,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
|
||||
* vp - atime updated
|
||||
*/
|
||||
int
|
||||
zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
|
||||
zfs_getpage(struct inode *ip, struct page *pp)
|
||||
{
|
||||
znode_t *zp = ITOZ(ip);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
int err;
|
||||
znode_t *zp = ITOZ(ip);
|
||||
int error;
|
||||
|
||||
if (pl == NULL)
|
||||
return (0);
|
||||
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
|
||||
return (error);
|
||||
|
||||
if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
|
||||
return (err);
|
||||
|
||||
err = zfs_fillpage(ip, pl, nr_pages);
|
||||
|
||||
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
|
||||
error = zfs_fillpage(ip, pp);
|
||||
if (error == 0)
|
||||
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (err);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
|
||||
static inline int
|
||||
zpl_readpage_common(struct page *pp)
|
||||
{
|
||||
struct inode *ip;
|
||||
struct page *pl[1];
|
||||
int error = 0;
|
||||
fstrans_cookie_t cookie;
|
||||
|
||||
ASSERT(PageLocked(pp));
|
||||
ip = pp->mapping->host;
|
||||
pl[0] = pp;
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
error = -zfs_getpage(ip, pl, 1);
|
||||
int error = -zfs_getpage(pp->mapping->host, pp);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
if (error) {
|
||||
SetPageError(pp);
|
||||
ClearPageUptodate(pp);
|
||||
} else {
|
||||
ClearPageError(pp);
|
||||
SetPageUptodate(pp);
|
||||
flush_dcache_page(pp);
|
||||
}
|
||||
|
||||
unlock_page(pp);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
|
@ -687,7 +687,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
|
||||
tags = ['functional', 'migration']
|
||||
|
||||
[tests/functional/mmap]
|
||||
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
|
||||
tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
|
||||
'mmap_sync_001_pos', 'mmap_write_001_pos']
|
||||
tags = ['functional', 'mmap']
|
||||
|
||||
[tests/functional/mount]
|
||||
|
@ -59,7 +59,7 @@ main(int argc, char *argv[])
|
||||
return (1);
|
||||
}
|
||||
|
||||
int run_time_mins = 5;
|
||||
int run_time_mins = 1;
|
||||
if (argc >= 2) {
|
||||
run_time_mins = atoi(argv[1]);
|
||||
}
|
||||
|
@ -1491,6 +1491,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/migration/setup.ksh \
|
||||
functional/mmap/cleanup.ksh \
|
||||
functional/mmap/mmap_libaio_001_pos.ksh \
|
||||
functional/mmap/mmap_mixed.ksh \
|
||||
functional/mmap/mmap_read_001_pos.ksh \
|
||||
functional/mmap/mmap_seek_001_pos.ksh \
|
||||
functional/mmap/mmap_sync_001_pos.ksh \
|
||||
|
86
tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh
Executable file
86
tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh
Executable file
@ -0,0 +1,86 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2023 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/mmap/mmap.cfg
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify mixed buffered and mmap IO.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create an empty file.
|
||||
# 2. Start a background buffered read/write fio to the file.
|
||||
# 3. Start a background mmap read/write fio to the file.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must rm -f "$tmp_file"
|
||||
}
|
||||
|
||||
log_assert "Verify mixed buffered and mmap IO"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
tmp_file=$mntpnt/file
|
||||
bs=$((128 * 1024))
|
||||
blocks=64
|
||||
size=$((bs * blocks))
|
||||
runtime=60
|
||||
|
||||
log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks
|
||||
|
||||
# Buffered IO writes
|
||||
log_must eval "fio --filename=$tmp_file --name=buffer-write \
|
||||
--rw=randwrite --size=$size --bs=$bs --direct=0 --numjobs=1 \
|
||||
--ioengine=sync --fallocate=none --group_reporting --minimal \
|
||||
--runtime=$runtime --time_based --norandommap &"
|
||||
|
||||
# Buffered IO reads
|
||||
log_must eval "fio --filename=$tmp_file --name=buffer-read \
|
||||
--rw=randread --size=$size --bs=$bs --direct=0 --numjobs=1 \
|
||||
--ioengine=sync --fallocate=none --group_reporting --minimal \
|
||||
--runtime=$runtime --time_based --norandommap &"
|
||||
|
||||
# mmap IO writes
|
||||
log_must eval "fio --filename=$tmp_file --name=mmap-write \
|
||||
--rw=randwrite --size=$size --bs=$bs --numjobs=1 \
|
||||
--ioengine=mmap --fallocate=none --group_reporting --minimal \
|
||||
--runtime=$runtime --time_based --norandommap &"
|
||||
|
||||
# mmap IO reads
|
||||
log_must eval "fio --filename=$tmp_file --name=mmap-read \
|
||||
--rw=randread --size=$size --bs=$bs --numjobs=1 \
|
||||
--ioengine=mmap --fallocate=none --group_reporting --minimal \
|
||||
--runtime=$runtime --time_based --norandommap &"
|
||||
|
||||
log_must wait
|
||||
|
||||
log_pass "Verfied mixed buffered and mmap IO"
|
Loading…
Reference in New Issue
Block a user