diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 197ea9bec..1caec0ef4 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -72,7 +72,7 @@ extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); -extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_getpage(struct inode *ip, struct page *pp); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync); extern int zfs_dirty_inode(struct inode *ip, int flags); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 302a88c2d..9904807fa 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) } #if defined(_KERNEL) + +static int zfs_fillpage(struct inode *ip, struct page *pp); + /* * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. + * between the DMU cache and the memory mapped pages. Update all mapped + * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; + struct address_space *mp = ZTOI(zp)->i_mapping; + int64_t off = start & (PAGE_SIZE - 1); - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); + uint64_t nbytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); - pb = kmap(pp); - (void) dmu_read(os, zp->z_id, start + off, nbytes, - pb + off, DMU_READ_PREFETCH); + void *pb = kmap(pp); + int error = dmu_read(os, zp->z_id, start + off, + nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + } - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); unlock_page(pp); put_page(pp); } @@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } /* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * When a file is memory mapped, we must keep the I/O data synchronized + * between the DMU cache and the memory mapped pages. Preferentially read + * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; - struct page *pp; - int64_t start, off; - uint64_t bytes; + int64_t start = uio->uio_loffset; + int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; - void *pb; - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); + uint64_t bytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { - ASSERT(PageUptodate(pp)); + /* + * If filemap_fault() retries there exists a window + * where the page will be unlocked and not up to date. + * In this case we must try and fill the page. + */ + if (unlikely(!PageUptodate(pp))) { + error = zfs_fillpage(ip, pp); + if (error) { + unlock_page(pp); + put_page(pp); + return (error); + } + } + + ASSERT(PageUptodate(pp) || PageDirty(pp)); + unlock_page(pp); - pb = kmap(pp); + void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); @@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) len -= bytes; off = 0; + if (error) break; } + return (error); } #endif /* _KERNEL */ @@ -3959,55 +3970,41 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_fillpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; - - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; if (io_off + io_len > i_size) io_len = i_size - io_off; - /* - * Iterate over list of pages and read each page individually. - */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; + void *va = kmap(pp); + int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + PAGE_SIZE, va, DMU_READ_PREFETCH); + kunmap(pp); - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); } - return (0); + return (error); } /* - * Uses zfs_fillpage to read data from the file and fill the pages. + * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read + * pp - page to read * * RETURN: 0 on success, error code on failure. * @@ -4015,24 +4012,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) * vp - atime updated */ int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_getpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; + znode_t *zp = ITOZ(ip); + int error; - if (pl == NULL) - return (0); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); - if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) - return (err); - - err = zfs_fillpage(ip, pl, nr_pages); - - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); + error = zfs_fillpage(ip, pp); + if (error == 0) + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); - return (err); + + return (error); } /* diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index e42b15042..0a50f80ea 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) static inline int zpl_readpage_common(struct page *pp) { - struct inode *ip; - struct page *pl[1]; - int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); - ip = pp->mapping->host; - pl[0] = pp; cookie = spl_fstrans_mark(); - error = -zfs_getpage(ip, pl, 1); + int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); - if (error) { - SetPageError(pp); - ClearPageUptodate(pp); - } else { - ClearPageError(pp); - SetPageUptodate(pp); - flush_dcache_page(pp); - } - unlock_page(pp); + return (error); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7a7cf927c..ed6ec4205 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -687,7 +687,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', tags = ['functional', 'migration'] [tests/functional/mmap] -tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos'] +tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', + 'mmap_sync_001_pos', 'mmap_write_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] diff --git a/tests/zfs-tests/cmd/mmap_sync.c b/tests/zfs-tests/cmd/mmap_sync.c index e4d190aef..226e71be2 100644 --- a/tests/zfs-tests/cmd/mmap_sync.c +++ b/tests/zfs-tests/cmd/mmap_sync.c @@ -59,7 +59,7 @@ main(int argc, char *argv[]) return (1); } - int run_time_mins = 5; + int run_time_mins = 1; if (argc >= 2) { run_time_mins = atoi(argv[1]); } diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ad2ec4670..1a039742a 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1491,6 +1491,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ + functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh new file mode 100755 index 000000000..6c8246d48 --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmap/mmap_mixed.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmap/mmap.cfg + +# +# DESCRIPTION: +# Verify mixed buffered and mmap IO. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background buffered read/write fio to the file. +# 3. Start a background mmap read/write fio to the file. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" +} + +log_assert "Verify mixed buffered and mmap IO" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +log_must dd if=/dev/zero of=$tmp_file bs=$bs count=$blocks + +# Buffered IO writes +log_must eval "fio --filename=$tmp_file --name=buffer-write \ + --rw=randwrite --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# Buffered IO reads +log_must eval "fio --filename=$tmp_file --name=buffer-read \ + --rw=randread --size=$size --bs=$bs --direct=0 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=randwrite --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap IO reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=randread --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +log_must wait + +log_pass "Verfied mixed buffered and mmap IO"