Fix buffered/direct/mmap I/O race

When a page is faulted in for memory mapped I/O the page lock may be dropped before it has been read and marked up to date. If a buffered read encounters such a page in mappedread() it must wait until the page has been updated. Failure to do so will result in a panic on debug builds and incorrect data on production builds. The critical part of this change is in mappedread() where pages which are not up to date are now handled. Additionally, it includes the following simplifications. - zfs_getpage() and zfs_fillpage() could be passed an array of pages. This could be more efficient if it was used but in practice only a single page was ever provided. These interfaces were simplified to acknowledge that. - update_pages() was modified to correctly set the PG_error bit on a page when it cannot be read by dmu_read(). - Setting PG_error and PG_uptodate was moved to zfs_fillpage() from zpl_readpage_common(). This is consistent with the handling in update_pages() and mappedread(). - Minor additional refactoring to comments and variable declarations to improve readability. - Add a test case to exercise concurrent buffered, direct, and mmap IO to the same file. - Reduce the mmap_sync test case default run time. Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13608 Closes #14498
2026-05-22 02:27:36 +03:00 · 2023-02-23 10:57:24 -08:00
parent 7cb67d627c
commit 89cd2197b9
7 changed files with 174 additions and 104 deletions
@@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
 }

 #if defined(_KERNEL)
+
+static int zfs_fillpage(struct inode *ip, struct page *pp);
+
 /*
 * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages.  What this means:
- *
- * On Write:	If we find a memory mapped page, we write to *both*
- *		the page and the dmu buffer.
+ * between the DMU cache and the memory mapped pages.  Update all mapped
+ * pages with the contents of the coresponding dmu buffer.
 */
 void
 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 {
-	struct inode *ip = ZTOI(zp);
-	struct address_space *mp = ip->i_mapping;
-	struct page *pp;
-	uint64_t nbytes;
-	int64_t	off;
-	void *pb;
+	struct address_space *mp = ZTOI(zp)->i_mapping;
+	int64_t off = start & (PAGE_SIZE - 1);

-	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
-		nbytes = MIN(PAGE_SIZE - off, len);
+		uint64_t nbytes = MIN(PAGE_SIZE - off, len);

-		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);

-			pb = kmap(pp);
-			(void) dmu_read(os, zp->z_id, start + off, nbytes,
-			    pb + off, DMU_READ_PREFETCH);
+			void *pb = kmap(pp);
+			int error = dmu_read(os, zp->z_id, start + off,
+			    nbytes, pb + off, DMU_READ_PREFETCH);
 			kunmap(pp);

-			if (mapping_writably_mapped(mp))
-				flush_dcache_page(pp);
+			if (error) {
+				SetPageError(pp);
+				ClearPageUptodate(pp);
+			} else {
+				ClearPageError(pp);
+				SetPageUptodate(pp);
+
+				if (mapping_writably_mapped(mp))
+					flush_dcache_page(pp);
+
+				mark_page_accessed(pp);
+			}

-			mark_page_accessed(pp);
-			SetPageUptodate(pp);
-			ClearPageError(pp);
 			unlock_page(pp);
 			put_page(pp);
 		}
@@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 }

 /*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages.  What this means:
- *
- * On Read:	We "read" preferentially from memory mapped pages,
- *		else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- *	 the file is memory mapped.
+ * When a file is memory mapped, we must keep the I/O data synchronized
+ * between the DMU cache and the memory mapped pages.  Preferentially read
+ * from memory mapped pages, otherwise fallback to reading through the dmu.
 */
 int
 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	struct inode *ip = ZTOI(zp);
 	struct address_space *mp = ip->i_mapping;
-	struct page *pp;
-	int64_t	start, off;
-	uint64_t bytes;
+	int64_t start = uio->uio_loffset;
+	int64_t off = start & (PAGE_SIZE - 1);
 	int len = nbytes;
 	int error = 0;
-	void *pb;

-	start = uio->uio_loffset;
-	off = start & (PAGE_SIZE-1);
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
-		bytes = MIN(PAGE_SIZE - off, len);
+		uint64_t bytes = MIN(PAGE_SIZE - off, len);

-		pp = find_lock_page(mp, start >> PAGE_SHIFT);
+		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
-			ASSERT(PageUptodate(pp));
+			/*
+			 * If filemap_fault() retries there exists a window
+			 * where the page will be unlocked and not up to date.
+			 * In this case we must try and fill the page.
+			 */
+			if (unlikely(!PageUptodate(pp))) {
+				error = zfs_fillpage(ip, pp);
+				if (error) {
+					unlock_page(pp);
+					put_page(pp);
+					return (error);
+				}
+			}
+
+			ASSERT(PageUptodate(pp) || PageDirty(pp));
+
 			unlock_page(pp);

-			pb = kmap(pp);
+			void *pb = kmap(pp);
 			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 			kunmap(pp);

@@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)

 		len -= bytes;
 		off = 0;
+
 		if (error)
 			break;
 	}
+
 	return (error);
 }
 #endif /* _KERNEL */
@@ -3959,55 +3970,41 @@ zfs_inactive(struct inode *ip)
 * Fill pages with data from the disk.
 */
 static int
-zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_fillpage(struct inode *ip, struct page *pp)
 {
-	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
-	objset_t *os;
-	struct page *cur_pp;
-	u_offset_t io_off, total;
-	size_t io_len;
-	loff_t i_size;
-	unsigned page_idx;
-	int err;
-
-	os = zfsvfs->z_os;
-	io_len = nr_pages << PAGE_SHIFT;
-	i_size = i_size_read(ip);
-	io_off = page_offset(pl[0]);
+	loff_t i_size = i_size_read(ip);
+	u_offset_t io_off = page_offset(pp);
+	size_t io_len = PAGE_SIZE;

 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;

-	/*
-	 * Iterate over list of pages and read each page individually.
-	 */
-	page_idx = 0;
-	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
-		caddr_t va;
+	void *va = kmap(pp);
+	int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+	    PAGE_SIZE, va, DMU_READ_PREFETCH);
+	kunmap(pp);

-		cur_pp = pl[page_idx++];
-		va = kmap(cur_pp);
-		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
-		    DMU_READ_PREFETCH);
-		kunmap(cur_pp);
-		if (err) {
-			/* convert checksum errors into IO errors */
-			if (err == ECKSUM)
-				err = SET_ERROR(EIO);
-			return (err);
-		}
+	if (error) {
+		/* convert checksum errors into IO errors */
+		if (error == ECKSUM)
+			error = SET_ERROR(EIO);
+
+		SetPageError(pp);
+		ClearPageUptodate(pp);
+	} else {
+		ClearPageError(pp);
+		SetPageUptodate(pp);
 	}

-	return (0);
+	return (error);
 }

 /*
- * Uses zfs_fillpage to read data from the file and fill the pages.
+ * Uses zfs_fillpage to read data from the file and fill the page.
 *
 *	IN:	ip	 - inode of file to get data from.
- *		pl	 - list of pages to read
- *		nr_pages - number of pages to read
+ *		pp	 - page to read
 *
 *	RETURN:	0 on success, error code on failure.
 *
@@ -4015,24 +4012,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
 *	vp - atime updated
 */
 int
-zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_getpage(struct inode *ip, struct page *pp)
 {
-	znode_t	 *zp  = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
-	int	 err;
+	znode_t *zp = ITOZ(ip);
+	int error;

-	if (pl == NULL)
-		return (0);
+	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+		return (error);

-	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
-		return (err);
-
-	err = zfs_fillpage(ip, pl, nr_pages);
-
-	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
+	error = zfs_fillpage(ip, pp);
+	if (error == 0)
+		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

 	zfs_exit(zfsvfs, FTAG);
-	return (err);
+
+	return (error);
 }

 /*
@@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
 static inline int
 zpl_readpage_common(struct page *pp)
 {
-	struct inode *ip;
-	struct page *pl[1];
-	int error = 0;
 	fstrans_cookie_t cookie;

 	ASSERT(PageLocked(pp));
-	ip = pp->mapping->host;
-	pl[0] = pp;

 	cookie = spl_fstrans_mark();
-	error = -zfs_getpage(ip, pl, 1);
+	int error = -zfs_getpage(pp->mapping->host, pp);
 	spl_fstrans_unmark(cookie);

-	if (error) {
-		SetPageError(pp);
-		ClearPageUptodate(pp);
-	} else {
-		ClearPageError(pp);
-		SetPageUptodate(pp);
-		flush_dcache_page(pp);
-	}
-
 	unlock_page(pp);
+
 	return (error);
 }