Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs simultaneously

Page writebacks with WB_SYNC_NONE can take several seconds to complete 
since they wait for the transaction group to close before being 
committed. This is usually not a problem since the caller does not 
need to wait. However, if we're simultaneously doing a writeback 
with WB_SYNC_ALL (e.g. via msync), the latter can block for several 
seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE 
writeback since it needs to wait for the transaction to complete 
and the PG_writeback bit to be cleared.

This commit deals with 2 cases:

- No page writeback is active. A WB_SYNC_ALL page writeback starts 
  and even completes. But when it's about to check if the PG_writeback 
  bit has been cleared, another writeback with WB_SYNC_NONE starts. 
  The sync page writeback ends up waiting for the non-sync page 
  writeback to complete.

- A page writeback with WB_SYNC_NONE is already active when a 
  WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up 
  waiting for the WB_SYNC_NONE writeback.

The fix works by carefully keeping track of active sync/non-sync 
writebacks and committing when beneficial.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Shaan Nobee <sniper111@gmail.com>
Closes #12662
Closes #12790
This commit is contained in:
Shaan Nobee
2022-05-04 00:23:26 +04:00
committed by GitHub
parent a64d757aa4
commit 411f4a018d
17 changed files with 351 additions and 20 deletions
+1 -1
View File
@@ -681,7 +681,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
tags = ['functional', 'migration']
[tests/functional/mmap]
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
tags = ['functional', 'mmap']
[tests/functional/mount]
+1
View File
@@ -167,6 +167,7 @@ if sys.platform.startswith('freebsd'):
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
})
elif sys.platform.startswith('linux'):
known.update({
+1
View File
@@ -18,6 +18,7 @@
/mmap_exec
/mmap_libaio
/mmap_seek
/mmap_sync
/mmapwrite
/nvlist_to_lua
/randfree_file
+2 -1
View File
@@ -80,9 +80,10 @@ mkfiles_SOURCES = mkfiles.c
mktree_SOURCES = mktree.c
pkgexec_PROGRAMS += mmap_exec mmap_seek mmapwrite readmmap
pkgexec_PROGRAMS += mmap_exec mmap_seek mmap_sync mmapwrite readmmap
mmap_exec_SOURCES = mmap_exec.c
mmap_seek_SOURCES = mmap_seek.c
mmap_sync_SOURCES = mmap_sync.c
mmapwrite_SOURCES = mmapwrite.c
mmapwrite_LDADD = -lpthread
readmmap_SOURCES = readmmap.c
+152
View File
@@ -0,0 +1,152 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
/*
 * Delete the test file at the given path. The result of remove() is
 * intentionally discarded, so calling this on a path that does not
 * exist is harmless.
 */
static void
cleanup(char *file)
{
	(void) remove(file);
}
/*
 * Repeatedly dirty a small MAP_SHARED file mapping and msync() it,
 * failing as soon as a single msync() call takes longer than allowed.
 *
 * Usage: <prog> [run time in mins] [max msync time in ms]
 *
 * The test file is created as $TESTDIR/msync_file (TESTDIR must be set
 * in the environment; the directory is created if stat() on it fails).
 * Defaults are a 5 minute run and a 1000 ms limit per msync().
 *
 * Returns 0 when the whole run completes within the limit, 1 on any
 * setup error, msync()/munmap() failure, or slow msync().
 */
int
main(int argc, char *argv[])
{
	char *testdir = getenv("TESTDIR");
	if (!testdir) {
		fprintf(stderr, "environment variable TESTDIR not set\n");
		return (1);
	}

	struct stat st;
	umask(0);
	if (stat(testdir, &st) != 0 &&
	    mkdir(testdir, 0777) != 0) {
		perror("mkdir");
		return (1);
	}

	if (argc > 3) {
		fprintf(stderr, "usage: %s "
		    "[run time in mins] "
		    "[max msync time in ms]\n", argv[0]);
		return (1);
	}

	int run_time_mins = 5;
	if (argc >= 2) {
		run_time_mins = atoi(argv[1]);
	}

	int max_msync_time_ms = 1000;
	if (argc >= 3) {
		max_msync_time_ms = atoi(argv[2]);
	}

	/*
	 * Build the path with a bounded formatter: TESTDIR comes from the
	 * environment and may be longer than the buffer.
	 */
	char filepath[512];
	int pathlen = snprintf(filepath, sizeof (filepath), "%s/msync_file",
	    testdir);
	if (pathlen < 0 || (size_t)pathlen >= sizeof (filepath)) {
		fprintf(stderr, "%s: TESTDIR path is too long\n", argv[0]);
		return (1);
	}
	char *file = filepath;

	const int LEN = 8;

	/* Start from a clean slate in case an earlier run left the file. */
	cleanup(file);

	int fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR |
	    S_IRGRP | S_IROTH);
	if (fd == -1) {
		(void) fprintf(stderr, "%s: %s: ", argv[0], file);
		perror("open");
		return (1);
	}

	if (ftruncate(fd, LEN) != 0) {
		perror("ftruncate");
		(void) close(fd);
		cleanup(file);
		return (1);
	}

	void *ptr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ptr == MAP_FAILED) {
		perror("mmap");
		(void) close(fd);
		cleanup(file);
		return (1);
	}

	/* Computed in double so a large minute count cannot overflow int. */
	const double run_time_ms = run_time_mins * 60.0 * 1000.0;

	struct timeval tstart;
	gettimeofday(&tstart, NULL);

	int rc = 0;
	long long x = 0LL;

	for (;;) {
		/* Dirty the page so that every msync() has work to do. */
		*((long long *)ptr) = x;
		x++;

		struct timeval t1, t2;
		gettimeofday(&t1, NULL);
		if (msync(ptr, LEN, MS_SYNC|MS_INVALIDATE) != 0) {
			perror("msync");
			rc = 1;
			break;
		}
		gettimeofday(&t2, NULL);

		/* Duration of this msync() call, in milliseconds. */
		double elapsed = (t2.tv_sec - t1.tv_sec) * 1000.0;
		elapsed += ((t2.tv_usec - t1.tv_usec) / 1000.0);
		if (elapsed > max_msync_time_ms) {
			fprintf(stderr, "slow msync: %f ms\n", elapsed);
			rc = 1;
			break;
		}

		/* Total time since the loop started, in milliseconds. */
		double elapsed_start = (t2.tv_sec - tstart.tv_sec) * 1000.0;
		elapsed_start += ((t2.tv_usec - tstart.tv_usec) / 1000.0);
		if (elapsed_start > run_time_ms) {
			break;
		}
	}

	/* Single teardown path: unmap, close and delete on every exit. */
	if (munmap(ptr, LEN) != 0) {
		perror("munmap");
		rc = 1;
	}

	/* A close() failure is reported but does not fail the test. */
	if (close(fd) != 0) {
		perror("close");
	}

	cleanup(file);
	return (rc);
}
+1
View File
@@ -193,6 +193,7 @@ export ZFSTEST_FILES='badsend
mmap_exec
mmap_libaio
mmap_seek
mmap_sync
mmapwrite
nvlist_to_lua
randfree_file
@@ -5,7 +5,8 @@ dist_pkgdata_SCRIPTS = \
mmap_read_001_pos.ksh \
mmap_write_001_pos.ksh \
mmap_libaio_001_pos.ksh \
mmap_seek_001_pos.ksh
mmap_seek_001_pos.ksh \
mmap_sync_001_pos.ksh
dist_pkgdata_DATA = \
mmap.cfg
@@ -0,0 +1,63 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# msync()s of mmap()'ed file should complete quickly during
# background dirty page writebacks by the kernel.
#
#
# Restore the three /proc/sys/vm writeback tunables handled by this test:
# dirty_expire_centisecs, dirty_background_ratio and
# dirty_writeback_centisecs. Registered with log_onexit, so it relies on
# the saved_vm_* variables having been read before it runs.
#
function cleanup
{
# First write back whatever values were captured in the saved_vm_* variables.
log_must eval "echo $saved_vm_dirty_expire_centisecs > /proc/sys/vm/dirty_expire_centisecs"
log_must eval "echo $saved_vm_dirty_background_ratio > /proc/sys/vm/dirty_background_ratio"
log_must eval "echo $saved_vm_dirty_writeback_centisecs > /proc/sys/vm/dirty_writeback_centisecs"
# revert to some sensible defaults if the values we saved
# were incorrect due to a previous run being interrupted
#
# A value of 1 is what this test itself writes to dirty_expire_centisecs,
# so reading 1 back after the restore means the saved value was a leftover.
if [ $(</proc/sys/vm/dirty_expire_centisecs) -eq 1 ]; then
log_must eval "echo 3000 > /proc/sys/vm/dirty_expire_centisecs"
fi
# NOTE(review): the test writes dirty_background_bytes, not the ratio;
# presumably the kernel then reports dirty_background_ratio as 0, which is
# why 0 is treated as the leftover value here - confirm against the kernel
# sysctl/vm documentation.
if [ $(</proc/sys/vm/dirty_background_ratio) -eq 0 ]; then
log_must eval "echo 10 > /proc/sys/vm/dirty_background_ratio"
fi
# Same leftover check as above: 1 is the value the test writes to
# dirty_writeback_centisecs.
if [ $(</proc/sys/vm/dirty_writeback_centisecs) -eq 1 ]; then
log_must eval "echo 500 > /proc/sys/vm/dirty_writeback_centisecs"
fi
}
# The test drives tunables under /proc/sys/vm, so bail out on other platforms.
if ! is_linux; then
log_unsupported "Only supported on Linux, requires /proc/sys/vm/ tunables"
fi
# Make sure the tunables are put back however the test ends.
log_onexit cleanup
log_assert "Run the tests for mmap_sync"
# Capture the current values so that cleanup can restore them.
read -r saved_vm_dirty_expire_centisecs < /proc/sys/vm/dirty_expire_centisecs
read -r saved_vm_dirty_background_ratio < /proc/sys/vm/dirty_background_ratio
read -r saved_vm_dirty_writeback_centisecs < /proc/sys/vm/dirty_writeback_centisecs
# Set each tunable to 1; presumably this makes the kernel start background
# writeback of dirty pages almost immediately, which is the condition the
# test wants msync() to run under.
# NOTE(review): dirty_background_bytes is written here, while the value
# saved above and restored by cleanup is dirty_background_ratio. If the host
# was configured through dirty_background_bytes, that setting is presumably
# not restored - confirm.
log_must eval "echo 1 > /proc/sys/vm/dirty_expire_centisecs"
log_must eval "echo 1 > /proc/sys/vm/dirty_background_bytes"
log_must eval "echo 1 > /proc/sys/vm/dirty_writeback_centisecs"
# Run the mmap_sync helper with no arguments; the test passes only if it
# exits successfully.
log_must mmap_sync
log_pass "mmap_sync tests passed."