Fix size inflation in spa_get_worst_case_asize()

When we try to assign a new transaction to a TXG we must know beforehand
if there is sufficient free space on disk. This is to decide,
in dmu_tx_assign(), if we should reject the TX with ENOSPC.

We rely on spa_get_worst_case_asize() to inflate the size of our
logical writes by a factor of spa_asize_inflation which is
calculated as:

   (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24

The problem with the current implementation is that we don't take
into account what happens with very small writes on VDEVs with large
physical block sizes.
Consider the case of writes to a dataset with recordsize=512,
copies=3 on a VDEV with ashift=13 (usually SSD with 8K block size):
every logical IO will end up allocating 3 * 8K = 24K on disk, so 512
bytes multiplied by 48, which is double the size we account for.
If we allow this kind of write to be assigned a TX it is possible,
when the pool is almost full, to trigger an allocation failure
(ENOSPC) in the ZIO pipeline, which will in turn result in the whole
pool being suspended.

The bug is fixed by using, in spa_get_worst_case_asize(), the MAX()
value chosen between the logical io size from zfs_write() and the
maximum physical block size used among our VDEVs.

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #5941
This commit is contained in:
LOLi 2017-04-11 00:28:21 +02:00 committed by Brian Behlendorf
parent 8542ef852a
commit 047187c1bd
5 changed files with 87 additions and 4 deletions

View File

@ -1624,11 +1624,19 @@ spa_freeze_txg(spa_t *spa)
return (spa->spa_freeze_txg);
}
/*
 * Return the inflated asize for a logical write in bytes. This is used by the
 * DMU to calculate the space a logical write will require on disk.
 * If lsize is smaller than the largest physical block size allocatable on this
 * pool we use its value instead, since the write will end up using the whole
 * block anyway.
 *
 * spa:   pool whose spa_max_ashift gives the largest physical block size.
 * lsize: logical size of the write in bytes.
 *
 * Returns 0 when lsize is 0, otherwise
 * MAX(lsize, 1 << spa_max_ashift) * spa_asize_inflation.
 */
uint64_t
spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
{
	if (lsize == 0)
		return (0);	/* No inflation needed */

	/*
	 * Build the block size from a 64-bit one so the shift and the MAX()
	 * comparison are both done in the same unsigned type as lsize.
	 */
	return (MAX(lsize, 1ULL << spa->spa_max_ashift) * spa_asize_inflation);
}
/*

View File

@ -494,7 +494,7 @@ tests = ['mv_files_001_pos', 'mv_files_002_pos']
tests = ['nestedfs_001_pos']
[tests/functional/no_space]
tests = ['enospc_001_pos']
tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos']
# DISABLED:
# nopwrite_volume - https://github.com/zfsonlinux/zfs/issues/5510

View File

@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
enospc_001_pos.ksh \
enospc_002_pos.ksh
enospc_002_pos.ksh \
enospc_003_pos.ksh

View File

View File

@ -0,0 +1,74 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2017, loli10K. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/no_space/enospc.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib
#
# DESCRIPTION:
# ENOSPC is returned on pools with large physical block size and small
# recordsize.
#
# STRATEGY:
# 1. Create a pool with property ashift=13 (8K block size)
# 2. Set property recordsize=512 and copies=3 on the root dataset
# 3. Write a file until the file system is full
# 4. Verify the return code is ENOSPC
#
# NOTE(review): "both" presumably tells the harness this test may run in
# either of the contexts it distinguishes -- confirm against verify_runnable
# in libtest.shlib.
verify_runnable "both"
#
# Exit handler: tear down the test pool and remove its backing file.
#
# The destroy is only attempted when the pool actually exists. If an earlier
# step (mkfile or zpool create) failed there is nothing to destroy, and an
# unconditional "log_must zpool destroy" would fail before the backing file
# is removed.
#
function cleanup
{
	if zpool list $TESTPOOL1 >/dev/null 2>&1; then
		log_must zpool destroy $TESTPOOL1
	fi
	log_must rm -f $disk
}
# Register cleanup as the exit handler (by its name, log_onexit runs it when
# the test exits -- confirm in the harness).
log_onexit cleanup
log_assert "ENOSPC is returned on pools with large physical block size"
# File used as the pool's only vdev; cleanup removes it through $disk.
disk=$TEST_BASE_DIR/$FILEDISK0
# we need a device big enough to test this or failure will not trigger
size="512m"
log_must mkfile $size $disk
# STRATEGY step 1: ashift=13 gives the 8K physical block size.
log_must zpool create $TESTPOOL1 -o ashift=13 $disk
log_must zfs set mountpoint=$TESTDIR $TESTPOOL1
# NOTE(review): compression is presumably disabled so the written data is
# not shrunk before allocation -- confirm against $DATA in enospc.cfg.
log_must zfs set compression=off $TESTPOOL1
# STRATEGY step 2: smallest records, three copies of each.
log_must zfs set recordsize=512 $TESTPOOL1
log_must zfs set copies=3 $TESTPOOL1
log_note "Writing file: $TESTFILE0 until ENOSPC."
# Not wrapped in log_must: the write is expected to fail, and its exit status
# is inspected below.
file_write -o create -f $TESTDIR/$TESTFILE0 -b $BLOCKSZ \
-c $NUM_WRITES -d $DATA
# Capture the status immediately, before another command overwrites $?.
ret=$?
# STRATEGY step 4: any status other than $ENOSPC fails the test.
(( $ret != $ENOSPC )) && \
log_fail "$TESTFILE0 returned: $ret rather than ENOSPC."
log_pass "ENOSPC returned as expected."