Fix dnode allocation race

When performing concurrent object allocations using the new
multi-threaded allocator and large dnodes it's possible to
allocate overlapping large dnodes.

This case should have been handled by detecting an error
returned by dnode_hold_impl().  But that logic only checked
the returned dnp was not-NULL, and the dnp variable was not
reset to NULL when retrying.  Resolve this issue by properly
checking the return value of dnode_hold_impl().

Additionally, it was possible that dnode_hold_impl() would
misreport a dnode as free when it was in fact in use.  This
could occurs for two reasons:

* The per-slot zrl_lock must be held over the entire critical
  section which includes the alloc/free until the new dnode
  is assigned to children_dnodes.  Additionally, all of the
  zrl_lock's in the range must be held to protect moving
  dnodes.

* The dn->dn_ot_type cannot be solely relied upon to check
  the type.  When allocating a new dnode its type will be
  DMU_OT_NONE after dnode_create().  Only latter when
  dnode_allocate() is called will it transition to the new
  type.  This means there's a window when allocating where
  it can mistaken for a free dnode.

Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #6414 
Closes #6439
This commit is contained in:
Brian Behlendorf 2017-08-08 08:38:53 -07:00 committed by GitHub
parent d19a6d5c80
commit 9631681b75
8 changed files with 136 additions and 68 deletions

View File

@ -61,6 +61,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
boolean_t restarted = B_FALSE;
uint64_t *cpuobj = NULL;
int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
int error;
kpreempt_disable();
cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
@ -129,7 +130,6 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
uint64_t offset;
uint64_t blkfill;
int minlvl;
int error;
if (os->os_rescan_dnodes) {
offset = 0;
os->os_rescan_dnodes = B_FALSE;
@ -163,9 +163,9 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
(void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
dn_slots, FTAG, &dn);
if (dn != NULL) {
if (error == 0) {
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/*
* Another thread could have allocated it; check

View File

@ -1242,11 +1242,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
tx->tx_objset, object, THT_SPILL, 0, 0);
dmu_tx_hold_t *txh;
(void) refcount_add_many(&txh->txh_space_towrite,
SPA_OLD_MAXBLOCKSIZE, FTAG);
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
THT_SPILL, 0, 0);
if (txh != NULL)
(void) refcount_add_many(&txh->txh_space_towrite,
SPA_OLD_MAXBLOCKSIZE, FTAG);
}
void

View File

@ -391,7 +391,7 @@ dnode_setdblksz(dnode_t *dn, int size)
}
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn;
@ -424,11 +424,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
if (slots && dn->dn_type == DMU_OT_NONE)
dn->dn_num_slots = slots;
else
dn->dn_num_slots = dnp->dn_extra_slots + 1;
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
@ -1011,7 +1015,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
{
dnode_t *dn;
dn = dnode_create(os, dnp, NULL, object, dnh);
dn = dnode_create(os, dnp, 0, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
}
@ -1062,36 +1066,26 @@ dnode_buf_evict_async(void *dbu)
*
* The dnode_phys_t buffer may not be in sync with the in-core dnode
* structure, so we try to check the dnode structure first and fall back
* to the dnode_phys_t buffer it doesn't exist.
* to the dnode_phys_t buffer it doesn't exist. When an in-code dnode
* exists we can always trust dn->dn_num_slots to be accurate, even for
* a held dnode which has not yet been fully allocated.
*/
static boolean_t
dnode_is_consumed(dmu_buf_impl_t *db, int idx)
dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
dnode_handle_t *dnh;
dmu_object_type_t ot;
dnode_children_t *children_dnodes;
dnode_phys_t *dn_block;
int skip;
int i;
children_dnodes = dmu_buf_get_user(&db->db);
dn_block = (dnode_phys_t *)db->db.db_data;
int skip, i;
for (i = 0; i < idx; i += skip) {
dnh = &children_dnodes->dnc_children[i];
dnode_handle_t *dnh = &children->dnc_children[i];
zrl_add(&dnh->dnh_zrlock);
if (dnh->dnh_dnode != NULL) {
ot = dnh->dnh_dnode->dn_type;
skip = dnh->dnh_dnode->dn_num_slots;
} else {
ot = dn_block[i].dn_type;
skip = dn_block[i].dn_extra_slots + 1;
if (dn_block[i].dn_type != DMU_OT_NONE)
skip = dn_block[i].dn_extra_slots + 1;
else
skip = 1;
}
zrl_remove(&dnh->dnh_zrlock);
if (ot == DMU_OT_NONE)
skip = 1;
}
return (i > idx);
@ -1107,27 +1101,19 @@ dnode_is_consumed(dmu_buf_impl_t *db, int idx)
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
dnode_is_allocated(dmu_buf_impl_t *db, int idx)
dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
dnode_handle_t *dnh;
dmu_object_type_t ot;
dnode_children_t *children_dnodes;
dnode_phys_t *dn_block;
if (dnode_is_consumed(db, idx))
if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);
children_dnodes = dmu_buf_get_user(&db->db);
dn_block = (dnode_phys_t *)db->db.db_data;
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
dnh = &children->dnc_children[idx];
if (dnh->dnh_dnode != NULL)
ot = dnh->dnh_dnode->dn_type;
else
ot = dn_block[idx].dn_type;
zrl_remove(&dnh->dnh_zrlock);
return (ot != DMU_OT_NONE);
}
@ -1142,32 +1128,27 @@ dnode_is_allocated(dmu_buf_impl_t *db, int idx)
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
int slots)
{
dnode_handle_t *dnh;
dmu_object_type_t ot;
dnode_children_t *children_dnodes;
dnode_phys_t *dn_block;
int i;
if (idx + slots > DNODES_PER_BLOCK)
return (B_FALSE);
children_dnodes = dmu_buf_get_user(&db->db);
dn_block = (dnode_phys_t *)db->db.db_data;
if (dnode_is_consumed(db, idx))
if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);
for (i = idx; i < idx + slots; i++) {
dnh = &children_dnodes->dnc_children[i];
for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
dmu_object_type_t ot;
if (dnh->dnh_dnode != NULL) {
if (dnh->dnh_dnode->dn_num_slots > 1)
return (B_FALSE);
zrl_add(&dnh->dnh_zrlock);
if (dnh->dnh_dnode != NULL)
ot = dnh->dnh_dnode->dn_type;
else
} else {
ot = dn_block[i].dn_type;
zrl_remove(&dnh->dnh_zrlock);
}
if (ot != DMU_OT_NONE)
return (B_FALSE);
@ -1176,6 +1157,24 @@ dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
return (B_TRUE);
}
static void
dnode_hold_slots(dnode_children_t *children, int idx, int slots)
{
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
zrl_add(&dnh->dnh_zrlock);
}
}
static void
dnode_rele_slots(dnode_children_t *children, int idx, int slots)
{
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
zrl_remove(&dnh->dnh_zrlock);
}
}
/*
* errors:
* EINVAL - invalid object number.
@ -1286,36 +1285,42 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
idx = object & (epb - 1);
dn_block_begin = (dnode_phys_t *)db->db.db_data;
if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
dnode_hold_slots(children_dnodes, idx, slots);
if ((flag & DNODE_MUST_BE_FREE) &&
!dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
} else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
!dnode_is_allocated(db, idx)) {
!dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOENT));
}
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
dn = dnh->dnh_dnode;
if (dn == NULL)
dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
dn = dnode_create(os, dn_block_begin + idx, slots, db,
object, dnh);
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
}
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
mutex_exit(&dn->dn_mtx);
/* Now we can rely on the hold to prevent the dnode from moving. */
zrl_remove(&dnh->dnh_zrlock);
dnode_rele_slots(children_dnodes, idx, slots);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);

View File

@ -675,7 +675,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
dmu_buf_t *db;
mzap_phys_t *zp;
VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
#ifdef ZFS_DEBUG
{

View File

@ -763,7 +763,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
zh = zfs_znode_hold_enter(zfsvfs, obj);
VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer

View File

@ -364,7 +364,7 @@ tests = ['async_destroy_001_pos']
[tests/functional/features/large_dnode]
tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
'large_dnode_007_neg']
'large_dnode_007_neg', 'large_dnode_008_pos']
[tests/functional/grow_pool]
tests = ['grow_pool_001_pos']

View File

@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \
large_dnode_004_neg.ksh \
large_dnode_005_pos.ksh \
large_dnode_006_pos.ksh \
large_dnode_007_neg.ksh
large_dnode_007_neg.ksh \
large_dnode_008_pos.ksh

View File

@ -0,0 +1,60 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
# Use is subject to license terms.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Run many xattrtests on a dataset with large dnodes and xattr=sa to
# stress concurrent allocation of large dnodes.
#
TEST_FS=$TESTPOOL/large_dnode
verify_runnable "both"
function cleanup
{
datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
}
log_onexit cleanup
log_assert "xattrtest runs concurrently on dataset with large dnodes"
log_must zfs create $TEST_FS
log_must zfs set dnsize=auto $TEST_FS
log_must zfs set xattr=sa $TEST_FS
for ((i=0; i < 100; i++)); do
dir="/$TEST_FS/dir.$i"
log_must mkdir "$dir"
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
done
log_must wait
log_pass