3642 dsl_scan_active() should not issue I/O to determine if async
     destroying is active
3643 txg_delay should not hold the tc_lock
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Gordon Ross <gwr@nexenta.com>

References:
  https://www.illumos.org/issues/3642
  https://www.illumos.org/issues/3643
  illumos/illumos-gate@4a92375985

Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775

Porting Notes:

1. The padding of the tx_cpu structure assumes that a kmutex_t is
   8 bytes.  This isn't true under Linux, but tc_pad[] was adjusted
   anyway for consistency, since this structure was never carefully
   aligned in ZoL.  If careful alignment does significantly impact
   performance, this should be reworked to be portable.
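
   As a rough illustration of the "reworked to be portable" idea above
   (a sketch only, not part of this commit), the hand-tuned tc_pad[]
   could be dropped in favor of letting the compiler pad each per-cpu
   slot to a cache-line boundary:

   /*
    * Hypothetical portable layout: ____cacheline_aligned is the Linux
    * kernel attribute from <linux/cache.h>; other platforms would need
    * their own equivalent, so treat this as an assumption rather than
    * the committed code.
    */
   struct tx_cpu {
           kmutex_t        tc_open_lock;   /* protects tx_open_txg */
           kmutex_t        tc_lock;        /* protects the rest of this struct */
           kcondvar_t      tc_cv[TXG_SIZE];
           uint64_t        tc_count[TXG_SIZE];
           list_t          tc_callbacks[TXG_SIZE]; /* commit cb list */
   } ____cacheline_aligned;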

Author:       George Wilson
Authored:     2013-04-23 09:31:42 -08:00
Committed by: Brian Behlendorf
Commit:       2696dfafd9
Parent:       7ec09286b7

5 changed files with 78 additions and 14 deletions

include/sys/dsl_scan.h

@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */

 #ifndef _SYS_DSL_SCAN_H
@@ -82,6 +82,7 @@ typedef struct dsl_scan {
        /* for freeing blocks */
        boolean_t scn_is_bptree;
+       boolean_t scn_async_destroying;

        /* for debugging / information */
        uint64_t scn_visited_this_txg;

include/sys/txg_impl.h

@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */

+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_TXG_IMPL_H
 #define _SYS_TXG_IMPL_H
@@ -33,14 +37,55 @@
 extern "C" {
 #endif

+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is a per-cpu structure by design. Updates to this structure
+ * are frequent and concurrent. Having a single structure would result in
+ * heavy lock contention so a per-cpu design was implemented. With the fanned
+ * out mutex design, consumers only need to lock the mutex associated with
+ * the thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically, this is a short operation but if throttling
+ * is occurring it may be held for longer periods of time.
+ */
 struct tx_cpu {
-       kmutex_t        tc_lock;
+       kmutex_t        tc_open_lock;   /* protects tx_open_txg */
+       kmutex_t        tc_lock;        /* protects the rest of this struct */
        kcondvar_t      tc_cv[TXG_SIZE];
        uint64_t        tc_count[TXG_SIZE];
        list_t          tc_callbacks[TXG_SIZE]; /* commit cb list */
-       char            tc_pad[16];
+       char            tc_pad[8];              /* pad to fill 3 cache lines */
 };

+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transaction groups. A per pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
 typedef struct tx_state {
        tx_cpu_t        *tx_cpu;        /* protects right to enter txg */
        kmutex_t        tx_sync_lock;   /* protects tx_state_t */
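
Read alongside the new comment, the consumer-side flow it describes is roughly
the following (a simplified sketch; the functions are the real txg.c entry
points shown further down, but error handling and the quiesce thread's side
are omitted):

        txg_handle_t th;
        uint64_t txg;

        txg = txg_hold_open(dp, &th);   /* takes tc_open_lock, then bumps
                                           tc_count under a short tc_lock hold */

        /* the open txg is pinned while tc_open_lock is held */

        txg_rele_to_quiesce(&th);       /* drops tc_open_lock; txg_quiesce()
                                           may now advance tx_open_txg */

        /* ... modify in-core state bound to this txg ... */

        txg_rele_to_sync(&th);          /* drops the tc_count hold; the txg can
                                           finish quiescing once every per-cpu
                                           tc_count reaches zero */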

module/zfs/dsl_destroy.c

@@ -761,12 +761,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
        zil_destroy_sync(dmu_objset_zil(os), tx);

        if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+               dsl_scan_t *scn = dp->dp_scan;
+
                spa_feature_incr(dp->dp_spa, async_destroy, tx);
                dp->dp_bptree_obj = bptree_alloc(mos, tx);
                VERIFY0(zap_add(mos,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
                    &dp->dp_bptree_obj, tx));
+               ASSERT(!scn->scn_async_destroying);
+               scn->scn_async_destroying = B_TRUE;
        }

        used = ds->ds_dir->dd_phys->dd_used_bytes;

module/zfs/dsl_scan.c

@@ -91,6 +91,15 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
        scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
        scn->scn_dp = dp;

+       /*
+        * It's possible that we're resuming a scan after a reboot so
+        * make sure that the scan_async_destroying flag is initialized
+        * appropriately.
+        */
+       ASSERT(!scn->scn_async_destroying);
+       scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            "scrub_func", sizeof (uint64_t), 1, &f);
        if (err == 0) {
@@ -1362,13 +1371,10 @@ dsl_scan_active(dsl_scan_t *scn)
        if (spa_shutting_down(spa))
                return (B_FALSE);

-       if (scn->scn_phys.scn_state == DSS_SCANNING)
+       if (scn->scn_phys.scn_state == DSS_SCANNING ||
+           scn->scn_async_destroying)
                return (B_TRUE);
-       if (spa_feature_is_active(spa,
-           &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
-               return (B_TRUE);
-       }
+
        if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                    &used, &comp, &uncomp);
@@ -1424,6 +1430,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                if (err == 0 && spa_feature_is_active(spa,
                    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                       ASSERT(scn->scn_async_destroying);
                        scn->scn_is_bptree = B_TRUE;
                        scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                            NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1444,6 +1451,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                        VERIFY0(bptree_free(dp->dp_meta_objset,
                            dp->dp_bptree_obj, tx));
                        dp->dp_bptree_obj = 0;
+                       scn->scn_async_destroying = B_FALSE;
                }
        }

        if (scn->scn_visited_this_txg) {
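
The dsl_scan_active() change above follows a simple caching pattern: load the
on-disk feature state once in dsl_scan_init(), flip the in-core flag at the
transition points (dsl_destroy_head_sync_impl() and dsl_scan_sync()), and read
only the cached boolean on the hot path so no I/O is issued there. A generic
sketch of that pattern, with names invented purely for illustration:

        /* Illustrative only, not ZFS code. */
        typedef struct cached_flag {
                boolean_t       cf_active;      /* in-core mirror of on-disk state */
        } cached_flag_t;

        static void
        cached_flag_init(cached_flag_t *cf, boolean_t on_disk)
        {
                cf->cf_active = on_disk;        /* loaded once at open/import */
        }

        static boolean_t
        cached_flag_check(cached_flag_t *cf)
        {
                return (cf->cf_active);         /* no I/O on the hot path */
        }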

module/zfs/txg.c

@@ -127,6 +127,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
                int i;

                mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+               mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+                   NULL);
                for (i = 0; i < TXG_SIZE; i++) {
                        cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
                            NULL);
@@ -169,6 +171,7 @@ txg_fini(dsl_pool_t *dp)
        for (c = 0; c < max_ncpus; c++) {
                int i;

+               mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
                mutex_destroy(&tx->tx_cpu[c].tc_lock);
                for (i = 0; i < TXG_SIZE; i++) {
                        cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
@@ -303,10 +306,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
        tc = &tx->tx_cpu[CPU_SEQID];
        kpreempt_enable();

-       mutex_enter(&tc->tc_lock);
+       mutex_enter(&tc->tc_open_lock);
        txg = tx->tx_open_txg;
+
+       mutex_enter(&tc->tc_lock);
        tc->tc_count[txg & TXG_MASK]++;
+       mutex_exit(&tc->tc_lock);

        th->th_cpu = tc;
        th->th_txg = txg;
@@ -319,7 +324,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
 {
        tx_cpu_t *tc = th->th_cpu;

-       mutex_exit(&tc->tc_lock);
+       ASSERT(!MUTEX_HELD(&tc->tc_lock));
+       mutex_exit(&tc->tc_open_lock);
 }

 void
@@ -356,10 +362,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
        int c;

        /*
-        * Grab all tx_cpu locks so nobody else can get into this txg.
+        * Grab all tc_open_locks so nobody else can get into this txg.
         */
        for (c = 0; c < max_ncpus; c++)
-               mutex_enter(&tx->tx_cpu[c].tc_lock);
+               mutex_enter(&tx->tx_cpu[c].tc_open_lock);

        ASSERT(txg == tx->tx_open_txg);
        tx->tx_open_txg++;
@@ -372,7 +378,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
         * enter the next transaction group.
         */
        for (c = 0; c < max_ncpus; c++)
-               mutex_exit(&tx->tx_cpu[c].tc_lock);
+               mutex_exit(&tx->tx_cpu[c].tc_open_lock);

        /*
         * Quiesce the transaction group by waiting for everyone to txg_exit().