3582 zfs_delay() should support a variable resolution
3584 DTrace sdt probes for ZFS txg states

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan McDonald <danmcd@nexenta.com>
Reviewed by: Richard Elling <richard.elling@dey-sys.com>
Approved by: Garrett D'Amore <garrett@damore.org>

References:
    https://www.illumos.org/issues/3582
    illumos/illumos-gate@0689f76

Ported by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775
This commit is contained in:
Adam Leventhal 2013-08-28 16:05:48 -07:00 committed by Brian Behlendorf
parent c1fabe7961
commit 63fd3c6cfd
10 changed files with 87 additions and 33 deletions

View File

@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp);
extern void txg_rele_to_sync(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp);
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
/* extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
* Delay the caller by the specified number of ticks or until hrtime_t resolution);
* the txg closes (whichever comes first). This is intended
* to be used to throttle writers when the system nears its
* capacity.
*/
extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
/* /*
* Wait until the given transaction group has finished syncing. * Wait until the given transaction group has finished syncing.

View File

@ -70,7 +70,7 @@ struct tx_cpu {
kmutex_t tc_open_lock; /* protects tx_open_txg */ kmutex_t tc_open_lock; /* protects tx_open_txg */
kmutex_t tc_lock; /* protects the rest of this struct */ kmutex_t tc_lock; /* protects the rest of this struct */
kcondvar_t tc_cv[TXG_SIZE]; kcondvar_t tc_cv[TXG_SIZE];
uint64_t tc_count[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
char tc_pad[8]; /* pad to fill 3 cache lines */ char tc_pad[8]; /* pad to fill 3 cache lines */
}; };
@ -87,8 +87,8 @@ struct tx_cpu {
* every cpu (see txg_quiesce()). * every cpu (see txg_quiesce()).
*/ */
typedef struct tx_state { typedef struct tx_state {
tx_cpu_t *tx_cpu; /* protects right to enter txg */ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
kmutex_t tx_sync_lock; /* protects tx_state_t */ kmutex_t tx_sync_lock; /* protects the rest of this struct */
uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_syncing_txg; /* currently syncing txg id */

View File

@ -338,6 +338,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
extern void cv_destroy(kcondvar_t *cv); extern void cv_destroy(kcondvar_t *cv);
extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
hrtime_t res, int flag);
extern void cv_signal(kcondvar_t *cv); extern void cv_signal(kcondvar_t *cv);
extern void cv_broadcast(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv);
#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at)

View File

@ -50,6 +50,14 @@
#define NSEC_PER_USEC 1000L #define NSEC_PER_USEC 1000L
#endif #endif
#ifndef MSEC2NSEC
#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
#endif
#ifndef NSEC2MSEC
#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
#endif
extern hrtime_t gethrtime(void); extern hrtime_t gethrtime(void);
extern void gethrestime(timestruc_t *); extern void gethrestime(timestruc_t *);

View File

@ -528,6 +528,41 @@ top:
return (1); return (1);
} }
/*ARGSUSED*/
clock_t
cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
    int flag)
{
	int error;
	timestruc_t ts;
	hrtime_t delta;

	/* 'res' (resolution) is advisory; this userland shim ignores it. */
	ASSERT(flag == 0);
top:
	/* 'tim' is an absolute gethrtime() deadline; bail if already past. */
	delta = tim - gethrtime();
	if (delta <= 0)
		return (-1);

	/*
	 * pthread_cond_timedwait() expects an absolute CLOCK_REALTIME
	 * deadline, not a relative interval.  Convert the remaining delta
	 * by adding it to the current wall-clock time, normalizing tv_nsec
	 * into [0, NANOSEC).
	 */
	gethrestime(&ts);
	ts.tv_sec += delta / NANOSEC;
	ts.tv_nsec += delta % NANOSEC;
	if (ts.tv_nsec >= NANOSEC) {
		ts.tv_sec++;
		ts.tv_nsec -= NANOSEC;
	}

	ASSERT(mutex_owner(mp) == curthread);
	mp->m_owner = NULL;
	error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
	mp->m_owner = curthread;

	/*
	 * POSIX reports expiry as ETIMEDOUT; ETIME is accepted as well for
	 * platforms whose pthreads return the STREAMS-style code.
	 */
	if (error == ETIMEDOUT || error == ETIME)
		return (-1);

	/* Spurious/interrupted wakeup: recompute the remaining delta. */
	if (error == EINTR)
		goto top;

	ASSERT(error == 0);

	return (1);
}
void void
cv_signal(kcondvar_t *cv) cv_signal(kcondvar_t *cv)
{ {

View File

@ -743,7 +743,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else { } else {
if (err == EAGAIN) { if (err == EAGAIN) {
txg_delay(dd->dd_pool, tx->tx_txg, 1); txg_delay(dd->dd_pool, tx->tx_txg,
MSEC2NSEC(10), MSEC2NSEC(10));
err = SET_ERROR(ERESTART); err = SET_ERROR(ERESTART);
} }
dsl_pool_memory_pressure(dd->dd_pool); dsl_pool_memory_pressure(dd->dd_pool);

View File

@ -58,6 +58,9 @@ kmutex_t zfs_write_limit_lock;
static pgcnt_t old_physmem = 0; static pgcnt_t old_physmem = 0;
hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
int int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{ {
@ -512,12 +515,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* Weight the throughput calculation towards the current value: * Weight the throughput calculation towards the current value:
* thru = 3/4 old_thru + 1/4 new_thru * thru = 3/4 old_thru + 1/4 new_thru
* *
* Note: write_time is in nanosecs, so write_time/MICROSEC * Note: write_time is in nanosecs while dp_throughput is expressed in
* yields millisecs * bytes per millisecond.
*/ */
ASSERT(zfs_write_limit_min > 0); ASSERT(zfs_write_limit_min > 0);
if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { if (data_written > zfs_write_limit_min / 8 &&
uint64_t throughput = data_written / (write_time / MICROSEC); write_time > MSEC2NSEC(1)) {
uint64_t throughput = data_written / NSEC2MSEC(write_time);
if (dp->dp_throughput) if (dp->dp_throughput)
dp->dp_throughput = throughput / 4 + dp->dp_throughput = throughput / 4 +
@ -617,8 +621,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
* the caller 1 clock tick. This will slow down the "fill" * the caller 1 clock tick. This will slow down the "fill"
* rate until the sync process can catch up with us. * rate until the sync process can catch up with us.
*/ */
if (reserved && reserved > (write_limit - (write_limit >> 3))) if (reserved && reserved > (write_limit - (write_limit >> 3))) {
txg_delay(dp, tx->tx_txg, 1); txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
zfs_throttle_resolution);
}
return (0); return (0);
} }

View File

@ -409,7 +409,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
zfs_resilver_min_time_ms : zfs_scan_min_time_ms; zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
(elapsed_nanosecs / MICROSEC > mintime && (NSEC2MSEC(elapsed_nanosecs) > mintime &&
txg_sync_waiting(scn->scn_dp)) || txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa)) { spa_shutting_down(scn->scn_dp->dp_spa)) {
if (zb) { if (zb) {
@ -1335,7 +1335,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn)
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) || txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa)); spa_shutting_down(scn->scn_dp->dp_spa));
} }
@ -1459,7 +1459,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
"free_bpobj/bptree txg %llu", "free_bpobj/bptree txg %llu",
(longlong_t)scn->scn_visited_this_txg, (longlong_t)scn->scn_visited_this_txg,
(longlong_t) (longlong_t)
(gethrtime() - scn->scn_sync_start_time) / MICROSEC, NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
(longlong_t)tx->tx_txg); (longlong_t)tx->tx_txg);
scn->scn_visited_this_txg = 0; scn->scn_visited_this_txg = 0;
/* /*
@ -1507,7 +1507,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
zfs_dbgmsg("visited %llu blocks in %llums", zfs_dbgmsg("visited %llu blocks in %llums",
(longlong_t)scn->scn_visited_this_txg, (longlong_t)scn->scn_visited_this_txg,
(longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
if (!scn->scn_pausing) { if (!scn->scn_pausing) {
/* finished with scan. */ /* finished with scan. */

View File

@ -490,8 +490,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_proc = &p0; spa->spa_proc = &p0;
spa->spa_proc_state = SPA_PROC_NONE; spa->spa_proc_state = SPA_PROC_NONE;
spa->spa_deadman_synctime = zfs_deadman_synctime * spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
zfs_txg_synctime_ms * MICROSEC; zfs_txg_synctime_ms);
refcount_create(&spa->spa_refcount); refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa); spa_config_lock_init(spa);

View File

@ -236,7 +236,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
} }
static void static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{ {
CALLB_CPR_SAFE_BEGIN(cpr); CALLB_CPR_SAFE_BEGIN(cpr);
@ -373,6 +373,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime()); spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
spa_txg_history_add(dp->dp_spa, tx->tx_open_txg); spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
/* /*
* Now that we've incremented tx_open_txg, we can let threads * Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group. * enter the next transaction group.
@ -531,6 +534,7 @@ txg_sync_thread(dsl_pool_t *dp)
txg = tx->tx_quiesced_txg; txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0; tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg; tx->tx_syncing_txg = txg;
DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_more_cv);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@ -544,6 +548,7 @@ txg_sync_thread(dsl_pool_t *dp)
mutex_enter(&tx->tx_sync_lock); mutex_enter(&tx->tx_sync_lock);
tx->tx_synced_txg = txg; tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0; tx->tx_syncing_txg = 0;
DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_done_cv); cv_broadcast(&tx->tx_sync_done_cv);
/* /*
@ -602,21 +607,22 @@ txg_quiesce_thread(dsl_pool_t *dp)
*/ */
dprintf("quiesce done, handing off txg %llu\n", txg); dprintf("quiesce done, handing off txg %llu\n", txg);
tx->tx_quiesced_txg = txg; tx->tx_quiesced_txg = txg;
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_sync_more_cv);
cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_quiesce_done_cv);
} }
} }
/* /*
* Delay this thread by 'ticks' if we are still in the open transaction * Delay this thread by delay nanoseconds if we are still in the open
* group and there is already a waiting txg quiescing or quiesced. Abort * transaction group and there is already a waiting txg quiescing or quiesced.
* the delay if this txg stalls or enters the quiescing state. * Abort the delay if this txg stalls or enters the quiescing state.
*/ */
void void
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{ {
tx_state_t *tx = &dp->dp_tx; tx_state_t *tx = &dp->dp_tx;
clock_t timeout = ddi_get_lbolt() + ticks; hrtime_t start = gethrtime();
/* don't delay if this txg could transition to quiescing immediately */ /* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg || if (tx->tx_open_txg > txg ||
@ -629,10 +635,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
return; return;
} }
while (ddi_get_lbolt() < timeout && while (gethrtime() - start < delay &&
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
timeout); &tx->tx_sync_lock, delay, resolution, 0);
}
DMU_TX_STAT_BUMP(dmu_tx_delay); DMU_TX_STAT_BUMP(dmu_tx_delay);