diff --git a/include/sys/spa.h b/include/sys/spa.h index a4a717ac0..85a62588a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -548,12 +548,26 @@ typedef struct spa_stats_history { typedef struct spa_stats { spa_stats_history_t read_history; + spa_stats_history_t txg_history; } spa_stats_t; +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_SYNCED = 3, + TXG_STATE_COMMITTED = 4, +} txg_state_t; + extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 271a3c312..16ccbb0ad 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -32,6 +32,11 @@ int zfs_read_history = 0; */ int zfs_read_history_hits = 0; +/* + * Keeps stats on the last N txgs, disabled by default. + */ +int zfs_txg_history = 0; + /* * ========================================================================== * SPA Read History Routines @@ -227,15 +232,289 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) mutex_exit(&ssh->lock); } +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t nreserved; /* number of bytes reserved */ + hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ + list_node_t sth_link; +} spa_txg_history_t; + +static int +spa_txg_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state", + "nreserved", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "stime"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_txg_history_data(char *buf, size_t size, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_QUIESCED]; + + size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->nreserved, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_txg_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_txg_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_txg_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (rw == KSTAT_WRITE) { + spa_txg_history_t *sth; + + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t); + + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_txg_history_t), + offsetof(spa_txg_history_t, sth_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "txgs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_txg_history_update; + kstat_set_raw_ops(ksp, spa_txg_history_headers, + spa_txg_history_data, spa_txg_history_addr); + kstat_install(ksp); + } +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +/* + * Add a new txg to historical record. + */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth, *rm; + + if (zfs_txg_history == 0 && ssh->size == 0) + return; + + sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = gethrtime(); + + mutex_enter(&ssh->lock); + + list_insert_head(&ssh->list, sth); + ssh->size++; + + while (ssh->size > zfs_txg_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_txg_history_t)); + } + + mutex_exit(&ssh->lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + +/* + * Set txg IO stats. + */ +int +spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->nread = nread; + sth->nwritten = nwritten; + sth->reads = reads; + sth->writes = writes; + sth->nreserved = nreserved; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + void spa_stats_init(spa_t *spa) { spa_read_history_init(spa); + spa_txg_history_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_txg_history_destroy(spa); spa_read_history_destroy(spa); } @@ -245,4 +524,7 @@ MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); module_param(zfs_read_history_hits, int, 0644); MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); #endif diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 440353a50..7a3da8647 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -363,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime()); + spa_txg_history_add(dp->dp_spa, tx->tx_open_txg); + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. @@ -380,6 +384,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } + + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); } static void @@ -451,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp) spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + vdev_stat_t *vs1, *vs2; uint64_t start, delta; #ifdef _KERNEL @@ -464,6 +471,9 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_enter(tx, &cpr); + vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + start = delta = 0; for (;;) { uint64_t timer, timeout; @@ -499,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } - if (tx->tx_exiting) + if (tx->tx_exiting) { + kmem_free(vs2, sizeof(vdev_stat_t)); + kmem_free(vs1, sizeof(vdev_stat_t)); txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + } + + vdev_get_stats(spa->spa_root_vdev, vs1); /* * Consume the quiesced txg which has been handed off to @@ -529,6 +544,16 @@ txg_sync_thread(dsl_pool_t *dp) * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); + + vdev_get_stats(spa->spa_root_vdev, vs2); + spa_txg_history_set_io(spa, txg, + vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], + vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], + vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], + vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], + dp->dp_space_towrite[txg & TXG_MASK] + + dp->dp_tempreserved[txg & TXG_MASK] / 2); + spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); } }