Merge remote branch 'eris/stats'

Bring in support for the new KSTAT_TYPE_TXG type. This allows for additional visibility in to the txg handling. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2026-05-27 04:32:16 +03:00 · 2012-11-06 14:44:07 -08:00
parent e8fd45a0f9 9dcb971983
commit bbf8c74805
7 changed files with 259 additions and 11 deletions
@@ -65,6 +65,13 @@ typedef struct zfs_all_blkstats {
 	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
 } zfs_all_blkstats_t;

+typedef struct txg_history {
+	kstat_txg_t	th_kstat;
+	vdev_stat_t	th_vs1;
+	vdev_stat_t	th_vs2;
+	kmutex_t	th_lock;
+	list_node_t	th_link;
+} txg_history_t;

 typedef struct dsl_pool {
 	/* Immutable */
@@ -76,6 +83,7 @@ typedef struct dsl_pool {
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_iput_taskq;
+	kstat_t *dp_txg_kstat;

 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
@@ -92,6 +100,9 @@ typedef struct dsl_pool {
 	kmutex_t dp_lock;
 	uint64_t dp_space_towrite[TXG_SIZE];
 	uint64_t dp_tempreserved[TXG_SIZE];
+	uint64_t dp_txg_history_size;
+	list_t dp_txg_history;
+

 	/* Has its own locking */
 	tx_state_t dp_tx;
@@ -144,6 +155,10 @@ extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
 extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
 int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);

+txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg);
+txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_txg_history_put(txg_history_t *th);
+
 #ifdef	__cplusplus
 }
 #endif
@@ -83,6 +83,8 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET	"zio_offset"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE		"zio_size"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS	"zio_flags"
+#define	FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE	"zio_stage"
+#define	FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE	"zio_pipeline"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY	"zio_delay"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
@@ -228,8 +228,10 @@ typedef struct kstat32 {
 					/* ks_ndata == 1 */
 #define	KSTAT_TYPE_TIMER	4	/* event timer */
 					/* ks_ndata >= 1 */
+#define	KSTAT_TYPE_TXG		5	/* txg statistics */
+					/* ks_ndata >= 0 */

-#define	KSTAT_NUM_TYPES		5
+#define	KSTAT_NUM_TYPES		6

 /*
 * kstat class
@@ -698,6 +700,29 @@ typedef struct kstat_timer {

 #define	KSTAT_TIMER_PTR(kptr)	((kstat_timer_t *)(kptr)->ks_data)

+/*
+ * TXG statistics - bytes read/written and iops performed
+ */
+typedef enum kstat_txg_state {
+	TXG_STATE_OPEN      = 1,
+	TXG_STATE_QUIESCING = 2,
+	TXG_STATE_SYNCING   = 3,
+	TXG_STATE_COMMITTED = 4,
+} kstat_txg_state_t;
+
+typedef struct kstat_txg {
+	u_longlong_t		txg;		/* txg id */
+	kstat_txg_state_t	state;		/* txg state */
+	hrtime_t		birth;		/* birth time stamp */
+	u_longlong_t		nread;		/* number of bytes read */
+	u_longlong_t		nwritten;	/* number of bytes written */
+	uint_t			reads;		/* number of read operations */
+	uint_t			writes;		/* number of write operations */
+	hrtime_t		open_time;	/* open time */
+	hrtime_t		quiesce_time;	/* quiesce time */
+	hrtime_t		sync_time;	/* sync time */
+} kstat_txg_t;
+
 #if	defined(_KERNEL)

 #include <sys/t_lock.h>
@@ -44,6 +44,7 @@
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
 int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */
+int zfs_txg_history = 60;		/* statistics for the last N txgs */

 unsigned long zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
 unsigned long zfs_write_limit_max = 0;		/* max data payload per txg */
@@ -54,6 +55,143 @@ kmutex_t zfs_write_limit_lock;

 static pgcnt_t old_physmem = 0;

+static int
+dsl_pool_txg_history_update(kstat_t *ksp, int rw)
+{
+	dsl_pool_t *dp = ksp->ks_private;
+	txg_history_t *th;
+	int i = 0;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	if (ksp->ks_data)
+		kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+	mutex_enter(&dp->dp_lock);
+
+	ksp->ks_ndata = dp->dp_txg_history_size;
+	ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t);
+	if (ksp->ks_data_size > 0)
+		ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE);
+
+	/* Traversed oldest to youngest for the most readable kstat output */
+	for (th = list_tail(&dp->dp_txg_history); th != NULL;
+	     th = list_prev(&dp->dp_txg_history, th)) {
+		mutex_enter(&th->th_lock);
+		ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size);
+		memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t));
+		i += sizeof(kstat_txg_t);
+		mutex_exit(&th->th_lock);
+	}
+
+	mutex_exit(&dp->dp_lock);
+
+	return (0);
+}
+
+static void
+dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg)
+{
+	char name[KSTAT_STRLEN];
+
+	list_create(&dp->dp_txg_history, sizeof (txg_history_t),
+	    offsetof(txg_history_t, th_link));
+	dsl_pool_txg_history_add(dp, txg);
+
+	(void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa));
+	dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc",
+	    KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL);
+	if (dp->dp_txg_kstat) {
+		dp->dp_txg_kstat->ks_data = NULL;
+		dp->dp_txg_kstat->ks_private = dp;
+		dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update;
+		kstat_install(dp->dp_txg_kstat);
+	}
+}
+
+static void
+dsl_pool_txg_history_destroy(dsl_pool_t *dp)
+{
+	txg_history_t *th;
+
+	if (dp->dp_txg_kstat) {
+		if (dp->dp_txg_kstat->ks_data)
+			kmem_free(dp->dp_txg_kstat->ks_data,
+			    dp->dp_txg_kstat->ks_data_size);
+
+		kstat_delete(dp->dp_txg_kstat);
+	}
+
+	mutex_enter(&dp->dp_lock);
+	while ((th = list_remove_head(&dp->dp_txg_history))) {
+		dp->dp_txg_history_size--;
+		mutex_destroy(&th->th_lock);
+		kmem_free(th, sizeof(txg_history_t));
+	}
+
+	ASSERT3U(dp->dp_txg_history_size, ==, 0);
+	list_destroy(&dp->dp_txg_history);
+	mutex_exit(&dp->dp_lock);
+}
+
+txg_history_t *
+dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg)
+{
+	txg_history_t *th, *rm;
+
+	th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP);
+	mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL);
+	th->th_kstat.txg = txg;
+	th->th_kstat.state = TXG_STATE_OPEN;
+	th->th_kstat.birth = gethrtime();
+
+	mutex_enter(&dp->dp_lock);
+
+	list_insert_head(&dp->dp_txg_history, th);
+	dp->dp_txg_history_size++;
+
+	while (dp->dp_txg_history_size > zfs_txg_history) {
+		dp->dp_txg_history_size--;
+		rm = list_remove_tail(&dp->dp_txg_history);
+		mutex_destroy(&rm->th_lock);
+		kmem_free(rm, sizeof(txg_history_t));
+	}
+
+	mutex_exit(&dp->dp_lock);
+
+	return (th);
+}
+
+/*
+ * Traversed youngest to oldest because lookups are only done for open
+ * or syncing txgs which are guaranteed to be at the head of the list.
+ * The txg_history_t structure will be returned locked.
+ */
+txg_history_t *
+dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
+{
+	txg_history_t *th;
+
+	mutex_enter(&dp->dp_lock);
+	for (th = list_head(&dp->dp_txg_history); th != NULL;
+	     th = list_next(&dp->dp_txg_history, th)) {
+		if (th->th_kstat.txg == txg) {
+			mutex_enter(&th->th_lock);
+			break;
+		}
+	}
+	mutex_exit(&dp->dp_lock);
+
+	return (th);
+}
+
+void
+dsl_pool_txg_history_put(txg_history_t *th)
+{
+	mutex_exit(&th->th_lock);
+}
+
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
@@ -96,6 +234,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
 	    1, 4, 0);

+	dsl_pool_txg_history_init(dp, txg);
+
 	return (dp);
 }

@@ -213,6 +353,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
 	dsl_scan_fini(dp);
+	dsl_pool_txg_history_destroy(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_iput_taskq);
@@ -864,6 +1005,9 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");
 module_param(zfs_txg_synctime_ms, int, 0644);
 MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");

+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
+
 module_param(zfs_write_limit_min, ulong, 0444);
 MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");

@@ -29,6 +29,7 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/callb.h>
+#include <sys/spa_impl.h>

 /*
 * Pool-wide transaction groups.
@@ -279,6 +280,8 @@ txg_rele_to_sync(txg_handle_t *th)
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
+	hrtime_t start;
+	txg_history_t *th;
 	tx_state_t *tx = &dp->dp_tx;
 	int g = txg & TXG_MASK;
 	int c;
@@ -292,6 +295,15 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;

+	/*
+	 * Measure how long the txg was open and replace the kstat.
+	 */
+	th = dsl_pool_txg_history_get(dp, txg);
+	th->th_kstat.open_time = gethrtime() - th->th_kstat.birth;
+	th->th_kstat.state = TXG_STATE_QUIESCING;
+	dsl_pool_txg_history_put(th);
+	dsl_pool_txg_history_add(dp, tx->tx_open_txg);
+
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
@@ -302,6 +314,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
 	 */
+	start = gethrtime();
+
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
 		mutex_enter(&tc->tc_lock);
@@ -309,6 +323,13 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
 		mutex_exit(&tc->tc_lock);
 	}
+
+	/*
+	 * Measure how long the txg took to quiesce.
+	 */
+	th = dsl_pool_txg_history_get(dp, txg);
+	th->th_kstat.quiesce_time = gethrtime() - start;
+	dsl_pool_txg_history_put(th);
 }

 static void
@@ -395,6 +416,8 @@ txg_sync_thread(dsl_pool_t *dp)

 	start = delta = 0;
 	for (;;) {
+		hrtime_t hrstart;
+		txg_history_t *th;
 		uint64_t timer, timeout;
 		uint64_t txg;

@@ -441,11 +464,17 @@ txg_sync_thread(dsl_pool_t *dp)
 		tx->tx_syncing_txg = txg;
 		cv_broadcast(&tx->tx_quiesce_more_cv);

+		th = dsl_pool_txg_history_get(dp, txg);
+		th->th_kstat.state = TXG_STATE_SYNCING;
+		vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
+		dsl_pool_txg_history_put(th);
+
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 		mutex_exit(&tx->tx_sync_lock);

 		start = ddi_get_lbolt();
+		hrstart = gethrtime();
 		spa_sync(spa, txg);
 		delta = ddi_get_lbolt() - start;

@@ -458,6 +487,23 @@ txg_sync_thread(dsl_pool_t *dp)
 		 * Dispatch commit callbacks to worker threads.
 		 */
 		txg_dispatch_callbacks(dp, txg);
+
+		/*
+		 * Measure the txg sync time determine the amount of I/O done.
+		 */
+		th = dsl_pool_txg_history_get(dp, txg);
+		vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
+		th->th_kstat.sync_time = gethrtime() - hrstart;
+		th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
+		    th->th_vs1.vs_bytes[ZIO_TYPE_READ];
+		th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
+		    th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
+		th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
+		    th->th_vs1.vs_ops[ZIO_TYPE_READ];
+		th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
+		    th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
+		th->th_kstat.state = TXG_STATE_COMMITTED;
+		dsl_pool_txg_history_put(th);
 	}
 }

@@ -294,6 +294,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
 		    DATA_TYPE_INT32, zio->io_flags, NULL);
+		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
+		    DATA_TYPE_UINT32, zio->io_stage, NULL);
+		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
+		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
 		    DATA_TYPE_UINT64, zio->io_delay, NULL);

@@ -1305,18 +1305,34 @@ __zio_execute(zio_t *zio)
 int
 zio_wait(zio_t *zio)
 {
+	uint64_t timeout;
 	int error;

 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_executor == NULL);

 	zio->io_waiter = curthread;
+	timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz);

 	__zio_execute(zio);

 	mutex_enter(&zio->io_lock);
-	while (zio->io_executor != NULL)
-		cv_wait(&zio->io_cv, &zio->io_lock);
+	while (zio->io_executor != NULL) {
+		/*
+		 * Wake up periodically to prevent the kernel from complaining
+		 * about a blocked task.  However, check zio_delay_max to see
+		 * if the I/O has exceeded the timeout and post an ereport.
+		 */
+		cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock,
+		    ddi_get_lbolt() + hz);
+
+		if (timeout && (ddi_get_lbolt() > timeout)) {
+			zio->io_delay = zio_delay_max;
+			zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+			    zio->io_spa, zio->io_vd, zio, 0, 0);
+			timeout = 0;
+		}
+	}
 	mutex_exit(&zio->io_lock);

 	error = zio->io_error;
@@ -2889,15 +2905,11 @@ zio_done(zio_t *zio)
 	vdev_stat_update(zio, zio->io_size);

 	/*
-	 * If this I/O is attached to a particular vdev is slow, exeeding
-	 * 30 seconds to complete, post an error described the I/O delay.
-	 * We ignore these errors if the device is currently unavailable.
+	 * When an I/O completes but was slow post an ereport.
 	 */
-	if (zio->io_delay >= zio_delay_max) {
-		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
-			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
-                                         zio->io_vd, zio, 0, 0);
-	}
+	if (zio->io_delay >= zio_delay_max)
+		zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+		    zio->io_vd, zio, 0, 0);

 	if (zio->io_error) {
 		/*