Add TRIM support

UNMAP/TRIM support is a frequently-requested feature to help prevent performance from degrading on SSDs and on various other SAN-like storage back-ends. By issuing UNMAP/TRIM commands for sectors which are no longer allocated the underlying device can often more efficiently manage itself. This TRIM implementation is modeled on the `zpool initialize` feature which writes a pattern to all unallocated space in the pool. The new `zpool trim` command uses the same vdev_xlate() code to calculate what sectors are unallocated, the same per-vdev TRIM thread model and locking, and the same basic CLI for a consistent user experience. The core difference is that instead of writing a pattern it will issue UNMAP/TRIM commands for those extents. The zio pipeline was updated to accommodate this by adding a new ZIO_TYPE_TRIM type and associated spa taskq. This new type makes it straightforward to add the platform-specific TRIM/UNMAP calls to vdev_disk.c and vdev_file.c. These new ZIO_TYPE_TRIM zios are handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs. This makes it possible to largely avoid changing the pipeline; one exception is that TRIM zios may exceed the 16M block size limit since they contain no data. In addition to the manual `zpool trim` command, a background automatic TRIM was added and is controlled by the 'autotrim' property. It relies on the exact same infrastructure as the manual TRIM. However, instead of relying on the extents in a metaslab's ms_allocatable range tree, a ms_trim tree is kept per metaslab. When 'autotrim=on', ranges added back to the ms_allocatable tree are also added to the ms_trim tree. The ms_trim tree is then periodically consumed by an autotrim thread which systematically walks a top level vdev's metaslabs. Since the automatic TRIM will skip ranges it considers too small there is value in occasionally running a full `zpool trim`. This may occur when the freed blocks are small and not enough time was allowed to aggregate them. 
An automatic TRIM and a manual `zpool trim` may be run concurrently, in which case the automatic TRIM will yield to the manual TRIM. Reviewed-by: Jorgen Lundman <lundman@lundman.net> Reviewed-by: Tim Chase <tim@chase2k.com> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Contributions-by: Saso Kiselkov <saso.kiselkov@nexenta.com> Contributions-by: Tim Chase <tim@chase2k.com> Contributions-by: Chunwei Chen <tuxoko@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #8419 Closes #598
2026-05-22 02:27:36 +03:00 · 2019-03-29 09:13:20 -07:00
parent f94b3cbf43
commit 1b939560be
91 changed files with 5593 additions and 439 deletions
@@ -99,6 +99,7 @@ $(MODULE)-objs += vdev_raidz_math.o
 $(MODULE)-objs += vdev_raidz_math_scalar.o
 $(MODULE)-objs += vdev_removal.o
 $(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += vdev_trim.o
 $(MODULE)-objs += zap.o
 $(MODULE)-objs += zap_leaf.o
 $(MODULE)-objs += zap_micro.o
@@ -842,7 +842,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
-			txg_wait_open(dp, 0);
+			txg_wait_open(dp, 0, B_TRUE);
 			continue;
 		}

@@ -181,7 +181,6 @@ int metaslab_lba_weighting_enabled = B_TRUE;
 */
 int metaslab_bias_enabled = B_TRUE;

-
 /*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
@@ -219,6 +218,12 @@ boolean_t metaslab_trace_enabled = B_TRUE;
 uint64_t metaslab_trace_max_entries = 5000;
 #endif

+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -652,8 +657,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)

 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 	    KM_SLEEP);
 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -700,8 +705,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
 	    sizeof (metaslab_t *));
 	mutex_destroy(&mg->mg_lock);
-	mutex_destroy(&mg->mg_ms_initialize_lock);
-	cv_destroy(&mg->mg_ms_initialize_cv);
+	mutex_destroy(&mg->mg_ms_disabled_lock);
+	cv_destroy(&mg->mg_ms_disabled_cv);

 	for (int i = 0; i < mg->mg_allocators; i++) {
 		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1846,8 +1851,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
 	 */
 	ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
 	    &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
-	metaslab_group_add(mg, ms);

+	ms->ms_trim = range_tree_create(NULL, NULL);
+
+	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms);

 	/*
@@ -1921,6 +1928,9 @@ metaslab_fini(metaslab_t *msp)
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));

+	range_tree_vacate(msp->ms_trim, NULL, NULL);
+	range_tree_destroy(msp->ms_trim);
+
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
@@ -2727,6 +2737,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	ASSERT3P(msp->ms_freeing, !=, NULL);
 	ASSERT3P(msp->ms_freed, !=, NULL);
 	ASSERT3P(msp->ms_checkpointing, !=, NULL);
+	ASSERT3P(msp->ms_trim, !=, NULL);

 	/*
 	 * Normally, we don't want to process a metaslab if there are no
@@ -2999,6 +3010,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 */
 	metaslab_load_wait(msp);

+	/*
+	 * When auto-trimming is enabled, free ranges which are added to
+	 * ms_allocatable are also added to ms_trim.  The ms_trim tree is
+	 * periodically consumed by the vdev_autotrim_thread() which issues
+	 * trims for all ranges and then vacates the tree.  The ms_trim tree
+	 * can be discarded at any time with the sole consequence of recent
+	 * frees not being trimmed.
+	 */
+	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
+		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
+		if (!defer_allowed) {
+			range_tree_walk(msp->ms_freed, range_tree_add,
+			    msp->ms_trim);
+		}
+	} else {
+		range_tree_vacate(msp->ms_trim, NULL, NULL);
+	}
+
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
@@ -3047,7 +3076,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
 	 */
 	if (msp->ms_loaded &&
-	    msp->ms_initializing == 0 &&
+	    msp->ms_disabled == 0 &&
 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {

 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
@@ -3330,7 +3359,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 	metaslab_class_t *mc = msp->ms_group->mg_class;

 	VERIFY(!msp->ms_condensing);
-	VERIFY0(msp->ms_initializing);
+	VERIFY0(msp->ms_disabled);

 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
@@ -3341,6 +3370,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 		range_tree_remove(rt, start, size);
+		range_tree_clear(msp->ms_trim, start, size);

 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
@@ -3391,10 +3421,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
 		}

 		/*
-		 * If the selected metaslab is condensing or being
-		 * initialized, skip it.
+		 * If the selected metaslab is condensing or disabled,
+		 * skip it.
 		 */
-		if (msp->ms_condensing || msp->ms_initializing > 0)
+		if (msp->ms_condensing || msp->ms_disabled > 0)
 			continue;

 		*was_active = msp->ms_allocator != -1;
@@ -3566,9 +3596,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 			continue;
-		} else if (msp->ms_initializing > 0) {
+		} else if (msp->ms_disabled > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
-			    TRACE_INITIALIZING, allocator);
+			    TRACE_DISABLED, allocator);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
@@ -4294,6 +4324,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	range_tree_remove(msp->ms_allocatable, offset, size);
+	range_tree_clear(msp->ms_trim, offset, size);

 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
@@ -4606,6 +4637,7 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 		    offset, size);
 	}

+	range_tree_verify_not_present(msp->ms_trim, offset, size);
 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 	range_tree_verify_not_present(msp->ms_freed, offset, size);
@@ -4637,6 +4669,89 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }

+/*
+ * Wait on mg_ms_disabled_cv until no thread has the group's
+ * mg_disabled_updating flag set.  The caller must hold
+ * mg_ms_disabled_lock, which is the lock handed to cv_wait().
+ */
+static void
+metaslab_group_disable_wait(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+
+	for (;;) {
+		if (!mg->mg_disabled_updating)
+			break;
+		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+	}
+}
+
+/*
+ * Count one more disabled metaslab against the group.  If the group is
+ * already at the max_disabled_ms limit, wait on mg_ms_disabled_cv until
+ * the count drops below it.  The caller must hold mg_ms_disabled_lock
+ * and must already have set mg_disabled_updating.
+ */
+static void
+metaslab_group_disabled_increment(metaslab_group_t *mg)
+{
+	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+	ASSERT(mg->mg_disabled_updating);
+
+	for (;;) {
+		if (mg->mg_ms_disabled < max_disabled_ms)
+			break;
+		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+	}
+
+	mg->mg_ms_disabled += 1;
+	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
+}
+
+/*
+ * Mark the metaslab as disabled to prevent any allocations on this metaslab.
+ * We must also track how many metaslabs are currently disabled within a
+ * metaslab group and limit them to prevent allocation failures from
+ * occurring because all metaslabs are disabled.
+ *
+ * The caller must not hold msp->ms_lock; this function takes the group's
+ * mg_ms_disabled_lock first and then msp->ms_lock.  It may block, both
+ * while another thread is updating the group and while the group already
+ * has max_disabled_ms metaslabs disabled.  ms_disabled is a count, so each
+ * call is expected to be paired with a later call to metaslab_enable().
+ */
+void
+metaslab_disable(metaslab_t *msp)
+{
+	ASSERT(!MUTEX_HELD(&msp->ms_lock));
+	metaslab_group_t *mg = msp->ms_group;
+
+	mutex_enter(&mg->mg_ms_disabled_lock);
+
+	/*
+	 * To keep an accurate count of how many threads have disabled
+	 * a specific metaslab group, we only allow one thread to mark
+	 * the metaslab group at a time. This ensures that the value of
+	 * ms_disabled will be accurate when we decide to mark a metaslab
+	 * group as disabled. To do this we force all other threads
+	 * to wait till the metaslab group's mg_disabled_updating flag is
+	 * no longer set.
+	 *
+	 * Only the first disable of a metaslab (ms_disabled still zero)
+	 * is counted against the group's limit; later disables of the
+	 * same metaslab just bump its own ms_disabled count.
+	 */
+	metaslab_group_disable_wait(mg);
+	mg->mg_disabled_updating = B_TRUE;
+	if (msp->ms_disabled == 0) {
+		metaslab_group_disabled_increment(mg);
+	}
+	mutex_enter(&msp->ms_lock);
+	msp->ms_disabled++;
+	mutex_exit(&msp->ms_lock);
+
+	mg->mg_disabled_updating = B_FALSE;
+	cv_broadcast(&mg->mg_ms_disabled_cv);
+	mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+/*
+ * Drop one disable reference on the metaslab.  When the last reference
+ * goes away (ms_disabled reaches zero) the group's disabled count is
+ * decremented and every waiter on mg_ms_disabled_cv is woken.
+ *
+ * When 'sync' is set, txg_wait_synced() is called before any lock is taken.
+ */
+void
+metaslab_enable(metaslab_t *msp, boolean_t sync)
+{
+	metaslab_group_t *group = msp->ms_group;
+	spa_t *spa = group->mg_vd->vdev_spa;
+
+	/*
+	 * Wait for the outstanding IO to be synced to prevent newly
+	 * allocated blocks from being overwritten.  This is used by
+	 * initialize and TRIM, which are modifying unallocated space.
+	 */
+	if (sync) {
+		txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	mutex_enter(&group->mg_ms_disabled_lock);
+	mutex_enter(&msp->ms_lock);
+
+	msp->ms_disabled--;
+	if (msp->ms_disabled == 0) {
+		group->mg_ms_disabled--;
+		cv_broadcast(&group->mg_ms_disabled_cv);
+	}
+
+	mutex_exit(&msp->ms_lock);
+	mutex_exit(&group->mg_ms_disabled_lock);
+}
+
 #if defined(_KERNEL)
 /* BEGIN CSTYLED */
 module_param(metaslab_aliquot, ulong, 0644);
@@ -57,6 +57,7 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -132,7 +133,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
- * are so high frequency and short-lived that the taskq itself can become a a
+ * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
@@ -150,6 +151,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
+	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };

 static void spa_sync_version(void *arg, dmu_tx_t *tx);
@@ -554,6 +556,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
+		case ZPOOL_PROP_AUTOTRIM:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
@@ -1442,8 +1445,10 @@ spa_unload(spa_t *spa)
 	spa_async_suspend(spa);

 	if (spa->spa_root_vdev) {
-		vdev_initialize_stop_all(spa->spa_root_vdev,
-		    VDEV_INITIALIZE_ACTIVE);
+		vdev_t *root_vdev = spa->spa_root_vdev;
+		vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
+		vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+		vdev_autotrim_stop_all(spa);
 	}

 	/*
@@ -3585,7 +3590,7 @@ spa_ld_get_props(spa_t *spa)
 		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
 		    &spa->spa_dedup_ditto);
-
+		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
 		spa->spa_autoreplace = (autoreplace != 0);
 	}

@@ -4336,6 +4341,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)

 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_initialize_restart(spa->spa_root_vdev);
+		vdev_trim_restart(spa->spa_root_vdev);
+		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}

@@ -5338,6 +5345,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);

 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
@@ -5746,14 +5754,16 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,

 		/*
 		 * We're about to export or destroy this pool. Make sure
-		 * we stop all initializtion activity here before we
-		 * set the spa_final_txg. This will ensure that all
+		 * we stop all initialization and trim activity here before
+		 * we set the spa_final_txg. This will ensure that all
 		 * dirty data resulting from the initialization is
 		 * committed to disk before we unload the pool.
 		 */
 		if (spa->spa_root_vdev != NULL) {
-			vdev_initialize_stop_all(spa->spa_root_vdev,
-			    VDEV_INITIALIZE_ACTIVE);
+			vdev_t *rvd = spa->spa_root_vdev;
+			vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+			vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+			vdev_autotrim_stop_all(spa);
 		}

 		/*
@@ -6376,7 +6386,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 		vdev_remove_parent(cvd);
 	}

-
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
@@ -6490,7 +6499,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 	 * a previous initialization process which has completed but
 	 * the thread is not exited.
 	 */
-	if (cmd_type == POOL_INITIALIZE_DO &&
+	if (cmd_type == POOL_INITIALIZE_START &&
 	    (vd->vdev_initialize_thread != NULL ||
 	    vd->vdev_top->vdev_removing)) {
 		mutex_exit(&vd->vdev_initialize_lock);
@@ -6507,7 +6516,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
 	}

 	switch (cmd_type) {
-	case POOL_INITIALIZE_DO:
+	case POOL_INITIALIZE_START:
 		vdev_initialize(vd);
 		break;
 	case POOL_INITIALIZE_CANCEL:
@@ -6571,6 +6580,126 @@ spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
 	return (total_errors);
 }

+/*
+ * Apply one TRIM command (start, cancel or suspend) to the vdev identified
+ * by 'guid'.  The caller must hold spa_namespace_lock.
+ *
+ * Returns 0 on success, otherwise:
+ *	ENODEV		no vdev with that guid, or the vdev is detached
+ *	EINVAL		the vdev is not a concrete leaf
+ *	EROFS		the vdev is not writeable
+ *	EOPNOTSUPP	the vdev lacks TRIM, or lacks secure TRIM when
+ *			'secure' was requested
+ *	EBUSY		start requested while a TRIM thread exists or the
+ *			top-level vdev is being removed
+ *	ESRCH		cancel or suspend requested in a TRIM state that
+ *			does not permit it
+ *
+ * 'vd_list' is passed through to vdev_trim_stop() for cancel and suspend.
+ * An unrecognized 'cmd_type' panics.
+ */
+static int
+spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+    uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
+{
+	const int locks = SCL_CONFIG | SCL_STATE;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_config_enter(spa, locks, FTAG, RW_READER);
+
+	/* Look up vdev and ensure it's a leaf. */
+	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_detached) {
+		spa_config_exit(spa, locks, FTAG);
+		return (SET_ERROR(ENODEV));
+	}
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		spa_config_exit(spa, locks, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	if (!vdev_writeable(vd)) {
+		spa_config_exit(spa, locks, FTAG);
+		return (SET_ERROR(EROFS));
+	}
+	if (!vd->vdev_has_trim) {
+		spa_config_exit(spa, locks, FTAG);
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+	if (secure && !vd->vdev_has_securetrim) {
+		spa_config_exit(spa, locks, FTAG);
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+
+	/* Take the per-vdev TRIM lock before letting go of the config. */
+	mutex_enter(&vd->vdev_trim_lock);
+	spa_config_exit(spa, locks, FTAG);
+
+	switch (cmd_type) {
+	case POOL_TRIM_START:
+		/*
+		 * When we activate a TRIM action we check to see if the
+		 * vdev_trim_thread is NULL. We do this instead of using
+		 * the vdev_trim_state since there might be a previous TRIM
+		 * process which has completed but the thread is not exited.
+		 */
+		if (vd->vdev_trim_thread != NULL ||
+		    vd->vdev_top->vdev_removing) {
+			mutex_exit(&vd->vdev_trim_lock);
+			return (SET_ERROR(EBUSY));
+		}
+		vdev_trim(vd, rate, partial, secure);
+		break;
+	case POOL_TRIM_CANCEL:
+		/* Only an active or suspended TRIM can be canceled. */
+		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+		    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
+			mutex_exit(&vd->vdev_trim_lock);
+			return (SET_ERROR(ESRCH));
+		}
+		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+		break;
+	case POOL_TRIM_SUSPEND:
+		/* Only an active TRIM can be suspended. */
+		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
+			mutex_exit(&vd->vdev_trim_lock);
+			return (SET_ERROR(ESRCH));
+		}
+		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+		break;
+	default:
+		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+	}
+
+	mutex_exit(&vd->vdev_trim_lock);
+
+	return (0);
+}
+
+/*
+ * Initiates a manual TRIM for the requested vdevs. This kicks off individual
+ * TRIM threads for each child vdev.  These threads pass over all of the free
+ * space in the vdev's metaslabs and issue TRIM commands for that space.
+ *
+ * Each pair in 'nv' carries a vdev guid as a uint64 value.  'cmd_type' is
+ * applied to every one of them through spa_vdev_trim_impl().  For each vdev
+ * that fails, the error is recorded in 'vdev_errlist' under the guid printed
+ * as a decimal string.  Returns the number of vdevs that failed.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+    boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+	list_t stopped;
+	int failures = 0;
+	nvpair_t *elem = NULL;
+
+	list_create(&stopped, sizeof (vdev_t),
+	    offsetof(vdev_t, vdev_trim_node));
+
+	/*
+	 * We hold the namespace lock through the whole function
+	 * to prevent any changes to the pool while we're starting or
+	 * stopping TRIM. The config and state locks are held so that
+	 * we can properly assess the vdev state before we commit to
+	 * the TRIM operation.
+	 */
+	mutex_enter(&spa_namespace_lock);
+
+	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
+		char key[MAXNAMELEN];
+		uint64_t guid = fnvpair_value_uint64(elem);
+		int err = spa_vdev_trim_impl(spa, guid, cmd_type,
+		    rate, partial, secure, &stopped);
+
+		if (err == 0)
+			continue;
+
+		/* Report the failure keyed by the guid in decimal. */
+		(void) snprintf(key, sizeof (key),
+		    "%llu", (unsigned long long)guid);
+		fnvlist_add_int64(vdev_errlist, key, err);
+		failures++;
+	}
+
+	/* Wait for all TRIM threads to stop. */
+	vdev_trim_stop_wait(spa, &stopped);
+
+	/* Sync out the TRIM state */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	mutex_exit(&spa_namespace_lock);
+
+	list_destroy(&stopped);
+
+	return (failures);
+}
+
 /*
 * Split a set of devices from their mirrors, and create a new pool from them.
 */
@@ -6780,24 +6909,36 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
 	spa_async_suspend(newspa);

 	/*
-	 * Temporarily stop the initializing activity. We set the state to
-	 * ACTIVE so that we know to resume the initializing once the split
-	 * has completed.
+	 * Temporarily stop the initializing and TRIM activity.  We set the
+	 * state to ACTIVE so that we know to resume initializing or TRIM
+	 * once the split has completed.
 	 */
-	list_t vd_list;
-	list_create(&vd_list, sizeof (vdev_t),
+	list_t vd_initialize_list;
+	list_create(&vd_initialize_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));

+	list_t vd_trim_list;
+	list_create(&vd_trim_list, sizeof (vdev_t),
+	    offsetof(vdev_t, vdev_trim_node));
+
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL) {
 			mutex_enter(&vml[c]->vdev_initialize_lock);
-			vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE,
-			    &vd_list);
+			vdev_initialize_stop(vml[c],
+			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
 			mutex_exit(&vml[c]->vdev_initialize_lock);
+
+			mutex_enter(&vml[c]->vdev_trim_lock);
+			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
+			mutex_exit(&vml[c]->vdev_trim_lock);
 		}
 	}
-	vdev_initialize_stop_wait(spa, &vd_list);
-	list_destroy(&vd_list);
+
+	vdev_initialize_stop_wait(spa, &vd_initialize_list);
+	vdev_trim_stop_wait(spa, &vd_trim_list);
+
+	list_destroy(&vd_initialize_list);
+	list_destroy(&vd_trim_list);

 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;

@@ -6899,8 +7040,10 @@ out:
 			vml[c]->vdev_offline = B_FALSE;
 	}

-	/* restart initializing disks as necessary */
+	/* restart initializing or trimming disks as necessary */
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

 	vdev_reopen(spa->spa_root_vdev);

@@ -7283,6 +7426,22 @@ spa_async_thread(void *arg)
 		mutex_exit(&spa_namespace_lock);
 	}

+	if (tasks & SPA_ASYNC_TRIM_RESTART) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_trim_restart(spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		mutex_exit(&spa_namespace_lock);
+	}
+
+	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_autotrim_restart(spa);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		mutex_exit(&spa_namespace_lock);
+	}
+
 	/*
 	 * Let the world know that we're done.
 	 */
@@ -7782,6 +7941,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
+			case ZPOOL_PROP_AUTOTRIM:
+				spa->spa_autotrim = intval;
+				spa_async_request(spa,
+				    SPA_ASYNC_AUTOTRIM_RESTART);
+				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
 				if (tx->tx_txg != TXG_INITIAL)
@@ -39,6 +39,7 @@
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/metaslab.h>
@@ -1128,6 +1129,9 @@ spa_vdev_enter(spa_t *spa)
 {
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
+
+	vdev_autotrim_stop_all(spa);
+
 	return (spa_vdev_config_enter(spa));
 }

@@ -1204,8 +1208,17 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
 			    NULL);
 			mutex_exit(&vd->vdev_initialize_lock);
+
+			mutex_enter(&vd->vdev_trim_lock);
+			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+			mutex_exit(&vd->vdev_trim_lock);
 		}

+		/*
+		 * The vdev may be both a leaf and top-level device.
+		 */
+		vdev_autotrim_stop_wait(vd);
+
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);
@@ -1227,6 +1240,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
+	vdev_autotrim_restart(spa);
+
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
@@ -1923,6 +1938,12 @@ spa_deadman_synctime(spa_t *spa)
 	return (spa->spa_deadman_synctime);
 }

+/*
+ * Return the pool's current autotrim setting, as stored in spa_autotrim.
+ */
+spa_autotrim_t
+spa_get_autotrim(spa_t *spa)
+{
+	spa_autotrim_t setting = spa->spa_autotrim;
+
+	return (setting);
+}
+
 uint64_t
 spa_deadman_ziotime(spa_t *spa)
 {
@@ -887,6 +887,105 @@ spa_health_destroy(spa_t *spa)
 	mutex_destroy(&shk->lock);
 }

+/*
+ * Initial contents of a pool's "iostats" kstat: the name and data type of
+ * every counter, manual TRIM first and automatic TRIM second.  It is copied
+ * into a freshly allocated kstat data buffer, and copied again to reset
+ * the counters.
+ * NOTE(review): the entries presumably follow the field order of
+ * spa_iostats_t; confirm against the struct definition.
+ */
+static spa_iostats_t spa_iostats_template = {
+	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
+	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
+	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
+	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
+	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
+	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
+};
+
+/*
+ * Atomically add 'val' to the counter named 'stat'.  The macro reads a
+ * local variable called 'iostats' (a spa_iostats_t pointer), which must be
+ * in scope wherever it is used.  The expansion is a single expression with
+ * no trailing semicolon, so it behaves like a function call in any
+ * statement context, including an unbraced if/else.
+ */
+#define	SPA_IOSTATS_ADD(stat, val) \
+    atomic_add_64(&iostats->stat.value.ui64, (val))
+
+/*
+ * Add the given extent and byte counts to the pool's TRIM kstat counters.
+ * TRIM_TYPE_MANUAL updates the "trim_*" counters; any other type updates
+ * the "autotrim_*" counters.  Does nothing if the pool has no iostats
+ * kstat (shk->kstat is NULL).
+ */
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+    uint64_t extents_written, uint64_t bytes_written,
+    uint64_t extents_skipped, uint64_t bytes_skipped,
+    uint64_t extents_failed, uint64_t bytes_failed)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	kstat_t *ksp = shk->kstat;
+	spa_iostats_t *iostats;
+
+	if (ksp == NULL)
+		return;
+
+	iostats = ksp->ks_data;
+	if (type == TRIM_TYPE_MANUAL) {
+		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+	} else {
+		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+	}
+}
+
+/*
+ * kstat update callback for the pool's "iostats" kstat.  On KSTAT_WRITE
+ * the kstat data is overwritten with spa_iostats_template; any other
+ * 'rw' value leaves the data untouched.  Always returns 0.
+ */
+int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+	if (rw != KSTAT_WRITE)
+		return (0);
+
+	memcpy(ksp->ks_data, &spa_iostats_template, sizeof (spa_iostats_t));
+
+	return (0);
+}
+
+/*
+ * Create and install the pool's "iostats" kstat under the "zfs/<pool name>"
+ * module.  The kstat data is allocated here and seeded from
+ * spa_iostats_template.  If kstat_create() returns NULL, shk->kstat is
+ * left NULL and only the lock is initialized.
+ */
+static void
+spa_iostats_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+	char *kstat_name;
+	kstat_t *ksp;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	kstat_name = kmem_asprintf("zfs/%s", spa_name(spa));
+	ksp = kstat_create(kstat_name, 0, "iostats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	shk->kstat = ksp;
+
+	if (ksp != NULL) {
+		int nbytes = sizeof (spa_iostats_t);
+
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_iostats_update;
+		ksp->ks_data = kmem_alloc(nbytes, KM_SLEEP);
+		memcpy(ksp->ks_data, &spa_iostats_template, nbytes);
+		kstat_install(ksp);
+	}
+
+	strfree(kstat_name);
+}
+
+/*
+ * Tear down the pool's "iostats" kstat: free its data buffer and delete
+ * the kstat if one was created, then destroy the associated lock.
+ */
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+	if (shk->kstat != NULL) {
+		kstat_t *ksp = shk->kstat;
+
+		kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+		kstat_delete(ksp);
+	}
+
+	mutex_destroy(&shk->lock);
+}
+
 void
 spa_stats_init(spa_t *spa)
 {
@@ -896,11 +995,13 @@ spa_stats_init(spa_t *spa)
 	spa_io_history_init(spa);
 	spa_mmp_history_init(spa);
 	spa_state_init(spa);
+	spa_iostats_init(spa);
 }

 void
 spa_stats_destroy(spa_t *spa)
 {
+	spa_iostats_destroy(spa);
 	spa_health_destroy(spa);
 	spa_tx_assign_destroy(spa);
 	spa_txg_history_destroy(spa);
@@ -694,8 +694,12 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
 	mutex_exit(&tx->tx_sync_lock);
 }

+/*
+ * Wait for the specified open transaction group.  Set should_quiesce
+ * when the current open txg should be quiesced immediately.
+ */
 void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
 {
 	tx_state_t *tx = &dp->dp_tx;

@@ -705,7 +709,7 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg)
 	ASSERT3U(tx->tx_threads, ==, 2);
 	if (txg == 0)
 		txg = tx->tx_open_txg + 1;
-	if (tx->tx_quiesce_txg_waiting < txg)
+	if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
 		tx->tx_quiesce_txg_waiting = txg;
 	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
@@ -51,6 +51,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>

@@ -543,6 +544,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
+	list_link_init(&vd->vdev_trim_node);
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -551,6 +553,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
@@ -875,7 +883,10 @@ void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
+
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);

 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
@@ -906,7 +917,6 @@ vdev_free(vdev_t *vd)

 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
-	ASSERT(vd->vdev_initialize_thread == NULL);

 	/*
 	 * Discard allocation state.
@@ -988,6 +998,12 @@ vdev_free(vdev_t *vd)
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
+	mutex_destroy(&vd->vdev_trim_lock);
+	mutex_destroy(&vd->vdev_autotrim_lock);
+	mutex_destroy(&vd->vdev_trim_io_lock);
+	cv_destroy(&vd->vdev_trim_cv);
+	cv_destroy(&vd->vdev_autotrim_cv);
+	cv_destroy(&vd->vdev_trim_io_cv);

 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -3475,6 +3491,16 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 	}
 	mutex_exit(&vd->vdev_initialize_lock);

+	/* Restart trimming if necessary */
+	mutex_enter(&vd->vdev_trim_lock);
+	if (vdev_writeable(vd) &&
+	    vd->vdev_trim_thread == NULL &&
+	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+		    vd->vdev_trim_secure);
+	}
+	mutex_exit(&vd->vdev_trim_lock);
+
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3745,8 +3771,7 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
-	int t;
-	for (t = 0; t < ZIO_TYPES; t++) {
+	for (int t = 0; t < VS_ZIO_TYPES; t++) {
 		vs->vs_ops[t] += cvs->vs_ops[t];
 		vs->vs_bytes[t] += cvs->vs_bytes[t];
 	}
@@ -3873,7 +3898,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
 			/*
-			 * Report intializing progress. Since we don't
+			 * Report initializing progress. Since we don't
 			 * have the initializing locks held, this is only
 			 * an estimate (although a fairly accurate one).
 			 */
@@ -3884,9 +3909,20 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 			vs->vs_initialize_state = vd->vdev_initialize_state;
 			vs->vs_initialize_action_time =
 			    vd->vdev_initialize_action_time;
+
+			/*
+			 * Report manual TRIM progress. Since we don't have
+			 * the manual TRIM locks held, this is only an
+			 * estimate (although a fairly accurate one).
+			 */
+			vs->vs_trim_notsup = !vd->vdev_has_trim;
+			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+			vs->vs_trim_state = vd->vdev_trim_state;
+			vs->vs_trim_action_time = vd->vdev_trim_action_time;
 		}
 		/*
-		 * Report expandable space on top-level, non-auxillary devices
+		 * Report expandable space on top-level, non-auxiliary devices
 		 * only. The expandable space is reported in terms of metaslab
 		 * sized units since that determines how much space the pool
 		 * can expand.
@@ -4004,9 +4040,18 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+			zio_type_t vs_type = type;

-			vs->vs_ops[type]++;
-			vs->vs_bytes[type] += psize;
+			/*
+			 * TRIM ops and bytes are reported to user space as
+			 * ZIO_TYPE_IOCTL.  This is done to preserve the
+			 * vdev_stat_t structure layout for user space.
+			 */
+			if (type == ZIO_TYPE_TRIM)
+				vs_type = ZIO_TYPE_IOCTL;
+
+			vs->vs_ops[vs_type]++;
+			vs->vs_bytes[vs_type] += psize;

 			if (flags & ZIO_FLAG_DELEGATED) {
 				vsx->vsx_agg_histo[zio->io_priority]
@@ -4104,7 +4149,8 @@ vdev_deflated_space(vdev_t *vd, int64_t space)
 }

 /*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
 */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
@@ -4650,12 +4696,56 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
 	spa->spa_resilver_deferred = B_TRUE;
 }

+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+	/*
+	 * Walk up the vdev tree
+	 */
+	if (vd != vd->vdev_top) {
+		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+	} else {
+		/*
+		 * We've reached the top-level vdev, initialize the
+		 * physical range to the logical range and start to
+		 * unwind.
+		 */
+		physical_rs->rs_start = logical_rs->rs_start;
+		physical_rs->rs_end = logical_rs->rs_end;
+		return;
+	}
+
+	vdev_t *pvd = vd->vdev_parent;
+	ASSERT3P(pvd, !=, NULL);
+	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+	/*
+	 * As this recursive function unwinds, translate the logical
+	 * range into its physical components by calling the
+	 * vdev specific translate function.
+	 */
+	range_seg_t intermediate = { { { 0, 0 } } };
+	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+	physical_rs->rs_start = intermediate.rs_start;
+	physical_rs->rs_end = intermediate.rs_end;
+}
+
 #if defined(_KERNEL)
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
+
 /* BEGIN CSTYLED */
 module_param(zfs_vdev_default_ms_count, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_default_ms_count,
@@ -30,6 +30,7 @@
 #include <sys/spa_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
@@ -223,7 +224,7 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
 	strfree(argv[2]);
 #endif /* HAVE_ELEVATOR_CHANGE */
 	if (error) {
-		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
 		    elevator, v->vdev_path, device, error);
 	}
 }
@@ -322,7 +323,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,

 	if (IS_ERR(bdev)) {
 		int error = -PTR_ERR(bdev);
-		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
+		vdev_dbgmsg(v, "open error=%d count=%d", error, count);
 		vd->vd_bdev = NULL;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
@@ -333,14 +334,22 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		rw_exit(&vd->vd_lock);
 	}

+	struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
 	/*  Determine the physical block size */
 	block_size = vdev_bdev_block_size(vd->vd_bdev);

 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;

+	/* Set when device reports it supports TRIM. */
+	v->vdev_has_trim = !!blk_queue_discard(q);
+
+	/* Set when device reports it supports secure TRIM. */
+	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
 	/* Inform the ZIO pipeline that we are non-rotational */
-	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
+	v->vdev_nonrot = blk_queue_nonrot(q);

 	/* Physical volume size in bytes for the partition */
 	*psize = bdev_capacity(vd->vd_bdev);
@@ -728,6 +737,7 @@ vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
+	unsigned long trim_flags = 0;
 	int rw, flags, error;

 	/*
@@ -813,6 +823,19 @@ vdev_disk_io_start(zio_t *zio)
 #endif
 		break;

+	case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+			trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+		    trim_flags);
+
+		rw_exit(&vd->vd_lock);
+		zio_interrupt(zio);
+		return;
+
 	default:
 		rw_exit(&vd->vd_lock);
 		zio->io_error = SET_ERROR(ENOTSUP);
@@ -28,10 +28,13 @@
 #include <sys/spa_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>

 /*
 * Virtual device vector for files.
@@ -60,9 +63,24 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
 	vattr_t vattr;
 	int error;

-	/* Rotational optimizations only make sense on block devices */
+	/*
+	 * Rotational optimizations only make sense on block devices.
+	 */
 	vd->vdev_nonrot = B_TRUE;

+	/*
+	 * Allow TRIM on file based vdevs.  This may not always be supported,
+	 * since it depends on your kernel version and underlying filesystem
+	 * type but it is always safe to attempt.
+	 */
+	vd->vdev_has_trim = B_TRUE;
+
+	/*
+	 * Disable secure TRIM on file based vdevs.  There is no way to
+	 * request this behavior from the underlying filesystem.
+	 */
+	vd->vdev_has_securetrim = B_FALSE;
+
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
@@ -227,6 +245,21 @@ vdev_file_io_start(zio_t *zio)
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}

+		zio_execute(zio);
+		return;
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		struct flock flck;
+
+		ASSERT3U(zio->io_size, !=, 0);
+		bzero(&flck, sizeof (flck));
+		flck.l_type = F_FREESP;
+		flck.l_start = zio->io_offset;
+		flck.l_len = zio->io_size;
+		flck.l_whence = 0;
+
+		zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck,
+		    0, 0, kcred, NULL);
+
 		zio_execute(zio);
 		return;
 	}
@@ -33,12 +33,6 @@
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>

-/*
- * Maximum number of metaslabs per group that can be initialized
- * simultaneously.
- */
-int max_initialize_ms = 3;
-
 /*
 * Value that is written to disk during initialization.
 */
@@ -132,7 +126,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
-	    guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

 	switch (new_state) {
 	case VDEV_INITIALIZE_ACTIVE:
@@ -250,49 +244,6 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
 	return (0);
 }

-/*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
- */
-void
-vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
-{
-	/*
-	 * Walk up the vdev tree
-	 */
-	if (vd != vd->vdev_top) {
-		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
-	} else {
-		/*
-		 * We've reached the top-level vdev, initialize the
-		 * physical range to the logical range and start to
-		 * unwind.
-		 */
-		physical_rs->rs_start = logical_rs->rs_start;
-		physical_rs->rs_end = logical_rs->rs_end;
-		return;
-	}
-
-	vdev_t *pvd = vd->vdev_parent;
-	ASSERT3P(pvd, !=, NULL);
-	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
-
-	/*
-	 * As this recursive function unwinds, translate the logical
-	 * range into its physical components by calling the
-	 * vdev specific translate function.
-	 */
-	range_seg_t intermediate = { { { 0, 0 } } };
-	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
-
-	physical_rs->rs_start = intermediate.rs_start;
-	physical_rs->rs_end = intermediate.rs_end;
-}
-
 /*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
@@ -362,81 +313,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 	return (0);
 }

-static void
-vdev_initialize_mg_wait(metaslab_group_t *mg)
-{
-	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
-	while (mg->mg_initialize_updating) {
-		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
-	}
-}
-
-static void
-vdev_initialize_mg_mark(metaslab_group_t *mg)
-{
-	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
-	ASSERT(mg->mg_initialize_updating);
-
-	while (mg->mg_ms_initializing >= max_initialize_ms) {
-		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
-	}
-	mg->mg_ms_initializing++;
-	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
-}
-
-/*
- * Mark the metaslab as being initialized to prevent any allocations
- * on this metaslab. We must also track how many metaslabs are currently
- * being initialized within a metaslab group and limit them to prevent
- * allocation failures from occurring because all metaslabs are being
- * initialized.
- */
-static void
-vdev_initialize_ms_mark(metaslab_t *msp)
-{
-	ASSERT(!MUTEX_HELD(&msp->ms_lock));
-	metaslab_group_t *mg = msp->ms_group;
-
-	mutex_enter(&mg->mg_ms_initialize_lock);
-
-	/*
-	 * To keep an accurate count of how many threads are initializing
-	 * a specific metaslab group, we only allow one thread to mark
-	 * the metaslab group at a time. This ensures that the value of
-	 * ms_initializing will be accurate when we decide to mark a metaslab
-	 * group as being initialized. To do this we force all other threads
-	 * to wait till the metaslab's mg_initialize_updating flag is no
-	 * longer set.
-	 */
-	vdev_initialize_mg_wait(mg);
-	mg->mg_initialize_updating = B_TRUE;
-	if (msp->ms_initializing == 0) {
-		vdev_initialize_mg_mark(mg);
-	}
-	mutex_enter(&msp->ms_lock);
-	msp->ms_initializing++;
-	mutex_exit(&msp->ms_lock);
-
-	mg->mg_initialize_updating = B_FALSE;
-	cv_broadcast(&mg->mg_ms_initialize_cv);
-	mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
-static void
-vdev_initialize_ms_unmark(metaslab_t *msp)
-{
-	ASSERT(!MUTEX_HELD(&msp->ms_lock));
-	metaslab_group_t *mg = msp->ms_group;
-	mutex_enter(&mg->mg_ms_initialize_lock);
-	mutex_enter(&msp->ms_lock);
-	if (--msp->ms_initializing == 0) {
-		mg->mg_ms_initializing--;
-		cv_broadcast(&mg->mg_ms_initialize_cv);
-	}
-	mutex_exit(&msp->ms_lock);
-	mutex_exit(&mg->mg_ms_initialize_lock);
-}
-
 static void
 vdev_initialize_calculate_progress(vdev_t *vd)
 {
@@ -535,9 +411,8 @@ vdev_initialize_load(vdev_t *vd)
 	return (err);
 }

-
 /*
- * Convert the logical range into a physcial range and add it to our
+ * Convert the logical range into a physical range and add it to our
 * avl tree.
 */
 void
@@ -618,7 +493,8 @@ vdev_initialize_thread(void *arg)
 			ms_count = vd->vdev_top->vdev_ms_count;
 		}

-		vdev_initialize_ms_mark(msp);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));

@@ -626,16 +502,8 @@ vdev_initialize_thread(void *arg)
 		    vd);
 		mutex_exit(&msp->ms_lock);

-		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		error = vdev_initialize_ranges(vd, deadbeef);
-
-		/*
-		 * Wait for the outstanding IO to be synced to prevent
-		 * newly allocated blocks from being overwritten.
-		 */
-		txg_wait_synced(spa_get_dsl(spa), 0);
-
-		vdev_initialize_ms_unmark(msp);
+		metaslab_enable(msp, B_TRUE);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

 		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
@@ -853,12 +721,11 @@ vdev_initialize_restart(vdev_t *vd)
 }

 #if defined(_KERNEL)
-EXPORT_SYMBOL(vdev_initialize_restart);
-EXPORT_SYMBOL(vdev_xlate);
 EXPORT_SYMBOL(vdev_initialize);
 EXPORT_SYMBOL(vdev_initialize_stop);
 EXPORT_SYMBOL(vdev_initialize_stop_all);
 EXPORT_SYMBOL(vdev_initialize_stop_wait);
+EXPORT_SYMBOL(vdev_initialize_restart);

 /* CSTYLED */
 module_param(zfs_initialize_value, ulong, 0644);
@@ -251,6 +251,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
 	    vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);

+	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+	    vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
+
 	/* ZIOs pending */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
@@ -267,6 +270,9 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
 	    vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);

+	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+	    vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
+
 	/* Histograms */
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
 	    vsx->vsx_total_histo[ZIO_TYPE_READ],
@@ -304,6 +310,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	    vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));

+	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+	    vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
+	    ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
+
 	/* Request sizes */
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
@@ -325,6 +335,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	    vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));

+	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+	    vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
+	    ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
+
 	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
@@ -345,6 +359,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
 	    vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
 	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));

+	fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+	    vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
+	    ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
+
 	/* IO delays */
 	fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);

@@ -156,6 +156,8 @@ uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
 uint32_t zfs_vdev_initializing_max_active = 1;
+uint32_t zfs_vdev_trim_min_active = 1;
+uint32_t zfs_vdev_trim_max_active = 2;

 /*
 * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -203,6 +205,12 @@ int zfs_vdev_queue_depth_pct = 300;
 */
 int zfs_vdev_def_queue_depth = 32;

+/*
+ * Allow TRIM I/Os to be aggregated.  This should normally not be needed since
+ * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
+ * by the TRIM code in vdev_trim.c.
+ */
+int zfs_vdev_aggregate_trim = 0;

 int
 vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -227,11 +235,13 @@ vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 static inline avl_tree_t *
 vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 {
-	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
 	if (t == ZIO_TYPE_READ)
 		return (&vq->vq_read_offset_tree);
-	else
+	else if (t == ZIO_TYPE_WRITE)
 		return (&vq->vq_write_offset_tree);
+	else
+		return (&vq->vq_trim_offset_tree);
 }

 int
@@ -266,6 +276,8 @@ vdev_queue_class_min_active(zio_priority_t p)
 		return (zfs_vdev_removal_min_active);
 	case ZIO_PRIORITY_INITIALIZING:
 		return (zfs_vdev_initializing_min_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -338,6 +350,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 		return (zfs_vdev_removal_max_active);
 	case ZIO_PRIORITY_INITIALIZING:
 		return (zfs_vdev_initializing_max_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_max_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -398,19 +412,25 @@ vdev_queue_init(vdev_t *vd)
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));

 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		int (*compfn) (const void *, const void *);

 		/*
-		 * The synchronous i/o queues are dispatched in FIFO rather
+		 * The synchronous/trim i/o queues are dispatched in FIFO rather
 		 * than LBA order. This provides more consistent latency for
 		 * these i/os.
 		 */
-		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+		if (p == ZIO_PRIORITY_SYNC_READ ||
+		    p == ZIO_PRIORITY_SYNC_WRITE ||
+		    p == ZIO_PRIORITY_TRIM) {
 			compfn = vdev_queue_timestamp_compare;
-		else
+		} else {
 			compfn = vdev_queue_offset_compare;
+		}
 		avl_create(vdev_queue_class_tree(vq, p), compfn,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
@@ -428,6 +448,7 @@ vdev_queue_fini(vdev_t *vd)
 	avl_destroy(&vq->vq_active_tree);
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));

 	mutex_destroy(&vq->vq_lock);
 }
@@ -559,6 +580,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
 		return (NULL);

+	/*
+	 * While TRIM commands could be aggregated based on offset this
+	 * behavior is disabled until it's determined to be beneficial.
+	 */
+	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+		return (NULL);
+
 	first = last = zio;

 	if (zio->io_type == ZIO_TYPE_READ)
@@ -732,7 +760,7 @@ again:
 	 * For LBA-ordered queues (async / scrub / initializing), issue the
 	 * i/o which follows the most recently issued i/o in LBA (offset) order.
 	 *
-	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+	 * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
 	 */
 	tree = vdev_queue_class_tree(vq, p);
 	vq->vq_io_search.io_timestamp = 0;
@@ -783,19 +811,27 @@ vdev_queue_io(zio_t *zio)
 	 * not match the child's i/o type.  Fix it up here.
 	 */
 	if (zio->io_type == ZIO_TYPE_READ) {
+		ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB &&
 		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
-	} else {
-		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		}
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
+		ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+		}
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_TRIM);
+		ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
 	}

 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -922,6 +958,9 @@ module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating,
 	"Max vdev I/O aggregation size for non-rotating media");

+module_param(zfs_vdev_aggregate_trim, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");
+
 module_param(zfs_vdev_read_gap_limit, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");

@@ -995,6 +1034,14 @@ module_param(zfs_vdev_sync_write_min_active, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
 	"Min active sync write I/Os per vdev");

+module_param(zfs_vdev_trim_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_max_active,
+	"Max active trim/discard I/Os per vdev");
+
+module_param(zfs_vdev_trim_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_min_active,
+	"Min active trim/discard I/Os per vdev");
+
 module_param(zfs_vdev_queue_depth_pct, int, 0644);
 MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
 	"Queue depth percentage for each top-level vdev");
@@ -37,7 +37,7 @@
 #include <sys/vdev_raidz_impl.h>

 #ifdef ZFS_DEBUG
-#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
+#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif

 /*
@@ -45,6 +45,7 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
 #include <sys/trace_vdev.h>

 /*
@@ -1181,6 +1182,8 @@ vdev_remove_complete(spa_t *spa)
 	txg = spa_vdev_enter(spa);
 	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);

 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1869,8 +1872,10 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)

 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);

-	/* Stop initializing */
+	/* Stop initializing and TRIM */
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+	vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
+	vdev_autotrim_stop_wait(vd);

 	*txg = spa_vdev_config_enter(spa);

@@ -2051,11 +2056,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 	error = spa_reset_logs(spa);

 	/*
-	 * We stop any initializing that is currently in progress but leave
-	 * the state as "active". This will allow the initializing to resume
-	 * if the removal is canceled sometime later.
+	 * We stop any initializing and TRIM that is currently in progress
+	 * but leave the state as "active". This will allow the process to
+	 * resume if the removal is canceled sometime later.
 	 */
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+	vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
+	vdev_autotrim_stop_wait(vd);

 	*txg = spa_vdev_config_enter(spa);

@@ -2069,6 +2076,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 	if (error != 0) {
 		metaslab_group_activate(mg);
 		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+		spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+		spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 		return (error);
 	}

@@ -204,6 +204,7 @@
 #include <sys/zfs_sysfs.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>

 #include <linux/miscdevice.h>
 #include <linux/slab.h>
@@ -3885,7 +3886,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc)

 /*
 * innvl: {
- *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|DO|SUSPEND} (uint64)
+ *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
 *     "initialize_vdevs": { -> guids to initialize (nvlist)
 *         "vdev_path_1": vdev_guid_1, (uint64),
 *         "vdev_path_2": vdev_guid_2, (uint64),
@@ -3919,7 +3920,7 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	}

 	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
-	    cmd_type == POOL_INITIALIZE_DO ||
+	    cmd_type == POOL_INITIALIZE_START ||
 	    cmd_type == POOL_INITIALIZE_SUSPEND)) {
 		return (SET_ERROR(EINVAL));
 	}
@@ -3957,6 +3958,91 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 	return (total_errors > 0 ? EINVAL : 0);
 }

+/*
+ * innvl: {
+ *     "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
+ *     "trim_vdevs": { -> guids to TRIM (nvlist)
+ *         "vdev_path_1": vdev_guid_1, (uint64),
+ *         "vdev_path_2": vdev_guid_2, (uint64),
+ *         ...
+ *     },
+ *     "trim_rate" -> Target TRIM rate in bytes/sec.
+ *     "trim_secure" -> Set to request a secure TRIM.
+ * }
+ *
+ * outnvl: {
+ *     "trim_vdevs": { -> TRIM errors (nvlist)
+ *         "vdev_path_1": errno, see function body for possible errnos (uint64)
+ *         "vdev_path_2": errno, ... (uint64)
+ *         ...
+ *     }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
+	{ZPOOL_TRIM_COMMAND,	DATA_TYPE_UINT64,		0},
+	{ZPOOL_TRIM_VDEVS,	DATA_TYPE_NVLIST,		0},
+	{ZPOOL_TRIM_RATE,	DATA_TYPE_UINT64,		ZK_OPTIONAL},
+	{ZPOOL_TRIM_SECURE,	DATA_TYPE_BOOLEAN_VALUE,	ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	uint64_t cmd_type;
+	if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0)
+		return (SET_ERROR(EINVAL));
+
+	if (!(cmd_type == POOL_TRIM_CANCEL ||
+	    cmd_type == POOL_TRIM_START ||
+	    cmd_type == POOL_TRIM_SUSPEND)) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	nvlist_t *vdev_guids;
+	if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0)
+		return (SET_ERROR(EINVAL));
+
+	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+		uint64_t vdev_guid;
+		if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+			return (SET_ERROR(EINVAL));
+		}
+	}
+
+	/* Optional, defaults to maximum rate when not provided */
+	uint64_t rate;
+	if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0)
+		rate = 0;
+
+	/* Optional, defaults to standard TRIM when not provided */
+	boolean_t secure;
+	if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE,
+	    &secure) != 0) {
+		secure = B_FALSE;
+	}
+
+	spa_t *spa;
+	int error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	nvlist_t *vdev_errlist = fnvlist_alloc();
+	int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type,
+	    rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist);
+
+	if (fnvlist_size(vdev_errlist) > 0)
+		fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist);
+
+	fnvlist_free(vdev_errlist);
+
+	spa_close(spa, FTAG);
+	return (total_errors > 0 ? EINVAL : 0);
+}
+
 /*
 * fsname is name of dataset to rollback (to most recent snapshot)
 *
@@ -6580,6 +6666,11 @@ zfs_ioctl_init(void)
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
 	    zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));

+	zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
+	    zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+	    zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));
+
 	/* IOCTLS that use the legacy function signature */

 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
@@ -358,7 +358,8 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
 */
 static const char *zfs_features[]  = {
 	/* --> Add new kernel features here (post ZoL 0.8.0) */
-	"vdev_initialize"
+	"initialize",
+	"trim",
 };

 #define	ZFS_FEATURE_COUNT	ARRAY_SIZE(zfs_features)
@@ -32,6 +32,7 @@
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
@@ -58,7 +59,7 @@ const char *zio_type_name[ZIO_TYPES] = {
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
-	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
 };

 int zio_dva_throttle_enabled = B_TRUE;
@@ -761,7 +762,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
 	zio_t *zio;

-	ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE);
+	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

@@ -1211,6 +1212,26 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 	return (zio);
 }

+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, enum trim_flag trim_flags)
+{
+	zio_t *trim_zio;
+
+	/* Expects a childless vdev and a non-empty, ashift-aligned range. */
+	ASSERT0(vd->vdev_children);
+	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+	ASSERT3U(size, !=, 0);
+
+	trim_zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size,
+	    done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
+	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+	trim_zio->io_trim_flags = trim_flags;
+	return (trim_zio);
+}
+
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    abd_t *data, int checksum, zio_done_func_t *done, void *private,
@@ -3562,7 +3583,6 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 * ==========================================================================
 */

-
 /*
 * Issue an I/O to the underlying vdev. Typically the issue pipeline
 * stops after this stage and will resume upon I/O completion.
@@ -3685,8 +3705,8 @@ zio_vdev_io_start(zio_t *zio)
 		return (zio);
 	}

-	if (vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+	if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {

 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
 			return (zio);
@@ -3717,7 +3737,8 @@ zio_vdev_io_done(zio_t *zio)
 		return (NULL);
 	}

-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);

 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
@@ -3736,7 +3757,7 @@ zio_vdev_io_done(zio_t *zio)
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);

-		if (zio->io_error) {
+		if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
@@ -3866,8 +3887,8 @@ zio_vdev_io_assess(zio_t *zio)

 	/*
 	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
-	 * attempts will ever succeed. In this case we set a persistent bit so
-	 * that we don't bother with it in the future.
+	 * attempts will ever succeed. In this case we set a persistent
+	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
 	    zio->io_type == ZIO_TYPE_IOCTL &&