Add TRIM support

UNMAP/TRIM support is a frequently-requested feature to help
prevent performance from degrading on SSDs and on various other
SAN-like storage back-ends.  By issuing UNMAP/TRIM commands for
sectors which are no longer allocated, the underlying device can
often manage itself more efficiently.

This TRIM implementation is modeled on the `zpool initialize`
feature which writes a pattern to all unallocated space in the
pool.  The new `zpool trim` command uses the same vdev_xlate()
code to calculate what sectors are unallocated, the same per-
vdev TRIM thread model and locking, and the same basic CLI for
a consistent user experience.  The core difference is that
instead of writing a pattern it will issue UNMAP/TRIM commands
for those extents.

The zio pipeline was updated to accommodate this by adding a new
ZIO_TYPE_TRIM type and associated spa taskq.  This new type makes
it straightforward to add the platform specific TRIM/UNMAP calls
to vdev_disk.c and vdev_file.c.  These new ZIO_TYPE_TRIM zios are
handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs.
This makes it possible to largely avoid changing the pipeline;
one exception is that TRIM zios may exceed the 16M block size
limit since they contain no data.

In addition to the manual `zpool trim` command, a background
automatic TRIM was added and is controlled by the 'autotrim'
property.  It relies on the exact same infrastructure as the
manual TRIM.  However, instead of relying on the extents in a
metaslab's ms_allocatable range tree, a ms_trim tree is kept
per metaslab.  When 'autotrim=on', ranges added back to the
ms_allocatable tree are also added to the ms_trim tree.  The
ms_trim tree is then periodically consumed by an autotrim
thread which systematically walks a top level vdev's metaslabs.

Since the automatic TRIM will skip ranges it considers too small
there is value in occasionally running a full `zpool trim`.  This
may occur when the freed blocks are small and not enough time
was allowed to aggregate them.  An automatic TRIM and a manual
`zpool trim` may be run concurrently, in which case the automatic
TRIM will yield to the manual TRIM.

Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Tim Chase <tim@chase2k.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Contributions-by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Contributions-by: Tim Chase <tim@chase2k.com>
Contributions-by: Chunwei Chen <tuxoko@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8419 
Closes #598
This commit is contained in:
Brian Behlendorf
2019-03-29 09:13:20 -07:00
committed by GitHub
parent f94b3cbf43
commit 1b939560be
91 changed files with 5593 additions and 439 deletions
+57 -10
View File
@@ -156,6 +156,8 @@ uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
uint32_t zfs_vdev_initializing_min_active = 1;
uint32_t zfs_vdev_initializing_max_active = 1;
uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 2;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -203,6 +205,12 @@ int zfs_vdev_queue_depth_pct = 300;
*/
int zfs_vdev_def_queue_depth = 32;
/*
* Allow TRIM I/Os to be aggregated. This should normally not be needed since
* TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
* by the TRIM code in zfs_trim.c.
*/
int zfs_vdev_aggregate_trim = 0;
int
vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -227,11 +235,13 @@ vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
if (t == ZIO_TYPE_READ)
return (&vq->vq_read_offset_tree);
else
else if (t == ZIO_TYPE_WRITE)
return (&vq->vq_write_offset_tree);
else
return (&vq->vq_trim_offset_tree);
}
int
@@ -266,6 +276,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_removal_min_active);
case ZIO_PRIORITY_INITIALIZING:
return (zfs_vdev_initializing_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -338,6 +350,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_removal_max_active);
case ZIO_PRIORITY_INITIALIZING:
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -398,19 +412,25 @@ vdev_queue_init(vdev_t *vd)
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
/*
* The synchronous i/o queues are dispatched in FIFO rather
* The synchronous/trim i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
if (p == ZIO_PRIORITY_SYNC_READ ||
p == ZIO_PRIORITY_SYNC_WRITE ||
p == ZIO_PRIORITY_TRIM) {
compfn = vdev_queue_timestamp_compare;
else
} else {
compfn = vdev_queue_offset_compare;
}
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
@@ -428,6 +448,7 @@ vdev_queue_fini(vdev_t *vd)
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
mutex_destroy(&vq->vq_lock);
}
@@ -559,6 +580,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
return (NULL);
/*
* While TRIM commands could be aggregated based on offset this
* behavior is disabled until it's determined to be beneficial.
*/
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
@@ -732,7 +760,7 @@ again:
* For LBA-ordered queues (async / scrub / initializing), issue the
* i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
* For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
*/
tree = vdev_queue_class_tree(vq, p);
vq->vq_io_search.io_timestamp = 0;
@@ -783,19 +811,27 @@ vdev_queue_io(zio_t *zio)
* not match the child's i/o type. Fix it up here.
*/
if (zio->io_type == ZIO_TYPE_READ) {
ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
} else {
ASSERT(zio->io_type == ZIO_TYPE_TRIM);
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -922,6 +958,9 @@ module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating,
"Max vdev I/O aggregation size for non-rotating media");
module_param(zfs_vdev_aggregate_trim, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");
module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
@@ -995,6 +1034,14 @@ module_param(zfs_vdev_sync_write_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
"Min active sync write I/Os per vdev");
module_param(zfs_vdev_trim_max_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_trim_max_active,
"Max active trim/discard I/Os per vdev");
module_param(zfs_vdev_trim_min_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_trim_min_active,
"Min active trim/discard I/Os per vdev");
module_param(zfs_vdev_queue_depth_pct, int, 0644);
MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
"Queue depth percentage for each top-level vdev");