mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Add TRIM support
UNMAP/TRIM support is a frequently-requested feature to help prevent performance from degrading on SSDs and on various other SAN-like storage back-ends. By issuing UNMAP/TRIM commands for sectors which are no longer allocated, the underlying device can often more efficiently manage itself. This TRIM implementation is modeled on the `zpool initialize` feature which writes a pattern to all unallocated space in the pool. The new `zpool trim` command uses the same vdev_xlate() code to calculate what sectors are unallocated, the same per-vdev TRIM thread model and locking, and the same basic CLI for a consistent user experience. The core difference is that instead of writing a pattern it will issue UNMAP/TRIM commands for those extents. The zio pipeline was updated to accommodate this by adding a new ZIO_TYPE_TRIM type and associated spa taskq. This new type makes it straightforward to add the platform-specific TRIM/UNMAP calls to vdev_disk.c and vdev_file.c. These new ZIO_TYPE_TRIM zios are handled largely the same way as ZIO_TYPE_READs or ZIO_TYPE_WRITEs. This makes it possible to largely avoid changing the pipeline; one exception is that TRIM zios may exceed the 16M block size limit since they contain no data. In addition to the manual `zpool trim` command, a background automatic TRIM was added and is controlled by the 'autotrim' property. It relies on the exact same infrastructure as the manual TRIM. However, instead of relying on the extents in a metaslab's ms_allocatable range tree, a ms_trim tree is kept per metaslab. When 'autotrim=on', ranges added back to the ms_allocatable tree are also added to the ms_trim tree. The ms_trim tree is then periodically consumed by an autotrim thread which systematically walks a top level vdev's metaslabs. Since the automatic TRIM will skip ranges it considers too small there is value in occasionally running a full `zpool trim`. This may occur when the freed blocks are small and not enough time was allowed to aggregate them. 
An automatic TRIM and a manual `zpool trim` may be run concurrently, in which case the automatic TRIM will yield to the manual TRIM. Reviewed-by: Jorgen Lundman <lundman@lundman.net> Reviewed-by: Tim Chase <tim@chase2k.com> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Contributions-by: Saso Kiselkov <saso.kiselkov@nexenta.com> Contributions-by: Tim Chase <tim@chase2k.com> Contributions-by: Chunwei Chen <tuxoko@gmail.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #8419 Closes #598
This commit is contained in:
+83
-2
@@ -107,6 +107,7 @@
|
||||
#include <sys/vdev_impl.h>
|
||||
#include <sys/vdev_file.h>
|
||||
#include <sys/vdev_initialize.h>
|
||||
#include <sys/vdev_trim.h>
|
||||
#include <sys/spa_impl.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/dsl_prop.h>
|
||||
@@ -374,6 +375,7 @@ ztest_func_t ztest_spa_upgrade;
|
||||
ztest_func_t ztest_device_removal;
|
||||
ztest_func_t ztest_spa_checkpoint_create_discard;
|
||||
ztest_func_t ztest_initialize;
|
||||
ztest_func_t ztest_trim;
|
||||
ztest_func_t ztest_fletcher;
|
||||
ztest_func_t ztest_fletcher_incr;
|
||||
ztest_func_t ztest_verify_dnode_bt;
|
||||
@@ -427,6 +429,7 @@ ztest_info_t ztest_info[] = {
|
||||
ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
|
||||
ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
|
||||
ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
|
||||
ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
|
||||
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
|
||||
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
|
||||
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
|
||||
@@ -4897,7 +4900,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
|
||||
umem_free(bigcheck, bigsize);
|
||||
}
|
||||
if (i == 2) {
|
||||
txg_wait_open(dmu_objset_pool(os), 0);
|
||||
txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
|
||||
} else if (i == 3) {
|
||||
txg_wait_synced(dmu_objset_pool(os), 0);
|
||||
}
|
||||
@@ -5574,6 +5577,8 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
|
||||
(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
|
||||
ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
|
||||
|
||||
(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));
|
||||
|
||||
VERIFY0(spa_prop_get(ztest_spa, &props));
|
||||
|
||||
if (ztest_opts.zo_verbose >= 6)
|
||||
@@ -6484,7 +6489,7 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
|
||||
(void) printf("\n");
|
||||
}
|
||||
break;
|
||||
case POOL_INITIALIZE_DO:
|
||||
case POOL_INITIALIZE_START:
|
||||
if (ztest_opts.zo_verbose >= 4) {
|
||||
(void) printf("Start initialize %s", path);
|
||||
if (active && error == 0)
|
||||
@@ -6507,6 +6512,82 @@ ztest_initialize(ztest_ds_t *zd, uint64_t id)
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
void
|
||||
ztest_trim(ztest_ds_t *zd, uint64_t id)
|
||||
{
|
||||
spa_t *spa = ztest_spa;
|
||||
int error = 0;
|
||||
|
||||
mutex_enter(&ztest_vdev_lock);
|
||||
|
||||
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
|
||||
|
||||
/* Random leaf vdev */
|
||||
vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
|
||||
if (rand_vd == NULL) {
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The random vdev we've selected may change as soon as we
|
||||
* drop the spa_config_lock. We create local copies of things
|
||||
* we're interested in.
|
||||
*/
|
||||
uint64_t guid = rand_vd->vdev_guid;
|
||||
char *path = strdup(rand_vd->vdev_path);
|
||||
boolean_t active = rand_vd->vdev_trim_thread != NULL;
|
||||
|
||||
zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
|
||||
spa_config_exit(spa, SCL_VDEV, FTAG);
|
||||
|
||||
uint64_t cmd = ztest_random(POOL_TRIM_FUNCS);
|
||||
uint64_t rate = 1 << ztest_random(30);
|
||||
boolean_t partial = (ztest_random(5) > 0);
|
||||
boolean_t secure = (ztest_random(5) > 0);
|
||||
|
||||
nvlist_t *vdev_guids = fnvlist_alloc();
|
||||
nvlist_t *vdev_errlist = fnvlist_alloc();
|
||||
fnvlist_add_uint64(vdev_guids, path, guid);
|
||||
error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial,
|
||||
secure, vdev_errlist);
|
||||
fnvlist_free(vdev_guids);
|
||||
fnvlist_free(vdev_errlist);
|
||||
|
||||
switch (cmd) {
|
||||
case POOL_TRIM_CANCEL:
|
||||
if (ztest_opts.zo_verbose >= 4) {
|
||||
(void) printf("Cancel TRIM %s", path);
|
||||
if (!active)
|
||||
(void) printf(" failed (no TRIM active)");
|
||||
(void) printf("\n");
|
||||
}
|
||||
break;
|
||||
case POOL_TRIM_START:
|
||||
if (ztest_opts.zo_verbose >= 4) {
|
||||
(void) printf("Start TRIM %s", path);
|
||||
if (active && error == 0)
|
||||
(void) printf(" failed (already active)");
|
||||
else if (error != 0)
|
||||
(void) printf(" failed (error %d)", error);
|
||||
(void) printf("\n");
|
||||
}
|
||||
break;
|
||||
case POOL_TRIM_SUSPEND:
|
||||
if (ztest_opts.zo_verbose >= 4) {
|
||||
(void) printf("Suspend TRIM %s", path);
|
||||
if (!active)
|
||||
(void) printf(" failed (no TRIM active)");
|
||||
(void) printf("\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
free(path);
|
||||
mutex_exit(&ztest_vdev_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify pool integrity by running zdb.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user