OpenZFS 9238 - ZFS Spacemap Encoding V2

Motivation
==========

The current space map encoding has the following disadvantages:
[1] Assuming a 512-byte sector size, each entry can represent at most 16MB for a segment.
    This makes the encoding very inefficient for large regions of space.
[2] As vdev-wide space maps have started to be used by new features (i.e.
    device removal, zpool checkpoint) we've started imposing limits on the
    vdevs that can be used with them based on the maximum addressable offset
    (currently 64PB for a top-level vdev).

New encoding
============

The layout can be found in space_map.h and it remains backwards compatible with
the old one. The introduced two-word entry format, besides extending the limits
imposed by the single-entry layout, also includes a vdev field and some extra
padding after its prefix.

The extra padding after the prefix is reserved for future usage (e.g.
new prefixes for future encodings or new fields for flags). The new vdev field
not only makes the space maps more self-descriptive, but also opens the doors
for pool-wide space maps (expected to be used in the log spacemap project).

One final important note is that the number of bits used for vdevs is reduced
to 24 bits for blkptrs. That was decided as we don't know of any setups that
use more than 16M vdevs for the time being and we wanted to fit the vdev field
in the space map. In addition, that gives us some extra bits in dva_t.

Other references:
=================

The new encoding is also discussed towards the end of the Log Space Map
presentation from 2017's OpenZFS summit.
Link: https://www.youtube.com/watch?v=jj2IxRkl5bQ

Authored by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>

OpenZFS-commit: https://github.com/openzfs/openzfs/commit/90a56e6d
OpenZFS-issue: https://www.illumos.org/issues/9238
Closes #7665
This commit is contained in:
Serapheim Dimitropoulos
2017-08-04 09:30:49 -07:00
committed by Brian Behlendorf
parent 4e82b4be78
commit 4d044c4c1d
15 changed files with 867 additions and 373 deletions
+26 -24
View File
@@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg {
} spa_checkpoint_discard_sync_callback_arg_t;
static int
spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
uint64_t size, void *arg)
spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
{
spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
vdev_t *vd = sdc->sdc_vd;
metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
uint64_t end = offset + size;
metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
uint64_t end = sme->sme_offset + sme->sme_run;
if (sdc->sdc_entry_limit == 0)
return (EINTR);
@@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
* metaslab boundaries. So if needed we could add code
* that handles metaslab-crossing segments in the future.
*/
VERIFY3U(type, ==, SM_FREE);
VERIFY3U(offset, >=, ms->ms_start);
VERIFY3U(sme->sme_type, ==, SM_FREE);
VERIFY3U(sme->sme_offset, >=, ms->ms_start);
VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
/*
@@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
mutex_enter(&ms->ms_lock);
if (range_tree_is_empty(ms->ms_freeing))
vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
range_tree_add(ms->ms_freeing, offset, size);
range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
sme->sme_run);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
vd->vdev_stat.vs_checkpoint_space -= size;
vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
sdc->sdc_entry_limit--;
return (0);
@@ -291,12 +291,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
* Thus, we set the maximum entries that the space map callback
* will be applied to be half the entries that could fit in the
* imposed memory limit.
*
* Note that since this is a conservative estimate we also
* assume the worst case scenario in our computation where each
* entry is two-word.
*/
uint64_t max_entry_limit =
(zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
uint64_t entries_in_sm =
space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
(zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
/*
* Iterate from the end of the space map towards the beginning,
@@ -320,14 +321,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
spa_checkpoint_discard_sync_callback_arg_t sdc;
sdc.sdc_vd = vd;
sdc.sdc_txg = tx->tx_txg;
sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
sdc.sdc_entry_limit = max_entry_limit;
uint64_t entries_before = entries_in_sm;
uint64_t words_before =
space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
spa_checkpoint_discard_sync_callback, &sdc, tx);
uint64_t entries_after =
uint64_t words_after =
space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
#ifdef ZFS_DEBUG
@@ -335,9 +337,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
#endif
zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
"deleted %llu entries - %llu entries are left",
tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
entries_after);
"deleted %llu words - %llu words are left",
tx->tx_txg, vd->vdev_id, (words_before - words_after),
words_after);
if (error != EINTR) {
if (error != 0) {
@@ -346,15 +348,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
"space map of vdev %llu\n",
error, vd->vdev_id);
}
ASSERT0(entries_after);
ASSERT0(words_after);
ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
ASSERT0(vd->vdev_checkpoint_sm->sm_length);
ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
}
}