Illumos #3104: eliminate empty bpobjs

3104 eliminate empty bpobjs
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Eric Schrock <eric.schrock@delphix.com>

References:
  illumos/illumos-gate@f174573681
  illumos changeset: 13782:8f78aae28a63
  https://www.illumos.org/issues/3104

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Matthew Ahrens 2012-12-23 15:57:14 -08:00 committed by Brian Behlendorf
parent 91579709fc
commit 753c38392d
12 changed files with 166 additions and 11 deletions

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_BPOBJ_H #ifndef _SYS_BPOBJ_H
@ -67,7 +68,9 @@ typedef struct bpobj {
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
void bpobj_close(bpobj_t *bpo); void bpobj_close(bpobj_t *bpo);

View File

@ -309,6 +309,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_SCAN "scan" #define DMU_POOL_SCAN "scan"
#define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
/* /*
* Allocate an object from this objset. The range of object numbers * Allocate an object from this objset. The range of object numbers

View File

@ -96,6 +96,7 @@ typedef struct dsl_pool {
uint64_t dp_tmp_userrefs_obj; uint64_t dp_tmp_userrefs_obj;
bpobj_t dp_free_bpobj; bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj; uint64_t dp_bptree_obj;
uint64_t dp_empty_bpobj;
struct dsl_scan *dp_scan; struct dsl_scan *dp_scan;

View File

@ -300,6 +300,8 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
/* Here the key is an int and the value is a different int. */ /* Here the key is an int and the value is a different int. */
int zap_add_int_key(objset_t *os, uint64_t obj, int zap_add_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx); uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_update_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_key(objset_t *os, uint64_t obj, int zap_lookup_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t *valuep); uint64_t key, uint64_t *valuep);

View File

@ -51,6 +51,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
typedef enum spa_feature { typedef enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY, SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURES SPA_FEATURES
} spa_feature_t; } spa_feature_t;

View File

@ -169,5 +169,33 @@ through the \fBfreeing\fR property.
This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
.RE .RE
.sp
.ne 2
.na
\fB\fBempty_bpobj\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:empty_bpobj
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE
This feature increases the performance of creating and using a large
number of snapshots of a single filesystem or volume, and also reduces
the disk space required.
When there are many snapshots, each snapshot uses many Block Pointer
Objects (bpobjs) to track blocks associated with that snapshot.
However, in common use cases, most of these bpobjs are empty. This
feature allows us to create each bpobj on-demand, thus eliminating the
empty bpobjs.
This feature is \fBactive\fR while there are any filesystems, volumes,
or snapshots which were created after enabling this feature.
.RE
.SH "SEE ALSO" .SH "SEE ALSO"
\fBzpool\fR(1M) \fBzpool\fR(1M)

View File

@ -20,13 +20,61 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
#include <sys/bpobj.h> #include <sys/bpobj.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/refcount.h> #include <sys/refcount.h>
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>
/*
 * Return the object number of an empty bpobj, preferably the pool-wide
 * empty dummy one (dp_empty_bpobj).
 *
 * When the empty_bpobj feature is not enabled, a new bpobj of the
 * requested blocksize is allocated with bpobj_alloc() and returned.
 *
 * When the feature is enabled, the shared dummy object is returned and
 * spa_feature_incr() is called on the feature.  If the feature was not
 * yet active, the dummy object is first allocated (using
 * SPA_MAXBLOCKSIZE rather than the caller's blocksize) and its object
 * number is stored in the pool directory under DMU_POOL_EMPTY_BPOBJ.
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	/* Feature unavailable: fall back to a private, real bpobj. */
	if (!spa_feature_is_enabled(spa, feat))
		return (bpobj_alloc(os, blocksize, tx));

	if (!spa_feature_is_active(spa, feat)) {
		/* First user: create the shared object and record it. */
		ASSERT3U(dp->dp_empty_bpobj, ==, 0);
		dp->dp_empty_bpobj = bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj, tx) == 0);
	}

	spa_feature_incr(spa, feat, tx);
	ASSERT(dp->dp_empty_bpobj != 0);

	return (dp->dp_empty_bpobj);
}
/*
 * Give up one use of the shared empty bpobj.
 *
 * Calls spa_feature_decr() on the empty_bpobj feature.  If the feature
 * is no longer active afterwards, the DMU_POOL_EMPTY_BPOBJ entry is
 * removed from the pool directory, the shared object is freed, and
 * dp_empty_bpobj is reset to 0.
 */
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), feat, tx);

	/* Still in use elsewhere: nothing to tear down. */
	if (spa_feature_is_active(dmu_objset_spa(os), feat))
		return;

	VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, tx));
	VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
	dp->dp_empty_bpobj = 0;
}
uint64_t uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
int epb; int epb;
dmu_buf_t *dbuf = NULL; dmu_buf_t *dbuf = NULL;
ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
mutex_enter(&bpo.bpo_lock); mutex_enter(&bpo.bpo_lock);
@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havesubobj);
ASSERT(bpo->bpo_havecomp); ASSERT(bpo->bpo_havecomp);
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
bpobj_decr_empty(bpo->bpo_os, tx);
return;
}
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
blkptr_t *bparray; blkptr_t *bparray;
ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
/* We never need the fill count. */ /* We never need the fill count. */
stored_bp.blk_fill = 0; stored_bp.blk_fill = 0;

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved.
*/ */
#include <sys/dsl_dataset.h> #include <sys/dsl_dataset.h>
@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
for (zap_cursor_init(&zc, os, dlobj); for (zap_cursor_init(&zc, os, dlobj);
zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) zap_cursor_advance(&zc)) {
bpobj_free(os, za.za_first_integer, tx); uint64_t obj = za.za_first_integer;
if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
bpobj_decr_empty(os, tx);
else
bpobj_free(os, obj, tx);
}
zap_cursor_fini(&zc); zap_cursor_fini(&zc);
VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
} }
/*
 * Enqueue bp on the bpobj belonging to deadlist entry dle.
 *
 * If the entry currently refers to the pool's shared empty bpobj
 * (dp_empty_bpobj), it is first switched to a newly allocated bpobj:
 * the new object is allocated, the entry's handle on the shared object
 * is closed and released via bpobj_decr_empty(), the new object is
 * opened in its place, and the deadlist's ZAP entry for dle_mintxg is
 * updated to the new object number.
 */
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t empty_obj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object == empty_obj) {
		uint64_t newobj;

		newobj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY3U(0, ==,
		    bpobj_open(&dle->dle_bpobj, dl->dl_os, newobj));
		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os,
		    dl->dl_object, dle->dle_mintxg, newobj, tx));
	}

	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
}
/*
 * Attach bpobj `obj' to deadlist entry dle.
 *
 * If the entry already has a bpobj other than the pool's shared empty
 * one, obj is enqueued onto it as a sub-object.  Otherwise no enqueue
 * is performed: the entry's handle on the shared empty object is
 * closed and released via bpobj_decr_empty(), obj is opened as the
 * entry's bpobj, and the deadlist's ZAP entry for dle_mintxg is
 * updated to obj.
 */
static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
	uint64_t empty_obj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object != empty_obj) {
		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
		return;
	}

	/* Entry only holds the empty placeholder: adopt obj directly. */
	bpobj_close(&dle->dle_bpobj);
	bpobj_decr_empty(dl->dl_os, tx);
	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
	VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
	    dle->dle_mintxg, obj, tx));
}
void void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
{ {
@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
else else
dle = AVL_PREV(&dl->dl_tree, dle); dle = AVL_PREV(&dl->dl_tree, dle);
bpobj_enqueue(&dle->dle_bpobj, bp, tx); dle_enqueue(dl, dle, bp, tx);
} }
/* /*
@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE); dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE);
dle->dle_mintxg = mintxg; dle->dle_mintxg = mintxg;
obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle); avl_add(&dl->dl_tree, dle);
@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_prev = AVL_PREV(&dl->dl_tree, dle);
bpobj_enqueue_subobj(&dle_prev->dle_bpobj, dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
dle->dle_bpobj.bpo_object, tx);
avl_remove(&dl->dl_tree, dle); avl_remove(&dl->dl_tree, dle);
bpobj_close(&dle->dle_bpobj); bpobj_close(&dle->dle_bpobj);
@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg) if (dle->dle_mintxg >= maxtxg)
break; break;
obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx)); dle->dle_mintxg, obj, tx));
} }
@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where); dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL) if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); dle_enqueue_subobj(dl, dle, obj, tx);
} }
static int static int

View File

@ -322,6 +322,15 @@ dsl_pool_open(dsl_pool_t *dp)
goto out; goto out;
} }
if (spa_feature_is_active(dp->dp_spa,
&spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj);
if (err != 0)
goto out;
}
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
&dp->dp_tmp_userrefs_obj); &dp->dp_tmp_userrefs_obj);

View File

@ -1093,6 +1093,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
return (zap_add(os, obj, name, 8, 1, &value, tx)); return (zap_add(os, obj, name, 8, 1, &value, tx));
} }
/*
 * Update a ZAP entry whose key is an integer and whose value is a
 * different integer.  The key is formatted as lowercase hexadecimal to
 * form the entry name, and the value is passed to zap_update() as a
 * single 8-byte integer.  Returns whatever zap_update() returns.
 */
int
zap_update_int_key(objset_t *os, uint64_t obj,
    uint64_t key, uint64_t value, dmu_tx_t *tx)
{
	char keyname[20];
	int err;

	(void) snprintf(keyname, sizeof (keyname), "%llx", (longlong_t)key);
	err = zap_update(os, obj, keyname, sizeof (uint64_t), 1, &value, tx);

	return (err);
}
int int
zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
{ {

View File

@ -229,7 +229,12 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
uint64_t refcount; uint64_t refcount;
uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj; uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
ASSERT(0 != zapobj); /*
* If the pool is currently being created, the feature objects may not
* have been allocated yet. Act as though all features are disabled.
*/
if (zapobj == 0)
return (ENOTSUP);
err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1, err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
&refcount); &refcount);

View File

@ -157,4 +157,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_ASYNC_DESTROY, zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy", "com.delphix:async_destroy", "async_destroy",
"Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.", B_TRUE, B_FALSE, NULL);
} }