Illumos 5960, 5925

5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

References:
  https://www.illumos.org/issues/5960
  https://www.illumos.org/issues/5925
  https://github.com/illumos/illumos-gate/commit/a2cdcdd

Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
  - b8864a2 Fix gcc cast warnings
  - 325f023 Add linux kernel device support
  - 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
  - 3558fd7 Prototype/structure update for Linux
  - c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
  - Function @zvol_map_block() isn't needed in ZoL
  - 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
  - Fixed ISO C90 - mixed declarations and code
  - Function dmu_prefetch() 'int i' is initialized before
    the following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
  - fc5bb51 Fix stack dbuf_hold_impl()
  - 9b67f60 Illumos 4757, 4913
  - 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
  - Fixed ISO C90 - mixed declarations and code
  - b58986e Use large stacks when available
  - 241b541 Illumos 5959 - clean up per-dataset feature count code
  - 77aef6f Use vmem_alloc() for nvlists
  - 00b4602 Add linux kernel memory support

Ported-by: kernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Paul Dagnelie
2015-12-22 02:31:57 +01:00
committed by Brian Behlendorf
parent 00af2ff6f2
commit fcff0f35bd
40 changed files with 1426 additions and 397 deletions
+250 -46
View File
@@ -51,7 +51,8 @@ struct dbuf_hold_impl_data {
dnode_t *dh_dn;
uint8_t dh_level;
uint64_t dh_blkid;
int dh_fail_sparse;
boolean_t dh_fail_sparse;
boolean_t dh_fail_uncached;
void *dh_tag;
dmu_buf_impl_t **dh_dbp;
/* Local variables */
@@ -65,8 +66,9 @@ struct dbuf_hold_impl_data {
};
static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
void *tag, dmu_buf_impl_t **dbp, int depth);
dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp, int depth);
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
/*
@@ -604,11 +606,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
return (abuf);
}
/*
* Calculate which level n block references the data at the level 0 offset
* provided.
*/
uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
if (dn->dn_datablkshift) {
return (offset >> dn->dn_datablkshift);
if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
/*
* The level n blkid is equal to the level 0 blkid divided by
* the number of level 0s in a level n block.
*
* The level 0 blkid is offset >> datablkshift =
* offset / 2^datablkshift.
*
* The number of level 0s in a level n is the number of block
* pointers in an indirect block, raised to the power of level.
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
*
* Thus, the level n blkid is: offset /
* ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
* = offset / 2^(datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
* = offset >> (datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
*/
return (offset >> (dn->dn_datablkshift + level *
(dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else {
ASSERT3U(offset, <, dn->dn_datablksz);
return (0);
@@ -1786,6 +1812,12 @@ dbuf_clear(dmu_buf_impl_t *db)
dbuf_rele(parent, db);
}
/*
* Note: While bpp will always be updated if the function returns success,
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
* this happens when the dnode is the meta-dnode, or a userused or groupused
* object.
*/
__attribute__((always_inline))
static inline int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
@@ -1828,12 +1860,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
/* this block is referenced from an indirect block */
int err;
if (dh == NULL) {
err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
fail_sparse, NULL, parentp);
err = dbuf_hold_impl(dn, level+1,
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
} else {
__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
blkid >> epbs, fail_sparse, NULL,
parentp, dh->dh_depth + 1);
blkid >> epbs, fail_sparse, FALSE, NULL,
parentp, dh->dh_depth + 1);
err = __dbuf_hold_impl(dh + 1);
}
if (err)
@@ -2011,11 +2043,102 @@ dbuf_destroy(dmu_buf_impl_t *db)
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
typedef struct dbuf_prefetch_arg {
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
int dpa_curlevel; /* The current level that we're reading */
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;
/*
* Actually issue the prefetch read for the block given.
*/
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
arc_flags_t aflags;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return;
aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &dpa->dpa_zb);
}
/*
* Called when an indirect block above our prefetch target is read in. This
* will either read in the next indirect block down the tree or issue the actual
* prefetch if the next block down is our target.
*/
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
blkptr_t *bp;
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
}
dpa->dpa_curlevel--;
nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
(void) arc_buf_remove_ref(abuf, private);
}
/*
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
arc_flags_t aflags)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
uint64_t curblkid;
dmu_buf_impl_t *db;
zio_t *pio;
dbuf_prefetch_arg_t *dpa;
dsl_dataset_t *ds;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -2023,35 +2146,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dnode_block_freed(dn, blkid))
return;
/* dbuf_find() returns with db_mtx held */
if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) {
/*
* This dbuf is already in the cache. We assume that
* it is already CACHED, or else about to be either
* read or filled.
*/
/*
* This dnode hasn't been written to disk yet, so there's nothing to
* prefetch.
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
return;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
if (db != NULL) {
mutex_exit(&db->db_mtx);
/*
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
return;
}
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
arc_flags_t aflags =
ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
zbookmark_phys_t zb;
/*
* Find the closest ancestor (indirect block) of the target block
* that is present in the cache. In this indirect block, we will
* find the bp that is at curlevel, curblkid.
*/
curlevel = level;
curblkid = blkid;
while (curlevel < nlevels - 1) {
int parent_level = curlevel + 1;
uint64_t parent_blkid = curblkid >> epbs;
dmu_buf_impl_t *db;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, 0, blkid);
(void) arc_read(NULL, dn->dn_objset->os_spa,
bp, NULL, NULL, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
FALSE, TRUE, FTAG, &db) == 0) {
blkptr_t *bpp = db->db_buf->b_data;
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
dbuf_rele(db, FTAG);
break;
}
if (db)
dbuf_rele(db, NULL);
curlevel = parent_level;
curblkid = parent_blkid;
}
if (curlevel == nlevels - 1) {
/* No cached indirect blocks found. */
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
bp = dn->dn_phys->dn_blkptr[curblkid];
}
if (BP_IS_HOLE(&bp))
return;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
ZIO_FLAG_CANFAIL);
dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
ds = dn->dn_objset->os_dsl_dataset;
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, level, blkid);
dpa->dpa_curlevel = curlevel;
dpa->dpa_prio = prio;
dpa->dpa_aflags = aflags;
dpa->dpa_spa = dn->dn_objset->os_spa;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
/*
* If we have the indirect just above us, no need to do the asynchronous
* prefetch chain; we'll just run the last step ourselves. If we're at
* a higher level, though, we want to issue the prefetches for all the
* indirect blocks asynchronously, so we can go on with whatever we were
* doing.
*/
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
&bp, dbuf_prefetch_indirect_done, dpa, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
/*
* We use pio here instead of dpa_zio since it's possible that
* dpa may have already been freed.
*/
zio_nowait(pio);
}
#define DBUF_HOLD_IMPL_MAX_DEPTH 20
@@ -2079,6 +2271,9 @@ top:
if (dh->dh_db == NULL) {
dh->dh_bp = NULL;
if (dh->dh_fail_uncached)
return (SET_ERROR(ENOENT));
ASSERT3P(dh->dh_parent, ==, NULL);
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
dh->dh_fail_sparse, &dh->dh_parent,
@@ -2099,6 +2294,11 @@ top:
dh->dh_parent, dh->dh_bp);
}
if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
mutex_exit(&dh->dh_db->db_mtx);
return (SET_ERROR(ENOENT));
}
if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
if (dh->dh_db->db_buf->b_data == NULL) {
@@ -2159,7 +2359,8 @@ top:
* on the stack for 20 levels of recursion.
*/
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
struct dbuf_hold_impl_data *dh;
@@ -2167,7 +2368,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) *
DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
fail_uncached, tag, dbp, 0);
error = __dbuf_hold_impl(dh);
@@ -2179,13 +2381,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
static void
__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
void *tag, dmu_buf_impl_t **dbp, int depth)
dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp, int depth)
{
dh->dh_dn = dn;
dh->dh_level = level;
dh->dh_blkid = blkid;
dh->dh_fail_sparse = fail_sparse;
dh->dh_fail_uncached = fail_uncached;
dh->dh_tag = tag;
dh->dh_dbp = dbp;
dh->dh_depth = depth;
@@ -2194,16 +2400,14 @@ __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
return (err ? NULL : db);
return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db);
}
@@ -2531,8 +2735,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (parent == NULL) {
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
(void) dbuf_hold_impl(dn, db->db_level+1,
db->db_blkid >> epbs, FALSE, db, &parent);
parent = dbuf_hold_level(dn, db->db_level + 1,
db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx);
db->db_parent = parent;