Illumos 5960, 5925

5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

References:
  https://www.illumos.org/issues/5960
  https://www.illumos.org/issues/5925
  https://github.com/illumos/illumos-gate/commit/a2cdcdd

Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
  - b8864a2 Fix gcc cast warnings
  - 325f023 Add linux kernel device support
  - 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
  - 3558fd7 Prototype/structure update for Linux
  - c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
  - Function @zvol_map_block() isn't needed in ZoL
  - 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
  - Fixed ISO C90 - mixed declarations and code
  - Function dmu_prefetch() 'int i' is initialized before
    the following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
  - fc5bb51 Fix stack dbuf_hold_impl()
  - 9b67f60 Illumos 4757, 4913
  - 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
  - Fixed ISO C90 - mixed declarations and code
  - b58986e Use large stacks when available
  - 241b541 Illumos 5959 - clean up per-dataset feature count code
  - 77aef6f Use vmem_alloc() for nvlists
  - 00b4602 Add linux kernel memory support

Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Paul Dagnelie 2015-12-22 02:31:57 +01:00 committed by Brian Behlendorf
parent 00af2ff6f2
commit fcff0f35bd
40 changed files with 1426 additions and 397 deletions

View File

@ -2489,6 +2489,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type; dmu_object_type_t type;
boolean_t is_metadata; boolean_t is_metadata;
if (bp == NULL)
return (0);
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
char blkbuf[BP_SPRINTF_LEN]; char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
@ -2985,7 +2988,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
avl_index_t where; avl_index_t where;
zdb_ddt_entry_t *zdde, zdde_search; zdb_ddt_entry_t *zdde, zdde_search;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {

View File

@ -250,7 +250,8 @@ get_usage(zfs_help_t idx)
case HELP_RECEIVE: case HELP_RECEIVE:
return (gettext("\treceive [-vnFu] <filesystem|volume|" return (gettext("\treceive [-vnFu] <filesystem|volume|"
"snapshot>\n" "snapshot>\n"
"\treceive [-vnFu] [-d | -e] <filesystem>\n")); "\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] "
"<filesystem>\n"));
case HELP_RENAME: case HELP_RENAME:
return (gettext("\trename [-f] <filesystem|volume|snapshot> " return (gettext("\trename [-f] <filesystem|volume|snapshot> "
"<filesystem|volume|snapshot>\n" "<filesystem|volume|snapshot>\n"
@ -793,7 +794,7 @@ zfs_do_create(int argc, char **argv)
nomem(); nomem();
break; break;
case 'o': case 'o':
if (parseprop(props, optarg)) if (parseprop(props, optarg) != 0)
goto error; goto error;
break; break;
case 's': case 's':
@ -3622,7 +3623,7 @@ zfs_do_snapshot(int argc, char **argv)
while ((c = getopt(argc, argv, "ro:")) != -1) { while ((c = getopt(argc, argv, "ro:")) != -1) {
switch (c) { switch (c) {
case 'o': case 'o':
if (parseprop(props, optarg)) if (parseprop(props, optarg) != 0)
return (1); return (1);
break; break;
case 'r': case 'r':
@ -3881,10 +3882,19 @@ zfs_do_receive(int argc, char **argv)
{ {
int c, err; int c, err;
recvflags_t flags = { 0 }; recvflags_t flags = { 0 };
nvlist_t *props;
nvpair_t *nvp = NULL;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
/* check options */ /* check options */
while ((c = getopt(argc, argv, ":denuvF")) != -1) { while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
switch (c) { switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
return (1);
break;
case 'd': case 'd':
flags.isprefix = B_TRUE; flags.isprefix = B_TRUE;
break; break;
@ -3929,6 +3939,13 @@ zfs_do_receive(int argc, char **argv)
usage(B_FALSE); usage(B_FALSE);
} }
while ((nvp = nvlist_next_nvpair(props, nvp))) {
if (strcmp(nvpair_name(nvp), "origin") != 0) {
(void) fprintf(stderr, gettext("invalid option"));
usage(B_FALSE);
}
}
if (isatty(STDIN_FILENO)) { if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr, (void) fprintf(stderr,
gettext("Error: Backup stream can not be read " gettext("Error: Backup stream can not be read "
@ -3937,7 +3954,7 @@ zfs_do_receive(int argc, char **argv)
return (1); return (1);
} }
err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
return (err != 0); return (err != 0);
} }

View File

@ -3728,7 +3728,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
*/ */
n = ztest_random(regions) * stride + ztest_random(width); n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(2 * width - 1); s = 1 + ztest_random(2 * width - 1);
dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
ZIO_PRIORITY_SYNC_READ);
/* /*
* Pick a random index and compute the offsets into packobj and bigobj. * Pick a random index and compute the offsets into packobj and bigobj.
@ -5930,8 +5931,10 @@ ztest_run(ztest_shared_t *zs)
* Right before closing the pool, kick off a bunch of async I/O; * Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete. * spa_close() should wait for it to complete.
*/ */
for (object = 1; object < 50; object++) for (object = 1; object < 50; object++) {
dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
ZIO_PRIORITY_SYNC_READ);
}
/* Verify that at least one commit cb was called in a timely fashion */ /* Verify that at least one commit cb was called in a timely fashion */
if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)

View File

@ -678,8 +678,8 @@ typedef struct recvflags {
boolean_t nomount; boolean_t nomount;
} recvflags_t; } recvflags_t;
extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
int, avl_tree_t *); recvflags_t *, int, avl_tree_t *);
typedef enum diff_flags { typedef enum diff_flags {
ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_PARSEABLE = 0x1,

View File

@ -9,6 +9,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bplist.h \
$(top_srcdir)/include/sys/bpobj.h \ $(top_srcdir)/include/sys/bpobj.h \
$(top_srcdir)/include/sys/bptree.h \ $(top_srcdir)/include/sys/bptree.h \
$(top_srcdir)/include/sys/bqueue.h \
$(top_srcdir)/include/sys/dbuf.h \ $(top_srcdir)/include/sys/dbuf.h \
$(top_srcdir)/include/sys/ddt.h \ $(top_srcdir)/include/sys/ddt.h \
$(top_srcdir)/include/sys/dmu.h \ $(top_srcdir)/include/sys/dmu.h \
@ -96,6 +97,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/zio_compress.h \ $(top_srcdir)/include/sys/zio_compress.h \
$(top_srcdir)/include/sys/zio.h \ $(top_srcdir)/include/sys/zio.h \
$(top_srcdir)/include/sys/zio_impl.h \ $(top_srcdir)/include/sys/zio_impl.h \
$(top_srcdir)/include/sys/zio_priority.h \
$(top_srcdir)/include/sys/zrlock.h $(top_srcdir)/include/sys/zrlock.h
KERNEL_H = \ KERNEL_H = \

54
include/sys/bqueue.h Normal file
View File

@ -0,0 +1,54 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _BQUEUE_H
#define _BQUEUE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/zfs_context.h>
/*
 * A bounded blocking queue.  Elements carry a size, and the sum of the sizes
 * of all queued elements is limited by bq_maxsize: bqueue_enqueue() blocks
 * while adding an element would exceed that limit, and bqueue_dequeue()
 * blocks while the queue holds nothing.
 */
typedef struct bqueue {
/* The queued elements; new elements go on the tail, removals take the head. */
list_t bq_list;
/* Held by bqueue_enqueue()/bqueue_dequeue() while they touch the queue. */
kmutex_t bq_lock;
/* Enqueuers wait here for capacity; signalled by bqueue_dequeue(). */
kcondvar_t bq_add_cv;
/* Dequeuers wait here for an element; signalled by bqueue_enqueue(). */
kcondvar_t bq_pop_cv;
/* Sum of the sizes of all elements currently on the queue. */
uint64_t bq_size;
/* Capacity: upper bound on bq_size, as passed to bqueue_init(). */
uint64_t bq_maxsize;
/* Byte offset of the embedded bqueue_node_t within each element. */
size_t bq_node_offset;
} bqueue_t;
/*
 * Per-element bookkeeping.  Every type stored in a bqueue must embed one of
 * these; its offset within the element is given to bqueue_init().
 */
typedef struct bqueue_node {
/* Linkage used by the queue's list_t. */
list_node_t bqn_node;
/* Capacity consumed by this element; recorded by bqueue_enqueue(). */
uint64_t bqn_size;
} bqueue_node_t;
/*
 * Initialize a queue with the given maximum capacity; the size_t argument is
 * the offset of the bqueue_node_t inside each element.  Returns 0 on success.
 */
int bqueue_init(bqueue_t *, uint64_t, size_t);
/* Tear down a queue; the queue must be empty (bq_size == 0). */
void bqueue_destroy(bqueue_t *);
/*
 * Append an element consuming the given (non-zero) amount of capacity,
 * blocking until that much capacity is available.
 */
void bqueue_enqueue(bqueue_t *, void *, uint64_t);
/* Remove and return the head element, blocking while the queue is empty. */
void *bqueue_dequeue(bqueue_t *);
/* Returns true if the space used by the queue is 0. */
boolean_t bqueue_empty(bqueue_t *);
#ifdef __cplusplus
}
#endif
#endif /* _BQUEUE_H */

View File

@ -261,8 +261,7 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES]; kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t; } dbuf_hash_table_t;
uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
void dbuf_create_bonus(struct dnode *dn); void dbuf_create_bonus(struct dnode *dn);
int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
@ -272,10 +271,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag); void *tag);
int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp); void *tag, dmu_buf_impl_t **dbp);
void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,

View File

@ -44,6 +44,7 @@
#include <sys/inttypes.h> #include <sys/inttypes.h>
#include <sys/cred.h> #include <sys/cred.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/zio_priority.h>
#include <sys/uio.h> #include <sys/uio.h>
#ifdef __cplusplus #ifdef __cplusplus
@ -737,8 +738,8 @@ extern int zfs_max_recordsize;
/* /*
* Asynchronously try to read in the data. * Asynchronously try to read in the data.
*/ */
void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len); uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info { typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */ /* All sizes are in bytes unless otherwise indicated. */

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/ */

View File

@ -160,8 +160,18 @@ extern int aok;
/* /*
* DTrace SDT probes have different signatures in userland than they do in * DTrace SDT probes have different signatures in userland than they do in
* kernel. If they're being used in kernel code, re-define them out of * the kernel. If they're being used in kernel code, re-define them out of
* existence for their counterparts in libzpool. * existence for their counterparts in libzpool.
*
* Here's an example of how to use the set-error probes in userland:
* zfs$target:::set-error /arg0 == EBUSY/ {stack();}
*
* Here's an example of how to use DTRACE_PROBE probes in userland:
* If there is a probe declared as follows:
* DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
* Then you can use it as follows:
* zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
* {printf("%u %p\n", arg1, arg2);}
*/ */
#ifdef DTRACE_PROBE #ifdef DTRACE_PROBE

View File

@ -29,6 +29,7 @@
#ifndef _ZIO_H #ifndef _ZIO_H
#define _ZIO_H #define _ZIO_H
#include <sys/zio_priority.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/txg.h> #include <sys/txg.h>
@ -147,17 +148,6 @@ enum zio_compress {
#define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2 #define ZIO_FAILURE_MODE_PANIC 2
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
enum zio_flag { enum zio_flag {
/* /*
* Flags inherited by gang, ddt, and vdev children, * Flags inherited by gang, ddt, and vdev children,
@ -262,6 +252,7 @@ extern const char *zio_type_name[ZIO_TYPES];
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
* dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
* *
* Note: this structure is called a bookmark because its original purpose * Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse. * was to remember where to resume a pool-wide traverse.
@ -294,6 +285,9 @@ struct zbookmark_phys {
#define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL) #define ZB_ZIL_LEVEL (-2LL)
#define ZB_DNODE_LEVEL (-3LL)
#define ZB_DNODE_BLKID (0ULL)
#define ZB_IS_ZERO(zb) \ #define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0) (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@ -599,8 +593,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
extern void spa_handle_ignored_writes(spa_t *spa); extern void spa_handle_ignored_writes(spa_t *spa);
/* zbookmark_phys functions */ /* zbookmark_phys functions */
boolean_t zbookmark_is_before(const struct dnode_phys *dnp, boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -44,7 +44,7 @@ typedef const struct zio_checksum_info {
zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
int ci_correctable; /* number of correctable bits */ int ci_correctable; /* number of correctable bits */
int ci_eck; /* uses zio embedded checksum? */ int ci_eck; /* uses zio embedded checksum? */
int ci_dedup; /* strong enough for dedup? */ boolean_t ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */ char *ci_name; /* descriptive name */
} zio_checksum_info_t; } zio_checksum_info_t;

View File

@ -0,0 +1,40 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
#ifdef __cplusplus
extern "C" {
#endif
/*
 * I/O priority classes.  Callers such as dmu_prefetch() and dbuf_prefetch()
 * take one of these values to select the priority their I/Os are issued at.
 *
 * The order of the enumerators matters: ZIO_PRIORITY_NUM_QUEUEABLE takes its
 * value from its position, so it equals the number of enumerators listed
 * before it.  NOTE(review): by its name that is the count of priorities that
 * go through a queue -- confirm against the vdev queue code before inserting
 * or reordering entries.
 */
typedef enum zio_priority {
ZIO_PRIORITY_SYNC_READ,
ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
#ifdef __cplusplus
}
#endif
#endif /* _ZIO_PRIORITY_H */

View File

@ -3529,7 +3529,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
} }
static int static int
zbookmark_compare(const void *a, const void *b) zbookmark_mem_compare(const void *a, const void *b)
{ {
return (memcmp(a, b, sizeof (zbookmark_phys_t))); return (memcmp(a, b, sizeof (zbookmark_phys_t)));
} }
@ -3592,7 +3592,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
zc.zc_nvlist_dst_size; zc.zc_nvlist_dst_size;
count -= zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size;
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare); qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);

View File

@ -63,8 +63,9 @@
/* in libzfs_dataset.c */ /* in libzfs_dataset.c */
extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
uint64_t *);
static const zio_cksum_t zero_cksum = { { 0 } }; static const zio_cksum_t zero_cksum = { { 0 } };
@ -2523,7 +2524,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
* zfs_receive_one() will take care of it (ie, * zfs_receive_one() will take care of it (ie,
* recv_skip() and return 0). * recv_skip() and return 0).
*/ */
error = zfs_receive_impl(hdl, destname, flags, fd, error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
action_handlep); action_handlep);
if (error == ENODATA) { if (error == ENODATA) {
@ -2656,9 +2657,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
*/ */
static int static int
zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
recvflags_t *flags, dmu_replay_record_t *drr, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
dmu_replay_record_t *drr_noswap, const char *sendfs, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
uint64_t *action_handlep) uint64_t *action_handlep)
{ {
zfs_cmd_t zc = {"\0"}; zfs_cmd_t zc = {"\0"};
@ -2808,10 +2809,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
} }
if (flags->verbose) if (flags->verbose)
(void) printf("found clone origin %s\n", zc.zc_string); (void) printf("found clone origin %s\n", zc.zc_string);
} else if (originsnap) {
(void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN);
if (flags->verbose)
(void) printf("using provided clone origin %s\n",
zc.zc_string);
} }
stream_wantsnewfs = (drrb->drr_fromguid == 0 || stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
(drrb->drr_flags & DRR_FLAG_CLONE)); (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
if (stream_wantsnewfs) { if (stream_wantsnewfs) {
/* /*
@ -3189,9 +3195,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
} }
static int static int
zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
char **top_zfs, int cleanup_fd, uint64_t *action_handlep) nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
uint64_t *action_handlep)
{ {
int err; int err;
dmu_replay_record_t drr, drr_noswap; dmu_replay_record_t drr, drr_noswap;
@ -3210,6 +3217,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
"(%s) does not exist"), tosnap); "(%s) does not exist"), tosnap);
return (zfs_error(hdl, EZFS_NOENT, errbuf)); return (zfs_error(hdl, EZFS_NOENT, errbuf));
} }
if (originsnap &&
!zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
"(%s) does not exist"), originsnap);
return (zfs_error(hdl, EZFS_NOENT, errbuf));
}
/* read in the BEGIN record */ /* read in the BEGIN record */
if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
@ -3282,14 +3295,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
*cp = '\0'; *cp = '\0';
sendfs = nonpackage_sendfs; sendfs = nonpackage_sendfs;
} }
return (zfs_receive_one(hdl, infd, tosnap, flags, return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
&drr, &drr_noswap, sendfs, stream_nv, stream_avl, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
top_zfs, cleanup_fd, action_handlep)); cleanup_fd, action_handlep));
} else { } else {
assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM); DMU_COMPOUNDSTREAM);
return (zfs_receive_package(hdl, infd, tosnap, flags, return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
&drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); &zcksum, top_zfs, cleanup_fd, action_handlep));
} }
} }
@ -3300,14 +3313,15 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
* (-1 will override -2). * (-1 will override -2).
*/ */
int int
zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
int infd, avl_tree_t *stream_avl) recvflags_t *flags, int infd, avl_tree_t *stream_avl)
{ {
char *top_zfs = NULL; char *top_zfs = NULL;
int err; int err;
int cleanup_fd; int cleanup_fd;
uint64_t action_handle = 0; uint64_t action_handle = 0;
struct stat sb; struct stat sb;
char *originsnap = NULL;
/* /*
* The only way fstat can fail is if we do not have a valid file * The only way fstat can fail is if we do not have a valid file
@ -3350,10 +3364,16 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
} }
#endif /* __linux__ */ #endif /* __linux__ */
if (props) {
err = nvlist_lookup_string(props, "origin", &originsnap);
if (err && err != ENOENT)
return (err);
}
cleanup_fd = open(ZFS_DEV, O_RDWR); cleanup_fd = open(ZFS_DEV, O_RDWR);
VERIFY(cleanup_fd >= 0); VERIFY(cleanup_fd >= 0);
err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
stream_avl, &top_zfs, cleanup_fd, &action_handle); stream_avl, &top_zfs, cleanup_fd, &action_handle);
VERIFY(0 == close(cleanup_fd)); VERIFY(0 == close(cleanup_fd));

View File

@ -32,6 +32,7 @@ KERNEL_C = \
bplist.c \ bplist.c \
bpobj.c \ bpobj.c \
bptree.c \ bptree.c \
bqueue.c \
dbuf.c \ dbuf.c \
dbuf_stats.c \ dbuf_stats.c \
ddt.c \ ddt.c \

View File

@ -184,12 +184,12 @@ zfs \- configures ZFS file systems
.LP .LP
.nf .nf
\fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi .fi
.LP .LP
.nf .nf
\fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR
.fi .fi
.LP .LP
@ -2929,11 +2929,11 @@ then the receiving system must have that feature enabled as well. See
.ne 2 .ne 2
.mk .mk
.na .na
\fB\fBzfs receive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
.ad .ad
.br .br
.na .na
\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR\fR \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR
.ad .ad
.sp .6 .sp .6
.RS 4n .RS 4n
@ -3001,6 +3001,17 @@ Print verbose information about the stream and the time required to perform the
Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
.RE .RE
.sp
.ne 2
.mk
.na
\fB\fB-o\fR \fBorigin\fR=\fIsnapshot\fR
.ad
.sp .6
.RS 4n
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
.RE
.sp .sp
.ne 2 .ne 2
.mk .mk

View File

@ -14,6 +14,7 @@ $(MODULE)-objs += bpobj.o
$(MODULE)-objs += dbuf.o $(MODULE)-objs += dbuf.o
$(MODULE)-objs += dbuf_stats.o $(MODULE)-objs += dbuf_stats.o
$(MODULE)-objs += bptree.o $(MODULE)-objs += bptree.o
$(MODULE)-objs += bqueue.o
$(MODULE)-objs += ddt.o $(MODULE)-objs += ddt.o
$(MODULE)-objs += ddt_zap.o $(MODULE)-objs += ddt_zap.o
$(MODULE)-objs += dmu.o $(MODULE)-objs += dmu.o

View File

@ -156,7 +156,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int err; int err;
struct bptree_args *ba = arg; struct bptree_args *ba = arg;
if (BP_IS_HOLE(bp)) if (bp == NULL || BP_IS_HOLE(bp))
return (0); return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);

111
module/zfs/bqueue.c Normal file
View File

@ -0,0 +1,111 @@
/*
* CDDL HEADER START
*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
#include <sys/bqueue.h>
#include <sys/zfs_context.h>
/*
 * Given a pointer to an element stored (or about to be stored) in q, return
 * a pointer to the bqueue_node_t embedded in it.  The node lives
 * q->bq_node_offset bytes past the start of the element.
 */
static inline bqueue_node_t *
obj2node(bqueue_t *q, void *data)
{
	char *base = data;

	return ((bqueue_node_t *)(base + q->bq_node_offset));
}
/*
 * Initialize a blocking queue.  The maximum capacity of the queue is set to
 * size.  Types that want to be stored in a bqueue must contain a
 * bqueue_node_t, and node_offset should give its offset from the start of
 * the struct.  Return 0 on success, or -1 on failure (this implementation
 * has no failure path and always returns 0).
 */
int
bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
{
	/* Element size and list-link offset as seen by the underlying list. */
	size_t elem_size = node_offset + sizeof (bqueue_node_t);
	size_t link_offset = node_offset + offsetof(bqueue_node_t, bqn_node);

	list_create(&q->bq_list, elem_size, link_offset);
	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);

	/* The queue starts out empty with the requested capacity. */
	q->bq_maxsize = size;
	q->bq_size = 0;
	q->bq_node_offset = node_offset;

	return (0);
}
/*
 * Tear down a blocking queue, releasing its condition variables, its lock
 * and its list.  The queue must be empty, which is asserted here.  The
 * caller is also expected to make sure that no thread is still blocked in
 * bqueue_enqueue() or bqueue_dequeue() on this queue.
 */
void
bqueue_destroy(bqueue_t *q)
{
	ASSERT0(q->bq_size);

	cv_destroy(&q->bq_pop_cv);
	cv_destroy(&q->bq_add_cv);
	mutex_destroy(&q->bq_lock);
	list_destroy(&q->bq_list);
}
/*
 * Add data to q, consuming item_size units of capacity.  If there is
 * insufficient capacity to consume item_size units, block until capacity
 * exists.
 *
 * Asserts that item_size is > 0 and that it does not exceed the queue's
 * maximum size.  An item larger than bq_maxsize could never satisfy the
 * wait condition below and would block forever; an item of exactly
 * bq_maxsize is legal, because it fits as soon as the queue is empty.
 */
void
bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
{
	ASSERT3U(item_size, >, 0);
	ASSERT3U(item_size, <=, q->bq_maxsize);

	mutex_enter(&q->bq_lock);
	/* Record the size in the element so bqueue_dequeue() can release it. */
	obj2node(q, data)->bqn_size = item_size;
	while (q->bq_size + item_size > q->bq_maxsize) {
		cv_wait(&q->bq_add_cv, &q->bq_lock);
	}
	q->bq_size += item_size;
	list_insert_tail(&q->bq_list, data);
	/* Wake a consumer that may be waiting for an element. */
	cv_signal(&q->bq_pop_cv);
	mutex_exit(&q->bq_lock);
}
/*
 * Remove and return the element at the head of q.  If the queue is empty,
 * sleep until an element is enqueued.  The capacity the element was
 * consuming is released, and one waiting enqueuer (if any) is woken after
 * the queue lock has been dropped.
 */
void *
bqueue_dequeue(bqueue_t *q)
{
	void *head;
	uint64_t released;

	mutex_enter(&q->bq_lock);
	while (q->bq_size == 0) {
		cv_wait(&q->bq_pop_cv, &q->bq_lock);
	}
	head = list_remove_head(&q->bq_list);
	/* Give back the capacity recorded for this element at enqueue time. */
	released = obj2node(q, head)->bqn_size;
	q->bq_size -= released;
	mutex_exit(&q->bq_lock);

	cv_signal(&q->bq_add_cv);
	return (head);
}
/*
 * Report whether the queue is currently consuming no capacity (bq_size is
 * 0).  Note that bq_size is read here without taking bq_lock.
 */
boolean_t
bqueue_empty(bqueue_t *q)
{
	if (q->bq_size != 0)
		return (B_FALSE);
	return (B_TRUE);
}

View File

@ -51,7 +51,8 @@ struct dbuf_hold_impl_data {
dnode_t *dh_dn; dnode_t *dh_dn;
uint8_t dh_level; uint8_t dh_level;
uint64_t dh_blkid; uint64_t dh_blkid;
int dh_fail_sparse; boolean_t dh_fail_sparse;
boolean_t dh_fail_uncached;
void *dh_tag; void *dh_tag;
dmu_buf_impl_t **dh_dbp; dmu_buf_impl_t **dh_dbp;
/* Local variables */ /* Local variables */
@ -65,7 +66,8 @@ struct dbuf_hold_impl_data {
}; };
static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse,
boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp, int depth); void *tag, dmu_buf_impl_t **dbp, int depth);
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
@ -604,11 +606,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
return (abuf); return (abuf);
} }
/*
* Calculate which level n block references the data at the level 0 offset
* provided.
*/
uint64_t uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset) dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{ {
if (dn->dn_datablkshift) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
return (offset >> dn->dn_datablkshift); /*
* The level n blkid is equal to the level 0 blkid divided by
* the number of level 0s in a level n block.
*
* The level 0 blkid is offset >> datablkshift =
* offset / 2^datablkshift.
*
* The number of level 0s in a level n is the number of block
* pointers in an indirect block, raised to the power of level.
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
*
* Thus, the level n blkid is: offset /
* ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
* = offset / 2^(datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
* = offset >> (datablkshift + level *
* (indblkshift - SPA_BLKPTRSHIFT))
*/
return (offset >> (dn->dn_datablkshift + level *
(dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else { } else {
ASSERT3U(offset, <, dn->dn_datablksz); ASSERT3U(offset, <, dn->dn_datablksz);
return (0); return (0);
@ -1786,6 +1812,12 @@ dbuf_clear(dmu_buf_impl_t *db)
dbuf_rele(parent, db); dbuf_rele(parent, db);
} }
/*
* Note: While bpp will always be updated if the function returns success,
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
* this happens when the dnode is the meta-dnode, or a userused or groupused
* object.
*/
__attribute__((always_inline)) __attribute__((always_inline))
static inline int static inline int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
@ -1828,11 +1860,11 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
/* this block is referenced from an indirect block */ /* this block is referenced from an indirect block */
int err; int err;
if (dh == NULL) { if (dh == NULL) {
err = dbuf_hold_impl(dn, level+1, blkid >> epbs, err = dbuf_hold_impl(dn, level+1,
fail_sparse, NULL, parentp); blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
} else { } else {
__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
blkid >> epbs, fail_sparse, NULL, blkid >> epbs, fail_sparse, FALSE, NULL,
parentp, dh->dh_depth + 1); parentp, dh->dh_depth + 1);
err = __dbuf_hold_impl(dh + 1); err = __dbuf_hold_impl(dh + 1);
} }
@ -2011,11 +2043,102 @@ dbuf_destroy(dmu_buf_impl_t *db)
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
} }
void typedef struct dbuf_prefetch_arg {
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) spa_t *dpa_spa; /* The spa to issue the prefetch in. */
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
int dpa_curlevel; /* The current level that we're reading */
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;
/*
* Actually issue the prefetch read for the block given.
*/
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{ {
dmu_buf_impl_t *db = NULL; arc_flags_t aflags;
blkptr_t *bp = NULL; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return;
aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &dpa->dpa_zb);
}
/*
 * Called when an indirect block above our prefetch target is read in. This
 * will either read in the next indirect block down the tree or issue the actual
 * prefetch if the next block down is our target.
 *
 * zio: the completed read, or NULL (the zio-based checks are then skipped).
 * abuf: buffer holding the indirect block that was read;
 * arc_buf_remove_ref() is called on it before returning.
 * private: the dbuf_prefetch_arg_t for this prefetch chain. It is freed here
 * when the chain ends (hole, read error, or final prefetch issued); otherwise
 * it is passed on as the callback argument of the next arc_read().
 */
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
uint64_t nextblkid;
blkptr_t *bp;
/*
 * On entry dpa_curlevel is the level of the block just read (abuf), which
 * must still be above the target level recorded in dpa_zb.
 */
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
}
/* Step down one level; bp below is the child at the new curlevel. */
dpa->dpa_curlevel--;
/*
 * blkid, at the new curlevel, of the block on the path to the target:
 * the target blkid shifted right by epbs once for every level that
 * still separates curlevel from the target level.
 */
nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
/*
 * Select that child's block pointer inside the indirect block just read:
 * the low epbs bits of nextblkid index the blkptr_t array in b_data.
 * NOTE(review): this is computed before io_error is examined — confirm
 * b_data is safe to index when the read failed.
 */
bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
/* Nothing below to prefetch, or the read failed: end the chain. */
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
/* The child is the target itself: issue the real prefetch. */
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
kmem_free(dpa, sizeof (*dpa));
} else {
/*
 * Still above the target: read the next indirect block down, with
 * this function as its completion callback. Only ARC_FLAG_NOWAIT is
 * passed here; dpa_aflags is used by dbuf_issue_final_prefetch().
 * NOTE(review): dpa is not touched after this arc_read(); the comment
 * in dbuf_prefetch() ("dpa may have already been freed") suggests the
 * callback can free it before arc_read() returns — keep it that way.
 */
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
/* 'private' is only passed by value here; it is not dereferenced. */
(void) arc_buf_remove_ref(abuf, private);
}
/*
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
* complete.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
arc_flags_t aflags)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
uint64_t curblkid;
dmu_buf_impl_t *db;
zio_t *pio;
dbuf_prefetch_arg_t *dpa;
dsl_dataset_t *ds;
ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@ -2023,35 +2146,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dnode_block_freed(dn, blkid)) if (dnode_block_freed(dn, blkid))
return; return;
/* dbuf_find() returns with db_mtx held */
if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) {
/* /*
* This dbuf is already in the cache. We assume that * This dnode hasn't been written to disk yet, so there's nothing to
* it is already CACHED, or else about to be either * prefetch.
* read or filled.
*/ */
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
return;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
if (db != NULL) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
/*
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
return; return;
} }
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { /*
if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { * Find the closest ancestor (indirect block) of the target block
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; * that is present in the cache. In this indirect block, we will
arc_flags_t aflags = * find the bp that is at curlevel, curblkid.
ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; */
curlevel = level;
curblkid = blkid;
while (curlevel < nlevels - 1) {
int parent_level = curlevel + 1;
uint64_t parent_blkid = curblkid >> epbs;
dmu_buf_impl_t *db;
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
FALSE, TRUE, FTAG, &db) == 0) {
blkptr_t *bpp = db->db_buf->b_data;
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
dbuf_rele(db, FTAG);
break;
}
curlevel = parent_level;
curblkid = parent_blkid;
}
if (curlevel == nlevels - 1) {
/* No cached indirect blocks found. */
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
bp = dn->dn_phys->dn_blkptr[curblkid];
}
if (BP_IS_HOLE(&bp))
return;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
ZIO_FLAG_CANFAIL);
dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
ds = dn->dn_objset->os_dsl_dataset;
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, level, blkid);
dpa->dpa_curlevel = curlevel;
dpa->dpa_prio = prio;
dpa->dpa_aflags = aflags;
dpa->dpa_spa = dn->dn_objset->os_spa;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
/*
* If we have the indirect just above us, no need to do the asynchronous
* prefetch chain; we'll just run the last step ourselves. If we're at
* a higher level, though, we want to issue the prefetches for all the
* indirect blocks asynchronously, so we can go on with whatever we were
* doing.
*/
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb; zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, 0, blkid); dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
(void) arc_read(NULL, dn->dn_objset->os_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio,
bp, NULL, NULL, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb); &iter_aflags, &zb);
}
if (db)
dbuf_rele(db, NULL);
} }
/*
* We use pio here instead of dpa_zio since it's possible that
* dpa may have already been freed.
*/
zio_nowait(pio);
} }
#define DBUF_HOLD_IMPL_MAX_DEPTH 20 #define DBUF_HOLD_IMPL_MAX_DEPTH 20
@ -2079,6 +2271,9 @@ top:
if (dh->dh_db == NULL) { if (dh->dh_db == NULL) {
dh->dh_bp = NULL; dh->dh_bp = NULL;
if (dh->dh_fail_uncached)
return (SET_ERROR(ENOENT));
ASSERT3P(dh->dh_parent, ==, NULL); ASSERT3P(dh->dh_parent, ==, NULL);
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
dh->dh_fail_sparse, &dh->dh_parent, dh->dh_fail_sparse, &dh->dh_parent,
@ -2099,6 +2294,11 @@ top:
dh->dh_parent, dh->dh_bp); dh->dh_parent, dh->dh_bp);
} }
if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
mutex_exit(&dh->dh_db->db_mtx);
return (SET_ERROR(ENOENT));
}
if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
if (dh->dh_db->db_buf->b_data == NULL) { if (dh->dh_db->db_buf->b_data == NULL) {
@ -2159,7 +2359,8 @@ top:
* on the stack for 20 levels of recursion. * on the stack for 20 levels of recursion.
*/ */
int int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp) void *tag, dmu_buf_impl_t **dbp)
{ {
struct dbuf_hold_impl_data *dh; struct dbuf_hold_impl_data *dh;
@ -2167,7 +2368,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) *
DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0); __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse,
fail_uncached, tag, dbp, 0);
error = __dbuf_hold_impl(dh); error = __dbuf_hold_impl(dh);
@ -2179,13 +2381,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
static void static void
__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp, int depth) void *tag, dmu_buf_impl_t **dbp, int depth)
{ {
dh->dh_dn = dn; dh->dh_dn = dn;
dh->dh_level = level; dh->dh_level = level;
dh->dh_blkid = blkid; dh->dh_blkid = blkid;
dh->dh_fail_sparse = fail_sparse; dh->dh_fail_sparse = fail_sparse;
dh->dh_fail_uncached = fail_uncached;
dh->dh_tag = tag; dh->dh_tag = tag;
dh->dh_dbp = dbp; dh->dh_dbp = dbp;
dh->dh_depth = depth; dh->dh_depth = depth;
@ -2194,16 +2400,14 @@ __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
dmu_buf_impl_t * dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{ {
dmu_buf_impl_t *db; return (dbuf_hold_level(dn, 0, blkid, tag));
int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
return (err ? NULL : db);
} }
dmu_buf_impl_t * dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{ {
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db); return (err ? NULL : db);
} }
@ -2531,8 +2735,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (parent == NULL) { if (parent == NULL) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
(void) dbuf_hold_impl(dn, db->db_level+1, parent = dbuf_hold_level(dn, db->db_level + 1,
db->db_blkid >> epbs, FALSE, db, &parent); db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
db->db_parent = parent; db->db_parent = parent;

View File

@ -138,7 +138,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
err = dnode_hold(os, object, FTAG, &dn); err = dnode_hold(os, object, FTAG, &dn);
if (err) if (err)
return (err); return (err);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag); db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -421,7 +421,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) { for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
if (db == NULL) { if (db == NULL) {
@ -522,17 +522,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
} }
/* /*
* Issue prefetch i/os for the given blocks. * Issue prefetch i/os for the given blocks. If level is greater than 0, the
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
* *
* Note: The assumption is that we *know* these blocks will be needed * Note that if the indirect blocks above the blocks being prefetched are not in
* almost immediately. Therefore, the prefetch i/os will be issued at * cache, they will be asynchronously read in.
* ZIO_PRIORITY_SYNC_READ
*
* Note: indirect blocks and other metadata will be read synchronously,
* causing this function to block if they are not already cached.
*/ */
void void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{ {
dnode_t *dn; dnode_t *dn;
uint64_t blkid; uint64_t blkid;
@ -548,8 +547,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return; return;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); blkid = dbuf_whichblock(dn, level,
dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
return; return;
} }
@ -564,10 +564,16 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return; return;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) { /*
int blkshift = dn->dn_datablkshift; * offset + len - 1 is the last byte we want to prefetch for, and offset
nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
P2ALIGN(offset, 1 << blkshift)) >> blkshift; * last block we want to prefetch, and dbuf_whichblock(dn, level,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
*/
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
} else { } else {
nblks = (offset < dn->dn_datablksz); nblks = (offset < dn->dn_datablksz);
} }
@ -575,9 +581,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
if (nblks != 0) { if (nblks != 0) {
int i; int i;
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, level, offset);
for (i = 0; i < nblks; i++) for (i = 0; i < nblks; i++)
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); dbuf_prefetch(dn, level, blkid + i, pri, 0);
} }
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -1293,7 +1299,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DB_DNODE_ENTER(dbuf); DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf); dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, offset); blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf); DB_DNODE_EXIT(dbuf);

View File

@ -115,7 +115,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (issig(JUSTLOOKING) && issig(FORREAL)) if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
if (zb->zb_object != DMU_META_DNODE_OBJECT) if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
return (0); return (0);
if (BP_IS_HOLE(bp)) { if (BP_IS_HOLE(bp)) {

View File

@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (0); return (0);
} }
/*
* Return (in *objectp) the next object which is allocated (or a hole)
* after *objectp, taking into account only objects that may have been modified
* after the specified txg.
*/
int int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{ {

File diff suppressed because it is too large Load Diff

View File

@ -157,7 +157,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we already visited this bp & everything below, * If we already visited this bp & everything below,
* don't bother doing it again. * don't bother doing it again.
*/ */
if (zbookmark_is_before(dnp, zb, td->td_resume)) if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL); return (RESUME_SKIP_ALL);
/* /*
@ -428,6 +428,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0; int j, err = 0;
zbookmark_phys_t czb; zbookmark_phys_t czb;
if (td->td_flags & TRAVERSE_PRE) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err != 0)
return (err);
}
for (j = 0; j < dnp->dn_nblkptr; j++) { for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@ -435,10 +446,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
break; break;
} }
if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
} }
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err != 0)
return (err);
}
return (err); return (err);
} }
@ -451,6 +473,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0); ASSERT(pfd->pd_bytes_fetched >= 0);
if (bp == NULL)
return (0);
if (pfd->pd_cancel) if (pfd->pd_cancel)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));

View File

@ -332,7 +332,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
dmu_buf_impl_t *db; dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); err = dbuf_hold_impl(dn, 0, start,
FALSE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (err) { if (err) {
@ -533,7 +534,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb); blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks); tochk = MIN(epb - blkoff, nblks);
err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); err = dbuf_hold_impl(dn, 1, blkid >> epbs,
FALSE, FALSE, FTAG, &dbuf);
if (err) { if (err) {
txh->txh_tx->tx_err = err; txh->txh_tx->tx_err = err;
break; break;

View File

@ -293,7 +293,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
for (i = 0; i < fetchsz; i++) { for (i = 0; i < fetchsz; i++) {
dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
ARC_FLAG_PREFETCH);
} }
return (fetchsz); return (fetchsz);

View File

@ -1112,7 +1112,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
drop_struct_lock = TRUE; drop_struct_lock = TRUE;
} }
blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG); db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock) if (drop_struct_lock)
@ -1409,7 +1409,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
goto fail; goto fail;
/* resize the old block */ /* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0) if (err == 0)
dbuf_new_size(db, size, tx); dbuf_new_size(db, size, tx);
else if (err != ENOENT) else if (err != ENOENT)
@ -1582,8 +1582,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT3U(blkoff + head, ==, blksz); ASSERT3U(blkoff + head, ==, blksz);
if (len < head) if (len < head)
head = len; head = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
FTAG, &db) == 0) { TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data; caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */ /* don't dirty if it isn't on disk and isn't dirty */
@ -1620,8 +1620,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (tail) { if (tail) {
if (len < tail) if (len < tail)
tail = len; tail = len;
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
TRUE, FTAG, &db) == 0) { TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */ /* don't dirty if not on disk and not dirty */
if (db->db_last_dirty || if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@ -1875,8 +1875,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
epb = dn->dn_phys->dn_nblkptr; epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr; data = dn->dn_phys->dn_blkptr;
} else { } else {
uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) { if (error) {
if (error != ENOENT) if (error != ENOENT)
return (error); return (error);

View File

@ -192,7 +192,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1, err = dbuf_hold_impl(dn, db->db_level-1,
(db->db_blkid << epbs) + i, TRUE, FTAG, &child); (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT) if (err == ENOENT)
continue; continue;
@ -288,7 +288,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
continue; continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
i, B_TRUE, FTAG, &subdb)); i, TRUE, FALSE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr); ASSERT3P(bp, ==, subdb->db_blkptr);
@ -362,7 +362,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
continue; continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER); rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FTAG, &db)); TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx); free_children(db, blkid, nblks, tx);

View File

@ -547,6 +547,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
const char *snapname; const char *snapname;
uint64_t obj; uint64_t obj;
int err = 0; int err = 0;
dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0) if (err != 0)
@ -555,36 +556,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp)); ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj; obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0) if (obj != 0)
err = dsl_dataset_hold_obj(dp, obj, tag, dsp); err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else else
err = SET_ERROR(ENOENT); err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */ /* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) { if (err == 0 && snapname != NULL) {
dsl_dataset_t *ds; dsl_dataset_t *snap_ds;
if (*snapname++ != '@') { if (*snapname++ != '@') {
dsl_dataset_rele(*dsp, tag); dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG); dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT)); return (SET_ERROR(ENOENT));
} }
dprintf("looking for snapshot '%s'\n", snapname); dprintf("looking for snapshot '%s'\n", snapname);
err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0) if (err == 0)
err = dsl_dataset_hold_obj(dp, obj, tag, &ds); err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
dsl_dataset_rele(*dsp, tag); dsl_dataset_rele(ds, tag);
if (err == 0) { if (err == 0) {
mutex_enter(&ds->ds_lock); mutex_enter(&snap_ds->ds_lock);
if (ds->ds_snapname[0] == 0) if (snap_ds->ds_snapname[0] == 0)
(void) strlcpy(ds->ds_snapname, snapname, (void) strlcpy(snap_ds->ds_snapname, snapname,
sizeof (ds->ds_snapname)); sizeof (snap_ds->ds_snapname));
mutex_exit(&ds->ds_lock); mutex_exit(&snap_ds->ds_lock);
ds = snap_ds;
}
}
if (err == 0)
*dsp = ds; *dsp = ds;
}
}
dsl_dir_rele(dd, FTAG); dsl_dir_rele(dd, FTAG);
return (err); return (err);
} }

View File

@ -560,7 +560,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg; struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx; dmu_tx_t *tx = ka->tx;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) { if (zb->zb_level == ZB_ZIL_LEVEL) {

View File

@ -619,7 +619,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
* If we already visited this bp & everything below (in * If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again. * a prior txg sync), don't bother doing it again.
*/ */
if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) if (zbookmark_subtree_completed(dnp, zb,
&scn->scn_phys.scn_bookmark))
return (B_TRUE); return (B_TRUE);
/* /*

View File

@ -1921,7 +1921,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
size_t size; size_t size;
void *data; void *data;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
/* /*
* Note: normally this routine will not be called if * Note: normally this routine will not be called if

View File

@ -76,8 +76,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
mutex_exit(sm->sm_lock); mutex_exit(sm->sm_lock);
if (end > bufsize) { if (end > bufsize) {
dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
end - bufsize); end - bufsize, ZIO_PRIORITY_SYNC_READ);
} }
mutex_enter(sm->sm_lock); mutex_enter(sm->sm_lock);

View File

@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk; tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied); ASSERT0(tbl->zt_blks_copied);
dmu_prefetch(zap->zap_objset, zap->zap_object, dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
tbl->zt_blk << bs, tbl->zt_numblks << bs); tbl->zt_blk << bs, tbl->zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
} }
/* /*
@ -949,7 +950,8 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0) if (zap_idx_to_blk(zap, idx, &blk) != 0)
return; return;
bs = FZAP_BLOCK_SHIFT(zap); bs = FZAP_BLOCK_SHIFT(zap);
dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
} }
/* /*
@ -1295,9 +1297,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} else { } else {
int b; int b;
dmu_prefetch(zap->zap_objset, zap->zap_object, dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) { b++) {

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/ */
/* Portions Copyright 2010 Robert Milkowski */ /* Portions Copyright 2010 Robert Milkowski */

View File

@ -2118,7 +2118,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
/* Prefetch znode */ /* Prefetch znode */
if (prefetch) { if (prefetch) {
dmu_prefetch(os, objnum, 0, 0); dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
} }
/* /*

View File

@ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX;
#define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101 #define ZIO_PIPELINE_STOP 0x101
/*
 * Number of level-0 blocks spanned by a single block at the given level,
 * for an object whose indirect block shift is indblkshift:
 * 2^(level * (indblkshift - SPA_BLKPTRSHIFT)). The result is a uint64_t.
 */
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
/*
 * 1 << 31. Presumably the "level + 1<<31" offset that the zbookmark_compare()
 * comment describes for bookmarks inside the meta-dnode (a value larger than
 * any real level) — confirm against its use.
 */
#define COMPARE_META_LEVEL 0x80000000ul
/* /*
* The following actions directly effect the spa's sync-to-convergence logic. * The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action. * The values below define the sync pass when we start performing the action.
@ -3450,39 +3453,129 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_done zio_done
}; };
/* dnp is the dnode for zb1->zb_object */
boolean_t
zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
const zbookmark_phys_t *zb2)
{
uint64_t zb1nextL0, zb2thisobj;
ASSERT(zb1->zb_objset == zb2->zb_objset);
ASSERT(zb2->zb_level == 0);
/*
* Compare two zbookmark_phys_t's to see which we would reach first in a
* pre-order traversal of the object tree.
*
* This is simple in every case aside from the meta-dnode object. For all other
* objects, we traverse them in order (object 1 before object 2, and so on).
* However, all of these objects are traversed while traversing object 0, since
* the data it points to is the list of objects. Thus, we need to convert to a
* canonical representation so we can compare meta-dnode bookmarks to
* non-meta-dnode bookmarks.
*
* We do this by calculating "equivalents" for each field of the zbookmark.
* zbookmarks outside of the meta-dnode use their own object and level, and
* calculate the level 0 equivalent (the first L0 blkid that is contained in the
* blocks this bookmark refers to) by multiplying their blkid by their span
* (the number of L0 blocks contained within one block at their level).
* zbookmarks inside the meta-dnode calculate their object equivalent
* (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
* level + COMPARE_META_LEVEL (1 << 31, larger than any level could ever be)
* for their level.
* This causes them to always compare before a bookmark in their object
* equivalent, compare appropriately to bookmarks in other objects, and to
* compare appropriately to other bookmarks in the meta-dnode.
*/
int
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
{
	/*
	 * Canonical ("equivalent") object, first-L0 blkid and level of each
	 * bookmark.  For bookmark N, dbssN is consumed as a data block size
	 * in sectors and ibsN as an indirect block shift.
	 *
	 * Returns -1 if zb1 sorts first, 1 if zb2 sorts first, 0 otherwise.
	 */
	uint64_t obj1, obj2;
	uint64_t l0_1, l0_2;
	uint64_t lvl1, lvl2;

	/* Bookmarks that match field-for-field are trivially equal. */
	if (zb1->zb_object == zb2->zb_object &&
	    zb1->zb_level == zb2->zb_level &&
	    zb1->zb_blkid == zb2->zb_blkid)
		return (0);

	/*
	 * Start from the ordinary-object interpretation: the bookmark's own
	 * object and level, and its blkid scaled by BP_SPANB (the span, in
	 * blocks, of one block at that level).
	 */
	obj1 = zb1->zb_object;
	lvl1 = zb1->zb_level;
	l0_1 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);

	obj2 = zb2->zb_object;
	lvl2 = zb2->zb_level;
	l0_2 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);

	/*
	 * A meta-dnode bookmark is re-expressed instead as an object number
	 * (its L0 equivalent times the dnodes per data block), an L0
	 * equivalent of 0, and a level biased by COMPARE_META_LEVEL.
	 */
	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		obj1 = l0_1 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		l0_1 = 0;
		lvl1 = zb1->zb_level + COMPARE_META_LEVEL;
	}
	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
		obj2 = l0_2 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		l0_2 = 0;
		lvl2 = zb2->zb_level + COMPARE_META_LEVEL;
	}

	/* Order by canonical object first, then by first L0 blkid. */
	if (obj1 != obj2)
		return (obj1 < obj2 ? -1 : 1);
	if (l0_1 != l0_2)
		return (l0_1 < l0_2 ? -1 : 1);

	/* At the same position, the higher level is reached first. */
	if (lvl1 != lvl2)
		return (lvl1 > lvl2 ? -1 : 1);

	/*
	 * The raw fields differed but the canonical forms coincide.  This
	 * can (theoretically) happen when the bookmarks share an object and
	 * level but have different blkids and the block sizes are not the
	 * same.  There is presently no way to change the indirect block
	 * sizes.
	 */
	return (0);
}
/*
* This function checks the following: given that last_block is the place that
* our traversal stopped last time, does that guarantee that we've visited
* every node under subtree_root? We can't just use the raw output of
* zbookmark_compare on subtree_root itself, because a block compares before
* its own children. Instead, we pass in a modified copy of subtree_root with
* its block id incremented; if that modified bookmark is before or equal to
* last_block, then having visited last_block implies that all of
* subtree_root's children have been visited.
*/
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
zbookmark_phys_t mod_zb = *subtree_root;
mod_zb.zb_blkid++;
ASSERT(last_block->zb_level == 0);
/* The objset_phys_t isn't before anything. */ /* The objset_phys_t isn't before anything. */
if (dnp == NULL) if (dnp == NULL)
return (B_FALSE); return (B_FALSE);
zb1nextL0 = (zb1->zb_blkid + 1) << /*
((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
* data block size in sectors, because that variable is only used if
zb2thisobj = zb2->zb_object ? zb2->zb_object : * the bookmark refers to a block in the meta-dnode. Since we don't
zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); * know without examining it what object it refers to, and there's no
* harm in passing in this value in other cases, we always pass it in.
if (zb1->zb_object == DMU_META_DNODE_OBJECT) { *
uint64_t nextobj = zb1nextL0 * * We pass in 0 for the indirect block size shift because zb2 must be
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; * level 0. The indirect block size is only used to calculate the span
return (nextobj <= zb2thisobj); * of the bookmark, but since the bookmark must be level 0, the span is
} * always 1, so the math works out.
*
if (zb1->zb_object < zb2thisobj) * If you make changes to how the zbookmark_compare code works, be sure
return (B_TRUE); * to make sure that this code still works afterwards.
if (zb1->zb_object > zb2thisobj) */
return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
if (zb2->zb_object == DMU_META_DNODE_OBJECT) 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
return (B_FALSE); last_block) <= 0);
return (zb1nextL0 <= zb2->zb_blkid);
} }
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)

View File

@ -1397,8 +1397,9 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev)
*/ */
len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
if (len > 0) { if (len > 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, len); dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch(os, ZVOL_OBJ, volsize - len, len); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
ZIO_PRIORITY_SYNC_READ);
} }
zv->zv_objset = NULL; zv->zv_objset = NULL;