mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-20 23:15:01 +03:00 
			
		
		
		
	Illumos 5960, 5925
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=

Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

References:
  https://www.illumos.org/issues/5960
  https://www.illumos.org/issues/5925
  https://github.com/illumos/illumos-gate/commit/a2cdcdd

Porting notes:
- [lib/libzfs/libzfs_sendrecv.c]
  - b8864a2 Fix gcc cast warnings
  - 325f023 Add linux kernel device support
  - 5c3f61e Increase Linux pipe buffer size on 'zfs receive'
- [module/zfs/zfs_vnops.c]
  - 3558fd7 Prototype/structure update for Linux
  - c12e3a5 Restructure zfs_readdir() to fix regressions
- [module/zfs/zvol.c]
  - Function @zvol_map_block() isn't needed in ZoL
  - 9965059 Prefetch start and end of volumes
- [module/zfs/dmu.c]
  - Fixed ISO C90 - mixed declarations and code
  - Function dmu_prefetch() 'int i' is initialized before the following code block (c90 vs. c99)
- [module/zfs/dbuf.c]
  - fc5bb51 Fix stack dbuf_hold_impl()
  - 9b67f60 Illumos 4757, 4913
  - 34229a2 Reduce stack usage for recursive traverse_visitbp()
- [module/zfs/dmu_send.c]
  - Fixed ISO C90 - mixed declarations and code
  - b58986e Use large stacks when available
  - 241b541 Illumos 5959 - clean up per-dataset feature count code
  - 77aef6f Use vmem_alloc() for nvlists
  - 00b4602 Add linux kernel memory support

Ported-by: kernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
		
							parent
							
								
									00af2ff6f2
								
							
						
					
					
						commit
						fcff0f35bd
					
				| @ -2489,6 +2489,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	dmu_object_type_t type; | 	dmu_object_type_t type; | ||||||
| 	boolean_t is_metadata; | 	boolean_t is_metadata; | ||||||
| 
 | 
 | ||||||
|  | 	if (bp == NULL) | ||||||
|  | 		return (0); | ||||||
|  | 
 | ||||||
| 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { | 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { | ||||||
| 		char blkbuf[BP_SPRINTF_LEN]; | 		char blkbuf[BP_SPRINTF_LEN]; | ||||||
| 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); | 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); | ||||||
| @ -2985,7 +2988,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	avl_index_t where; | 	avl_index_t where; | ||||||
| 	zdb_ddt_entry_t *zdde, zdde_search; | 	zdb_ddt_entry_t *zdde, zdde_search; | ||||||
| 
 | 
 | ||||||
| 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | ||||||
| 		return (0); | 		return (0); | ||||||
| 
 | 
 | ||||||
| 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { | 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { | ||||||
|  | |||||||
| @ -249,8 +249,9 @@ get_usage(zfs_help_t idx) | |||||||
| 		return (gettext("\tpromote <clone-filesystem>\n")); | 		return (gettext("\tpromote <clone-filesystem>\n")); | ||||||
| 	case HELP_RECEIVE: | 	case HELP_RECEIVE: | ||||||
| 		return (gettext("\treceive [-vnFu] <filesystem|volume|" | 		return (gettext("\treceive [-vnFu] <filesystem|volume|" | ||||||
| 		"snapshot>\n" | 		    "snapshot>\n" | ||||||
| 		"\treceive [-vnFu] [-d | -e] <filesystem>\n")); | 		    "\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] " | ||||||
|  | 		    "<filesystem>\n")); | ||||||
| 	case HELP_RENAME: | 	case HELP_RENAME: | ||||||
| 		return (gettext("\trename [-f] <filesystem|volume|snapshot> " | 		return (gettext("\trename [-f] <filesystem|volume|snapshot> " | ||||||
| 		    "<filesystem|volume|snapshot>\n" | 		    "<filesystem|volume|snapshot>\n" | ||||||
| @ -793,7 +794,7 @@ zfs_do_create(int argc, char **argv) | |||||||
| 				nomem(); | 				nomem(); | ||||||
| 			break; | 			break; | ||||||
| 		case 'o': | 		case 'o': | ||||||
| 			if (parseprop(props, optarg)) | 			if (parseprop(props, optarg) != 0) | ||||||
| 				goto error; | 				goto error; | ||||||
| 			break; | 			break; | ||||||
| 		case 's': | 		case 's': | ||||||
| @ -3622,7 +3623,7 @@ zfs_do_snapshot(int argc, char **argv) | |||||||
| 	while ((c = getopt(argc, argv, "ro:")) != -1) { | 	while ((c = getopt(argc, argv, "ro:")) != -1) { | ||||||
| 		switch (c) { | 		switch (c) { | ||||||
| 		case 'o': | 		case 'o': | ||||||
| 			if (parseprop(props, optarg)) | 			if (parseprop(props, optarg) != 0) | ||||||
| 				return (1); | 				return (1); | ||||||
| 			break; | 			break; | ||||||
| 		case 'r': | 		case 'r': | ||||||
| @ -3881,10 +3882,19 @@ zfs_do_receive(int argc, char **argv) | |||||||
| { | { | ||||||
| 	int c, err; | 	int c, err; | ||||||
| 	recvflags_t flags = { 0 }; | 	recvflags_t flags = { 0 }; | ||||||
|  | 	nvlist_t *props; | ||||||
|  | 	nvpair_t *nvp = NULL; | ||||||
|  | 
 | ||||||
|  | 	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) | ||||||
|  | 		nomem(); | ||||||
| 
 | 
 | ||||||
| 	/* check options */ | 	/* check options */ | ||||||
| 	while ((c = getopt(argc, argv, ":denuvF")) != -1) { | 	while ((c = getopt(argc, argv, ":o:denuvF")) != -1) { | ||||||
| 		switch (c) { | 		switch (c) { | ||||||
|  | 		case 'o': | ||||||
|  | 			if (parseprop(props, optarg) != 0) | ||||||
|  | 				return (1); | ||||||
|  | 			break; | ||||||
| 		case 'd': | 		case 'd': | ||||||
| 			flags.isprefix = B_TRUE; | 			flags.isprefix = B_TRUE; | ||||||
| 			break; | 			break; | ||||||
| @ -3929,6 +3939,13 @@ zfs_do_receive(int argc, char **argv) | |||||||
| 		usage(B_FALSE); | 		usage(B_FALSE); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	while ((nvp = nvlist_next_nvpair(props, nvp))) { | ||||||
|  | 		if (strcmp(nvpair_name(nvp), "origin") != 0) { | ||||||
|  | 			(void) fprintf(stderr, gettext("invalid option")); | ||||||
|  | 			usage(B_FALSE); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (isatty(STDIN_FILENO)) { | 	if (isatty(STDIN_FILENO)) { | ||||||
| 		(void) fprintf(stderr, | 		(void) fprintf(stderr, | ||||||
| 		    gettext("Error: Backup stream can not be read " | 		    gettext("Error: Backup stream can not be read " | ||||||
| @ -3937,7 +3954,7 @@ zfs_do_receive(int argc, char **argv) | |||||||
| 		return (1); | 		return (1); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); | 	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); | ||||||
| 
 | 
 | ||||||
| 	return (err != 0); | 	return (err != 0); | ||||||
| } | } | ||||||
|  | |||||||
| @ -3728,7 +3728,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) | |||||||
| 	 */ | 	 */ | ||||||
| 	n = ztest_random(regions) * stride + ztest_random(width); | 	n = ztest_random(regions) * stride + ztest_random(width); | ||||||
| 	s = 1 + ztest_random(2 * width - 1); | 	s = 1 + ztest_random(2 * width - 1); | ||||||
| 	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); | 	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, | ||||||
|  | 	    ZIO_PRIORITY_SYNC_READ); | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Pick a random index and compute the offsets into packobj and bigobj. | 	 * Pick a random index and compute the offsets into packobj and bigobj. | ||||||
| @ -5930,8 +5931,10 @@ ztest_run(ztest_shared_t *zs) | |||||||
| 	 * Right before closing the pool, kick off a bunch of async I/O; | 	 * Right before closing the pool, kick off a bunch of async I/O; | ||||||
| 	 * spa_close() should wait for it to complete. | 	 * spa_close() should wait for it to complete. | ||||||
| 	 */ | 	 */ | ||||||
| 	for (object = 1; object < 50; object++) | 	for (object = 1; object < 50; object++) { | ||||||
| 		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); | 		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, | ||||||
|  | 		    ZIO_PRIORITY_SYNC_READ); | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	/* Verify that at least one commit cb was called in a timely fashion */ | 	/* Verify that at least one commit cb was called in a timely fashion */ | ||||||
| 	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) | 	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) | ||||||
|  | |||||||
| @ -678,8 +678,8 @@ typedef struct recvflags { | |||||||
| 	boolean_t nomount; | 	boolean_t nomount; | ||||||
| } recvflags_t; | } recvflags_t; | ||||||
| 
 | 
 | ||||||
| extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, | extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, | ||||||
|     int, avl_tree_t *); |     recvflags_t *, int, avl_tree_t *); | ||||||
| 
 | 
 | ||||||
| typedef enum diff_flags { | typedef enum diff_flags { | ||||||
| 	ZFS_DIFF_PARSEABLE = 0x1, | 	ZFS_DIFF_PARSEABLE = 0x1, | ||||||
|  | |||||||
| @ -9,6 +9,7 @@ COMMON_H = \ | |||||||
| 	$(top_srcdir)/include/sys/bplist.h \
 | 	$(top_srcdir)/include/sys/bplist.h \
 | ||||||
| 	$(top_srcdir)/include/sys/bpobj.h \
 | 	$(top_srcdir)/include/sys/bpobj.h \
 | ||||||
| 	$(top_srcdir)/include/sys/bptree.h \
 | 	$(top_srcdir)/include/sys/bptree.h \
 | ||||||
|  | 	$(top_srcdir)/include/sys/bqueue.h \
 | ||||||
| 	$(top_srcdir)/include/sys/dbuf.h \
 | 	$(top_srcdir)/include/sys/dbuf.h \
 | ||||||
| 	$(top_srcdir)/include/sys/ddt.h \
 | 	$(top_srcdir)/include/sys/ddt.h \
 | ||||||
| 	$(top_srcdir)/include/sys/dmu.h \
 | 	$(top_srcdir)/include/sys/dmu.h \
 | ||||||
| @ -96,6 +97,7 @@ COMMON_H = \ | |||||||
| 	$(top_srcdir)/include/sys/zio_compress.h \
 | 	$(top_srcdir)/include/sys/zio_compress.h \
 | ||||||
| 	$(top_srcdir)/include/sys/zio.h \
 | 	$(top_srcdir)/include/sys/zio.h \
 | ||||||
| 	$(top_srcdir)/include/sys/zio_impl.h \
 | 	$(top_srcdir)/include/sys/zio_impl.h \
 | ||||||
|  | 	$(top_srcdir)/include/sys/zio_priority.h \
 | ||||||
| 	$(top_srcdir)/include/sys/zrlock.h | 	$(top_srcdir)/include/sys/zrlock.h | ||||||
| 
 | 
 | ||||||
| KERNEL_H = \
 | KERNEL_H = \
 | ||||||
|  | |||||||
							
								
								
									
										54
									
								
								include/sys/bqueue.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								include/sys/bqueue.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,54 @@ | |||||||
|  | /*
 | ||||||
|  |  * CDDL HEADER START | ||||||
|  |  * | ||||||
|  |  * This file and its contents are supplied under the terms of the | ||||||
|  |  * Common Development and Distribution License ("CDDL"), version 1.0. | ||||||
|  |  * You may only use this file in accordance with the terms of version | ||||||
|  |  * 1.0 of the CDDL. | ||||||
|  |  * | ||||||
|  |  * A full copy of the text of the CDDL should have accompanied this | ||||||
|  |  * source.  A copy of the CDDL is also available via the Internet at | ||||||
|  |  * http://www.illumos.org/license/CDDL.
 | ||||||
|  |  * | ||||||
|  |  * CDDL HEADER END | ||||||
|  |  */ | ||||||
|  | /*
 | ||||||
|  |  * Copyright (c) 2014 by Delphix. All rights reserved. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | #ifndef	_BQUEUE_H | ||||||
|  | #define	_BQUEUE_H | ||||||
|  | 
 | ||||||
|  | #ifdef	__cplusplus | ||||||
|  | extern "C" { | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #include	<sys/zfs_context.h> | ||||||
|  | 
 | ||||||
|  | typedef struct bqueue { | ||||||
|  | 	list_t bq_list; | ||||||
|  | 	kmutex_t bq_lock; | ||||||
|  | 	kcondvar_t bq_add_cv; | ||||||
|  | 	kcondvar_t bq_pop_cv; | ||||||
|  | 	uint64_t bq_size; | ||||||
|  | 	uint64_t bq_maxsize; | ||||||
|  | 	size_t bq_node_offset; | ||||||
|  | } bqueue_t; | ||||||
|  | 
 | ||||||
|  | typedef struct bqueue_node { | ||||||
|  | 	list_node_t bqn_node; | ||||||
|  | 	uint64_t bqn_size; | ||||||
|  | } bqueue_node_t; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | int bqueue_init(bqueue_t *, uint64_t, size_t); | ||||||
|  | void bqueue_destroy(bqueue_t *); | ||||||
|  | void bqueue_enqueue(bqueue_t *, void *, uint64_t); | ||||||
|  | void *bqueue_dequeue(bqueue_t *); | ||||||
|  | boolean_t bqueue_empty(bqueue_t *); | ||||||
|  | 
 | ||||||
|  | #ifdef	__cplusplus | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #endif	/* _BQUEUE_H */ | ||||||
| @ -261,8 +261,7 @@ typedef struct dbuf_hash_table { | |||||||
| 	kmutex_t hash_mutexes[DBUF_MUTEXES]; | 	kmutex_t hash_mutexes[DBUF_MUTEXES]; | ||||||
| } dbuf_hash_table_t; | } dbuf_hash_table_t; | ||||||
| 
 | 
 | ||||||
| 
 | uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); | ||||||
| uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); |  | ||||||
| 
 | 
 | ||||||
| void dbuf_create_bonus(struct dnode *dn); | void dbuf_create_bonus(struct dnode *dn); | ||||||
| int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); | int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); | ||||||
| @ -272,10 +271,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); | |||||||
| dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); | dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); | ||||||
| dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, | dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, | ||||||
|     void *tag); |     void *tag); | ||||||
| int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, | int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, | ||||||
|  |     boolean_t fail_sparse, boolean_t fail_uncached, | ||||||
|     void *tag, dmu_buf_impl_t **dbp); |     void *tag, dmu_buf_impl_t **dbp); | ||||||
| 
 | 
 | ||||||
| void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); | void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, | ||||||
|  |     zio_priority_t prio, arc_flags_t aflags); | ||||||
| 
 | 
 | ||||||
| void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); | void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); | ||||||
| boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, | boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, | ||||||
|  | |||||||
| @ -44,6 +44,7 @@ | |||||||
| #include <sys/inttypes.h> | #include <sys/inttypes.h> | ||||||
| #include <sys/cred.h> | #include <sys/cred.h> | ||||||
| #include <sys/fs/zfs.h> | #include <sys/fs/zfs.h> | ||||||
|  | #include <sys/zio_priority.h> | ||||||
| #include <sys/uio.h> | #include <sys/uio.h> | ||||||
| 
 | 
 | ||||||
| #ifdef	__cplusplus | #ifdef	__cplusplus | ||||||
| @ -737,8 +738,8 @@ extern int zfs_max_recordsize; | |||||||
| /*
 | /*
 | ||||||
|  * Asynchronously try to read in the data. |  * Asynchronously try to read in the data. | ||||||
|  */ |  */ | ||||||
| void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, | void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, | ||||||
|     uint64_t len); | 	uint64_t len, enum zio_priority pri); | ||||||
| 
 | 
 | ||||||
| typedef struct dmu_object_info { | typedef struct dmu_object_info { | ||||||
| 	/* All sizes are in bytes unless otherwise indicated. */ | 	/* All sizes are in bytes unless otherwise indicated. */ | ||||||
|  | |||||||
| @ -21,7 +21,7 @@ | |||||||
| /*
 | /*
 | ||||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||||
|  * Copyright (c) 2011, 2015 by Delphix. All rights reserved. |  * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | ||||||
|  * Copyright (c) 2013, Joyent, Inc. All rights reserved. |  * Copyright (c) 2011, 2014 by Delphix. All rights reserved. | ||||||
|  * Copyright (c) 2013 Steven Hartland. All rights reserved. |  * Copyright (c) 2013 Steven Hartland. All rights reserved. | ||||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. |  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||||
|  */ |  */ | ||||||
|  | |||||||
| @ -160,8 +160,18 @@ extern int aok; | |||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * DTrace SDT probes have different signatures in userland than they do in |  * DTrace SDT probes have different signatures in userland than they do in | ||||||
|  * kernel.  If they're being used in kernel code, re-define them out of |  * the kernel.  If they're being used in kernel code, re-define them out of | ||||||
|  * existence for their counterparts in libzpool. |  * existence for their counterparts in libzpool. | ||||||
|  |  * | ||||||
|  |  * Here's an example of how to use the set-error probes in userland: | ||||||
|  |  * zfs$target:::set-error /arg0 == EBUSY/ {stack();} | ||||||
|  |  * | ||||||
|  |  * Here's an example of how to use DTRACE_PROBE probes in userland: | ||||||
|  |  * If there is a probe declared as follows: | ||||||
|  |  * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); | ||||||
|  |  * Then you can use it as follows: | ||||||
|  |  * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ | ||||||
|  |  *     {printf("%u %p\n", arg1, arg2);} | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| #ifdef DTRACE_PROBE | #ifdef DTRACE_PROBE | ||||||
|  | |||||||
| @ -29,6 +29,7 @@ | |||||||
| #ifndef _ZIO_H | #ifndef _ZIO_H | ||||||
| #define	_ZIO_H | #define	_ZIO_H | ||||||
| 
 | 
 | ||||||
|  | #include <sys/zio_priority.h> | ||||||
| #include <sys/zfs_context.h> | #include <sys/zfs_context.h> | ||||||
| #include <sys/spa.h> | #include <sys/spa.h> | ||||||
| #include <sys/txg.h> | #include <sys/txg.h> | ||||||
| @ -147,17 +148,6 @@ enum zio_compress { | |||||||
| #define	ZIO_FAILURE_MODE_CONTINUE	1 | #define	ZIO_FAILURE_MODE_CONTINUE	1 | ||||||
| #define	ZIO_FAILURE_MODE_PANIC		2 | #define	ZIO_FAILURE_MODE_PANIC		2 | ||||||
| 
 | 
 | ||||||
| typedef enum zio_priority { |  | ||||||
| 	ZIO_PRIORITY_SYNC_READ, |  | ||||||
| 	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */ |  | ||||||
| 	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */ |  | ||||||
| 	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */ |  | ||||||
| 	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */ |  | ||||||
| 	ZIO_PRIORITY_NUM_QUEUEABLE, |  | ||||||
| 
 |  | ||||||
| 	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */ |  | ||||||
| } zio_priority_t; |  | ||||||
| 
 |  | ||||||
| enum zio_flag { | enum zio_flag { | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Flags inherited by gang, ddt, and vdev children, | 	 * Flags inherited by gang, ddt, and vdev children, | ||||||
| @ -262,6 +252,7 @@ extern const char *zio_type_name[ZIO_TYPES]; | |||||||
|  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>. |  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>. | ||||||
|  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. |  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. | ||||||
|  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. |  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. | ||||||
|  |  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>. | ||||||
|  * |  * | ||||||
|  * Note: this structure is called a bookmark because its original purpose |  * Note: this structure is called a bookmark because its original purpose | ||||||
|  * was to remember where to resume a pool-wide traverse. |  * was to remember where to resume a pool-wide traverse. | ||||||
| @ -294,6 +285,9 @@ struct zbookmark_phys { | |||||||
| #define	ZB_ZIL_OBJECT		(0ULL) | #define	ZB_ZIL_OBJECT		(0ULL) | ||||||
| #define	ZB_ZIL_LEVEL		(-2LL) | #define	ZB_ZIL_LEVEL		(-2LL) | ||||||
| 
 | 
 | ||||||
|  | #define	ZB_DNODE_LEVEL		(-3LL) | ||||||
|  | #define	ZB_DNODE_BLKID		(0ULL) | ||||||
|  | 
 | ||||||
| #define	ZB_IS_ZERO(zb)						\ | #define	ZB_IS_ZERO(zb)						\ | ||||||
| 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\ | 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\ | ||||||
| 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0) | 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0) | ||||||
| @ -599,8 +593,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, | |||||||
| extern void spa_handle_ignored_writes(spa_t *spa); | extern void spa_handle_ignored_writes(spa_t *spa); | ||||||
| 
 | 
 | ||||||
| /* zbookmark_phys functions */ | /* zbookmark_phys functions */ | ||||||
| boolean_t zbookmark_is_before(const struct dnode_phys *dnp, | boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, | ||||||
|     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); |     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); | ||||||
|  | int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, | ||||||
|  |     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); | ||||||
| 
 | 
 | ||||||
| #ifdef	__cplusplus | #ifdef	__cplusplus | ||||||
| } | } | ||||||
|  | |||||||
| @ -44,7 +44,7 @@ typedef const struct zio_checksum_info { | |||||||
| 	zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ | 	zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ | ||||||
| 	int		ci_correctable;	/* number of correctable bits	*/ | 	int		ci_correctable;	/* number of correctable bits	*/ | ||||||
| 	int		ci_eck;		/* uses zio embedded checksum? */ | 	int		ci_eck;		/* uses zio embedded checksum? */ | ||||||
| 	int		ci_dedup;	/* strong enough for dedup? */ | 	boolean_t	ci_dedup;	/* strong enough for dedup? */ | ||||||
| 	char		*ci_name;	/* descriptive name */ | 	char		*ci_name;	/* descriptive name */ | ||||||
| } zio_checksum_info_t; | } zio_checksum_info_t; | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										40
									
								
								include/sys/zio_priority.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								include/sys/zio_priority.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,40 @@ | |||||||
|  | /*
 | ||||||
|  |  * CDDL HEADER START | ||||||
|  |  * | ||||||
|  |  * This file and its contents are supplied under the terms of the | ||||||
|  |  * Common Development and Distribution License ("CDDL"), version 1.0. | ||||||
|  |  * You may only use this file in accordance with the terms of version | ||||||
|  |  * 1.0 of the CDDL. | ||||||
|  |  * | ||||||
|  |  * A full copy of the text of the CDDL should have accompanied this | ||||||
|  |  * source.  A copy of the CDDL is also available via the Internet at | ||||||
|  |  * http://www.illumos.org/license/CDDL.
 | ||||||
|  |  * | ||||||
|  |  * CDDL HEADER END | ||||||
|  |  */ | ||||||
|  | /*
 | ||||||
|  |  * Copyright (c) 2014 by Delphix. All rights reserved. | ||||||
|  |  */ | ||||||
|  | #ifndef	_ZIO_PRIORITY_H | ||||||
|  | #define	_ZIO_PRIORITY_H | ||||||
|  | 
 | ||||||
|  | #ifdef	__cplusplus | ||||||
|  | extern "C" { | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | typedef enum zio_priority { | ||||||
|  | 	ZIO_PRIORITY_SYNC_READ, | ||||||
|  | 	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */ | ||||||
|  | 	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */ | ||||||
|  | 	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */ | ||||||
|  | 	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */ | ||||||
|  | 	ZIO_PRIORITY_NUM_QUEUEABLE, | ||||||
|  | 
 | ||||||
|  | 	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */ | ||||||
|  | } zio_priority_t; | ||||||
|  | 
 | ||||||
|  | #ifdef	__cplusplus | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #endif	/* _ZIO_PRIORITY_H */ | ||||||
| @ -3529,7 +3529,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| zbookmark_compare(const void *a, const void *b) | zbookmark_mem_compare(const void *a, const void *b) | ||||||
| { | { | ||||||
| 	return (memcmp(a, b, sizeof (zbookmark_phys_t))); | 	return (memcmp(a, b, sizeof (zbookmark_phys_t))); | ||||||
| } | } | ||||||
| @ -3592,7 +3592,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) | |||||||
| 	    zc.zc_nvlist_dst_size; | 	    zc.zc_nvlist_dst_size; | ||||||
| 	count -= zc.zc_nvlist_dst_size; | 	count -= zc.zc_nvlist_dst_size; | ||||||
| 
 | 
 | ||||||
| 	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare); | 	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); | ||||||
| 
 | 
 | ||||||
| 	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); | 	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -63,8 +63,9 @@ | |||||||
| /* in libzfs_dataset.c */ | /* in libzfs_dataset.c */ | ||||||
| extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); | extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); | ||||||
| 
 | 
 | ||||||
| static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, | static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, | ||||||
|     int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); |     recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, | ||||||
|  |     uint64_t *); | ||||||
| 
 | 
 | ||||||
| static const zio_cksum_t zero_cksum = { { 0 } }; | static const zio_cksum_t zero_cksum = { { 0 } }; | ||||||
| 
 | 
 | ||||||
| @ -2523,7 +2524,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, | |||||||
| 		 * zfs_receive_one() will take care of it (ie, | 		 * zfs_receive_one() will take care of it (ie, | ||||||
| 		 * recv_skip() and return 0). | 		 * recv_skip() and return 0). | ||||||
| 		 */ | 		 */ | ||||||
| 		error = zfs_receive_impl(hdl, destname, flags, fd, | 		error = zfs_receive_impl(hdl, destname, NULL, flags, fd, | ||||||
| 		    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, | 		    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, | ||||||
| 		    action_handlep); | 		    action_handlep); | ||||||
| 		if (error == ENODATA) { | 		if (error == ENODATA) { | ||||||
| @ -2656,9 +2657,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) | |||||||
|  */ |  */ | ||||||
| static int | static int | ||||||
| zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | ||||||
|     recvflags_t *flags, dmu_replay_record_t *drr, |     const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, | ||||||
|     dmu_replay_record_t *drr_noswap, const char *sendfs, |     dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, | ||||||
|     nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, |     avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, | ||||||
|     uint64_t *action_handlep) |     uint64_t *action_handlep) | ||||||
| { | { | ||||||
| 	zfs_cmd_t zc = {"\0"}; | 	zfs_cmd_t zc = {"\0"}; | ||||||
| @ -2808,10 +2809,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | |||||||
| 		} | 		} | ||||||
| 		if (flags->verbose) | 		if (flags->verbose) | ||||||
| 			(void) printf("found clone origin %s\n", zc.zc_string); | 			(void) printf("found clone origin %s\n", zc.zc_string); | ||||||
|  | 	} else if (originsnap) { | ||||||
|  | 		(void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN); | ||||||
|  | 		if (flags->verbose) | ||||||
|  | 			(void) printf("using provided clone origin %s\n", | ||||||
|  | 			    zc.zc_string); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	stream_wantsnewfs = (drrb->drr_fromguid == 0 || | 	stream_wantsnewfs = (drrb->drr_fromguid == 0 || | ||||||
| 	    (drrb->drr_flags & DRR_FLAG_CLONE)); | 	    (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap); | ||||||
| 
 | 
 | ||||||
| 	if (stream_wantsnewfs) { | 	if (stream_wantsnewfs) { | ||||||
| 		/*
 | 		/*
 | ||||||
| @ -3189,9 +3195,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, | ||||||
|     int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, |     const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, | ||||||
|     char **top_zfs, int cleanup_fd, uint64_t *action_handlep) |     nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, | ||||||
|  |     uint64_t *action_handlep) | ||||||
| { | { | ||||||
| 	int err; | 	int err; | ||||||
| 	dmu_replay_record_t drr, drr_noswap; | 	dmu_replay_record_t drr, drr_noswap; | ||||||
| @ -3210,6 +3217,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |||||||
| 		    "(%s) does not exist"), tosnap); | 		    "(%s) does not exist"), tosnap); | ||||||
| 		return (zfs_error(hdl, EZFS_NOENT, errbuf)); | 		return (zfs_error(hdl, EZFS_NOENT, errbuf)); | ||||||
| 	} | 	} | ||||||
|  | 	if (originsnap && | ||||||
|  | 	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { | ||||||
|  | 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " | ||||||
|  | 		    "(%s) does not exist"), originsnap); | ||||||
|  | 		return (zfs_error(hdl, EZFS_NOENT, errbuf)); | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	/* read in the BEGIN record */ | 	/* read in the BEGIN record */ | ||||||
| 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, | 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, | ||||||
| @ -3282,14 +3295,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |||||||
| 				*cp = '\0'; | 				*cp = '\0'; | ||||||
| 			sendfs = nonpackage_sendfs; | 			sendfs = nonpackage_sendfs; | ||||||
| 		} | 		} | ||||||
| 		return (zfs_receive_one(hdl, infd, tosnap, flags, | 		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, | ||||||
| 		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, | 		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, | ||||||
| 		    top_zfs, cleanup_fd, action_handlep)); | 		    cleanup_fd, action_handlep)); | ||||||
| 	} else { | 	} else { | ||||||
| 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == | 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == | ||||||
| 		    DMU_COMPOUNDSTREAM); | 		    DMU_COMPOUNDSTREAM); | ||||||
| 		return (zfs_receive_package(hdl, infd, tosnap, flags, | 		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, | ||||||
| 		    &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); | 		    &zcksum, top_zfs, cleanup_fd, action_handlep)); | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -3300,14 +3313,15 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |||||||
|  * (-1 will override -2). |  * (-1 will override -2). | ||||||
|  */ |  */ | ||||||
| int | int | ||||||
| zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, | ||||||
|     int infd, avl_tree_t *stream_avl) |     recvflags_t *flags, int infd, avl_tree_t *stream_avl) | ||||||
| { | { | ||||||
| 	char *top_zfs = NULL; | 	char *top_zfs = NULL; | ||||||
| 	int err; | 	int err; | ||||||
| 	int cleanup_fd; | 	int cleanup_fd; | ||||||
| 	uint64_t action_handle = 0; | 	uint64_t action_handle = 0; | ||||||
| 	struct stat sb; | 	struct stat sb; | ||||||
|  | 	char *originsnap = NULL; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * The only way fstat can fail is if we do not have a valid file | 	 * The only way fstat can fail is if we do not have a valid file | ||||||
| @ -3350,10 +3364,16 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, | |||||||
| 	} | 	} | ||||||
| #endif /* __linux__ */ | #endif /* __linux__ */ | ||||||
| 
 | 
 | ||||||
|  | 	if (props) { | ||||||
|  | 		err = nvlist_lookup_string(props, "origin", &originsnap); | ||||||
|  | 		if (err && err != ENOENT) | ||||||
|  | 			return (err); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	cleanup_fd = open(ZFS_DEV, O_RDWR); | 	cleanup_fd = open(ZFS_DEV, O_RDWR); | ||||||
| 	VERIFY(cleanup_fd >= 0); | 	VERIFY(cleanup_fd >= 0); | ||||||
| 
 | 
 | ||||||
| 	err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, | 	err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, | ||||||
| 	    stream_avl, &top_zfs, cleanup_fd, &action_handle); | 	    stream_avl, &top_zfs, cleanup_fd, &action_handle); | ||||||
| 
 | 
 | ||||||
| 	VERIFY(0 == close(cleanup_fd)); | 	VERIFY(0 == close(cleanup_fd)); | ||||||
|  | |||||||
| @ -32,6 +32,7 @@ KERNEL_C = \ | |||||||
| 	bplist.c \
 | 	bplist.c \
 | ||||||
| 	bpobj.c \
 | 	bpobj.c \
 | ||||||
| 	bptree.c \
 | 	bptree.c \
 | ||||||
|  | 	bqueue.c \
 | ||||||
| 	dbuf.c \
 | 	dbuf.c \
 | ||||||
| 	dbuf_stats.c \
 | 	dbuf_stats.c \
 | ||||||
| 	ddt.c \
 | 	ddt.c \
 | ||||||
|  | |||||||
| @ -184,12 +184,12 @@ zfs \- configures ZFS file systems | |||||||
| 
 | 
 | ||||||
| .LP | .LP | ||||||
| .nf | .nf | ||||||
| \fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR | \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR | ||||||
| .fi | .fi | ||||||
| 
 | 
 | ||||||
| .LP | .LP | ||||||
| .nf | .nf | ||||||
| \fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR | \fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR | ||||||
| .fi | .fi | ||||||
| 
 | 
 | ||||||
| .LP | .LP | ||||||
| @ -2929,11 +2929,11 @@ then the receiving system must have that feature enabled as well. See | |||||||
| .ne 2 | .ne 2 | ||||||
| .mk | .mk | ||||||
| .na | .na | ||||||
| \fB\fBzfs receive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR | \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR | ||||||
| .ad | .ad | ||||||
| .br | .br | ||||||
| .na | .na | ||||||
| \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR\fR | \fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR | ||||||
| .ad | .ad | ||||||
| .sp .6 | .sp .6 | ||||||
| .RS 4n | .RS 4n | ||||||
| @ -3001,6 +3001,17 @@ Print verbose information about the stream and the time required to perform the | |||||||
| Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. | Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use. | ||||||
| .RE | .RE | ||||||
| 
 | 
 | ||||||
|  | .sp | ||||||
|  | .ne 2 | ||||||
|  | .mk | ||||||
|  | .na | ||||||
|  | \fB\fB-o\fR \fBorigin\fR=\fIsnapshot\fR | ||||||
|  | .ad | ||||||
|  | .sp .6 | ||||||
|  | .RS 4n | ||||||
|  | Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin. | ||||||
|  | .RE | ||||||
|  | 
 | ||||||
| .sp | .sp | ||||||
| .ne 2 | .ne 2 | ||||||
| .mk | .mk | ||||||
|  | |||||||
| @ -14,6 +14,7 @@ $(MODULE)-objs += bpobj.o | |||||||
| $(MODULE)-objs += dbuf.o | $(MODULE)-objs += dbuf.o | ||||||
| $(MODULE)-objs += dbuf_stats.o | $(MODULE)-objs += dbuf_stats.o | ||||||
| $(MODULE)-objs += bptree.o | $(MODULE)-objs += bptree.o | ||||||
|  | $(MODULE)-objs += bqueue.o | ||||||
| $(MODULE)-objs += ddt.o | $(MODULE)-objs += ddt.o | ||||||
| $(MODULE)-objs += ddt_zap.o | $(MODULE)-objs += ddt_zap.o | ||||||
| $(MODULE)-objs += dmu.o | $(MODULE)-objs += dmu.o | ||||||
|  | |||||||
| @ -156,7 +156,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	int err; | 	int err; | ||||||
| 	struct bptree_args *ba = arg; | 	struct bptree_args *ba = arg; | ||||||
| 
 | 
 | ||||||
| 	if (BP_IS_HOLE(bp)) | 	if (bp == NULL || BP_IS_HOLE(bp)) | ||||||
| 		return (0); | 		return (0); | ||||||
| 
 | 
 | ||||||
| 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); | 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); | ||||||
|  | |||||||
							
								
								
									
										111
									
								
								module/zfs/bqueue.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										111
									
								
								module/zfs/bqueue.c
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,111 @@ | |||||||
|  | /*
 | ||||||
|  |  * CDDL HEADER START | ||||||
|  |  * | ||||||
|  |  * This file and its contents are supplied under the terms of the | ||||||
|  |  * Common Development and Distribution License ("CDDL"), version 1.0. | ||||||
|  |  * You may only use this file in accordance with the terms of version | ||||||
|  |  * 1.0 of the CDDL. | ||||||
|  |  * | ||||||
|  |  * A full copy of the text of the CDDL should have accompanied this | ||||||
|  |  * source.  A copy of the CDDL is also available via the Internet at | ||||||
|  |  * http://www.illumos.org/license/CDDL.
 | ||||||
|  |  * | ||||||
|  |  * CDDL HEADER END | ||||||
|  |  */ | ||||||
|  | /*
 | ||||||
|  |  * Copyright (c) 2014 by Delphix. All rights reserved. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | #include	<sys/bqueue.h> | ||||||
|  | #include	<sys/zfs_context.h> | ||||||
|  | 
 | ||||||
|  | static inline bqueue_node_t * | ||||||
|  | obj2node(bqueue_t *q, void *data) | ||||||
|  | { | ||||||
|  | 	return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Initialize a blocking queue.  The maximum capacity of the queue is set to | ||||||
|  |  * size.  Types that want to be stored in a bqueue must contain a bqueue_node_t, | ||||||
|  |  * and offset should give its offset from the start of the struct.  Return 0 on | ||||||
|  |  * success, or -1 on failure. | ||||||
|  |  */ | ||||||
|  | int | ||||||
|  | bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) | ||||||
|  | { | ||||||
|  | 	list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), | ||||||
|  | 	    node_offset + offsetof(bqueue_node_t, bqn_node)); | ||||||
|  | 	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); | ||||||
|  | 	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); | ||||||
|  | 	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); | ||||||
|  | 	q->bq_node_offset = node_offset; | ||||||
|  | 	q->bq_size = 0; | ||||||
|  | 	q->bq_maxsize = size; | ||||||
|  | 	return (0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Destroy a blocking queue.  This function asserts that there are no | ||||||
|  |  * elements in the queue, and no one is blocked on the condition | ||||||
|  |  * variables. | ||||||
|  |  */ | ||||||
|  | void | ||||||
|  | bqueue_destroy(bqueue_t *q) | ||||||
|  | { | ||||||
|  | 	ASSERT0(q->bq_size); | ||||||
|  | 	cv_destroy(&q->bq_add_cv); | ||||||
|  | 	cv_destroy(&q->bq_pop_cv); | ||||||
|  | 	mutex_destroy(&q->bq_lock); | ||||||
|  | 	list_destroy(&q->bq_list); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Add data to q, consuming size units of capacity.  If there is insufficient | ||||||
|  |  * capacity to consume size units, block until capacity exists.  Asserts size is | ||||||
|  |  * > 0. | ||||||
|  |  */ | ||||||
|  | void | ||||||
|  | bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) | ||||||
|  | { | ||||||
|  | 	ASSERT3U(item_size, >, 0); | ||||||
|  | 	ASSERT3U(item_size, <, q->bq_maxsize); | ||||||
|  | 	mutex_enter(&q->bq_lock); | ||||||
|  | 	obj2node(q, data)->bqn_size = item_size; | ||||||
|  | 	while (q->bq_size + item_size > q->bq_maxsize) { | ||||||
|  | 		cv_wait(&q->bq_add_cv, &q->bq_lock); | ||||||
|  | 	} | ||||||
|  | 	q->bq_size += item_size; | ||||||
|  | 	list_insert_tail(&q->bq_list, data); | ||||||
|  | 	cv_signal(&q->bq_pop_cv); | ||||||
|  | 	mutex_exit(&q->bq_lock); | ||||||
|  | } | ||||||
|  | /*
 | ||||||
|  |  * Take the first element off of q.  If there are no elements on the queue, wait | ||||||
|  |  * until one is put there.  Return the removed element. | ||||||
|  |  */ | ||||||
|  | void * | ||||||
|  | bqueue_dequeue(bqueue_t *q) | ||||||
|  | { | ||||||
|  | 	void *ret; | ||||||
|  | 	uint64_t item_size; | ||||||
|  | 	mutex_enter(&q->bq_lock); | ||||||
|  | 	while (q->bq_size == 0) { | ||||||
|  | 		cv_wait(&q->bq_pop_cv, &q->bq_lock); | ||||||
|  | 	} | ||||||
|  | 	ret = list_remove_head(&q->bq_list); | ||||||
|  | 	item_size = obj2node(q, ret)->bqn_size; | ||||||
|  | 	q->bq_size -= item_size; | ||||||
|  | 	mutex_exit(&q->bq_lock); | ||||||
|  | 	cv_signal(&q->bq_add_cv); | ||||||
|  | 	return (ret); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Returns true if the space used is 0. | ||||||
|  |  */ | ||||||
|  | boolean_t | ||||||
|  | bqueue_empty(bqueue_t *q) | ||||||
|  | { | ||||||
|  | 	return (q->bq_size == 0); | ||||||
|  | } | ||||||
| @ -51,7 +51,8 @@ struct dbuf_hold_impl_data { | |||||||
| 	dnode_t *dh_dn; | 	dnode_t *dh_dn; | ||||||
| 	uint8_t dh_level; | 	uint8_t dh_level; | ||||||
| 	uint64_t dh_blkid; | 	uint64_t dh_blkid; | ||||||
| 	int dh_fail_sparse; | 	boolean_t dh_fail_sparse; | ||||||
|  | 	boolean_t dh_fail_uncached; | ||||||
| 	void *dh_tag; | 	void *dh_tag; | ||||||
| 	dmu_buf_impl_t **dh_dbp; | 	dmu_buf_impl_t **dh_dbp; | ||||||
| 	/* Local variables */ | 	/* Local variables */ | ||||||
| @ -65,8 +66,9 @@ struct dbuf_hold_impl_data { | |||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, | static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, | ||||||
|     dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, |     dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, | ||||||
|     void *tag, dmu_buf_impl_t **dbp, int depth); | 	boolean_t fail_uncached, | ||||||
|  | 	void *tag, dmu_buf_impl_t **dbp, int depth); | ||||||
| static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); | static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
| @ -604,11 +606,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) | |||||||
| 	return (abuf); | 	return (abuf); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Calculate which level n block references the data at the level 0 offset | ||||||
|  |  * provided. | ||||||
|  |  */ | ||||||
| uint64_t | uint64_t | ||||||
| dbuf_whichblock(dnode_t *dn, uint64_t offset) | dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) | ||||||
| { | { | ||||||
| 	if (dn->dn_datablkshift) { | 	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { | ||||||
| 		return (offset >> dn->dn_datablkshift); | 		/*
 | ||||||
|  | 		 * The level n blkid is equal to the level 0 blkid divided by | ||||||
|  | 		 * the number of level 0s in a level n block. | ||||||
|  | 		 * | ||||||
|  | 		 * The level 0 blkid is offset >> datablkshift = | ||||||
|  | 		 * offset / 2^datablkshift. | ||||||
|  | 		 * | ||||||
|  | 		 * The number of level 0s in a level n is the number of block | ||||||
|  | 		 * pointers in an indirect block, raised to the power of level. | ||||||
|  | 		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = | ||||||
|  | 		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). | ||||||
|  | 		 * | ||||||
|  | 		 * Thus, the level n blkid is: offset / | ||||||
|  | 		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))) | ||||||
|  | 		 * = offset / 2^(datablkshift + level * | ||||||
|  | 		 *   (indblkshift - SPA_BLKPTRSHIFT)) | ||||||
|  | 		 * = offset >> (datablkshift + level * | ||||||
|  | 		 *   (indblkshift - SPA_BLKPTRSHIFT)) | ||||||
|  | 		 */ | ||||||
|  | 		return (offset >> (dn->dn_datablkshift + level * | ||||||
|  | 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); | ||||||
| 	} else { | 	} else { | ||||||
| 		ASSERT3U(offset, <, dn->dn_datablksz); | 		ASSERT3U(offset, <, dn->dn_datablksz); | ||||||
| 		return (0); | 		return (0); | ||||||
| @ -1786,6 +1812,12 @@ dbuf_clear(dmu_buf_impl_t *db) | |||||||
| 		dbuf_rele(parent, db); | 		dbuf_rele(parent, db); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Note: While bpp will always be updated if the function returns success, | ||||||
|  |  * parentp will not be updated if the dnode does not have dn_dbuf filled in; | ||||||
|  |  * this happens when the dnode is the meta-dnode, or a userused or groupused | ||||||
|  |  * object. | ||||||
|  |  */ | ||||||
| __attribute__((always_inline)) | __attribute__((always_inline)) | ||||||
| static inline int | static inline int | ||||||
| dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | ||||||
| @ -1828,12 +1860,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, | |||||||
| 		/* this block is referenced from an indirect block */ | 		/* this block is referenced from an indirect block */ | ||||||
| 		int err; | 		int err; | ||||||
| 		if (dh == NULL) { | 		if (dh == NULL) { | ||||||
| 			err = dbuf_hold_impl(dn, level+1, blkid >> epbs, | 			err = dbuf_hold_impl(dn, level+1, | ||||||
| 					fail_sparse, NULL, parentp); | 			    blkid >> epbs, fail_sparse, FALSE, NULL, parentp); | ||||||
| 		} else { | 		} else { | ||||||
| 			__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, | 			__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, | ||||||
| 					blkid >> epbs, fail_sparse, NULL, | 			    blkid >> epbs, fail_sparse, FALSE, NULL, | ||||||
| 					parentp, dh->dh_depth + 1); | 			    parentp, dh->dh_depth + 1); | ||||||
| 			err = __dbuf_hold_impl(dh + 1); | 			err = __dbuf_hold_impl(dh + 1); | ||||||
| 		} | 		} | ||||||
| 		if (err) | 		if (err) | ||||||
| @ -2011,11 +2043,102 @@ dbuf_destroy(dmu_buf_impl_t *db) | |||||||
| 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); | 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void | typedef struct dbuf_prefetch_arg { | ||||||
| dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */ | ||||||
|  | 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ | ||||||
|  | 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ | ||||||
|  | 	int dpa_curlevel; /* The current level that we're reading */ | ||||||
|  | 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ | ||||||
|  | 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ | ||||||
|  | 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ | ||||||
|  | } dbuf_prefetch_arg_t; | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Actually issue the prefetch read for the block given. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) | ||||||
| { | { | ||||||
| 	dmu_buf_impl_t *db = NULL; | 	arc_flags_t aflags; | ||||||
| 	blkptr_t *bp = NULL; | 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||||
|  | 
 | ||||||
|  | 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); | ||||||
|  | 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); | ||||||
|  | 	ASSERT(dpa->dpa_zio != NULL); | ||||||
|  | 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, | ||||||
|  | 	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | ||||||
|  | 	    &aflags, &dpa->dpa_zb); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Called when an indirect block above our prefetch target is read in.  This | ||||||
|  |  * will either read in the next indirect block down the tree or issue the actual | ||||||
|  |  * prefetch if the next block down is our target. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) | ||||||
|  | { | ||||||
|  | 	dbuf_prefetch_arg_t *dpa = private; | ||||||
|  | 	uint64_t nextblkid; | ||||||
|  | 	blkptr_t *bp; | ||||||
|  | 
 | ||||||
|  | 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); | ||||||
|  | 	ASSERT3S(dpa->dpa_curlevel, >, 0); | ||||||
|  | 	if (zio != NULL) { | ||||||
|  | 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); | ||||||
|  | 		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); | ||||||
|  | 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	dpa->dpa_curlevel--; | ||||||
|  | 
 | ||||||
|  | 	nextblkid = dpa->dpa_zb.zb_blkid >> | ||||||
|  | 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); | ||||||
|  | 	bp = ((blkptr_t *)abuf->b_data) + | ||||||
|  | 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); | ||||||
|  | 	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { | ||||||
|  | 		kmem_free(dpa, sizeof (*dpa)); | ||||||
|  | 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { | ||||||
|  | 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); | ||||||
|  | 		dbuf_issue_final_prefetch(dpa, bp); | ||||||
|  | 		kmem_free(dpa, sizeof (*dpa)); | ||||||
|  | 	} else { | ||||||
|  | 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; | ||||||
|  | 		zbookmark_phys_t zb; | ||||||
|  | 
 | ||||||
|  | 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); | ||||||
|  | 
 | ||||||
|  | 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, | ||||||
|  | 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); | ||||||
|  | 
 | ||||||
|  | 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, | ||||||
|  | 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, | ||||||
|  | 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | ||||||
|  | 		    &iter_aflags, &zb); | ||||||
|  | 	} | ||||||
|  | 	(void) arc_buf_remove_ref(abuf, private); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Issue prefetch reads for the given block on the given level.  If the indirect | ||||||
|  |  * blocks above that block are not in memory, we will read them in | ||||||
|  |  * asynchronously.  As a result, this call never blocks waiting for a read to | ||||||
|  |  * complete. | ||||||
|  |  */ | ||||||
|  | void | ||||||
|  | dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, | ||||||
|  |     arc_flags_t aflags) | ||||||
|  | { | ||||||
|  | 	blkptr_t bp; | ||||||
|  | 	int epbs, nlevels, curlevel; | ||||||
|  | 	uint64_t curblkid; | ||||||
|  | 	dmu_buf_impl_t *db; | ||||||
|  | 	zio_t *pio; | ||||||
|  | 	dbuf_prefetch_arg_t *dpa; | ||||||
|  | 	dsl_dataset_t *ds; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(blkid != DMU_BONUS_BLKID); | 	ASSERT(blkid != DMU_BONUS_BLKID); | ||||||
| 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); | 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); | ||||||
| @ -2023,35 +2146,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | |||||||
| 	if (dnode_block_freed(dn, blkid)) | 	if (dnode_block_freed(dn, blkid)) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	/* dbuf_find() returns with db_mtx held */ | 	/*
 | ||||||
| 	if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) { | 	 * This dnode hasn't been written to disk yet, so there's nothing to | ||||||
| 		/*
 | 	 * prefetch. | ||||||
| 		 * This dbuf is already in the cache.  We assume that | 	 */ | ||||||
| 		 * it is already CACHED, or else about to be either | 	nlevels = dn->dn_phys->dn_nlevels; | ||||||
| 		 * read or filled. | 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) | ||||||
| 		 */ | 		return; | ||||||
|  | 
 | ||||||
|  | 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; | ||||||
|  | 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	db = dbuf_find(dn->dn_objset, dn->dn_object, | ||||||
|  | 	    level, blkid); | ||||||
|  | 	if (db != NULL) { | ||||||
| 		mutex_exit(&db->db_mtx); | 		mutex_exit(&db->db_mtx); | ||||||
|  | 		/*
 | ||||||
|  | 		 * This dbuf already exists.  It is either CACHED, or | ||||||
|  | 		 * (we assume) about to be read or filled. | ||||||
|  | 		 */ | ||||||
| 		return; | 		return; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { | 	/*
 | ||||||
| 		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { | 	 * Find the closest ancestor (indirect block) of the target block | ||||||
| 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; | 	 * that is present in the cache.  In this indirect block, we will | ||||||
| 			arc_flags_t aflags = | 	 * find the bp that is at curlevel, curblkid. | ||||||
| 			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | 	 */ | ||||||
| 			zbookmark_phys_t zb; | 	curlevel = level; | ||||||
|  | 	curblkid = blkid; | ||||||
|  | 	while (curlevel < nlevels - 1) { | ||||||
|  | 		int parent_level = curlevel + 1; | ||||||
|  | 		uint64_t parent_blkid = curblkid >> epbs; | ||||||
|  | 		dmu_buf_impl_t *db; | ||||||
| 
 | 
 | ||||||
| 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, | 		if (dbuf_hold_impl(dn, parent_level, parent_blkid, | ||||||
| 			    dn->dn_object, 0, blkid); | 		    FALSE, TRUE, FTAG, &db) == 0) { | ||||||
| 
 | 			blkptr_t *bpp = db->db_buf->b_data; | ||||||
| 			(void) arc_read(NULL, dn->dn_objset->os_spa, | 			bp = bpp[P2PHASE(curblkid, 1 << epbs)]; | ||||||
| 			    bp, NULL, NULL, prio, | 			dbuf_rele(db, FTAG); | ||||||
| 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | 			break; | ||||||
| 			    &aflags, &zb); |  | ||||||
| 		} | 		} | ||||||
| 		if (db) | 
 | ||||||
| 			dbuf_rele(db, NULL); | 		curlevel = parent_level; | ||||||
|  | 		curblkid = parent_blkid; | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	if (curlevel == nlevels - 1) { | ||||||
|  | 		/* No cached indirect blocks found. */ | ||||||
|  | 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); | ||||||
|  | 		bp = dn->dn_phys->dn_blkptr[curblkid]; | ||||||
|  | 	} | ||||||
|  | 	if (BP_IS_HOLE(&bp)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); | ||||||
|  | 
 | ||||||
|  | 	pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, | ||||||
|  | 	    ZIO_FLAG_CANFAIL); | ||||||
|  | 
 | ||||||
|  | 	dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); | ||||||
|  | 	ds = dn->dn_objset->os_dsl_dataset; | ||||||
|  | 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, | ||||||
|  | 	    dn->dn_object, level, blkid); | ||||||
|  | 	dpa->dpa_curlevel = curlevel; | ||||||
|  | 	dpa->dpa_prio = prio; | ||||||
|  | 	dpa->dpa_aflags = aflags; | ||||||
|  | 	dpa->dpa_spa = dn->dn_objset->os_spa; | ||||||
|  | 	dpa->dpa_epbs = epbs; | ||||||
|  | 	dpa->dpa_zio = pio; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * If we have the indirect just above us, no need to do the asynchronous | ||||||
|  | 	 * prefetch chain; we'll just run the last step ourselves.  If we're at | ||||||
|  | 	 * a higher level, though, we want to issue the prefetches for all the | ||||||
|  | 	 * indirect blocks asynchronously, so we can go on with whatever we were | ||||||
|  | 	 * doing. | ||||||
|  | 	 */ | ||||||
|  | 	if (curlevel == level) { | ||||||
|  | 		ASSERT3U(curblkid, ==, blkid); | ||||||
|  | 		dbuf_issue_final_prefetch(dpa, &bp); | ||||||
|  | 		kmem_free(dpa, sizeof (*dpa)); | ||||||
|  | 	} else { | ||||||
|  | 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; | ||||||
|  | 		zbookmark_phys_t zb; | ||||||
|  | 
 | ||||||
|  | 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, | ||||||
|  | 		    dn->dn_object, curlevel, curblkid); | ||||||
|  | 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, | ||||||
|  | 		    &bp, dbuf_prefetch_indirect_done, dpa, prio, | ||||||
|  | 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, | ||||||
|  | 		    &iter_aflags, &zb); | ||||||
|  | 	} | ||||||
|  | 	/*
 | ||||||
|  | 	 * We use pio here instead of dpa_zio since it's possible that | ||||||
|  | 	 * dpa may have already been freed. | ||||||
|  | 	 */ | ||||||
|  | 	zio_nowait(pio); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define	DBUF_HOLD_IMPL_MAX_DEPTH	20 | #define	DBUF_HOLD_IMPL_MAX_DEPTH	20 | ||||||
| @ -2079,6 +2271,9 @@ top: | |||||||
| 	if (dh->dh_db == NULL) { | 	if (dh->dh_db == NULL) { | ||||||
| 		dh->dh_bp = NULL; | 		dh->dh_bp = NULL; | ||||||
| 
 | 
 | ||||||
|  | 		if (dh->dh_fail_uncached) | ||||||
|  | 			return (SET_ERROR(ENOENT)); | ||||||
|  | 
 | ||||||
| 		ASSERT3P(dh->dh_parent, ==, NULL); | 		ASSERT3P(dh->dh_parent, ==, NULL); | ||||||
| 		dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, | 		dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, | ||||||
| 					dh->dh_fail_sparse, &dh->dh_parent, | 					dh->dh_fail_sparse, &dh->dh_parent, | ||||||
| @ -2099,6 +2294,11 @@ top: | |||||||
| 					dh->dh_parent, dh->dh_bp); | 					dh->dh_parent, dh->dh_bp); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { | ||||||
|  | 		mutex_exit(&dh->dh_db->db_mtx); | ||||||
|  | 		return (SET_ERROR(ENOENT)); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { | 	if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { | ||||||
| 		arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); | 		arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); | ||||||
| 		if (dh->dh_db->db_buf->b_data == NULL) { | 		if (dh->dh_db->db_buf->b_data == NULL) { | ||||||
| @ -2159,7 +2359,8 @@ top: | |||||||
|  * on the stack for 20 levels of recursion. |  * on the stack for 20 levels of recursion. | ||||||
|  */ |  */ | ||||||
| int | int | ||||||
| dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, | ||||||
|  |     boolean_t fail_sparse, boolean_t fail_uncached, | ||||||
|     void *tag, dmu_buf_impl_t **dbp) |     void *tag, dmu_buf_impl_t **dbp) | ||||||
| { | { | ||||||
| 	struct dbuf_hold_impl_data *dh; | 	struct dbuf_hold_impl_data *dh; | ||||||
| @ -2167,7 +2368,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |||||||
| 
 | 
 | ||||||
| 	dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * | 	dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * | ||||||
| 	    DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); | 	    DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); | ||||||
| 	__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0); | 	__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, | ||||||
|  | 		fail_uncached, tag, dbp, 0); | ||||||
| 
 | 
 | ||||||
| 	error = __dbuf_hold_impl(dh); | 	error = __dbuf_hold_impl(dh); | ||||||
| 
 | 
 | ||||||
| @ -2179,13 +2381,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, | |||||||
| 
 | 
 | ||||||
| static void | static void | ||||||
| __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, | __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, | ||||||
|     dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, |     dnode_t *dn, uint8_t level, uint64_t blkid, | ||||||
|     void *tag, dmu_buf_impl_t **dbp, int depth) | 	boolean_t fail_sparse, boolean_t fail_uncached, | ||||||
|  | 	void *tag, dmu_buf_impl_t **dbp, int depth) | ||||||
| { | { | ||||||
| 	dh->dh_dn = dn; | 	dh->dh_dn = dn; | ||||||
| 	dh->dh_level = level; | 	dh->dh_level = level; | ||||||
| 	dh->dh_blkid = blkid; | 	dh->dh_blkid = blkid; | ||||||
|  | 
 | ||||||
| 	dh->dh_fail_sparse = fail_sparse; | 	dh->dh_fail_sparse = fail_sparse; | ||||||
|  | 	dh->dh_fail_uncached = fail_uncached; | ||||||
|  | 
 | ||||||
| 	dh->dh_tag = tag; | 	dh->dh_tag = tag; | ||||||
| 	dh->dh_dbp = dbp; | 	dh->dh_dbp = dbp; | ||||||
| 	dh->dh_depth = depth; | 	dh->dh_depth = depth; | ||||||
| @ -2194,16 +2400,14 @@ __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, | |||||||
| dmu_buf_impl_t * | dmu_buf_impl_t * | ||||||
| dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) | dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) | ||||||
| { | { | ||||||
| 	dmu_buf_impl_t *db; | 	return (dbuf_hold_level(dn, 0, blkid, tag)); | ||||||
| 	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); |  | ||||||
| 	return (err ? NULL : db); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| dmu_buf_impl_t * | dmu_buf_impl_t * | ||||||
| dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) | dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) | ||||||
| { | { | ||||||
| 	dmu_buf_impl_t *db; | 	dmu_buf_impl_t *db; | ||||||
| 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); | 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); | ||||||
| 	return (err ? NULL : db); | 	return (err ? NULL : db); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -2531,8 +2735,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) | |||||||
| 		if (parent == NULL) { | 		if (parent == NULL) { | ||||||
| 			mutex_exit(&db->db_mtx); | 			mutex_exit(&db->db_mtx); | ||||||
| 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 			(void) dbuf_hold_impl(dn, db->db_level+1, | 			parent = dbuf_hold_level(dn, db->db_level + 1, | ||||||
| 			    db->db_blkid >> epbs, FALSE, db, &parent); | 			    db->db_blkid >> epbs, db); | ||||||
| 			rw_exit(&dn->dn_struct_rwlock); | 			rw_exit(&dn->dn_struct_rwlock); | ||||||
| 			mutex_enter(&db->db_mtx); | 			mutex_enter(&db->db_mtx); | ||||||
| 			db->db_parent = parent; | 			db->db_parent = parent; | ||||||
|  | |||||||
| @ -138,7 +138,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, | |||||||
| 	err = dnode_hold(os, object, FTAG, &dn); | 	err = dnode_hold(os, object, FTAG, &dn); | ||||||
| 	if (err) | 	if (err) | ||||||
| 		return (err); | 		return (err); | ||||||
| 	blkid = dbuf_whichblock(dn, offset); | 	blkid = dbuf_whichblock(dn, 0, offset); | ||||||
| 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 	db = dbuf_hold(dn, blkid, tag); | 	db = dbuf_hold(dn, blkid, tag); | ||||||
| 	rw_exit(&dn->dn_struct_rwlock); | 	rw_exit(&dn->dn_struct_rwlock); | ||||||
| @ -421,7 +421,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, | |||||||
| 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); | 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); | ||||||
| 
 | 
 | ||||||
| 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); | 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); | ||||||
| 	blkid = dbuf_whichblock(dn, offset); | 	blkid = dbuf_whichblock(dn, 0, offset); | ||||||
| 	for (i = 0; i < nblks; i++) { | 	for (i = 0; i < nblks; i++) { | ||||||
| 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); | 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); | ||||||
| 		if (db == NULL) { | 		if (db == NULL) { | ||||||
| @ -522,17 +522,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Issue prefetch i/os for the given blocks. |  * Issue prefetch i/os for the given blocks.  If level is greater than 0, the | ||||||
|  |  * indirect blocks prefetched will be those that point to the blocks containing | ||||||
|  |  * the data starting at offset, and continuing to offset + len. | ||||||
|  * |  * | ||||||
|  * Note: The assumption is that we *know* these blocks will be needed |  * Note that if the indirect blocks above the blocks being prefetched are not in | ||||||
|  * almost immediately.  Therefore, the prefetch i/os will be issued at |  * cache, they will be asynchronously read in. | ||||||
|  * ZIO_PRIORITY_SYNC_READ |  | ||||||
|  * |  | ||||||
|  * Note: indirect blocks and other metadata will be read synchronously, |  | ||||||
|  * causing this function to block if they are not already cached. |  | ||||||
|  */ |  */ | ||||||
| void | void | ||||||
| dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, | ||||||
|  |     uint64_t len, zio_priority_t pri) | ||||||
| { | { | ||||||
| 	dnode_t *dn; | 	dnode_t *dn; | ||||||
| 	uint64_t blkid; | 	uint64_t blkid; | ||||||
| @ -548,8 +547,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |||||||
| 			return; | 			return; | ||||||
| 
 | 
 | ||||||
| 		rw_enter(&dn->dn_struct_rwlock, RW_READER); | 		rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); | 		blkid = dbuf_whichblock(dn, level, | ||||||
| 		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); | 		    object * sizeof (dnode_phys_t)); | ||||||
|  | 		dbuf_prefetch(dn, level, blkid, pri, 0); | ||||||
| 		rw_exit(&dn->dn_struct_rwlock); | 		rw_exit(&dn->dn_struct_rwlock); | ||||||
| 		return; | 		return; | ||||||
| 	} | 	} | ||||||
| @ -564,10 +564,16 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 	if (dn->dn_datablkshift) { | 	/*
 | ||||||
| 		int blkshift = dn->dn_datablkshift; | 	 * offset + len - 1 is the last byte we want to prefetch for, and offset | ||||||
| 		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - | 	 * is the first.  Then dbuf_whichblock(dn, level, offset + len - 1) is the | ||||||
| 		    P2ALIGN(offset, 1 << blkshift)) >> blkshift; | 	 * last block we want to prefetch, and dbuf_whichblock(dn, level, | ||||||
|  | 	 * offset)  is the first.  Then the number we need to prefetch is the | ||||||
|  | 	 * last - first + 1. | ||||||
|  | 	 */ | ||||||
|  | 	if (level > 0 || dn->dn_datablkshift != 0) { | ||||||
|  | 		nblks = dbuf_whichblock(dn, level, offset + len - 1) - | ||||||
|  | 		    dbuf_whichblock(dn, level, offset) + 1; | ||||||
| 	} else { | 	} else { | ||||||
| 		nblks = (offset < dn->dn_datablksz); | 		nblks = (offset < dn->dn_datablksz); | ||||||
| 	} | 	} | ||||||
| @ -575,9 +581,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |||||||
| 	if (nblks != 0) { | 	if (nblks != 0) { | ||||||
| 		int i; | 		int i; | ||||||
| 
 | 
 | ||||||
| 		blkid = dbuf_whichblock(dn, offset); | 		blkid = dbuf_whichblock(dn, level, offset); | ||||||
| 		for (i = 0; i < nblks; i++) | 		for (i = 0; i < nblks; i++) | ||||||
| 			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); | 			dbuf_prefetch(dn, level, blkid + i, pri, 0); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	rw_exit(&dn->dn_struct_rwlock); | 	rw_exit(&dn->dn_struct_rwlock); | ||||||
| @ -1293,7 +1299,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, | |||||||
| 	DB_DNODE_ENTER(dbuf); | 	DB_DNODE_ENTER(dbuf); | ||||||
| 	dn = DB_DNODE(dbuf); | 	dn = DB_DNODE(dbuf); | ||||||
| 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | 	rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 	blkid = dbuf_whichblock(dn, offset); | 	blkid = dbuf_whichblock(dn, 0, offset); | ||||||
| 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); | 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); | ||||||
| 	rw_exit(&dn->dn_struct_rwlock); | 	rw_exit(&dn->dn_struct_rwlock); | ||||||
| 	DB_DNODE_EXIT(dbuf); | 	DB_DNODE_EXIT(dbuf); | ||||||
|  | |||||||
| @ -115,7 +115,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	if (issig(JUSTLOOKING) && issig(FORREAL)) | 	if (issig(JUSTLOOKING) && issig(FORREAL)) | ||||||
| 		return (SET_ERROR(EINTR)); | 		return (SET_ERROR(EINTR)); | ||||||
| 
 | 
 | ||||||
| 	if (zb->zb_object != DMU_META_DNODE_OBJECT) | 	if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) | ||||||
| 		return (0); | 		return (0); | ||||||
| 
 | 
 | ||||||
| 	if (BP_IS_HOLE(bp)) { | 	if (BP_IS_HOLE(bp)) { | ||||||
|  | |||||||
| @ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) | |||||||
| 	return (0); | 	return (0); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Return (in *objectp) the next object which is allocated (or a hole) | ||||||
|  |  * after *objectp, taking into account only objects that may have been modified | ||||||
|  |  * after the specified txg. | ||||||
|  |  */ | ||||||
| int | int | ||||||
| dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) | dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) | ||||||
| { | { | ||||||
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -157,7 +157,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, | |||||||
| 		 * If we already visited this bp & everything below, | 		 * If we already visited this bp & everything below, | ||||||
| 		 * don't bother doing it again. | 		 * don't bother doing it again. | ||||||
| 		 */ | 		 */ | ||||||
| 		if (zbookmark_is_before(dnp, zb, td->td_resume)) | 		if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) | ||||||
| 			return (RESUME_SKIP_ALL); | 			return (RESUME_SKIP_ALL); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| @ -428,6 +428,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, | |||||||
| 	int j, err = 0; | 	int j, err = 0; | ||||||
| 	zbookmark_phys_t czb; | 	zbookmark_phys_t czb; | ||||||
| 
 | 
 | ||||||
|  | 	if (td->td_flags & TRAVERSE_PRE) { | ||||||
|  | 		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, | ||||||
|  | 		    ZB_DNODE_BLKID); | ||||||
|  | 		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, | ||||||
|  | 		    td->td_arg); | ||||||
|  | 		if (err == TRAVERSE_VISIT_NO_CHILDREN) | ||||||
|  | 			return (0); | ||||||
|  | 		if (err != 0) | ||||||
|  | 			return (err); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	for (j = 0; j < dnp->dn_nblkptr; j++) { | 	for (j = 0; j < dnp->dn_nblkptr; j++) { | ||||||
| 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); | 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); | ||||||
| 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); | 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); | ||||||
| @ -435,10 +446,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, | |||||||
| 			break; | 			break; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { | 	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { | ||||||
| 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); | 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); | ||||||
| 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); | 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	if (err == 0 && (td->td_flags & TRAVERSE_POST)) { | ||||||
|  | 		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, | ||||||
|  | 		    ZB_DNODE_BLKID); | ||||||
|  | 		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, | ||||||
|  | 		    td->td_arg); | ||||||
|  | 		if (err == TRAVERSE_VISIT_NO_CHILDREN) | ||||||
|  | 			return (0); | ||||||
|  | 		if (err != 0) | ||||||
|  | 			return (err); | ||||||
|  | 	} | ||||||
| 	return (err); | 	return (err); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -451,6 +473,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(pfd->pd_bytes_fetched >= 0); | 	ASSERT(pfd->pd_bytes_fetched >= 0); | ||||||
|  | 	if (bp == NULL) | ||||||
|  | 		return (0); | ||||||
| 	if (pfd->pd_cancel) | 	if (pfd->pd_cancel) | ||||||
| 		return (SET_ERROR(EINTR)); | 		return (SET_ERROR(EINTR)); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -332,7 +332,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |||||||
| 			dmu_buf_impl_t *db; | 			dmu_buf_impl_t *db; | ||||||
| 
 | 
 | ||||||
| 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); | 			err = dbuf_hold_impl(dn, 0, start, | ||||||
|  | 			    FALSE, FALSE, FTAG, &db); | ||||||
| 			rw_exit(&dn->dn_struct_rwlock); | 			rw_exit(&dn->dn_struct_rwlock); | ||||||
| 
 | 
 | ||||||
| 			if (err) { | 			if (err) { | ||||||
| @ -533,7 +534,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |||||||
| 		blkoff = P2PHASE(blkid, epb); | 		blkoff = P2PHASE(blkid, epb); | ||||||
| 		tochk = MIN(epb - blkoff, nblks); | 		tochk = MIN(epb - blkoff, nblks); | ||||||
| 
 | 
 | ||||||
| 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); | 		err = dbuf_hold_impl(dn, 1, blkid >> epbs, | ||||||
|  | 		    FALSE, FALSE, FTAG, &dbuf); | ||||||
| 		if (err) { | 		if (err) { | ||||||
| 			txh->txh_tx->tx_err = err; | 			txh->txh_tx->tx_err = err; | ||||||
| 			break; | 			break; | ||||||
|  | |||||||
| @ -293,7 +293,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) | |||||||
| 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); | 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); | ||||||
| 
 | 
 | ||||||
| 	for (i = 0; i < fetchsz; i++) { | 	for (i = 0; i < fetchsz; i++) { | ||||||
| 		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); | 		dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, | ||||||
|  | 		    ARC_FLAG_PREFETCH); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return (fetchsz); | 	return (fetchsz); | ||||||
|  | |||||||
| @ -1112,7 +1112,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, | |||||||
| 		drop_struct_lock = TRUE; | 		drop_struct_lock = TRUE; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); | 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); | ||||||
| 
 | 
 | ||||||
| 	db = dbuf_hold(mdn, blk, FTAG); | 	db = dbuf_hold(mdn, blk, FTAG); | ||||||
| 	if (drop_struct_lock) | 	if (drop_struct_lock) | ||||||
| @ -1409,7 +1409,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) | |||||||
| 		goto fail; | 		goto fail; | ||||||
| 
 | 
 | ||||||
| 	/* resize the old block */ | 	/* resize the old block */ | ||||||
| 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); | 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); | ||||||
| 	if (err == 0) | 	if (err == 0) | ||||||
| 		dbuf_new_size(db, size, tx); | 		dbuf_new_size(db, size, tx); | ||||||
| 	else if (err != ENOENT) | 	else if (err != ENOENT) | ||||||
| @ -1582,8 +1582,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) | |||||||
| 		ASSERT3U(blkoff + head, ==, blksz); | 		ASSERT3U(blkoff + head, ==, blksz); | ||||||
| 		if (len < head) | 		if (len < head) | ||||||
| 			head = len; | 			head = len; | ||||||
| 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, | 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), | ||||||
| 		    FTAG, &db) == 0) { | 		    TRUE, FALSE, FTAG, &db) == 0) { | ||||||
| 			caddr_t data; | 			caddr_t data; | ||||||
| 
 | 
 | ||||||
| 			/* don't dirty if it isn't on disk and isn't dirty */ | 			/* don't dirty if it isn't on disk and isn't dirty */ | ||||||
| @ -1620,8 +1620,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) | |||||||
| 	if (tail) { | 	if (tail) { | ||||||
| 		if (len < tail) | 		if (len < tail) | ||||||
| 			tail = len; | 			tail = len; | ||||||
| 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), | 		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), | ||||||
| 		    TRUE, FTAG, &db) == 0) { | 		    TRUE, FALSE, FTAG, &db) == 0) { | ||||||
| 			/* don't dirty if not on disk and not dirty */ | 			/* don't dirty if not on disk and not dirty */ | ||||||
| 			if (db->db_last_dirty || | 			if (db->db_last_dirty || | ||||||
| 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { | 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { | ||||||
| @ -1853,7 +1853,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) | |||||||
|  */ |  */ | ||||||
| static int | static int | ||||||
| dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | ||||||
| 	int lvl, uint64_t blkfill, uint64_t txg) |     int lvl, uint64_t blkfill, uint64_t txg) | ||||||
| { | { | ||||||
| 	dmu_buf_impl_t *db = NULL; | 	dmu_buf_impl_t *db = NULL; | ||||||
| 	void *data = NULL; | 	void *data = NULL; | ||||||
| @ -1875,8 +1875,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, | |||||||
| 		epb = dn->dn_phys->dn_nblkptr; | 		epb = dn->dn_phys->dn_nblkptr; | ||||||
| 		data = dn->dn_phys->dn_blkptr; | 		data = dn->dn_phys->dn_blkptr; | ||||||
| 	} else { | 	} else { | ||||||
| 		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); | 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); | ||||||
| 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); | 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); | ||||||
| 		if (error) { | 		if (error) { | ||||||
| 			if (error != ENOENT) | 			if (error != ENOENT) | ||||||
| 				return (error); | 				return (error); | ||||||
|  | |||||||
| @ -192,7 +192,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) | |||||||
| 
 | 
 | ||||||
| 		rw_enter(&dn->dn_struct_rwlock, RW_READER); | 		rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 		err = dbuf_hold_impl(dn, db->db_level-1, | 		err = dbuf_hold_impl(dn, db->db_level-1, | ||||||
| 		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child); | 		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); | ||||||
| 		rw_exit(&dn->dn_struct_rwlock); | 		rw_exit(&dn->dn_struct_rwlock); | ||||||
| 		if (err == ENOENT) | 		if (err == ENOENT) | ||||||
| 			continue; | 			continue; | ||||||
| @ -288,7 +288,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, | |||||||
| 				continue; | 				continue; | ||||||
| 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, | 			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, | ||||||
| 			    i, B_TRUE, FTAG, &subdb)); | 			    i, TRUE, FALSE, FTAG, &subdb)); | ||||||
| 			rw_exit(&dn->dn_struct_rwlock); | 			rw_exit(&dn->dn_struct_rwlock); | ||||||
| 			ASSERT3P(bp, ==, subdb->db_blkptr); | 			ASSERT3P(bp, ==, subdb->db_blkptr); | ||||||
| 
 | 
 | ||||||
| @ -362,7 +362,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, | |||||||
| 				continue; | 				continue; | ||||||
| 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | 			rw_enter(&dn->dn_struct_rwlock, RW_READER); | ||||||
| 			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, | 			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, | ||||||
| 			    TRUE, FTAG, &db)); | 			    TRUE, FALSE, FTAG, &db)); | ||||||
| 			rw_exit(&dn->dn_struct_rwlock); | 			rw_exit(&dn->dn_struct_rwlock); | ||||||
| 
 | 
 | ||||||
| 			free_children(db, blkid, nblks, tx); | 			free_children(db, blkid, nblks, tx); | ||||||
|  | |||||||
| @ -547,6 +547,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, | |||||||
| 	const char *snapname; | 	const char *snapname; | ||||||
| 	uint64_t obj; | 	uint64_t obj; | ||||||
| 	int err = 0; | 	int err = 0; | ||||||
|  | 	dsl_dataset_t *ds; | ||||||
| 
 | 
 | ||||||
| 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); | 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); | ||||||
| 	if (err != 0) | 	if (err != 0) | ||||||
| @ -555,36 +556,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, | |||||||
| 	ASSERT(dsl_pool_config_held(dp)); | 	ASSERT(dsl_pool_config_held(dp)); | ||||||
| 	obj = dsl_dir_phys(dd)->dd_head_dataset_obj; | 	obj = dsl_dir_phys(dd)->dd_head_dataset_obj; | ||||||
| 	if (obj != 0) | 	if (obj != 0) | ||||||
| 		err = dsl_dataset_hold_obj(dp, obj, tag, dsp); | 		err = dsl_dataset_hold_obj(dp, obj, tag, &ds); | ||||||
| 	else | 	else | ||||||
| 		err = SET_ERROR(ENOENT); | 		err = SET_ERROR(ENOENT); | ||||||
| 
 | 
 | ||||||
| 	/* we may be looking for a snapshot */ | 	/* we may be looking for a snapshot */ | ||||||
| 	if (err == 0 && snapname != NULL) { | 	if (err == 0 && snapname != NULL) { | ||||||
| 		dsl_dataset_t *ds; | 		dsl_dataset_t *snap_ds; | ||||||
| 
 | 
 | ||||||
| 		if (*snapname++ != '@') { | 		if (*snapname++ != '@') { | ||||||
| 			dsl_dataset_rele(*dsp, tag); | 			dsl_dataset_rele(ds, tag); | ||||||
| 			dsl_dir_rele(dd, FTAG); | 			dsl_dir_rele(dd, FTAG); | ||||||
| 			return (SET_ERROR(ENOENT)); | 			return (SET_ERROR(ENOENT)); | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		dprintf("looking for snapshot '%s'\n", snapname); | 		dprintf("looking for snapshot '%s'\n", snapname); | ||||||
| 		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); | 		err = dsl_dataset_snap_lookup(ds, snapname, &obj); | ||||||
| 		if (err == 0) | 		if (err == 0) | ||||||
| 			err = dsl_dataset_hold_obj(dp, obj, tag, &ds); | 			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); | ||||||
| 		dsl_dataset_rele(*dsp, tag); | 		dsl_dataset_rele(ds, tag); | ||||||
| 
 | 
 | ||||||
| 		if (err == 0) { | 		if (err == 0) { | ||||||
| 			mutex_enter(&ds->ds_lock); | 			mutex_enter(&snap_ds->ds_lock); | ||||||
| 			if (ds->ds_snapname[0] == 0) | 			if (snap_ds->ds_snapname[0] == 0) | ||||||
| 				(void) strlcpy(ds->ds_snapname, snapname, | 				(void) strlcpy(snap_ds->ds_snapname, snapname, | ||||||
| 				    sizeof (ds->ds_snapname)); | 				    sizeof (snap_ds->ds_snapname)); | ||||||
| 			mutex_exit(&ds->ds_lock); | 			mutex_exit(&snap_ds->ds_lock); | ||||||
| 			*dsp = ds; | 			ds = snap_ds; | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 	if (err == 0) | ||||||
|  | 		*dsp = ds; | ||||||
| 	dsl_dir_rele(dd, FTAG); | 	dsl_dir_rele(dd, FTAG); | ||||||
| 	return (err); | 	return (err); | ||||||
| } | } | ||||||
|  | |||||||
| @ -560,7 +560,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	struct killarg *ka = arg; | 	struct killarg *ka = arg; | ||||||
| 	dmu_tx_t *tx = ka->tx; | 	dmu_tx_t *tx = ka->tx; | ||||||
| 
 | 
 | ||||||
| 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | ||||||
| 		return (0); | 		return (0); | ||||||
| 
 | 
 | ||||||
| 	if (zb->zb_level == ZB_ZIL_LEVEL) { | 	if (zb->zb_level == ZB_ZIL_LEVEL) { | ||||||
|  | |||||||
| @ -619,7 +619,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, | |||||||
| 		 * If we already visited this bp & everything below (in | 		 * If we already visited this bp & everything below (in | ||||||
| 		 * a prior txg sync), don't bother doing it again. | 		 * a prior txg sync), don't bother doing it again. | ||||||
| 		 */ | 		 */ | ||||||
| 		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) | 		if (zbookmark_subtree_completed(dnp, zb, | ||||||
|  | 		    &scn->scn_phys.scn_bookmark)) | ||||||
| 			return (B_TRUE); | 			return (B_TRUE); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
|  | |||||||
| @ -1921,7 +1921,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |||||||
| 	size_t size; | 	size_t size; | ||||||
| 	void *data; | 	void *data; | ||||||
| 
 | 
 | ||||||
| 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | 	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) | ||||||
| 		return (0); | 		return (0); | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Note: normally this routine will not be called if | 	 * Note: normally this routine will not be called if | ||||||
|  | |||||||
| @ -76,8 +76,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) | |||||||
| 
 | 
 | ||||||
| 	mutex_exit(sm->sm_lock); | 	mutex_exit(sm->sm_lock); | ||||||
| 	if (end > bufsize) { | 	if (end > bufsize) { | ||||||
| 		dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, | 		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, | ||||||
| 		    end - bufsize); | 		    end - bufsize, ZIO_PRIORITY_SYNC_READ); | ||||||
| 	} | 	} | ||||||
| 	mutex_enter(sm->sm_lock); | 	mutex_enter(sm->sm_lock); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, | |||||||
| 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); | 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); | ||||||
| 		tbl->zt_nextblk = newblk; | 		tbl->zt_nextblk = newblk; | ||||||
| 		ASSERT0(tbl->zt_blks_copied); | 		ASSERT0(tbl->zt_blks_copied); | ||||||
| 		dmu_prefetch(zap->zap_objset, zap->zap_object, | 		dmu_prefetch(zap->zap_objset, zap->zap_object, 0, | ||||||
| 		    tbl->zt_blk << bs, tbl->zt_numblks << bs); | 		    tbl->zt_blk << bs, tbl->zt_numblks << bs, | ||||||
|  | 		    ZIO_PRIORITY_SYNC_READ); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| @ -949,7 +950,8 @@ fzap_prefetch(zap_name_t *zn) | |||||||
| 	if (zap_idx_to_blk(zap, idx, &blk) != 0) | 	if (zap_idx_to_blk(zap, idx, &blk) != 0) | ||||||
| 		return; | 		return; | ||||||
| 	bs = FZAP_BLOCK_SHIFT(zap); | 	bs = FZAP_BLOCK_SHIFT(zap); | ||||||
| 	dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); | 	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, | ||||||
|  | 	    ZIO_PRIORITY_SYNC_READ); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
| @ -1295,9 +1297,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) | |||||||
| 	} else { | 	} else { | ||||||
| 		int b; | 		int b; | ||||||
| 
 | 
 | ||||||
| 		dmu_prefetch(zap->zap_objset, zap->zap_object, | 		dmu_prefetch(zap->zap_objset, zap->zap_object, 0, | ||||||
| 		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, | 		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, | ||||||
| 		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); | 		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, | ||||||
|  | 		    ZIO_PRIORITY_SYNC_READ); | ||||||
| 
 | 
 | ||||||
| 		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; | 		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; | ||||||
| 		    b++) { | 		    b++) { | ||||||
|  | |||||||
| @ -20,7 +20,7 @@ | |||||||
|  */ |  */ | ||||||
| /*
 | /*
 | ||||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||||
|  * Copyright (c) 2013 by Delphix. All rights reserved. |  * Copyright (c) 2012, 2014 by Delphix. All rights reserved. | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| /* Portions Copyright 2010 Robert Milkowski */ | /* Portions Copyright 2010 Robert Milkowski */ | ||||||
|  | |||||||
| @ -2118,7 +2118,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) | |||||||
| 
 | 
 | ||||||
| 		/* Prefetch znode */ | 		/* Prefetch znode */ | ||||||
| 		if (prefetch) { | 		if (prefetch) { | ||||||
| 			dmu_prefetch(os, objnum, 0, 0); | 			dmu_prefetch(os, objnum, 0, 0, 0, | ||||||
|  | 			    ZIO_PRIORITY_SYNC_READ); | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
|  | |||||||
							
								
								
									
										147
									
								
								module/zfs/zio.c
									
									
									
									
									
								
							
							
						
						
									
										147
									
								
								module/zfs/zio.c
									
									
									
									
									
								
							| @ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX; | |||||||
| #define	ZIO_PIPELINE_CONTINUE		0x100 | #define	ZIO_PIPELINE_CONTINUE		0x100 | ||||||
| #define	ZIO_PIPELINE_STOP		0x101 | #define	ZIO_PIPELINE_STOP		0x101 | ||||||
| 
 | 
 | ||||||
|  | #define	BP_SPANB(indblkshift, level) \ | ||||||
|  | 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) | ||||||
|  | #define	COMPARE_META_LEVEL	0x80000000ul | ||||||
| /*
 | /*
 | ||||||
|  * The following actions directly affect the spa's sync-to-convergence logic. |  * The following actions directly affect the spa's sync-to-convergence logic. | ||||||
|  * The values below define the sync pass when we start performing the action. |  * The values below define the sync pass when we start performing the action. | ||||||
| @ -3450,39 +3453,129 @@ static zio_pipe_stage_t *zio_pipeline[] = { | |||||||
| 	zio_done | 	zio_done | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /* dnp is the dnode for zb1->zb_object */ |  | ||||||
| boolean_t |  | ||||||
| zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, |  | ||||||
|     const zbookmark_phys_t *zb2) |  | ||||||
| { |  | ||||||
| 	uint64_t zb1nextL0, zb2thisobj; |  | ||||||
| 
 | 
 | ||||||
| 	ASSERT(zb1->zb_objset == zb2->zb_objset); | 
 | ||||||
| 	ASSERT(zb2->zb_level == 0); | 
 | ||||||
|  | /*
 | ||||||
|  |  * Compare two zbookmark_phys_t's to see which we would reach first in a | ||||||
|  |  * pre-order traversal of the object tree. | ||||||
|  |  * | ||||||
|  |  * This is simple in every case aside from the meta-dnode object. For all other | ||||||
|  |  * objects, we traverse them in order (object 1 before object 2, and so on). | ||||||
|  |  * However, all of these objects are traversed while traversing object 0, since | ||||||
|  |  * the data it points to is the list of objects.  Thus, we need to convert to a | ||||||
|  |  * canonical representation so we can compare meta-dnode bookmarks to | ||||||
|  |  * non-meta-dnode bookmarks. | ||||||
|  |  * | ||||||
|  |  * We do this by calculating "equivalents" for each field of the zbookmark. | ||||||
|  |  * zbookmarks outside of the meta-dnode use their own object and level, and | ||||||
|  |  * calculate the level 0 equivalent (the first L0 blkid that is contained in the | ||||||
|  |  * blocks this bookmark refers to) by multiplying their blkid by their span | ||||||
|  |  * (the number of L0 blocks contained within one block at their level). | ||||||
|  |  * zbookmarks inside the meta-dnode calculate their object equivalent | ||||||
|  |  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use | ||||||
|  |  * level + 1<<31 (any value larger than a level could ever be) for their level. | ||||||
|  |  * This causes them to always compare before a bookmark in their object | ||||||
|  |  * equivalent, compare appropriately to bookmarks in other objects, and to | ||||||
|  |  * compare appropriately to other bookmarks in the meta-dnode. | ||||||
|  |  */ | ||||||
|  | int | ||||||
|  | zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, | ||||||
|  |     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * These variables represent the "equivalent" values for the zbookmark, | ||||||
|  | 	 * after converting zbookmarks inside the meta dnode to their | ||||||
|  | 	 * normal-object equivalents. | ||||||
|  | 	 */ | ||||||
|  | 	uint64_t zb1obj, zb2obj; | ||||||
|  | 	uint64_t zb1L0, zb2L0; | ||||||
|  | 	uint64_t zb1level, zb2level; | ||||||
|  | 
 | ||||||
|  | 	if (zb1->zb_object == zb2->zb_object && | ||||||
|  | 	    zb1->zb_level == zb2->zb_level && | ||||||
|  | 	    zb1->zb_blkid == zb2->zb_blkid) | ||||||
|  | 		return (0); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * BP_SPANB calculates the span in blocks. | ||||||
|  | 	 */ | ||||||
|  | 	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); | ||||||
|  | 	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); | ||||||
|  | 
 | ||||||
|  | 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) { | ||||||
|  | 		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); | ||||||
|  | 		zb1L0 = 0; | ||||||
|  | 		zb1level = zb1->zb_level + COMPARE_META_LEVEL; | ||||||
|  | 	} else { | ||||||
|  | 		zb1obj = zb1->zb_object; | ||||||
|  | 		zb1level = zb1->zb_level; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (zb2->zb_object == DMU_META_DNODE_OBJECT) { | ||||||
|  | 		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); | ||||||
|  | 		zb2L0 = 0; | ||||||
|  | 		zb2level = zb2->zb_level + COMPARE_META_LEVEL; | ||||||
|  | 	} else { | ||||||
|  | 		zb2obj = zb2->zb_object; | ||||||
|  | 		zb2level = zb2->zb_level; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	/* Now that we have a canonical representation, do the comparison. */ | ||||||
|  | 	if (zb1obj != zb2obj) | ||||||
|  | 		return (zb1obj < zb2obj ? -1 : 1); | ||||||
|  | 	else if (zb1L0 != zb2L0) | ||||||
|  | 		return (zb1L0 < zb2L0 ? -1 : 1); | ||||||
|  | 	else if (zb1level != zb2level) | ||||||
|  | 		return (zb1level > zb2level ? -1 : 1); | ||||||
|  | 	/*
 | ||||||
|  | 	 * This can (theoretically) happen if the bookmarks have the same object | ||||||
|  | 	 * and level, but different blkids, if the block sizes are not the same. | ||||||
|  | 	 * There is presently no way to change the indirect block sizes. | ||||||
|  | 	 */ | ||||||
|  | 	return (0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  *  This function checks the following: given that last_block is the place that | ||||||
|  |  *  our traversal stopped last time, does that guarantee that we've visited | ||||||
|  |  *  every node under subtree_root?  Therefore, we can't just use the raw output | ||||||
|  |  *  of zbookmark_compare.  We have to pass in a modified version of | ||||||
|  |  *  subtree_root; by incrementing the block id, and then checking whether | ||||||
|  |  *  last_block is before or equal to that, we can tell whether or not having | ||||||
|  |  *  visited last_block implies that all of subtree_root's children have been | ||||||
|  |  *  visited. | ||||||
|  |  */ | ||||||
|  | boolean_t | ||||||
|  | zbookmark_subtree_completed(const dnode_phys_t *dnp, | ||||||
|  |     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) | ||||||
|  | { | ||||||
|  | 	zbookmark_phys_t mod_zb = *subtree_root; | ||||||
|  | 	mod_zb.zb_blkid++; | ||||||
|  | 	ASSERT(last_block->zb_level == 0); | ||||||
| 
 | 
 | ||||||
| 	/* The objset_phys_t isn't before anything. */ | 	/* The objset_phys_t isn't before anything. */ | ||||||
| 	if (dnp == NULL) | 	if (dnp == NULL) | ||||||
| 		return (B_FALSE); | 		return (B_FALSE); | ||||||
| 
 | 
 | ||||||
| 	zb1nextL0 = (zb1->zb_blkid + 1) << | 	/*
 | ||||||
| 	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); | 	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the | ||||||
| 
 | 	 * data block size in sectors, because that variable is only used if | ||||||
| 	zb2thisobj = zb2->zb_object ? zb2->zb_object : | 	 * the bookmark refers to a block in the meta-dnode.  Since we don't | ||||||
| 	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); | 	 * know without examining it what object it refers to, and there's no | ||||||
| 
 | 	 * harm in passing in this value in other cases, we always pass it in. | ||||||
| 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) { | 	 * | ||||||
| 		uint64_t nextobj = zb1nextL0 * | 	 * We pass in 0 for the indirect block size shift because zb2 must be | ||||||
| 		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; | 	 * level 0.  The indirect block size is only used to calculate the span | ||||||
| 		return (nextobj <= zb2thisobj); | 	 * of the bookmark, but since the bookmark must be level 0, the span is | ||||||
| 	} | 	 * always 1, so the math works out. | ||||||
| 
 | 	 * | ||||||
| 	if (zb1->zb_object < zb2thisobj) | 	 * If you make changes to how the zbookmark_compare code works, be sure | ||||||
| 		return (B_TRUE); | 	 * to verify that this code still works afterwards. | ||||||
| 	if (zb1->zb_object > zb2thisobj) | 	 */ | ||||||
| 		return (B_FALSE); | 	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, | ||||||
| 	if (zb2->zb_object == DMU_META_DNODE_OBJECT) | 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, | ||||||
| 		return (B_FALSE); | 	    last_block) <= 0); | ||||||
| 	return (zb1nextL0 <= zb2->zb_blkid); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #if defined(_KERNEL) && defined(HAVE_SPL) | #if defined(_KERNEL) && defined(HAVE_SPL) | ||||||
|  | |||||||
| @ -1397,8 +1397,9 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) | |||||||
| 	 */ | 	 */ | ||||||
| 	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); | 	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); | ||||||
| 	if (len > 0) { | 	if (len > 0) { | ||||||
| 		dmu_prefetch(os, ZVOL_OBJ, 0, len); | 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); | ||||||
| 		dmu_prefetch(os, ZVOL_OBJ, volsize - len, len); | 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, | ||||||
|  | 			ZIO_PRIORITY_SYNC_READ); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	zv->zv_objset = NULL; | 	zv->zv_objset = NULL; | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Paul Dagnelie
						Paul Dagnelie