Merge branch 'illumos'

Merge in ten upstream fixes which have already been made to both the Illumos and FreeBSD ZFS implementations. This brings us up to date with the latest ZFS changes in Illumos. Credit goes to Martin Matuska of the FreeBSD project for posting an excellent summary of the upstream patches we were missing. Illumos #1313: Integer overflow in txg_delay() Illumos #278: get rid zfs of python and pyzfs dependencies Illumos #1043: Recursive zfs snapshot destroy fails Illumos #883: ZIL reuse during remount corruption Illumos #1092: zfs refratio property Illumos #1051: zfs should handle imbalanced luns Illumos #510: 'zfs get' enhancement - mountpoint as an argument Illumos #175: zfs vdev cache consumes excessive memory Illumos #764: panic in zfs:dbuf_sync_list Illumos #xxx: zdb -vvv broken after zfs diff integration Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #340
2026-01-12 00:02:04 +03:00 · 2011-08-01 12:10:54 -07:00 · 2011-08-01 12:10:54 -07:00 · 77999e804f
commit 77999e804f
parent bfb73f9277 cddafdcbc5
23 changed files with 2701 additions and 107 deletions
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 /*
@ -205,6 +206,7 @@ typedef struct ztest_od {
 */
 typedef struct ztest_ds {
 	objset_t	*zd_os;
+	krwlock_t	zd_zilog_lock;
 	zilog_t		*zd_zilog;
 	uint64_t	zd_seq;
 	ztest_od_t	*zd_od;		/* debugging aid */
@ -238,6 +240,7 @@ ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
 ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_prealloc;
@ -273,6 +276,7 @@ ztest_info_t ztest_info[] = {
 	{ ztest_zap_parallel,			100,	&zopt_always	},
 	{ ztest_split_pool,			1,	&zopt_always	},
 	{ ztest_zil_commit,			1,	&zopt_incessant	},
+	{ ztest_zil_remount,			1,	&zopt_sometimes	},
 	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
 	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
 	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
@ -1006,6 +1010,7 @@ ztest_zd_init(ztest_ds_t *zd, objset_t *os)
 	dmu_objset_name(os, zd->zd_name);
 	int l;

+	rw_init(&zd->zd_zilog_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);

 	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
@ -1021,6 +1026,7 @@ ztest_zd_fini(ztest_ds_t *zd)
 	int l;

 	mutex_destroy(&zd->zd_dirobj_lock);
+	rw_destroy(&zd->zd_zilog_lock);

 	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
 		ztest_rll_destroy(&zd->zd_object_lock[l]);
@ -1992,6 +1998,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 	if (ztest_random(2) == 0)
 		io_type = ZTEST_IO_WRITE_TAG;

+	(void) rw_enter(&zd->zd_zilog_lock, RW_READER);
+
 	switch (io_type) {

 	case ZTEST_IO_WRITE_TAG:
@ -2029,6 +2037,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 		break;
 	}

+	(void) rw_exit(&zd->zd_zilog_lock);
+
 	umem_free(data, blocksize);
 }

@ -2083,6 +2093,8 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 {
 	zilog_t *zilog = zd->zd_zilog;

+	(void) rw_enter(&zd->zd_zilog_lock, RW_READER);
+
 	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));

 	/*
@ -2094,6 +2106,31 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
 	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
 	zd->zd_seq = zilog->zl_commit_lr_seq;
 	mutex_exit(&zilog->zl_lock);
+
+	(void) rw_exit(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation.  We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+
+	/*
+	 * Take the zilog lock as writer.  ztest_io() and ztest_zil_commit()
+	 * hold this same lock as reader, so they are excluded while the ZIL
+	 * is closed and reopened below.
+	 */
+	(void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);
+
+	/* zfsvfs_teardown() */
+	zil_close(zd->zd_zilog);
+
+	/*
+	 * zfsvfs_setup()
+	 *
+	 * zd->zd_zilog is not reassigned here, so reopening must hand back
+	 * the very same zilog_t; VERIFY enforces that before replaying.
+	 */
+	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+	zil_replay(os, zd, ztest_replay_vector);
+
+	(void) rw_exit(&zd->zd_zilog_lock);
 }

 /*
@ -5300,6 +5337,7 @@ ztest_run(ztest_shared_t *zs)
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+	spa->spa_debug = B_TRUE;
 	zs->zs_spa = spa;

 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
--- a/include/libzfs.h
+++ b/include/libzfs.h
@ -21,6 +21,7 @@

 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
 */

 #ifndef	_LIBZFS_H
@ -572,13 +573,17 @@ extern int zfs_promote(zfs_handle_t *);
 extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
    boolean_t, boolean_t, int, uint64_t, uint64_t);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);

 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
    uid_t rid, uint64_t space);

-extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
-    zfs_userspace_cb_t func, void *arg);
+extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
+    zfs_userspace_cb_t, void *);
+
+extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
+extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);

 typedef struct recvflags {
 	/* print informational messages (ie, -v was specified) */
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@ -21,6 +21,7 @@

 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 /* Portions Copyright 2010 Robert Milkowski */
@ -122,6 +123,7 @@ typedef enum {
 	ZFS_PROP_DEDUP,
 	ZFS_PROP_MLSLABEL,
 	ZFS_PROP_SYNC,
+	ZFS_PROP_REFRATIO,
 	ZFS_NUM_PROPS
 } zfs_prop_t;

--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #ifndef _SYS_METASLAB_H
@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1
 #define	METASLAB_GANG_HEADER	0x2
+#define	METASLAB_GANG_CHILD	0x4
+#define	METASLAB_GANG_AVOID	0x8

 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@ -21,6 +21,7 @@
 /*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #ifndef _SYS_METASLAB_IMPL_H
@ -52,6 +53,7 @@ struct metaslab_group {
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	uint64_t		mg_bonus_area;
+	uint64_t		mg_alloc_failures;
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #ifndef _SYS_SPA_H
@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_bp(bp, fmt, ...)
 #endif

+extern boolean_t spa_debug_enabled(spa_t *spa);
+/*
+ * Log a debug message via zfs_dbgmsg(), but only when
+ * spa_debug_enabled() reports that debugging is on for this pool.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single
+ * statement: "if (x) spa_dbgmsg(...); else ..." compiles as expected,
+ * which a bare { } block followed by ';' would not.
+ */
+#define	spa_dbgmsg(spa, ...)			\
+do {						\
+	if (spa_debug_enabled(spa))		\
+		zfs_dbgmsg(__VA_ARGS__);	\
+_NOTE(CONSTCOND) } while (0)
+
 extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */

 #ifdef	__cplusplus
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #ifndef _SYS_SPA_IMPL_H
@ -196,6 +197,7 @@ struct spa {
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
 	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
+	boolean_t	spa_debug;		/* debug enabled? */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */
--- a/include/zfs_deleg.h
+++ b/include/zfs_deleg.h
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
 */

 #ifndef	_ZFS_DELEG_H
@ -51,6 +52,7 @@ typedef enum {
 	ZFS_DELEG_NOTE_CLONE,
 	ZFS_DELEG_NOTE_PROMOTE,
 	ZFS_DELEG_NOTE_RENAME,
+	ZFS_DELEG_NOTE_SEND,
 	ZFS_DELEG_NOTE_RECEIVE,
 	ZFS_DELEG_NOTE_ALLOW,
 	ZFS_DELEG_NOTE_USERPROP,
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@ -21,6 +21,8 @@

 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #include <ctype.h>
@ -94,6 +96,7 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
 	namecheck_err_t why;
 	char what;

+	(void) zfs_prop_get_table();
 	if (dataset_namecheck(path, &why, &what) != 0) {
 		if (hdl != NULL) {
 			switch (why) {
@ -2025,6 +2028,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
 		}
 		break;

+	case ZFS_PROP_REFRATIO:
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
@ -4311,6 +4315,193 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
 	return (0);
 }

+/*
+ * Retrieve the permission nvlist for a filesystem or volume via the
+ * ZFS_IOC_GET_FSACL ioctl.
+ *
+ * zhp - handle of a ZFS_TYPE_FILESYSTEM or ZFS_TYPE_VOLUME dataset.
+ * nvl - on success, receives the nvlist unpacked from the ioctl's
+ *       output buffer (presumably the caller must free it -- confirm
+ *       against callers).
+ *
+ * Returns 0 on success; otherwise the value produced by zfs_error() /
+ * zfs_standard_error_fmt() after recording the failure on the library
+ * handle.
+ */
+int
+zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
+{
+	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	int nvsz = 2048;
+	void *nvbuf;
+	int err = 0;
+	int ioc_errno;
+	char errbuf[ZFS_MAXNAMELEN+32];
+
+	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
+
+tryagain:
+
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL) {
+		err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+		goto out;
+	}
+
+	zc.zc_nvlist_dst_size = nvsz;
+	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) {
+		/*
+		 * Capture errno before formatting the message: the
+		 * snprintf()/dgettext() calls below may overwrite it.
+		 */
+		ioc_errno = errno;
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
+		    zc.zc_name);
+		switch (ioc_errno) {
+		case ENOMEM:
+			/*
+			 * Buffer too small: retry with the size the ioctl
+			 * left in zc_nvlist_dst_size.
+			 */
+			free(nvbuf);
+			nvsz = zc.zc_nvlist_dst_size;
+			goto tryagain;
+
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
+		default:
+			err = zfs_standard_error_fmt(hdl, ioc_errno, errbuf);
+			break;
+		}
+	} else {
+		/* success */
+		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
+		if (rc) {
+			(void) snprintf(errbuf, sizeof (errbuf), dgettext(
+			    TEXT_DOMAIN, "cannot get permissions on '%s'"),
+			    zc.zc_name);
+			err = zfs_standard_error_fmt(hdl, rc, errbuf);
+		}
+	}
+
+	free(nvbuf);
+out:
+	return (err);
+}
+
+/*
+ * Apply a permission nvlist to a filesystem or volume via the
+ * ZFS_IOC_SET_FSACL ioctl.
+ *
+ * zhp - handle of a ZFS_TYPE_FILESYSTEM or ZFS_TYPE_VOLUME dataset.
+ * un  - passed through as zc_perm_action (presumably B_TRUE removes
+ *       the permissions rather than adding them -- confirm against
+ *       the ioctl handler).
+ * nvl - nvlist to pack (native encoding) and hand to the ioctl.
+ *
+ * Returns 0 on success; otherwise the value produced by zfs_error() /
+ * zfs_standard_error_fmt() after recording the failure on the library
+ * handle.
+ */
+int
+zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
+{
+	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	char *nvbuf;
+	char errbuf[ZFS_MAXNAMELEN+32];
+	size_t nvsz;
+	int err;
+	int ioc_errno;
+
+	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
+
+	err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
+	assert(err == 0);
+
+	/* Fail cleanly on allocation failure, as zfs_get_fsacl() does. */
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL)
+		return (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+
+	err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
+	assert(err == 0);
+
+	zc.zc_nvlist_src_size = nvsz;
+	zc.zc_nvlist_src = (uintptr_t)nvbuf;
+	zc.zc_perm_action = un;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
+		/*
+		 * Capture errno before formatting the message: the
+		 * snprintf()/dgettext() calls below may overwrite it.
+		 */
+		ioc_errno = errno;
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
+		    zc.zc_name);
+		switch (ioc_errno) {
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
+		default:
+			err = zfs_standard_error_fmt(hdl, ioc_errno, errbuf);
+			break;
+		}
+	}
+
+	free(nvbuf);
+
+	return (err);
+}
+
+/*
+ * Retrieve the holds nvlist for a snapshot via the ZFS_IOC_GET_HOLDS
+ * ioctl.
+ *
+ * zhp - handle of a ZFS_TYPE_SNAPSHOT dataset.
+ * nvl - on success, receives the nvlist unpacked from the ioctl's
+ *       output buffer (presumably the caller must free it -- confirm
+ *       against callers).
+ *
+ * Returns 0 on success; otherwise the value produced by zfs_error() /
+ * zfs_standard_error_fmt() after recording the failure on the library
+ * handle.
+ */
+int
+zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
+{
+	zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 };
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	int nvsz = 2048;
+	void *nvbuf;
+	int err = 0;
+	int ioc_errno;
+	char errbuf[ZFS_MAXNAMELEN+32];
+
+	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+tryagain:
+
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL) {
+		err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+		goto out;
+	}
+
+	zc.zc_nvlist_dst_size = nvsz;
+	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) {
+		/*
+		 * Capture errno before formatting the message: the
+		 * snprintf()/dgettext() calls below may overwrite it.
+		 */
+		ioc_errno = errno;
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
+		    zc.zc_name);
+		switch (ioc_errno) {
+		case ENOMEM:
+			/*
+			 * Buffer too small: retry with the size the ioctl
+			 * left in zc_nvlist_dst_size.
+			 */
+			free(nvbuf);
+			nvsz = zc.zc_nvlist_dst_size;
+			goto tryagain;
+
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
+		default:
+			err = zfs_standard_error_fmt(hdl, ioc_errno, errbuf);
+			break;
+		}
+	} else {
+		/* success */
+		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
+		if (rc) {
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
+			    zc.zc_name);
+			err = zfs_standard_error_fmt(hdl, rc, errbuf);
+		}
+	}
+
+	free(nvbuf);
+out:
+	return (err);
+}
+
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
--- a/man/man8/zfs.8
+++ b/man/man8/zfs.8
@ -360,7 +360,7 @@ This property can also be referred to by its shortened column name, \fBavail\fR.
 .ad
 .sp .6
 .RS 4n
-The compression ratio achieved for this dataset, expressed as a multiplier. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR.
+For non-snapshots, the compression ratio achieved for the \fBused\fR space of this dataset, expressed as a multiplier.  The \fBused\fR property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot.  For snapshots, the \fBcompressratio\fR is the same as the \fBrefcompressratio\fR property.  Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR.
 .RE

 .sp
@ -420,6 +420,19 @@ The amount of data that is accessible by this dataset, which may or may not be s
 This property can also be referred to by its shortened column name, \fBrefer\fR.
 .RE

+.sp
+.ne 2
+.mk
+.na
+\fB\fBrefcompressratio\fR\fR
+.ad
+.sp .6
+.RS 4n
+The compression ratio achieved for the \fBreferenced\fR space of this
+dataset, expressed as a multiplier.  See also the \fBcompressratio\fR
+property.
+.RE
+
 .sp
 .ne 2
 .mk
@ -1235,7 +1248,7 @@ Recursively destroy all dependents, including cloned file systems outside the ta
 Force an unmount of any file systems using the \fBunmount -f\fR command. This option has no effect on non-file systems or unmounted file systems.
 .RE

-Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. 
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use.
 .RE

 .sp
--- a/module/zcommon/zfs_deleg.c
+++ b/module/zcommon/zfs_deleg.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
 */

 #if defined(_KERNEL)
@ -60,7 +61,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
 	{ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
 	{ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
 	{ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
-	{ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
+	{ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
 	{ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
 	{ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
 	{ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 /* Portions Copyright 2010 Robert Milkowski */
@ -311,6 +312,9 @@ zfs_prop_init(void)
 	zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<1.00x or higher if compressed>", "RATIO");
+	zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET,
+	    "<1.00x or higher if compressed>", "REFRATIO");
 	zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
 	    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
 	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK");
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -1347,13 +1348,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 * it, since one of the current holders may be in the
 	 * middle of an update.  Note that users of dbuf_undirty()
 	 * should not place a hold on the dbuf before the call.
+	 * Also note: we can get here with a spill block, so
+	 * test for that similar to how dbuf_dirty does.
 	 */
 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		mutex_exit(&db->db_mtx);
 		/* Make sure we don't toss this buffer at sync phase */
-		mutex_enter(&dn->dn_mtx);
-		dnode_clear_range(dn, db->db_blkid, 1, tx);
-		mutex_exit(&dn->dn_mtx);
+		if (db->db_blkid != DMU_SPILL_BLKID) {
+			mutex_enter(&dn->dn_mtx);
+			dnode_clear_range(dn, db->db_blkid, 1, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
 		DB_DNODE_EXIT(db);
 		return (0);
 	}
@ -1366,11 +1371,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)

 	*drp = dr->dr_next;

+	/*
+	 * Note that there are three places in dbuf_dirty()
+	 * where this dirty record may be put on a list.
+	 * Make sure to do a list_remove corresponding to
+	 * every one of those list_insert calls.
+	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
-	} else if (db->db_level+1 == dn->dn_nlevels) {
+	} else if (db->db_blkid == DMU_SPILL_BLKID ||
+	    db->db_level+1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #include <sys/dmu_objset.h>
@ -2153,7 +2154,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
-	uint64_t refd, avail, uobjs, aobjs;
+	uint64_t refd, avail, uobjs, aobjs, ratio;

 	dsl_dir_stats(ds->ds_dir, nv);

@ -2180,6 +2181,11 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
 	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);

+	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
+	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
+	    ds->ds_phys->ds_compressed_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
+
 	if (ds->ds_phys->ds_next_snap_obj) {
 		/*
 		 * This is a snapshot; override the dd's space used with
@ -2187,10 +2193,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 		 */
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 		    ds->ds_phys->ds_unique_bytes);
-		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
-		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
-		    ds->ds_phys->ds_compressed_bytes));
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
 	}
 }

--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -30,11 +31,30 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>

-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define	CAN_FASTGANG(flags) \
+	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+	METASLAB_GANG_AVOID)))

 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

+/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
 /*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 }

 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 			mutex_exit(&mg->mg_lock);
 		}

-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@ -1123,6 +1136,7 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
 	int m;

 	/*
@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 		mutex_exit(&msp->ms_lock);
 	}

+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }

 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,

 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-			if (msp->ms_weight < size) {
+			if (msp->ms_weight < asize) {
+				spa_dbgmsg(spa, "%s: failed to meet weight "
+				    "requirement: vdev %llu, txg %llu, mg %p, "
+				    "msp %p, psize %llu, asize %llu, "
+				    "failures %llu, weight %llu",
+				    spa_name(spa), mg->mg_vd->vdev_id, txg,
+				    mg, msp, psize, asize,
+				    mg->mg_alloc_failures, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
-
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		if (msp == NULL)
 			return (-1ULL);

+		/*
+		 * If we've already reached the allowable number of failed
+		 * allocation attempts on this metaslab group then we
+		 * consider skipping it. We skip it only if we're allowed
+		 * to "fast" gang, the physical size is larger than
+		 * a gang block, and we're attempting to allocate from
+		 * the primary metaslab.
+		 */
+		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			spa_dbgmsg(spa, "%s: skipping metaslab group: "
+			    "vdev %llu, txg %llu, mg %p, psize %llu, "
+			    "asize %llu, failures %llu", spa_name(spa),
+			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+			    mg->mg_alloc_failures);
+			return (-1ULL);
+		}
+
 		mutex_enter(&msp->ms_lock);

 		/*
@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size || (was_active &&
+		if (msp->ms_weight < asize || (was_active &&
 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}

-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}

-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;

+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

 		mutex_exit(&msp->ms_lock);
@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

 	mutex_exit(&msp->ms_lock);

@ -1376,7 +1420,8 @@ top:
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+		    dva, d, flags);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@ -1388,18 +1433,24 @@ top:
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;

-				/*
-				 * Determine percent used in units of 0..1024.
-				 * (This is just to avoid floating point.)
-				 */
-				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

 				/*
-				 * Bias by at most +/- 25% of the aliquot.
+				 * Calculate how much more or less we should
+				 * try to allocate from this device during
+				 * this iteration around the rotor.
+				 * For example, if a device is 80% full
+				 * and the pool is 20% full then we should
+				 * reduce allocations by 60% on this device.
+				 *
+				 * mg_bias = (20 - 80) * 512K / 100 = -307K
+				 *
+				 * This reduces allocations by 307K for this
+				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
-				    (int64_t)mg->mg_aliquot) / (1024 * 4);
+				    (int64_t)mg->mg_aliquot) / 100;
 			}

 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 	mutex_enter(&msp->ms_lock);

 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -1680,6 +1681,12 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 	return (0);
 }

+/*
+ * Report whether debugging is enabled for this pool by returning the
+ * spa's spa_debug flag.
+ */
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+	return (spa->spa_debug);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@ -506,7 +506,7 @@ void
 txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	int timeout = ddi_get_lbolt() + ticks;
+	clock_t timeout = ddi_get_lbolt() + ticks;

 	/* don't delay if this txg could transition to quiescing immediately */
 	if (tx->tx_open_txg > txg ||
--- a/module/zfs/vdev_cache.c
+++ b/module/zfs/vdev_cache.c
@ -71,9 +71,16 @@
 * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
 * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
 * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful.  It
+ * is better if we disable this.  Once some time has passed, we should
+ * actually remove this to simplify the code.  For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero.  Note that Solaris 11
+ * has made these same changes.
 */
 int zfs_vdev_cache_max = 1<<14;			/* 16KB */
-int zfs_vdev_cache_size = 10ULL << 20;		/* 10MB */
+int zfs_vdev_cache_size = 0;
 int zfs_vdev_cache_bshift = 16;

 #define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@ -701,6 +701,9 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
 * and destroying snapshots requires descendent permissions, a successful
 * check of the top level snapshot applies to snapshots of all descendent
 * datasets as well.
+ *
+ * The target snapshot may not exist when doing a recursive destroy.
+ * In this case fallback to permissions of the parent dataset.
 */
 static int
 zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
@ -711,6 +714,8 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
 	dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);

 	error = zfs_secpolicy_destroy_perms(dsname, cr);
+	if (error == ENOENT)
+		error = zfs_secpolicy_destroy_perms(zc->zc_name, cr);

 	strfree(dsname);
 	return (error);
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@ -1560,12 +1560,12 @@ zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)

 static int
 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
-    dmu_buf_t **db)
+    dmu_buf_t **db, void *tag)
 {
 	dmu_object_info_t doi;
 	int error;

-	if ((error = sa_buf_hold(osp, obj, FTAG, db)) != 0)
+	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
 		return (error);

 	dmu_object_info_from_db(*db, &doi);
@ -1573,13 +1573,13 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
 	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
-		sa_buf_rele(*db, FTAG);
+		sa_buf_rele(*db, tag);
 		return (ENOTSUP);
 	}

 	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
 	if (error != 0) {
-		sa_buf_rele(*db, FTAG);
+		sa_buf_rele(*db, tag);
 		return (error);
 	}

@ -1587,10 +1587,10 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
 }

 void
-zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db)
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
 {
 	sa_handle_destroy(hdl);
-	sa_buf_rele(db, FTAG);
+	sa_buf_rele(db, tag);
 }

 /*
@ -1667,7 +1667,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
 		int is_xattrdir;

 		if (prevdb)
-			zfs_release_sa_handle(prevhdl, prevdb);
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);

 		if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
 		    &is_xattrdir)) != 0)
@ -1699,7 +1699,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
 			prevhdl = sa_hdl;
 			prevdb = sa_db;
 		}
-		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db);
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
 		if (error != 0) {
 			sa_hdl = prevhdl;
 			sa_db = prevdb;
@ -1709,7 +1709,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,

 	if (sa_hdl != NULL && sa_hdl != hdl) {
 		ASSERT(sa_db != NULL);
-		zfs_release_sa_handle(sa_hdl, sa_db);
+		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
 	}

 	if (error == 0)
@ -1730,13 +1730,13 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
 	if (error != 0)
 		return (error);

-	error = zfs_grab_sa_handle(osp, obj, &hdl, &db);
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
 	if (error != 0)
 		return (error);

 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

-	zfs_release_sa_handle(hdl, db);
+	zfs_release_sa_handle(hdl, db, FTAG);
 	return (error);
 }

@ -1756,19 +1756,19 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
 	if (error != 0)
 		return (error);

-	error = zfs_grab_sa_handle(osp, obj, &hdl, &db);
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
 	if (error != 0)
 		return (error);

 	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
 	if (error != 0) {
-		zfs_release_sa_handle(hdl, db);
+		zfs_release_sa_handle(hdl, db, FTAG);
 		return (error);
 	}

 	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

-	zfs_release_sa_handle(hdl, db);
+	zfs_release_sa_handle(hdl, db, FTAG);
 	return (error);
 }

--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 /* Portions Copyright 2010 Robert Milkowski */
@ -562,7 +563,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)

 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
-		ASSERT(!keep_first);
+		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
@ -1665,21 +1666,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
 void
 zil_free(zilog_t *zilog)
 {
-	lwb_t *head_lwb;
 	int i;

 	zilog->zl_stop_sync = 1;

-	/*
-	 * After zil_close() there should only be one lwb with a buffer.
-	 */
-	head_lwb = list_head(&zilog->zl_lwb_list);
-	if (head_lwb) {
-		ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
-		list_remove(&zilog->zl_lwb_list, head_lwb);
-		zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
-		kmem_cache_free(zil_lwb_cache, head_lwb);
-	}
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);

 	avl_destroy(&zilog->zl_vdev_tree);
@ -1719,6 +1710,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 {
 	zilog_t *zilog = dmu_objset_zil(os);

+	ASSERT(zilog->zl_clean_taskq == NULL);
+	ASSERT(zilog->zl_get_data == NULL);
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
 	zilog->zl_get_data = get_data;
 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
 	    2, 2, TASKQ_PREPOPULATE);
@ -1732,7 +1727,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 void
 zil_close(zilog_t *zilog)
 {
-	lwb_t *tail_lwb;
+	lwb_t *lwb;
 	uint64_t txg = 0;

 	zil_commit(zilog, 0); /* commit all itx */
@ -1744,9 +1739,9 @@ zil_close(zilog_t *zilog)
 	 * destroy the zl_clean_taskq.
 	 */
 	mutex_enter(&zilog->zl_lock);
-	tail_lwb = list_tail(&zilog->zl_lwb_list);
-	if (tail_lwb != NULL)
-		txg = tail_lwb->lwb_max_txg;
+	lwb = list_tail(&zilog->zl_lwb_list);
+	if (lwb != NULL)
+		txg = lwb->lwb_max_txg;
 	mutex_exit(&zilog->zl_lock);
 	if (txg)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
@ -1754,6 +1749,19 @@ zil_close(zilog_t *zilog)
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
+
+	/*
+	 * We should have only one LWB left on the list; remove it now.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	lwb = list_head(&zilog->zl_lwb_list);
+	if (lwb != NULL) {
+		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
+		list_remove(&zilog->zl_lwb_list, lwb);
+		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		kmem_cache_free(zil_lwb_cache, lwb);
+	}
+	mutex_exit(&zilog->zl_lock);
 }

 /*
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -79,6 +80,7 @@ int zio_delay_max = ZIO_DELAY_MAX;
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
+extern int zfs_mg_alloc_failures;

 /*
 * An allocating zio is one that either currently has the DVA allocate
@ -158,6 +160,12 @@ zio_init(void)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}

+	/*
+	 * The zio write taskqs have 1 thread per cpu.  Allow 1/2 of the taskqs
+	 * to fail 3 times per txg, or 8 failures, whichever is greater.
+	 */
+	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
 	zio_inject_init();
 }

@ -2151,6 +2159,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;

 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@ -2163,10 +2172,21 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

+	/*
+	 * The dump device does not support gang blocks so allocation on
+	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+	 * the "fast" gang feature.
+	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

 	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;