From 7b8518cb8d39aa340fecf559143763b27b212b0d Mon Sep 17 00:00:00 2001 From: Tim Haley Date: Tue, 26 Jul 2011 16:38:27 -0700 Subject: [PATCH 01/10] Illumos #xxx: zdb -vvv broken after zfs diff integration References to Illumos issue and patch: - https://github.com/illumos/illumos-gate/commit/163eb7ff Signed-off-by: Brian Behlendorf Issue #340 --- module/zfs/zfs_znode.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index ea8b4c505..dfbe11aca 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -1560,12 +1560,12 @@ zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, - dmu_buf_t **db) + dmu_buf_t **db, void *tag) { dmu_object_info_t doi; int error; - if ((error = sa_buf_hold(osp, obj, FTAG, db)) != 0) + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); @@ -1573,13 +1573,13 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { - sa_buf_rele(*db, FTAG); + sa_buf_rele(*db, tag); return (ENOTSUP); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { - sa_buf_rele(*db, FTAG); + sa_buf_rele(*db, tag); return (error); } @@ -1587,10 +1587,10 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, } void -zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db) +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) { sa_handle_destroy(hdl); - sa_buf_rele(db, FTAG); + sa_buf_rele(db, tag); } /* @@ -1667,7 +1667,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, int is_xattrdir; if (prevdb) - zfs_release_sa_handle(prevhdl, prevdb); + zfs_release_sa_handle(prevhdl, prevdb, FTAG); if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) @@ -1699,7 +1699,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, prevhdl = sa_hdl; prevdb = sa_db; } - error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db); + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; @@ -1709,7 +1709,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); - zfs_release_sa_handle(sa_hdl, sa_db); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) @@ -1730,13 +1730,13 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) if (error != 0) return (error); - error = zfs_grab_sa_handle(osp, obj, &hdl, &db); + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - zfs_release_sa_handle(hdl, db); + zfs_release_sa_handle(hdl, db, FTAG); return (error); } @@ -1756,19 +1756,19 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, if (error != 0) return (error); - error = zfs_grab_sa_handle(osp, obj, &hdl, &db); + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { - zfs_release_sa_handle(hdl, db); + zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); - zfs_release_sa_handle(hdl, db); + zfs_release_sa_handle(hdl, db, 
FTAG); return (error); } From ef3c1dea7024b07b4ace6115de9f22a99c1394d8 Mon Sep 17 00:00:00 2001 From: Gordon Ross Date: Tue, 26 Jul 2011 11:37:06 -0700 Subject: [PATCH 02/10] Illumos #764: panic in zfs:dbuf_sync_list Hypothesis about what's going on here. At some time in the past, something, i.e. dnode_reallocate() calls one of: dbuf_rm_spill(dn, tx); These will do: dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx) dbuf_undirty(db, tx) Currently dbuf_undirty can leave a spill block in dn_dirty_records[], (it having been put there previously by dbuf_dirty) and free it. Sometime later, dbuf_sync_list trips over this reference to free'd (and typically reused) memory. Also, dbuf_undirty can call dnode_clear_range with a bogus block ID. It needs to test for DMU_SPILL_BLKID, similar to how dnode_clear_range is called in dbuf_dirty(). References to Illumos issue and patch: - https://www.illumos.org/issues/764 - https://github.com/illumos/illumos-gate/commit/3f2366c2bb Reviewed by: George Wilson Reviewed by: Mark.Maybe@oracle.com Reviewed by: Albert Lee Signed-off-by: Brian Behlendorf Issue #340 --- module/zfs/dbuf.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e166c81df..34ce2f62b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include @@ -1347,13 +1348,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * it, since one of the current holders may be in the * middle of an update. Note that users of dbuf_undirty() * should not place a hold on the dbuf before the call. + * Also note: we can get here with a spill block, so + * test for that similar to how dbuf_dirty does. */ if (refcount_count(&db->db_holds) > db->db_dirtycnt) { mutex_exit(&db->db_mtx); /* Make sure we don't toss this buffer at sync phase */ - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); + if (db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dnode_clear_range(dn, db->db_blkid, 1, tx); + mutex_exit(&dn->dn_mtx); + } DB_DNODE_EXIT(db); return (0); } @@ -1366,11 +1371,18 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) *drp = dr->dr_next; + /* + * Note that there are three places in dbuf_dirty() + * where this dirty record may be put on a list. + * Make sure to do a list_remove corresponding to + * every one of those list_insert calls. + */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_level+1 == dn->dn_nlevels) { + } else if (db->db_blkid == DMU_SPILL_BLKID || + db->db_level+1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); From 2cc6c8db12eaf8d1a1c17a13fbc6d25d8244486e Mon Sep 17 00:00:00 2001 From: Garrett D'Amore Date: Fri, 22 Apr 2011 00:49:41 -0700 Subject: [PATCH 03/10] Illumos #175: zfs vdev cache consumes excessive memory Note that with the current ZFS code, it turns out that the vdev cache is not helpful, and in some cases actually harmful. It is better if we disable this. Once some time has passed, we should actually remove this to simplify the code. 
For now we just disable it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 has made these same changes. References to Illumos issue and patch: - https://www.illumos.org/issues/175 - https://github.com/illumos/illumos-gate/commit/b68a40a845 Reviewed by: George Wilson Reviewed by: Eric Schrock Approved by: Richard Lowe Signed-off-by: Brian Behlendorf Issue #340 --- module/zfs/vdev_cache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 0d1fe7d2c..e2f8040d1 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -71,9 +71,16 @@ * 1< Date: Tue, 26 Jul 2011 11:53:09 -0700 Subject: [PATCH 04/10] Illumos #510: 'zfs get' enhancement - mountpoint as an argument The 'zfs get' command should be able to deal with mountpoint as an argument. It already works with 'zfs list' command: # zfs list /export/home/estibi NAME USED AVAIL REFER MOUNTPOINT rpool/export/home/estibi 1.14G 3.86G 1.14G /export/home/estibi but it fails with 'zfs get': # zfs get all /export/home/estibi cannot open '/export/home/estibi': invalid dataset name Reviewed by: Eric Schrock Reviewed by: Deano Reviewed by: Garrett D'Amore Approved by: Garrett D'Amore References to Illumos issue and patch: - https://www.illumos.org/issues/510 - https://github.com/illumos/illumos-gate/commit/5ead3ed965 Signed-off-by: Brian Behlendorf Issue #340 --- cmd/zfs/zfs_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index b6dc6d49d..69a6736a6 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #include @@ -1274,7 +1275,7 @@ static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; - int i, c, flags = 0; + int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; char *value, *fields; int ret; int limit = 0; From 6d974228ef05366c546bb04198dafcb38785c16d Mon Sep 17 00:00:00 2001 From: George Wilson Date: Tue, 26 Jul 2011 12:08:52 -0700 Subject: [PATCH 05/10] Illumos #1051: zfs should handle imbalanced luns Today zfs tries to allocate blocks evenly across all devices. This means when devices are imbalanced zfs will use lots of CPU searching for space on devices which tend to be pretty full. It should instead fail quickly on the full LUNs and move onto devices which have more availability. Reviewed by: Eric Schrock Reviewed by: Matt Ahrens Reviewed by: Adam Leventhal Reviewed by: Albert Lee Reviewed by: Gordon Ross Approved by: Garrett D'Amore References to Illumos issue and patch: - https://www.illumos.org/issues/510 - https://github.com/illumos/illumos-gate/commit/5ead3ed965 Signed-off-by: Brian Behlendorf Issue #340 --- cmd/ztest/ztest.c | 2 + include/sys/metaslab.h | 3 ++ include/sys/metaslab_impl.h | 2 + include/sys/spa.h | 8 +++ include/sys/spa_impl.h | 2 + module/zfs/metaslab.c | 105 ++++++++++++++++++++++++++---------- module/zfs/spa_misc.c | 7 +++ module/zfs/zio.c | 22 +++++++- 8 files changed, 123 insertions(+), 28 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 6acba5290..235bf56ef 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ /* @@ -5300,6 +5301,7 @@ ztest_run(ztest_shared_t *zs) */ kernel_init(FREAD | FWRITE); VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); + spa->spa_debug = B_TRUE; zs->zs_spa = spa; spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 583d6303b..2cf4d2b48 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_GANG_AVOID 0x8 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 07988dd51..6c670a162 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +53,7 @@ struct metaslab_group { avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; uint64_t mg_bonus_area; + uint64_t mg_alloc_failures; int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; diff --git a/include/sys/spa.h b/include/sys/spa.h index 52737ebc2..c9028fb09 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_SPA_H @@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif +extern boolean_t spa_debug_enabled(spa_t *spa); +#define spa_dbgmsg(spa, ...) \ +{ \ + if (spa_debug_enabled(spa)) \ + zfs_dbgmsg(__VA_ARGS__); \ +} + extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 1c34873b6..3f5cd9a73 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -196,6 +197,7 @@ struct spa { kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ + boolean_t spa_debug; /* debug enabled? */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 56c46100d..b089f1eac 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -30,11 +31,30 @@ #include #include -#define WITH_NDF_BLOCK_ALLOCATOR +#define WITH_DF_BLOCK_ALLOCATOR + +/* + * Allow allocations to switch to gang blocks quickly. We do this to + * avoid having to load lots of space_maps in a given txg. 
There are, + * however, some cases where we want to avoid "fast" ganging and instead + * we want to do an exhaustive search of all metaslabs on this device. + * Currently we don't allow any gang or dump device related allocations + * to "fast" gang. + */ +#define CAN_FASTGANG(flags) \ + (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ + METASLAB_GANG_AVOID))) uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* + * This value defines the number of allowed allocation failures per vdev. + * If a device reaches this threshold in a given txg then we consider skipping + * allocations on that device. + */ +int zfs_mg_alloc_failures; + /* * Metaslab debugging: when set, keeps all space maps in core to verify frees. */ @@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; @@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) mutex_exit(&mg->mg_lock); } - /* - * If we were able to load the map then make sure - * that this map is still able to satisfy our request. - */ - if (msp->ms_weight < size) - return (ENOSPC); - metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -1123,6 +1136,7 @@ void metaslab_sync_reassess(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; + int64_t failures = mg->mg_alloc_failures; int m; /* @@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) mutex_exit(&msp->ms_lock); } + atomic_add_64(&mg->mg_alloc_failures, -failures); + /* * Prefetch the next potential metaslabs */ @@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, + uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) { + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { + if (msp->ms_weight < asize) { + spa_dbgmsg(spa, "%s: failed to meet weight " + "requirement: vdev %llu, txg %llu, mg %p, " + "msp %p, psize %llu, asize %llu, " + "failures %llu, weight %llu", + spa_name(spa), mg->mg_vd->vdev_id, txg, + mg, msp, psize, asize, + mg->mg_alloc_failures, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp == NULL) return (-1ULL); + /* + * If we've already reached the allowable number of failed + * allocation attempts on this metaslab group then we + * consider skipping it. We skip it only if we're allowed + * to "fast" gang, the physical size is larger than + * a gang block, and we're attempting to allocate from + * the primary metaslab. 
+ */ + if (mg->mg_alloc_failures > zfs_mg_alloc_failures && + CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + spa_dbgmsg(spa, "%s: skipping metaslab group: " + "vdev %llu, txg %llu, mg %p, psize %llu, " + "asize %llu, failures %llu", spa_name(spa), + mg->mg_vd->vdev_id, txg, mg, psize, asize, + mg->mg_alloc_failures); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); /* @@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. */ - if (msp->ms_weight < size || (was_active && + if (msp->ms_weight < asize || (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK) && activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); @@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight, size) != 0) { + if (metaslab_activate(msp, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) break; + atomic_inc_64(&mg->mg_alloc_failures); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); @@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -1376,7 +1420,8 @@ top: asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + offset = metaslab_group_alloc(mg, psize, asize, txg, distance, + dva, d, flags); if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -1388,18 +1433,24 @@ top: vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. 
*/ mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + (int64_t)mg->mg_aliquot) / 100; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= @@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) error = ENOENT; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 487a76d71..e4e0c35f0 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -1680,6 +1681,12 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) return (0); } +boolean_t +spa_debug_enabled(spa_t *spa) +{ + return (spa->spa_debug); +} + #if defined(_KERNEL) && defined(HAVE_SPL) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0fa823687..0022c64cc 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -79,6 +80,7 @@ int zio_delay_max = ZIO_DELAY_MAX; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif +extern int zfs_mg_alloc_failures; /* * An allocating zio is one that either currently has the DVA allocate @@ -158,6 +160,12 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + /* + * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs + * to fail 3 times per txg or 8 failures, whichever is greater. + */ + zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); + zio_inject_init(); } @@ -2151,6 +2159,7 @@ zio_dva_allocate(zio_t *zio) metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2163,10 +2172,21 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + /* + * The dump device does not support gang blocks so allocation on + * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid + * the "fast" gang feature. + */ + flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; + flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? + METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { + spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; From f5fc4acaa77e2c1782a9495bbf1a39884b4c3940 Mon Sep 17 00:00:00 2001 From: Matt Ahrens Date: Tue, 26 Jul 2011 12:23:00 -0700 Subject: [PATCH 06/10] Illumos #1092: zfs refratio property Add a "REFRATIO" property, which is the compression ratio based on data referenced. For snapshots, this is the same as COMPRESSRATIO, but for filesystems/volumes, the COMPRESSRATIO is based on the data "USED" (ie, includes blocks in children, but not blocks shared with the origin). 
This is needed to figure out how much space a filesystem would use if it were not compressed (ignoring snapshots). Reviewed by: George Wilson Reviewed by: Adam Leventhal Reviewed by: Dan McDonald Reviewed by: Richard Elling Reviewed by: Mark Musante Reviewed by: Garrett D'Amore Approved by: Garrett D'Amore References to Illumos issue and patch: - https://www.illumos.org/issues/1092 - https://github.com/illumos/illumos-gate/commit/187d6ac08a Signed-off-by: Brian Behlendorf Issue #340 --- include/sys/fs/zfs.h | 2 ++ lib/libzfs/libzfs_dataset.c | 2 ++ man/man8/zfs.8 | 17 +++++++++++++++-- module/zcommon/zfs_prop.c | 4 ++++ module/zfs/dsl_dataset.c | 13 ++++++++----- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index a2b68eddf..2ac84f645 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -122,6 +123,7 @@ typedef enum { ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, + ZFS_PROP_REFRATIO, ZFS_NUM_PROPS } zfs_prop_t; diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 5f8847a93..996bae2d7 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -2025,6 +2026,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, } break; + case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 3da2e44a7..26dc6cadc 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -360,7 +360,7 @@ This property can also be referred to by its shortened column name, \fBavail\fR. .ad .sp .6 .RS 4n -The compression ratio achieved for this dataset, expressed as a multiplier. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR. +For non-snapshots, the compression ratio achieved for the \fBused\fR space of this dataset, expressed as a multiplier. The \fBused\fR property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the \fBcompressratio\fR is the same as the \fBrefcompressratio\fR property. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR. .RE .sp @@ -420,6 +420,19 @@ The amount of data that is accessible by this dataset, which may or may not be s This property can also be referred to by its shortened column name, \fBrefer\fR. .RE +.sp +.ne 2 +.mk +.na +\fB\fBrefcompressratio\fR\fR +.ad +.sp .6 +.RS 4n +The compression ratio achieved for the \fBreferenced\fR space of this +dataset, expressed as a multiplier. See also the \fBcompressratio\fR +property. +.RE + .sp .ne 2 .mk @@ -1235,7 +1248,7 @@ Recursively destroy all dependents, including cloned file systems outside the ta Force an unmount of any file systems using the \fBunmount -f\fR command. This option has no effect on non-file systems or unmounted file systems. 
.RE -Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. +Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .RE .sp diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index ce03a498c..9d65e35de 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -311,6 +312,9 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); + zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET, + "<1.00x or higher if compressed>", "REFRATIO"); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index c34ac2a76..26362c95c 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -2153,7 +2154,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { - uint64_t refd, avail, uobjs, aobjs; + uint64_t refd, avail, uobjs, aobjs, ratio; dsl_dir_stats(ds->ds_dir, nv); @@ -2180,6 +2181,11 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : + (ds->ds_phys->ds_uncompressed_bytes * 100 / + ds->ds_phys->ds_compressed_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + if (ds->ds_phys->ds_next_snap_obj) { /* * This is a snapshot; override the dd's space used with @@ -2187,10 +2193,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) */ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); } } From 3e31d2b080b4e6665a93691d171a13d7e29a768a Mon Sep 17 00:00:00 2001 From: Eric Schrock Date: Tue, 26 Jul 2011 12:41:53 -0700 Subject: [PATCH 07/10] Illumos #883: ZIL reuse during remount corruption Moving the zil_free() cleanup to zil_close() prevents this problem from occurring in the first place. There is a very good description of the issue and fix in Illumus #883. 
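As a rough sketch of the failure window (illustrative only, not part of the patch), the remount sequence that the new ztest_zil_remount() case below exercises looks like this; the helper name remount_cycle() is hypothetical, while zil_close(), zil_open(), zil_replay(), ztest_get_data and ztest_replay_vector are the same symbols used in the ztest diff:

    /*
     * Illustrative sketch of one unmount/mount cycle on an open dataset.
     * Before this change zil_close() left the last lwb on zl_lwb_list
     * and relied on zil_free() to reclaim it; a remount never calls
     * zil_free(), so the reopened log could reuse a stale lwb.  With
     * this change zil_close() frees that lwb itself and zil_open() can
     * assert that zl_lwb_list is empty.
     */
    static void
    remount_cycle(objset_t *os, ztest_ds_t *zd)
    {
            /* zfsvfs_teardown(): unmount path */
            zil_close(zd->zd_zilog);

            /* zfsvfs_setup(): mount path */
            VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
            zil_replay(os, zd, ztest_replay_vector);
    }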
Reviewed by: Matt Ahrens Reviewed by: Adam Leventhal Reviewed by: Albert Lee Reviewed by: Gordon Ross Reviewed by: Garrett D'Amore Reivewed by: Dan McDonald Approved by: Gordon Ross References to Illumos issue and patch: - https://www.illumos.org/issues/883 - https://github.com/illumos/illumos-gate/commit/c9ba2a43cb Signed-off-by: Brian Behlendorf Issue #340 --- cmd/ztest/ztest.c | 36 ++++++++++++++++++++++++++++++++++++ module/zfs/zil.c | 40 ++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 235bf56ef..715815859 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -206,6 +206,7 @@ typedef struct ztest_od { */ typedef struct ztest_ds { objset_t *zd_os; + krwlock_t zd_zilog_lock; zilog_t *zd_zilog; uint64_t zd_seq; ztest_od_t *zd_od; /* debugging aid */ @@ -239,6 +240,7 @@ ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; +ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; @@ -274,6 +276,7 @@ ztest_info_t ztest_info[] = { { ztest_zap_parallel, 100, &zopt_always }, { ztest_split_pool, 1, &zopt_always }, { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_zil_remount, 1, &zopt_sometimes }, { ztest_dmu_read_write_zcopy, 1, &zopt_often }, { ztest_dmu_objset_create_destroy, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_often }, @@ -1007,6 +1010,7 @@ ztest_zd_init(ztest_ds_t *zd, objset_t *os) dmu_objset_name(os, zd->zd_name); int l; + rw_init(&zd->zd_zilog_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) @@ -1022,6 +1026,7 @@ ztest_zd_fini(ztest_ds_t *zd) int l; mutex_destroy(&zd->zd_dirobj_lock); + rw_destroy(&zd->zd_zilog_lock); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); @@ -1993,6 +1998,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; + (void) rw_enter(&zd->zd_zilog_lock, RW_READER); + switch (io_type) { case ZTEST_IO_WRITE_TAG: @@ -2030,6 +2037,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) break; } + (void) rw_exit(&zd->zd_zilog_lock); + umem_free(data, blocksize); } @@ -2084,6 +2093,8 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { zilog_t *zilog = zd->zd_zilog; + (void) rw_enter(&zd->zd_zilog_lock, RW_READER); + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* @@ -2095,6 +2106,31 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); zd->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); + + (void) rw_exit(&zd->zd_zilog_lock); +} + +/* + * This function is designed to simulate the operations that occur during a + * mount/unmount operation. We hold the dataset across these operations in an + * attempt to expose any implicit assumptions about ZIL management. 
+ */ +/* ARGSUSED */ +void +ztest_zil_remount(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + + (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER); + + /* zfsvfs_teardown() */ + zil_close(zd->zd_zilog); + + /* zfsvfs_setup() */ + VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); + zil_replay(os, zd, ztest_replay_vector); + + (void) rw_exit(&zd->zd_zilog_lock); } /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8aa811db2..5296b38be 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -562,7 +563,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); - ASSERT(!keep_first); + VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) @@ -1665,21 +1666,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) void zil_free(zilog_t *zilog) { - lwb_t *head_lwb; int i; zilog->zl_stop_sync = 1; - /* - * After zil_close() there should only be one lwb with a buffer. - */ - head_lwb = list_head(&zilog->zl_lwb_list); - if (head_lwb) { - ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list)); - list_remove(&zilog->zl_lwb_list, head_lwb); - zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz); - kmem_cache_free(zil_lwb_cache, head_lwb); - } + ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); @@ -1719,6 +1710,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); + ASSERT(zilog->zl_clean_taskq == NULL); + ASSERT(zilog->zl_get_data == NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + zilog->zl_get_data = get_data; zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 2, 2, TASKQ_PREPOPULATE); @@ -1732,7 +1727,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data) void zil_close(zilog_t *zilog) { - lwb_t *tail_lwb; + lwb_t *lwb; uint64_t txg = 0; zil_commit(zilog, 0); /* commit all itx */ @@ -1744,9 +1739,9 @@ zil_close(zilog_t *zilog) * destroy the zl_clean_taskq. */ mutex_enter(&zilog->zl_lock); - tail_lwb = list_tail(&zilog->zl_lwb_list); - if (tail_lwb != NULL) - txg = tail_lwb->lwb_max_txg; + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb != NULL) + txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); @@ -1754,6 +1749,19 @@ zil_close(zilog_t *zilog) taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; + + /* + * We should have only one LWB left on the list; remove it now. + */ + mutex_enter(&zilog->zl_lock); + lwb = list_head(&zilog->zl_lwb_list); + if (lwb != NULL) { + ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); + list_remove(&zilog->zl_lwb_list, lwb); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + kmem_cache_free(zil_lwb_cache, lwb); + } + mutex_exit(&zilog->zl_lock); } /* From ca5252204aa25f81e9f19084917e0a46fdd470b0 Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Tue, 26 Jul 2011 13:08:02 -0700 Subject: [PATCH 08/10] Illumos #1043: Recursive zfs snapshot destroy fails Prior to revision 11314 if a user was recursively destroying snapshots of a dataset the target dataset was not required to exist. 
The zfs_secpolicy_destroy_snaps() function introduced the security check on the target dataset, so since then if the target dataset does not exist, the recursive destroy is not performed. Before 11314, only a delete permission check on the snapshot's master dataset was performed. Steps to reproduce: zfs create pool/a zfs snapshot pool/a@s1 zfs destroy -r pool@s1 Therefore I suggest to fallback to the old security check, if the target snapshot does not exist and continue with the destroy. References to Illumos issue and patch: - https://www.illumos.org/issues/1043 - https://www.illumos.org/attachments/217/recursive_dataset_destroy.patch Signed-off-by: Brian Behlendorf Issue #340 --- module/zfs/zfs_ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 693ffc0c8..088c64b27 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -701,6 +701,9 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) * and destroying snapshots requires descendent permissions, a successfull * check of the top level snapshot applies to snapshots of all descendent * datasets as well. + * + * The target snapshot may not exist when doing a recursive destroy. + * In this case fallback to permissions of the parent dataset. */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) @@ -711,6 +714,8 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); error = zfs_secpolicy_destroy_perms(dsname, cr); + if (error == ENOENT) + error = zfs_secpolicy_destroy_perms(zc->zc_name, cr); strfree(dsname); return (error); From 0b7936d5c2337bc976ac831c1c38de563844c36b Mon Sep 17 00:00:00 2001 From: Alexander Stetsenko Date: Tue, 26 Jul 2011 15:44:36 -0700 Subject: [PATCH 09/10] Illumos #278: get rid zfs of python and pyzfs dependencies Remove all python and pyzfs dependencies for consistency and to ensure full functionality even in a mimimalist environment. 
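For reference, a minimal sketch (illustrative only, not part of the patch) of the libzfs callback interface the new C implementation of 'zfs userspace' is built on; print_cb(), the "tank/home" dataset name and the surrounding snippet are hypothetical, while zfs_open(), zfs_userspace(), zfs_nicenum() and ZFS_PROP_USERUSED are the interfaces used by the zfs_main.c changes below:

    /* ARGSUSED */
    static int
    print_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
    {
            char buf[32];

            /* space is raw bytes; format it the way the real command does */
            zfs_nicenum(space, buf, sizeof (buf));
            (void) printf("%s%s%u\t%s\n",
                (domain == NULL) ? "" : domain,
                (domain == NULL || domain[0] == '\0') ? "" : "-",
                (unsigned int)rid, buf);
            return (0);
    }

            /* e.g. in a command handler, after libzfs_init() */
            zfs_handle_t *zhp = zfs_open(g_zfs, "tank/home", ZFS_TYPE_DATASET);
            if (zhp != NULL) {
                    (void) zfs_userspace(zhp, ZFS_PROP_USERUSED, print_cb, NULL);
                    zfs_close(zhp);
            }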
Reviewed by: gordon.w.ross@gmail.com Reviewed by: trisk@opensolaris.org Reviewed by: alexander.r.eremin@gmail.com Reviewed by: jerry.jelinek@joyent.com Approved by: garrett@nexenta.com References to Illumos issue and patch: - https://www.illumos.org/issues/278 - https://github.com/illumos/illumos-gate/commit/1af68beac3 Signed-off-by: Brian Behlendorf Issue #340 Issue #160 Signed-off-by: Brian Behlendorf --- cmd/zfs/zfs_main.c | 2273 ++++++++++++++++++++++++++++++++++- include/libzfs.h | 9 +- include/zfs_deleg.h | 2 + lib/libzfs/libzfs_dataset.c | 189 +++ module/zcommon/zfs_deleg.c | 3 +- 5 files changed, 2441 insertions(+), 35 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 69a6736a6..54d057b1e 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -52,7 +53,13 @@ #include #include +#include +#include #include +#ifdef HAVE_IDMAP +#include +#include +#endif /* HAVE_IDMAP */ #include "zfs_iter.h" #include "zfs_util.h" @@ -62,7 +69,6 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; -const char *pypath = "/usr/lib/zfs/pyzfs.py"; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -83,8 +89,10 @@ static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); -static int zfs_do_python(int argc, char **argv); +static int zfs_do_allow(int argc, char **argv); +static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); +static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); @@ -177,12 +185,12 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_python, HELP_ALLOW }, + { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_python, HELP_UNALLOW }, + { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, - { "holds", zfs_do_python, HELP_HOLDS }, + { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, }; @@ -1825,71 +1833,731 @@ zfs_do_upgrade(int argc, char **argv) return (ret); } +#define USTYPE_USR_BIT (0) +#define USTYPE_GRP_BIT (1) +#define USTYPE_PSX_BIT (2) +#define USTYPE_SMB_BIT (3) + +#define USTYPE_USR (1 << USTYPE_USR_BIT) +#define USTYPE_GRP (1 << USTYPE_GRP_BIT) + +#define USTYPE_PSX (1 << USTYPE_PSX_BIT) +#define USTYPE_SMB (1 << USTYPE_SMB_BIT) + +#define USTYPE_PSX_USR (USTYPE_PSX | USTYPE_USR) +#define USTYPE_SMB_USR (USTYPE_SMB | USTYPE_USR) +#define USTYPE_PSX_GRP (USTYPE_PSX | USTYPE_GRP) +#define USTYPE_SMB_GRP (USTYPE_SMB | USTYPE_GRP) +#define USTYPE_ALL (USTYPE_PSX_USR | USTYPE_SMB_USR \ + | USTYPE_PSX_GRP | USTYPE_SMB_GRP) + + +#define USPROP_USED_BIT (0) +#define USPROP_QUOTA_BIT (1) + +#define USPROP_USED (1 << USPROP_USED_BIT) +#define USPROP_QUOTA (1 << USPROP_QUOTA_BIT) + +typedef struct us_node { + nvlist_t *usn_nvl; + uu_avl_node_t usn_avlnode; + uu_list_node_t usn_listnode; +} us_node_t; + +typedef struct us_cbdata { + nvlist_t **cb_nvlp; + uu_avl_pool_t *cb_avl_pool; + uu_avl_t *cb_avl; + boolean_t cb_numname; + boolean_t cb_nicenum; + boolean_t cb_sid2posix; + zfs_userquota_prop_t cb_prop; 
+ zfs_sort_column_t *cb_sortcol; + size_t cb_max_typelen; + size_t cb_max_namelen; + size_t cb_max_usedlen; + size_t cb_max_quotalen; +} us_cbdata_t; + +typedef struct { + zfs_sort_column_t *si_sortcol; + boolean_t si_num_name; + boolean_t si_parsable; +} us_sort_info_t; + +static int +us_compare(const void *larg, const void *rarg, void *unused) +{ + const us_node_t *l = larg; + const us_node_t *r = rarg; + int rc = 0; + us_sort_info_t *si = (us_sort_info_t *)unused; + zfs_sort_column_t *sortcol = si->si_sortcol; + boolean_t num_name = si->si_num_name; + nvlist_t *lnvl = l->usn_nvl; + nvlist_t *rnvl = r->usn_nvl; + + for (; sortcol != NULL; sortcol = sortcol->sc_next) { + char *lvstr = ""; + char *rvstr = ""; + uint32_t lv32 = 0; + uint32_t rv32 = 0; + uint64_t lv64 = 0; + uint64_t rv64 = 0; + zfs_prop_t prop = sortcol->sc_prop; + const char *propname = NULL; + boolean_t reverse = sortcol->sc_reverse; + + switch (prop) { + case ZFS_PROP_TYPE: + propname = "type"; + (void) nvlist_lookup_uint32(lnvl, propname, &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, &rv32); + if (rv32 != lv32) + rc = (rv32 > lv32) ? 1 : -1; + break; + case ZFS_PROP_NAME: + propname = "name"; + if (num_name) { + (void) nvlist_lookup_uint32(lnvl, propname, + &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, + &rv32); + if (rv32 != lv32) + rc = (rv32 > lv32) ? 1 : -1; + } else { + (void) nvlist_lookup_string(lnvl, propname, + &lvstr); + (void) nvlist_lookup_string(rnvl, propname, + &rvstr); + rc = strcmp(lvstr, rvstr); + } + break; + + case ZFS_PROP_USED: + case ZFS_PROP_QUOTA: + if (ZFS_PROP_USED == prop) + propname = "used"; + else + propname = "quota"; + (void) nvlist_lookup_uint64(lnvl, propname, &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, &rv64); + if (rv64 != lv64) + rc = (rv64 > lv64) ? 1 : -1; + default: + break; + } + + if (rc) { + if (rc < 0) + return (reverse ? 1 : -1); + else + return (reverse ? 
-1 : 1); + } + } + + return (rc); +} + +static inline const char * +us_type2str(unsigned field_type) +{ + switch (field_type) { + case USTYPE_PSX_USR: + return ("POSIX User"); + case USTYPE_PSX_GRP: + return ("POSIX Group"); + case USTYPE_SMB_USR: + return ("SMB User"); + case USTYPE_SMB_GRP: + return ("SMB Group"); + default: + return ("Undefined"); + } +} + /* * zfs userspace */ static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { - zfs_userquota_prop_t *typep = arg; - zfs_userquota_prop_t p = *typep; + us_cbdata_t *cb = (us_cbdata_t *)arg; + zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; - char *ug, *propname; + char *propname; char namebuf[32]; char sizebuf[32]; + us_node_t *node; + uu_avl_pool_t *avl_pool = cb->cb_avl_pool; + uu_avl_t *avl = cb->cb_avl; + uu_avl_index_t idx; + nvlist_t *props; + us_node_t *n; + zfs_sort_column_t *sortcol = cb->cb_sortcol; + unsigned type; + const char *typestr; + size_t namelen; + size_t typelen; + size_t sizelen; + us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; if (domain == NULL || domain[0] == '\0') { - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { + /* POSIX */ + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_PSX_GRP; struct group *g = getgrgid(rid); if (g) name = g->gr_name; } else { + type = USTYPE_PSX_USR; struct passwd *p = getpwuid(rid); if (p) name = p->pw_name; } + } else { +#ifdef HAVE_IDMAP + char sid[ZFS_MAXNAMELEN+32]; + uid_t id; + uint64_t classes; + int err; + directory_error_t e; + + (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); + /* SMB */ + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_SMB_GRP; + err = sid_to_id(sid, B_FALSE, &id); + } else { + type = USTYPE_SMB_USR; + err = sid_to_id(sid, B_TRUE, &id); + } + + if (err == 0) { + rid = id; + + e = directory_name_from_sid(NULL, sid, &name, &classes); + if (e != NULL) { + directory_error_free(e); + return (NULL); + } + + if (name == NULL) + name = sid; + } +#else + return (-1); +#endif /* HAVE_IDMAP */ } - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) - ug = "group"; - else - ug = "user"; +/* + * if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) + * ug = "group"; + * else + * ug = "user"; + */ - if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) + if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) propname = "used"; else propname = "quota"; - if (name == NULL) { - (void) snprintf(namebuf, sizeof (namebuf), - "%llu", (longlong_t)rid); + (void) snprintf(namebuf, sizeof (namebuf), "%u", rid); + if (name == NULL) name = namebuf; - } - zfs_nicenum(space, sizebuf, sizeof (sizebuf)); - (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, - domain[0] ? 
'-' : ' ', name, sizebuf); + if (cb->cb_nicenum) + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + else + (void) sprintf(sizebuf, "%llu", (u_longlong_t)space); + + node = safe_malloc(sizeof (us_node_t)); + uu_avl_node_init(node, &node->usn_avlnode, avl_pool); + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { + free(node); + return (-1); + } + + if (nvlist_add_uint32(props, "type", type) != 0) + nomem(); + + if (cb->cb_numname) { + if (nvlist_add_uint32(props, "name", rid) != 0) + nomem(); + namelen = strlen(namebuf); + } else { + if (nvlist_add_string(props, "name", name) != 0) + nomem(); + namelen = strlen(name); + } + + typestr = us_type2str(type); + typelen = strlen(gettext(typestr)); + if (typelen > cb->cb_max_typelen) + cb->cb_max_typelen = typelen; + + if (namelen > cb->cb_max_namelen) + cb->cb_max_namelen = namelen; + + sizelen = strlen(sizebuf); + if (0 == strcmp(propname, "used")) { + if (sizelen > cb->cb_max_usedlen) + cb->cb_max_usedlen = sizelen; + } else { + if (sizelen > cb->cb_max_quotalen) + cb->cb_max_quotalen = sizelen; + } + + node->usn_nvl = props; + + n = uu_avl_find(avl, node, &sortinfo, &idx); + if (n == NULL) + uu_avl_insert(avl, node, idx); + else { + nvlist_free(props); + free(node); + node = n; + props = node->usn_nvl; + } + + if (nvlist_add_uint64(props, propname, space) != 0) + nomem(); return (0); } +static inline boolean_t +usprop_check(zfs_userquota_prop_t p, unsigned types, unsigned props) +{ + unsigned type; + unsigned prop; + + switch (p) { + case ZFS_PROP_USERUSED: + type = USTYPE_USR; + prop = USPROP_USED; + break; + case ZFS_PROP_USERQUOTA: + type = USTYPE_USR; + prop = USPROP_QUOTA; + break; + case ZFS_PROP_GROUPUSED: + type = USTYPE_GRP; + prop = USPROP_USED; + break; + case ZFS_PROP_GROUPQUOTA: + type = USTYPE_GRP; + prop = USPROP_QUOTA; + break; + default: /* ALL */ + return (B_TRUE); + }; + + return (type & types && prop & props); +} + +#define USFIELD_TYPE (1 << 0) +#define USFIELD_NAME (1 << 1) +#define USFIELD_USED (1 << 2) +#define USFIELD_QUOTA (1 << 3) +#define USFIELD_ALL (USFIELD_TYPE | USFIELD_NAME | USFIELD_USED | USFIELD_QUOTA) + +static int +parsefields(unsigned *fieldsp, char **names, unsigned *bits, size_t len) +{ + char *field = optarg; + char *delim; + + do { + int i; + boolean_t found = B_FALSE; + delim = strchr(field, ','); + if (delim != NULL) + *delim = '\0'; + + for (i = 0; i < len; i++) + if (0 == strcmp(field, names[i])) { + found = B_TRUE; + *fieldsp |= bits[i]; + break; + } + + if (!found) { + (void) fprintf(stderr, gettext("invalid type '%s'" + "for -t option\n"), field); + return (-1); + } + + field = delim + 1; + } while (delim); + + return (0); +} + + +static char *type_names[] = { "posixuser", "smbuser", "posixgroup", "smbgroup", + "all" }; +static unsigned type_bits[] = { + USTYPE_PSX_USR, + USTYPE_SMB_USR, + USTYPE_PSX_GRP, + USTYPE_SMB_GRP, + USTYPE_ALL +}; + +static char *us_field_names[] = { "type", "name", "used", "quota" }; +static unsigned us_field_bits[] = { + USFIELD_TYPE, + USFIELD_NAME, + USFIELD_USED, + USFIELD_QUOTA +}; + +static void +print_us_node(boolean_t scripted, boolean_t parseable, unsigned fields, + size_t type_width, size_t name_width, size_t used_width, + size_t quota_width, us_node_t *node) +{ + nvlist_t *nvl = node->usn_nvl; + nvpair_t *nvp = NULL; + char valstr[ZFS_MAXNAMELEN]; + boolean_t first = B_TRUE; + boolean_t quota_found = B_FALSE; + + if (fields & USFIELD_QUOTA && !nvlist_exists(nvl, "quota")) + if (nvlist_add_string(nvl, "quota", "none") != 0) + nomem(); + + while ((nvp = 
nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *pname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uint32_t val32 = 0; + uint64_t val64 = 0; + char *strval = NULL; + unsigned field = 0; + unsigned width = 0; + int i; + for (i = 0; i < 4; i++) { + if (0 == strcmp(pname, us_field_names[i])) { + field = us_field_bits[i]; + break; + } + } + + if (!(field & fields)) + continue; + + switch (type) { + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &val32); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &val64); + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &strval); + break; + default: + (void) fprintf(stderr, "Invalid data type\n"); + } + + if (!first) { + if (scripted) + (void) printf("\t"); + else + (void) printf(" "); + } + + switch (field) { + case USFIELD_TYPE: + strval = (char *)us_type2str(val32); + width = type_width; + break; + case USFIELD_NAME: + if (type == DATA_TYPE_UINT64) { + (void) sprintf(valstr, "%llu", + (u_longlong_t) val64); + strval = valstr; + } + width = name_width; + break; + case USFIELD_USED: + case USFIELD_QUOTA: + if (type == DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(nvp, &val64); + if (parseable) + (void) sprintf(valstr, "%llu", + (u_longlong_t) val64); + else + zfs_nicenum(val64, valstr, + sizeof (valstr)); + strval = valstr; + } + + if (field == USFIELD_USED) + width = used_width; + else { + quota_found = B_FALSE; + width = quota_width; + } + + break; + } + + if (field == USFIELD_QUOTA && !quota_found) + (void) printf("%*s", width, strval); + else { + if (type == DATA_TYPE_STRING) + (void) printf("%-*s", width, strval); + else + (void) printf("%*s", width, strval); + } + + first = B_FALSE; + + } + + (void) printf("\n"); +} + +static void +print_us(boolean_t scripted, boolean_t parsable, unsigned fields, + unsigned type_width, unsigned name_width, unsigned used_width, + unsigned quota_width, boolean_t rmnode, uu_avl_t *avl) +{ + static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" }; + us_node_t *node; + const char *col; + int i; + int width[4] = { type_width, name_width, used_width, quota_width }; + + if (!scripted) { + boolean_t first = B_TRUE; + for (i = 0; i < 4; i++) { + unsigned field = us_field_bits[i]; + if (!(field & fields)) + continue; + + col = gettext(us_field_hdr[i]); + if (field == USFIELD_TYPE || field == USFIELD_NAME) + (void) printf(first?"%-*s":" %-*s", width[i], + col); + else + (void) printf(first?"%*s":" %*s", width[i], + col); + first = B_FALSE; + } + (void) printf("\n"); + } + + for (node = uu_avl_first(avl); node != NULL; + node = uu_avl_next(avl, node)) { + print_us_node(scripted, parsable, fields, type_width, + name_width, used_width, used_width, node); + if (rmnode) + nvlist_free(node->usn_nvl); + } +} + static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; + uu_avl_pool_t *avl_pool; + uu_avl_t *avl_tree; + uu_avl_walk_t *walk; + + char *cmd; + boolean_t scripted = B_FALSE; + boolean_t prtnum = B_FALSE; + boolean_t parseable = B_FALSE; + boolean_t sid2posix = B_FALSE; int error; + int c; + zfs_sort_column_t *default_sortcol = NULL; + zfs_sort_column_t *sortcol = NULL; + unsigned types = USTYPE_PSX_USR | USTYPE_SMB_USR; + unsigned fields = 0; + unsigned props = USPROP_USED | USPROP_QUOTA; + us_cbdata_t cb; + us_node_t *node; + boolean_t resort_avl = B_FALSE; + + if (argc < 2) + usage(B_FALSE); + + cmd = argv[0]; + if (0 == strcmp(cmd, "groupspace")) + /* toggle default group types */ + types = USTYPE_PSX_GRP | 
USTYPE_SMB_GRP; + + /* check options */ + while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { + switch (c) { + case 'n': + prtnum = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'p': + parseable = B_TRUE; + break; + case 'o': + if (parsefields(&fields, us_field_names, us_field_bits, + 4) != 0) + return (1); + break; + case 's': + if (zfs_add_sort_column(&sortcol, optarg, + B_FALSE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 'S': + if (zfs_add_sort_column(&sortcol, optarg, + B_TRUE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 't': + if (parsefields(&types, type_names, type_bits, 5)) + return (1); + break; + case 'i': + sid2posix = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* ok, now we have sorted by default colums (type,name) avl tree */ + if (sortcol) { + zfs_sort_column_t *sc; + for (sc = sortcol; sc; sc = sc->sc_next) { + if (sc->sc_prop == ZFS_PROP_QUOTA) { + resort_avl = B_TRUE; + break; + } + } + } + + if (!fields) + fields = USFIELD_ALL; if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) return (1); - (void) printf("PROP TYPE NAME VALUE\n"); + if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), + offsetof(us_node_t, usn_avlnode), + us_compare, UU_DEFAULT)) == NULL) + nomem(); + if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + if (sortcol && !resort_avl) + cb.cb_sortcol = sortcol; + else { + (void) zfs_add_sort_column(&default_sortcol, "type", B_FALSE); + (void) zfs_add_sort_column(&default_sortcol, "name", B_FALSE); + cb.cb_sortcol = default_sortcol; + } + cb.cb_numname = prtnum; + cb.cb_nicenum = !parseable; + cb.cb_avl_pool = avl_pool; + cb.cb_avl = avl_tree; + cb.cb_sid2posix = sid2posix; + cb.cb_max_typelen = strlen(gettext("TYPE")); + cb.cb_max_namelen = strlen(gettext("NAME")); + cb.cb_max_usedlen = strlen(gettext("USED")); + cb.cb_max_quotalen = strlen(gettext("QUOTA")); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { - error = zfs_userspace(zhp, p, userspace_cb, &p); + if (!usprop_check(p, types, props)) + continue; + + cb.cb_prop = p; + error = zfs_userspace(zhp, p, userspace_cb, &cb); if (error) break; } + + + if (resort_avl) { + us_node_t *node; + us_node_t *rmnode; + uu_list_pool_t *listpool; + uu_list_t *list; + uu_avl_index_t idx = 0; + uu_list_index_t idx2 = 0; + listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), + offsetof(us_node_t, usn_listnode), NULL, + UU_DEFAULT); + list = uu_list_create(listpool, NULL, UU_DEFAULT); + + node = uu_avl_first(avl_tree); + uu_list_node_init(node, &node->usn_listnode, listpool); + while (node != NULL) { + rmnode = node; + node = uu_avl_next(avl_tree, node); + uu_avl_remove(avl_tree, rmnode); + if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) { + uu_list_insert(list, rmnode, idx2); + } + } + + for (node = uu_list_first(list); node != NULL; + node = uu_list_next(list, node)) { + us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; + if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == + NULL) + uu_avl_insert(avl_tree, node, idx); + } + + uu_list_destroy(list); + } + + /* print & free node`s nvlist memory */ + print_us(scripted, 
parseable, fields, cb.cb_max_typelen, + cb.cb_max_namelen, cb.cb_max_usedlen, + cb.cb_max_quotalen, B_TRUE, cb.cb_avl); + + if (sortcol) + zfs_free_sort_columns(sortcol); + zfs_free_sort_columns(default_sortcol); + + /* + * Finally, clean up the AVL tree. + */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(avl_tree); + uu_avl_pool_destroy(avl_pool); + return (error); } @@ -2821,6 +3489,1362 @@ zfs_do_receive(int argc, char **argv) return (err != 0); } +/* + * allow/unallow stuff + */ +/* copied from zfs/sys/dsl_deleg.h */ +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" + +#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE + +static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { + { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, + { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, + { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, + { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, + { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, + { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, + { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, + { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, + { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, + { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, + { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, + { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, + { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, + + { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + { NULL, ZFS_DELEG_NOTE_NONE } +}; + +/* permission structure */ +typedef struct deleg_perm { + zfs_deleg_who_type_t dp_who_type; + const char *dp_name; + boolean_t dp_local; + boolean_t dp_descend; +} deleg_perm_t; + +/* */ +typedef struct deleg_perm_node { + deleg_perm_t dpn_perm; + + uu_avl_node_t dpn_avl_node; +} deleg_perm_node_t; + +typedef struct fs_perm fs_perm_t; + +/* permissions set */ +typedef struct who_perm { + zfs_deleg_who_type_t who_type; + const char *who_name; /* id */ + char who_ug_name[256]; /* user/group name */ + fs_perm_t *who_fsperm; /* uplink */ + + uu_avl_t *who_deleg_perm_avl; /* permissions */ +} who_perm_t; + +/* */ +typedef struct who_perm_node { + who_perm_t who_perm; + uu_avl_node_t who_avl_node; +} who_perm_node_t; + +typedef struct fs_perm_set 
fs_perm_set_t; +/* fs permissions */ +struct fs_perm { + const char *fsp_name; + + uu_avl_t *fsp_sc_avl; /* sets,create */ + uu_avl_t *fsp_uge_avl; /* user,group,everyone */ + + fs_perm_set_t *fsp_set; /* uplink */ +}; + +/* */ +typedef struct fs_perm_node { + fs_perm_t fspn_fsperm; + uu_avl_t *fspn_avl; + + uu_list_node_t fspn_list_node; +} fs_perm_node_t; + +/* top level structure */ +struct fs_perm_set { + uu_list_pool_t *fsps_list_pool; + uu_list_t *fsps_list; /* list of fs_perms */ + + uu_avl_pool_t *fsps_named_set_avl_pool; + uu_avl_pool_t *fsps_who_perm_avl_pool; + uu_avl_pool_t *fsps_deleg_perm_avl_pool; +}; + +static inline const char * +deleg_perm_type(zfs_deleg_note_t note) +{ + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + case ZFS_DELEG_NOTE_GROUPUSED: + case ZFS_DELEG_NOTE_USERPROP: + case ZFS_DELEG_NOTE_USERQUOTA: + case ZFS_DELEG_NOTE_USERUSED: + /* other */ + return (gettext("other")); + default: + return (gettext("subcommand")); + } +} + +static int inline +who_type2weight(zfs_deleg_who_type_t who_type) +{ + int res; + switch (who_type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + res = 0; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + res = 1; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + res = 2; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + res = 3; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + res = 4; + break; + default: + res = -1; + } + + return (res); +} + +/* ARGSUSED */ +static int +who_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const who_perm_node_t *l = larg; + const who_perm_node_t *r = rarg; + zfs_deleg_who_type_t ltype = l->who_perm.who_type; + zfs_deleg_who_type_t rtype = r->who_perm.who_type; + int lweight = who_type2weight(ltype); + int rweight = who_type2weight(rtype); + int res = lweight - rweight; + if (res == 0) + res = strncmp(l->who_perm.who_name, r->who_perm.who_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + if (res > 0) + return (1); + else + return (-1); +} + +/* ARGSUSED */ +static int +deleg_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const deleg_perm_node_t *l = larg; + const deleg_perm_node_t *r = rarg; + int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + + if (res > 0) + return (1); + else + return (-1); +} + +static inline void +fs_perm_set_init(fs_perm_set_t *fspset) +{ + bzero(fspset, sizeof (fs_perm_set_t)); + + if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", + sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), + NULL, UU_DEFAULT)) == NULL) + nomem(); + if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( + "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( + "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( + "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( + deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) + == NULL) + nomem(); +} + +static inline void 
fs_perm_fini(fs_perm_t *); +static inline void who_perm_fini(who_perm_t *); + +static inline void +fs_perm_set_fini(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = uu_list_first(fspset->fsps_list); + + while (node != NULL) { + fs_perm_node_t *next_node = + uu_list_next(fspset->fsps_list, node); + fs_perm_t *fsperm = &node->fspn_fsperm; + fs_perm_fini(fsperm); + uu_list_remove(fspset->fsps_list, node); + free(node); + node = next_node; + } + + uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); + uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); + uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); +} + +static inline void +deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, + const char *name) +{ + deleg_perm->dp_who_type = type; + deleg_perm->dp_name = name; +} + +static inline void +who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, + zfs_deleg_who_type_t type, const char *name) +{ + uu_avl_pool_t *pool; + pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; + + bzero(who_perm, sizeof (who_perm_t)); + + if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + who_perm->who_type = type; + who_perm->who_name = name; + who_perm->who_fsperm = fsperm; +} + +static inline void +who_perm_fini(who_perm_t *who_perm) +{ + deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); + + while (node != NULL) { + deleg_perm_node_t *next_node = + uu_avl_next(who_perm->who_deleg_perm_avl, node); + + uu_avl_remove(who_perm->who_deleg_perm_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(who_perm->who_deleg_perm_avl); +} + +static inline void +fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) +{ + uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; + uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; + + bzero(fsperm, sizeof (fs_perm_t)); + + if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + fsperm->fsp_set = fspset; + fsperm->fsp_name = fsname; +} + +static inline void +fs_perm_fini(fs_perm_t *fsperm) +{ + who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_sc_avl, node); + free(node); + node = next_node; + } + + node = uu_avl_first(fsperm->fsp_uge_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_uge_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(fsperm->fsp_sc_avl); + uu_avl_destroy(fsperm->fsp_uge_avl); +} + +static void inline +set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, + zfs_deleg_who_type_t who_type, const char *name, char locality) +{ + uu_avl_index_t idx = 0; + + deleg_perm_node_t *found_node = NULL; + deleg_perm_t *deleg_perm = &node->dpn_perm; + + deleg_perm_init(deleg_perm, who_type, name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) + uu_avl_insert(avl, node, idx); + else { + node = found_node; + deleg_perm = &node->dpn_perm; + } + + + switch (locality) { + case ZFS_DELEG_LOCAL: + deleg_perm->dp_local = B_TRUE; + break; + case ZFS_DELEG_DESCENDENT: + deleg_perm->dp_descend = B_TRUE; + break; + case 
ZFS_DELEG_NA: + break; + default: + assert(B_FALSE); /* invalid locality */ + } +} + +static inline int +parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; + uu_avl_t *avl = who_perm->who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_perm->who_type; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; + deleg_perm_node_t *node = + safe_malloc(sizeof (deleg_perm_node_t)); + + VERIFY(type == DATA_TYPE_BOOLEAN); + + uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); + set_deleg_perm_node(avl, node, who_type, name, locality); + } + + return (0); +} + +static inline int +parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = fsperm->fsp_set; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *name = nvpair_name(nvp); + uu_avl_t *avl = NULL; + uu_avl_pool_t *avl_pool = NULL; + zfs_deleg_who_type_t perm_type = name[0]; + char perm_locality = name[1]; + const char *perm_name = name + 3; + boolean_t is_set = B_TRUE; + who_perm_t *who_perm = NULL; + + assert('$' == name[2]); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + switch (perm_type) { + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + avl_pool = fspset->fsps_named_set_avl_pool; + avl = fsperm->fsp_sc_avl; + break; + case ZFS_DELEG_USER: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + avl_pool = fspset->fsps_who_perm_avl_pool; + avl = fsperm->fsp_uge_avl; + break; + default: + break; + } + + if (is_set) { + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof (who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; + + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; + + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; + default: + break; + } + + if (nice_name != NULL) + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; + } + } + + (void) parse_who_perm(who_perm, nvl2, perm_locality); + } + + return (0); +} + +static inline int +parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + uu_avl_index_t idx = 0; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *fsname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + fs_perm_t *fsperm = NULL; + fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); + if (node == NULL) + nomem(); + + fsperm = &node->fspn_fsperm; + + VERIFY(DATA_TYPE_NVLIST == type); + + uu_list_node_init(node, &node->fspn_list_node, + 
fspset->fsps_list_pool); + + idx = uu_list_numnodes(fspset->fsps_list); + fs_perm_init(fsperm, fspset, fsname); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + (void) parse_fs_perm(fsperm, nvl2); + + uu_list_insert(fspset->fsps_list, node, idx); + } + + return (0); +} + +static inline const char * +deleg_perm_comment(zfs_deleg_note_t note) +{ + const char *str = ""; + + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + case ZFS_DELEG_NOTE_ALLOW: + str = gettext("Must also have the permission that is being" + "\n\t\t\t\tallowed"); + break; + case ZFS_DELEG_NOTE_CLONE: + str = gettext("Must also have the 'create' ability and 'mount'" + "\n\t\t\t\tability in the origin file system"); + break; + case ZFS_DELEG_NOTE_CREATE: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DESTROY: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DIFF: + str = gettext("Allows lookup of paths within a dataset;" + "\n\t\t\t\tgiven an object number. Ordinary users need this" + "\n\t\t\t\tin order to use zfs diff"); + break; + case ZFS_DELEG_NOTE_HOLD: + str = gettext("Allows adding a user hold to a snapshot"); + break; + case ZFS_DELEG_NOTE_MOUNT: + str = gettext("Allows mount/umount of ZFS datasets"); + break; + case ZFS_DELEG_NOTE_PROMOTE: + str = gettext("Must also have the 'mount'\n\t\t\t\tand" + " 'promote' ability in the origin file system"); + break; + case ZFS_DELEG_NOTE_RECEIVE: + str = gettext("Must also have the 'mount' and 'create'" + " ability"); + break; + case ZFS_DELEG_NOTE_RELEASE: + str = gettext("Allows releasing a user hold which\n\t\t\t\t" + "might destroy the snapshot"); + break; + case ZFS_DELEG_NOTE_RENAME: + str = gettext("Must also have the 'mount' and 'create'" + "\n\t\t\t\tability in the new parent"); + break; + case ZFS_DELEG_NOTE_ROLLBACK: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SEND: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SHARE: + str = gettext("Allows sharing file systems over NFS or SMB" + "\n\t\t\t\tprotocols"); + break; + case ZFS_DELEG_NOTE_SNAPSHOT: + str = gettext(""); + break; +/* + * case ZFS_DELEG_NOTE_VSCAN: + * str = gettext(""); + * break; + */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + str = gettext("Allows accessing any groupquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPUSED: + str = gettext("Allows reading any groupused@... property"); + break; + case ZFS_DELEG_NOTE_USERPROP: + str = gettext("Allows changing any user property"); + break; + case ZFS_DELEG_NOTE_USERQUOTA: + str = gettext("Allows accessing any userquota@... property"); + break; + case ZFS_DELEG_NOTE_USERUSED: + str = gettext("Allows reading any userused@... 
property"); + break; + /* other */ + default: + str = ""; + } + + return (str); +} + +struct allow_opts { + boolean_t local; + boolean_t descend; + boolean_t user; + boolean_t group; + boolean_t everyone; + boolean_t create; + boolean_t set; + boolean_t recursive; /* unallow only */ + boolean_t prt_usage; + + boolean_t prt_perms; + char *who; + char *perms; + const char *dataset; +}; + +static inline int +prop_cmp(const void *a, const void *b) +{ + const char *str1 = *(const char **)a; + const char *str2 = *(const char **)b; + return (strcmp(str1, str2)); +} + +static void +allow_usage(boolean_t un, boolean_t requested, const char *msg) +{ + const char *opt_desc[] = { + "-h", gettext("show this help message and exit"), + "-l", gettext("set permission locally"), + "-d", gettext("set permission for descents"), + "-u", gettext("set permission for user"), + "-g", gettext("set permission for group"), + "-e", gettext("set permission for everyone"), + "-c", gettext("set create time permission"), + "-s", gettext("define permission set"), + /* unallow only */ + "-r", gettext("remove permissions recursively"), + }; + size_t unallow_size = sizeof (opt_desc) / sizeof (char *); + size_t allow_size = unallow_size - 2; + const char *props[ZFS_NUM_PROPS]; + int i; + size_t count = 0; + FILE *fp = requested ? stdout : stderr; + zprop_desc_t *pdtbl = zfs_prop_get_table(); + const char *fmt = gettext("%-16s %-14s\t%s\n"); + + (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : + HELP_ALLOW)); + (void) fprintf(fp, gettext("Options:\n")); + for (i = 0; i < (un ? unallow_size : allow_size); i++) { + const char *opt = opt_desc[i++]; + const char *optdsc = opt_desc[i]; + (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); + } + + (void) fprintf(fp, gettext("\nThe following permissions are " + "supported:\n\n")); + (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), + gettext("NOTES")); + for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { + const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; + zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; + const char *perm_type = deleg_perm_type(perm_note); + const char *perm_comment = deleg_perm_comment(perm_note); + (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); + } + + for (i = 0; i < ZFS_NUM_PROPS; i++) { + zprop_desc_t *pd = &pdtbl[i]; + if (pd->pd_visible != B_TRUE) + continue; + + if (pd->pd_attr == PROP_READONLY) + continue; + + props[count++] = pd->pd_name; + } + props[count] = NULL; + + qsort(props, count, sizeof (char *), prop_cmp); + + for (i = 0; i < count; i++) + (void) fprintf(fp, fmt, props[i], gettext("property"), ""); + + if (msg != NULL) + (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); + + exit(requested ? 0 : 2); +} + +static inline const char * +munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, + char **permsp) +{ + if (un && argc == expected_argc - 1) + *permsp = NULL; + else if (argc == expected_argc) + *permsp = argv[argc - 2]; + else + allow_usage(un, B_FALSE, + gettext("wrong number of parameters\n")); + + return (argv[argc - 1]); +} + +static void +parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) +{ + int uge_sum = opts->user + opts->group + opts->everyone; + int csuge_sum = opts->create + opts->set + uge_sum; + int ldcsuge_sum = csuge_sum + opts->local + opts->descend; + int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; + + if (uge_sum > 1) + allow_usage(un, B_FALSE, + gettext("-u, -g, and -e are mutually exclusive\n")); + + if (opts->prt_usage) { + if (argc == 0 && all_sum == 0) + allow_usage(un, B_TRUE, NULL); + else + usage(B_FALSE); + } + + if (opts->set) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -s\n")); + + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + if (argv[0][0] != '@') + allow_usage(un, B_FALSE, + gettext("invalid set name: missing '@' prefix\n")); + opts->who = argv[0]; + } else if (opts->create) { + if (ldcsuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -c\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (opts->everyone) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -e\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") + == 0) { + opts->everyone = B_TRUE; + argc--; + argv++; + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (argc == 1) { + opts->prt_perms = B_TRUE; + opts->dataset = argv[argc-1]; + } else { + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + opts->who = argv[0]; + } + + if (!opts->local && !opts->descend) { + opts->local = B_TRUE; + opts->descend = B_TRUE; + } +} + +static void +store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, + const char *who, char *perms, nvlist_t *top_nvl) +{ + int i; + char ld[2] = { '\0', '\0' }; + char who_buf[ZFS_MAXNAMELEN+32]; + char base_type = ZFS_DELEG_WHO_UNKNOWN; + char set_type = ZFS_DELEG_WHO_UNKNOWN; + nvlist_t *base_nvl = NULL; + nvlist_t *set_nvl = NULL; + nvlist_t *nvl; + + if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + switch (type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + set_type = ZFS_DELEG_NAMED_SET_SETS; + base_type = ZFS_DELEG_NAMED_SET; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + set_type = ZFS_DELEG_CREATE_SETS; + base_type = ZFS_DELEG_CREATE; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + set_type = ZFS_DELEG_USER_SETS; + base_type = ZFS_DELEG_USER; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + set_type = ZFS_DELEG_GROUP_SETS; + base_type = ZFS_DELEG_GROUP; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + set_type = ZFS_DELEG_EVERYONE_SETS; + base_type = ZFS_DELEG_EVERYONE; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + default: + break; + } + + if (perms != NULL) { + char *curr = perms; + char *end = curr + strlen(perms); + + while (curr < end) { + char *delim = strchr(curr, ','); + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + if (curr[0] == '@') + nvl = set_nvl; + else + nvl = base_nvl; + + (void) nvlist_add_boolean(nvl, curr); + if (delim != end) + *delim = ','; + curr = delim + 1; + } + + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (!nvlist_empty(base_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + base_type, 
locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + base_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + base_nvl); + } + + + if (!nvlist_empty(set_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + set_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + set_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + set_nvl); + } + } + } else { + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", base_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", base_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", set_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", set_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + } + } +} + +static int +construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) +{ + if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + if (opts->set) { + store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, + opts->descend, opts->who, opts->perms, *nvlp); + } else if (opts->create) { + store_allow_perm(ZFS_DELEG_CREATE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else if (opts->everyone) { + store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else { + char *curr = opts->who; + char *end = curr + strlen(curr); + + while (curr < end) { + const char *who; + zfs_deleg_who_type_t who_type; + char *endch; + char *delim = strchr(curr, ','); + char errbuf[256]; + char id[64]; + struct passwd *p = NULL; + struct group *g = NULL; + + uid_t rid; + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + rid = (uid_t)strtol(curr, &endch, 0); + if (opts->user) { + who_type = ZFS_DELEG_USER; + if (*endch != '\0') + p = getpwnam(curr); + else + p = getpwuid(rid); + + if (p != NULL) + rid = p->pw_uid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid user %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else if (opts->group) { + who_type = ZFS_DELEG_GROUP; + if (*endch != '\0') + g = getgrnam(curr); + else + g = getgrgid(rid); + + if (g != NULL) + rid = g->gr_gid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid group %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else { + if (*endch != '\0') { + p = getpwnam(curr); + } else { + p = getpwuid(rid); + } + + if (p == NULL) { + if (*endch != '\0') { + g = getgrnam(curr); + } else { + g = getgrgid(rid); + } + } + + if (p != NULL) { + who_type = ZFS_DELEG_USER; + rid = p->pw_uid; + } else if (g != NULL) { + who_type = ZFS_DELEG_GROUP; + rid = g->gr_gid; + } else { + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } + + (void) sprintf(id, "%u", rid); + who = id; + + store_allow_perm(who_type, opts->local, + opts->descend, who, opts->perms, *nvlp); + curr = delim + 1; + } + } + + return (0); +} + +static void +print_set_creat_perms(uu_avl_t *who_avl) +{ + const char *sc_title[] = { + gettext("Permission sets:\n"), + gettext("Create time permissions:\n"), + NULL + }; + const char **title_ptr = sc_title; + who_perm_node_t *who_node = NULL; + int prev_weight = -1; + + for (who_node = uu_avl_first(who_avl); who_node != NULL; + who_node = 
uu_avl_next(who_avl, who_node)) { + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + const char *who_name = who_node->who_perm.who_name; + int weight = who_type2weight(who_type); + boolean_t first = B_TRUE; + deleg_perm_node_t *deleg_node; + + if (prev_weight != weight) { + (void) printf(*title_ptr++); + prev_weight = weight; + } + + if (who_name == NULL || strnlen(who_name, 1) == 0) + (void) printf("\t"); + else + (void) printf("\t%s ", who_name); + + for (deleg_node = uu_avl_first(avl); deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (first) { + (void) printf("%s", + deleg_node->dpn_perm.dp_name); + first = B_FALSE; + } else + (void) printf(",%s", + deleg_node->dpn_perm.dp_name); + } + + (void) printf("\n"); + } +} + +static void inline +print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, + const char *title) +{ + who_perm_node_t *who_node = NULL; + boolean_t prt_title = B_TRUE; + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((who_node = uu_avl_walk_next(walk)) != NULL) { + const char *who_name = who_node->who_perm.who_name; + const char *nice_who_name = who_node->who_perm.who_ug_name; + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + char delim = ' '; + deleg_perm_node_t *deleg_node; + boolean_t prt_who = B_TRUE; + + for (deleg_node = uu_avl_first(avl); + deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (local != deleg_node->dpn_perm.dp_local || + descend != deleg_node->dpn_perm.dp_descend) + continue; + + if (prt_who) { + const char *who = NULL; + if (prt_title) { + prt_title = B_FALSE; + (void) printf(title); + } + + switch (who_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + who = gettext("user"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + who = gettext("group"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + who = gettext("everyone"); + who_name = NULL; + default: + break; + } + + prt_who = B_FALSE; + if (who_name == NULL) + (void) printf("\t%s", who); + else + (void) printf("\t%s %s", who, who_name); + } + + (void) printf("%c%s", delim, + deleg_node->dpn_perm.dp_name); + delim = ','; + } + + if (!prt_who) + (void) printf("\n"); + } + + uu_avl_walk_end(walk); +} + +static void +print_fs_perms(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = NULL; + char buf[ZFS_MAXNAMELEN+32]; + const char *dsname = buf; + + for (node = uu_list_first(fspset->fsps_list); node != NULL; + node = uu_list_next(fspset->fsps_list, node)) { + uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; + uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; + int left = 0; + + (void) snprintf(buf, ZFS_MAXNAMELEN+32, + gettext("---- Permissions on %s "), + node->fspn_fsperm.fsp_name); + (void) printf(dsname); + left = 70 - strlen(buf); + while (left-- > 0) + (void) printf("-"); + (void) printf("\n"); + + print_set_creat_perms(sc_avl); + print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, + gettext("Local permissions:\n")); + print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, + gettext("Descendent permissions:\n")); + print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, + gettext("Local+Descendent permissions:\n")); + } +} + +static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; + +struct 
deleg_perms { + boolean_t un; + nvlist_t *nvl; +}; + +static int +set_deleg_perms(zfs_handle_t *zhp, void *data) +{ + struct deleg_perms *perms = (struct deleg_perms *)data; + zfs_type_t zfs_type = zfs_get_type(zhp); + + if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) + return (0); + + return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); +} + +static int +zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) +{ + zfs_handle_t *zhp; + nvlist_t *perm_nvl = NULL; + nvlist_t *update_perm_nvl = NULL; + int error = 1; + int c; + struct allow_opts opts = { 0 }; + + const char *optstr = un ? "ldugecsrh" : "ldugecsh"; + + /* check opts */ + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'l': + opts.local = B_TRUE; + break; + case 'd': + opts.descend = B_TRUE; + break; + case 'u': + opts.user = B_TRUE; + break; + case 'g': + opts.group = B_TRUE; + break; + case 'e': + opts.everyone = B_TRUE; + break; + case 's': + opts.set = B_TRUE; + break; + case 'c': + opts.create = B_TRUE; + break; + case 'r': + opts.recursive = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'h': + opts.prt_usage = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + parse_allow_args(argc, argv, un, &opts); + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM)) + == NULL) { + (void) fprintf(stderr, "Failed to open Dataset *%s*\n", + opts.dataset); + return (-1); + } + + if (zfs_get_fsacl(zhp, &perm_nvl) != 0) + goto cleanup2; + + fs_perm_set_init(&fs_perm_set); + if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { + (void) fprintf(stderr, "Failed to parse fsacl permissionsn"); + goto cleanup1; + } + + if (opts.prt_perms) + print_fs_perms(&fs_perm_set); + else { + (void) construct_fsacl_list(un, &opts, &update_perm_nvl); + if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) + goto cleanup0; + + if (un && opts.recursive) { + struct deleg_perms data = { un, update_perm_nvl }; + if (zfs_iter_filesystems(zhp, set_deleg_perms, + &data) != 0) + goto cleanup0; + } + } + + error = 0; + +cleanup0: + nvlist_free(perm_nvl); + if (update_perm_nvl != NULL) + nvlist_free(update_perm_nvl); +cleanup1: + fs_perm_set_fini(&fs_perm_set); +cleanup2: + zfs_close(zhp); + + return (error); +} + +/* + * zfs allow [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_allow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); +} + +/* + * zfs unallow [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. 
+ */ +static int +zfs_do_unallow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); +} + static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { @@ -2928,6 +4952,200 @@ zfs_do_release(int argc, char **argv) return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } +typedef struct holds_cbdata { + boolean_t cb_recursive; + const char *cb_snapname; + nvlist_t **cb_nvlp; + size_t cb_max_namelen; + size_t cb_max_taglen; +} holds_cbdata_t; + +#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" +#define DATETIME_BUF_LEN (32) +/* + * + */ +static void +print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) +{ + int i; + nvpair_t *nvp = NULL; + char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; + const char *col; + + if (!scripted) { + for (i = 0; i < 3; i++) { + col = gettext(hdr_cols[i]); + if (i < 2) + (void) printf("%-*s ", i ? tagwidth : nwidth, + col); + else + (void) printf("%s\n", col); + } + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *zname = nvpair_name(nvp); + nvlist_t *nvl2; + nvpair_t *nvp2 = NULL; + (void) nvpair_value_nvlist(nvp, &nvl2); + while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { + char tsbuf[DATETIME_BUF_LEN]; + char *tagname = nvpair_name(nvp2); + uint64_t val = 0; + time_t time; + struct tm t; + char sep = scripted ? '\t' : ' '; + int sepnum = scripted ? 1 : 2; + + (void) nvpair_value_uint64(nvp2, &val); + time = (time_t)val; + (void) localtime_r(&time, &t); + (void) strftime(tsbuf, DATETIME_BUF_LEN, + gettext(STRFTIME_FMT_STR), &t); + + (void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname, + sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf); + } + } +} + +/* + * Generic callback function to list a dataset or snapshot. + */ +static int +holds_callback(zfs_handle_t *zhp, void *data) +{ + holds_cbdata_t *cbp = data; + nvlist_t *top_nvl = *cbp->cb_nvlp; + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + const char *zname = zfs_get_name(zhp); + size_t znamelen = strnlen(zname, ZFS_MAXNAMELEN); + + if (cbp->cb_recursive) { + const char *snapname; + char *delim = strchr(zname, '@'); + if (delim == NULL) + return (0); + + snapname = delim + 1; + if (strcmp(cbp->cb_snapname, snapname)) + return (0); + } + + if (zfs_get_holds(zhp, &nvl) != 0) + return (-1); + + if (znamelen > cbp->cb_max_namelen) + cbp->cb_max_namelen = znamelen; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *tag = nvpair_name(nvp); + size_t taglen = strnlen(tag, MAXNAMELEN); + if (taglen > cbp->cb_max_taglen) + cbp->cb_max_taglen = taglen; + } + + return (nvlist_add_nvlist(top_nvl, zname, nvl)); +} + +/* + * zfs holds [-r] ... 
+ * + * -r Recursively hold + */ +static int +zfs_do_holds(int argc, char **argv) +{ + int errors = 0; + int c; + int i; + boolean_t scripted = B_FALSE; + boolean_t recursive = B_FALSE; + const char *opts = "rH"; + nvlist_t *nvl; + + int types = ZFS_TYPE_SNAPSHOT; + holds_cbdata_t cb = { 0 }; + + int limit = 0; + int ret; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (recursive) { + types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + flags |= ZFS_ITER_RECURSE; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) + usage(B_FALSE); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + for (i = 0; i < argc; ++i) { + char *snapshot = argv[i]; + const char *delim; + const char *snapname; + + delim = strchr(snapshot, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), snapshot); + ++errors; + continue; + } + snapname = delim + 1; + if (recursive) + snapshot[delim - snapshot] = '\0'; + + cb.cb_recursive = recursive; + cb.cb_snapname = snapname; + cb.cb_nvlp = &nvl; + + /* + * 1. collect holds data, set format options + */ + ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, + holds_callback, &cb); + if (ret != 0) + ++errors; + } + + /* + * 2. print holds data + */ + print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); + + if (nvlist_empty(nvl)) + (void) printf(gettext("no datasets available\n")); + + nvlist_free(nvl); + + return (0 != errors); +} + #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 5 /* seconds */ @@ -3809,15 +6027,6 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } -/* ARGSUSED */ -static int -zfs_do_python(int argc, char **argv) -{ - (void) execv(pypath, argv-1); - (void) printf("internal error: %s not found\n", pypath); - return (-1); -} - static int find_command_idx(char *command, int *idx) { diff --git a/include/libzfs.h b/include/libzfs.h index 23422b2c9..26b1ce302 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 
*/ #ifndef _LIBZFS_H @@ -572,13 +573,17 @@ extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, int, uint64_t, uint64_t); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); -extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - zfs_userspace_cb_t func, void *arg); +extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, + zfs_userspace_cb_t, void *); + +extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); +extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ diff --git a/include/zfs_deleg.h b/include/zfs_deleg.h index b4cb8e2b4..9997dffae 100644 --- a/include/zfs_deleg.h +++ b/include/zfs_deleg.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ZFS_DELEG_H @@ -51,6 +52,7 @@ typedef enum { ZFS_DELEG_NOTE_CLONE, ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, + ZFS_DELEG_NOTE_SEND, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 996bae2d7..74015139a 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ @@ -95,6 +96,7 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, namecheck_err_t why; char what; + (void) zfs_prop_get_table(); if (dataset_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { @@ -4313,6 +4315,193 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, return (0); } +int +zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[ZFS_MAXNAMELEN+32]; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + +tryagain: + + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; + } + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); + + if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), dgettext( + TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + return (err); +} + +int +zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char *nvbuf; + char errbuf[ZFS_MAXNAMELEN+32]; + size_t nvsz; + int err; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); + assert(err == 0); + + nvbuf = malloc(nvsz); + + err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); + assert(err == 0); + + zc.zc_nvlist_src_size = nvsz; + zc.zc_nvlist_src = (uintptr_t)nvbuf; + zc.zc_perm_action = un; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } + + free(nvbuf); + + return (err); +} + +int +zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[ZFS_MAXNAMELEN+32]; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + +tryagain: + + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; + 
} + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); + + if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + return (err); +} + uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c index 6754ab84b..9de61790c 100644 --- a/module/zcommon/zfs_deleg.c +++ b/module/zcommon/zfs_deleg.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. */ #if defined(_KERNEL) @@ -60,7 +61,7 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, - {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, + {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, From cddafdcbc55a38cdbdd3dc8c58f447b22bd847ee Mon Sep 17 00:00:00 2001 From: Martin Matuska Date: Mon, 1 Aug 2011 10:34:06 -0700 Subject: [PATCH 10/10] Illumos #1313: Integer overflow in txg_delay() The function txg_delay() is used to delay txg (transaction group) threads in ZFS. The timeout value for this function is calculated using: int timeout = ddi_get_lbolt() + ticks; Later, the actual wait is performed: while (ddi_get_lbolt() < timeout && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, timeout - ddi_get_lbolt()); The ddi_get_lbolt() function returns current uptime in clock ticks and is typed as clock_t. The clock_t type on 64-bit architectures is int64_t. The "timeout" variable will overflow depending on the tick frequency (e.g. for 1000 it will overflow in 28.855 days). This will make the expression "ddi_get_lbolt() < timeout" always false - txg threads will not be delayed anymore at all. This leads to a slowdown in ZFS writes. The attached patch initializes timeout as clock_t to match the return value of ddi_get_lbolt(). 
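For illustration only (this sketch is not part of the patch): a minimal standalone C program, assuming a 1000 Hz tick rate and a 64-bit clock_t, that reproduces the truncation described above. At 1000 ticks per second a signed 32-bit counter is exhausted after 2^31 / 1000 seconds, roughly 24.9 days of uptime; once the simulated lbolt value passes INT_MAX, the "int" sum wraps negative and the "now < timeout" test can never be true, so the txg threads are never delayed. The type name lbolt_t and the constants used here are invented for the example.

    /* Illustration only, not ZFS code: shows why "int timeout" breaks txg_delay(). */
    #include <stdio.h>
    #include <stdint.h>
    #include <limits.h>

    typedef int64_t lbolt_t;        /* stand-in for clock_t on a 64-bit system */

    int
    main(void)
    {
            lbolt_t lbolt = (lbolt_t)INT_MAX + 100;  /* uptime past the 32-bit limit */
            int ticks = 500;

            int bad_timeout = (int)(lbolt + ticks);  /* old code: truncates, wraps negative */
            lbolt_t good_timeout = lbolt + ticks;    /* fixed code: full 64-bit arithmetic */

            /* prints 0: the overflowed timeout never delays the txg thread */
            (void) printf("now < bad timeout:  %d\n", lbolt < bad_timeout);
            /* prints 1: the clock_t timeout keeps delaying as intended */
            (void) printf("now < good timeout: %d\n", lbolt < good_timeout);
            return (0);
    }
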
Signed-off-by: Brian Behlendorf Issue #352 --- module/zfs/txg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 340c42ae8..d0d2b1716 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -506,7 +506,7 @@ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; - int timeout = ddi_get_lbolt() + ticks; + clock_t timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg ||