Initial Linux ZFS GIT Repo

2026-05-25 11:47:43 +03:00 · 2008-11-20 12:01:55 -08:00
commit 34dc7c2f25
444 changed files with 187636 additions and 0 deletions
@@ -0,0 +1,28 @@
+# NOTE: dctl_client.c, dctl_common.c, dctl_server.c, dctl_thrpool.c are unused
+# by the kernel port.  Potentially they should just be removed if we don't care
+# about user space Lustre integration from this source base.
+
+# NOTE: For clarity this directory should simply be renamed libzpl and
+# the full kernel implementation should be minimally stubbed out.
+
+subdir-m += include
+DISTFILES  = dctl_client.c dctl_common.c dctl_server.c dctl_thrpool.c
+DISTFILES += dmu_send.c rrwlock.c zfs_acl.c zfs_ctldir.c
+DISTFILES += zfs_dir.c zfs_fuid.c zfs_ioctl.c zfs_log.c zfs_replay.c
+DISTFILES += zfs_rlock.c zfs_vfsops.c zfs_vnops.c zvol.c
+
+MODULE := zctl
+
+EXTRA_CFLAGS  = @KERNELCPPFLAGS@
+EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include
+EXTRA_CFLAGS += -I@LIBDIR@/libdmu-ctl/include
+EXTRA_CFLAGS += -I@LIBDIR@/libavl/include
+EXTRA_CFLAGS += -I@LIBDIR@/libport/include
+EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include
+
+obj-m := ${MODULE}.o
+
+${MODULE}-objs += zvol.o	# Volume emulation interface
+${MODULE}-objs += zfs_ioctl.o	# /dev/zfs_ioctl interface
+${MODULE}-objs += zfs_vfsops.o
+${MODULE}-objs += dmu_send.o
@@ -0,0 +1,263 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ftw.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/debug.h>
+
+#include <sys/dmu_ctl.h>
+#include <sys/dmu_ctl_impl.h>
+
+/*
+ * Try to connect to the socket given in path.
+ *
+ * For nftw() convenience, returns 0 if unsuccessful, otherwise
+ * returns the socket descriptor.
+ */
+static int try_connect(const char *path)
+{
+	struct sockaddr_un name;
+	int sock;
+
+	/*
+	 * Refuse paths that do not fit in sun_path (including the NUL).
+	 * Truncating the path would silently target a different socket.
+	 */
+	if (strlen(path) >= sizeof(name.sun_path))
+		return 0;
+
+	sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (sock == -1) {
+		perror("socket");
+		return 0;
+	}
+
+	/*
+	 * The socket fd cannot be 0 otherwise nftw() will not interpret the
+	 * return code correctly.
+	 */
+	VERIFY(sock != 0);
+
+	/* Start from a fully zeroed address; the length was checked above. */
+	memset(&name, 0, sizeof(name));
+	name.sun_family = AF_UNIX;
+	strcpy(name.sun_path, path);
+
+	if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) {
+		close(sock);
+		return 0;
+	}
+
+	return sock;
+}
+
+/*
+ * nftw() callback.
+ *
+ * Entries that are not sockets, or whose base name is not SOCKNAME, are
+ * skipped by returning 0 so that the walk continues. For a matching entry
+ * the result of try_connect() is returned.
+ */
+static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag,
+    struct FTW *ftwbuf)
+{
+	const char *leaf = fpath + ftwbuf->base;
+
+	if (S_ISSOCK(sb->st_mode) && strcmp(leaf, SOCKNAME) == 0)
+		return try_connect(fpath);
+
+	return 0;
+}
+
+/*
+ * Connect to the DMU control socket located under dir.
+ *
+ * When check_subdirs is true the tree rooted at dir is walked with nftw()
+ * (nftw_cb tries every socket named SOCKNAME); otherwise only
+ * dir/SOCKNAME is tried.
+ *
+ * Returns the value produced by the connection attempt, with 0 mapped to
+ * -1; a failed allocation also yields -1.
+ */
+int dctlc_connect(const char *dir, boolean_t check_subdirs)
+{
+	int fd;
+
+	if (!check_subdirs) {
+		/* Room for dir, the '/' separator, SOCKNAME and the NUL. */
+		char *fpath = malloc(strlen(dir) + strlen(SOCKNAME) + 2);
+
+		if (fpath == NULL)
+			return -1;
+
+		strcpy(fpath, dir);
+		strcat(fpath, "/" SOCKNAME);
+
+		fd = try_connect(fpath);
+
+		free(fpath);
+	} else {
+		fd = nftw(dir, nftw_cb, 10, FTW_PHYS);
+	}
+
+	if (fd == 0)
+		return -1;
+
+	return fd;
+}
+
+/*
+ * Shut down both directions of the connection on fd.
+ *
+ * NOTE(review): only shutdown() is called here; the descriptor itself is
+ * not close()d in this function -- confirm that callers release it.
+ */
+void dctlc_disconnect(int fd)
+{
+	(void) shutdown(fd, SHUT_RDWR);
+}
+
+/*
+ * Answer a DCTL_COPYIN request: send the bytes found at the address and
+ * size named in the request back over fd.
+ */
+static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd)
+{
+	void *src = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
+	size_t nbytes = cmd->u.dcmd_copy.size;
+
+	return dctl_send_data(fd, src, nbytes);
+}
+
+/*
+ * Answer a DCTL_COPYINSTR request.
+ *
+ * A DCTL_GEN_REPLY is sent first; it carries the number of string bytes
+ * that follow (without the terminating NUL) and a return code, which is
+ * ENAMETOOLONG when the string does not fit in the requested buffer size.
+ * The string bytes, if any, are sent right after the reply.
+ *
+ * Returns 0 on success or the error from the messaging functions.
+ */
+static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd)
+{
+	dctl_cmd_t reply;
+	char *from;
+	size_t buflen, to_copy;
+	int error;
+
+	reply.dcmd_msg = DCTL_GEN_REPLY;
+
+	from = (char *)(uintptr_t) cmd->u.dcmd_copy.ptr;
+	buflen = cmd->u.dcmd_copy.size;
+
+	if (buflen == 0) {
+		/*
+		 * A zero-sized buffer cannot even hold the NUL. Computing
+		 * buflen - 1 would wrap around and make strnlen() unbounded,
+		 * so answer without touching the source string.
+		 */
+		to_copy = 0;
+		reply.u.dcmd_reply.rc = ENAMETOOLONG;
+	} else {
+		to_copy = strnlen(from, buflen - 1);
+		reply.u.dcmd_reply.rc =
+		    from[to_copy] == '\0' ? 0 : ENAMETOOLONG;
+	}
+	reply.u.dcmd_reply.size = to_copy;
+
+	error = dctl_send_msg(fd, &reply);
+
+	if (!error && to_copy > 0)
+		error = dctl_send_data(fd, from, to_copy);
+
+	return error;
+}
+
+/*
+ * Answer a DCTL_COPYOUT request: read the announced number of bytes from
+ * fd into the address named in the request.
+ */
+static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd)
+{
+	void *dst = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
+	size_t nbytes = cmd->u.dcmd_copy.size;
+
+	return dctl_read_data(fd, dst, nbytes);
+}
+
+/*
+ * Answer a DCTL_FD_READ request: read up to the requested number of bytes
+ * from the local descriptor named in the request, send a DCTL_GEN_REPLY
+ * with the result (rc is errno on failure, size is the byte count) and
+ * then the data that was read, if any.
+ *
+ * Returns 0 on success, EINVAL for a negative request size, ENOMEM if the
+ * bounce buffer cannot be allocated, or the error from the messaging
+ * functions.
+ */
+static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd)
+{
+	dctl_cmd_t reply;
+	void *buf;
+	int error;
+	ssize_t rrc, size = cmd->u.dcmd_fd_io.size;
+
+	if (size < 0)
+		return EINVAL;
+
+	/* malloc(0) may return NULL; always ask for at least one byte. */
+	buf = malloc(size > 0 ? size : 1);
+	if (buf == NULL)
+		return ENOMEM;
+
+	rrc = read(cmd->u.dcmd_fd_io.fd, buf, size);
+
+	reply.dcmd_msg = DCTL_GEN_REPLY;
+	reply.u.dcmd_reply.rc = rrc == -1 ? errno : 0;
+	/* Never put a negative count into the unsigned size field. */
+	reply.u.dcmd_reply.size = rrc > 0 ? rrc : 0;
+
+	error = dctl_send_msg(fd, &reply);
+
+	if (!error && rrc > 0)
+		error = dctl_send_data(fd, buf, rrc);
+
+	free(buf);
+
+	return error;
+}
+
+/*
+ * Answer a DCTL_FD_WRITE request: receive the announced number of bytes
+ * from fd, write them to the local descriptor named in the request with a
+ * single write() call, and send a DCTL_GEN_REPLY with the result (rc is
+ * errno on failure, size is the number of bytes written).
+ *
+ * Returns 0 on success, EINVAL for a negative request size, ENOMEM if the
+ * bounce buffer cannot be allocated, or the error from the messaging
+ * functions.
+ */
+static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd)
+{
+	dctl_cmd_t reply;
+	void *buf;
+	int error;
+	ssize_t wrc, size = cmd->u.dcmd_fd_io.size;
+
+	if (size < 0)
+		return EINVAL;
+
+	/* malloc(0) may return NULL; always ask for at least one byte. */
+	buf = malloc(size > 0 ? size : 1);
+	if (buf == NULL)
+		return ENOMEM;
+
+	error = dctl_read_data(fd, buf, size);
+	if (error)
+		goto out;
+
+	wrc = write(cmd->u.dcmd_fd_io.fd, buf, size);
+
+	reply.dcmd_msg = DCTL_GEN_REPLY;
+	reply.u.dcmd_reply.rc = wrc == -1 ? errno : 0;
+	/* Never put a negative count into the unsigned size field. */
+	reply.u.dcmd_reply.size = wrc > 0 ? wrc : 0;
+
+	error = dctl_send_msg(fd, &reply);
+
+out:
+	free(buf);
+
+	return error;
+}
+
+/*
+ * Send an ioctl request over fd and serve the messages that come back
+ * (DCTL_COPYIN, DCTL_COPYINSTR, DCTL_COPYOUT, DCTL_FD_READ,
+ * DCTL_FD_WRITE) until a DCTL_IOCTL_REPLY arrives or an error occurs.
+ *
+ * errno is set to the resulting error code (0 on success).
+ * Returns 0 on success and -1 otherwise.
+ */
+int dctlc_ioctl(int fd, int32_t request, void *arg)
+{
+	dctl_cmd_t cmd;
+	int error;
+
+	ASSERT(fd != 0);
+
+	cmd.dcmd_msg = DCTL_IOCTL;
+
+	cmd.u.dcmd_ioctl.arg = (uintptr_t) arg;
+	cmd.u.dcmd_ioctl.cmd = request;
+
+	error = dctl_send_msg(fd, &cmd);
+
+	for (;;) {
+		/* Stop on a failed send or a failed helper from last pass. */
+		if (error != 0)
+			break;
+
+		error = dctl_read_msg(fd, &cmd);
+		if (error != 0)
+			break;
+
+		if (cmd.dcmd_msg == DCTL_IOCTL_REPLY) {
+			/* Final answer: its return code ends the exchange. */
+			error = cmd.u.dcmd_reply.rc;
+			break;
+		} else if (cmd.dcmd_msg == DCTL_COPYIN) {
+			error = dctl_reply_copyin(fd, &cmd);
+		} else if (cmd.dcmd_msg == DCTL_COPYINSTR) {
+			error = dctl_reply_copyinstr(fd, &cmd);
+		} else if (cmd.dcmd_msg == DCTL_COPYOUT) {
+			error = dctl_reply_copyout(fd, &cmd);
+		} else if (cmd.dcmd_msg == DCTL_FD_READ) {
+			error = dctl_reply_fd_read(fd, &cmd);
+		} else if (cmd.dcmd_msg == DCTL_FD_WRITE) {
+			error = dctl_reply_fd_write(fd, &cmd);
+		} else {
+			fprintf(stderr, "%s(): invalid message "
+			    "received.\n", __func__);
+			error = EINVAL;
+			break;
+		}
+	}
+
+	errno = error;
+	return error ? -1 : 0;
+}
@@ -0,0 +1,109 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <sys/dmu_ctl.h>
+#include <sys/dmu_ctl_impl.h>
+
+/*
+ * Read one message from fd into cmd.
+ *
+ * Returns 0 on success, EIO on a bad magic number, ENOTSUP on a protocol
+ * version mismatch, or the error returned by dctl_read_data().
+ */
+int dctl_read_msg(int fd, dctl_cmd_t *cmd)
+{
+	caddr_t rest;
+	int error;
+
+	/*
+	 * First, read only the magic number and the protocol version.
+	 *
+	 * This prevents blocking forever in case the size of dctl_cmd_t
+	 * shrinks in future protocol versions.
+	 */
+	error = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE);
+	if (error != 0)
+		return error;
+
+	if (cmd->dcmd_magic != DCTL_MAGIC) {
+		fprintf(stderr, "%s(): invalid magic number\n", __func__);
+		return EIO;
+	}
+
+	if (cmd->dcmd_version != DCTL_PROTOCOL_VER) {
+		fprintf(stderr, "%s(): invalid protocol version\n", __func__);
+		return ENOTSUP;
+	}
+
+	/* Get the rest of the command */
+	rest = (caddr_t) cmd + DCTL_CMD_HEADER_SIZE;
+
+	return dctl_read_data(fd, rest,
+	    sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE);
+}
+
+/*
+ * Send one message over fd.
+ *
+ * The magic number and protocol version fields of cmd are filled in here;
+ * the caller sets the message type and payload. Returns the result of
+ * dctl_send_data().
+ */
+int dctl_send_msg(int fd, dctl_cmd_t *cmd)
+{
+	cmd->dcmd_magic = DCTL_MAGIC;
+	cmd->dcmd_version = DCTL_PROTOCOL_VER;
+
+	return dctl_send_data(fd, cmd, sizeof(dctl_cmd_t));
+}
+
+/*
+ * Receive exactly size bytes from fd into ptr.
+ *
+ * recv() is repeated until the whole buffer is filled; EINTR is retried.
+ * Returns 0 on success, ECONNRESET if the peer closed the connection, or
+ * the errno value of the failing recv().
+ */
+int dctl_read_data(int fd, void *ptr, size_t size)
+{
+	caddr_t dst = ptr;
+	size_t done = 0;
+
+	while (done < size) {
+		ssize_t n = recv(fd, dst + done, size - done, 0);
+
+		/* File descriptor closed */
+		if (n == 0)
+			return ECONNRESET;
+
+		if (n == -1) {
+			if (errno == EINTR)
+				continue;
+			return errno;
+		}
+
+		done += n;
+	}
+
+	return 0;
+}
+
+/*
+ * Send exactly size bytes from ptr over fd.
+ *
+ * send() may transmit fewer bytes than requested, so it is repeated until
+ * everything has been sent; EINTR is retried. MSG_NOSIGNAL is passed on
+ * every call.
+ *
+ * Returns 0 on success and EIO on any failure.
+ */
+int dctl_send_data(int fd, const void *ptr, size_t size)
+{
+	const char *src = ptr;
+	size_t sent = 0;
+
+	while (sent < size) {
+		ssize_t rc = send(fd, src + sent, size - sent, MSG_NOSIGNAL);
+
+		if (rc == -1) {
+			if (errno == EINTR)
+				continue;
+			return EIO;
+		}
+
+		/* No progress on a non-empty request: give up, don't spin. */
+		if (rc == 0)
+			return EIO;
+
+		sent += rc;
+	}
+
+	return 0;
+}
+
@@ -0,0 +1,476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <limits.h>
+#include <errno.h>
+#include <poll.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/debug.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/list.h>
+#include <sys/cred.h>
+
+#include <sys/dmu_ctl.h>
+#include <sys/dmu_ctl_impl.h>
+
+static dctl_sock_info_t ctl_sock = {
+	.dsi_mtx = PTHREAD_MUTEX_INITIALIZER,
+	.dsi_fd = -1
+};
+
+static int dctl_create_socket_common();
+
+/*
+ * Routines from zfs_ioctl.c
+ */
+extern int zfs_ioctl_init();
+extern int zfs_ioctl_fini();
+extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+    int *rvalp);
+
+/*
+ * We can't simply put the client file descriptor in wthr_info_t because we
+ * have no way of accessing it from the DMU code without extensive
+ * modifications.
+ *
+ * Therefore each worker thread will have its own global thread-specific
+ * client_fd variable.
+ */
+static __thread int client_fd = -1;
+
+/*
+ * Ask the client connected to this worker thread for size bytes located at
+ * src in the client, and store them in dest.
+ *
+ * Returns 0 on success and EFAULT if the request cannot be sent or the
+ * data cannot be read.
+ */
+int dctls_copyin(const void *src, void *dest, size_t size)
+{
+	dctl_cmd_t cmd;
+	int failed;
+
+	VERIFY(client_fd >= 0);
+
+	cmd.dcmd_msg = DCTL_COPYIN;
+	cmd.u.dcmd_copy.size = size;
+	cmd.u.dcmd_copy.ptr = (uintptr_t) src;
+
+	/* The data is only read when the request went out successfully. */
+	failed = dctl_send_msg(client_fd, &cmd) != 0 ||
+	    dctl_read_data(client_fd, dest, size) != 0;
+
+	return failed ? EFAULT : 0;
+}
+
+/*
+ * Ask the client connected to this worker thread for the NUL-terminated
+ * string located at from in the client, and store at most max bytes
+ * (including the terminator) in to.
+ *
+ * If len is not NULL it receives the number of bytes stored, including
+ * the terminating NUL.
+ *
+ * Returns the return code supplied by the client (0, or ENAMETOOLONG when
+ * the string was cut short), ENAMETOOLONG when max is 0, or EFAULT on any
+ * protocol or transport failure.
+ */
+int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len)
+{
+	dctl_cmd_t msg;
+	size_t copied;
+
+	VERIFY(client_fd >= 0);
+
+	/* max is unsigned, so zero is the only invalid size to reject. */
+	if (max == 0)
+		return ENAMETOOLONG;
+
+	msg.dcmd_msg = DCTL_COPYINSTR;
+	msg.u.dcmd_copy.ptr = (uintptr_t) from;
+	msg.u.dcmd_copy.size = max;
+
+	if (dctl_send_msg(client_fd, &msg) != 0)
+		return EFAULT;
+
+	if (dctl_read_msg(client_fd, &msg) != 0)
+		return EFAULT;
+
+	if (msg.dcmd_msg != DCTL_GEN_REPLY)
+		return EFAULT;
+
+	copied = msg.u.dcmd_reply.size;
+
+	/* Leave room for the terminator; never trust the announced size. */
+	if (copied >= max)
+		return EFAULT;
+
+	if (copied > 0)
+		if (dctl_read_data(client_fd, to, copied) != 0)
+			return EFAULT;
+
+	to[copied] = '\0';
+
+	if (len != NULL)
+		*len = copied + 1;
+
+	return msg.u.dcmd_reply.rc;
+}
+
+/*
+ * Send size bytes from src to the client connected to this worker thread,
+ * to be stored at dest in the client.
+ *
+ * Returns 0 on success and EFAULT if either the request or the data
+ * cannot be sent.
+ */
+int dctls_copyout(const void *src, void *dest, size_t size)
+{
+	dctl_cmd_t cmd;
+	int failed;
+
+	VERIFY(client_fd >= 0);
+
+	cmd.dcmd_msg = DCTL_COPYOUT;
+	cmd.u.dcmd_copy.size = size;
+	cmd.u.dcmd_copy.ptr = (uintptr_t) dest;
+
+	/* The payload only follows a successfully sent request. */
+	failed = dctl_send_msg(client_fd, &cmd) != 0 ||
+	    dctl_send_data(client_fd, src, size) != 0;
+
+	return failed ? EFAULT : 0;
+}
+
+/*
+ * Ask the client connected to this worker thread to read up to len bytes
+ * from its descriptor fd, and store the data in buf.
+ *
+ * On return *residp holds the number of requested bytes that were not
+ * read (len minus the amount the client reported).
+ *
+ * Returns 0 on success, EINVAL for a negative len, EIO on a protocol
+ * violation (unexpected message type, or a reply announcing more data
+ * than was requested), the client's return code if its read failed, or
+ * the error from the messaging functions.
+ */
+int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp)
+{
+	dctl_cmd_t msg;
+	uint64_t dsize;
+	int error;
+
+	VERIFY(client_fd >= 0);
+
+	if (len < 0)
+		return EINVAL;
+
+	msg.dcmd_msg = DCTL_FD_READ;
+	msg.u.dcmd_fd_io.fd = fd;
+	msg.u.dcmd_fd_io.size = len;
+
+	if ((error = dctl_send_msg(client_fd, &msg)) != 0)
+		return error;
+
+	if ((error = dctl_read_msg(client_fd, &msg)) != 0)
+		return error;
+
+	if (msg.dcmd_msg != DCTL_GEN_REPLY)
+		return EIO;
+
+	if (msg.u.dcmd_reply.rc != 0)
+		return msg.u.dcmd_reply.rc;
+
+	dsize = msg.u.dcmd_reply.size;
+
+	/*
+	 * The size comes from the peer: reading more than len bytes would
+	 * overflow buf, so treat it as a protocol error.
+	 */
+	if (dsize > (uint64_t) len)
+		return EIO;
+
+	if (dsize > 0)
+		error = dctl_read_data(client_fd, buf, dsize);
+
+	*residp = len - dsize;
+
+	return error;
+}
+
+/*
+ * Ask the client connected to this worker thread to write len bytes from
+ * src to its descriptor fd.
+ *
+ * Returns 0 when the client reports that all len bytes were written, the
+ * client's return code if its write failed, EIO on an unexpected reply or
+ * a short write, or the error from the messaging functions.
+ */
+int dctls_fd_write(int fd, const void *src, ssize_t len)
+{
+	dctl_cmd_t msg;
+	int error;
+
+	VERIFY(client_fd >= 0);
+
+	msg.dcmd_msg = DCTL_FD_WRITE;
+	msg.u.dcmd_fd_io.size = len;
+	msg.u.dcmd_fd_io.fd = fd;
+
+	/* Request, payload, then wait for the client's verdict. */
+	if ((error = dctl_send_msg(client_fd, &msg)) != 0)
+		return error;
+
+	if ((error = dctl_send_data(client_fd, src, len)) != 0)
+		return error;
+
+	if ((error = dctl_read_msg(client_fd, &msg)) != 0)
+		return error;
+
+	if (msg.dcmd_msg != DCTL_GEN_REPLY)
+		return EIO;
+
+	if (msg.u.dcmd_reply.rc != 0)
+		return msg.u.dcmd_reply.rc;
+
+	/*
+	 * We have to do this because the original upstream code
+	 * does not check if residp == len.
+	 */
+	return msg.u.dcmd_reply.size != len ? EIO : 0;
+}
+
+/*
+ * Handle a new connection.
+ *
+ * The thread-local client_fd is pointed at sock_fd while DCTL_IOCTL
+ * messages are served one at a time through zfsdev_ioctl(). The loop ends
+ * when a message cannot be read, is not a DCTL_IOCTL, or the reply cannot
+ * be sent; the socket is then closed and client_fd reset to -1.
+ */
+static void dctl_handle_conn(int sock_fd)
+{
+	dev_t dev = { 0 };
+	dctl_cmd_t cmd;
+
+	client_fd = sock_fd;
+
+	for (;;) {
+		int rc;
+
+		if (dctl_read_msg(sock_fd, &cmd) != 0)
+			break;
+
+		if (cmd.dcmd_msg != DCTL_IOCTL) {
+			fprintf(stderr, "%s(): unexpected message type.\n",
+			    __func__);
+			break;
+		}
+
+		rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd,
+		    (intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL);
+
+		/* Reuse the request buffer for the reply. */
+		cmd.dcmd_msg = DCTL_IOCTL_REPLY;
+		cmd.u.dcmd_reply.rc = rc;
+
+		if (dctl_send_msg(sock_fd, &cmd) != 0)
+			break;
+	}
+
+	close(sock_fd);
+
+	client_fd = -1;
+}
+
+/*
+ * Main worker thread loop.
+ *
+ * arg is the wthr_info_t of this thread. The thread holds
+ * ctl_sock.dsi_mtx while it polls and accepts on the control socket and
+ * drops it while it serves an accepted connection. It leaves the loop
+ * once its wthr_exit flag is set, or when the control socket cannot be
+ * recreated after a poll() failure, and then hands itself over to
+ * dctl_thr_die(). Always returns NULL.
+ */
+static void *dctl_thread(void *arg)
+{
+	wthr_info_t *thr = arg;
+	struct pollfd fds[1];
+
+	fds[0].events = POLLIN;
+
+	pthread_mutex_lock(&ctl_sock.dsi_mtx);
+
+	while (!thr->wthr_exit) {
+		/* Clean-up dead threads */
+		dctl_thr_join();
+
+		/* The file descriptor might change in the thread lifetime */
+		fds[0].fd = ctl_sock.dsi_fd;
+
+		/* Poll socket with 1-second timeout */
+		int rc = poll(fds, 1, 1000);
+		if (rc == 0 || (rc == -1 && errno == EINTR))
+			continue;
+
+		/* Recheck the exit flag */
+		if (thr->wthr_exit)
+			break;
+
+		if (rc == -1) {
+			/* Unknown error, let's try to recreate the socket */
+			close(ctl_sock.dsi_fd);
+			ctl_sock.dsi_fd = -1;
+
+			if (dctl_create_socket_common() != 0)
+				break;
+
+			continue;
+		}
+		ASSERT(rc == 1);
+
+		short rev = fds[0].revents;
+		if (rev == 0)
+			continue;
+		ASSERT(rev == POLLIN);
+
+		/*
+		 * At this point there should be a connection ready to be
+		 * accepted.
+		 *
+		 * The descriptor is deliberately not called client_fd, so
+		 * that it does not shadow the thread-local variable of that
+		 * name which dctl_handle_conn() sets.
+		 */
+		int conn_fd = accept(ctl_sock.dsi_fd, NULL, NULL);
+		/* Many possible errors here, we'll just retry */
+		if (conn_fd == -1)
+			continue;
+
+		/*
+		 * Now lets handle the request. This can take a very
+		 * long time (hours even), so we'll let other threads
+		 * handle new connections.
+		 */
+		pthread_mutex_unlock(&ctl_sock.dsi_mtx);
+
+		dctl_thr_rebalance(thr, B_FALSE);
+		dctl_handle_conn(conn_fd);
+		dctl_thr_rebalance(thr, B_TRUE);
+
+		pthread_mutex_lock(&ctl_sock.dsi_mtx);
+	}
+	pthread_mutex_unlock(&ctl_sock.dsi_mtx);
+
+	dctl_thr_die(thr);
+
+	return NULL;
+}
+
+/*
+ * Create the control socket at ctl_sock.dsi_path, bind it and start
+ * listening on it. ctl_sock.dsi_fd must be -1 on entry and holds the new
+ * descriptor afterwards.
+ *
+ * Returns 0 on success or the errno of the failing call.
+ *
+ * NOTE(review): when bind() or listen() fails the descriptor is left open
+ * in ctl_sock.dsi_fd; dctl_create_socket() closes it, confirm the other
+ * caller (dctl_thread) is fine with that.
+ */
+static int dctl_create_socket_common()
+{
+	dctl_sock_info_t *s = &ctl_sock;
+	size_t size;
+	int error;
+
+	ASSERT(s->dsi_fd == -1);
+
+	/*
+	 * Unlink old socket, in case it exists.
+	 * We don't care about errors here.
+	 */
+	unlink(s->dsi_path);
+
+	/* Create the socket */
+	s->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (s->dsi_fd == -1) {
+		/* Save errno before perror() gets a chance to change it */
+		error = errno;
+		perror("socket");
+		return error;
+	}
+
+	s->dsi_addr.sun_family = AF_UNIX;
+
+	/* Copy at most sun_path - 1 bytes and terminate explicitly */
+	size = sizeof(s->dsi_addr.sun_path) - 1;
+	strncpy(s->dsi_addr.sun_path, s->dsi_path, size);
+
+	s->dsi_addr.sun_path[size] = '\0';
+
+	if (bind(s->dsi_fd, (struct sockaddr *) &s->dsi_addr,
+	    sizeof(s->dsi_addr)) != 0) {
+		error = errno;
+		perror("bind");
+		return error;
+	}
+
+	if (listen(s->dsi_fd, LISTEN_BACKLOG) != 0) {
+		error = errno;
+		perror("listen");
+		/* bind() created the socket file; remove it again */
+		unlink(s->dsi_path);
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the socket path cfg_dir/SOCKNAME, store it in ctl_sock.dsi_path
+ * and create the listening control socket there.
+ *
+ * Returns 0 on success, ENAMETOOLONG if the path does not fit in
+ * sun_path, ENOMEM if the path cannot be allocated, or the error from
+ * dctl_create_socket_common(). On failure ctl_sock is left in its initial
+ * state (dsi_path == NULL, dsi_fd == -1).
+ */
+static int dctl_create_socket(const char *cfg_dir)
+{
+	int error;
+	dctl_sock_info_t *s = &ctl_sock;
+	size_t pathsize;
+
+	ASSERT(s->dsi_path == NULL);
+	ASSERT(s->dsi_fd == -1);
+
+	/* Room for cfg_dir, the '/' separator, SOCKNAME and the NUL. */
+	pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2;
+	if (pathsize > sizeof(s->dsi_addr.sun_path))
+		return ENAMETOOLONG;
+
+	s->dsi_path = malloc(pathsize);
+	if (s->dsi_path == NULL)
+		return ENOMEM;
+
+	strcpy(s->dsi_path, cfg_dir);
+	strcat(s->dsi_path, "/" SOCKNAME);
+
+	/*
+	 * For convenience, create the directory in case it doesn't exist.
+	 * We don't care about errors here.
+	 */
+	mkdir(cfg_dir, 0770);
+
+	error = dctl_create_socket_common();
+
+	if (error) {
+		/* Do not leave a dangling pointer behind for a later retry. */
+		free(s->dsi_path);
+		s->dsi_path = NULL;
+
+		if (s->dsi_fd != -1) {
+			close(s->dsi_fd);
+			s->dsi_fd = -1;
+		}
+	}
+
+	return error;
+}
+
+/*
+ * Close the control socket, remove its file and release the stored path,
+ * returning ctl_sock to its initial state (dsi_path == NULL,
+ * dsi_fd == -1).
+ */
+static void dctl_destroy_socket()
+{
+	dctl_sock_info_t *s = &ctl_sock;
+
+	ASSERT(s->dsi_path != NULL);
+	ASSERT(s->dsi_fd != -1);
+
+	close(s->dsi_fd);
+	s->dsi_fd = -1;
+
+	unlink(s->dsi_path);
+	free(s->dsi_path);
+	/* dctl_create_socket() asserts on NULL; don't leave it dangling. */
+	s->dsi_path = NULL;
+}
+
+/*
+ * Initialize the DMU userspace control interface.
+ * This should be called after kernel_init().
+ *
+ * The ioctl layer is initialized first, then the control socket is
+ * created under cfg_dir and finally the worker thread pool is started.
+ * If a later step fails the earlier ones are undone and the error is
+ * returned; 0 is returned on success.
+ *
+ * Note that only very rarely we have more than a couple of simultaneous
+ * lzfs/lzpool connections. Since the thread pool grows automatically when all
+ * threads are busy, a good value for min_thr and max_free_thr is 2.
+ */
+int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr)
+{
+	int error;
+
+	ASSERT(min_thr > 0);
+	ASSERT(max_free_thr >= min_thr);
+
+	error = zfs_ioctl_init();
+	if (error != 0)
+		return error;
+
+	error = dctl_create_socket(cfg_dir);
+	if (error != 0)
+		goto fail_ioctl;
+
+	error = dctl_thr_pool_create(min_thr, max_free_thr, dctl_thread);
+	if (error == 0)
+		return 0;
+
+	/* The thread pool could not be started: undo both earlier steps. */
+	(void) zfs_ioctl_fini();
+	dctl_destroy_socket();
+	return error;
+
+fail_ioctl:
+	(void) zfs_ioctl_fini();
+	return error;
+}
+
+/*
+ * Terminate control interface.
+ * This should be called after closing all objsets, but before calling
+ * kernel_fini().
+ * May return EBUSY if the SPA is busy.
+ *
+ * Thread pool destruction can take a while due to poll()
+ * timeout or due to a thread being busy (e.g. a backup is being taken).
+ */
+int dctl_server_fini()
+{
+	/* Wait for the worker threads before taking the socket away */
+	dctl_thr_pool_stop();
+	dctl_destroy_socket();
+
+	/* The result of zfs_ioctl_fini() is what the caller gets back */
+	return zfs_ioctl_fini();
+}
@@ -0,0 +1,253 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <time.h>
+#include <pthread.h>
+#include <errno.h>
+#include <sys/list.h>
+#include <sys/debug.h>
+
+#include <sys/dmu_ctl.h>
+#include <sys/dmu_ctl_impl.h>
+
+static dctl_thr_info_t thr_pool = {
+	.dti_mtx = PTHREAD_MUTEX_INITIALIZER
+};
+
+/*
+ * Create n threads.
+ * Callers must acquire thr_pool.dti_mtx first.
+ *
+ * Every new thread starts out marked as free and is appended to the pool
+ * list. Creation stops at the first failure: ENOMEM is returned if the
+ * thread descriptor cannot be allocated, otherwise the pthread_create()
+ * error. Returns 0 when all n threads were created.
+ */
+static int dctl_thr_create(int n)
+{
+	dctl_thr_info_t *p = &thr_pool;
+
+	for (int left = n; left > 0; left--) {
+		wthr_info_t *thr = malloc(sizeof(wthr_info_t));
+		int error;
+
+		if (thr == NULL)
+			return ENOMEM;
+
+		thr->wthr_free = B_TRUE;
+		thr->wthr_exit = B_FALSE;
+
+		error = pthread_create(&thr->wthr_id, NULL, p->dti_thr_func,
+		    thr);
+		if (error != 0) {
+			free(thr);
+			return error;
+		}
+
+		p->dti_free++;
+
+		list_insert_tail(&p->dti_list, thr);
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the thread as dead.
+ * Must be called right before exiting the main thread function.
+ *
+ * The thread is flagged for exit, accounted as not free, and moved from
+ * the pool list to the join list, where dctl_thr_join() picks it up.
+ */
+void dctl_thr_die(wthr_info_t *thr)
+{
+	dctl_thr_info_t *p = &thr_pool;
+
+	thr->wthr_exit = B_TRUE;
+	/* With wthr_exit set, rebalance always marks the thread not free */
+	dctl_thr_rebalance(thr, B_FALSE);
+
+	pthread_mutex_lock(&p->dti_mtx);
+
+	list_remove(&p->dti_list, thr);
+	list_insert_tail(&p->dti_join_list, thr);
+
+	pthread_mutex_unlock(&p->dti_mtx);
+}
+
+/*
+ * Clean-up dead threads.
+ *
+ * Every entry on the join list is removed, joined and freed while
+ * thr_pool.dti_mtx is held.
+ */
+void dctl_thr_join()
+{
+	dctl_thr_info_t *p = &thr_pool;
+
+	pthread_mutex_lock(&p->dti_mtx);
+
+	for (;;) {
+		wthr_info_t *dead = list_head(&p->dti_join_list);
+
+		if (dead == NULL)
+			break;
+
+		list_remove(&p->dti_join_list, dead);
+
+		ASSERT(!pthread_equal(dead->wthr_id, pthread_self()));
+
+		/*
+		 * This should not block because all the threads
+		 * on this list should have died already.
+		 *
+		 * pthread_join() can only return an error if
+		 * we made a programming mistake.
+		 */
+		VERIFY(pthread_join(dead->wthr_id, NULL) == 0);
+
+		ASSERT(dead->wthr_exit);
+		ASSERT(!dead->wthr_free);
+
+		free(dead);
+	}
+
+	pthread_mutex_unlock(&p->dti_mtx);
+}
+
+/*
+ * Adjust the number of free threads in the pool and the thread status.
+ *
+ * The free counter only changes when the thread's free status actually
+ * changes; the status itself is always set to set_free.
+ *
+ * Callers must acquire thr_pool.dti_mtx first.
+ */
+static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free)
+{
+	dctl_thr_info_t *p = &thr_pool;
+	boolean_t was_free = thr->wthr_free;
+
+	ASSERT(p->dti_free >= 0);
+
+	if (set_free && !was_free)
+		p->dti_free++;
+	else if (!set_free && was_free)
+		p->dti_free--;
+
+	ASSERT(p->dti_free >= 0);
+
+	thr->wthr_free = set_free;
+}
+
+/*
+ * Rebalance threads. Also adjusts the free status of the thread.
+ * Will set the thread exit flag if the number of free threads is above
+ * the limit.
+ *
+ * Takes thr_pool.dti_mtx itself, so callers must not hold it.
+ */
+void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free)
+{
+	dctl_thr_info_t *p = &thr_pool;
+
+	pthread_mutex_lock(&p->dti_mtx);
+
+	/* Pool shutting down, or more free threads than allowed */
+	if (p->dti_exit || p->dti_free > p->dti_max_free)
+		thr->wthr_exit = B_TRUE;
+
+	/* A thread that is about to exit is never counted as free */
+	if (thr->wthr_exit)
+		set_free = B_FALSE;
+
+	dctl_thr_adjust_free(thr, set_free);
+
+	/*
+	 * No free thread is left: grow the pool by one.
+	 * NOTE(review): the return value of dctl_thr_create() is ignored
+	 * here, so a failed creation goes unnoticed.
+	 */
+	if (!p->dti_exit && p->dti_free == 0)
+		dctl_thr_create(1);
+
+	pthread_mutex_unlock(&p->dti_mtx);
+}
+
+/*
+ * Stop the thread pool.
+ *
+ * This can take a while since it actually waits for all threads to exit.
+ *
+ * The global exit flag is set, every thread on the pool list is flagged
+ * for exit and marked not free, and then the pool list is polled every
+ * 50ms until it is empty, joining dead threads on each pass. Both lists
+ * are destroyed at the end.
+ */
+void dctl_thr_pool_stop()
+{
+	dctl_thr_info_t *p = &thr_pool;
+	wthr_info_t *thr;
+	struct timespec ts;
+
+	pthread_mutex_lock(&p->dti_mtx);
+
+	ASSERT(!p->dti_exit);
+	p->dti_exit = B_TRUE;
+
+	/* Let's flag the threads first */
+	thr = list_head(&p->dti_list);
+	while (thr != NULL) {
+		thr->wthr_exit = B_TRUE;
+		dctl_thr_adjust_free(thr, B_FALSE);
+
+		thr = list_next(&p->dti_list, thr);
+	}
+
+	pthread_mutex_unlock(&p->dti_mtx);
+
+	/* Now let's wait for them to exit */
+	ts.tv_sec = 0;
+	ts.tv_nsec = 50000000; /* 50ms */
+	do {
+		nanosleep(&ts, NULL);
+
+		/* Sample the pool list under the lock */
+		pthread_mutex_lock(&p->dti_mtx);
+		thr = list_head(&p->dti_list);
+		pthread_mutex_unlock(&p->dti_mtx);
+
+		/*
+		 * Join after sampling: once the pool list was seen empty,
+		 * every thread has already moved itself to the join list.
+		 */
+		dctl_thr_join();
+	} while(thr != NULL);
+
+	ASSERT(p->dti_free == 0);
+
+	ASSERT(list_is_empty(&p->dti_list));
+	ASSERT(list_is_empty(&p->dti_join_list));
+
+	list_destroy(&p->dti_list);
+	list_destroy(&p->dti_join_list);
+}
+
+/*
+ * Create thread pool.
+ *
+ * min_thr threads running thr_func are started; max_free_thr is the
+ * number of free threads above which dctl_thr_rebalance() tells a thread
+ * to exit.
+ *
+ * If at least one thread creation fails, it will stop all previous
+ * threads and return a non-zero value.
+ */
+int dctl_thr_pool_create(int min_thr, int max_free_thr,
+    thr_func_t *thr_func)
+{
+	int error;
+	dctl_thr_info_t *p = &thr_pool;
+
+	ASSERT(p->dti_free == 0);
+
+	/* Initialize global variables */
+	p->dti_min = min_thr;
+	p->dti_max_free = max_free_thr;
+	p->dti_exit = B_FALSE;
+	p->dti_thr_func = thr_func;
+
+	/* Both lists link wthr_info_t entries through wthr_node */
+	list_create(&p->dti_list, sizeof(wthr_info_t), offsetof(wthr_info_t,
+	    wthr_node));
+	list_create(&p->dti_join_list, sizeof(wthr_info_t),
+	    offsetof(wthr_info_t, wthr_node));
+
+	/* dctl_thr_create() requires the pool mutex to be held */
+	pthread_mutex_lock(&p->dti_mtx);
+	error = dctl_thr_create(min_thr);
+	pthread_mutex_unlock(&p->dti_mtx);
+
+	/* Partial failure: tear down whatever was started */
+	if (error)
+		dctl_thr_pool_stop();
+
+	return error;
+}
@@ -0,0 +1 @@
+subdir-m += sys
@@ -0,0 +1 @@
+DISTFILES = dmu_ctl.h dmu_ctl_impl.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_CTL_H
+#define _SYS_DMU_CTL_H
+
+#include <sys/types.h>
+
+/* Default directory where the clients search for sockets to connect */
+#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu"
+
+/*
+ * These functions are called by the server process.
+ *
+ * kernel_init() must be called before dctl_server_init().
+ * kernel_fini() must not be called before dctl_server_fini().
+ *
+ * All objsets must be closed and object references be released before calling
+ * dctl_server_fini(), otherwise it will return EBUSY.
+ *
+ * Note: On Solaris, it is highly recommended to either catch or ignore the
+ * SIGPIPE signal, otherwise the server process will die if the client is
+ * killed.
+ */
+int dctl_server_init(const char *cfg_dir, int min_threads,
+    int max_free_threads);
+int dctl_server_fini();
+
+/*
+ * The following functions are called by the DMU from the server process context
+ * (in the worker threads).
+ */
+int dctls_copyin(const void *src, void *dest, size_t size);
+int dctls_copyinstr(const char *from, char *to, size_t max,
+    size_t *len);
+int dctls_copyout(const void *src, void *dest, size_t size);
+int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp);
+int dctls_fd_write(int fd, const void *src, ssize_t len);
+
+/*
+ * These functions are called by the client process (libzfs).
+ */
+int dctlc_connect(const char *dir, boolean_t check_subdirs);
+void dctlc_disconnect(int fd);
+
+int dctlc_ioctl(int fd, int32_t request, void *arg);
+
+#endif
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_CTL_IMPL_H
+#define _SYS_DMU_CTL_IMPL_H
+
+#include <sys/list.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <pthread.h>
+
+#define SOCKNAME "dmu_socket"
+
+#define DCTL_PROTOCOL_VER 1
+#define DCTL_MAGIC 0xdc71b1070c01dc71ll
+
+/* Message types */
+enum {
+	DCTL_IOCTL,
+	DCTL_IOCTL_REPLY,
+	DCTL_COPYIN,
+	DCTL_COPYINSTR,
+	DCTL_COPYOUT,
+	DCTL_FD_READ,
+	DCTL_FD_WRITE,
+	DCTL_GEN_REPLY /* generic reply */
+};
+
+/*
+ * On-the-wire message.
+ *
+ * dctl_send_msg() fills in dcmd_magic and dcmd_version; dctl_read_msg()
+ * checks them. dcmd_msg is one of the DCTL_* message types above and
+ * selects which member of the union is meaningful.
+ */
+typedef struct dctl_cmd {
+	uint64_t dcmd_magic;    /* DCTL_MAGIC */
+	int8_t   dcmd_version;  /* DCTL_PROTOCOL_VER */
+	int8_t   dcmd_msg;      /* message type */
+	uint8_t  dcmd_pad[6];
+	union {
+		/* DCTL_IOCTL */
+		struct dcmd_ioctl {
+			uint64_t arg;
+			int32_t cmd;
+			uint8_t pad[4];
+		} dcmd_ioctl;
+
+		/* DCTL_COPYIN, DCTL_COPYINSTR and DCTL_COPYOUT */
+		struct dcmd_copy_req {
+			uint64_t ptr;
+			uint64_t size;
+		} dcmd_copy;
+
+		/* DCTL_FD_READ and DCTL_FD_WRITE */
+		struct dcmd_fd_req {
+			int64_t size;
+			int32_t fd;
+			uint8_t pad[4];
+		} dcmd_fd_io;
+
+		/* DCTL_GEN_REPLY and DCTL_IOCTL_REPLY */
+		struct dcmd_reply {
+			uint64_t size;  /* used by reply to DCTL_COPYINSTR,
+			                   DCTL_FD_READ and DCTL_FD_WRITE */
+			int32_t rc;     /* return code */
+			uint8_t pad[4];
+		} dcmd_reply;
+	} u;
+} dctl_cmd_t;
+
+/*
+ * Number of leading bytes of dctl_cmd_t (magic number plus protocol
+ * version) that dctl_read_msg() reads and validates before reading the
+ * rest of the message.
+ */
+#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t))
+
+/*
+ * The following definitions are only used by the server code.
+ */
+
+#define LISTEN_BACKLOG 5
+
+/* Worker thread data */
+typedef struct wthr_info {
+	list_node_t wthr_node;
+	pthread_t   wthr_id;
+	boolean_t   wthr_exit; /* termination flag */
+	boolean_t   wthr_free;
+} wthr_info_t;
+
+/* Control socket data */
+typedef struct dctl_sock_info {
+	pthread_mutex_t    dsi_mtx;
+	char               *dsi_path;
+	struct sockaddr_un dsi_addr;
+	int                dsi_fd;
+} dctl_sock_info_t;
+
+typedef void *thr_func_t(void *);
+
+/* Thread pool data */
+typedef struct dctl_thr_info {
+	thr_func_t *dti_thr_func; /* start routine of every worker thread */
+
+	pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */
+	list_t dti_list;         /* list of threads in the thread pool */
+	list_t dti_join_list;    /* list of threads that are waiting to be
+	                            joined */
+	int    dti_free;         /* number of free worker threads */
+
+	int dti_min;      /* number of threads created with the pool */
+	int dti_max_free; /* a thread is told to exit when dti_free is
+	                     above this */
+
+	boolean_t dti_exit; /* global termination flag */
+} dctl_thr_info_t;
+
+/* Messaging functions */
+int dctl_read_msg(int fd, dctl_cmd_t *cmd);
+int dctl_send_msg(int fd, dctl_cmd_t *cmd);
+
+int dctl_read_data(int fd, void *ptr, size_t size);
+int dctl_send_data(int fd, const void *ptr, size_t size);
+
+/* Thread pool functions */
+int dctl_thr_pool_create(int min_thr, int max_free_thr,
+    thr_func_t *thr_func);
+void dctl_thr_pool_stop();
+
+void dctl_thr_join();
+void dctl_thr_die(wthr_info_t *thr);
+void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free);
+
+#endif
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)rrwlock.c	1.1	07/10/24 SMI"
+
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t.  This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting.  If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed.  Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock).  Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out.  At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let them proceed.  If they are not, then the reader blocks for the
+ * waiting writers.  Hence, we do not starve writers.
+ */
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+/*
+ * One element of a thread's singly linked tsd list.  Each node names one
+ * rrwlock_t; the list head is stored under rrw_tsd_key.
+ */
+typedef struct rrw_node {
+	struct rrw_node	*rn_next;	/* next node on this thread's list */
+	rrwlock_t	*rn_rrl;	/* lock this node refers to */
+} rrw_node_t;
+
+/*
+ * Return the first node on the calling thread's tsd list that refers to
+ * 'rrl', or NULL if the thread has no such node.
+ */
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+	rrw_node_t *cur;
+
+	/* With no linked readers there is nothing to search for. */
+	if (refcount_count(&rrl->rr_linked_rcount) == 0)
+		return (NULL);
+
+	cur = tsd_get(rrw_tsd_key);
+	while (cur != NULL && cur->rn_rrl != rrl)
+		cur = cur->rn_next;
+
+	return (cur);
+}
+
+/*
+ * Allocate a node for 'rrl' and push it onto the front of the calling
+ * thread's singly linked tsd list.
+ */
+static void
+rrn_add(rrwlock_t *rrl)
+{
+	rrw_node_t *node = kmem_alloc(sizeof (rrw_node_t), KM_SLEEP);
+
+	/* The current list head becomes the successor of the new node. */
+	node->rn_next = tsd_get(rrw_tsd_key);
+	node->rn_rrl = rrl;
+
+	VERIFY(tsd_set(rrw_tsd_key, node) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list, free it, and return B_TRUE; otherwise return B_FALSE.
+ *
+ * Only the first matching node is removed, so a thread that holds
+ * several nodes for the same lock drops them one call at a time.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl)
+{
+	rrw_node_t *rn;
+	rrw_node_t *prev = NULL;
+
+	/*
+	 * No linked readers, so no node can exist.  This function returns
+	 * boolean_t, so report B_FALSE rather than a pointer constant.
+	 */
+	if (refcount_count(&rrl->rr_linked_rcount) == 0)
+		return (B_FALSE);
+
+	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+		if (rn->rn_rrl == rrl) {
+			/* Unlink: mid-list via prev, otherwise move the head. */
+			if (prev)
+				prev->rn_next = rn->rn_next;
+			else
+				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+			kmem_free(rn, sizeof (*rn));
+			return (B_TRUE);
+		}
+		prev = rn;
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Initialize 'rrl': no writer, no writer waiting, both reader counts
+ * created, and the mutex and condition variable set up.
+ */
+void
+rrw_init(rrwlock_t *rrl)
+{
+	/* Plain state first. */
+	rrl->rr_writer_wanted = B_FALSE;
+	rrl->rr_writer = NULL;
+
+	/* Reader accounting. */
+	refcount_create(&rrl->rr_anon_rcount);
+	refcount_create(&rrl->rr_linked_rcount);
+
+	/* Synchronization primitives. */
+	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Tear down everything rrw_init() set up.  The lock is asserted to have
+ * no writer at this point.
+ */
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+	mutex_destroy(&rrl->rr_lock);
+	cv_destroy(&rrl->rr_cv);
+	ASSERT(rrl->rr_writer == NULL);
+	refcount_destroy(&rrl->rr_anon_rcount);
+	refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+/*
+ * Acquire 'rrl' for reading, recording the hold under 'tag'.
+ *
+ * The caller waits while a writer holds the lock.  It also waits when a
+ * writer is waiting, provided there are no anonymous readers left and the
+ * calling thread has no node for this lock (i.e. it is known not to be a
+ * re-entrant reader).
+ */
+static void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(rrl->rr_writer != curthread);
+	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+	    refcount_is_zero(&rrl->rr_anon_rcount) &&
+	    rrn_find(rrl) == NULL))
+		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+	/*
+	 * Once a writer is waiting, reads are tracked per thread (tsd node
+	 * plus linked count); otherwise only the anonymous count is bumped.
+	 */
+	if (rrl->rr_writer_wanted) {
+		/* may or may not be a re-entrant enter */
+		rrn_add(rrl);
+		(void) refcount_add(&rrl->rr_linked_rcount, tag);
+	} else {
+		(void) refcount_add(&rrl->rr_anon_rcount, tag);
+	}
+	ASSERT(rrl->rr_writer == NULL);
+	mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * Acquire 'rrl' for writing.  Waits until there are no anonymous readers,
+ * no linked readers and no other writer, then records the calling thread
+ * as the writer.
+ */
+static void
+rrw_enter_write(rrwlock_t *rrl)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(rrl->rr_writer != curthread);
+
+	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
+	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
+	    rrl->rr_writer != NULL) {
+		/* Re-set on every pass: a writer that got in clears it. */
+		rrl->rr_writer_wanted = B_TRUE;
+		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+	}
+	rrl->rr_writer_wanted = B_FALSE;
+	rrl->rr_writer = curthread;
+	mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * Acquire 'rrl' in the requested mode.  RW_READER takes a read hold
+ * recorded under 'tag'; any other mode takes the write lock, for which
+ * 'tag' is not used.
+ */
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+	switch (rw) {
+	case RW_READER:
+		rrw_enter_read(rrl, tag);
+		break;
+	default:
+		rrw_enter_write(rrl);
+		break;
+	}
+}
+
+/*
+ * Release one hold on 'rrl'.
+ *
+ * With no writer recorded, the caller is treated as a reader: if the
+ * thread has a tsd node for this lock the linked count is dropped,
+ * otherwise the anonymous count is.  With a writer recorded, the caller
+ * must be that writer and the write lock is released.  Waiters are woken
+ * whenever a count reaches zero or the writer leaves.
+ */
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
+	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
+	    rrl->rr_writer != NULL);
+
+	if (rrl->rr_writer == NULL) {
+		if (rrn_find_and_remove(rrl)) {
+			if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
+				cv_broadcast(&rrl->rr_cv);
+
+		} else {
+			if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
+				cv_broadcast(&rrl->rr_cv);
+		}
+	} else {
+		ASSERT(rrl->rr_writer == curthread);
+		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
+		    refcount_is_zero(&rrl->rr_linked_rcount));
+		rrl->rr_writer = NULL;
+		cv_broadcast(&rrl->rr_cv);
+	}
+	mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * Report whether 'rrl' is held in mode 'rw'.
+ *
+ * For RW_WRITER this is true only when the calling thread is the writer.
+ * For any other mode it is true when any read hold (anonymous or linked)
+ * exists, whichever thread owns it.
+ */
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+	boolean_t held = B_FALSE;
+
+	mutex_enter(&rrl->rr_lock);
+	if (rw != RW_WRITER) {
+		if (!refcount_is_zero(&rrl->rr_anon_rcount) ||
+		    !refcount_is_zero(&rrl->rr_linked_rcount))
+			held = B_TRUE;
+	} else if (rrl->rr_writer == curthread) {
+		held = B_TRUE;
+	}
+	mutex_exit(&rrl->rr_lock);
+
+	return (held);
+}
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_dir.c	1.25	08/04/27 SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include "fs/fs_subr.h"
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnlc.h>
+#include <sys/extdirent.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ *
+ * File systems with normalization enabled (z_norm != 0) go through
+ * zap_lookup_norm(); all others use plain zap_lookup().  The value left
+ * in *zoid is passed through ZFS_DIRENT_OBJ() in either case.  When
+ * 'update' is set and the lookup fails with ENOENT, the miss is recorded
+ * in the DNLC.  Returns the error from the zap lookup.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
+    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+{
+	int error;
+
+	if (!zfsvfs->z_norm) {
+		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+	} else {
+		boolean_t conflict = B_FALSE;
+		matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
+		char *buf = rpnp ? rpnp->pn_buf : NULL;
+		size_t bufsz = rpnp ? rpnp->pn_bufsize : 0;
+
+		/*
+		 * In the non-mixed case we only expect there would ever
+		 * be one match, but we need to use the normalizing lookup.
+		 */
+		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+		    zoid, mt, buf, bufsz, &conflict);
+		if (error == 0 && deflags != NULL)
+			*deflags = conflict ? ED_CASE_CONFLICT : 0;
+	}
+	*zoid = ZFS_DIRENT_OBJ(*zoid);
+
+	if (update && error == ENOENT)
+		dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
+
+	return (error);
+}
+
+/*
+ * Lock a directory entry.  A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object.  As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ *	dzp	- znode for directory
+ *	name	- name of entry to lock
+ *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
+ *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
+ *		  ZSHARED: allow concurrent access with other ZSHARED callers.
+ *		  ZXATTR: we want dzp's xattr directory
+ *		  ZCILOOK: On a mixed sensitivity file system,
+ *			   this lookup should be case-insensitive.
+ *		  ZCIEXACT: On a purely case-insensitive file system,
+ *			    this lookup should be case-sensitive.
+ *		  ZRENAMING: we are locking for renaming, force narrow locks
+ *
+ * Output arguments:
+ *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
+ *	dlpp	- pointer to the dirlock for this entry (NULL on error)
+ *      direntflags - (case-insensitive lookup only)
+ *		flags if multiple case-sensitive matches exist in directory
+ *      realpnp     - (case-insensitive lookup only)
+ *		actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ *	 but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+    int flag, int *direntflags, pathname_t *realpnp)
+{
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zfs_dirlock_t	*dl;
+	boolean_t	update;
+	boolean_t	exact;
+	uint64_t	zoid;
+	vnode_t		*vp = NULL;
+	int		error = 0;
+	int		cmpflags;
+
+	/* Outputs are NULL on every error return. */
+	*zpp = NULL;
+	*dlpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if (name[0] == '.' &&
+	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+		return (EEXIST);
+
+	/*
+	 * Case sensitivity and normalization preferences are set when
+	 * the file system is created.  These are stored in the
+	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
+	 * affect what vnodes can be cached in the DNLC, how we
+	 * perform zap lookups, and the "width" of our dirlocks.
+	 *
+	 * A normal dirlock locks a single name.  Note that with
+	 * normalization a name can be composed multiple ways, but
+	 * when normalized, these names all compare equal.  A wide
+	 * dirlock locks multiple names.  We need these when the file
+	 * system is supporting mixed-mode access.  It is sometimes
+	 * necessary to lock all case permutations of file name at
+	 * once so that simultaneous case-insensitive/case-sensitive
+	 * behaves as rationally as possible.
+	 */
+
+	/*
+	 * Decide if exact matches should be requested when performing
+	 * a zap lookup on file systems supporting case-insensitive
+	 * access.
+	 */
+	exact =
+	    ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
+
+	/*
+	 * Only look in or update the DNLC if we are looking for the
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name.
+	 *
+	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
+	 * case for performance improvement?
+	 */
+	update = !zfsvfs->z_norm ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+	/*
+	 * ZRENAMING indicates we are in a situation where we should
+	 * take narrow locks regardless of the file system's
+	 * preferences for normalizing and case folding.  This will
+	 * prevent us deadlocking trying to grab the same wide lock
+	 * twice if the two names happen to be case-insensitive
+	 * matches.
+	 */
+	if (flag & ZRENAMING)
+		cmpflags = 0;
+	else
+		cmpflags = zfsvfs->z_norm;
+
+	/*
+	 * Wait until there are no locks on this name.
+	 *
+	 * z_name_lock is taken as reader here and is dropped either on the
+	 * early error returns below or later by zfs_dirent_unlock().
+	 */
+	rw_enter(&dzp->z_name_lock, RW_READER);
+	mutex_enter(&dzp->z_lock);
+	for (;;) {
+		if (dzp->z_unlinked) {
+			mutex_exit(&dzp->z_lock);
+			rw_exit(&dzp->z_name_lock);
+			return (ENOENT);
+		}
+		/* Look for an existing dirlock whose name compares equal. */
+		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
+				break;
+		}
+		/* A comparison failure is reported to the caller as ENOENT. */
+		if (error != 0) {
+			mutex_exit(&dzp->z_lock);
+			rw_exit(&dzp->z_name_lock);
+			return (ENOENT);
+		}
+		if (dl == NULL)	{
+			/*
+			 * Allocate a new dirlock and add it to the list.
+			 */
+			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+			dl->dl_name = name;
+			dl->dl_sharecnt = 0;
+			dl->dl_namesize = 0;
+			dl->dl_dzp = dzp;
+			dl->dl_next = dzp->z_dirlocks;
+			dzp->z_dirlocks = dl;
+			break;
+		}
+		/* An existing shared lock can be joined by a ZSHARED caller. */
+		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+			break;
+		cv_wait(&dl->dl_cv, &dzp->z_lock);
+	}
+
+	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+		/*
+		 * We're the second shared reference to dl.  Make a copy of
+		 * dl_name in case the first thread goes away before we do.
+		 * Note that we initialize the new name before storing its
+		 * pointer into dl_name, because the first thread may load
+		 * dl->dl_name at any time.  He'll either see the old value,
+		 * which is his, or the new shared copy; either is OK.
+		 */
+		dl->dl_namesize = strlen(dl->dl_name) + 1;
+		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+		bcopy(dl->dl_name, name, dl->dl_namesize);
+		dl->dl_name = name;
+	}
+
+	mutex_exit(&dzp->z_lock);
+
+	/*
+	 * We have a dirlock on the name.  (Note that it is the dirlock,
+	 * not the dzp's z_lock, that protects the name in the zap object.)
+	 * See if there's an object by this name; if so, put a hold on it.
+	 */
+	if (flag & ZXATTR) {
+		/* The xattr directory is found via zp_xattr, not by name. */
+		zoid = dzp->z_phys->zp_xattr;
+		error = (zoid == 0 ? ENOENT : 0);
+	} else {
+		if (update)
+			vp = dnlc_lookup(ZTOV(dzp), name);
+		if (vp == DNLC_NO_VNODE) {
+			/* The DNLC has a cached miss for this name. */
+			VN_RELE(vp);
+			error = ENOENT;
+		} else if (vp) {
+			/* DNLC hit: the entry exists, no zap lookup needed. */
+			if (flag & ZNEW) {
+				zfs_dirent_unlock(dl);
+				VN_RELE(vp);
+				return (EEXIST);
+			}
+			*dlpp = dl;
+			*zpp = VTOZ(vp);
+			return (0);
+		} else {
+			error = zfs_match_find(zfsvfs, dzp, name, exact,
+			    update, direntflags, realpnp, &zoid);
+		}
+	}
+	if (error) {
+		/*
+		 * ENOENT is only a failure when the caller required the
+		 * entry to exist; any other error always fails.
+		 */
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			zfs_dirent_unlock(dl);
+			return (EEXIST);
+		}
+		error = zfs_zget(zfsvfs, zoid, zpp);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+		if (!(flag & ZXATTR) && update)
+			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+	}
+
+	*dlpp = dl;
+
+	return (0);
+}
+
+/*
+ * Release a directory entry lock obtained from zfs_dirent_lock().
+ *
+ * The reader hold on the directory's z_name_lock is always dropped.  If
+ * other sharers remain, only the share count is decremented.  Otherwise
+ * the dirlock is unlinked from the directory's list, waiters are woken,
+ * and the dirlock (plus any private copy of the name) is freed.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfs_dirlock_t **linkp;
+
+	mutex_enter(&dzp->z_lock);
+	rw_exit(&dzp->z_name_lock);
+
+	if (dl->dl_sharecnt > 1) {
+		/* Not the last sharer: just drop our reference. */
+		dl->dl_sharecnt--;
+		mutex_exit(&dzp->z_lock);
+		return;
+	}
+
+	/* Find the link that points at dl and splice dl out of the list. */
+	for (linkp = &dzp->z_dirlocks; *linkp != dl;
+	    linkp = &(*linkp)->dl_next)
+		continue;
+	*linkp = dl->dl_next;
+
+	cv_broadcast(&dl->dl_cv);
+	mutex_exit(&dzp->z_lock);
+
+	/* A non-zero dl_namesize means dl_name is a copy owned by dl. */
+	if (dl->dl_namesize != 0)
+		kmem_free(dl->dl_name, dl->dl_namesize);
+	cv_destroy(&dl->dl_cv);
+	kmem_free(dl, sizeof (zfs_dirlock_t));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ *	no directory entries are actually stored for them.  If this is
+ *	the root of a filesystem, then '.zfs' is also treated as a
+ *	special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
+    int *deflg, pathname_t *rpnp)
+{
+	zfs_dirlock_t *dl;
+	znode_t *zp;
+	int error = 0;
+
+	if (name[0] == '\0' || (name[0] == '.' && name[1] == '\0')) {
+		/* "" and "." both resolve to the directory itself. */
+		*vpp = ZTOV(dzp);
+		VN_HOLD(*vpp);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == '\0') {
+		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (dzp->z_phys->zp_parent == dzp->z_id &&
+		    zfsvfs->z_parent != zfsvfs) {
+			return (zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+			    "snapshot", vpp, NULL, 0, NULL, kcred,
+			    NULL, NULL, NULL));
+		}
+
+		/* Ordinary "..": fetch the parent under z_parent_lock. */
+		rw_enter(&dzp->z_parent_lock, RW_READER);
+		error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+		rw_exit(&dzp->z_parent_lock);
+	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+		*vpp = zfsctl_root(dzp);
+	} else {
+		int zf = ZEXISTS | ZSHARED;
+
+		if (flags & FIGNORECASE)
+			zf |= ZCILOOK;
+
+		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+		if (error == 0) {
+			*vpp = ZTOV(zp);
+			zfs_dirent_unlock(dl);
+			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+		}
+
+		/*
+		 * zfs_dirent_lock() was handed rpnp, so the name is not
+		 * copied into it again here.
+		 */
+		return (error);
+	}
+
+	/* For the special names, report the name as given by the caller. */
+	if (error == 0 && rpnp != NULL && (flags & FIGNORECASE))
+		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+	return (error);
+}
+
+/*
+ * Format 'x' as lowercase hexadecimal without leading zeros.
+ *
+ * The digits are written right-aligned into the caller's 17-byte buffer
+ * (16 digits plus the terminating NUL).  The return value points at the
+ * first digit inside namebuf, not necessarily at namebuf itself.
+ */
+static char *
+zfs_unlinked_hexname(char namebuf[17], uint64_t x)
+{
+	const char hexdigits[16] = "0123456789abcdef";
+	char *cp = namebuf + 16;
+
+	*cp = '\0';
+	for (;;) {
+		/* Emit the low nibble, then shift it away. */
+		cp--;
+		*cp = hexdigits[x % 16];
+		x /= 16;
+		if (x == 0)
+			break;
+	}
+
+	return (cp);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating.  We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem).  So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error.  On a
+ * nondebug system, this will result in the space being leaked.
+ */
+/*
+ * Add 'zp' to the unlinked set as part of transaction 'tx'.  The entry is
+ * keyed by the hexadecimal form of the object id and stores the id itself.
+ * The znode must already be marked unlinked with a zero link count.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	char namebuf[17];
+	char *hexname;
+	int error;
+
+	ASSERT(zp->z_unlinked);
+	ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+	hexname = zfs_unlinked_hexname(namebuf, zp->z_id);
+	error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, hexname,
+	    8, 1, &zp->z_id, tx);
+	ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ *
+ * Each entry in the unlinked set is brought back into core, marked
+ * z_unlinked and released again.  Entries that cannot be examined or
+ * fetched are skipped.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+	while (zap_cursor_retrieve(&zc, &zap) == 0) {
+		uint64_t obj = zap.za_first_integer;
+
+		/*
+		 * See what kind of object we have in list; skip it if
+		 * the object info cannot be obtained.
+		 */
+		if (dmu_object_info(zfsvfs->z_os, obj, &doi) == 0) {
+			ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+			    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+
+			/*
+			 * We need to re-mark these list entries for
+			 * deletion, so we pull them back into core and
+			 * set zp->z_unlinked.
+			 *
+			 * We may pick up znodes that are already marked
+			 * for deletion.  This could happen during the
+			 * purge of an extended attribute directory.  All
+			 * we need to do is skip over them, since they are
+			 * already in the system marked z_unlinked.
+			 */
+			if (zfs_zget(zfsvfs, obj, &zp) == 0) {
+				zp->z_unlinked = B_TRUE;
+				VN_RELE(ZTOV(zp));
+			}
+		}
+		zap_cursor_advance(&zc);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory.  Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *	so there is no need to lock its entries before deletion.
+ *	Also, it assumes the directory contents is *only* regular
+ *	files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	znode_t		*xzp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zfs_dirlock_t	dl;
+	int skipped = 0;
+	int error;
+
+	/*
+	 * 'error' is reused inside the loop body, but the loop condition
+	 * reassigns it from zap_cursor_retrieve() on every pass, so its
+	 * value after the loop is the cursor's final status.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		error = zfs_zget(zfsvfs,
+		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+		if (error) {
+			skipped += 1;
+			continue;
+		}
+
+		ASSERT((ZTOV(xzp)->v_type == VREG) ||
+		    (ZTOV(xzp)->v_type == VLNK));
+
+		/* One transaction per entry being removed. */
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_bonus(tx, dzp->z_id);
+		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+		dmu_tx_hold_bonus(tx, xzp->z_id);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			VN_RELE(ZTOV(xzp));
+			skipped += 1;
+			continue;
+		}
+		/*
+		 * Build a dirlock on the stack instead of calling
+		 * zfs_dirent_lock(); only dl_dzp and dl_name are filled in.
+		 */
+		bzero(&dl, sizeof (dl));
+		dl.dl_dzp = dzp;
+		dl.dl_name = zap.za_name;
+
+		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+		if (error)
+			skipped += 1;
+		dmu_tx_commit(tx);
+
+		VN_RELE(ZTOV(xzp));
+	}
+	zap_cursor_fini(&zc);
+	/* Any final cursor status other than ENOENT counts as a failure. */
+	if (error != ENOENT)
+		skipped += 1;
+	return (skipped);
+}
+
+/*
+ * Remove a znode that has no remaining links or vnode references.
+ *
+ * An xattr directory has its contents purged first.  If the znode has an
+ * xattr directory of its own, that directory is marked unlinked and added
+ * to the unlinked set in the same transaction that deletes the znode and
+ * removes its own unlinked-set entry.  If the purge or the transaction
+ * assignment fails, the in-core znode is freed and its unlinked-set entry
+ * is left in place.
+ */
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os = zfsvfs->z_os;
+	znode_t		*xzp = NULL;
+	char		obj_name[17];
+	dmu_tx_t	*tx;
+	uint64_t	acl_obj;
+	int		error;
+
+	ASSERT(ZTOV(zp)->v_count == 0);
+	ASSERT(zp->z_phys->zp_links == 0);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
+		if (zfs_purgedir(zp) != 0) {
+			/*
+			 * Not enough space to delete some xattrs.
+			 * Leave it on the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	}
+
+	/*
+	 * If the file has extended attributes, we're going to unlink
+	 * the xattr dir.
+	 */
+	if (zp->z_phys->zp_xattr) {
+		error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+		ASSERT(error == 0);
+	}
+
+	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+	/*
+	 * Set up the transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	if (xzp) {
+		/* The xattr dir will be added to the unlinked set below. */
+		dmu_tx_hold_bonus(tx, xzp->z_id);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+	}
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		/*
+		 * Not enough space to delete the file.  Leave it in the
+		 * unlinked set, leaking it until the fs is remounted (at
+		 * which point we'll call zfs_unlinked_drain() to process it).
+		 */
+		dmu_tx_abort(tx);
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		goto out;
+	}
+
+	if (xzp) {
+		dmu_buf_will_dirty(xzp->z_dbuf, tx);
+		mutex_enter(&xzp->z_lock);
+		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
+		xzp->z_phys->zp_links = 0;	/* no more links to it */
+		mutex_exit(&xzp->z_lock);
+		zfs_unlinked_add(xzp, tx);
+	}
+
+	/* Remove this znode from the unlinked set */
+	error = zap_remove(os, zfsvfs->z_unlinkedobj,
+	    zfs_unlinked_hexname(obj_name, zp->z_id), tx);
+	ASSERT3U(error, ==, 0);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+out:
+	/* Drop the hold taken by zfs_zget() on the xattr directory. */
+	if (xzp)
+		VN_RELE(ZTOV(xzp));
+}
+
+/*
+ * Build the value stored in a directory's zap for 'zp': the object id,
+ * with IFTODT() of the file mode shifted into the bits from 60 upward
+ * when the file system version is at least ZPL_VERSION_DIRENT_TYPE.
+ */
+static uint64_t
+zfs_dirent(znode_t *zp)
+{
+	if (zp->z_zfsvfs->z_version < ZPL_VERSION_DIRENT_TYPE)
+		return (zp->z_id);
+
+	return (zp->z_id | (IFTODT(zp->z_phys->zp_mode) << 60));
+}
+
+/*
+ * Link zp into dl.  Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+	znode_t *dzp = dl->dl_dzp;
+	vnode_t *vp = ZTOV(zp);
+	uint64_t value;
+	int zp_is_dir = (vp->v_type == VDIR);
+	int error;
+
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	mutex_enter(&zp->z_lock);
+
+	/* A rename moves an existing link, so the link count is untouched. */
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_unlinked) {	/* no new links to unlinked zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			mutex_exit(&zp->z_lock);
+			return (ENOENT);
+		}
+		zp->z_phys->zp_links++;
+	}
+	zp->z_phys->zp_parent = dzp->z_id;	/* dzp is now zp's parent */
+
+	if (!(flag & ZNEW))
+		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+	mutex_exit(&zp->z_lock);
+
+	/* Update the directory: size, link count and timestamps. */
+	dmu_buf_will_dirty(dzp->z_dbuf, tx);
+	mutex_enter(&dzp->z_lock);
+	dzp->z_phys->zp_size++;			/* one dirent added */
+	dzp->z_phys->zp_links += zp_is_dir;	/* ".." link from zp */
+	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+	mutex_exit(&dzp->z_lock);
+
+	/* Add the name to the directory's zap, then to the DNLC. */
+	value = zfs_dirent(zp);
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+	    8, 1, &value, tx);
+	ASSERT(error == 0);
+
+	dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+
+	return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+	boolean_t *unlinkedp)
+{
+	znode_t *dzp = dl->dl_dzp;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);
+	boolean_t unlinked = B_FALSE;
+	int error;
+
+	/* The DNLC entry is dropped first, even if a check below fails. */
+	dnlc_remove(ZTOV(dzp), dl->dl_name);
+
+	/* A rename keeps the link, so zp's link count is left alone. */
+	if (!(flag & ZRENAMING)) {
+		dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
+			return (EBUSY);
+
+		if (vn_ismntpt(vp)) {		/* don't remove mount point */
+			vn_vfsunlock(vp);
+			return (EBUSY);
+		}
+
+		mutex_enter(&zp->z_lock);
+		if (zp_is_dir && !zfs_dirempty(zp)) {	/* dir not empty */
+			mutex_exit(&zp->z_lock);
+			vn_vfsunlock(vp);
+			return (EEXIST);
+		}
+		/*
+		 * The link count is too low to drop another link: report it
+		 * and reset it to the minimum so the decrement is valid.
+		 */
+		if (zp->z_phys->zp_links <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on %s is %u, "
+			    "should be at least %u",
+			    zp->z_vnode->v_path ? zp->z_vnode->v_path :
+			    "<unknown>", (int)zp->z_phys->zp_links,
+			    zp_is_dir + 1);
+			zp->z_phys->zp_links = zp_is_dir + 1;
+		}
+		/*
+		 * For a directory the remaining count of 1 is treated as
+		 * the last link, so the count is forced to zero here.
+		 */
+		if (--zp->z_phys->zp_links == zp_is_dir) {
+			zp->z_unlinked = B_TRUE;
+			zp->z_phys->zp_links = 0;
+			unlinked = B_TRUE;
+		} else {
+			zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+		}
+		mutex_exit(&zp->z_lock);
+		vn_vfsunlock(vp);
+	}
+
+	dmu_buf_will_dirty(dzp->z_dbuf, tx);
+	mutex_enter(&dzp->z_lock);
+	dzp->z_phys->zp_size--;			/* one dirent removed */
+	dzp->z_phys->zp_links -= zp_is_dir;	/* ".." link from zp */
+	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+	mutex_exit(&dzp->z_lock);
+
+	/*
+	 * Remove the name from the directory's zap.  The exact-match test
+	 * is the same one zfs_dirent_lock() uses to choose its lookup.
+	 */
+	if (zp->z_zfsvfs->z_norm) {
+		if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
+		    (flag & ZCIEXACT)) ||
+		    ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
+		    !(flag & ZCILOOK)))
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, dl->dl_name, MT_EXACT, tx);
+		else
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, dl->dl_name, MT_FIRST, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os,
+		    dzp->z_id, dl->dl_name, tx);
+	}
+	ASSERT(error == 0);
+
+	/* Report to the caller, or queue the znode for deletion here. */
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.  Works with or without z_lock
+ * held, but can only be considered a hint in the latter case.  Returns
+ * true if only "." and ".." remain and there's no work in progress (no
+ * dirlocks are outstanding on the directory).
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	if (dzp->z_phys->zp_size != 2)
+		return (B_FALSE);
+
+	return (dzp->z_dirlocks == 0 ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Create the extended attribute directory for 'zp'.
+ *
+ *	IN:	zp	- znode that gets the new attribute directory
+ *		vap	- attributes for the new directory
+ *		cr	- credentials of caller
+ *
+ *	OUT:	xvpp	- vnode of the new directory (NULL on failure)
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ *
+ * When the transaction cannot be assigned with ERESTART under TXG_NOWAIT,
+ * this function waits for the transaction before returning the error.
+ */
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zfs_fuid_info_t *fuidp = NULL;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+
+	*xvpp = NULL;
+
+	error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr);
+	if (error)
+		return (error);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
+		/*
+		 * Ephemeral ids may need the FUID table.  Hold the existing
+		 * table object, or a new object plus the master node if the
+		 * table has not been created yet.
+		 */
+		if (zfsvfs->z_fuid_obj != 0) {
+			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+			    FUID_SIZE_ESTIMATE(zfsvfs));
+		} else {
+			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    FUID_SIZE_ESTIMATE(zfsvfs));
+			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+		}
+	}
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+			dmu_tx_wait(tx);
+		dmu_tx_abort(tx);
+		return (error);
+	}
+
+	/* Create the directory and record it in zp's zp_xattr field. */
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
+	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	zp->z_phys->zp_xattr = xzp->z_id;
+
+	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+	    xzp, "", NULL, fuidp, vap);
+	if (fuidp)
+		zfs_fuid_info_free(fuidp);
+	dmu_tx_commit(tx);
+
+	*xvpp = ZTOV(xzp);
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *		flags	- flags from the VOP_LOOKUP call
+ *
+ *	OUT:	xzpp	- pointer to extended attribute znode
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	znode_t		*xzp;
+	zfs_dirlock_t	*dl;
+	vattr_t		va;
+	int		error;
+top:
+	/* With ZXATTR the name is not looked up; "" is a placeholder. */
+	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	if (error)
+		return (error);
+
+	/* The directory already exists: hand it back. */
+	if (xzp != NULL) {
+		*xvpp = ZTOV(xzp);
+		zfs_dirent_unlock(dl);
+		return (0);
+	}
+
+	ASSERT(zp->z_phys->zp_xattr == 0);
+
+	if (!(flags & CREATE_XATTR_DIR)) {
+		zfs_dirent_unlock(dl);
+		return (ENOENT);
+	}
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		zfs_dirent_unlock(dl);
+		return (EROFS);
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+	 *
+	 * The ability to 'search' an attribute directory requires
+	 * read_xattr permission on the base file.
+	 *
+	 * Once in a directory the ability to read/write attributes
+	 * is controlled by the permissions on the attribute file.
+	 */
+	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+	va.va_type = VDIR;
+	va.va_mode = S_IFDIR | S_ISVTX | 0777;
+	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+	zfs_dirent_unlock(dl);
+
+	/* Retry from the lock step when the transaction must be restarted. */
+	if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		/* NB: we already did dmu_tx_wait() if necessary */
+		goto top;
+	}
+
+	return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ *	you own the directory,
+ *	you own the entry,
+ *	the entry is a plain file and you have write access,
+ *	or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
+	uid_t		uid;
+	uid_t		downer;
+	uid_t		fowner;
+
+	/* No checks are made during ZIL replay. */
+	if (zfsvfs->z_assign >= TXG_INITIAL)
+		return (0);
+
+	/* The directory is not sticky, so there is nothing more to check. */
+	if (!(zdp->z_phys->zp_mode & S_ISVTX))
+		return (0);
+
+	downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
+	uid = crgetuid(cr);
+
+	/* Owner of the directory or of the entry. */
+	if (uid == downer || uid == fowner)
+		return (0);
+
+	/* Plain file that the caller may write. */
+	if (ZTOV(zp)->v_type == VREG &&
+	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+		return (0);
+
+	/* Otherwise the decision is left to the privilege policy. */
+	return (secpolicy_vnode_remove(cr));
+}
@@ -0,0 +1,688 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_fuid.c	1.5	08/01/31 SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/sunddi.h>
+#include <sys/dmu.h>
+#include <sys/avl.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/nvpair.h>
+#ifdef _KERNEL
+#include <sys/kidmap.h>
+#include <sys/sid.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#endif
+#include <sys/zfs_fuid.h>
+
+/*
+ * FUID Domain table(s).
+ *
+ * The FUID table is stored as a packed nvlist of an array
+ * of nvlists which contain an index, domain string and offset
+ *
+ * During file system initialization the nvlist(s) are read and
+ * two AVL trees are created.  One tree is keyed by the index number
+ * and the other by the domain string.  Nodes are never removed from
+ * trees, but new entries may be added.  If a new entry is added then the
+ * on-disk packed nvlist will also be updated.
+ */
+
+#define	FUID_IDX	"fuid_idx"	/* uint64: index of the domain */
+#define	FUID_DOMAIN	"fuid_domain"	/* string: domain name */
+#define	FUID_OFFSET	"fuid_offset"	/* uint64: always written as 0 here */
+#define	FUID_NVP_ARRAY	"fuid_nvlist"	/* nvlist array holding all entries */
+
+typedef struct fuid_domain {
+	avl_node_t	f_domnode;	/* linkage in the domain-keyed tree */
+	avl_node_t	f_idxnode;	/* linkage in the index-keyed tree */
+	ksiddomain_t	*f_ksid;	/* domain; kd_name is the sort key */
+	uint64_t	f_idx;		/* index assigned to this domain */
+} fuid_domain_t;
+
+/*
+ * AVL comparator for the index-keyed tree: orders two fuid_domain_t
+ * nodes by f_idx and returns -1, 0 or 1.
+ */
+static int
+idx_compare(const void *arg1, const void *arg2)
+{
+	uint64_t lhs = ((const fuid_domain_t *)arg1)->f_idx;
+	uint64_t rhs = ((const fuid_domain_t *)arg2)->f_idx;
+
+	if (lhs == rhs)
+		return (0);
+	return (lhs < rhs ? -1 : 1);
+}
+
+/*
+ * AVL comparator for the domain-keyed tree: orders two fuid_domain_t
+ * nodes by their domain name (f_ksid->kd_name).  The strcmp() result
+ * is normalized to -1, 0 or 1.
+ */
+static int
+domain_compare(const void *arg1, const void *arg2)
+{
+	const fuid_domain_t *lhs = arg1;
+	const fuid_domain_t *rhs = arg2;
+	int order = strcmp(lhs->f_ksid->kd_name, rhs->f_ksid->kd_name);
+
+	if (order < 0)
+		return (-1);
+	if (order > 0)
+		return (1);
+	return (0);
+}
+
+/*
+ * Build the in-core FUID domain trees from the on-disk table.  This
+ * function is used by both the kernel and zdb.
+ *
+ * Both trees are always created.  The size of the packed nvlist is read
+ * from the bonus buffer of fuid_obj; when it is non-zero the nvlist is
+ * read and unpacked, and one fuid_domain_t per array entry is inserted
+ * into both idx_tree and domain_tree.
+ *
+ * Returns the packed size found in the bonus buffer (0 for an empty
+ * table).  Any DMU or nvlist failure trips a VERIFY.
+ */
+uint64_t
+zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
+    avl_tree_t *domain_tree)
+{
+	dmu_buf_t *bonus;
+	uint64_t table_size;
+	nvlist_t **entries;
+	nvlist_t *table = NULL;
+	uint_t nentries;
+	uint_t n;
+	char *buf;
+
+	avl_create(idx_tree, idx_compare,
+	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+	avl_create(domain_tree, domain_compare,
+	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+
+	VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &bonus));
+	table_size = *(uint64_t *)bonus->db_data;
+	dmu_buf_rele(bonus, FTAG);
+
+	/* Nothing stored yet: leave both trees empty */
+	if (table_size == 0)
+		return (table_size);
+
+	buf = kmem_alloc(table_size, KM_SLEEP);
+	VERIFY(dmu_read(os, fuid_obj, 0, table_size, buf) == 0);
+	VERIFY(nvlist_unpack(buf, table_size, &table, 0) == 0);
+	VERIFY(nvlist_lookup_nvlist_array(table, FUID_NVP_ARRAY,
+	    &entries, &nentries) == 0);
+
+	for (n = 0; n < nentries; n++) {
+		fuid_domain_t *node;
+		char *name;
+		uint64_t index;
+
+		VERIFY(nvlist_lookup_string(entries[n], FUID_DOMAIN,
+		    &name) == 0);
+		VERIFY(nvlist_lookup_uint64(entries[n], FUID_IDX,
+		    &index) == 0);
+
+		node = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+		node->f_idx = index;
+		node->f_ksid = ksid_lookupdomain(name);
+
+		/* the same node is linked into both trees */
+		avl_add(idx_tree, node);
+		avl_add(domain_tree, node);
+	}
+
+	nvlist_free(table);
+	kmem_free(buf, table_size);
+
+	return (table_size);
+}
+
+/*
+ * Tear down both FUID trees.
+ *
+ * Every fuid_domain_t is linked into both trees, so the work is split:
+ * the walk over domain_tree drops the ksid domain reference of each
+ * node, and the walk over idx_tree frees the node memory itself.
+ */
+void
+zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+	fuid_domain_t *node;
+	void *cookie = NULL;
+
+	while ((node = avl_destroy_nodes(domain_tree, &cookie)) != NULL)
+		ksiddomain_rele(node->f_ksid);
+	avl_destroy(domain_tree);
+
+	cookie = NULL;
+	while ((node = avl_destroy_nodes(idx_tree, &cookie)) != NULL)
+		kmem_free(node, sizeof (fuid_domain_t));
+	avl_destroy(idx_tree);
+}
+
+/*
+ * Translate a FUID domain index into its domain string.
+ *
+ *	idx_tree - index-keyed AVL tree of fuid_domain_t nodes
+ *	idx	 - domain index to look up
+ *
+ * Returns the kd_name of the matching node.  An index that is not
+ * present in the tree yields the empty string instead of dereferencing
+ * a NULL node, so the caller always gets a valid string back.
+ */
+char *
+zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
+{
+	fuid_domain_t searchnode, *findnode;
+	avl_index_t loc;
+
+	searchnode.f_idx = idx;
+
+	findnode = avl_find(idx_tree, &searchnode, &loc);
+
+	/* avl_find() returns NULL when no node carries this index */
+	if (findnode == NULL)
+		return ("");
+
+	return (findnode->f_ksid->kd_name);
+}
+
+#ifdef _KERNEL
+/*
+ * Load the fuid table(s) into memory.
+ *
+ * Runs entirely under z_fuid_lock held as writer and does nothing if
+ * the table is already loaded.  When the object number of the table is
+ * not yet known it is looked up in the master node; if that lookup
+ * reports ENOENT and a transaction was supplied, the table object is
+ * allocated and registered in the master node.  The trees are then
+ * populated with zfs_fuid_table_load() and the file system is marked
+ * as having its FUID table loaded.
+ *
+ *	zfsvfs	- file system whose table is being loaded
+ *	tx	- open transaction, or NULL when nothing may be allocated
+ */
+static void
+zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+	int err;
+
+	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+	if (!zfsvfs->z_fuid_loaded) {
+		if (zfsvfs->z_fuid_obj == 0) {
+			/* first make sure we need to allocate object */
+			err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+			    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
+
+			if (tx != NULL && err == ENOENT) {
+				zfsvfs->z_fuid_obj =
+				    dmu_object_alloc(zfsvfs->z_os,
+				    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+				    sizeof (uint64_t), tx);
+				VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+				    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+				    &zfsvfs->z_fuid_obj, tx) == 0);
+			}
+		}
+
+		zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
+		    zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
+		    &zfsvfs->z_fuid_domain);
+		zfsvfs->z_fuid_loaded = B_TRUE;
+	}
+
+	rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Query domain table for a given domain.
+ *
+ * If the domain isn't found it is added to both AVL trees and the
+ * whole table is re-packed and pushed out to disk in tx.
+ *
+ *	zfsvfs	  - file system whose FUID table is consulted
+ *	domain	  - domain string to look up; must not be NULL
+ *	retdomain - optional; when non-NULL receives the string to use
+ *		    for the domain ("" for the dummy "nobody" domain,
+ *		    otherwise the kd_name of the ksid domain)
+ *	tx	  - transaction used if the table has to be created or
+ *		    rewritten
+ *
+ * Returns the table index of the domain, or 0 for the empty domain.
+ */
+int
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
+    dmu_tx_t *tx)
+{
+	fuid_domain_t searchnode, *findnode;
+	fuid_domain_t *domnode;
+	avl_index_t loc;
+	nvlist_t *nvp;
+	nvlist_t **fuids;
+	uint64_t retidx;
+	size_t nvsize = 0;
+	char *packed;
+	dmu_buf_t *db;
+	int i = 0;
+
+	/*
+	 * If this is the dummy "nobody" domain then return an index of 0
+	 * to cause the created FUID to be a standard POSIX id
+	 * for the user nobody.  retdomain is optional here just as it
+	 * is on the regular path below.
+	 */
+	if (domain[0] == '\0') {
+		if (retdomain)
+			*retdomain = "";
+		return (0);
+	}
+
+	searchnode.f_ksid = ksid_lookupdomain(domain);
+	if (retdomain) {
+		*retdomain = searchnode.f_ksid->kd_name;
+	}
+	if (!zfsvfs->z_fuid_loaded)
+		zfs_fuid_init(zfsvfs, tx);
+
+	/* Fast path: the domain is usually known already */
+	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
+	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
+	rw_exit(&zfsvfs->z_fuid_lock);
+
+	if (findnode == NULL) {
+		/*
+		 * The reader lock was dropped, so another thread may have
+		 * inserted this very domain in the meantime.  Look again
+		 * under the writer lock; inserting a duplicate key with
+		 * avl_add() is not allowed.
+		 */
+		rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+		findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
+		if (findnode != NULL)
+			rw_exit(&zfsvfs->z_fuid_lock);
+	}
+
+	if (findnode != NULL) {
+		/* the tree node holds its own reference on the domain */
+		ksiddomain_rele(searchnode.f_ksid);
+		return (findnode->f_idx);
+	}
+
+	/*
+	 * Still unknown and we hold the writer lock: create the node.
+	 * It takes over the reference obtained by ksid_lookupdomain().
+	 */
+	domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+	domnode->f_ksid = searchnode.f_ksid;
+	retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
+
+	avl_add(&zfsvfs->z_fuid_domain, domnode);
+	avl_add(&zfsvfs->z_fuid_idx, domnode);
+
+	/*
+	 * Now resync the on-disk nvlist: one nvlist per domain, walked
+	 * in domain-tree order.  After the insert above the tree holds
+	 * exactly retidx nodes.
+	 */
+	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	domnode = avl_first(&zfsvfs->z_fuid_domain);
+	fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
+	while (domnode) {
+		VERIFY(nvlist_alloc(&fuids[i],
+		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+		    domnode->f_idx) == 0);
+		VERIFY(nvlist_add_uint64(fuids[i],
+		    FUID_OFFSET, 0) == 0);
+		VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
+		    domnode->f_ksid->kd_name) == 0);
+		domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
+	}
+	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+	    fuids, retidx) == 0);
+	for (i = 0; i != retidx; i++)
+		nvlist_free(fuids[i]);
+	kmem_free(fuids, retidx * sizeof (void *));
+
+	/* Pack the table and write it at offset 0 of the table object */
+	VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+	packed = kmem_alloc(nvsize, KM_SLEEP);
+	VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+	    NV_ENCODE_XDR, KM_SLEEP) == 0);
+	nvlist_free(nvp);
+	zfsvfs->z_fuid_size = nvsize;
+	dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+	    zfsvfs->z_fuid_size, packed, tx);
+	kmem_free(packed, zfsvfs->z_fuid_size);
+
+	/* The packed size lives in the bonus buffer; keep it in sync */
+	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+	    FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+	dmu_buf_rele(db, FTAG);
+
+	rw_exit(&zfsvfs->z_fuid_lock);
+	return (retidx);
+}
+
+/*
+ * Query domain table by index, returning the domain string.
+ *
+ * Returns NULL when idx is 0 or the file system does not use FUIDs.
+ * Otherwise the table is loaded on first use and the string is fetched
+ * from the index tree under the reader lock; the returned pointer
+ * refers to storage owned by the tree node.
+ */
+static char *
+zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
+{
+	char *name = NULL;
+
+	if (idx != 0 && zfsvfs->z_use_fuids) {
+		if (!zfsvfs->z_fuid_loaded)
+			zfs_fuid_init(zfsvfs, NULL);
+
+		rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
+		name = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
+		rw_exit(&zfsvfs->z_fuid_lock);
+
+		ASSERT(name);
+	}
+
+	return (name);
+}
+
+/*
+ * Map the owner and group FUIDs stored in a znode to ids.
+ *
+ *	zp	- znode whose zp_uid / zp_gid are translated
+ *	cr	- credentials passed through to zfs_fuid_map_id()
+ *	uidp	- receives the mapped owner
+ *	gidp	- receives the mapped group
+ */
+void
+zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	*uidp = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
+	*gidp = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_gid, cr, ZFS_GROUP);
+}
+
+/*
+ * Map a single FUID to an id.
+ *
+ * A FUID whose index part is 0 is returned unchanged.  Otherwise the
+ * index is resolved to a domain string and the domain plus the rid
+ * part of the FUID are handed to the kidmap service: the uid lookup
+ * for ZFS_OWNER and ZFS_ACE_USER, the gid lookup for every other type.
+ * The status returned by the kidmap call is ignored.
+ */
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+    cred_t *cr, zfs_fuid_type_t type)
+{
+	uint32_t domidx = FUID_INDEX(fuid);
+	char *sid_prefix;
+	uid_t mapped;
+
+	if (domidx == 0)
+		return (fuid);
+
+	sid_prefix = zfs_fuid_find_by_idx(zfsvfs, domidx);
+	ASSERT(sid_prefix != NULL);
+
+	switch (type) {
+	case ZFS_OWNER:
+	case ZFS_ACE_USER:
+		(void) kidmap_getuidbysid(crgetzone(cr), sid_prefix,
+		    FUID_RID(fuid), &mapped);
+		break;
+	default:
+		(void) kidmap_getgidbysid(crgetzone(cr), sid_prefix,
+		    FUID_RID(fuid), &mapped);
+		break;
+	}
+
+	return (mapped);
+}
+
+/*
+ * Record a FUID in the zfs_fuid_info_t that is being built up for the
+ * current operation, allocating the info structure on first use.
+ *
+ * Each distinct domain index is kept only once in the z_domains list.
+ * The FUID stored for logging is encoded with the 1-based position of
+ * the domain in that list rather than with the table index idx.
+ *
+ * ZFS_ACE_USER / ZFS_ACE_GROUP entries are appended to the z_fuids
+ * list; ZFS_OWNER sets z_fuid_owner and any other type sets
+ * z_fuid_group.
+ */
+static void
+zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
+    uint64_t idx, uint64_t id, zfs_fuid_type_t type)
+{
+	zfs_fuid_info_t *info;
+	zfs_fuid_domain_t *dom;
+	zfs_fuid_t *ent;
+	uint64_t logidx = 1;
+
+	if (*fuidpp == NULL)
+		*fuidpp = zfs_fuid_info_alloc();
+	info = *fuidpp;
+
+	/*
+	 * Walk the domain list counting positions.  If the walk runs off
+	 * the end, logidx is the position the new entry is about to get.
+	 */
+	dom = list_head(&info->z_domains);
+	while (dom != NULL && dom->z_domidx != idx) {
+		dom = list_next(&info->z_domains, dom);
+		logidx++;
+	}
+
+	if (dom == NULL) {
+		dom = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
+		dom->z_domain = domain;
+		dom->z_domidx = idx;
+		list_insert_tail(&info->z_domains, dom);
+		info->z_domain_str_sz += strlen(domain) + 1;
+		info->z_domain_cnt++;
+	}
+
+	switch (type) {
+	case ZFS_ACE_USER:
+	case ZFS_ACE_GROUP:
+		/* ACE fuids are queued at the tail of the fuid list */
+		ent = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+		ent->z_id = id;
+		ent->z_domidx = idx;
+		ent->z_logfuid = FUID_ENCODE(logidx, rid);
+
+		list_insert_tail(&info->z_fuids, ent);
+		info->z_fuid_cnt++;
+		break;
+	case ZFS_OWNER:
+		info->z_fuid_owner = FUID_ENCODE(logidx, rid);
+		break;
+	default:
+		info->z_fuid_group = FUID_ENCODE(logidx, rid);
+		break;
+	}
+}
+
+/*
+ * Create a file system FUID, based on information in the user's cred.
+ *
+ * Only ZFS_OWNER and ZFS_GROUP are accepted (VERIFY'd).  The uid or gid
+ * is taken from the cred; when the file system does not use FUIDs or
+ * the id is not ephemeral it is returned unchanged.  Otherwise the
+ * owner/group SID of the cred supplies the domain and rid, the domain
+ * is resolved to a table index, the result is recorded in *fuidp and
+ * the encoded FUID is returned.
+ */
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+    dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+	const char	*sid_domain;
+	char		*table_domain;
+	ksid_t		*sid;
+	uint64_t	domidx;
+	uint32_t	rid;
+	uid_t		id;
+
+	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+	id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+	if (!(zfsvfs->z_use_fuids && IS_EPHEMERAL(id)))
+		return ((uint64_t)id);
+
+	sid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+	VERIFY(sid != NULL);
+
+	rid = ksid_getrid(sid);
+	sid_domain = ksid_getdomain(sid);
+
+	domidx = zfs_fuid_find_by_domain(zfsvfs, sid_domain,
+	    &table_domain, tx);
+
+	zfs_fuid_node_add(fuidp, table_domain, rid, domidx, id, type);
+
+	return (FUID_ENCODE(domidx, rid));
+}
+
+/*
+ * Create a file system FUID for an ACL ace
+ * or a chown/chgrp of the file.
+ * This is similar to zfs_fuid_create_cred, except that
+ * we can't find the domain + rid information in the
+ * cred.  Instead we have to query Winchester for the
+ * domain and rid (the kidmap_getsidbyuid()/kidmap_getsidbygid()
+ * calls below).
+ *
+ * During replay operations the domain+rid information is
+ * found in the zfs_fuid_info_t that the replay code has
+ * attached to the zfsvfs of the file system.
+ *
+ *	zfsvfs	- file system the FUID is created for
+ *	id	- id to convert
+ *	cr	- cred; only its zone is used, for the kidmap query
+ *	type	- selects the uid or gid query and, during replay,
+ *		  which logged FUID is consumed
+ *	tx	- passed to zfs_fuid_find_by_domain()
+ *	fuidpp	- outside of replay, the FUID is recorded here via
+ *		  zfs_fuid_node_add()
+ *
+ * Returns id unchanged when no conversion is needed, UID_NOBODY when
+ * replaying without logged FUID information, and otherwise
+ * FUID_ENCODE(idx, rid) where idx is the domain's table index.
+ *
+ * Replay is detected by z_assign >= TXG_INITIAL.  For the ACE types
+ * the entry at the head of z_fuids is used and, once consumed, removed
+ * from the list and freed at the end of the function.
+ *
+ * NOTE(review): in the replay path the ACE cases dereference the list
+ * head without a NULL check, and z_domain_table is indexed with
+ * idx - 1 without checking that idx is non-zero; presumably the logged
+ * record guarantees both - confirm against the replay code.
+ */
+uint64_t
+zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
+    zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
+{
+	const char *domain;
+	char *kdomain;
+	uint32_t fuid_idx = FUID_INDEX(id);
+	uint32_t rid;
+	idmap_stat status;
+	uint64_t idx;
+	boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
+	zfs_fuid_t *zfuid = NULL;
+	zfs_fuid_info_t *fuidp;
+
+	/*
+	 * If POSIX ID, or entry is already a FUID then
+	 * just return the id.  The same applies when the file
+	 * system does not use FUIDs at all.
+	 *
+	 * We may also be handed an already FUID'ized id via
+	 * chmod.
+	 */
+
+	if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
+		return (id);
+
+	if (is_replay) {
+		fuidp = zfsvfs->z_fuid_replay;
+
+		/*
+		 * If we are passed an ephemeral id, but no
+		 * fuid_info was logged then return NOBODY.
+		 * This is most likely a result of idmap service
+		 * not being available.
+		 */
+		if (fuidp == NULL)
+			return (UID_NOBODY);
+
+		switch (type) {
+		case ZFS_ACE_USER:
+		case ZFS_ACE_GROUP:
+			zfuid = list_head(&fuidp->z_fuids);
+			rid = FUID_RID(zfuid->z_logfuid);
+			idx = FUID_INDEX(zfuid->z_logfuid);
+			break;
+		case ZFS_OWNER:
+			rid = FUID_RID(fuidp->z_fuid_owner);
+			idx = FUID_INDEX(fuidp->z_fuid_owner);
+			break;
+		case ZFS_GROUP:
+			rid = FUID_RID(fuidp->z_fuid_group);
+			idx = FUID_INDEX(fuidp->z_fuid_group);
+			break;
+		};
+		domain = fuidp->z_domain_table[idx -1];	/* logged idx is 1-based */
+	} else {
+		if (type == ZFS_OWNER || type == ZFS_ACE_USER)
+			status = kidmap_getsidbyuid(crgetzone(cr), id,
+			    &domain, &rid);
+		else
+			status = kidmap_getsidbygid(crgetzone(cr), id,
+			    &domain, &rid);
+
+		if (status != 0) {
+			/*
+			 * When returning nobody we will need to
+			 * make a dummy fuid table entry for logging
+			 * purposes.  The empty domain makes
+			 * zfs_fuid_find_by_domain() return index 0.
+			 */
+			rid = UID_NOBODY;
+			domain = "";
+		}
+	}
+
+	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
+
+	if (!is_replay)
+		zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
+	else if (zfuid != NULL) {
+		list_remove(&fuidp->z_fuids, zfuid);	/* consumed above */
+		kmem_free(zfuid, sizeof (zfs_fuid_t));
+	}
+	return (FUID_ENCODE(idx, rid));
+}
+
+/*
+ * Destroy the in-core FUID trees of a file system.  Nothing is done
+ * if the table was never loaded.  The check and the teardown both run
+ * under z_fuid_lock held as writer.
+ */
+void
+zfs_fuid_destroy(zfsvfs_t *zfsvfs)
+{
+	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+	if (zfsvfs->z_fuid_loaded) {
+		zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx,
+		    &zfsvfs->z_fuid_domain);
+	}
+	rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Allocate zfs_fuid_info for tracking FUIDs created during
+ * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR().
+ *
+ * The structure is returned zero-filled with its two lists (domains
+ * and fuids) created and empty.  Release it with zfs_fuid_info_free().
+ */
+zfs_fuid_info_t *
+zfs_fuid_info_alloc(void)
+{
+	zfs_fuid_info_t *info = kmem_zalloc(sizeof (zfs_fuid_info_t),
+	    KM_SLEEP);
+
+	list_create(&info->z_domains, sizeof (zfs_fuid_domain_t),
+	    offsetof(zfs_fuid_domain_t, z_next));
+	list_create(&info->z_fuids, sizeof (zfs_fuid_t),
+	    offsetof(zfs_fuid_t, z_next));
+
+	return (info);
+}
+
+/*
+ * Release all memory associated with a zfs_fuid_info_t: every entry
+ * still on the fuid list, the domain table (if one was attached),
+ * every entry on the domain list, and finally the structure itself.
+ */
+void
+zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
+{
+	zfs_fuid_t *ent;
+	zfs_fuid_domain_t *dom;
+
+	ent = list_head(&fuidp->z_fuids);
+	while (ent != NULL) {
+		list_remove(&fuidp->z_fuids, ent);
+		kmem_free(ent, sizeof (zfs_fuid_t));
+		ent = list_head(&fuidp->z_fuids);
+	}
+
+	if (fuidp->z_domain_table != NULL) {
+		kmem_free(fuidp->z_domain_table,
+		    (sizeof (char **)) * fuidp->z_domain_cnt);
+	}
+
+	dom = list_head(&fuidp->z_domains);
+	while (dom != NULL) {
+		list_remove(&fuidp->z_domains, dom);
+		kmem_free(dom, sizeof (zfs_fuid_domain_t));
+		dom = list_head(&fuidp->z_domains);
+	}
+
+	kmem_free(fuidp, sizeof (zfs_fuid_info_t));
+}
+
+/*
+ * Check to see if id is a groupmember.  If the cred has ksid info
+ * then its sidlist is checked first, and if the id is still not found
+ * the POSIX groups are checked.
+ *
+ * Will use a straight FUID compare when possible: an id with a zero
+ * index part is compared against ks_id directly (the well-known
+ * creator-group gid never matches), an id with a domain index is
+ * compared by domain string and rid.  A FUID in the creator SID
+ * authority domain is never a member.
+ */
+boolean_t
+zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
+{
+	ksid_t		*group_sid = crgetsid(cr, KSID_GROUP);
+	uid_t		posix_gid;
+
+	if (group_sid != NULL) {
+		ksidlist_t	*sidlist = crgetsidlist(cr);
+		ksid_t		*sids;
+		uint32_t	domidx = FUID_INDEX(id);
+		uint32_t	rid = FUID_RID(id);
+		int		n;
+
+		ASSERT(sidlist);
+		sids = sidlist->ksl_sids;
+
+		for (n = 0; n != sidlist->ksl_nsid; n++) {
+			char *domain;
+
+			if (domidx == 0) {
+				/* plain id: compare numerically */
+				if (id != IDMAP_WK_CREATOR_GROUP_GID &&
+				    id == sids[n].ks_id)
+					return (B_TRUE);
+				continue;
+			}
+
+			domain = zfs_fuid_find_by_idx(zfsvfs, domidx);
+			ASSERT(domain != NULL);
+
+			if (strcmp(domain,
+			    IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
+				return (B_FALSE);
+
+			if (strcmp(domain,
+			    sids[n].ks_domain->kd_name) == 0 &&
+			    rid == sids[n].ks_rid)
+				return (B_TRUE);
+		}
+	}
+
+	/*
+	 * Not found in ksidlist, check posix groups
+	 */
+	posix_gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
+	return (groupmember(posix_gid, cr));
+}
+#endif
@@ -0,0 +1,693 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_log.c	1.13	08/04/09 SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/zfs_fuid.h>
+#include <sys/ddi.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
+/*
+ * Pick the intent log transaction type for a create operation.
+ *
+ * The choice depends on what is being created (file, directory or
+ * extended attribute directory), on whether an ACL was supplied
+ * (vsecp != NULL) and on whether the vattr carries extended
+ * attributes (AT_XVATTR set in va_mask).  An unknown type trips an
+ * ASSERT and yields TX_MAX_TYPE.
+ */
+int
+zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
+{
+	boolean_t has_acl = (vsecp != NULL) ? B_TRUE : B_FALSE;
+	boolean_t has_xvattr = (vap->va_mask & AT_XVATTR) ? B_TRUE : B_FALSE;
+
+	switch (type) {
+	case Z_FILE:
+		if (has_acl) {
+			return (has_xvattr ?
+			    TX_CREATE_ACL_ATTR : TX_CREATE_ACL);
+		}
+		return (has_xvattr ? TX_CREATE_ATTR : TX_CREATE);
+	case Z_DIR:
+		if (has_acl) {
+			return (has_xvattr ?
+			    TX_MKDIR_ACL_ATTR : TX_MKDIR_ACL);
+		}
+		return (has_xvattr ? TX_MKDIR_ATTR : TX_MKDIR);
+	case Z_XATTRDIR:
+		return (TX_MKXATTR);
+	}
+	ASSERT(0);
+	return (TX_MAX_TYPE);
+}
+
+/*
+ * Build up the log data necessary for logging an xvattr_t.
+ *
+ * Layout written, starting at lrattr:
+ *
+ *	lr_attr_masksize	number of bitmap words (xva_mapsize)
+ *	lr_attr_bitmap[]	requested-attribute bitmap copied from
+ *				the xvattr_t, xva_mapsize 32-bit words
+ *	uint64_t attrs		one XAT0_* bit for every requested
+ *				boolean attribute whose value is set
+ *	uint64_t crtime[2]	encoded create time (only written when
+ *				XAT_CREATETIME is requested)
+ *	scanstamp		AV_SCANSTAMP_SZ bytes (only written when
+ *				XAT_AV_SCANSTAMP is requested)
+ *
+ * The caller must have sized the record so that all of this fits.
+ *
+ *	lrattr	- start of the attribute area inside the log record
+ *	xvap	- extended attributes to log; must carry an xoptattr_t
+ */
+static void
+zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+	uint32_t	*bitmap;
+	uint64_t	*attrs;
+	uint64_t	*crtime;
+	xoptattr_t	*xoap;
+	void		*scanstamp;
+	int		i;
+
+	xoap = xva_getxoptattr(xvap);
+	ASSERT(xoap);
+
+	lrattr->lr_attr_masksize = xvap->xva_mapsize;
+	bitmap = &lrattr->lr_attr_bitmap;
+	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+		*bitmap = xvap->xva_reqattrmap[i];
+	}
+
+	/* Now pack the attributes up in a single uint64_t */
+	attrs = (uint64_t *)bitmap;
+	crtime = attrs + 1;
+	scanstamp = (caddr_t)(crtime + 2);
+	*attrs = 0;
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
+		    XAT0_READONLY;
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+		*attrs |= (xoap->xoa_hidden == 0) ? 0 :
+		    XAT0_HIDDEN;
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+		*attrs |= (xoap->xoa_system == 0) ? 0 :
+		    XAT0_SYSTEM;
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+		*attrs |= (xoap->xoa_archive == 0) ? 0 :
+		    XAT0_ARCHIVE;
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+		*attrs |= (xoap->xoa_immutable == 0) ? 0 :
+		    XAT0_IMMUTABLE;
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+		*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+		    XAT0_NOUNLINK;
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+		*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+		    XAT0_APPENDONLY;
+	/*
+	 * The opaque attribute has its own bit; logging it as
+	 * XAT0_APPENDONLY would turn it into append-only on replay.
+	 */
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+		*attrs |= (xoap->xoa_opaque == 0) ? 0 :
+		    XAT0_OPAQUE;
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+		*attrs |= (xoap->xoa_nodump == 0) ? 0 :
+		    XAT0_NODUMP;
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+		*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+		    XAT0_AV_QUARANTINED;
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+		*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+		    XAT0_AV_MODIFIED;
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+}
+
+/*
+ * Copy the log FUID (z_logfuid) of every entry on the z_fuids list
+ * into consecutive uint64_t slots beginning at start.
+ *
+ * Returns the address just past the last slot written.
+ */
+static void *
+zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
+{
+	uint64_t *slot = start;
+	zfs_fuid_t *ent = list_head(&fuidp->z_fuids);
+
+	/* First copy in the ACE FUIDs */
+	while (ent != NULL) {
+		*slot = ent->z_logfuid;
+		slot++;
+		ent = list_next(&fuidp->z_fuids, ent);
+	}
+
+	return (slot);
+}
+
+
+/*
+ * Copy every domain string on the z_domains list, terminating NUL
+ * included, back to back beginning at start.  Nothing is copied when
+ * z_domain_str_sz is 0.
+ *
+ * Returns the address just past the last byte written.
+ */
+static void *
+zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
+{
+	zfs_fuid_domain_t *dom;
+	caddr_t dst = start;
+
+	/* now copy in the domain info, if any */
+	if (fuidp->z_domain_str_sz == 0)
+		return (start);
+
+	for (dom = list_head(&fuidp->z_domains); dom != NULL;
+	    dom = list_next(&fuidp->z_domains, dom)) {
+		size_t nbytes = strlen(dom->z_domain) + 1;
+
+		bcopy((void *)dom->z_domain, dst, nbytes);
+		dst += nbytes;
+	}
+
+	return (dst);
+}
+
+/*
+ * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
+ * TX_MKDIR_ATTR and TX_MKXATTR
+ * transactions; every other txtype passed in is laid out as an
+ * lr_acl_create_t record.
+ *
+ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
+ * domain information appended prior to the name.  In this case the
+ * uid/gid in the log record will be a log centric FUID.
+ *
+ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
+ * may contain attributes, ACL and optional fuid information.
+ *
+ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
+ * an ACL and normal users/groups in the ACEs.
+ *
+ * There may be an optional xvattr attribute information similar
+ * to zfs_log_setattr.
+ *
+ * The record is filled in this order:
+ *
+ *	lr_create_t or lr_acl_create_t
+ *	xvattr data		(only if AT_XVATTR is set in va_mask)
+ *	ACL entries		(only if vsecp is non-NULL)
+ *	ACE FUIDs		(only if fuidp is non-NULL)
+ *	FUID domain strings	(only if fuidp is non-NULL)
+ *	file name, NUL terminated
+ *
+ * Nothing is logged when zilog is NULL.  On return both dzp and zp
+ * have z_last_itx set to the sequence number of the new itx.
+ *
+ * NOTE(review): aclsize is only assigned on the lr_acl_create_t path
+ * but is read whenever vsecp is non-NULL, and fuidp is dereferenced
+ * without a NULL check when the owner or group is ephemeral; presumably
+ * callers always pass a matching txtype/vsecp and a fuidp in those
+ * cases - confirm against the callers.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
+    zfs_fuid_info_t *fuidp, vattr_t *vap)
+{
+	itx_t *itx;
+	uint64_t seq;
+	lr_create_t *lr;
+	lr_acl_create_t *lracl;
+	size_t aclsize;
+	size_t xvatsize = 0;
+	size_t txsize;
+	xvattr_t *xvap = (xvattr_t *)vap;
+	void *end;
+	size_t lrsize;
+
+	size_t namesize = strlen(name) + 1;
+	size_t fuidsz = 0;
+
+	if (zilog == NULL)
+		return;
+
+	/*
+	 * If we have FUIDs present then add in space for
+	 * domains and ACE fuid's if any.
+	 */
+	if (fuidp) {
+		fuidsz += fuidp->z_domain_str_sz;
+		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
+	}
+
+	if (vap->va_mask & AT_XVATTR)
+		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
+	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
+	    (int)txtype == TX_MKXATTR) {
+		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
+		lrsize = sizeof (*lr);
+	} else {
+		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
+		txsize =
+		    sizeof (lr_acl_create_t) + namesize + fuidsz +
+		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
+		lrsize = sizeof (lr_acl_create_t);
+	}
+
+	itx = zil_itx_create(txtype, txsize);
+
+	lr = (lr_create_t *)&itx->itx_lr;
+	lr->lr_doid = dzp->z_id;
+	lr->lr_foid = zp->z_id;
+	lr->lr_mode = zp->z_phys->zp_mode;
+	if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
+		lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+	} else {
+		lr->lr_uid = fuidp->z_fuid_owner;	/* log centric FUID */
+	}
+	if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
+		lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+	} else {
+		lr->lr_gid = fuidp->z_fuid_group;	/* log centric FUID */
+	}
+	lr->lr_gen = zp->z_phys->zp_gen;
+	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+	lr->lr_rdev = zp->z_phys->zp_rdev;
+
+	/*
+	 * Fill in xvattr info if any.  It sits directly behind the
+	 * fixed-size part of the record; "end" tracks where the next
+	 * variable-length item goes.
+	 */
+	if (vap->va_mask & AT_XVATTR) {
+		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
+		end = (caddr_t)lr + lrsize + xvatsize;
+	} else {
+		end = (caddr_t)lr + lrsize;
+	}
+
+	/* Now fill in any ACL info, viewing the record as lr_acl_create_t */
+
+	if (vsecp) {
+		lracl = (lr_acl_create_t *)&itx->itx_lr;
+		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
+		lracl->lr_acl_bytes = aclsize;
+		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+		lracl->lr_fuidcnt  = fuidp ? fuidp->z_fuid_cnt : 0;
+		if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
+			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+		else
+			lracl->lr_acl_flags = 0;
+
+		bcopy(vsecp->vsa_aclentp, end, aclsize);
+		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
+	}
+
+	/* drop in FUID info: the ACE FUIDs first, then the domain strings */
+	if (fuidp) {
+		end = zfs_log_fuid_ids(fuidp, end);
+		end = zfs_log_fuid_domains(fuidp, end);
+	}
+	/*
+	 * Now place file name in log record; it is the last item.
+	 */
+	bcopy(name, end, namesize);
+
+	seq = zil_itx_assign(zilog, itx, tx);
+	dzp->z_last_itx = seq;
+	zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ *
+ * The record is an lr_remove_t carrying the directory's object id,
+ * followed directly by the NUL-terminated name of the entry.  Nothing
+ * is logged when zilog is NULL.  The directory's z_last_itx is set to
+ * the sequence number of the new itx.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *dzp, char *name)
+{
+	size_t name_len = strlen(name) + 1;
+	lr_remove_t *rec;
+	itx_t *entry;
+
+	if (zilog == NULL)
+		return;
+
+	entry = zil_itx_create(txtype, sizeof (*rec) + name_len);
+	rec = (lr_remove_t *)&entry->itx_lr;
+	rec->lr_doid = dzp->z_id;
+
+	/* the name lives right behind the fixed-size record */
+	bcopy(name, (char *)(rec + 1), name_len);
+
+	dzp->z_last_itx = zil_itx_assign(zilog, entry, tx);
+}
+
+/*
+ * zfs_log_link() handles TX_LINK transactions.
+ *
+ * The record is an lr_link_t carrying the object ids of the directory
+ * and of the object being linked, followed directly by the
+ * NUL-terminated link name.  Nothing is logged when zilog is NULL.
+ * Both znodes get z_last_itx set to the sequence number of the new itx.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *dzp, znode_t *zp, char *name)
+{
+	size_t name_len = strlen(name) + 1;
+	lr_link_t *rec;
+	itx_t *entry;
+	uint64_t itx_seq;
+
+	if (zilog == NULL)
+		return;
+
+	entry = zil_itx_create(txtype, sizeof (*rec) + name_len);
+	rec = (lr_link_t *)&entry->itx_lr;
+	rec->lr_doid = dzp->z_id;
+	rec->lr_link_obj = zp->z_id;
+
+	/* the name lives right behind the fixed-size record */
+	bcopy(name, (char *)(rec + 1), name_len);
+
+	itx_seq = zil_itx_assign(zilog, entry, tx);
+	dzp->z_last_itx = itx_seq;
+	zp->z_last_itx = itx_seq;
+}
+
+/*
+ * zfs_log_symlink() handles TX_SYMLINK transactions.
+ *
+ * The record is an lr_create_t filled from the new znode (ids, mode,
+ * uid, gid, generation and creation time), followed by the
+ * NUL-terminated name and then the NUL-terminated link target.
+ * Nothing is logged when zilog is NULL.  Both znodes get z_last_itx
+ * set to the sequence number of the new itx.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+	size_t name_len = strlen(name) + 1;
+	size_t link_len = strlen(link) + 1;
+	lr_create_t *rec;
+	itx_t *entry;
+	char *payload;
+	uint64_t itx_seq;
+
+	if (zilog == NULL)
+		return;
+
+	entry = zil_itx_create(txtype, sizeof (*rec) + name_len + link_len);
+	rec = (lr_create_t *)&entry->itx_lr;
+
+	rec->lr_doid = dzp->z_id;
+	rec->lr_foid = zp->z_id;
+	rec->lr_mode = zp->z_phys->zp_mode;
+	rec->lr_uid = zp->z_phys->zp_uid;
+	rec->lr_gid = zp->z_phys->zp_gid;
+	rec->lr_gen = zp->z_phys->zp_gen;
+	rec->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+	rec->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+
+	/* variable part: name first, link target right after it */
+	payload = (char *)(rec + 1);
+	bcopy(name, payload, name_len);
+	bcopy(link, payload + name_len, link_len);
+
+	itx_seq = zil_itx_assign(zilog, entry, tx);
+	dzp->z_last_itx = itx_seq;
+	zp->z_last_itx = itx_seq;
+}
+
+/*
+ * zfs_log_rename() handles TX_RENAME transactions.
+ *
+ * The record is an lr_rename_t carrying the object ids of the source
+ * and target directories, followed by the NUL-terminated source name
+ * and then the NUL-terminated target name.  Nothing is logged when
+ * zilog is NULL.  Both directories and the renamed znode get
+ * z_last_itx set to the sequence number of the new itx.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+	size_t src_len = strlen(sname) + 1;
+	size_t dst_len = strlen(dname) + 1;
+	lr_rename_t *rec;
+	itx_t *entry;
+	char *payload;
+	uint64_t itx_seq;
+
+	if (zilog == NULL)
+		return;
+
+	entry = zil_itx_create(txtype, sizeof (*rec) + src_len + dst_len);
+	rec = (lr_rename_t *)&entry->itx_lr;
+	rec->lr_sdoid = sdzp->z_id;
+	rec->lr_tdoid = tdzp->z_id;
+
+	/* variable part: source name first, target name right after it */
+	payload = (char *)(rec + 1);
+	bcopy(sname, payload, src_len);
+	bcopy(dname, payload + src_len, dst_len);
+
+	itx_seq = zil_itx_assign(zilog, entry, tx);
+	sdzp->z_last_itx = itx_seq;
+	tdzp->z_last_itx = itx_seq;
+	szp->z_last_itx = itx_seq;
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ *
+ * One itx is queued per chunk of the write; the write is only split
+ * into several chunks when the pool has separate log devices and the
+ * remaining size exceeds ZIL_MAX_LOG_DATA.  Nothing is logged when
+ * zilog is NULL or the znode has been unlinked.
+ *
+ *	off	- file offset of the write
+ *	resid	- number of bytes written
+ *	ioflag	- FSYNC / FDSYNC select a copied, synchronous record
+ *
+ * zfs_immediate_write_sz is the size above which a write is logged
+ * as WR_INDIRECT (when there are no separate logs).
+ */
+ssize_t zfs_immediate_write_sz = 32768;
+
+#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+    sizeof (lr_write_t))
+
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+{
+	itx_wr_state_t write_state;
+	boolean_t slogging;
+	uintptr_t fsync_cnt;
+
+	if (zilog == NULL || zp->z_unlinked)
+		return;
+
+	/*
+	 * Writes are handled in three different ways:
+	 *
+	 * WR_INDIRECT:
+	 *    If the write is greater than zfs_immediate_write_sz and there are
+	 *    no separate logs in this pool then later *if* we need to log the
+	 *    write then dmu_sync() is used to immediately write the block and
+	 *    its block pointer is put in the log record.
+	 * WR_COPIED:
+	 *    If we know we'll immediately be committing the
+	 *    transaction (FSYNC or FDSYNC), then we allocate a larger
+	 *    log record here for the data and copy the data in.
+	 * WR_NEED_COPY:
+	 *    Otherwise we don't allocate a buffer, and *if* we need to
+	 *    flush the write later then a buffer is allocated and
+	 *    we retrieve the data using the dmu.
+	 */
+	slogging = spa_has_slogs(zilog->zl_spa);
+	if (resid > zfs_immediate_write_sz && !slogging)
+		write_state = WR_INDIRECT;
+	else if (ioflag & (FSYNC | FDSYNC))
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
+
+	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
+		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
+	}
+
+	while (resid) {
+		itx_t *itx;
+		lr_write_t *lr;
+		ssize_t len;
+
+		/*
+		 * If there are slogs and the write would overflow the largest
+		 * block, then because we don't want to use the main pool
+		 * to dmu_sync, we have to split the write.
+		 */
+		if (slogging && resid > ZIL_MAX_LOG_DATA)
+			len = SPA_MAXBLOCKSIZE >> 1;
+		else
+			len = resid;
+
+		itx = zil_itx_create(txtype, sizeof (*lr) +
+		    (write_state == WR_COPIED ? len : 0));
+		lr = (lr_write_t *)&itx->itx_lr;
+		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+		    zp->z_id, off, len, lr + 1) != 0) {
+			/* copy failed: fall back to WR_NEED_COPY from here on */
+			kmem_free(itx, offsetof(itx_t, itx_lr) +
+			    itx->itx_lr.lrc_reclen);
+			itx = zil_itx_create(txtype, sizeof (*lr));
+			lr = (lr_write_t *)&itx->itx_lr;
+			write_state = WR_NEED_COPY;
+		}
+
+		itx->itx_wr_state = write_state;
+		if (write_state == WR_NEED_COPY)
+			itx->itx_sod += len;
+		lr->lr_foid = zp->z_id;
+		lr->lr_offset = off;
+		lr->lr_length = len;
+		lr->lr_blkoff = 0;
+		BP_ZERO(&lr->lr_blkptr);
+
+		itx->itx_private = zp->z_zfsvfs;
+
+		if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
+		    (ioflag & (FSYNC | FDSYNC)))
+			itx->itx_sync = B_TRUE;
+		else
+			itx->itx_sync = B_FALSE;
+
+		zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+
+		off += len;	/* advance to the next chunk */
+		resid -= len;
+	}
+}
+
+/*
+ * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ *
+ * Builds a single lr_truncate_t record carrying the object id of zp and
+ * the caller's offset/length, and assigns it to the intent log.  Nothing
+ * is logged when there is no log or the file has already been unlinked.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, uint64_t off, uint64_t len)
+{
+	itx_t *record;
+	lr_truncate_t *trunc;
+
+	if (zilog == NULL)
+		return;
+	if (zp->z_unlinked)
+		return;
+
+	record = zil_itx_create(txtype, sizeof (lr_truncate_t));
+
+	trunc = (lr_truncate_t *)&record->itx_lr;
+	trunc->lr_foid = zp->z_id;
+	trunc->lr_offset = off;
+	trunc->lr_length = len;
+
+	/* Synchronous only while the file has sync openers. */
+	record->itx_sync = (zp->z_sync_cnt != 0);
+
+	/* Remember the sequence number of the last record for this file. */
+	zp->z_last_itx = zil_itx_assign(zilog, record, tx);
+}
+
+/*
+ * zfs_log_setattr() handles TX_SETATTR transactions.
+ *
+ * The record is an lr_setattr_t, optionally followed by the packed
+ * extended attributes (when AT_XVATTR is set in vap->va_mask) and then by
+ * the FUID domain strings (when fuidp is supplied).
+ *
+ *	vap		attributes being set; treated as an xvattr_t when
+ *			AT_XVATTR is set
+ *	mask_applied	attribute mask stored in the record
+ *	fuidp		optional FUID info; supplies the owner/group FUIDs
+ *			for ephemeral ids and the domain strings
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+{
+	itx_t		*itx;
+	uint64_t	seq;
+	lr_setattr_t	*lr;
+	xvattr_t	*xvap = (xvattr_t *)vap;
+	size_t		recsize = sizeof (lr_setattr_t);
+	void		*start;
+
+
+	if (zilog == NULL || zp->z_unlinked)
+		return;
+
+	/*
+	 * If XVATTR set, then log record size needs to allow
+	 * for lr_attr_t + xvattr mask, mapsize and create time
+	 * plus actual attribute values
+	 */
+	if (vap->va_mask & AT_XVATTR)
+		recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+	if (fuidp)
+		recsize += fuidp->z_domain_str_sz;
+
+	itx = zil_itx_create(txtype, recsize);
+	lr = (lr_setattr_t *)&itx->itx_lr;
+	lr->lr_foid = zp->z_id;
+	lr->lr_mask = (uint64_t)mask_applied;
+	lr->lr_mode = (uint64_t)vap->va_mode;
+	/*
+	 * Ephemeral ids are logged as the FUIDs held in fuidp.
+	 * NOTE(review): fuidp is dereferenced without a NULL check here;
+	 * presumably callers always supply it when an ephemeral id is
+	 * being set -- confirm against the callers.
+	 */
+	if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
+		lr->lr_uid = fuidp->z_fuid_owner;
+	else
+		lr->lr_uid = (uint64_t)vap->va_uid;
+
+	if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
+		lr->lr_gid = fuidp->z_fuid_group;
+	else
+		lr->lr_gid = (uint64_t)vap->va_gid;
+
+	lr->lr_size = (uint64_t)vap->va_size;
+	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+	/* Variable-length data starts right after the fixed header. */
+	start = (lr_setattr_t *)(lr + 1);
+	if (vap->va_mask & AT_XVATTR) {
+		zfs_log_xvattr((lr_attr_t *)start, xvap);
+		start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+	}
+
+	/*
+	 * Now stick on domain information if any on end
+	 */
+
+	if (fuidp)
+		(void) zfs_log_fuid_domains(fuidp, start);
+
+	/* Synchronous only while the file has sync openers. */
+	itx->itx_sync = (zp->z_sync_cnt != 0);
+	seq = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_acl() handles TX_ACL transactions.
+ *
+ * Filesystems still at ZPL_VERSION_INITIAL log the old TX_ACL_V0 record
+ * (header followed by the raw ACE array); all others log TX_ACL, whose
+ * header is followed by the ACE array padded to ZIL_ACE_LENGTH(), then the
+ * FUIDs and the FUID domain strings when fuidp is supplied.
+ *
+ *	zilog	intent log to append to; a NULL log makes this a no-op
+ *	tx	transaction handed to zil_itx_assign()
+ *	zp	file whose ACL changed; nothing is logged once unlinked
+ *	vsecp	the ACL being set (entries, count, byte size, flags)
+ *	fuidp	optional FUID info gathered while setting the ACL
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+    vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
+{
+	itx_t *itx;
+	uint64_t seq;
+	lr_acl_v0_t *lrv0;
+	lr_acl_t *lr;
+	int txtype;
+	int lrsize;
+	size_t txsize;
+	size_t aclbytes = vsecp->vsa_aclentsz;
+
+	/* Decide whether there is anything to log before doing any work. */
+	if (zilog == NULL || zp->z_unlinked)
+		return;
+
+	txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ?
+	    TX_ACL_V0 : TX_ACL;
+
+	if (txtype == TX_ACL)
+		lrsize = sizeof (*lr);
+	else
+		lrsize = sizeof (*lrv0);
+
+	/*
+	 * Header + ACEs + domain strings + one 64-bit word per FUID.  The
+	 * FUIDs are sized with uint64_t, the type the replay side uses when
+	 * it walks and byteswaps them.
+	 */
+	txsize = lrsize +
+	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
+	    (fuidp ? fuidp->z_domain_str_sz : 0) +
+	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
+
+	itx = zil_itx_create(txtype, txsize);
+
+	/*
+	 * lr_foid and lr_aclcnt are written through the lr_acl_t view for
+	 * both record types.
+	 * NOTE(review): this relies on lr_acl_v0_t sharing those leading
+	 * fields with lr_acl_t -- confirm in zil.h.
+	 */
+	lr = (lr_acl_t *)&itx->itx_lr;
+	lr->lr_foid = zp->z_id;
+	if (txtype == TX_ACL) {
+		lr->lr_acl_bytes = aclbytes;
+		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
+			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+		else
+			lr->lr_acl_flags = 0;
+	}
+	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
+
+	if (txtype == TX_ACL_V0) {
+		lrv0 = (lr_acl_v0_t *)lr;
+		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
+	} else {
+		void *start = (ace_t *)(lr + 1);
+
+		bcopy(vsecp->vsa_aclentp, start, aclbytes);
+
+		/* Skip the padded ACE area, then append FUIDs and domains. */
+		start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
+
+		if (fuidp) {
+			start = zfs_log_fuid_ids(fuidp, start);
+			(void) zfs_log_fuid_domains(fuidp, start);
+		}
+	}
+
+	/* Synchronous only while the file has sync openers. */
+	itx->itx_sync = (zp->z_sync_cnt != 0);
+	seq = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seq;
+}
@@ -0,0 +1,876 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_replay.c	1.7	08/01/14 SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_fuid.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+/*
+ * Initialise *vap from the raw 64-bit fields of a log record.
+ *
+ * The structure is zeroed first.  The file type and permission bits are
+ * both derived from mode, and an ephemeral uid or gid is replaced by -1.
+ */
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+	bzero(vap, sizeof (vattr_t));
+
+	vap->va_mask = (uint_t)mask;
+	vap->va_type = IFTOVT(mode);
+	vap->va_mode = mode & MODEMASK;
+
+	if (IS_EPHEMERAL(uid))
+		vap->va_uid = (uid_t)-1;
+	else
+		vap->va_uid = (uid_t)uid;
+
+	if (IS_EPHEMERAL(gid))
+		vap->va_gid = (gid_t)-1;
+	else
+		vap->va_gid = (gid_t)gid;
+
+	vap->va_rdev = zfs_cmpldev(rdev);
+	vap->va_nodeid = nodeid;
+}
+
+/*
+ * Replay handler for slot 0 of zfs_replay_vector ("no such transaction
+ * type"): ignores its arguments and always fails with ENOTSUP.
+ */
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+	return (ENOTSUP);
+}
+
+/*
+ * Unpack the extended attributes stored in a log record into xvap.
+ *
+ * The record holds lr_attr_masksize 32-bit request-bitmap words starting
+ * at lr_attr_bitmap.  These are followed by one 64-bit word of XAT0_*
+ * flags, two 64-bit words of creation time and then the AV scanstamp.
+ * Only attributes whose request bit is set in the bitmap are copied out.
+ *
+ *	lrattr	packed attributes inside the log record
+ *	xvap	destination; AT_XVATTR is set in its mask, or cleared again
+ *		if no optional-attribute area can be obtained
+ */
+static void
+zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+	xoptattr_t *xoap = NULL;
+	uint64_t *attrs;
+	uint64_t *crtime;
+	uint32_t *bitmap;
+	void *scanstamp;
+	int i;
+
+	xvap->xva_vattr.va_mask |= AT_XVATTR;
+	if ((xoap = xva_getxoptattr(xvap)) == NULL) {
+		xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
+		return;
+	}
+
+	ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
+
+	/* Copy the request bitmap; bitmap ends up one past its last word. */
+	bitmap = &lrattr->lr_attr_bitmap;
+	for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
+		xvap->xva_reqattrmap[i] = *bitmap;
+
+	/*
+	 * The 64-bit attribute word sits directly after the last bitmap
+	 * word, which is also where zfs_replay_swap_attrs() byteswaps it.
+	 * Deriving the address from the bitmap cursor keeps the two in
+	 * step for any mask size; stepping lrattr itself in units of
+	 * sizeof (lr_attr_t) does not.
+	 */
+	attrs = (uint64_t *)bitmap;
+	crtime = attrs + 1;
+	scanstamp = (caddr_t)(crtime + 2);
+
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+		xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+		xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+		xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+		xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+		xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+		xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+		xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+		xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+		xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+		xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+		xoap->xoa_av_quarantined =
+		    ((*attrs & XAT0_AV_QUARANTINED) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+		ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+		bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+}
+
+/*
+ * Count how many distinct FUID domains the given owner and group refer
+ * to.  A domain index of zero means "no domain"; when both ids carry the
+ * same non-zero index the domain is counted once.  Returns 0, 1 or 2.
+ */
+static int
+zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
+{
+	const uint64_t owner_dom = FUID_INDEX(uid);
+	const uint64_t group_dom = FUID_INDEX(gid);
+
+	/* The group adds nothing if it has no domain or shares the owner's. */
+	if (group_dom == 0 || group_dom == owner_dom)
+		return (owner_dom != 0 ? 1 : 0);
+
+	return (owner_dom != 0 ? 2 : 1);
+}
+
+/*
+ * Point the first domcnt slots of fuid_infop->z_domain_table at the
+ * consecutive NUL-terminated strings stored at start.  Returns the
+ * address just past the last string's terminator.
+ */
+static void *
+zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
+    int domcnt)
+{
+	char *cursor = start;
+	int slot = 0;
+
+	while (slot != domcnt) {
+		fuid_infop->z_domain_table[slot] = cursor;
+		cursor += strlen(cursor) + 1;
+		slot++;
+	}
+
+	return (cursor);
+}
+
+/*
+ * Set the uid/gid in the fuid_info structure.
+ *
+ * Only ephemeral ids are recorded: z_fuid_owner is set when uid is
+ * ephemeral and z_fuid_group when gid is; other ids leave the
+ * corresponding field untouched.
+ */
+static void
+zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
+{
+	/*
+	 * Record the logged owner/group FUIDs.  The domain strings are
+	 * loaded separately by the callers.
+	 */
+	if (IS_EPHEMERAL(uid))
+		fuid_infop->z_fuid_owner = uid;
+
+	if (IS_EPHEMERAL(gid))
+		fuid_infop->z_fuid_group = gid;
+}
+
+/*
+ * Load fuid domains into fuid_info_t
+ *
+ *	buf	start of the domain strings inside the log record
+ *	end	out: address just past the last domain string.  Left
+ *		untouched when uid/gid reference no domain, so callers
+ *		must initialise it beforehand.
+ *	uid/gid	logged owner and group; their domain indexes decide how
+ *		many strings are consumed
+ *
+ * Always returns a freshly allocated zfs_fuid_info_t.
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
+{
+	int domcnt;
+
+	zfs_fuid_info_t *fuid_infop;
+
+	fuid_infop = zfs_fuid_info_alloc();
+
+	domcnt = zfs_replay_domain_cnt(uid, gid);
+
+	/* No domains: return the empty info; note *end is not written. */
+	if (domcnt == 0)
+		return (fuid_infop);
+
+	/* One table slot per domain string. */
+	fuid_infop->z_domain_table =
+	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+	fuid_infop->z_domain_cnt = domcnt;
+	*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
+	return (fuid_infop);
+}
+
+/*
+ * load zfs_fuid_t's and fuid_domains into fuid_info_t
+ *
+ *	start	first of idcnt logged 64-bit FUIDs; the domcnt domain
+ *		strings follow them
+ *	end	out: address just past the last domain string (always set)
+ *	idcnt	number of logged FUIDs
+ *	domcnt	number of domain strings
+ *	uid/gid	logged owner and group, recorded when ephemeral
+ *
+ * Returns a freshly allocated zfs_fuid_info_t.
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
+    uint64_t gid)
+{
+	uint64_t *log_fuid = (uint64_t *)start;
+	zfs_fuid_info_t *fuid_infop;
+	int i;
+
+	fuid_infop = zfs_fuid_info_alloc();
+	fuid_infop->z_domain_cnt = domcnt;
+
+	/* One table slot per domain string. */
+	fuid_infop->z_domain_table =
+	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
+
+	/* Queue one zfs_fuid_t per logged FUID, in log order. */
+	for (i = 0; i != idcnt; i++) {
+		zfs_fuid_t *zfuid;
+
+		zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+		zfuid->z_logfuid = *log_fuid;
+		zfuid->z_id = -1;
+		zfuid->z_domidx = 0;
+		list_insert_tail(&fuid_infop->z_fuids, zfuid);
+		log_fuid++;
+	}
+
+	zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+	/* log_fuid now points at the first domain string. */
+	*end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
+	return (fuid_infop);
+}
+
+/*
+ * Byteswap the packed extended attributes of a log record in place: the
+ * lr_attr_t header, the remaining 32-bit bitmap words, and the three
+ * 64-bit words that follow the bitmap.
+ */
+static void
+zfs_replay_swap_attrs(lr_attr_t *lrattr)
+{
+	/* swap the lr_attr structure */
+	byteswap_uint32_array(lrattr, sizeof (*lrattr));
+	/*
+	 * swap the bitmap; lr_attr_masksize is read after the swap above,
+	 * and the first bitmap word was already swapped with the header
+	 */
+	byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
+	    sizeof (uint32_t));
+	/* swap the attributes, create time + 64 bit word for attributes */
+	byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
+	    (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
+}
+
+/*
+ * Replay file create with optional ACL, xvattr information as well
+ * as option FUID information.
+ *
+ * The record is an lr_acl_create_t followed by: the packed xvattr (only
+ * for the *_ACL_ATTR types), the ACEs padded to ZIL_ACE_LENGTH(), the
+ * logged FUIDs, the FUID domain strings, and finally the file name.
+ *
+ * Returns 0 without creating anything if the object already exists, the
+ * error from the lookup or create otherwise, and ENOTSUP for transaction
+ * types this function does not handle.
+ */
+static int
+zfs_replay_create_acl(zfsvfs_t *zfsvfs,
+    lr_acl_create_t *lracl, boolean_t byteswap)
+{
+	char *name = NULL;		/* location determined later */
+	lr_create_t *lr = (lr_create_t *)lracl;
+	znode_t *dzp;
+	vnode_t *vp = NULL;
+	xvattr_t xva;
+	int vflg = 0;
+	vsecattr_t vsec = { 0 };
+	lr_attr_t *lrattr;
+	void *aclstart;
+	void *fuidstart;
+	size_t xvatlen = 0;
+	uint64_t txtype;
+	int error;
+
+	if (byteswap) {
+		byteswap_uint64_array(lracl, sizeof (*lracl));
+		txtype = (int)lr->lr_common.lrc_txtype;
+		if (txtype == TX_CREATE_ACL_ATTR ||
+		    txtype == TX_MKDIR_ACL_ATTR) {
+			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+			zfs_replay_swap_attrs(lrattr);
+			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+		}
+
+		aclstart = (caddr_t)(lracl + 1) + xvatlen;
+		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
+		/* swap fuids */
+		if (lracl->lr_fuidcnt) {
+			byteswap_uint64_array((caddr_t)aclstart +
+			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
+			    lracl->lr_fuidcnt * sizeof (uint64_t));
+		}
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	xva_init(&xva);
+	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+	/*
+	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+	 * eventually end up in zfs_mknode(), which assigns the object's
+	 * creation time and generation number.  The generic VOP_CREATE()
+	 * doesn't have either concept, so we smuggle the values inside
+	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
+	 */
+	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+	xva.xva_vattr.va_nblocks = lr->lr_gen;
+
+	/*
+	 * Only create the object if it does not exist yet: anything other
+	 * than ENOENT (including success) ends the replay of this record.
+	 */
+	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
+	if (error != ENOENT)
+		goto bail;
+
+	if (lr->lr_common.lrc_txtype & TX_CI)
+		vflg |= FIGNORECASE;
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_CREATE_ACL:
+		/* No xvattr: the ACL starts right after the header. */
+		aclstart = (caddr_t)(lracl + 1);
+		fuidstart = (caddr_t)aclstart +
+		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+		/* Loading the FUIDs also yields the name's address. */
+		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+		    lr->lr_uid, lr->lr_gid);
+		/*FALLTHROUGH*/
+	case TX_CREATE_ACL_ATTR:
+		/* name is still NULL only when entered as the _ATTR type. */
+		if (name == NULL) {
+			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+			xva.xva_vattr.va_mask |= AT_XVATTR;
+			zfs_replay_xvattr(lrattr, &xva);
+		}
+		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+		vsec.vsa_aclcnt = lracl->lr_aclcnt;
+		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+		vsec.vsa_aclflags = lracl->lr_acl_flags;
+		if (zfsvfs->z_fuid_replay == NULL) {
+			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+			zfsvfs->z_fuid_replay =
+			    zfs_replay_fuids(fuidstart,
+			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+			    lr->lr_uid, lr->lr_gid);
+		}
+
+		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
+		    0, 0, &vp, kcred, vflg, NULL, &vsec);
+		break;
+	case TX_MKDIR_ACL:
+		/* No xvattr: the ACL starts right after the header. */
+		aclstart = (caddr_t)(lracl + 1);
+		fuidstart = (caddr_t)aclstart +
+		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+		    lr->lr_uid, lr->lr_gid);
+		/*FALLTHROUGH*/
+	case TX_MKDIR_ACL_ATTR:
+		/* name is still NULL only when entered as the _ATTR type. */
+		if (name == NULL) {
+			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+			zfs_replay_xvattr(lrattr, &xva);
+		}
+		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+		vsec.vsa_aclcnt = lracl->lr_aclcnt;
+		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+		vsec.vsa_aclflags = lracl->lr_acl_flags;
+		if (zfsvfs->z_fuid_replay == NULL) {
+			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+			zfsvfs->z_fuid_replay =
+			    zfs_replay_fuids(fuidstart,
+			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+			    lr->lr_uid, lr->lr_gid);
+		}
+		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
+		    &vp, kcred, NULL, vflg, &vsec);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+bail:
+	if (error == 0 && vp != NULL)
+		VN_RELE(vp);
+
+	VN_RELE(ZTOV(dzp));
+
+	/*
+	 * No FUID info has been built when we bail before the switch or
+	 * hit the default case, so only free it when it exists, exactly
+	 * as zfs_replay_create() and zfs_replay_acl() do.
+	 */
+	if (zfsvfs->z_fuid_replay)
+		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+	zfsvfs->z_fuid_replay = NULL;
+
+	return (error);
+}
+
+/*
+ * Replay a create-style record without an ACL: TX_CREATE, TX_MKDIR,
+ * TX_MKXATTR, TX_SYMLINK and the *_ATTR variants of create and mkdir.
+ *
+ * The record is an lr_create_t followed by: the packed xvattr (only for
+ * the *_ATTR types), the FUID domain strings (if the owner or group
+ * reference a domain), and the name.  For TX_SYMLINK the link target
+ * follows the name.
+ *
+ * Returns 0 without creating anything if the object already exists, the
+ * error from the lookup or create otherwise, and ENOTSUP for transaction
+ * types this function does not handle.
+ */
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+	char *name = NULL;		/* location determined later */
+	char *link;			/* symlink content follows name */
+	znode_t *dzp;
+	vnode_t *vp = NULL;
+	xvattr_t xva;
+	int vflg = 0;
+	size_t lrsize = sizeof (lr_create_t);
+	lr_attr_t *lrattr;
+	void *start;
+	size_t xvatlen;
+	uint64_t txtype;
+	int error;
+
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (*lr));
+		txtype = (int)lr->lr_common.lrc_txtype;
+		if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
+			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+	}
+
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+		return (error);
+
+	xva_init(&xva);
+	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+	/*
+	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+	 * eventually end up in zfs_mknode(), which assigns the object's
+	 * creation time and generation number.  The generic VOP_CREATE()
+	 * doesn't have either concept, so we smuggle the values inside
+	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
+	 */
+	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+	xva.xva_vattr.va_nblocks = lr->lr_gen;
+
+	/*
+	 * Only create the object if it does not exist yet: anything other
+	 * than ENOENT (including success) ends the replay of this record.
+	 */
+	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
+	if (error != ENOENT)
+		goto out;
+
+	if (lr->lr_common.lrc_txtype & TX_CI)
+		vflg |= FIGNORECASE;
+
+	/*
+	 * Symlinks don't have fuid info, and CIFS never creates
+	 * symlinks.
+	 *
+	 * The _ATTR versions will grab the fuid info in their subcases.
+	 *
+	 * start is left pointing past any domain strings, i.e. at the name.
+	 */
+	if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
+	    (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
+	    (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+		start = (lr + 1);
+		zfsvfs->z_fuid_replay =
+		    zfs_replay_fuid_domain(start, &start,
+		    lr->lr_uid, lr->lr_gid);
+	}
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_CREATE_ATTR:
+		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+		start = (caddr_t)(lr + 1) + xvatlen;
+		zfsvfs->z_fuid_replay =
+		    zfs_replay_fuid_domain(start, &start,
+		    lr->lr_uid, lr->lr_gid);
+		name = (char *)start;
+
+		/*FALLTHROUGH*/
+	case TX_CREATE:
+		if (name == NULL)
+			name = (char *)start;
+
+		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
+		    0, 0, &vp, kcred, vflg, NULL, NULL);
+		break;
+	case TX_MKDIR_ATTR:
+		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+		zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+		start = (caddr_t)(lr + 1) + xvatlen;
+		zfsvfs->z_fuid_replay =
+		    zfs_replay_fuid_domain(start, &start,
+		    lr->lr_uid, lr->lr_gid);
+		name = (char *)start;
+
+		/*FALLTHROUGH*/
+	case TX_MKDIR:
+		/*
+		 * As for TX_CREATE, the name follows any FUID domain
+		 * strings, so take it from the cursor advanced above
+		 * rather than from the end of the fixed header.  The two
+		 * coincide when the record carries no domains.
+		 */
+		if (name == NULL)
+			name = (char *)start;
+
+		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
+		    &vp, kcred, NULL, vflg, NULL);
+		break;
+	case TX_MKXATTR:
+		name = (char *)(lr + 1);
+		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
+		break;
+	case TX_SYMLINK:
+		name = (char *)(lr + 1);
+		link = name + strlen(name) + 1;
+		error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr,
+		    link, kcred, NULL, vflg);
+		break;
+	default:
+		error = ENOTSUP;
+	}
+
+out:
+	if (error == 0 && vp != NULL)
+		VN_RELE(vp);
+
+	VN_RELE(ZTOV(dzp));
+
+	/* FUID info is absent for symlinks and on the early-exit path. */
+	if (zfsvfs->z_fuid_replay)
+		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+	zfsvfs->z_fuid_replay = NULL;
+	return (error);
+}
+
+/*
+ * Replay a TX_REMOVE or TX_RMDIR record: remove the entry whose name
+ * follows the lr_remove_t header from directory lr_doid.  Any other
+ * transaction type yields ENOTSUP.
+ */
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+	znode_t *dir_zp;
+	char *entry;
+	int flags;
+	int op;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (lr_remove_t));
+
+	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir_zp);
+	if (rc != 0)
+		return (rc);
+
+	/* The entry name is stored directly behind the fixed header. */
+	entry = (char *)(lr + 1);
+	flags = (lr->lr_common.lrc_txtype & TX_CI) ? FIGNORECASE : 0;
+	op = (int)lr->lr_common.lrc_txtype;
+
+	if (op == TX_REMOVE)
+		rc = VOP_REMOVE(ZTOV(dir_zp), entry, kcred, NULL, flags);
+	else if (op == TX_RMDIR)
+		rc = VOP_RMDIR(ZTOV(dir_zp), entry, NULL, kcred, NULL, flags);
+	else
+		rc = ENOTSUP;
+
+	VN_RELE(ZTOV(dir_zp));
+
+	return (rc);
+}
+
+/*
+ * Replay a TX_LINK record: create a link to object lr_link_obj in
+ * directory lr_doid, under the name that follows the lr_link_t header.
+ */
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+	znode_t *dir_zp;
+	znode_t *target_zp;
+	char *linkname;
+	int flags;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (lr_link_t));
+
+	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir_zp);
+	if (rc != 0)
+		return (rc);
+
+	rc = zfs_zget(zfsvfs, lr->lr_link_obj, &target_zp);
+	if (rc != 0) {
+		/* Drop the directory hold taken above before failing. */
+		VN_RELE(ZTOV(dir_zp));
+		return (rc);
+	}
+
+	linkname = (char *)(lr + 1);
+	flags = (lr->lr_common.lrc_txtype & TX_CI) ? FIGNORECASE : 0;
+
+	rc = VOP_LINK(ZTOV(dir_zp), ZTOV(target_zp), linkname, kcred, NULL,
+	    flags);
+
+	VN_RELE(ZTOV(target_zp));
+	VN_RELE(ZTOV(dir_zp));
+
+	return (rc);
+}
+
+/*
+ * Replay a TX_RENAME record.  The source and target names are stored as
+ * two consecutive NUL-terminated strings behind the lr_rename_t header;
+ * lr_sdoid and lr_tdoid identify the source and target directories.
+ */
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+	znode_t *src_dir;
+	znode_t *dst_dir;
+	char *old_name;
+	char *new_name;
+	int flags;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (lr_rename_t));
+
+	rc = zfs_zget(zfsvfs, lr->lr_sdoid, &src_dir);
+	if (rc != 0)
+		return (rc);
+
+	rc = zfs_zget(zfsvfs, lr->lr_tdoid, &dst_dir);
+	if (rc != 0) {
+		/* Drop the source directory hold before failing. */
+		VN_RELE(ZTOV(src_dir));
+		return (rc);
+	}
+
+	old_name = (char *)(lr + 1);
+	new_name = old_name + strlen(old_name) + 1;
+	flags = (lr->lr_common.lrc_txtype & TX_CI) ? FIGNORECASE : 0;
+
+	rc = VOP_RENAME(ZTOV(src_dir), old_name, ZTOV(dst_dir), new_name,
+	    kcred, NULL, flags);
+
+	VN_RELE(ZTOV(dst_dir));
+	VN_RELE(ZTOV(src_dir));
+
+	return (rc);
+}
+
+/*
+ * Replay a TX_WRITE record: write the lr_length bytes that follow the
+ * lr_write_t header to object lr_foid at offset lr_offset.
+ */
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	znode_t	*zp;
+	ssize_t unwritten;
+	char *payload;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (lr_write_t));
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
+	/*
+	 * As we can log writes out of order, it's possible the
+	 * file has been removed. In this case just drop the write
+	 * and return success.
+	 */
+	if (rc == ENOENT)
+		return (0);
+	if (rc != 0)
+		return (rc);
+
+	/* The data is stored directly behind the fixed header. */
+	payload = (char *)(lr + 1);
+
+	rc = vn_rdwr(UIO_WRITE, ZTOV(zp), payload, lr->lr_length,
+	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
+	    &unwritten);
+
+	VN_RELE(ZTOV(zp));
+
+	return (rc);
+}
+
+/*
+ * Replay a TX_TRUNCATE record by freeing the range described by
+ * lr_offset/lr_length in object lr_foid via VOP_SPACE(F_FREESP).
+ */
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+	flock64_t range;
+	znode_t *zp;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (lr_truncate_t));
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
+	/*
+	 * As we can log truncates out of order, it's possible the
+	 * file has been removed. In this case just drop the truncate
+	 * and return success.
+	 */
+	if (rc == ENOENT)
+		return (0);
+	if (rc != 0)
+		return (rc);
+
+	/* Describe the range to free, relative to the start of the file. */
+	bzero(&range, sizeof (flock64_t));
+	range.l_type = F_WRLCK;
+	range.l_whence = 0;
+	range.l_start = lr->lr_offset;
+	range.l_len = lr->lr_length;
+
+	rc = VOP_SPACE(ZTOV(zp), F_FREESP, &range, FWRITE | FOFFMAX,
+	    lr->lr_offset, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (rc);
+}
+
+/*
+ * Replay a TX_SETATTR record.  The lr_setattr_t header is optionally
+ * followed by packed extended attributes (when AT_XVATTR is set in
+ * lr_mask) and then by the FUID domain strings for the owner and group.
+ */
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+	znode_t *zp;
+	xvattr_t xva;
+	vattr_t *vap = &xva.xva_vattr;
+	int error;
+	void *start;
+
+	xva_init(&xva);
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+		/*
+		 * NOTE(review): if ZPL_VERSION_INITIAL is the lowest
+		 * possible version this test is always true -- confirm the
+		 * intended comparison.
+		 */
+		if ((lr->lr_mask & AT_XVATTR) &&
+		    zfsvfs->z_version >= ZPL_VERSION_INITIAL)
+			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log setattrs out of order, it's possible the
+		 * file has been removed. In this case just drop the setattr
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
+	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+	vap->va_size = lr->lr_size;
+	ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
+	ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
+
+	/*
+	 * Fill in xvattr_t portions if necessary.
+	 */
+
+	/* Variable-length data starts right after the fixed header. */
+	start = (lr_setattr_t *)(lr + 1);
+	if (vap->va_mask & AT_XVATTR) {
+		zfs_replay_xvattr((lr_attr_t *)start, &xva);
+		start = (caddr_t)start +
+		    ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
+	} else
+		xva.xva_vattr.va_mask &= ~AT_XVATTR;
+
+	/* Always returns an info structure, so the free below is safe. */
+	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
+	    lr->lr_uid, lr->lr_gid);
+
+	error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);
+
+	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+	zfsvfs->z_fuid_replay = NULL;
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Replay a TX_ACL_V0 record: set the old-style ACL whose lr_aclcnt
+ * entries follow the lr_acl_v0_t header on object lr_foid.
+ */
+static int
+zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
+{
+	ace_t *entries = (ace_t *)(lr + 1);	/* follows lr_acl_v0_t */
+	vsecattr_t acl;
+	znode_t *zp;
+	int rc;
+
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (lr_acl_v0_t));
+		zfs_oldace_byteswap(entries, lr->lr_aclcnt);
+	}
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &zp);
+	/*
+	 * As we can log acls out of order, it's possible the
+	 * file has been removed. In this case just drop the acl
+	 * and return success.
+	 */
+	if (rc == ENOENT)
+		return (0);
+	if (rc != 0)
+		return (rc);
+
+	bzero(&acl, sizeof (vsecattr_t));
+	acl.vsa_aclentp = entries;
+	acl.vsa_aclcnt = lr->lr_aclcnt;
+	acl.vsa_mask = VSA_ACE | VSA_ACECNT;
+
+	rc = VOP_SETSECATTR(ZTOV(zp), &acl, 0, kcred, NULL);
+
+	VN_RELE(ZTOV(zp));
+
+	return (rc);
+}
+
+/*
+ * Replaying ACLs is complicated by FUID support.
+ * The log record may contain some optional data
+ * to be used for replaying FUID's.  These pieces
+ * are the actual FUIDs that were created initially.
+ * The FUID table index may no longer be valid and
+ * during zfs_create() a new index may be assigned.
+ * Because of this the log will contain the original
+ * domain+rid in order to create a new FUID.
+ *
+ * The individual ACEs may contain an ephemeral uid/gid which is no
+ * longer valid and will need to be replaced with an actual FUID.
+ *
+ * Record layout: the lr_acl_t header, the ACEs padded to
+ * ZIL_ACE_LENGTH(lr_acl_bytes), then lr_fuidcnt 64-bit FUIDs followed by
+ * lr_domcnt domain strings.
+ */
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+	ace_t *ace = (ace_t *)(lr + 1);
+	vsecattr_t vsa;
+	znode_t *zp;
+	int error;
+
+	if (byteswap) {
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
+		/* The logged FUIDs follow the padded ACE area. */
+		if (lr->lr_fuidcnt) {
+			byteswap_uint64_array((caddr_t)ace +
+			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
+			    lr->lr_fuidcnt * sizeof (uint64_t));
+		}
+	}
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log acls out of order, it's possible the
+		 * file has been removed. In this case just drop the acl
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	bzero(&vsa, sizeof (vsa));
+	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
+	vsa.vsa_aclcnt = lr->lr_aclcnt;
+	vsa.vsa_aclentp = ace;
+	vsa.vsa_aclentsz = lr->lr_acl_bytes;
+	vsa.vsa_aclflags = lr->lr_acl_flags;
+
+	/* FUID info is only built when the record carries FUIDs. */
+	if (lr->lr_fuidcnt) {
+		void *fuidstart = (caddr_t)ace +
+		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);
+
+		zfsvfs->z_fuid_replay =
+		    zfs_replay_fuids(fuidstart, &fuidstart,
+		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
+	}
+
+	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+
+	if (zfsvfs->z_fuid_replay)
+		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+
+	zfsvfs->z_fuid_replay = NULL;
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ *
+ * The table is indexed by transaction type; the comment on each entry
+ * names the type that slot serves.  Slot 0 is not a valid type and maps
+ * to zfs_replay_error().
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+	zfs_replay_error,	/* 0 no such transaction type */
+	zfs_replay_create,	/* TX_CREATE */
+	zfs_replay_create,	/* TX_MKDIR */
+	zfs_replay_create,	/* TX_MKXATTR */
+	zfs_replay_create,	/* TX_SYMLINK */
+	zfs_replay_remove,	/* TX_REMOVE */
+	zfs_replay_remove,	/* TX_RMDIR */
+	zfs_replay_link,	/* TX_LINK */
+	zfs_replay_rename,	/* TX_RENAME */
+	zfs_replay_write,	/* TX_WRITE */
+	zfs_replay_truncate,	/* TX_TRUNCATE */
+	zfs_replay_setattr,	/* TX_SETATTR */
+	zfs_replay_acl_v0,	/* TX_ACL_V0 */
+	zfs_replay_acl,		/* TX_ACL */
+	zfs_replay_create_acl,	/* TX_CREATE_ACL */
+	zfs_replay_create,	/* TX_CREATE_ATTR */
+	zfs_replay_create_acl,	/* TX_CREATE_ACL_ATTR */
+	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
+	zfs_replay_create,	/* TX_MKDIR_ATTR */
+	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
+};
@@ -0,0 +1,602 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"@(#)zfs_rlock.c	1.4	07/08/08 SMI"
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind
+ * is support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ *	rl = zfs_range_lock(zp, off, len, lock_type);
+ *	zfs_range_unlock(rl);
+ *	zfs_range_reduce(rl, off, len);
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for
+ * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree is
+ * searched and, finding no overlap, *this* rl_t is placed in the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very 1st in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for non overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, and the lock type converted from RL_APPEND to
+ * RL_WRITER and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, currently up to 128K. The smallest
+ * block size is used for the file which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using zfs_range_reduce().
+ */
+
+#include <sys/zfs_rlock.h>
+
+/*
+ * Check if a write lock can be grabbed, or wait and recheck until available.
+ *
+ *	zp  - znode owning the range tree (z_range_avl) and z_range_lock
+ *	new - the caller's lock record, of type RL_WRITER or RL_APPEND
+ *
+ * On return 'new' is in the tree with r_type set to RL_WRITER. Its r_off
+ * and r_len may differ from what was passed in: an RL_APPEND lock starts
+ * at the current end of file, and a write that may need to grow the block
+ * size is widened to the whole file (offset 0, length UINT64_MAX).
+ *
+ * z_range_lock is handed to cv_wait() below, so it must already be held.
+ */
+static void
+zfs_range_lock_writer(znode_t *zp, rl_t *new)
+{
+	avl_tree_t *tree = &zp->z_range_avl;
+	rl_t *rl;
+	avl_index_t where;
+	uint64_t end_size;
+	/* requested range, restored after every failed attempt */
+	uint64_t off = new->r_off;
+	uint64_t len = new->r_len;
+
+	for (;;) {
+		/*
+		 * Range locking is also used by zvol and uses a
+		 * dummied up znode. However, for zvol, we don't need to
+		 * append or grow blocksize, and besides we don't have
+		 * a z_phys or z_zfsvfs - so skip that processing.
+		 *
+		 * Yes, this is ugly, and would be solved by not handling
+		 * grow or append in range lock code. If that was done then
+		 * we could make the range locking code generically available
+		 * to other non-zfs consumers.
+		 */
+		if (zp->z_vnode) { /* caller is ZPL */
+			/*
+			 * If in append mode pick up the current end of file.
+			 * This is done under z_range_lock to avoid races.
+			 */
+			if (new->r_type == RL_APPEND)
+				new->r_off = zp->z_phys->zp_size;
+
+			/*
+			 * If we need to grow the block size then grab the whole
+			 * file range. This is also done under z_range_lock to
+			 * avoid races.
+			 */
+			end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+			if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+			    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+				new->r_off = 0;
+				new->r_len = UINT64_MAX;
+			}
+		}
+
+		/*
+		 * First check for the usual case of no locks
+		 */
+		if (avl_numnodes(tree) == 0) {
+			new->r_type = RL_WRITER; /* convert to writer */
+			avl_add(tree, new);
+			return;
+		}
+
+		/*
+		 * Look for any locks in the range.
+		 */
+		rl = avl_find(tree, new, &where);
+		if (rl)
+			goto wait; /* already locked at same offset */
+
+		/* does the following lock start before our range ends? */
+		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+		if (rl && (rl->r_off < new->r_off + new->r_len))
+			goto wait;
+
+		/* does the preceding lock extend past our start offset? */
+		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+		if (rl && rl->r_off + rl->r_len > new->r_off)
+			goto wait;
+
+		new->r_type = RL_WRITER; /* convert possible RL_APPEND */
+		avl_insert(tree, new, where);
+		return;
+wait:
+		/*
+		 * rl is the conflicting lock found above. The first writer
+		 * to wait on it initialises its writer cv and sets the flag
+		 * that the unlock paths test before broadcasting on it.
+		 */
+		if (!rl->r_write_wanted) {
+			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
+			rl->r_write_wanted = B_TRUE;
+		}
+		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+
+		/*
+		 * reset to original; the append offset and any whole-file
+		 * widening are recomputed on the next pass
+		 */
+		new->r_off = off;
+		new->r_len = len;
+	}
+}
+
+/*
+ * Return a proxy lock standing in for rl.
+ *
+ * A lock that is already a proxy is handed back untouched. An original
+ * lock is taken out of the tree (its r_cnt dropping to 0) and a newly
+ * allocated reader proxy covering the same range, with an r_cnt of 1,
+ * is added to the tree in its place and returned.
+ */
+static rl_t *
+zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+{
+	rl_t *stand_in = rl;
+
+	if (!rl->r_proxy) {
+		/* an original lock must be unshared with nobody waiting */
+		ASSERT3U(rl->r_cnt, ==, 1);
+		ASSERT(rl->r_write_wanted == B_FALSE);
+		ASSERT(rl->r_read_wanted == B_FALSE);
+
+		/* retire the original from the tree ... */
+		avl_remove(tree, rl);
+		rl->r_cnt = 0;
+
+		/* ... and put a proxy for the same range in its place */
+		stand_in = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+		stand_in->r_proxy = B_TRUE;
+		stand_in->r_type = RL_READER;
+		stand_in->r_cnt = 1;
+		stand_in->r_off = rl->r_off;
+		stand_in->r_len = rl->r_len;
+		stand_in->r_read_wanted = B_FALSE;
+		stand_in->r_write_wanted = B_FALSE;
+		avl_add(tree, stand_in);
+	}
+
+	return (stand_in);
+}
+
+/*
+ * Cut the range lock rl in two at offset off, which must lie strictly
+ * inside the lock's range. The piece before off is rl itself, proxified
+ * if necessary; the piece from off onwards is a new proxy carrying the
+ * same reference count. The *front* piece is returned.
+ */
+static rl_t *
+zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+{
+	rl_t *head, *tail;
+	uint64_t rl_end = rl->r_off + rl->r_len;
+
+	ASSERT3U(rl->r_len, >, 1);
+	ASSERT3U(off, >, rl->r_off);
+	ASSERT3U(off, <, rl->r_off + rl->r_len);
+	ASSERT(rl->r_write_wanted == B_FALSE);
+	ASSERT(rl->r_read_wanted == B_FALSE);
+
+	/*
+	 * Build the rear proxy first: proxifying an original lock resets
+	 * its r_cnt, so the count has to be copied before that happens.
+	 */
+	tail = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+	tail->r_proxy = B_TRUE;
+	tail->r_type = RL_READER;
+	tail->r_cnt = rl->r_cnt;
+	tail->r_off = off;
+	tail->r_len = rl_end - off;
+	tail->r_read_wanted = B_FALSE;
+	tail->r_write_wanted = B_FALSE;
+
+	/* the front piece keeps the start offset but stops at off */
+	head = zfs_range_proxify(tree, rl);
+	head->r_len = off - rl->r_off;
+
+	avl_insert_here(tree, tail, head, AVL_AFTER);
+	return (head);
+}
+
+/*
+ * Allocate a reader proxy covering [off, off + len) with a reference
+ * count of 1 and add it to the tree. len must be non-zero.
+ */
+static void
+zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+	rl_t *proxy;
+
+	ASSERT(len);
+
+	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+	proxy->r_proxy = B_TRUE;
+	proxy->r_type = RL_READER;
+	proxy->r_cnt = 1;
+	proxy->r_off = off;
+	proxy->r_len = len;
+	proxy->r_read_wanted = B_FALSE;
+	proxy->r_write_wanted = B_FALSE;
+
+	avl_add(tree, proxy);
+}
+
+/*
+ * Insert the reader lock 'new' into the tree, sharing any ranges that
+ * are already locked.
+ *
+ *	tree  - range lock tree to update
+ *	new   - the caller's reader lock record
+ *	prev  - see the comment at the top of the function body
+ *	where - insertion point, used for avl_insert() and avl_nearest()
+ *
+ * If nothing overlaps, 'new' itself goes into the tree. Otherwise
+ * new->r_cnt is set to 0 and the range is represented by proxies:
+ * existing locks are proxified/split and have their r_cnt bumped, while
+ * stretches not covered by any existing lock get new proxies.
+ */
+static void
+zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+{
+	rl_t *next;
+	uint64_t off = new->r_off;
+	uint64_t len = new->r_len;
+
+	/*
+	 * prev arrives either:
+	 * - pointing to an entry at the same offset
+	 * - pointing to the entry with the closest previous offset whose
+	 *   range may overlap with the new range
+	 * - null, if there were no ranges starting before the new one
+	 */
+	if (prev) {
+		if (prev->r_off + prev->r_len <= off) {
+			/* ends at or before our start: no overlap after all */
+			prev = NULL;
+		} else if (prev->r_off != off) {
+			/*
+			 * convert to proxy if needed then
+			 * split this entry and bump ref count
+			 */
+			prev = zfs_range_split(tree, prev, off);
+			prev = AVL_NEXT(tree, prev); /* move to rear range */
+		}
+	}
+	ASSERT((prev == NULL) || (prev->r_off == off));
+
+	/* next: the first existing lock starting at or after off */
+	if (prev)
+		next = prev;
+	else
+		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+
+	if (next == NULL || off + len <= next->r_off) {
+		/* no overlaps, use the original new rl_t in the tree */
+		avl_insert(tree, new, where);
+		return;
+	}
+
+	if (off < next->r_off) {
+		/* Add a proxy for initial range before the overlap */
+		zfs_range_new_proxy(tree, off, next->r_off - off);
+	}
+
+	new->r_cnt = 0; /* will use proxies in tree */
+	/*
+	 * We now search forward through the ranges, until we go past the end
+	 * of the new range. For each entry we make it a proxy if it
+	 * isn't already, then bump its reference count. If there's any
+	 * gaps between the ranges then we create a new proxy range.
+	 */
+	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+		if (off + len <= next->r_off)
+			break;
+		if (prev && prev->r_off + prev->r_len < next->r_off) {
+			/* there's a gap */
+			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
+			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+			    next->r_off - (prev->r_off + prev->r_len));
+		}
+		if (off + len == next->r_off + next->r_len) {
+			/* exact overlap with end */
+			next = zfs_range_proxify(tree, next);
+			next->r_cnt++;
+			return;
+		}
+		if (off + len < next->r_off + next->r_len) {
+			/* new range ends in the middle of this block */
+			next = zfs_range_split(tree, next, off + len);
+			next->r_cnt++;
+			return;
+		}
+		ASSERT3U(off + len, >, next->r_off + next->r_len);
+		next = zfs_range_proxify(tree, next);
+		next->r_cnt++;
+	}
+
+	/*
+	 * Add the remaining end range. We only get here when the last lock
+	 * visited (prev) ends before off + len; a range ending exactly at,
+	 * or inside, an existing lock has already returned above. prev is
+	 * non-NULL because the overlap check before the loop guarantees
+	 * that the first iteration does not break out.
+	 */
+	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+	    (off + len) - (prev->r_off + prev->r_len));
+}
+
+/*
+ * Check if a reader lock can be grabbed, or wait and recheck until available.
+ *
+ *	zp  - znode owning the range tree (z_range_avl) and z_range_lock
+ *	new - the caller's reader lock record
+ *
+ * A reader waits behind any overlapping lock that is a writer or that has
+ * a writer waiting on it, then restarts the search from scratch. Once no
+ * such lock overlaps, zfs_range_add_reader() puts the lock in the tree.
+ *
+ * z_range_lock is handed to cv_wait() below, so it must already be held.
+ */
+static void
+zfs_range_lock_reader(znode_t *zp, rl_t *new)
+{
+	avl_tree_t *tree = &zp->z_range_avl;
+	rl_t *prev, *next;
+	avl_index_t where;
+	uint64_t off = new->r_off;
+	uint64_t len = new->r_len;
+
+	/*
+	 * Look for any writer locks in the range.
+	 */
+retry:
+	/* prev: the lock at the same offset or, failing that, the one before */
+	prev = avl_find(tree, new, &where);
+	if (prev == NULL)
+		prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+
+	/*
+	 * Check the previous range for a writer lock overlap.
+	 */
+	if (prev && (off < prev->r_off + prev->r_len)) {
+		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
+			/* the first reader to wait sets up the reader cv */
+			if (!prev->r_read_wanted) {
+				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
+				prev->r_read_wanted = B_TRUE;
+			}
+			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+			goto retry;
+		}
+		/* our range ends inside prev: no later lock can overlap */
+		if (off + len < prev->r_off + prev->r_len)
+			goto got_lock;
+	}
+
+	/*
+	 * Search through the following ranges to see if there's
+	 * any write lock overlap.
+	 */
+	if (prev)
+		next = AVL_NEXT(tree, prev);
+	else
+		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+	for (; next; next = AVL_NEXT(tree, next)) {
+		/* this lock starts at or after our end: nothing more to check */
+		if (off + len <= next->r_off)
+			goto got_lock;
+		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
+			if (!next->r_read_wanted) {
+				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
+				next->r_read_wanted = B_TRUE;
+			}
+			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+			goto retry;
+		}
+		if (off + len <= next->r_off + next->r_len)
+			goto got_lock;
+	}
+
+got_lock:
+	/*
+	 * Add the read lock, which may involve splitting existing
+	 * locks and bumping ref counts (r_cnt).
+	 */
+	zfs_range_add_reader(tree, new, prev, where);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER). Returns the range lock structure
+ * for later unlocking or reduce range (if entire file
+ * previously locked as RL_WRITER).
+ *
+ *	zp   - znode whose range tree is to be locked
+ *	off  - start offset of the range
+ *	len  - length of the range; clamped so that off + len cannot wrap
+ *	type - RL_READER, RL_WRITER or RL_APPEND
+ *
+ * May block until the range is available. The returned rl_t is allocated
+ * here and is freed again by zfs_range_unlock().
+ */
+rl_t *
+zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+{
+	rl_t *new;
+
+	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+	/*
+	 * The reader and writer paths decide whether two locks overlap by
+	 * comparing against off + len. A length such as UINT64_MAX together
+	 * with a non-zero offset makes that sum wrap around, which would
+	 * hide real overlaps, so trim the length to end at UINT64_MAX.
+	 */
+	if (len + off < off)	/* overflow */
+		len = UINT64_MAX - off;
+
+	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+	new->r_zp = zp;
+	new->r_off = off;
+	new->r_len = len;
+	new->r_cnt = 1; /* assume it's going to be in the tree */
+	new->r_type = type;
+	new->r_proxy = B_FALSE;
+	new->r_write_wanted = B_FALSE;
+	new->r_read_wanted = B_FALSE;
+
+	mutex_enter(&zp->z_range_lock);
+	if (type == RL_READER) {
+		/*
+		 * First check for the usual case of no locks
+		 */
+		if (avl_numnodes(&zp->z_range_avl) == 0)
+			avl_add(&zp->z_range_avl, new);
+		else
+			zfs_range_lock_reader(zp, new);
+	} else
+		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
+	mutex_exit(&zp->z_range_lock);
+	return (new);
+}
+
+/*
+ * Unlock a reader lock
+ *
+ *	zp     - znode owning the range tree
+ *	remove - the caller's reader lock record; always freed here
+ *
+ * Any lock that leaves the tree has its waiting writers and readers
+ * woken and the corresponding cvs destroyed.
+ */
+static void
+zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
+{
+	avl_tree_t *tree = &zp->z_range_avl;
+	/*
+	 * next starts out NULL: when the first proxy covers the whole range
+	 * the loop below never assigns it, yet the loop increment still
+	 * copies it into rl before the exit test.
+	 */
+	rl_t *rl, *next = NULL;
+	uint64_t len;
+
+	/*
+	 * The common case is when the remove entry is in the tree
+	 * (cnt == 1) meaning there's been no other reader locks overlapping
+	 * with this one. Otherwise the remove entry will have been
+	 * removed from the tree and replaced by proxies (one or
+	 * more ranges mapping to the entire range).
+	 */
+	if (remove->r_cnt == 1) {
+		avl_remove(tree, remove);
+		if (remove->r_write_wanted) {
+			cv_broadcast(&remove->r_wr_cv);
+			cv_destroy(&remove->r_wr_cv);
+		}
+		if (remove->r_read_wanted) {
+			cv_broadcast(&remove->r_rd_cv);
+			cv_destroy(&remove->r_rd_cv);
+		}
+	} else {
+		ASSERT3U(remove->r_cnt, ==, 0);
+		ASSERT3U(remove->r_write_wanted, ==, 0);
+		ASSERT3U(remove->r_read_wanted, ==, 0);
+		/*
+		 * Find start proxy representing this reader lock,
+		 * then decrement ref count on all proxies
+		 * that make up this range, freeing them as needed.
+		 */
+		rl = avl_find(tree, remove, NULL);
+		ASSERT(rl);
+		ASSERT(rl->r_cnt);
+		ASSERT(rl->r_type == RL_READER);
+		/* len counts down the part of the range still to be released */
+		for (len = remove->r_len; len != 0; rl = next) {
+			len -= rl->r_len;
+			if (len) {
+				/* pick up the successor before rl may be freed */
+				next = AVL_NEXT(tree, rl);
+				ASSERT(next);
+				ASSERT(rl->r_off + rl->r_len == next->r_off);
+				ASSERT(next->r_cnt);
+				ASSERT(next->r_type == RL_READER);
+			}
+			rl->r_cnt--;
+			if (rl->r_cnt == 0) {
+				avl_remove(tree, rl);
+				if (rl->r_write_wanted) {
+					cv_broadcast(&rl->r_wr_cv);
+					cv_destroy(&rl->r_wr_cv);
+				}
+				if (rl->r_read_wanted) {
+					cv_broadcast(&rl->r_rd_cv);
+					cv_destroy(&rl->r_rd_cv);
+				}
+				kmem_free(rl, sizeof (rl_t));
+			}
+		}
+	}
+	kmem_free(remove, sizeof (rl_t));
+}
+
+/*
+ * Release the range lock rl and free its rl_t.
+ *
+ * Reader locks may be shared, so they are handed to
+ * zfs_range_unlock_reader(), which does the release and the free while
+ * z_range_lock is held. A writer lock is simply pulled out of the tree;
+ * its waiters are woken, and the rl_t freed, after the mutex is dropped.
+ */
+void
+zfs_range_unlock(rl_t *rl)
+{
+	znode_t *zp = rl->r_zp;
+
+	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
+	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
+	ASSERT(!rl->r_proxy);
+
+	mutex_enter(&zp->z_range_lock);
+
+	if (rl->r_type != RL_WRITER) {
+		/* possibly shared reader: released and freed by the helper */
+		zfs_range_unlock_reader(zp, rl);
+		mutex_exit(&zp->z_range_lock);
+		return;
+	}
+
+	/* writer locks can't be shared or split */
+	avl_remove(&zp->z_range_avl, rl);
+	mutex_exit(&zp->z_range_lock);
+
+	if (rl->r_write_wanted) {
+		cv_broadcast(&rl->r_wr_cv);
+		cv_destroy(&rl->r_wr_cv);
+	}
+	if (rl->r_read_wanted) {
+		cv_broadcast(&rl->r_rd_cv);
+		cv_destroy(&rl->r_rd_cv);
+	}
+	kmem_free(rl, sizeof (rl_t));
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ *
+ *	rl  - whole-file writer lock (offset 0, length UINT64_MAX)
+ *	off - new start offset of the lock
+ *	len - new length of the lock
+ *
+ * Waiting writers and readers are woken so that they can recheck against
+ * the smaller range. The lock is still held afterwards, so the cvs are
+ * left intact (no cv_destroy here) and the wanted flags are not cleared.
+ */
+void
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+{
+	znode_t *zp = rl->r_zp;
+
+	/* Ensure there are no other locks */
+	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
+	ASSERT(rl->r_off == 0);
+	ASSERT(rl->r_type == RL_WRITER);
+	ASSERT(!rl->r_proxy);
+	ASSERT3U(rl->r_len, ==, UINT64_MAX);
+	ASSERT3U(rl->r_cnt, ==, 1);
+
+	/* the range itself is only changed under z_range_lock */
+	mutex_enter(&zp->z_range_lock);
+	rl->r_off = off;
+	rl->r_len = len;
+	mutex_exit(&zp->z_range_lock);
+	if (rl->r_write_wanted)
+		cv_broadcast(&rl->r_wr_cv);
+	if (rl->r_read_wanted)
+		cv_broadcast(&rl->r_rd_cv);
+}
+
+/*
+ * AVL ordering callback for range locks: compares the start offsets
+ * (r_off) of the two rl_t arguments and returns 1, -1 or 0 when the
+ * first is respectively greater than, less than or equal to the second.
+ */
+int
+zfs_range_compare(const void *arg1, const void *arg2)
+{
+	const rl_t *lhs = arg1;
+	const rl_t *rhs = arg2;
+
+	/* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+	return ((lhs->r_off > rhs->r_off) - (lhs->r_off < rhs->r_off));
+}