Initial Linux ZFS GIT Repo

This commit is contained in:
Brian Behlendorf
2008-11-20 12:01:55 -08:00
commit 34dc7c2f25
444 changed files with 187636 additions and 0 deletions
+28
View File
@@ -0,0 +1,28 @@
# NOTE: dctl_client.c, dctl_common.c, dctl_server.c, dctl_thrpool.c are unused
# by the kernel port. Potentially they should just be removed if we don't care
# about user space Lustre integration from this source base.
# NOTE: For clarity this directory should simply be renamed libzpl and
# the full kernel implementation should be minimally stubbed out.
subdir-m += include
DISTFILES = dctl_client.c dctl_common.c dctl_server.c dctl_thrpool.c
DISTFILES += dmu_send.c rrwlock.c zfs_acl.c zfs_ctldir.c
DISTFILES += zfs_dir.c zfs_fuid.c zfs_ioctl.c zfs_log.c zfs_replay.c
DISTFILES += zfs_rlock.c zfs_vfsops.c zfs_vnops.c zvol.c
MODULE := zctl
EXTRA_CFLAGS = @KERNELCPPFLAGS@
EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include
EXTRA_CFLAGS += -I@LIBDIR@/libdmu-ctl/include
EXTRA_CFLAGS += -I@LIBDIR@/libavl/include
EXTRA_CFLAGS += -I@LIBDIR@/libport/include
EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include
obj-m := ${MODULE}.o
${MODULE}-objs += zvol.o # Volume emulation interface
${MODULE}-objs += zfs_ioctl.o # /dev/zfs_ioctl interface
${MODULE}-objs += zfs_vfsops.o
${MODULE}-objs += dmu_send.o
+263
View File
@@ -0,0 +1,263 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ftw.h>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/debug.h>
#include <sys/dmu_ctl.h>
#include <sys/dmu_ctl_impl.h>
/*
 * Try to connect to the UNIX-domain stream socket at 'path'.
 *
 * For nftw() convenience, returns 0 if unsuccessful (so a tree walk keeps
 * going), otherwise returns the connected socket descriptor.
 *
 * A path that does not fit in sockaddr_un.sun_path (including its NUL
 * terminator) is rejected instead of being silently truncated, because a
 * truncated path could name a different socket.
 */
static int try_connect(const char *path)
{
	struct sockaddr_un name;
	size_t pathlen = strlen(path);
	int sock;

	if (pathlen >= sizeof(name.sun_path))
		return 0;

	sock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (sock == -1) {
		perror("socket");
		return 0;
	}

	/*
	 * The socket fd cannot be 0 otherwise nftw() will not interpret the
	 * return code correctly.
	 */
	VERIFY(sock != 0);

	/* Zero the whole address so no stack garbage is passed to connect() */
	memset(&name, 0, sizeof(name));
	name.sun_family = AF_UNIX;
	memcpy(name.sun_path, path, pathlen + 1);

	if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) {
		close(sock);
		return 0;
	}

	return sock;
}
/*
 * nftw() visitor.
 *
 * Only entries that are sockets and whose last path component equals
 * SOCKNAME are of interest; for those the result of try_connect() is
 * returned, so a non-zero (connected) descriptor ends the walk. Every
 * other entry returns 0, which lets the walk continue.
 */
static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag,
    struct FTW *ftwbuf)
{
	const char *leaf = fpath + ftwbuf->base;

	if (S_ISSOCK(sb->st_mode) && strcmp(leaf, SOCKNAME) == 0)
		return try_connect(fpath);

	return 0;
}
/*
 * Connect to the control socket below 'dir'.
 *
 * When check_subdirs is true the directory tree rooted at 'dir' is walked
 * with nftw() (nftw_cb does the matching and connecting); otherwise only
 * "<dir>/" SOCKNAME is tried.
 *
 * Returns the connected descriptor, or -1 when no connection could be made
 * (including allocation failure and nftw() errors).
 */
int dctlc_connect(const char *dir, boolean_t check_subdirs)
{
	int fd;

	if (check_subdirs) {
		fd = nftw(dir, nftw_cb, 10, FTW_PHYS);
	} else {
		/* dir + '/' + SOCKNAME + terminating NUL */
		size_t pathlen = strlen(dir) + strlen(SOCKNAME) + 2;
		char *sockpath = malloc(pathlen);

		if (sockpath == NULL)
			return -1;

		(void) snprintf(sockpath, pathlen, "%s/%s", dir, SOCKNAME);
		fd = try_connect(sockpath);
		free(sockpath);
	}

	/* try_connect()/nftw() use 0 to mean "nothing found" */
	if (fd == 0)
		return -1;

	return fd;
}
/*
 * Shut down both directions of the connection on 'fd'.
 *
 * NOTE(review): only shutdown() is issued here; this function does not
 * close() the descriptor — confirm that callers release it.
 */
void dctlc_disconnect(int fd)
{
	const int how = SHUT_RDWR;

	(void) shutdown(fd, how);
}
/*
 * Answer a DCTL_COPYIN request: send the peer 'size' bytes starting at the
 * local address carried in the request.
 *
 * Returns the result of dctl_send_data().
 */
static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd)
{
	const void *src = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
	size_t nbytes = cmd->u.dcmd_copy.size;

	return dctl_send_data(fd, src, nbytes);
}
/*
 * Answer a DCTL_COPYINSTR request.
 *
 * The request carries a local string address and the size of the peer's
 * destination buffer. A DCTL_GEN_REPLY is sent first, holding the number of
 * string bytes that follow (never more than size - 1) and a return code of
 * 0, or ENAMETOOLONG when the string does not fit; the string bytes
 * (without NUL) are sent afterwards when there are any.
 *
 * A zero-sized destination is answered with ENAMETOOLONG and no payload
 * instead of letting 'size - 1' wrap around to an unbounded scan.
 *
 * Returns 0 or the error from the messaging layer.
 */
static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd)
{
	dctl_cmd_t reply;
	char *from;
	size_t buflen, to_copy;
	int error;

	reply.dcmd_msg = DCTL_GEN_REPLY;

	from = (char *)(uintptr_t) cmd->u.dcmd_copy.ptr;
	buflen = cmd->u.dcmd_copy.size;

	if (buflen == 0) {
		to_copy = 0;
		reply.u.dcmd_reply.rc = ENAMETOOLONG;
	} else {
		to_copy = strnlen(from, buflen - 1);
		/* from[to_copy] is NUL only if the whole string fits */
		reply.u.dcmd_reply.rc =
		    from[to_copy] == '\0' ? 0 : ENAMETOOLONG;
	}
	reply.u.dcmd_reply.size = to_copy;

	error = dctl_send_msg(fd, &reply);

	if (!error && to_copy > 0)
		error = dctl_send_data(fd, from, to_copy);

	return error;
}
/*
 * Answer a DCTL_COPYOUT request: receive 'size' bytes from the peer into
 * the local address carried in the request.
 *
 * Returns the result of dctl_read_data().
 */
static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd)
{
	void *dst = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;
	size_t nbytes = cmd->u.dcmd_copy.size;

	return dctl_read_data(fd, dst, nbytes);
}
/*
 * Answer a DCTL_FD_READ request: read up to 'size' bytes from the local
 * descriptor named in the request, send a DCTL_GEN_REPLY with the read()
 * outcome (rc = errno or 0, size = bytes read), then send the bytes read
 * when there are any.
 *
 * On a failed read() the reply size is reported as 0 rather than as -1
 * converted to an unsigned 64-bit value.
 *
 * Returns ENOMEM if the bounce buffer cannot be allocated, otherwise 0 or
 * the error from the messaging layer.
 */
static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd)
{
	dctl_cmd_t reply;
	void *buf;
	int error;
	ssize_t rrc, size = cmd->u.dcmd_fd_io.size;

	buf = malloc(size);
	if (buf == NULL)
		return ENOMEM;

	rrc = read(cmd->u.dcmd_fd_io.fd, buf, size);

	reply.dcmd_msg = DCTL_GEN_REPLY;
	/* errno is sampled before any other call can overwrite it */
	reply.u.dcmd_reply.rc = rrc == -1 ? errno : 0;
	reply.u.dcmd_reply.size = rrc == -1 ? 0 : rrc;

	error = dctl_send_msg(fd, &reply);

	if (!error && rrc > 0)
		error = dctl_send_data(fd, buf, rrc);

	free(buf);
	return error;
}
/*
 * Answer a DCTL_FD_WRITE request: receive 'size' bytes from the peer,
 * write() them to the local descriptor named in the request, and report
 * the write() outcome (rc = errno or 0, size = write() return value) in a
 * DCTL_GEN_REPLY.
 *
 * Returns ENOMEM if the bounce buffer cannot be allocated, otherwise 0 or
 * the error from the messaging layer.
 */
static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd)
{
	ssize_t nbytes = cmd->u.dcmd_fd_io.size;
	void *payload = malloc(nbytes);
	int error;

	if (payload == NULL)
		return ENOMEM;

	error = dctl_read_data(fd, payload, nbytes);
	if (error == 0) {
		dctl_cmd_t reply;
		ssize_t wrc = write(cmd->u.dcmd_fd_io.fd, payload, nbytes);

		reply.dcmd_msg = DCTL_GEN_REPLY;
		reply.u.dcmd_reply.rc = wrc == -1 ? errno : 0;
		reply.u.dcmd_reply.size = wrc;

		error = dctl_send_msg(fd, &reply);
	}

	free(payload);
	return error;
}
int dctlc_ioctl(int fd, int32_t request, void *arg)
{
int error;
dctl_cmd_t cmd;
ASSERT(fd != 0);
cmd.dcmd_msg = DCTL_IOCTL;
cmd.u.dcmd_ioctl.cmd = request;
cmd.u.dcmd_ioctl.arg = (uintptr_t) arg;
error = dctl_send_msg(fd, &cmd);
while (!error && (error = dctl_read_msg(fd, &cmd)) == 0) {
switch (cmd.dcmd_msg) {
case DCTL_IOCTL_REPLY:
error = cmd.u.dcmd_reply.rc;
goto out;
case DCTL_COPYIN:
error = dctl_reply_copyin(fd, &cmd);
break;
case DCTL_COPYINSTR:
error = dctl_reply_copyinstr(fd, &cmd);
break;
case DCTL_COPYOUT:
error = dctl_reply_copyout(fd, &cmd);
break;
case DCTL_FD_READ:
error = dctl_reply_fd_read(fd, &cmd);
break;
case DCTL_FD_WRITE:
error = dctl_reply_fd_write(fd, &cmd);
break;
default:
fprintf(stderr, "%s(): invalid message "
"received.\n", __func__);
error = EINVAL;
goto out;
}
}
out:
errno = error;
return error ? -1 : 0;
}
+109
View File
@@ -0,0 +1,109 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/dmu_ctl.h>
#include <sys/dmu_ctl_impl.h>
/*
 * Receive one dctl_cmd_t from 'fd' into 'cmd'.
 *
 * Only the magic number and the protocol version are read first. This
 * prevents blocking forever in case the size of dctl_cmd_t shrinks in
 * future protocol versions. The rest of the command is read once both
 * fields have been validated.
 *
 * Returns 0 on success, EIO on a bad magic number, ENOTSUP on a protocol
 * version mismatch, or the error from dctl_read_data().
 */
int dctl_read_msg(int fd, dctl_cmd_t *cmd)
{
	caddr_t body = (caddr_t) cmd + DCTL_CMD_HEADER_SIZE;
	size_t body_size = sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE;
	int error;

	error = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE);
	if (error)
		return error;

	if (cmd->dcmd_magic != DCTL_MAGIC) {
		fprintf(stderr, "%s(): invalid magic number\n", __func__);
		return EIO;
	}

	if (cmd->dcmd_version != DCTL_PROTOCOL_VER) {
		fprintf(stderr, "%s(): invalid protocol version\n", __func__);
		return ENOTSUP;
	}

	/* Get the rest of the command */
	return dctl_read_data(fd, body, body_size);
}
/*
 * Stamp 'cmd' with the protocol magic number and version, then send the
 * whole structure on 'fd'.
 *
 * Returns the result of dctl_send_data().
 */
int dctl_send_msg(int fd, dctl_cmd_t *cmd)
{
	const size_t msg_size = sizeof(*cmd);

	cmd->dcmd_version = DCTL_PROTOCOL_VER;
	cmd->dcmd_magic = DCTL_MAGIC;

	return dctl_send_data(fd, cmd, msg_size);
}
/*
 * Receive exactly 'size' bytes from 'fd' into 'ptr', retrying when recv()
 * is interrupted by a signal.
 *
 * Returns 0 once everything has arrived, ECONNRESET if the peer closed the
 * connection first, or the errno reported by recv().
 */
int dctl_read_data(int fd, void *ptr, size_t size)
{
	caddr_t cursor = ptr;
	size_t remaining = size;

	while (remaining != 0) {
		ssize_t got = recv(fd, cursor, remaining, 0);

		/* File descriptor closed */
		if (got == 0)
			return ECONNRESET;

		if (got == -1) {
			if (errno != EINTR)
				return errno;
			continue;
		}

		cursor += got;
		remaining -= got;
	}

	return 0;
}
/*
 * Send exactly 'size' bytes from 'ptr' on 'fd'.
 *
 * send() may accept fewer bytes than requested; in that case the remainder
 * is sent with further calls instead of being reported as a failure.
 * Calls interrupted by a signal are retried. MSG_NOSIGNAL keeps a vanished
 * peer from raising SIGPIPE.
 *
 * Returns 0 when everything was sent, EIO otherwise.
 */
int dctl_send_data(int fd, const void *ptr, size_t size)
{
	const char *cursor = ptr;
	size_t left = size;

	for (;;) {
		ssize_t rc = send(fd, cursor, left, MSG_NOSIGNAL);

		if (rc == -1) {
			if (errno == EINTR)
				continue;
			return EIO;
		}

		cursor += rc;
		left -= (size_t) rc;

		if (left == 0)
			return 0;

		/* No progress on a non-empty request: give up */
		if (rc == 0)
			return EIO;
	}
}
+476
View File
@@ -0,0 +1,476 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <limits.h>
#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/debug.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/list.h>
#include <sys/cred.h>
#include <sys/dmu_ctl.h>
#include <sys/dmu_ctl_impl.h>
/*
 * State of the server's listening control socket.
 *
 * dsi_fd starts at -1, which the socket create/destroy routines in this
 * file use to mean "no socket open". dsi_mtx is held by worker threads
 * while they poll and accept on dsi_fd (see dctl_thread()).
 */
static dctl_sock_info_t ctl_sock = {
.dsi_mtx = PTHREAD_MUTEX_INITIALIZER,
.dsi_fd = -1
};
static int dctl_create_socket_common();
/*
* Routines from zfs_ioctl.c
*/
extern int zfs_ioctl_init();
extern int zfs_ioctl_fini();
extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
int *rvalp);
/*
* We can't simply put the client file descriptor in wthr_info_t because we
* have no way of accessing it from the DMU code without extensive
* modifications.
*
* Therefore each worker thread will have its own global thread-specific
* client_fd variable.
*/
static __thread int client_fd = -1;
/*
 * Copy 'size' bytes from address 'src' in the client process into the
 * local buffer 'dest', by sending a DCTL_COPYIN request on this thread's
 * client connection and reading the bytes back.
 *
 * Returns 0 on success, EFAULT if either step fails.
 */
int dctls_copyin(const void *src, void *dest, size_t size)
{
	dctl_cmd_t req;

	VERIFY(client_fd >= 0);

	req.dcmd_msg = DCTL_COPYIN;
	req.u.dcmd_copy.ptr = (uintptr_t) src;
	req.u.dcmd_copy.size = size;

	if (dctl_send_msg(client_fd, &req) == 0 &&
	    dctl_read_data(client_fd, dest, size) == 0)
		return 0;

	return EFAULT;
}
/*
 * Copy a NUL-terminated string from address 'from' in the client process
 * into the local buffer 'to', which holds 'max' bytes.
 *
 * A DCTL_COPYINSTR request is sent on this thread's client connection; the
 * client answers with a DCTL_GEN_REPLY carrying the number of string bytes
 * that follow and its own return code. The result is always NUL-terminated
 * locally and, when 'len' is not NULL, *len receives the copied length
 * including the terminator.
 *
 * Returns ENAMETOOLONG when max is 0, EFAULT on any protocol failure or
 * when the client claims more bytes than fit, otherwise the client's
 * return code.
 */
int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len)
{
	dctl_cmd_t msg;
	size_t copied;

	VERIFY(client_fd >= 0);

	/* 'max' is unsigned, so zero is the only invalid size to reject */
	if (max == 0)
		return ENAMETOOLONG;

	msg.dcmd_msg = DCTL_COPYINSTR;
	msg.u.dcmd_copy.ptr = (uintptr_t) from;
	msg.u.dcmd_copy.size = max;

	if (dctl_send_msg(client_fd, &msg) != 0)
		return EFAULT;

	if (dctl_read_msg(client_fd, &msg) != 0)
		return EFAULT;

	if (msg.dcmd_msg != DCTL_GEN_REPLY)
		return EFAULT;

	copied = msg.u.dcmd_reply.size;

	/* Leave room for the terminator written below */
	if (copied >= max)
		return EFAULT;

	if (copied > 0)
		if (dctl_read_data(client_fd, to, copied) != 0)
			return EFAULT;

	to[copied] = '\0';

	if (len != NULL)
		*len = copied + 1;

	return msg.u.dcmd_reply.rc;
}
/*
 * Copy 'size' bytes from the local buffer 'src' to address 'dest' in the
 * client process, by sending a DCTL_COPYOUT request on this thread's
 * client connection followed by the bytes themselves.
 *
 * Returns 0 on success, EFAULT if either step fails.
 */
int dctls_copyout(const void *src, void *dest, size_t size)
{
	dctl_cmd_t req;

	VERIFY(client_fd >= 0);

	req.dcmd_msg = DCTL_COPYOUT;
	req.u.dcmd_copy.ptr = (uintptr_t) dest;
	req.u.dcmd_copy.size = size;

	if (dctl_send_msg(client_fd, &req) == 0 &&
	    dctl_send_data(client_fd, src, size) == 0)
		return 0;

	return EFAULT;
}
/*
 * Ask the client to read up to 'len' bytes from its descriptor 'fd' and
 * ship them into the local buffer 'buf'.
 *
 * A DCTL_FD_READ request is sent on this thread's client connection; the
 * client answers with a DCTL_GEN_REPLY (return code and byte count) and
 * then the data. *residp is set to the number of requested bytes that were
 * not delivered.
 *
 * The byte count comes from the peer, so it is checked against 'len'
 * before anything is stored in 'buf'; a count larger than requested is
 * treated as a protocol error (EIO).
 *
 * Returns 0 on success, the client's return code if its read failed, EIO
 * on a protocol error, or the error from the messaging layer.
 */
int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp)
{
	dctl_cmd_t msg;
	uint64_t dsize;
	int error;

	VERIFY(client_fd >= 0);

	msg.dcmd_msg = DCTL_FD_READ;
	msg.u.dcmd_fd_io.fd = fd;
	msg.u.dcmd_fd_io.size = len;

	if ((error = dctl_send_msg(client_fd, &msg)) != 0)
		return error;

	if ((error = dctl_read_msg(client_fd, &msg)) != 0)
		return error;

	if (msg.dcmd_msg != DCTL_GEN_REPLY)
		return EIO;

	if (msg.u.dcmd_reply.rc != 0)
		return msg.u.dcmd_reply.rc;

	dsize = msg.u.dcmd_reply.size;

	/* Never accept more data than the caller's buffer can hold */
	if (len < 0 || dsize > (uint64_t) len)
		return EIO;

	if (dsize > 0)
		error = dctl_read_data(client_fd, buf, dsize);

	*residp = len - dsize;

	return error;
}
/*
 * Ask the client to write 'len' bytes from the local buffer 'src' to its
 * descriptor 'fd'.
 *
 * A DCTL_FD_WRITE request and the data are sent on this thread's client
 * connection, then the client's DCTL_GEN_REPLY is read.
 *
 * Returns 0 only when the client reports success and a byte count equal
 * to 'len'; otherwise the messaging error, the client's return code, or
 * EIO.
 */
int dctls_fd_write(int fd, const void *src, ssize_t len)
{
	dctl_cmd_t msg;
	int error;

	VERIFY(client_fd >= 0);

	msg.dcmd_msg = DCTL_FD_WRITE;
	msg.u.dcmd_fd_io.fd = fd;
	msg.u.dcmd_fd_io.size = len;

	error = dctl_send_msg(client_fd, &msg);
	if (error)
		return error;

	error = dctl_send_data(client_fd, src, len);
	if (error)
		return error;

	error = dctl_read_msg(client_fd, &msg);
	if (error)
		return error;

	if (msg.dcmd_msg != DCTL_GEN_REPLY)
		return EIO;

	if (msg.u.dcmd_reply.rc != 0)
		return msg.u.dcmd_reply.rc;

	/*
	 * A short write is reported as an error here because the original
	 * upstream code does not check if residp == len.
	 */
	return (msg.u.dcmd_reply.size != len) ? EIO : 0;
}
/*
 * Serve one client connection.
 *
 * The descriptor is published in the thread-local client_fd so that the
 * dctls_*() callbacks can reach it. DCTL_IOCTL requests are then read and
 * passed to zfsdev_ioctl(), and each result is returned in a
 * DCTL_IOCTL_REPLY. The loop ends on the first read or send failure or on
 * an unexpected message type; the socket is closed and client_fd reset.
 */
static void dctl_handle_conn(int sock_fd)
{
	dev_t dev = { 0 };
	dctl_cmd_t cmd;

	client_fd = sock_fd;

	for (;;) {
		int rc;

		if (dctl_read_msg(sock_fd, &cmd) != 0)
			break;

		if (cmd.dcmd_msg != DCTL_IOCTL) {
			fprintf(stderr, "%s(): unexpected message type.\n",
			    __func__);
			break;
		}

		rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd,
		    (intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL);

		cmd.dcmd_msg = DCTL_IOCTL_REPLY;
		cmd.u.dcmd_reply.rc = rc;

		if (dctl_send_msg(sock_fd, &cmd) != 0)
			break;
	}

	close(sock_fd);
	client_fd = -1;
}
/*
 * Main worker thread loop.
 *
 * With ctl_sock.dsi_mtx held, the thread polls the listening socket with a
 * one-second timeout until its exit flag is set. When a connection is
 * accepted the mutex is dropped, the thread marks itself busy, serves the
 * connection, marks itself free again and re-takes the mutex. Before
 * returning, the thread hands itself over to dctl_thr_die().
 */
static void *dctl_thread(void *arg)
{
	wthr_info_t *thr = arg;
	struct pollfd pfd;

	pfd.events = POLLIN;

	pthread_mutex_lock(&ctl_sock.dsi_mtx);

	while (!thr->wthr_exit) {
		int nready, conn_fd;
		short rev;

		/* Clean-up dead threads */
		dctl_thr_join();

		/* The file descriptor might change in the thread lifetime */
		pfd.fd = ctl_sock.dsi_fd;

		/* Poll socket with 1-second timeout */
		nready = poll(&pfd, 1, 1000);
		if (nready == 0 || (nready == -1 && errno == EINTR))
			continue;

		/* Recheck the exit flag */
		if (thr->wthr_exit)
			break;

		if (nready == -1) {
			/* Unknown error, let's try to recreate the socket */
			close(ctl_sock.dsi_fd);
			ctl_sock.dsi_fd = -1;

			if (dctl_create_socket_common() != 0)
				break;

			continue;
		}
		ASSERT(nready == 1);

		rev = pfd.revents;
		if (rev == 0)
			continue;
		ASSERT(rev == POLLIN);

		/*
		 * At this point there should be a connection ready to be
		 * accepted.
		 */
		conn_fd = accept(ctl_sock.dsi_fd, NULL, NULL);
		/* Many possible errors here, we'll just retry */
		if (conn_fd == -1)
			continue;

		/*
		 * Now lets handle the request. This can take a very
		 * long time (hours even), so we'll let other threads
		 * handle new connections.
		 */
		pthread_mutex_unlock(&ctl_sock.dsi_mtx);

		dctl_thr_rebalance(thr, B_FALSE);
		dctl_handle_conn(conn_fd);
		dctl_thr_rebalance(thr, B_TRUE);

		pthread_mutex_lock(&ctl_sock.dsi_mtx);
	}

	pthread_mutex_unlock(&ctl_sock.dsi_mtx);

	dctl_thr_die(thr);

	return NULL;
}
/*
 * (Re)create the listening control socket at ctl_sock.dsi_path.
 *
 * Any stale socket file is unlinked first, then a UNIX-domain stream
 * socket is created, bound to the path and put into listening state.
 *
 * Returns 0 on success or the errno of the failing call. On a bind() or
 * listen() failure the descriptor is left in ctl_sock.dsi_fd.
 */
static int dctl_create_socket_common()
{
	dctl_sock_info_t *si = &ctl_sock;
	struct sockaddr_un *addr = &si->dsi_addr;
	const size_t maxlen = sizeof(addr->sun_path) - 1;
	int saved_errno;

	ASSERT(si->dsi_fd == -1);

	/*
	 * Unlink old socket, in case it exists.
	 * We don't care about errors here.
	 */
	unlink(si->dsi_path);

	/* Create the socket */
	si->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0);
	if (si->dsi_fd == -1) {
		saved_errno = errno;
		perror("socket");
		return saved_errno;
	}

	addr->sun_family = AF_UNIX;
	strncpy(addr->sun_path, si->dsi_path, maxlen);
	addr->sun_path[maxlen] = '\0';

	if (bind(si->dsi_fd, (struct sockaddr *) addr, sizeof(*addr)) != 0) {
		saved_errno = errno;
		perror("bind");
		return saved_errno;
	}

	if (listen(si->dsi_fd, LISTEN_BACKLOG) != 0) {
		saved_errno = errno;
		perror("listen");
		/* bind() created the socket file; remove it again */
		unlink(si->dsi_path);
		return saved_errno;
	}

	return 0;
}
/*
 * Build the control socket path "<cfg_dir>/" SOCKNAME, make sure the
 * directory exists, and create the listening socket.
 *
 * Returns 0 on success, ENAMETOOLONG if the path does not fit in
 * sockaddr_un.sun_path, ENOMEM on allocation failure, or the error from
 * dctl_create_socket_common(). On failure ctl_sock is returned to its
 * initial state (dsi_path == NULL, dsi_fd == -1) so a later attempt can
 * start cleanly.
 */
static int dctl_create_socket(const char *cfg_dir)
{
	dctl_sock_info_t *s = &ctl_sock;
	size_t pathsize;
	int error;

	ASSERT(s->dsi_path == NULL);
	ASSERT(s->dsi_fd == -1);

	/* cfg_dir + '/' + SOCKNAME + terminating NUL */
	pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2;
	if (pathsize > sizeof(s->dsi_addr.sun_path))
		return ENAMETOOLONG;

	s->dsi_path = malloc(pathsize);
	if (s->dsi_path == NULL)
		return ENOMEM;

	strcpy(s->dsi_path, cfg_dir);
	strcat(s->dsi_path, "/" SOCKNAME);

	/*
	 * For convenience, create the directory in case it doesn't exist.
	 * We don't care about errors here.
	 */
	mkdir(cfg_dir, 0770);

	error = dctl_create_socket_common();

	if (error) {
		free(s->dsi_path);
		/* Do not leave a dangling pointer behind */
		s->dsi_path = NULL;

		if (s->dsi_fd != -1) {
			close(s->dsi_fd);
			s->dsi_fd = -1;
		}
	}

	return error;
}
/*
 * Close the listening control socket, remove its file and release the
 * path string.
 *
 * ctl_sock is returned to its initial state (dsi_fd == -1,
 * dsi_path == NULL), which is what dctl_create_socket() asserts on entry.
 */
static void dctl_destroy_socket()
{
	dctl_sock_info_t *s = &ctl_sock;

	ASSERT(s->dsi_path != NULL);
	ASSERT(s->dsi_fd != -1);

	close(s->dsi_fd);
	s->dsi_fd = -1;

	unlink(s->dsi_path);
	free(s->dsi_path);
	/* Do not leave a dangling pointer behind */
	s->dsi_path = NULL;
}
/*
* Initialize the DMU userspace control interface.
* This should be called after kernel_init().
*
* Note that only very rarely we have more than a couple of simultaneous
* lzfs/lzpool connections. Since the thread pool grows automatically when all
* threads are busy, a good value for min_thr and max_free_thr is 2.
*/
int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr)
{
int error;
ASSERT(min_thr > 0);
ASSERT(max_free_thr >= min_thr);
error = zfs_ioctl_init();
if (error)
return error;
error = dctl_create_socket(cfg_dir);
if (error) {
(void) zfs_ioctl_fini();
return error;
}
error = dctl_thr_pool_create(min_thr, max_free_thr, dctl_thread);
if (error) {
(void) zfs_ioctl_fini();
dctl_destroy_socket();
return error;
}
return 0;
}
/*
 * Terminate control interface.
 * This should be called after closing all objsets, but before calling
 * kernel_fini().
 * May return EBUSY if the SPA is busy.
 *
 * The thread pool is stopped first, then the control socket is destroyed,
 * and finally the ioctl layer is shut down; its result is returned.
 *
 * Thread pool destruction can take a while due to poll()
 * timeout or due to a thread being busy (e.g. a backup is being taken).
 */
int dctl_server_fini()
{
	int rc;

	dctl_thr_pool_stop();
	dctl_destroy_socket();

	rc = zfs_ioctl_fini();
	return rc;
}
+253
View File
@@ -0,0 +1,253 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <stdlib.h>
#include <stddef.h>
#include <time.h>
#include <pthread.h>
#include <errno.h>
#include <sys/list.h>
#include <sys/debug.h>
#include <sys/dmu_ctl.h>
#include <sys/dmu_ctl_impl.h>
/*
 * Global worker thread pool state. Only the mutex is initialized here;
 * the remaining fields are set up by dctl_thr_pool_create().
 */
static dctl_thr_info_t thr_pool = {
.dti_mtx = PTHREAD_MUTEX_INITIALIZER
};
/*
 * Create n worker threads, each starting out marked as free, and append
 * them to the pool's thread list.
 * Callers must acquire thr_pool.dti_mtx first.
 *
 * Stops at the first failure and returns ENOMEM or the pthread_create()
 * error; threads created before the failure stay in the pool. Returns 0
 * when all n threads were created.
 */
static int dctl_thr_create(int n)
{
	dctl_thr_info_t *pool = &thr_pool;
	int created = 0;

	while (created < n) {
		wthr_info_t *worker = malloc(sizeof(wthr_info_t));
		int rc;

		if (worker == NULL)
			return ENOMEM;

		worker->wthr_exit = B_FALSE;
		worker->wthr_free = B_TRUE;

		rc = pthread_create(&worker->wthr_id, NULL,
		    pool->dti_thr_func, worker);
		if (rc != 0) {
			free(worker);
			return rc;
		}

		pool->dti_free++;
		list_insert_tail(&pool->dti_list, worker);

		created++;
	}

	return 0;
}
/*
 * Mark the thread as dead.
 * Must be called right before exiting the main thread function.
 *
 * The exit flag is set, the thread is accounted as not free, and its
 * record is moved from the pool list to the join list so that another
 * thread can pthread_join() it later (see dctl_thr_join()).
 */
void dctl_thr_die(wthr_info_t *thr)
{
	dctl_thr_info_t *pool = &thr_pool;

	thr->wthr_exit = B_TRUE;
	dctl_thr_rebalance(thr, B_FALSE);

	pthread_mutex_lock(&pool->dti_mtx);

	list_remove(&pool->dti_list, thr);
	list_insert_tail(&pool->dti_join_list, thr);

	pthread_mutex_unlock(&pool->dti_mtx);
}
/*
 * Clean-up dead threads: join and free every thread record on the join
 * list. The pool mutex is held for the duration.
 */
void dctl_thr_join()
{
	dctl_thr_info_t *pool = &thr_pool;
	wthr_info_t *dead;

	pthread_mutex_lock(&pool->dti_mtx);

	for (dead = list_head(&pool->dti_join_list); dead;
	    dead = list_head(&pool->dti_join_list)) {
		list_remove(&pool->dti_join_list, dead);

		ASSERT(!pthread_equal(dead->wthr_id, pthread_self()));

		/*
		 * This should not block because all the threads
		 * on this list should have died already.
		 *
		 * pthread_join() can only return an error if
		 * we made a programming mistake.
		 */
		VERIFY(pthread_join(dead->wthr_id, NULL) == 0);

		ASSERT(dead->wthr_exit);
		ASSERT(!dead->wthr_free);

		free(dead);
	}

	pthread_mutex_unlock(&pool->dti_mtx);
}
/*
 * Set the thread's free status to set_free and keep the pool's count of
 * free threads (dti_free) in step: the count only changes when the status
 * actually changes.
 *
 * Callers must acquire thr_pool.dti_mtx first.
 */
static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free)
{
	dctl_thr_info_t *pool = &thr_pool;

	ASSERT(pool->dti_free >= 0);

	if (set_free) {
		/* busy -> free */
		if (!thr->wthr_free)
			pool->dti_free++;
	} else if (thr->wthr_free) {
		/* free -> busy */
		pool->dti_free--;
	}

	ASSERT(pool->dti_free >= 0);

	thr->wthr_free = set_free;
}
/*
 * Rebalance threads. Also adjusts the free status of the thread.
 *
 * The thread's exit flag is set when the pool is shutting down or when
 * the number of free threads is above the limit; a thread flagged for
 * exit is always accounted as not free. If the pool is not shutting down
 * and no free thread is left afterwards, one more thread is created (its
 * result is not checked).
 */
void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free)
{
	dctl_thr_info_t *pool = &thr_pool;
	boolean_t want_free;

	pthread_mutex_lock(&pool->dti_mtx);

	if (pool->dti_exit || pool->dti_free > pool->dti_max_free)
		thr->wthr_exit = B_TRUE;

	want_free = thr->wthr_exit ? B_FALSE : set_free;
	dctl_thr_adjust_free(thr, want_free);

	if (!pool->dti_exit && pool->dti_free == 0)
		(void) dctl_thr_create(1);

	pthread_mutex_unlock(&pool->dti_mtx);
}
/*
 * Stop the thread pool.
 *
 * The global and per-thread exit flags are set under the pool mutex. The
 * pool list is then checked every 50ms, joining dead threads on each
 * pass, until a pass finds the list empty. Finally both lists are
 * destroyed.
 *
 * This can take a while since it actually waits for all threads to exit.
 */
void dctl_thr_pool_stop()
{
	dctl_thr_info_t *pool = &thr_pool;
	struct timespec pause;
	wthr_info_t *thr;

	pthread_mutex_lock(&pool->dti_mtx);

	ASSERT(!pool->dti_exit);
	pool->dti_exit = B_TRUE;

	/* Let's flag the threads first */
	for (thr = list_head(&pool->dti_list); thr != NULL;
	    thr = list_next(&pool->dti_list, thr)) {
		thr->wthr_exit = B_TRUE;
		dctl_thr_adjust_free(thr, B_FALSE);
	}

	pthread_mutex_unlock(&pool->dti_mtx);

	/* Now let's wait for them to exit */
	pause.tv_sec = 0;
	pause.tv_nsec = 50000000; /* 50ms */

	for (;;) {
		nanosleep(&pause, NULL);

		pthread_mutex_lock(&pool->dti_mtx);
		thr = list_head(&pool->dti_list);
		pthread_mutex_unlock(&pool->dti_mtx);

		dctl_thr_join();

		if (thr == NULL)
			break;
	}

	ASSERT(pool->dti_free == 0);

	ASSERT(list_is_empty(&pool->dti_list));
	ASSERT(list_is_empty(&pool->dti_join_list));

	list_destroy(&pool->dti_list);
	list_destroy(&pool->dti_join_list);
}
/*
 * Create thread pool.
 *
 * Records the pool parameters and the worker entry point, creates the two
 * thread lists, and starts min_thr worker threads.
 *
 * If at least one thread creation fails, it will stop all previous
 * threads and return a non-zero value.
 */
int dctl_thr_pool_create(int min_thr, int max_free_thr,
    thr_func_t *thr_func)
{
	dctl_thr_info_t *pool = &thr_pool;
	const size_t node_off = offsetof(wthr_info_t, wthr_node);
	int rc;

	ASSERT(pool->dti_free == 0);

	/* Initialize global variables */
	pool->dti_min = min_thr;
	pool->dti_max_free = max_free_thr;
	pool->dti_exit = B_FALSE;
	pool->dti_thr_func = thr_func;

	list_create(&pool->dti_list, sizeof(wthr_info_t), node_off);
	list_create(&pool->dti_join_list, sizeof(wthr_info_t), node_off);

	pthread_mutex_lock(&pool->dti_mtx);
	rc = dctl_thr_create(min_thr);
	pthread_mutex_unlock(&pool->dti_mtx);

	if (rc != 0)
		dctl_thr_pool_stop();

	return rc;
}
File diff suppressed because it is too large Load Diff
+1
View File
@@ -0,0 +1 @@
subdir-m += sys
@@ -0,0 +1 @@
DISTFILES = dmu_ctl.h dmu_ctl_impl.h
+71
View File
@@ -0,0 +1,71 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
 * Public interface of the DMU userspace control channel: server-side
 * setup/teardown, the server-side copy and fd helpers, and the client-side
 * connect/ioctl calls.
 */
#ifndef _SYS_DMU_CTL_H
#define _SYS_DMU_CTL_H
#include <sys/types.h>
/* Default directory where the clients search for sockets to connect */
#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu"
/*
 * These functions are called by the server process.
 *
 * kernel_init() must be called before dctl_server_init().
 * kernel_fini() must not be called before dctl_server_fini().
 *
 * All objsets must be closed and object references be released before calling
 * dctl_server_fini(), otherwise it will return EBUSY.
 *
 * Note: On Solaris, it is highly recommended to either catch or ignore the
 * SIGPIPE signal, otherwise the server process will die if the client is
 * killed.
 */
/* Returns 0 on success or an errno value. */
int dctl_server_init(const char *cfg_dir, int min_threads,
int max_free_threads);
int dctl_server_fini();
/*
 * The following functions are called by the DMU from the server process context
 * (in the worker threads).
 *
 * Each one forwards the operation to the client connected to the calling
 * worker thread and returns 0 or an errno value.
 */
/* Copy 'size' bytes from client address 'src' into local 'dest'. */
int dctls_copyin(const void *src, void *dest, size_t size);
/*
 * Copy a string from client address 'from' into local 'to' (at most 'max'
 * bytes including the NUL); *len, if not NULL, gets the copied length
 * including the NUL.
 */
int dctls_copyinstr(const char *from, char *to, size_t max,
size_t *len);
/* Copy 'size' bytes from local 'src' to client address 'dest'. */
int dctls_copyout(const void *src, void *dest, size_t size);
/*
 * Read up to 'len' bytes from the client's descriptor 'fd' into 'buf';
 * *residp gets the number of requested bytes not delivered.
 */
int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp);
/* Write 'len' bytes from 'src' to the client's descriptor 'fd'. */
int dctls_fd_write(int fd, const void *src, ssize_t len);
/*
 * These functions are called by the client process (libzfs).
 */
/* Returns a connected descriptor, or -1 on failure. */
int dctlc_connect(const char *dir, boolean_t check_subdirs);
void dctlc_disconnect(int fd);
/* Returns 0 on success, or -1 with errno set. */
int dctlc_ioctl(int fd, int32_t request, void *arg);
#endif
@@ -0,0 +1,144 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DMU_CTL_IMPL_H
#define _SYS_DMU_CTL_IMPL_H
#include <sys/list.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <pthread.h>
/* File name of the control socket inside the configuration directory */
#define SOCKNAME "dmu_socket"
/* Written to dcmd_version by dctl_send_msg(), checked by dctl_read_msg() */
#define DCTL_PROTOCOL_VER 1
/* Written to dcmd_magic by dctl_send_msg(), checked by dctl_read_msg() */
#define DCTL_MAGIC 0xdc71b1070c01dc71ll
/* Message types */
enum {
/* client -> server: run an ioctl */
DCTL_IOCTL,
/* server -> client: final ioctl return code */
DCTL_IOCTL_REPLY,
/* server -> client requests, serviced by the client during an ioctl */
DCTL_COPYIN,
DCTL_COPYINSTR,
DCTL_COPYOUT,
DCTL_FD_READ,
DCTL_FD_WRITE,
DCTL_GEN_REPLY /* generic reply */
};
/*
 * On-the-wire message.
 *
 * Every message starts with the magic number and protocol version,
 * followed by the message type and a payload whose layout depends on that
 * type.
 */
typedef struct dctl_cmd {
uint64_t dcmd_magic;
int8_t dcmd_version;
int8_t dcmd_msg;
uint8_t dcmd_pad[6];
union {
/* DCTL_IOCTL: ioctl number and argument pointer */
struct dcmd_ioctl {
uint64_t arg;
int32_t cmd;
uint8_t pad[4];
} dcmd_ioctl;
/* DCTL_COPYIN, DCTL_COPYINSTR, DCTL_COPYOUT: client address and size */
struct dcmd_copy_req {
uint64_t ptr;
uint64_t size;
} dcmd_copy;
/* DCTL_FD_READ, DCTL_FD_WRITE: client descriptor and byte count */
struct dcmd_fd_req {
int64_t size;
int32_t fd;
uint8_t pad[4];
} dcmd_fd_io;
/* DCTL_IOCTL_REPLY, DCTL_GEN_REPLY */
struct dcmd_reply {
uint64_t size; /* used by reply to DCTL_COPYINSTR,
DCTL_FD_READ and DCTL_FD_WRITE */
int32_t rc; /* return code */
uint8_t pad[4];
} dcmd_reply;
} u;
} dctl_cmd_t;
/*
 * Number of leading bytes (dcmd_magic plus dcmd_version) that
 * dctl_read_msg() reads and validates before reading the rest.
 */
#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t))
/*
 * The following definitions are only used by the server code.
 */
/* Backlog passed to listen() on the control socket */
#define LISTEN_BACKLOG 5
/* Worker thread data */
typedef struct wthr_info {
list_node_t wthr_node;
pthread_t wthr_id;
boolean_t wthr_exit; /* termination flag */
boolean_t wthr_free;
} wthr_info_t;
/* Control socket data */
typedef struct dctl_sock_info {
pthread_mutex_t dsi_mtx;
char *dsi_path;
struct sockaddr_un dsi_addr;
int dsi_fd;
} dctl_sock_info_t;
/* Signature of a worker thread entry point (as taken by pthread_create) */
typedef void *thr_func_t(void *);
/* Thread pool data */
typedef struct dctl_thr_info {
thr_func_t *dti_thr_func;
pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */
list_t dti_list; /* list of threads in the thread pool */
list_t dti_join_list; /* list of threads that are waiting to be
joined */
int dti_free; /* number of free worker threads */
int dti_min;
int dti_max_free;
boolean_t dti_exit; /* global termination flag */
} dctl_thr_info_t;
/*
 * Messaging functions.
 *
 * All four return 0 on success or an errno value. The *_msg variants
 * transfer one dctl_cmd_t; the *_data variants transfer exactly 'size'
 * raw bytes.
 */
int dctl_read_msg(int fd, dctl_cmd_t *cmd);
int dctl_send_msg(int fd, dctl_cmd_t *cmd);
int dctl_read_data(int fd, void *ptr, size_t size);
int dctl_send_data(int fd, const void *ptr, size_t size);
/* Thread pool functions */
int dctl_thr_pool_create(int min_thr, int max_free_thr,
thr_func_t *thr_func);
void dctl_thr_pool_stop();
void dctl_thr_join();
void dctl_thr_die(wthr_info_t *thr);
void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free);
#endif
+249
View File
@@ -0,0 +1,249 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)rrwlock.c 1.1 07/10/24 SMI"
#include <sys/refcount.h>
#include <sys/rrwlock.h>
/*
* This file contains the implementation of a re-entrant read
* reader/writer lock (aka "rrwlock").
*
* This is a normal reader/writer lock with the additional feature
* of allowing threads who have already obtained a read lock to
* re-enter another read lock (re-entrant read) - even if there are
* waiting writers.
*
* Callers who have not obtained a read lock give waiting writers priority.
*
* The rrwlock_t lock does not allow re-entrant writers, nor does it
* allow a re-entrant mix of reads and writes (that is, it does not
* allow a caller who has already obtained a read lock to be able to
* then grab a write lock without first dropping all read locks, and
* vice versa).
*
* The rrwlock_t uses tsd (thread specific data) to keep a list of
* nodes (rrw_node_t), where each node keeps track of which specific
* lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
* should be rare, a thread that grabs multiple reads on the same rrwlock_t
* will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
* tsd list can represent a different rrwlock_t. This allows a thread
* to enter multiple and unique rrwlock_ts for read locks at the same time.
*
* Since using tsd exposes some overhead, the rrwlock_t only needs to
* keep tsd data when writers are waiting. If no writers are waiting, then
* a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
* is needed. Once a writer attempts to grab the lock, readers then
* keep tsd data and bump the linked readers count (rr_linked_rcount).
*
* If there are waiting writers and there are anonymous readers, then a
* reader doesn't know if it is a re-entrant lock. But since it may be one,
* we allow the read to proceed (otherwise it could deadlock). Since once
* waiting writers are active, readers no longer bump the anonymous count,
* the anonymous readers will eventually flush themselves out. At this point,
* readers will be able to tell if they are a re-entrant lock (have a
* rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
* we must let them proceed. If they are not, then the reader blocks for the
* waiting writers. Hence, we do not starve writers.
*/
/* global key for TSD */
uint_t rrw_tsd_key;
/*
 * One entry in a thread's TSD list of held read locks: the lock it refers
 * to and the link to the next entry.
 */
typedef struct rrw_node {
struct rrw_node *rn_next;
rrwlock_t *rn_rrl;
} rrw_node_t;
/*
 * Look on the calling thread's tsd list for a node that refers to 'rrl'.
 *
 * Returns the first matching node, or NULL if the lock has no linked
 * readers at all or this thread has no node for it.
 */
static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *node = NULL;

	/* The tsd list is only consulted when linked readers exist. */
	if (refcount_count(&rrl->rr_linked_rcount) != 0) {
		node = tsd_get(rrw_tsd_key);
		while (node != NULL && node->rn_rrl != rrl)
			node = node->rn_next;
	}
	return (node);
}
/*
 * Allocate a node for 'rrl' and push it onto the front of the calling
 * thread's tsd list.
 */
static void
rrn_add(rrwlock_t *rrl)
{
	rrw_node_t *node = kmem_alloc(sizeof (rrw_node_t), KM_SLEEP);

	/* The current list head becomes the new node's successor. */
	node->rn_next = tsd_get(rrw_tsd_key);
	node->rn_rrl = rrl;
	VERIFY(tsd_set(rrw_tsd_key, node) == 0);
}
/*
 * Remove one node that refers to 'rrl' from the calling thread's tsd
 * list.
 *
 * Returns B_TRUE if a node was found (it is unlinked from the list and
 * freed).  Returns B_FALSE if the lock has no linked readers at all, or
 * if this thread has no node for it.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	/*
	 * With no linked readers there is nothing to search for.  This
	 * function returns boolean_t, so report B_FALSE rather than the
	 * pointer constant NULL.
	 */
	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			/* Unlink: either patch the predecessor or the head. */
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}
/*
 * Initialize 'rrl': no writer owns or wants the lock, and both reader
 * counts are freshly created.
 */
void
rrw_init(rrwlock_t *rrl)
{
	rrl->rr_writer = NULL;
	rrl->rr_writer_wanted = B_FALSE;
	refcount_create(&rrl->rr_linked_rcount);
	refcount_create(&rrl->rr_anon_rcount);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
}
/*
 * Tear down 'rrl'.  The lock must not be write-held (asserted).
 */
void
rrw_destroy(rrwlock_t *rrl)
{
	cv_destroy(&rrl->rr_cv);
	mutex_destroy(&rrl->rr_lock);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_linked_rcount);
	refcount_destroy(&rrl->rr_anon_rcount);
}
/*
* Take a read hold on 'rrl' on behalf of 'tag'.
*
* The caller waits while a writer owns the lock.  It also waits when a
* writer is wanted, the anonymous reader count is zero, and rrn_find()
* reports no node for 'rrl' on this thread's tsd list.
*
* Once past the wait, the hold is recorded either as a linked reader
* (a tsd node plus rr_linked_rcount) when a writer is wanted, or as an
* anonymous reader (rr_anon_rcount only) otherwise.
*/
static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
ASSERT(rrl->rr_writer != curthread);
ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
/*
* While anonymous readers remain we cannot tell whether this is a
* re-entrant enter, so the read is allowed to proceed in that case.
*/
while (rrl->rr_writer || (rrl->rr_writer_wanted &&
refcount_is_zero(&rrl->rr_anon_rcount) &&
rrn_find(rrl) == NULL))
cv_wait(&rrl->rr_cv, &rrl->rr_lock);
if (rrl->rr_writer_wanted) {
/* may or may not be a re-entrant enter */
rrn_add(rrl);
(void) refcount_add(&rrl->rr_linked_rcount, tag);
} else {
/* no writer is waiting, so no tsd bookkeeping is needed */
(void) refcount_add(&rrl->rr_anon_rcount, tag);
}
ASSERT(rrl->rr_writer == NULL);
mutex_exit(&rrl->rr_lock);
}
/*
 * Take the write hold on 'rrl' for the calling thread.
 *
 * Waits until there are no anonymous readers, no linked readers and no
 * other writer.  rr_writer_wanted is raised before each wait and
 * cleared once the lock has been obtained.
 */
static void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	for (;;) {
		if (refcount_count(&rrl->rr_anon_rcount) <= 0 &&
		    refcount_count(&rrl->rr_linked_rcount) <= 0 &&
		    rrl->rr_writer == NULL)
			break;
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}

	rrl->rr_writer = curthread;
	rrl->rr_writer_wanted = B_FALSE;
	mutex_exit(&rrl->rr_lock);
}
/*
 * Enter 'rrl' as a reader when 'rw' is RW_READER, otherwise as a
 * writer.  'tag' is only used for read holds.
 */
void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw != RW_READER) {
		rrw_enter_write(rrl);
		return;
	}
	rrw_enter_read(rrl, tag);
}
/*
* Drop a hold on 'rrl'.
*
* When no writer owns the lock the caller is treated as a reader: if a
* tsd node for 'rrl' can be removed from this thread's list, the linked
* reader count is decremented, otherwise the anonymous reader count is.
* When a writer owns the lock it must be the calling thread (asserted)
* and the write hold is released.
*
* Waiters on rr_cv are woken when a reader count reaches zero or when
* the writer leaves.
*/
void
rrw_exit(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
/* somebody must hold the lock in some mode */
ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
!refcount_is_zero(&rrl->rr_linked_rcount) ||
rrl->rr_writer != NULL);
if (rrl->rr_writer == NULL) {
if (rrn_find_and_remove(rrl)) {
/* this thread had a tsd node: it was a linked reader */
if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
cv_broadcast(&rrl->rr_cv);
} else {
/* no tsd node: the hold was counted anonymously */
if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
cv_broadcast(&rrl->rr_cv);
}
} else {
ASSERT(rrl->rr_writer == curthread);
ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
refcount_is_zero(&rrl->rr_linked_rcount));
rrl->rr_writer = NULL;
cv_broadcast(&rrl->rr_cv);
}
mutex_exit(&rrl->rr_lock);
}
/*
 * Report whether 'rrl' is held in mode 'rw'.
 *
 * For RW_WRITER this is true only if the calling thread is the writer.
 * For any other mode it is true if any reader (anonymous or linked)
 * holds the lock; the check is not specific to the calling thread.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw != RW_WRITER) {
		held = !(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
	} else {
		held = (rrl->rr_writer == curthread);
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+968
View File
@@ -0,0 +1,968 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)zfs_dir.c 1.25 08/04/27 SMI"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mode.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>
#include <sys/random.h>
#include <sys/policy.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/fs/zfs.h>
#include "fs/fs_subr.h"
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/dnlc.h>
#include <sys/extdirent.h>
/*
 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
 * of names after deciding which is the appropriate lookup interface.
 *
 *	zfsvfs	- file system; z_norm selects the normalizing lookup
 *	dzp	- directory whose zap object is searched
 *	name	- entry name to look up
 *	exact	- request MT_EXACT instead of MT_FIRST (normalizing case only)
 *	update	- on ENOENT, record a negative entry in the DNLC
 *	deflags	- if non-NULL, set to ED_CASE_CONFLICT or 0 on a successful
 *		  normalizing lookup
 *	rpnp	- if non-NULL, its buffer is handed to zap_lookup_norm()
 *	zoid	- on success, receives the object number of the entry
 *
 * Returns 0 on success or the error from the zap lookup.  '*zoid' is only
 * decoded when the lookup succeeded; on failure it is left as the lookup
 * left it, so the caller's (possibly uninitialized) storage is never read
 * here.
 */
static int
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
{
	int error;

	if (zfsvfs->z_norm) {
		matchtype_t mt = MT_FIRST;
		boolean_t conflict = B_FALSE;
		size_t bufsz = 0;
		char *buf = NULL;

		if (rpnp) {
			buf = rpnp->pn_buf;
			bufsz = rpnp->pn_bufsize;
		}
		if (exact)
			mt = MT_EXACT;
		/*
		 * In the non-mixed case we only expect there would ever
		 * be one match, but we need to use the normalizing lookup.
		 */
		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
		    zoid, mt, buf, bufsz, &conflict);
		if (!error && deflags)
			*deflags = conflict ? ED_CASE_CONFLICT : 0;
	} else {
		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
	}

	if (error == 0) {
		/* Reduce the stored dirent value to the object number. */
		*zoid = ZFS_DIRENT_OBJ(*zoid);
	} else if (error == ENOENT && update) {
		/* Remember that this name does not exist. */
		dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
	}

	return (error);
}
/*
* Lock a directory entry. A dirlock on <dzp, name> protects that name
* in dzp's directory zap object. As long as you hold a dirlock, you can
* assume two things: (1) dzp cannot be reaped, and (2) no other thread
* can change the zap entry for (i.e. link or unlink) this name.
*
* Input arguments:
* dzp - znode for directory
* name - name of entry to lock
* flag - ZNEW: if the entry already exists, fail with EEXIST.
* ZEXISTS: if the entry does not exist, fail with ENOENT.
* ZSHARED: allow concurrent access with other ZSHARED callers.
* ZXATTR: we want dzp's xattr directory
* ZCILOOK: On a mixed sensitivity file system,
* this lookup should be case-insensitive.
* ZCIEXACT: On a purely case-insensitive file system,
* this lookup should be case-sensitive.
* ZRENAMING: we are locking for renaming, force narrow locks
*
* Output arguments:
* zpp - pointer to the znode for the entry (NULL if there isn't one)
* dlpp - pointer to the dirlock for this entry (NULL on error)
* direntflags - (case-insensitive lookup only)
* flags if multiple case-sensitive matches exist in directory
* realpnp - (case-insensitive lookup only)
* actual name matched within the directory
*
* Return value: 0 on success or errno on failure.
*
* NOTE: Always checks for, and rejects, '.' and '..'.
* NOTE: For case-insensitive file systems we take wide locks (see below),
* but return znode pointers to a single match.
* NOTE: A missing entry is only an error when ZEXISTS is set; otherwise
* 0 is returned with the dirlock held and *zpp left NULL.
* NOTE: On success dzp->z_name_lock is still held as reader; it is
* dropped by zfs_dirent_unlock().
* NOTE: A new dirlock points at the caller's 'name' buffer (it is only
* copied when a second ZSHARED holder arrives), so that buffer must
* stay valid while the lock is held.
*/
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
int flag, int *direntflags, pathname_t *realpnp)
{
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zfs_dirlock_t *dl;
boolean_t update;
boolean_t exact;
uint64_t zoid;
vnode_t *vp = NULL;
int error = 0;
int cmpflags;
*zpp = NULL;
*dlpp = NULL;
/*
* Verify that we are not trying to lock '.', '..', or '.zfs'
*/
if (name[0] == '.' &&
(name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
return (EEXIST);
/*
* Case sensitivity and normalization preferences are set when
* the file system is created. These are stored in the
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
* affect what vnodes can be cached in the DNLC, how we
* perform zap lookups, and the "width" of our dirlocks.
*
* A normal dirlock locks a single name. Note that with
* normalization a name can be composed multiple ways, but
* when normalized, these names all compare equal. A wide
* dirlock locks multiple names. We need these when the file
* system is supporting mixed-mode access. It is sometimes
* necessary to lock all case permutations of file name at
* once so that simultaneous case-insensitive/case-sensitive
* behaves as rationally as possible.
*/
/*
* Decide if exact matches should be requested when performing
* a zap lookup on file systems supporting case-insensitive
* access.
*/
exact =
((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
/*
* Only look in or update the DNLC if we are looking for the
* name on a file system that does not require normalization
* or case folding. We can also look there if we happen to be
* on a non-normalizing, mixed sensitivity file system IF we
* are looking for the exact name.
*
* Maybe can add TO-UPPERed version of name to dnlc in ci-only
* case for performance improvement?
*/
update = !zfsvfs->z_norm ||
((zfsvfs->z_case == ZFS_CASE_MIXED) &&
!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
/*
* ZRENAMING indicates we are in a situation where we should
* take narrow locks regardless of the file system's
* preferences for normalizing and case folding. This will
* prevent us deadlocking trying to grab the same wide lock
* twice if the two names happen to be case-insensitive
* matches.
*/
if (flag & ZRENAMING)
cmpflags = 0;
else
cmpflags = zfsvfs->z_norm;
/*
* Wait until there are no locks on this name.
*/
rw_enter(&dzp->z_name_lock, RW_READER);
mutex_enter(&dzp->z_lock);
for (;;) {
/* An unlinked directory can no longer have entries locked. */
if (dzp->z_unlinked) {
mutex_exit(&dzp->z_lock);
rw_exit(&dzp->z_name_lock);
return (ENOENT);
}
/*
* Search the existing dirlocks for one whose name compares
* equal under cmpflags; a comparison error also ends the scan.
*/
for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
U8_UNICODE_LATEST, &error) == 0) || error != 0)
break;
}
/* A name that cannot be compared is reported as ENOENT. */
if (error != 0) {
mutex_exit(&dzp->z_lock);
rw_exit(&dzp->z_name_lock);
return (ENOENT);
}
if (dl == NULL) {
/*
* Allocate a new dirlock and add it to the list.
*/
dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
dl->dl_name = name;
dl->dl_sharecnt = 0;
dl->dl_namesize = 0;
dl->dl_dzp = dzp;
dl->dl_next = dzp->z_dirlocks;
dzp->z_dirlocks = dl;
break;
}
/* An existing shared lock can be joined by a ZSHARED caller. */
if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
break;
/* Otherwise wait for the holder to unlock, then rescan. */
cv_wait(&dl->dl_cv, &dzp->z_lock);
}
if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
/*
* We're the second shared reference to dl. Make a copy of
* dl_name in case the first thread goes away before we do.
* Note that we initialize the new name before storing its
* pointer into dl_name, because the first thread may load
* dl->dl_name at any time. He'll either see the old value,
* which is his, or the new shared copy; either is OK.
*/
dl->dl_namesize = strlen(dl->dl_name) + 1;
name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
bcopy(dl->dl_name, name, dl->dl_namesize);
dl->dl_name = name;
}
mutex_exit(&dzp->z_lock);
/*
* We have a dirlock on the name. (Note that it is the dirlock,
* not the dzp's z_lock, that protects the name in the zap object.)
* See if there's an object by this name; if so, put a hold on it.
*/
if (flag & ZXATTR) {
/* The xattr directory is recorded in the znode, not the zap. */
zoid = dzp->z_phys->zp_xattr;
error = (zoid == 0 ? ENOENT : 0);
} else {
if (update)
vp = dnlc_lookup(ZTOV(dzp), name);
if (vp == DNLC_NO_VNODE) {
/*
* Negative DNLC entry (zfs_match_find() installs these
* on ENOENT): the name is known not to exist.
*/
VN_RELE(vp);
error = ENOENT;
} else if (vp) {
/*
* Positive DNLC hit: the zap is not consulted at all and
* the vnode from the DNLC is handed back through *zpp.
*/
if (flag & ZNEW) {
zfs_dirent_unlock(dl);
VN_RELE(vp);
return (EEXIST);
}
*dlpp = dl;
*zpp = VTOZ(vp);
return (0);
} else {
error = zfs_match_find(zfsvfs, dzp, name, exact,
update, direntflags, realpnp, &zoid);
}
}
if (error) {
/*
* ENOENT without ZEXISTS is not a failure: fall through and
* return the dirlock with *zpp still NULL.
*/
if (error != ENOENT || (flag & ZEXISTS)) {
zfs_dirent_unlock(dl);
return (error);
}
} else {
if (flag & ZNEW) {
zfs_dirent_unlock(dl);
return (EEXIST);
}
error = zfs_zget(zfsvfs, zoid, zpp);
if (error) {
zfs_dirent_unlock(dl);
return (error);
}
if (!(flag & ZXATTR) && update)
dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
}
*dlpp = dl;
return (0);
}
/*
 * Release a dirlock obtained from zfs_dirent_lock().
 *
 * The directory's z_name_lock is always dropped.  If other shared
 * holders remain, only the share count is decremented.  Otherwise the
 * dirlock is unlinked from the directory's list, its waiters are woken,
 * and the dirlock (plus any private copy of the name) is freed.
 */
void
zfs_dirent_unlock(zfs_dirlock_t *dl)
{
	znode_t *dir = dl->dl_dzp;
	zfs_dirlock_t **linkp;

	mutex_enter(&dir->z_lock);
	rw_exit(&dir->z_name_lock);

	if (dl->dl_sharecnt > 1) {
		/* Not the last shared holder; the dirlock stays in place. */
		dl->dl_sharecnt--;
		mutex_exit(&dir->z_lock);
		return;
	}

	/* Find the link that points at 'dl' and splice 'dl' out. */
	for (linkp = &dir->z_dirlocks; *linkp != dl;
	    linkp = &(*linkp)->dl_next)
		continue;
	*linkp = dl->dl_next;

	cv_broadcast(&dl->dl_cv);
	mutex_exit(&dir->z_lock);

	/* A non-zero dl_namesize means dl_name is a private copy. */
	if (dl->dl_namesize != 0)
		kmem_free(dl->dl_name, dl->dl_namesize);
	cv_destroy(&dl->dl_cv);
	kmem_free(dl, sizeof (*dl));
}
/*
 * Look up 'name' in directory 'dzp' and return its vnode in '*vpp'.
 *
 * The empty name, '.' and '..' are resolved here because no directory
 * entries are stored for them; '.zfs' is resolved here when the
 * directory has a control directory.  Every other name goes through
 * zfs_dirent_lock() with ZEXISTS | ZSHARED (plus ZCILOOK when
 * FIGNORECASE is set in 'flags').
 *
 * For the special names, when FIGNORECASE is set and 'rpnp' is given,
 * 'name' itself is copied into rpnp's buffer.  For ordinary names
 * 'rpnp' is passed to zfs_dirent_lock() instead.
 *
 * Returns 0 on success or an errno value.
 */
int
zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
    int *deflg, pathname_t *rpnp)
{
	boolean_t is_self =
	    (name[0] == 0 || (name[0] == '.' && name[1] == 0));
	boolean_t is_parent =
	    (name[0] == '.' && name[1] == '.' && name[2] == 0);
	int error = 0;

	if (is_self) {
		*vpp = ZTOV(dzp);
		VN_HOLD(*vpp);
	} else if (is_parent) {
		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
		znode_t *parent;

		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (dzp->z_phys->zp_parent == dzp->z_id &&
		    zfsvfs->z_parent != zfsvfs) {
			return (zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", vpp, NULL, 0, NULL, kcred,
			    NULL, NULL, NULL));
		}

		rw_enter(&dzp->z_parent_lock, RW_READER);
		error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &parent);
		if (error == 0)
			*vpp = ZTOV(parent);
		rw_exit(&dzp->z_parent_lock);
	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
		*vpp = zfsctl_root(dzp);
	} else {
		zfs_dirlock_t *dl;
		znode_t *zp;
		int zf = ZEXISTS | ZSHARED;

		if (flags & FIGNORECASE)
			zf |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
		if (error == 0) {
			*vpp = ZTOV(zp);
			zfs_dirent_unlock(dl);
			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
		}
		/* rpnp was handed to zfs_dirent_lock(); skip the copy below */
		rpnp = NULL;
	}

	if (!error && rpnp && (flags & FIGNORECASE))
		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);

	return (error);
}
/*
 * Format 'x' as lower-case hexadecimal with no leading zeros ("0" for
 * zero).  The digits are written right-aligned into 'namebuf', which
 * must hold 17 bytes, and a pointer to the first digit inside that
 * buffer is returned.
 */
static char *
zfs_unlinked_hexname(char namebuf[17], uint64_t x)
{
	static const char hexdigits[] = "0123456789abcdef";
	char *cursor = namebuf + 16;

	*cursor = '\0';
	/* Emit digits from least to most significant, moving leftwards. */
	do {
		cursor--;
		*cursor = hexdigits[x % 16];
		x /= 16;
	} while (x != 0);

	return (cursor);
}
/*
 * unlinked Set (formerly known as the "delete queue") Error Handling
 *
 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
 * don't specify the name of the entry that we will be manipulating.  We
 * also fib and say that we won't be adding any new entries to the
 * unlinked set, even though we might (this is to lower the minimum file
 * size that can be deleted in a full filesystem).  So on the small
 * chance that the unlinked set is using a fat zap (ie. has more than
 * 2000 entries), we *may* not pre-read a block that's needed.
 * Therefore it is remotely possible for some of the assertions
 * regarding the unlinked set below to fail due to i/o error.  On a
 * nondebug system, this will result in the space being leaked.
 */

/*
 * Add 'zp' to the unlinked set as part of transaction 'tx'.  The entry
 * is named by the hexadecimal form of the object number and stores the
 * object number as its value.  'zp' must already be marked unlinked
 * with a zero link count (asserted).
 */
void
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	char hexbuf[17];
	char *entry;
	int error;

	ASSERT(zp->z_unlinked);
	ASSERT3U(zp->z_phys->zp_links, ==, 0);

	entry = zfs_unlinked_hexname(hexbuf, zp->z_id);
	error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, entry,
	    8, 1, &zp->z_id, tx);
	ASSERT3U(error, ==, 0);
}
/*
 * Clean up any znodes that had no links when we either crashed or
 * (force) umounted the file system.
 *
 * Every entry of the unlinked set is pulled back into core, marked
 * z_unlinked and released.  Entries whose object info or znode cannot
 * be obtained are skipped.
 */
void
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
{
	zap_cursor_t zc;
	zap_attribute_t zap;
	dmu_object_info_t doi;
	znode_t *zp;

	/*
	 * Iterate over the contents of the unlinked set.
	 */
	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
	    zap_cursor_retrieve(&zc, &zap) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t obj = zap.za_first_integer;

		/*
		 * See what kind of object we have in list
		 */
		if (dmu_object_info(zfsvfs->z_os, obj, &doi) != 0)
			continue;

		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));

		/*
		 * We need to re-mark these list entries for deletion,
		 * so we pull them back into core and set zp->z_unlinked.
		 *
		 * We may pick up znodes that are already marked for
		 * deletion.  This could happen during the purge of an
		 * extended attribute directory.  All we need to do is
		 * skip over them, since they are already in the system
		 * marked z_unlinked.
		 */
		if (zfs_zget(zfsvfs, obj, &zp) == 0) {
			zp->z_unlinked = B_TRUE;
			VN_RELE(ZTOV(zp));
		}
	}
	zap_cursor_fini(&zc);
}
/*
 * Delete the entire contents of a directory.  Return a count
 * of the number of entries that could not be deleted.  If the walk of
 * the directory itself ends with anything other than ENOENT, the count
 * is bumped by one more so that the directory stays in the unlinked
 * set.
 *
 * NOTE: this function assumes that the directory is inactive,
 *	so there is no need to lock its entries before deletion.
 *	Also, it assumes every entry is a regular file or a symbolic
 *	link (asserted).
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
	zap_cursor_t zc;
	zap_attribute_t zap;
	zfs_dirlock_t dl;
	int skipped = 0;
	int walk_err;

	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (walk_err = zap_cursor_retrieve(&zc, &zap)) == 0;
	    zap_cursor_advance(&zc)) {
		znode_t *xzp;
		dmu_tx_t *tx;

		if (zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer),
		    &xzp) != 0) {
			skipped++;
			continue;
		}

		ASSERT((ZTOV(xzp)->v_type == VREG) ||
		    (ZTOV(xzp)->v_type == VLNK));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
		dmu_tx_hold_bonus(tx, xzp->z_id);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
			dmu_tx_abort(tx);
			VN_RELE(ZTOV(xzp));
			skipped++;
			continue;
		}

		/* Build a throwaway dirlock describing just this entry. */
		bzero(&dl, sizeof (dl));
		dl.dl_dzp = dzp;
		dl.dl_name = zap.za_name;

		if (zfs_link_destroy(&dl, xzp, tx, 0, NULL) != 0)
			skipped++;

		dmu_tx_commit(tx);
		VN_RELE(ZTOV(xzp));
	}
	zap_cursor_fini(&zc);

	/* The walk is expected to end with ENOENT from the cursor. */
	return (walk_err == ENOENT ? skipped : skipped + 1);
}
/*
* Remove the on-disk object behind 'zp'.
*
* 'zp' must have no vnode references and a zero link count (asserted).
* If 'zp' is an extended attribute directory its contents are purged
* first.  If 'zp' has an xattr directory of its own, that directory is
* marked unlinked and added to the unlinked set in the same transaction
* that removes 'zp' from the unlinked set and deletes it.
*
* When the purge or the transaction assignment fails, the in-core znode
* is released and the object is left in the unlinked set.
*/
void
zfs_rmnode(znode_t *zp)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
znode_t *xzp = NULL;
char obj_name[17];
dmu_tx_t *tx;
uint64_t acl_obj;
int error;
ASSERT(ZTOV(zp)->v_count == 0);
ASSERT(zp->z_phys->zp_links == 0);
/*
* If this is an attribute directory, purge its contents.
*/
if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
if (zfs_purgedir(zp) != 0) {
/*
* Not enough space to delete some xattrs.
* Leave it on the unlinked set.
*/
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
return;
}
}
/*
* If the file has extended attributes, we're going to unlink
* the xattr dir.
*/
if (zp->z_phys->zp_xattr) {
error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
ASSERT(error == 0);
}
/* Remember the external ACL object (if any) so it can be freed too. */
acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
/*
* Set up the transaction.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
if (xzp) {
/*
* A second hold on the unlinked set, this time with TRUE,
* presumably because an entry for xzp is added below.
*/
dmu_tx_hold_bonus(tx, xzp->z_id);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
/*
* Not enough space to delete the file. Leave it in the
* unlinked set, leaking it until the fs is remounted (at
* which point we'll call zfs_unlinked_drain() to process it).
*/
dmu_tx_abort(tx);
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
goto out;
}
if (xzp) {
dmu_buf_will_dirty(xzp->z_dbuf, tx);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
xzp->z_phys->zp_links = 0; /* no more links to it */
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
/* Remove this znode from the unlinked set */
error = zap_remove(os, zfsvfs->z_unlinkedobj,
zfs_unlinked_hexname(obj_name, zp->z_id), tx);
ASSERT3U(error, ==, 0);
zfs_znode_delete(zp, tx);
dmu_tx_commit(tx);
out:
/* Drop the hold taken by zfs_zget() on the xattr directory. */
if (xzp)
VN_RELE(ZTOV(xzp));
}
/*
 * Build the 64-bit value stored in a directory entry for 'zp'.
 *
 * The value is the object number; on file systems whose version is at
 * least ZPL_VERSION_DIRENT_TYPE the IFTODT() form of the file mode is
 * also placed in the bits from bit 60 upwards.
 */
static uint64_t
zfs_dirent(znode_t *zp)
{
	uint64_t id = zp->z_id;

	if (zp->z_zfsvfs->z_version < ZPL_VERSION_DIRENT_TYPE)
		return (id);

	return (id | (IFTODT((zp)->z_phys->zp_mode) << 60));
}
/*
 * Link 'zp' into the directory entry described by 'dl'.
 *
 * Unless ZRENAMING is set, zp's link count is incremented; that is
 * refused with ENOENT if 'zp' has already been unlinked.  The
 * directory's size (and its link count, when 'zp' is a directory) is
 * updated, the entry is added to the directory's zap object and the
 * DNLC is updated.
 *
 * Returns 0 on success.  Can only fail if 'zp' has been unlinked.
 */
int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
	znode_t *dir = dl->dl_dzp;
	vnode_t *child_vp = ZTOV(zp);
	int child_is_dir = (child_vp->v_type == VDIR);
	uint64_t dirent;
	int error;

	dmu_buf_will_dirty(zp->z_dbuf, tx);
	mutex_enter(&zp->z_lock);

	if ((flag & ZRENAMING) == 0) {
		if (zp->z_unlinked) {	/* no new links to unlinked zp */
			ASSERT(!(flag & (ZNEW | ZEXISTS)));
			mutex_exit(&zp->z_lock);
			return (ENOENT);
		}
		zp->z_phys->zp_links++;
	}
	zp->z_phys->zp_parent = dir->z_id;	/* dir is now zp's parent */

	if ((flag & ZNEW) == 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	mutex_exit(&zp->z_lock);

	dmu_buf_will_dirty(dir->z_dbuf, tx);
	mutex_enter(&dir->z_lock);
	dir->z_phys->zp_size++;			/* one dirent added */
	if (child_is_dir)
		dir->z_phys->zp_links++;	/* ".." link from zp */
	zfs_time_stamper_locked(dir, CONTENT_MODIFIED, tx);
	mutex_exit(&dir->z_lock);

	dirent = zfs_dirent(zp);
	error = zap_add(zp->z_zfsvfs->z_os, dir->z_id, dl->dl_name,
	    8, 1, &dirent, tx);
	ASSERT(error == 0);

	dnlc_update(ZTOV(dir), dl->dl_name, child_vp);

	return (0);
}
/*
* Unlink zp from dl, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
* and it's the caller's job to do it.
*
* With ZRENAMING set, zp's own link count and the mount-point and
* empty-directory checks are skipped; only the directory and its zap
* entry are updated.
*
* The DNLC entry for the name is removed before any check is made, so it
* is gone even when EBUSY or EEXIST is returned.
*/
int
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
boolean_t *unlinkedp)
{
znode_t *dzp = dl->dl_dzp;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
boolean_t unlinked = B_FALSE;
int error;
dnlc_remove(ZTOV(dzp), dl->dl_name);
if (!(flag & ZRENAMING)) {
dmu_buf_will_dirty(zp->z_dbuf, tx);
if (vn_vfswlock(vp)) /* prevent new mounts on zp */
return (EBUSY);
if (vn_ismntpt(vp)) { /* don't remove mount point */
vn_vfsunlock(vp);
return (EBUSY);
}
mutex_enter(&zp->z_lock);
if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
return (EEXIST);
}
/*
* A link count at or below zp_is_dir is too low to decrement:
* report it and repair the count to zp_is_dir + 1 first.
*/
if (zp->z_phys->zp_links <= zp_is_dir) {
zfs_panic_recover("zfs: link count on %s is %u, "
"should be at least %u",
zp->z_vnode->v_path ? zp->z_vnode->v_path :
"<unknown>", (int)zp->z_phys->zp_links,
zp_is_dir + 1);
zp->z_phys->zp_links = zp_is_dir + 1;
}
/*
* Dropping to zp_is_dir links means the last name is gone; the
* count is then forced to 0 and zp is marked unlinked.
*/
if (--zp->z_phys->zp_links == zp_is_dir) {
zp->z_unlinked = B_TRUE;
zp->z_phys->zp_links = 0;
unlinked = B_TRUE;
} else {
zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
}
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
}
dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
dzp->z_phys->zp_size--; /* one dirent removed */
dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
mutex_exit(&dzp->z_lock);
/*
* Remove the zap entry.  On normalizing file systems the match type
* follows the same exact/first rule that zfs_dirent_lock() uses.
*/
if (zp->z_zfsvfs->z_norm) {
if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
(flag & ZCIEXACT)) ||
((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
!(flag & ZCILOOK)))
error = zap_remove_norm(zp->z_zfsvfs->z_os,
dzp->z_id, dl->dl_name, MT_EXACT, tx);
else
error = zap_remove_norm(zp->z_zfsvfs->z_os,
dzp->z_id, dl->dl_name, MT_FIRST, tx);
} else {
error = zap_remove(zp->z_zfsvfs->z_os,
dzp->z_id, dl->dl_name, tx);
}
ASSERT(error == 0);
/* Either tell the caller, or queue the znode for deletion here. */
if (unlinkedp != NULL)
*unlinkedp = unlinked;
else if (unlinked)
zfs_unlinked_add(zp, tx);
return (0);
}
/*
 * Report whether directory 'dzp' is empty: its size is exactly 2 (only
 * "." and ".." remain) and it has no dirlocks, i.e. no work is in
 * progress.  Works with or without z_lock held, but the answer is only
 * a hint when the lock is not held.
 */
boolean_t
zfs_dirempty(znode_t *dzp)
{
	boolean_t only_dots = (dzp->z_phys->zp_size == 2);
	boolean_t idle = (dzp->z_dirlocks == NULL);

	return (only_dots && idle);
}
/*
 * Create the extended attribute directory for 'zp'.
 *
 *	zp	- znode that gets the new xattr directory
 *	vap	- attributes for the new directory
 *	xvpp	- receives the vnode of the new directory (NULL on failure)
 *	cr	- credentials of the caller
 *
 * The caller needs ACE_WRITE_NAMED_ATTRS access to 'zp'.  Returns 0 on
 * success or an errno value.  When transaction assignment fails with
 * ERESTART under TXG_NOWAIT, dmu_tx_wait() has already been called
 * before the error is returned.
 */
int
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zfs_fuid_info_t *fuidp = NULL;
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;

	*xvpp = NULL;

	error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr);
	if (error != 0)
		return (error);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);

	/* Ephemeral ids may require writing (or creating) the FUID table. */
	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
		if (zfsvfs->z_fuid_obj != 0) {
			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
		} else {
			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
			    FUID_SIZE_ESTIMATE(zfsvfs));
			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
		}
	}

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error != 0) {
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
			dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		return (error);
	}

	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
	ASSERT(xzp->z_phys->zp_parent == zp->z_id);

	/* Record the new directory in the base file's znode. */
	dmu_buf_will_dirty(zp->z_dbuf, tx);
	zp->z_phys->zp_xattr = xzp->z_id;

	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
	    xzp, "", NULL, fuidp, vap);
	if (fuidp != NULL)
		zfs_fuid_info_free(fuidp);
	dmu_tx_commit(tx);

	*xvpp = ZTOV(xzp);
	return (0);
}
/*
 * Return a vnode for the extended attribute directory for zp.
 * ** If the directory does not already exist, it is created **
 * (only when CREATE_XATTR_DIR is set in 'flags').
 *
 *	IN:	zp	- znode to obtain attribute directory from
 *		cr	- credentials of caller
 *		flags	- flags from the VOP_LOOKUP call
 *
 *	OUT:	xvpp	- pointer to extended attribute directory vnode
 *
 *	RETURN:	0 on success
 *		ENOENT if the directory is missing and creation was not
 *		requested, EROFS on a read-only file system, or another
 *		error number on failure
 */
int
zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_t *xzp;
	zfs_dirlock_t *dl;
	vattr_t va;
	int error;

	for (;;) {
		error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
		if (error != 0)
			return (error);

		if (xzp != NULL) {
			/* The directory already exists. */
			*xvpp = ZTOV(xzp);
			zfs_dirent_unlock(dl);
			return (0);
		}

		ASSERT(zp->z_phys->zp_xattr == 0);

		if ((flags & CREATE_XATTR_DIR) == 0) {
			zfs_dirent_unlock(dl);
			return (ENOENT);
		}

		if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
			zfs_dirent_unlock(dl);
			return (EROFS);
		}

		/*
		 * The ability to 'create' files in an attribute
		 * directory comes from the write_xattr permission on the
		 * base file.
		 *
		 * The ability to 'search' an attribute directory requires
		 * read_xattr permission on the base file.
		 *
		 * Once in a directory the ability to read/write attributes
		 * is controlled by the permissions on the attribute file.
		 */
		va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
		va.va_type = VDIR;
		va.va_mode = S_IFDIR | S_ISVTX | 0777;
		zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);

		error = zfs_make_xattrdir(zp, &va, xvpp, cr);
		zfs_dirent_unlock(dl);

		/* NB: we already did dmu_tx_wait() if necessary */
		if (!(error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT))
			return (error);
	}
}
/*
 * Decide whether it is okay to remove within a sticky directory.
 *
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if:
 *
 *	you own the directory,
 *	you own the entry,
 *	the entry is a plain file and you have write access,
 *	or you are privileged (checked in secpolicy...).
 *
 * Access is also granted unconditionally during ZIL replay and when
 * the directory does not have the sticky bit set.
 *
 * The function returns 0 if remove access is granted.
 */
int
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	uid_t dir_owner;
	uid_t file_owner;
	uid_t caller;

	if (zfsvfs->z_assign >= TXG_INITIAL)		/* ZIL replay */
		return (0);

	if (!(zdp->z_phys->zp_mode & S_ISVTX))
		return (0);

	dir_owner = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr,
	    ZFS_OWNER);
	file_owner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr,
	    ZFS_OWNER);
	caller = crgetuid(cr);

	if (caller == dir_owner || caller == file_owner)
		return (0);

	if (ZTOV(zp)->v_type == VREG &&
	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
		return (0);

	return (secpolicy_vnode_remove(cr));
}
+688
View File
@@ -0,0 +1,688 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)zfs_fuid.c 1.5 08/01/31 SMI"
#include <sys/zfs_context.h>
#include <sys/sunddi.h>
#include <sys/dmu.h>
#include <sys/avl.h>
#include <sys/zap.h>
#include <sys/refcount.h>
#include <sys/nvpair.h>
#ifdef _KERNEL
#include <sys/kidmap.h>
#include <sys/sid.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#endif
#include <sys/zfs_fuid.h>
/*
* FUID Domain table(s).
*
* The FUID table is stored as a packed nvlist of an array
* of nvlists which contain an index, domain string and offset
*
* During file system initialization the nvlist(s) are read and
* two AVL trees are created. One tree is keyed by the index number
* and the other by the domain string. Nodes are never removed from
* trees, but new entries may be added. If a new entry is added then the
* on-disk packed nvlist will also be updated.
*/
/* nvpair names used inside the packed FUID nvlist */
#define FUID_IDX "fuid_idx"
#define FUID_DOMAIN "fuid_domain"
#define FUID_OFFSET "fuid_offset"
#define FUID_NVP_ARRAY "fuid_nvlist"
/*
* In-core FUID domain entry.  A single allocation is linked into both
* AVL trees at once.
*/
typedef struct fuid_domain {
avl_node_t f_domnode; /* linkage in the tree keyed by domain string */
avl_node_t f_idxnode; /* linkage in the tree keyed by index */
ksiddomain_t *f_ksid; /* domain, as returned by ksid_lookupdomain() */
uint64_t f_idx; /* index number of this domain */
} fuid_domain_t;
/*
 * AVL comparator ordering fuid_domain_t nodes by f_idx.
 * Returns -1, 0 or 1.
 */
static int
idx_compare(const void *arg1, const void *arg2)
{
	const fuid_domain_t *lhs = arg1;
	const fuid_domain_t *rhs = arg2;

	if (lhs->f_idx == rhs->f_idx)
		return (0);

	return (lhs->f_idx < rhs->f_idx ? -1 : 1);
}
/*
 * AVL comparator ordering fuid_domain_t nodes by domain name.
 * The strcmp() result is clamped to -1, 0 or 1.
 */
static int
domain_compare(const void *arg1, const void *arg2)
{
	const fuid_domain_t *lhs = arg1;
	const fuid_domain_t *rhs = arg2;
	int cmp = strcmp(lhs->f_ksid->kd_name, rhs->f_ksid->kd_name);

	if (cmp > 0)
		return (1);
	if (cmp < 0)
		return (-1);
	return (0);
}
/*
 * Load the initial fuid domain and idx trees.  This function is used by
 * both the kernel and zdb.
 *
 * Both trees are always created.  The size of the packed table is read
 * from the bonus buffer of 'fuid_obj'; when it is non-zero the packed
 * nvlist is read and unpacked, and one node per array element is added
 * to both trees.
 *
 * Returns the size of the packed table (0 if there is none).
 */
uint64_t
zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
    avl_tree_t *domain_tree)
{
	dmu_buf_t *db;
	uint64_t fuid_size;
	nvlist_t **fuidnvp;
	nvlist_t *nvp = NULL;
	uint_t count;
	uint_t i;
	char *packed;

	avl_create(idx_tree, idx_compare,
	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
	avl_create(domain_tree, domain_compare,
	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));

	VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db));
	fuid_size = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	/* An empty table leaves both trees empty. */
	if (fuid_size == 0)
		return (fuid_size);

	packed = kmem_alloc(fuid_size, KM_SLEEP);
	VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
	VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0);
	VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
	    &fuidnvp, &count) == 0);

	for (i = 0; i < count; i++) {
		fuid_domain_t *node;
		char *domain;
		uint64_t idx;

		VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
		    &domain) == 0);
		VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
		    &idx) == 0);

		node = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
		node->f_idx = idx;
		node->f_ksid = ksid_lookupdomain(domain);

		/* The same node lives in both trees. */
		avl_add(idx_tree, node);
		avl_add(domain_tree, node);
	}

	nvlist_free(nvp);
	kmem_free(packed, fuid_size);

	return (fuid_size);
}
/*
 * Tear down both FUID trees.  Every node is linked into both trees, so
 * the domain tree walk only releases the ksid domain hold and the
 * index tree walk frees the node memory itself.
 */
void
zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
{
	fuid_domain_t *node;
	void *walk;

	walk = NULL;
	while ((node = avl_destroy_nodes(domain_tree, &walk)) != NULL)
		ksiddomain_rele(node->f_ksid);
	avl_destroy(domain_tree);

	walk = NULL;
	while ((node = avl_destroy_nodes(idx_tree, &walk)) != NULL)
		kmem_free(node, sizeof (fuid_domain_t));
	avl_destroy(idx_tree);
}
/*
 * Look up the domain string registered under index idx.
 *
 * Returns a pointer to the domain name held by the matching tree node,
 * or NULL when no node with that index exists.  The string is owned by
 * the node; the caller must not free it.
 */
char *
zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
{
	fuid_domain_t searchnode, *findnode;
	avl_index_t loc;

	/* Only f_idx is examined by the index tree's comparator. */
	searchnode.f_idx = idx;

	findnode = avl_find(idx_tree, &searchnode, &loc);
	if (findnode == NULL)
		return (NULL);

	return (findnode->f_ksid->kd_name);
}
#ifdef _KERNEL
/*
 * Load the FUID trees of this file system into memory, at most once.
 *
 * When no FUID object number is cached in the zfsvfs it is looked up
 * in the master node.  If that lookup reports ENOENT and a transaction
 * was supplied, the object is allocated and registered under
 * ZFS_FUID_TABLES.  The whole operation runs under z_fuid_lock held as
 * writer.
 */
static void
zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	int error = 0;

	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);

	if (!zfsvfs->z_fuid_loaded) {
		if (zfsvfs->z_fuid_obj == 0) {
			/* first make sure we need to allocate object */
			error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
			    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);

			if (tx != NULL && error == ENOENT) {
				zfsvfs->z_fuid_obj =
				    dmu_object_alloc(zfsvfs->z_os,
				    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
				    sizeof (uint64_t), tx);
				VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
				    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
				    &zfsvfs->z_fuid_obj, tx) == 0);
			}
		}

		zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
		    zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
		    &zfsvfs->z_fuid_domain);
		zfsvfs->z_fuid_loaded = B_TRUE;
	}

	rw_exit(&zfsvfs->z_fuid_lock);
}
/*
 * Query domain table for a given domain.
 *
 * If domain isn't found it is added to AVL trees and
 * the results are pushed out to disk.
 *
 * zfsvfs	file system whose FUID table is consulted
 * domain	domain string to look up; "" is the dummy "nobody" domain
 * retdomain	optional; when non-NULL it receives a pointer to the
 *		domain name string ("" for the nobody domain)
 * tx		transaction used when the table has to be created or
 *		rewritten
 *
 * Returns the index of the domain, or 0 for the nobody domain.
 */
int
zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
    dmu_tx_t *tx)
{
	fuid_domain_t searchnode, *findnode;
	fuid_domain_t *domnode;
	avl_index_t loc;
	nvlist_t *nvp;
	nvlist_t **fuids;
	uint64_t retidx;
	size_t nvsize = 0;
	char *packed;
	dmu_buf_t *db;
	int i = 0;

	/*
	 * If the dummy "nobody" domain then return an index of 0
	 * to cause the created FUID to be a standard POSIX id
	 * for the user nobody.
	 */
	if (domain[0] == '\0') {
		if (retdomain)
			*retdomain = "";
		return (0);
	}

	searchnode.f_ksid = ksid_lookupdomain(domain);
	if (retdomain)
		*retdomain = searchnode.f_ksid->kd_name;

	if (!zfsvfs->z_fuid_loaded)
		zfs_fuid_init(zfsvfs, tx);

	/* Fast path: the domain is usually already known. */
	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
	rw_exit(&zfsvfs->z_fuid_lock);

	if (findnode) {
		ksiddomain_rele(searchnode.f_ksid);
		return (findnode->f_idx);
	}

	domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
	domnode->f_ksid = searchnode.f_ksid;

	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);

	/*
	 * The reader lock was dropped above, so another thread may have
	 * inserted this domain in the meantime.  Look again under the
	 * writer lock; adding the same domain twice would be a duplicate
	 * avl_add().
	 */
	findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
	if (findnode) {
		retidx = findnode->f_idx;
		rw_exit(&zfsvfs->z_fuid_lock);
		ksiddomain_rele(searchnode.f_ksid);
		kmem_free(domnode, sizeof (fuid_domain_t));
		return (retidx);
	}

	/* Nodes are never removed, so node count + 1 is a fresh index. */
	retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;

	avl_add(&zfsvfs->z_fuid_domain, domnode);
	avl_add(&zfsvfs->z_fuid_idx, domnode);

	/*
	 * Now resync the on-disk nvlist.
	 */
	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	domnode = avl_first(&zfsvfs->z_fuid_domain);
	fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
	while (domnode) {
		VERIFY(nvlist_alloc(&fuids[i],
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
		    domnode->f_idx) == 0);
		VERIFY(nvlist_add_uint64(fuids[i],
		    FUID_OFFSET, 0) == 0);
		VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
		    domnode->f_ksid->kd_name) == 0);
		domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
	}
	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
	    fuids, retidx) == 0);
	for (i = 0; i != retidx; i++)
		nvlist_free(fuids[i]);
	kmem_free(fuids, retidx * sizeof (void *));

	VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
	packed = kmem_alloc(nvsize, KM_SLEEP);
	VERIFY(nvlist_pack(nvp, &packed, &nvsize,
	    NV_ENCODE_XDR, KM_SLEEP) == 0);
	nvlist_free(nvp);

	zfsvfs->z_fuid_size = nvsize;
	dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
	    zfsvfs->z_fuid_size, packed, tx);
	kmem_free(packed, zfsvfs->z_fuid_size);

	/* Record the new packed size in the object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
	    FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
	dmu_buf_rele(db, FTAG);

	rw_exit(&zfsvfs->z_fuid_lock);
	return (retidx);
}
/*
 * Query domain table by index, returning domain string.
 *
 * Returns NULL for index 0 or when the file system does not use FUIDs;
 * otherwise loads the table on first use and returns the domain name
 * string stored in the matching AVL node.
 */
static char *
zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
{
	char *name;

	if (!zfsvfs->z_use_fuids || idx == 0)
		return (NULL);

	if (!zfsvfs->z_fuid_loaded)
		zfs_fuid_init(zfsvfs, NULL);

	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
	name = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
	rw_exit(&zfsvfs->z_fuid_lock);

	ASSERT(name);
	return (name);
}
/*
 * Map the owner and group FUIDs stored in a znode to ids, storing the
 * results through uidp and gidp respectively.
 */
void
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
{
	zfsvfs_t *fs = zp->z_zfsvfs;

	*uidp = zfs_fuid_map_id(fs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
	*gidp = zfs_fuid_map_id(fs, zp->z_phys->zp_gid, cr, ZFS_GROUP);
}
/*
 * Map a single FUID to an id.
 *
 * A FUID whose index part is 0 is returned unchanged.  Otherwise the
 * index is resolved to a domain string and, depending on type,
 * kidmap_getuidbysid() or kidmap_getgidbysid() is asked to map the
 * domain + rid pair.  The status of that call is ignored.
 */
uid_t
zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
    cred_t *cr, zfs_fuid_type_t type)
{
	uint32_t domidx = FUID_INDEX(fuid);
	char *name;
	uid_t mapped;

	if (domidx == 0)
		return (fuid);

	name = zfs_fuid_find_by_idx(zfsvfs, domidx);
	ASSERT(name != NULL);

	switch (type) {
	case ZFS_OWNER:
	case ZFS_ACE_USER:
		(void) kidmap_getuidbysid(crgetzone(cr), name,
		    FUID_RID(fuid), &mapped);
		break;
	default:
		(void) kidmap_getgidbysid(crgetzone(cr), name,
		    FUID_RID(fuid), &mapped);
		break;
	}

	return (mapped);
}
/*
 * Record a FUID in the zfs_fuid_info_t being built for this operation,
 * allocating the info structure on first use.
 *
 * Each unique domain (keyed by its table index idx) is kept only once
 * in the z_domains list.  The FUID that gets logged encodes the 1-based
 * position of the domain in that list, not the table index.  ACE types
 * are appended to the z_fuids list; owner and group are stored in
 * their dedicated fields.
 */
static void
zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
    uint64_t idx, uint64_t id, zfs_fuid_type_t type)
{
	zfs_fuid_info_t *info;
	zfs_fuid_domain_t *dom;
	zfs_fuid_t *entry;
	uint64_t pos = 1;

	if (*fuidpp == NULL)
		*fuidpp = zfs_fuid_info_alloc();
	info = *fuidpp;

	/*
	 * Walk the domain list looking for idx; pos tracks the 1-based
	 * position and ends up one past the tail when nothing matches.
	 */
	dom = list_head(&info->z_domains);
	while (dom != NULL && dom->z_domidx != idx) {
		dom = list_next(&info->z_domains, dom);
		pos++;
	}

	if (dom == NULL) {
		/* New domain: append it, so its position is pos. */
		dom = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
		dom->z_domain = domain;
		dom->z_domidx = idx;
		list_insert_tail(&info->z_domains, dom);
		info->z_domain_str_sz += strlen(domain) + 1;
		info->z_domain_cnt++;
	}

	switch (type) {
	case ZFS_ACE_USER:
	case ZFS_ACE_GROUP:
		/*
		 * Now allocate fuid entry and add it on the end of the list
		 */
		entry = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
		entry->z_id = id;
		entry->z_domidx = idx;
		entry->z_logfuid = FUID_ENCODE(pos, rid);
		list_insert_tail(&info->z_fuids, entry);
		info->z_fuid_cnt++;
		break;
	case ZFS_OWNER:
		info->z_fuid_owner = FUID_ENCODE(pos, rid);
		break;
	default:
		info->z_fuid_group = FUID_ENCODE(pos, rid);
		break;
	}
}
/*
 * Create a file system FUID, based on information in the users cred.
 *
 * type must be ZFS_OWNER or ZFS_GROUP.  When the file system does not
 * use FUIDs, or the cred's uid/gid is not ephemeral, that id is
 * returned directly.  Otherwise the domain and rid are taken from the
 * cred's SID, the domain is looked up (or added) in the domain table
 * and the FUID is recorded in *fuidp.
 */
uint64_t
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
    dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
{
	const char *domain;
	char *kdomain;
	ksid_t *ksid;
	uint64_t idx;
	uint32_t rid;
	uid_t id;

	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);

	id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);

	if (!(zfsvfs->z_use_fuids && IS_EPHEMERAL(id)))
		return ((uint64_t)id);

	ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
	VERIFY(ksid != NULL);

	rid = ksid_getrid(ksid);
	domain = ksid_getdomain(ksid);

	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);

	return (FUID_ENCODE(idx, rid));
}
/*
* Create a file system FUID for an ACL ace
* or a chown/chgrp of the file.
* This is similar to zfs_fuid_create_cred, except that
* we can't find the domain + rid information in the
* cred. Instead we have to query Winchester for the
* domain and rid.
*
* During replay operations the domain+rid information is
* found in the zfs_fuid_info_t that the replay code has
* attached to the zfsvfs of the file system.
*
* Returns id unchanged when no FUID is needed, UID_NOBODY during
* replay when no fuid info was logged, and otherwise the FUID encoded
* from the domain table index and the rid. Outside of replay the
* result is also recorded in *fuidpp for logging.
*/
uint64_t
zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
{
const char *domain;
char *kdomain;
uint32_t fuid_idx = FUID_INDEX(id);
uint32_t rid;
idmap_stat status;
uint64_t idx;
/* z_assign at or above TXG_INITIAL is treated as "replay in progress" */
boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
zfs_fuid_t *zfuid = NULL;
zfs_fuid_info_t *fuidp;
/*
* If POSIX ID, or entry is already a FUID then
* just return the id
*
* We may also be handed an already FUID'ized id via
* chmod.
*/
if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
return (id);
if (is_replay) {
fuidp = zfsvfs->z_fuid_replay;
/*
* If we are passed an ephemeral id, but no
* fuid_info was logged then return NOBODY.
* This is most likely a result of idmap service
* not being available.
*/
if (fuidp == NULL)
return (UID_NOBODY);
/*
* Pull the logged rid and log-relative domain index for this
* kind of id out of the replay info.
*/
switch (type) {
case ZFS_ACE_USER:
case ZFS_ACE_GROUP:
/*
* NOTE(review): the list head is dereferenced without a NULL
* check; presumably the log always carries one entry per
* ephemeral ACE id -- confirm.
*/
zfuid = list_head(&fuidp->z_fuids);
rid = FUID_RID(zfuid->z_logfuid);
idx = FUID_INDEX(zfuid->z_logfuid);
break;
case ZFS_OWNER:
rid = FUID_RID(fuidp->z_fuid_owner);
idx = FUID_INDEX(fuidp->z_fuid_owner);
break;
case ZFS_GROUP:
rid = FUID_RID(fuidp->z_fuid_group);
idx = FUID_INDEX(fuidp->z_fuid_group);
break;
};
/*
* The logged index is 1-based, hence the -1.
* NOTE(review): assumes idx >= 1 and within the table -- confirm.
*/
domain = fuidp->z_domain_table[idx -1];
} else {
if (type == ZFS_OWNER || type == ZFS_ACE_USER)
status = kidmap_getsidbyuid(crgetzone(cr), id,
&domain, &rid);
else
status = kidmap_getsidbygid(crgetzone(cr), id,
&domain, &rid);
if (status != 0) {
/*
* When returning nobody we will need to
* make a dummy fuid table entry for logging
* purposes.
*/
rid = UID_NOBODY;
domain = "";
}
}
/* An empty domain yields index 0 from zfs_fuid_find_by_domain(). */
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
if (!is_replay)
zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
else if (zfuid != NULL) {
/* The replayed ACE entry has been consumed; drop it from the list. */
list_remove(&fuidp->z_fuids, zfuid);
kmem_free(zfuid, sizeof (zfs_fuid_t));
}
return (FUID_ENCODE(idx, rid));
}
/*
 * Destroy the in-memory FUID trees of a file system, if they were ever
 * loaded.  Runs with z_fuid_lock held as writer.
 */
void
zfs_fuid_destroy(zfsvfs_t *zfsvfs)
{
	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);

	if (zfsvfs->z_fuid_loaded) {
		zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx,
		    &zfsvfs->z_fuid_domain);
	}

	rw_exit(&zfsvfs->z_fuid_lock);
}
/*
 * Allocate a zeroed zfs_fuid_info_t, used for tracking FUIDs created
 * during zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR(), and set up its
 * (initially empty) domain and fuid lists.
 */
zfs_fuid_info_t *
zfs_fuid_info_alloc(void)
{
	zfs_fuid_info_t *info = kmem_zalloc(sizeof (*info), KM_SLEEP);

	list_create(&info->z_domains, sizeof (zfs_fuid_domain_t),
	    offsetof(zfs_fuid_domain_t, z_next));
	list_create(&info->z_fuids, sizeof (zfs_fuid_t),
	    offsetof(zfs_fuid_t, z_next));

	return (info);
}
/*
 * Release all memory associated with a zfs_fuid_info_t: every queued
 * fuid entry, the domain pointer table (if one was set up), every
 * domain list entry and finally the structure itself.
 */
void
zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
{
	zfs_fuid_t *entry;
	zfs_fuid_domain_t *dom;

	for (entry = list_head(&fuidp->z_fuids); entry != NULL;
	    entry = list_head(&fuidp->z_fuids)) {
		list_remove(&fuidp->z_fuids, entry);
		kmem_free(entry, sizeof (zfs_fuid_t));
	}

	if (fuidp->z_domain_table != NULL) {
		kmem_free(fuidp->z_domain_table,
		    (sizeof (char **)) * fuidp->z_domain_cnt);
	}

	for (dom = list_head(&fuidp->z_domains); dom != NULL;
	    dom = list_head(&fuidp->z_domains)) {
		list_remove(&fuidp->z_domains, dom);
		kmem_free(dom, sizeof (zfs_fuid_domain_t));
	}

	kmem_free(fuidp, sizeof (zfs_fuid_info_t));
}
/*
 * Check to see if id is a groupmember.  If cred
 * has ksid info then sidlist is checked first
 * and if still not found then POSIX groups are checked.
 *
 * An id without a domain index is compared directly against the ids
 * in the sid list; an id with a domain index is compared by domain
 * string and rid.  The "creator" well-known group/authority never
 * matches through the sid list.
 */
boolean_t
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
{
	ksid_t *ksid = crgetsid(cr, KSID_GROUP);
	uid_t gid;

	if (ksid != NULL) {
		ksidlist_t *sidlist = crgetsidlist(cr);
		uint32_t domidx = FUID_INDEX(id);
		uint32_t rid = FUID_RID(id);
		ksid_t *sids;
		int n;

		ASSERT(sidlist);
		sids = sidlist->ksl_sids;

		for (n = 0; n != sidlist->ksl_nsid; n++) {
			char *name;

			if (domidx == 0) {
				/* plain id: straight compare */
				if (id != IDMAP_WK_CREATOR_GROUP_GID &&
				    id == sids[n].ks_id)
					return (B_TRUE);
				continue;
			}

			name = zfs_fuid_find_by_idx(zfsvfs, domidx);
			ASSERT(name != NULL);

			if (strcmp(name, IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
				return (B_FALSE);

			if ((strcmp(name, sids[n].ks_domain->kd_name) == 0) &&
			    rid == sids[n].ks_rid)
				return (B_TRUE);
		}
	}

	/*
	 * Not found in ksidlist, check posix groups
	 */
	gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
	return (groupmember(gid, cr));
}
#endif
File diff suppressed because it is too large Load Diff
+693
View File
@@ -0,0 +1,693 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)zfs_log.c 1.13 08/04/09 SMI"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/file.h>
#include <sys/vfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/byteorder.h>
#include <sys/policy.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/zfs_fuid.h>
#include <sys/ddi.h>
/*
* All the functions in this file are used to construct the log entries
* to record transactions. They allocate an intent log transaction
* structure (itx_t) and save within it all the information necessary to
* possibly replay the transaction. The itx is then assigned a sequence
* number and inserted in the in-memory list anchored in the zilog.
*/
/*
 * Pick the transaction type for a create, based on what is being
 * created and on whether an ACL (vsecp != NULL) and/or extended
 * attributes (AT_XVATTR set in vap->va_mask) accompany it.
 *
 * Returns TX_MAX_TYPE (after asserting) for an unknown type.
 */
int
zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
{
	int has_xvattr = (vap->va_mask & AT_XVATTR);
	int has_acl = (vsecp != NULL);

	switch (type) {
	case Z_FILE:
		if (has_acl)
			return (has_xvattr ? TX_CREATE_ACL_ATTR :
			    TX_CREATE_ACL);
		return (has_xvattr ? TX_CREATE_ATTR : TX_CREATE);
	case Z_DIR:
		if (has_acl)
			return (has_xvattr ? TX_MKDIR_ACL_ATTR :
			    TX_MKDIR_ACL);
		return (has_xvattr ? TX_MKDIR_ATTR : TX_MKDIR);
	case Z_XATTRDIR:
		return (TX_MKXATTR);
	}

	ASSERT(0);
	return (TX_MAX_TYPE);
}
/*
 * Build up the log data necessary for logging an xvattr_t.
 *
 * Layout written at lrattr:
 *   - lr_attr_masksize, copied from xva_mapsize
 *   - the request bitmap (xva_mapsize 32-bit words)
 *   - one 64-bit word holding the XAT0_* bits to set on replay
 *   - two 64-bit words reserved for the create time
 *   - the anti-virus scanstamp (AV_SCANSTAMP_SZ bytes)
 *
 * Only attributes flagged as requested in xvap are encoded.
 */
static void
zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
{
	uint32_t *bitmap;
	uint64_t *attrs;
	uint64_t *crtime;
	xoptattr_t *xoap;
	void *scanstamp;
	int i;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	lrattr->lr_attr_masksize = xvap->xva_mapsize;
	bitmap = &lrattr->lr_attr_bitmap;
	for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
		*bitmap = xvap->xva_reqattrmap[i];
	}

	/* Now pack the attributes up in a single uint64_t */
	attrs = (uint64_t *)bitmap;
	crtime = attrs + 1;
	scanstamp = (caddr_t)(crtime + 2);
	*attrs = 0;

	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
		*attrs |= (xoap->xoa_readonly == 0) ? 0 :
		    XAT0_READONLY;
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
		*attrs |= (xoap->xoa_hidden == 0) ? 0 :
		    XAT0_HIDDEN;
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
		*attrs |= (xoap->xoa_system == 0) ? 0 :
		    XAT0_SYSTEM;
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
		*attrs |= (xoap->xoa_archive == 0) ? 0 :
		    XAT0_ARCHIVE;
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
		*attrs |= (xoap->xoa_immutable == 0) ? 0 :
		    XAT0_IMMUTABLE;
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
		*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
		    XAT0_NOUNLINK;
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
		*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
		    XAT0_APPENDONLY;
	/* The opaque attribute has its own bit; replay reads XAT0_OPAQUE. */
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
		*attrs |= (xoap->xoa_opaque == 0) ? 0 :
		    XAT0_OPAQUE;
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
		*attrs |= (xoap->xoa_nodump == 0) ? 0 :
		    XAT0_NODUMP;
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
		*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
		    XAT0_AV_QUARANTINED;
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
		*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
		    XAT0_AV_MODIFIED;

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
		bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
}
/*
 * Copy the log FUID of every entry on fuidp's z_fuids list into the
 * buffer at start, as consecutive 64-bit words.
 *
 * Returns the address just past the last word written.
 */
static void *
zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
{
	uint64_t *out = start;
	zfs_fuid_t *entry = list_head(&fuidp->z_fuids);

	/* First copy in the ACE FUIDs */
	while (entry != NULL) {
		*out = entry->z_logfuid;
		out++;
		entry = list_next(&fuidp->z_fuids, entry);
	}

	return (out);
}
/*
 * Copy the domain strings on fuidp's z_domains list, each including
 * its terminating NUL, back to back into the buffer at start.
 * Nothing is copied when z_domain_str_sz is zero.
 *
 * Returns the address just past the last byte written.
 */
static void *
zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
{
	caddr_t out = start;
	zfs_fuid_domain_t *dom;

	/* now copy in the domain info, if any */
	if (fuidp->z_domain_str_sz == 0)
		return (start);

	for (dom = list_head(&fuidp->z_domains); dom != NULL;
	    dom = list_next(&fuidp->z_domains, dom)) {
		size_t len = strlen(dom->z_domain) + 1;

		bcopy((void *)dom->z_domain, out, len);
		out += len;
	}

	return (out);
}
/*
 * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
 * TX_MKDIR_ATTR and TX_MKXATTR
 * transactions.
 *
 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
 * domain information appended prior to the name.  In this case the
 * uid/gid in the log record will be a log centric FUID.
 *
 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
 * may contain attributes, ACL and optional fuid information.
 *
 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
 * an ACL and normal users/groups in the ACEs.
 *
 * There may be an optional xvattr attribute information similar
 * to zfs_log_setattr.
 *
 * Also, after the file name "domain" strings may be appended.
 *
 * Record layout, in order: the fixed lr_create_t or lr_acl_create_t,
 * optional xvattr data, optional ACL entries, optional ACE FUIDs,
 * optional domain strings, and finally the file name.
 *
 * Nothing is logged when zilog is NULL.
 */
void
zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
    zfs_fuid_info_t *fuidp, vattr_t *vap)
{
	itx_t *itx;
	uint64_t seq;
	lr_create_t *lr;
	lr_acl_create_t *lracl;
	size_t aclsize = 0;	/* stays 0 for the non-ACL record types */
	size_t xvatsize = 0;
	size_t txsize;
	xvattr_t *xvap = (xvattr_t *)vap;
	void *end;
	size_t lrsize;
	size_t namesize = strlen(name) + 1;
	size_t fuidsz = 0;

	if (zilog == NULL)
		return;

	/*
	 * If we have FUIDs present then add in space for
	 * domains and ACE fuid's if any.
	 */
	if (fuidp) {
		fuidsz += fuidp->z_domain_str_sz;
		fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
	}

	if (vap->va_mask & AT_XVATTR)
		xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);

	if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
	    (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
	    (int)txtype == TX_MKXATTR) {
		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
		lrsize = sizeof (*lr);
	} else {
		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
		txsize =
		    sizeof (lr_acl_create_t) + namesize + fuidsz +
		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
		lrsize = sizeof (lr_acl_create_t);
	}

	itx = zil_itx_create(txtype, txsize);

	lr = (lr_create_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;
	lr->lr_foid = zp->z_id;
	lr->lr_mode = zp->z_phys->zp_mode;
	/* Ephemeral owner/group ids are logged as log-centric FUIDs. */
	if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
		lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
	} else {
		lr->lr_uid = fuidp->z_fuid_owner;
	}
	if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
		lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
	} else {
		lr->lr_gid = fuidp->z_fuid_group;
	}
	lr->lr_gen = zp->z_phys->zp_gen;
	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
	lr->lr_rdev = zp->z_phys->zp_rdev;

	/*
	 * Fill in xvattr info if any
	 */
	if (vap->va_mask & AT_XVATTR) {
		zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
		end = (caddr_t)lr + lrsize + xvatsize;
	} else {
		end = (caddr_t)lr + lrsize;
	}

	/* Now fill in any ACL info */
	if (vsecp) {
		lracl = (lr_acl_create_t *)&itx->itx_lr;
		lracl->lr_aclcnt = vsecp->vsa_aclcnt;
		lracl->lr_acl_bytes = aclsize;
		lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
		lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
		/*
		 * VSA_ACE_ACLFLAGS is a vsa_mask bit saying that
		 * vsa_aclflags is valid (zfs_log_acl tests it the same
		 * way).
		 */
		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
			lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
		else
			lracl->lr_acl_flags = 0;

		bcopy(vsecp->vsa_aclentp, end, aclsize);
		end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
	}

	/* drop in FUID info */
	if (fuidp) {
		end = zfs_log_fuid_ids(fuidp, end);
		end = zfs_log_fuid_domains(fuidp, end);
	}

	/*
	 * Now place file name in log record
	 */
	bcopy(name, end, namesize);

	seq = zil_itx_assign(zilog, itx, tx);
	dzp->z_last_itx = seq;
	zp->z_last_itx = seq;
}
/*
 * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
 *
 * The record is an lr_remove_t carrying the directory's object id,
 * immediately followed by the NUL-terminated name.  Nothing is logged
 * when zilog is NULL.
 */
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, char *name)
{
	size_t name_len = strlen(name) + 1;
	lr_remove_t *lr;
	itx_t *itx;
	char *payload;

	if (zilog == NULL)
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + name_len);
	lr = (lr_remove_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;

	payload = (char *)(lr + 1);
	bcopy(name, payload, name_len);

	dzp->z_last_itx = zil_itx_assign(zilog, itx, tx);
}
/*
 * zfs_log_link() handles TX_LINK transactions.
 *
 * The record is an lr_link_t holding the directory and target object
 * ids, immediately followed by the NUL-terminated link name.  Nothing
 * is logged when zilog is NULL.
 */
void
zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, znode_t *zp, char *name)
{
	size_t name_len = strlen(name) + 1;
	lr_link_t *lr;
	itx_t *itx;
	char *payload;
	uint64_t itx_seq;

	if (zilog == NULL)
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + name_len);
	lr = (lr_link_t *)&itx->itx_lr;
	lr->lr_doid = dzp->z_id;
	lr->lr_link_obj = zp->z_id;

	payload = (char *)(lr + 1);
	bcopy(name, payload, name_len);

	itx_seq = zil_itx_assign(zilog, itx, tx);
	dzp->z_last_itx = itx_seq;
	zp->z_last_itx = itx_seq;
}
/*
 * zfs_log_symlink() handles TX_SYMLINK transactions.
 *
 * The record is an lr_create_t filled from the new znode, followed by
 * the NUL-terminated name and then the NUL-terminated link target.
 * Nothing is logged when zilog is NULL.
 */
void
zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *dzp, znode_t *zp, char *name, char *link)
{
	size_t name_len = strlen(name) + 1;
	size_t link_len = strlen(link) + 1;
	lr_create_t *lr;
	itx_t *itx;
	char *payload;
	uint64_t itx_seq;

	if (zilog == NULL)
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + name_len + link_len);
	lr = (lr_create_t *)&itx->itx_lr;

	lr->lr_doid = dzp->z_id;
	lr->lr_foid = zp->z_id;
	lr->lr_mode = zp->z_phys->zp_mode;
	lr->lr_uid = zp->z_phys->zp_uid;
	lr->lr_gid = zp->z_phys->zp_gid;
	lr->lr_gen = zp->z_phys->zp_gen;
	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];

	/* name first, link target right behind it */
	payload = (char *)(lr + 1);
	bcopy(name, payload, name_len);
	bcopy(link, payload + name_len, link_len);

	itx_seq = zil_itx_assign(zilog, itx, tx);
	dzp->z_last_itx = itx_seq;
	zp->z_last_itx = itx_seq;
}
/*
 * zfs_log_rename() handles TX_RENAME transactions.
 *
 * The record is an lr_rename_t holding the source and target directory
 * object ids, followed by the NUL-terminated source name and then the
 * NUL-terminated destination name.  Nothing is logged when zilog is
 * NULL.
 */
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
	size_t src_len = strlen(sname) + 1;
	size_t dst_len = strlen(dname) + 1;
	lr_rename_t *lr;
	itx_t *itx;
	char *payload;
	uint64_t itx_seq;

	if (zilog == NULL)
		return;

	itx = zil_itx_create(txtype, sizeof (*lr) + src_len + dst_len);
	lr = (lr_rename_t *)&itx->itx_lr;
	lr->lr_sdoid = sdzp->z_id;
	lr->lr_tdoid = tdzp->z_id;

	payload = (char *)(lr + 1);
	bcopy(sname, payload, src_len);
	bcopy(dname, payload + src_len, dst_len);

	itx_seq = zil_itx_assign(zilog, itx, tx);
	sdzp->z_last_itx = itx_seq;
	tdzp->z_last_itx = itx_seq;
	szp->z_last_itx = itx_seq;
}
/*
* zfs_log_write() handles TX_WRITE transactions.
*
* The write is logged as one or more lr_write_t records; it is only
* split when the pool has separate log devices and the write exceeds
* ZIL_MAX_LOG_DATA. Nothing is logged when zilog is NULL or the znode
* has been unlinked.
*/
/* Writes larger than this are logged WR_INDIRECT when there are no slogs. */
ssize_t zfs_immediate_write_sz = 32768;
/* SPA_MAXBLOCKSIZE less the zil_trailer_t and lr_write_t overhead. */
#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
sizeof (lr_write_t))
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
itx_wr_state_t write_state;
boolean_t slogging;
uintptr_t fsync_cnt;
if (zilog == NULL || zp->z_unlinked)
return;
/*
* Writes are handled in three different ways:
*
* WR_INDIRECT:
* If the write is greater than zfs_immediate_write_sz and there are
* no separate logs in this pool then later *if* we need to log the
* write then dmu_sync() is used to immediately write the block and
* its block pointer is put in the log record.
* WR_COPIED:
* If we know we'll immediately be committing the
* transaction (FSYNC or FDSYNC), then we allocate a larger
* log record here for the data and copy the data in.
* WR_NEED_COPY:
* Otherwise we don't allocate a buffer, and *if* we need to
* flush the write later then a buffer is allocated and
* we retrieve the data using the dmu.
*/
slogging = spa_has_slogs(zilog->zl_spa);
if (resid > zfs_immediate_write_sz && !slogging)
write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
else
write_state = WR_NEED_COPY;
/*
* Decrement the per-thread counter stored under zfs_fsyncer_key when it
* is non-zero. fsync_cnt keeps the pre-decrement value and is used below
* to mark the itxs as synchronous.
*/
if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
}
while (resid) {
itx_t *itx;
lr_write_t *lr;
ssize_t len;
/*
* If there are slogs and the write would overflow the largest
* block, then because we don't want to use the main pool
* to dmu_sync, we have to split the write.
*/
if (slogging && resid > ZIL_MAX_LOG_DATA)
len = SPA_MAXBLOCKSIZE >> 1;
else
len = resid;
/* Only WR_COPIED records carry the data inline after the lr_write_t. */
itx = zil_itx_create(txtype, sizeof (*lr) +
(write_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
/*
* If reading the data back fails, discard the oversized itx and fall
* back to WR_NEED_COPY with a header-only record. write_state is not
* reset, so any remaining chunks of this write use WR_NEED_COPY too.
*/
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1) != 0) {
kmem_free(itx, offsetof(itx_t, itx_lr) +
itx->itx_lr.lrc_reclen);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
}
itx->itx_wr_state = write_state;
if (write_state == WR_NEED_COPY)
itx->itx_sod += len;
lr->lr_foid = zp->z_id;
lr->lr_offset = off;
lr->lr_length = len;
lr->lr_blkoff = 0;
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = zp->z_zfsvfs;
if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
(ioflag & (FSYNC | FDSYNC)))
itx->itx_sync = B_TRUE;
else
itx->itx_sync = B_FALSE;
zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
/* advance to the next chunk of the write */
off += len;
resid -= len;
}
}
/*
 * zfs_log_truncate() handles TX_TRUNCATE transactions.
 *
 * Logs an lr_truncate_t with the object id, offset and length.  The
 * itx is marked synchronous when the znode's z_sync_cnt is non-zero.
 * Nothing is logged when zilog is NULL or the znode has been unlinked.
 */
void
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, uint64_t off, uint64_t len)
{
	lr_truncate_t *lr;
	itx_t *itx;

	if (zilog == NULL || zp->z_unlinked)
		return;

	itx = zil_itx_create(txtype, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;

	lr->lr_foid = zp->z_id;
	lr->lr_offset = off;
	lr->lr_length = len;

	itx->itx_sync = (zp->z_sync_cnt != 0);
	zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
}
/*
 * zfs_log_setattr() handles TX_SETATTR transactions.
 *
 * The record is an lr_setattr_t, optionally followed by xvattr data
 * (when AT_XVATTR is set in vap->va_mask) and then by the FUID domain
 * strings (when fuidp is supplied).  An ephemeral uid/gid that was
 * actually applied is logged as the FUID kept in fuidp.  Nothing is
 * logged when zilog is NULL or the znode has been unlinked.
 */
void
zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
	xvattr_t *xvap = (xvattr_t *)vap;
	size_t recsize = sizeof (lr_setattr_t);
	lr_setattr_t *lr;
	itx_t *itx;
	void *tail;

	if (zilog == NULL || zp->z_unlinked)
		return;

	/*
	 * If XVATTR set, then log record size needs to allow
	 * for lr_attr_t + xvattr mask, mapsize and create time
	 * plus actual attribute values
	 */
	if (vap->va_mask & AT_XVATTR)
		recsize += ZIL_XVAT_SIZE(xvap->xva_mapsize);

	if (fuidp)
		recsize += fuidp->z_domain_str_sz;

	itx = zil_itx_create(txtype, recsize);
	lr = (lr_setattr_t *)&itx->itx_lr;

	lr->lr_foid = zp->z_id;
	lr->lr_mask = (uint64_t)mask_applied;
	lr->lr_mode = (uint64_t)vap->va_mode;

	if (!(mask_applied & AT_UID) || !IS_EPHEMERAL(vap->va_uid))
		lr->lr_uid = (uint64_t)vap->va_uid;
	else
		lr->lr_uid = fuidp->z_fuid_owner;

	if (!(mask_applied & AT_GID) || !IS_EPHEMERAL(vap->va_gid))
		lr->lr_gid = (uint64_t)vap->va_gid;
	else
		lr->lr_gid = fuidp->z_fuid_group;

	lr->lr_size = (uint64_t)vap->va_size;
	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);

	/* Variable-length data starts right after the fixed record. */
	tail = lr + 1;
	if (vap->va_mask & AT_XVATTR) {
		zfs_log_xvattr((lr_attr_t *)tail, xvap);
		tail = (caddr_t)tail + ZIL_XVAT_SIZE(xvap->xva_mapsize);
	}

	/*
	 * Now stick on domain information if any on end
	 */
	if (fuidp)
		(void) zfs_log_fuid_domains(fuidp, tail);

	itx->itx_sync = (zp->z_sync_cnt != 0);
	zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
}
/*
 * zfs_log_acl() handles TX_ACL transactions.
 *
 * File systems at ZPL_VERSION_INITIAL log the old TX_ACL_V0 record
 * (lr_acl_v0_t followed by the raw ACL entries).  All others log
 * TX_ACL: an lr_acl_t followed by the ACL entries (padded to
 * ZIL_ACE_LENGTH), the ACE FUIDs and the FUID domain strings.
 *
 * Nothing is logged when zilog is NULL or the znode has been unlinked.
 */
void
zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
    vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
{
	itx_t *itx;
	uint64_t seq;
	lr_acl_v0_t *lrv0;
	lr_acl_t *lr;
	int txtype;
	int lrsize;
	size_t txsize;
	size_t aclbytes = vsecp->vsa_aclentsz;

	txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ?
	    TX_ACL_V0 : TX_ACL;

	if (txtype == TX_ACL)
		lrsize = sizeof (*lr);
	else
		lrsize = sizeof (*lrv0);

	if (zilog == NULL || zp->z_unlinked)
		return;

	/* Each logged ACE FUID occupies one 64-bit word. */
	txsize = lrsize +
	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
	    (fuidp ? fuidp->z_domain_str_sz : 0) +
	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);

	itx = zil_itx_create(txtype, txsize);

	lr = (lr_acl_t *)&itx->itx_lr;
	lr->lr_foid = zp->z_id;
	if (txtype == TX_ACL) {
		lr->lr_acl_bytes = aclbytes;
		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
		else
			lr->lr_acl_flags = 0;
	}
	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;

	if (txtype == TX_ACL_V0) {
		lrv0 = (lr_acl_v0_t *)lr;
		bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
	} else {
		void *start = (ace_t *)(lr + 1);

		bcopy(vsecp->vsa_aclentp, start, aclbytes);

		start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);

		if (fuidp) {
			start = zfs_log_fuid_ids(fuidp, start);
			(void) zfs_log_fuid_domains(fuidp, start);
		}
	}

	itx->itx_sync = (zp->z_sync_cnt != 0);
	seq = zil_itx_assign(zilog, itx, tx);
	zp->z_last_itx = seq;
}
+876
View File
@@ -0,0 +1,876 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)zfs_replay.c 1.7 08/01/14 SMI"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/vfs.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_fuid.h>
#include <sys/spa.h>
#include <sys/zil.h>
#include <sys/byteorder.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/acl.h>
#include <sys/atomic.h>
#include <sys/cred.h>
/*
* Functions to replay ZFS intent log (ZIL) records
* The functions are called through a function vector (zfs_replay_vector)
* which is indexed by the transaction type.
*/
/*
 * Fill in a vattr_t from the raw 64-bit values carried in a log record.
 *
 * The structure is zeroed first, so any field not listed here is 0.
 * An ephemeral uid or gid is replaced by -1; the type is derived from
 * the mode bits, and the device number is expanded with zfs_cmpldev().
 */
static void
zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
    uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
{
	bzero(vap, sizeof (*vap));

	vap->va_mask = (uint_t)mask;
	vap->va_nodeid = nodeid;
	vap->va_rdev = zfs_cmpldev(rdev);

	/* file type and permission bits both come from the logged mode */
	vap->va_type = IFTOVT(mode);
	vap->va_mode = mode & MODEMASK;

	/* ephemeral ids cannot be stored directly in the vattr */
	if (IS_EPHEMERAL(uid))
		vap->va_uid = -1;
	else
		vap->va_uid = uid;

	if (IS_EPHEMERAL(gid))
		vap->va_gid = -1;
	else
		vap->va_gid = gid;
}
/*
 * Replay handler for transaction types that have no replay support.
 * None of the arguments are examined; the result is always ENOTSUP.
 */
/* ARGSUSED */
static int
zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
{
	int error = ENOTSUP;

	return (error);
}
/*
 * Decode the optional (extended) attributes stored in a log record into
 * an xvattr_t.
 *
 * lrattr - start of the lr_attr_t in the log record
 * xvap   - xvattr to fill in; AT_XVATTR is set in its va_mask unless
 *          xva_getxoptattr() returns NULL, in which case the bit is
 *          cleared again and nothing else is touched
 *
 * As read here, the data following the request bitmap is one 64-bit
 * word of XAT0_* flags, two 64-bit words of create time, and then the
 * AV_SCANSTAMP_SZ byte scanstamp.  Only attributes whose bit is set in
 * the copied request map are transferred.
 */
static void
zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
{
	xoptattr_t *xoap = NULL;
	uint64_t *attrs;
	uint64_t *crtime;
	uint32_t *bitmap;
	void *scanstamp;
	int i;

	xvap->xva_vattr.va_mask |= AT_XVATTR;
	if ((xoap = xva_getxoptattr(xvap)) == NULL) {
		xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
		return;
	}

	ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);

	/* copy the request bitmap, one 32-bit word per mask entry */
	bitmap = &lrattr->lr_attr_bitmap;
	for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
		xvap->xva_reqattrmap[i] = *bitmap;

	/*
	 * The attribute words start after the lr_attr_t plus the
	 * (masksize - 1) additional 32-bit mask words.  This is the same
	 * byte offset zfs_replay_swap_attrs() uses; stepping in units of
	 * sizeof (lr_attr_t) instead is not guaranteed to match it for
	 * every mask size.
	 */
	attrs = (uint64_t *)((caddr_t)(lrattr + 1) +
	    (sizeof (uint32_t) * (lrattr->lr_attr_masksize - 1)));
	crtime = attrs + 1;
	scanstamp = (caddr_t)(crtime + 2);

	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
		xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
		xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
		xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_READONLY))
		xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
		xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
		xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
		xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
		xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
		xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
		xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
		xoap->xoa_av_quarantined =
		    ((*attrs & XAT0_AV_QUARANTINED) != 0);
	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
		ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
		bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
}
/*
 * Count how many distinct, non-zero FUID domain indexes the given
 * uid/gid pair refers to.  The result is 0, 1 or 2.
 */
static int
zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
{
	uint64_t owner_idx = FUID_INDEX(uid);
	uint64_t group_idx = FUID_INDEX(gid);
	int count = (owner_idx != 0) ? 1 : 0;

	/* the group only adds a domain if it differs from the owner's */
	if (group_idx != 0 && group_idx != owner_idx)
		count++;

	return (count);
}
/*
 * Point the first domcnt slots of fuid_infop->z_domain_table at the
 * consecutive NUL-terminated strings beginning at start.  The strings
 * are referenced in place, not copied.
 *
 * Returns the address just past the last string's terminator.
 */
static void *
zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
    int domcnt)
{
	char *cursor = start;
	int idx = 0;

	while (idx != domcnt) {
		fuid_infop->z_domain_table[idx] = cursor;
		cursor += strlen(cursor) + 1;
		idx++;
	}

	return (cursor);
}
/*
 * Record the logged owner and/or group in the fuid_info structure.
 *
 * Only ephemeral ids are recorded: z_fuid_owner is set when uid is
 * ephemeral and z_fuid_group when gid is ephemeral.  A field whose id
 * is not ephemeral is left untouched.
 */
static void
zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
{
	if (IS_EPHEMERAL(gid))
		fuid_infop->z_fuid_group = gid;

	if (IS_EPHEMERAL(uid))
		fuid_infop->z_fuid_owner = uid;
}
/*
 * Load fuid domains into fuid_info_t
 *
 * buf      - start of the domain strings in the log record
 * end      - out: address just past the domain strings that were
 *            consumed; equal to buf when the uid/gid pair references
 *            no domain
 * uid, gid - logged owner and group
 *
 * Always returns a freshly allocated zfs_fuid_info_t.  When at least
 * one domain is referenced, its domain table points into buf and the
 * ephemeral owner/group are recorded as well.
 */
static zfs_fuid_info_t *
zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
{
	int domcnt;
	zfs_fuid_info_t *fuid_infop;

	fuid_infop = zfs_fuid_info_alloc();

	domcnt = zfs_replay_domain_cnt(uid, gid);

	if (domcnt == 0) {
		/*
		 * Nothing to consume.  Still report the end position so
		 * the out parameter is defined on every path.
		 */
		*end = buf;
		return (fuid_infop);
	}

	fuid_infop->z_domain_table =
	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);

	zfs_replay_fuid_ugid(fuid_infop, uid, gid);

	fuid_infop->z_domain_cnt = domcnt;
	*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
	return (fuid_infop);
}
/*
 * Build a zfs_fuid_info_t from the FUID section of a log record.
 *
 * start  - first of idcnt 64-bit logged FUIDs, which are followed by
 *          domcnt NUL-terminated domain strings
 * end    - out: address just past the last domain string
 * idcnt  - number of logged FUIDs
 * domcnt - number of domain strings
 * uid, gid - logged owner and group; recorded if ephemeral
 *
 * Each logged FUID becomes a zfs_fuid_t on the z_fuids list with
 * z_id set to -1 and z_domidx set to 0.
 */
static zfs_fuid_info_t *
zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
    uint64_t gid)
{
	zfs_fuid_info_t *info = zfs_fuid_info_alloc();
	uint64_t *cursor = start;
	int n;

	info->z_domain_cnt = domcnt;
	info->z_domain_table =
	    kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);

	for (n = 0; n != idcnt; n++, cursor++) {
		zfs_fuid_t *entry = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);

		entry->z_logfuid = *cursor;
		entry->z_id = -1;
		entry->z_domidx = 0;
		list_insert_tail(&info->z_fuids, entry);
	}

	zfs_replay_fuid_ugid(info, uid, gid);

	/* the domain strings begin right after the last logged FUID */
	*end = zfs_replay_fuid_domain_common(info, cursor, domcnt);
	return (info);
}
/*
 * Byteswap the extended-attribute section of a log record in place.
 *
 * Three regions are swapped in order: the lr_attr_t itself (as 32-bit
 * words), the (masksize - 1) 32-bit bitmap words that follow it, and
 * then three 64-bit words (attribute flags plus create time).
 */
static void
zfs_replay_swap_attrs(lr_attr_t *lrattr)
{
	size_t extra_mask_bytes;

	/* the header must be swapped first so lr_attr_masksize is usable */
	byteswap_uint32_array(lrattr, sizeof (*lrattr));

	extra_mask_bytes = (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t);

	/* remaining bitmap words */
	byteswap_uint32_array(lrattr + 1, extra_mask_bytes);

	/* 64 bit word for attributes + create time */
	byteswap_uint64_array((caddr_t)(lrattr + 1) + extra_mask_bytes,
	    3 * sizeof (uint64_t));
}
/*
 * Replay file create with optional ACL, xvattr information as well
 * as optional FUID information.
 *
 * Handles TX_CREATE_ACL, TX_CREATE_ACL_ATTR, TX_MKDIR_ACL and
 * TX_MKDIR_ACL_ATTR.  As parsed below, the data following the
 * lr_acl_create_t is laid out as:
 *
 *	lr_attr_t + xvattr data		(*_ATTR types only)
 *	ACEs, ZIL_ACE_LENGTH(lr_acl_bytes) bytes
 *	lr_fuidcnt 64-bit logged FUIDs
 *	lr_domcnt NUL-terminated domain strings
 *	name of the object to create
 *
 * zfsvfs   - file system being replayed
 * lracl    - the log record
 * byteswap - B_TRUE if the record must be byteswapped first
 *
 * If dmu_object_info() on the target object does not return ENOENT the
 * create is skipped and that value is returned.  Otherwise returns the
 * result of the VOP call, ENOTSUP for an unexpected transaction type,
 * or the zfs_zget() error for the parent directory.
 */
static int
zfs_replay_create_acl(zfsvfs_t *zfsvfs,
    lr_acl_create_t *lracl, boolean_t byteswap)
{
	char *name = NULL; /* location determined later */
	lr_create_t *lr = (lr_create_t *)lracl;
	znode_t *dzp;
	vnode_t *vp = NULL;
	xvattr_t xva;
	int vflg = 0;
	vsecattr_t vsec = { 0 };
	lr_attr_t *lrattr;
	void *aclstart;
	void *fuidstart;
	size_t xvatlen = 0;
	uint64_t txtype;
	int error;

	if (byteswap) {
		byteswap_uint64_array(lracl, sizeof (*lracl));
		txtype = (int)lr->lr_common.lrc_txtype;
		if (txtype == TX_CREATE_ACL_ATTR ||
		    txtype == TX_MKDIR_ACL_ATTR) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			zfs_replay_swap_attrs(lrattr);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		}

		aclstart = (caddr_t)(lracl + 1) + xvatlen;
		zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
		/* swap fuids */
		if (lracl->lr_fuidcnt) {
			byteswap_uint64_array((caddr_t)aclstart +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
			    lracl->lr_fuidcnt * sizeof (uint64_t));
		}
	}

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time and generation number.  The generic VOP_CREATE()
	 * doesn't have either concept, so we smuggle the values inside
	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;

	/* only proceed when the object does not exist yet */
	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
	if (error != ENOENT)
		goto bail;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	switch ((int)lr->lr_common.lrc_txtype) {
	case TX_CREATE_ACL:
		/* no xvattr section: the ACEs directly follow the record */
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_CREATE_ACL_ATTR:
		/* name is still NULL only when we did not fall through */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			xva.xva_vattr.va_mask |= AT_XVATTR;
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}

		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
		    0, 0, &vp, kcred, vflg, NULL, &vsec);
		break;
	case TX_MKDIR_ACL:
		/* no xvattr section: the ACEs directly follow the record */
		aclstart = (caddr_t)(lracl + 1);
		fuidstart = (caddr_t)aclstart +
		    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
		zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
		    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
		    lr->lr_uid, lr->lr_gid);
		/*FALLTHROUGH*/
	case TX_MKDIR_ACL_ATTR:
		/* name is still NULL only when we did not fall through */
		if (name == NULL) {
			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
			xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
			zfs_replay_xvattr(lrattr, &xva);
		}
		vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
		vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
		vsec.vsa_aclcnt = lracl->lr_aclcnt;
		vsec.vsa_aclentsz = lracl->lr_acl_bytes;
		vsec.vsa_aclflags = lracl->lr_acl_flags;
		if (zfsvfs->z_fuid_replay == NULL) {
			fuidstart = (caddr_t)(lracl + 1) + xvatlen +
			    ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
			zfsvfs->z_fuid_replay =
			    zfs_replay_fuids(fuidstart,
			    (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
			    lr->lr_uid, lr->lr_gid);
		}
		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
		    &vp, kcred, NULL, vflg, &vsec);
		break;
	default:
		error = ENOTSUP;
	}

bail:
	if (error == 0 && vp != NULL)
		VN_RELE(vp);

	VN_RELE(ZTOV(dzp));

	/*
	 * No fuid info has been built if we jumped to bail before the
	 * switch or took the default case.  Guard the free the same way
	 * the other replay functions in this file do, rather than rely
	 * on zfs_fuid_info_free() accepting a NULL pointer.
	 */
	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;

	return (error);
}
/*
 * Replay TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR, TX_MKXATTR
 * and TX_SYMLINK records.
 *
 * As parsed below, the data following the lr_create_t is:
 *
 *	lr_attr_t + xvattr data		(*_ATTR types only)
 *	FUID domain strings		(not for TX_SYMLINK; zero to two,
 *					 depending on lr_uid / lr_gid)
 *	name
 *	link target			(TX_SYMLINK only)
 *
 * zfsvfs   - file system being replayed
 * lr       - the log record
 * byteswap - B_TRUE if the record must be byteswapped first
 *
 * If dmu_object_info() on the target object does not return ENOENT the
 * create is skipped and that value is returned.  Otherwise returns the
 * result of the VOP call, ENOTSUP for an unexpected transaction type,
 * or the zfs_zget() error for the parent directory.
 */
static int
zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
{
	char *name = NULL;   /* location determined later */
	char *link;	/* symlink content follows name */
	znode_t *dzp;
	vnode_t *vp = NULL;
	xvattr_t xva;
	int vflg = 0;
	lr_attr_t *lrattr;
	void *start;
	size_t xvatlen;
	uint64_t txtype;
	int error;

	if (byteswap) {
		byteswap_uint64_array(lr, sizeof (*lr));
		txtype = (int)lr->lr_common.lrc_txtype;
		if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
	}

	if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
		return (error);

	xva_init(&xva);
	zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);

	/*
	 * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
	 * eventually end up in zfs_mknode(), which assigns the object's
	 * creation time and generation number.  The generic VOP_CREATE()
	 * doesn't have either concept, so we smuggle the values inside
	 * the vattr's otherwise unused va_ctime and va_nblocks fields.
	 */
	ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
	xva.xva_vattr.va_nblocks = lr->lr_gen;

	/* only proceed when the object does not exist yet */
	error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
	if (error != ENOENT)
		goto out;

	if (lr->lr_common.lrc_txtype & TX_CI)
		vflg |= FIGNORECASE;

	/*
	 * Symlinks don't have fuid info, and CIFS never creates
	 * symlinks.
	 *
	 * The _ATTR versions will grab the fuid info in their subcases.
	 *
	 * For the remaining types the domain strings (if any) start right
	 * after the record; afterwards start points just past them.
	 */
	if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
	    (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
	    (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
		start = (lr + 1);
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
	}

	switch ((int)lr->lr_common.lrc_txtype) {
	case TX_CREATE_ATTR:
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr(lrattr, &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;
		/*FALLTHROUGH*/
	case TX_CREATE:
		if (name == NULL)
			name = (char *)start;

		error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
		    0, 0, &vp, kcred, vflg, NULL, NULL);
		break;
	case TX_MKDIR_ATTR:
		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
		zfs_replay_xvattr(lrattr, &xva);
		start = (caddr_t)(lr + 1) + xvatlen;
		zfsvfs->z_fuid_replay =
		    zfs_replay_fuid_domain(start, &start,
		    lr->lr_uid, lr->lr_gid);
		name = (char *)start;
		/*FALLTHROUGH*/
	case TX_MKDIR:
		/*
		 * As for TX_CREATE, the name follows whatever domain
		 * strings were consumed before the switch.  Using
		 * (lr + 1) here would pick up the first domain string
		 * instead whenever any are present.
		 */
		if (name == NULL)
			name = (char *)start;

		error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
		    &vp, kcred, NULL, vflg, NULL);
		break;
	case TX_MKXATTR:
		name = (char *)(lr + 1);
		error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
		break;
	case TX_SYMLINK:
		/* no fuid data: name, then link target, follow the record */
		name = (char *)(lr + 1);
		link = name + strlen(name) + 1;
		error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr,
		    link, kcred, NULL, vflg);
		break;
	default:
		error = ENOTSUP;
	}

out:
	if (error == 0 && vp != NULL)
		VN_RELE(vp);

	VN_RELE(ZTOV(dzp));

	/* z_fuid_replay is not set on the TX_SYMLINK and early-out paths */
	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;
	return (error);
}
/*
 * Replay a TX_REMOVE or TX_RMDIR record.
 *
 * The entry name immediately follows the lr_remove_t.  Returns the
 * zfs_zget() error for the parent directory, the VOP result, or
 * ENOTSUP for any other transaction type.
 */
static int
zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
{
	char *entry = (char *)(lr + 1);
	znode_t *dzp;
	int txtype;
	int flags;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	error = zfs_zget(zfsvfs, lr->lr_doid, &dzp);
	if (error != 0)
		return (error);

	flags = (lr->lr_common.lrc_txtype & TX_CI) ? FIGNORECASE : 0;
	txtype = (int)lr->lr_common.lrc_txtype;

	if (txtype == TX_REMOVE)
		error = VOP_REMOVE(ZTOV(dzp), entry, kcred, NULL, flags);
	else if (txtype == TX_RMDIR)
		error = VOP_RMDIR(ZTOV(dzp), entry, NULL, kcred, NULL, flags);
	else
		error = ENOTSUP;

	VN_RELE(ZTOV(dzp));

	return (error);
}
/*
 * Replay a TX_LINK record: create the entry named after the lr_link_t
 * in directory lr_doid, referring to object lr_link_obj.
 *
 * Returns the first zfs_zget() failure, otherwise the VOP_LINK() result.
 */
static int
zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
{
	char *entry = (char *)(lr + 1);
	znode_t *dir_zp;
	znode_t *target_zp;
	int flags = 0;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	error = zfs_zget(zfsvfs, lr->lr_doid, &dir_zp);
	if (error != 0)
		return (error);

	error = zfs_zget(zfsvfs, lr->lr_link_obj, &target_zp);
	if (error == 0) {
		if (lr->lr_common.lrc_txtype & TX_CI)
			flags |= FIGNORECASE;

		error = VOP_LINK(ZTOV(dir_zp), ZTOV(target_zp), entry, kcred,
		    NULL, flags);

		VN_RELE(ZTOV(target_zp));
	}

	/* the directory hold is dropped on both the success and error path */
	VN_RELE(ZTOV(dir_zp));

	return (error);
}
/*
 * Replay a TX_RENAME record.
 *
 * Two NUL-terminated names follow the lr_rename_t: the source name and
 * then the target name.  Returns the first zfs_zget() failure,
 * otherwise the VOP_RENAME() result.
 */
static int
zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
{
	char *from = (char *)(lr + 1);
	char *to = from + strlen(from) + 1;
	znode_t *src_dir;
	znode_t *dst_dir;
	int flags = 0;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	error = zfs_zget(zfsvfs, lr->lr_sdoid, &src_dir);
	if (error != 0)
		return (error);

	error = zfs_zget(zfsvfs, lr->lr_tdoid, &dst_dir);
	if (error == 0) {
		if (lr->lr_common.lrc_txtype & TX_CI)
			flags |= FIGNORECASE;

		error = VOP_RENAME(ZTOV(src_dir), from, ZTOV(dst_dir), to,
		    kcred, NULL, flags);

		VN_RELE(ZTOV(dst_dir));
	}

	/* the source directory hold is dropped on every path */
	VN_RELE(ZTOV(src_dir));

	return (error);
}
/*
 * Replay a TX_WRITE record: write lr_length bytes, taken from directly
 * after the lr_write_t, to object lr_foid at offset lr_offset.
 *
 * A missing file (ENOENT from zfs_zget()) is not an error; see below.
 */
static int
zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
{
	char *payload = (char *)(lr + 1);
	znode_t *zp;
	ssize_t resid;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);

	/*
	 * As we can log writes out of order, it's possible the
	 * file has been removed. In this case just drop the write
	 * and return success.
	 */
	if (error == ENOENT)
		return (0);
	if (error != 0)
		return (error);

	error = vn_rdwr(UIO_WRITE, ZTOV(zp), payload, lr->lr_length,
	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	VN_RELE(ZTOV(zp));

	return (error);
}
/*
 * Replay a TX_TRUNCATE record by issuing VOP_SPACE(F_FREESP) for the
 * logged offset and length on object lr_foid.
 *
 * A missing file (ENOENT from zfs_zget()) is not an error; see below.
 */
static int
zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
{
	flock64_t range;
	znode_t *zp;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);

	/*
	 * As we can log truncates out of order, it's possible the
	 * file has been removed. In this case just drop the truncate
	 * and return success.
	 */
	if (error == ENOENT)
		return (0);
	if (error != 0)
		return (error);

	/* describe the region to free */
	bzero(&range, sizeof (range));
	range.l_start = lr->lr_offset;
	range.l_len = lr->lr_length;
	range.l_whence = 0;
	range.l_type = F_WRLCK;

	error = VOP_SPACE(ZTOV(zp), F_FREESP, &range, FWRITE | FOFFMAX,
	    lr->lr_offset, kcred, NULL);

	VN_RELE(ZTOV(zp));

	return (error);
}
/*
 * Replay a TX_SETATTR record.
 *
 * The lr_setattr_t may be followed by an xvattr section (when AT_XVATTR
 * is set in the logged mask) and then by FUID domain strings for the
 * logged uid/gid.  A missing file (ENOENT from zfs_zget()) is not an
 * error; see below.
 */
static int
zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
{
	xvattr_t xva;
	vattr_t *vap = &xva.xva_vattr;
	znode_t *zp;
	void *cursor;
	int error;

	xva_init(&xva);

	if (byteswap) {
		byteswap_uint64_array(lr, sizeof (*lr));

		if ((lr->lr_mask & AT_XVATTR) &&
		    zfsvfs->z_version >= ZPL_VERSION_INITIAL)
			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
	}

	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);

	/*
	 * As we can log setattrs out of order, it's possible the
	 * file has been removed. In this case just drop the setattr
	 * and return success.
	 */
	if (error == ENOENT)
		return (0);
	if (error != 0)
		return (error);

	zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);

	vap->va_size = lr->lr_size;
	ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);

	/*
	 * Fill in xvattr_t portions if necessary, and step over them so
	 * that cursor ends up at the FUID domain strings.
	 */
	cursor = lr + 1;
	if (vap->va_mask & AT_XVATTR) {
		lr_attr_t *lrattr = cursor;

		zfs_replay_xvattr(lrattr, &xva);
		cursor = (caddr_t)cursor +
		    ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
	} else {
		vap->va_mask &= ~AT_XVATTR;
	}

	zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(cursor, &cursor,
	    lr->lr_uid, lr->lr_gid);

	error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);

	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;
	VN_RELE(ZTOV(zp));

	return (error);
}
/*
 * Replay a TX_ACL_V0 record: apply the lr_aclcnt old-format ACEs that
 * follow the lr_acl_v0_t to object lr_foid via VOP_SETSECATTR().
 *
 * A missing file (ENOENT from zfs_zget()) is not an error; see below.
 */
static int
zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
{
	ace_t *entries = (ace_t *)(lr + 1);
	vsecattr_t vsa;
	znode_t *zp;
	int error;

	if (byteswap) {
		byteswap_uint64_array(lr, sizeof (*lr));
		zfs_oldace_byteswap(entries, lr->lr_aclcnt);
	}

	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);

	/*
	 * As we can log acls out of order, it's possible the
	 * file has been removed. In this case just drop the acl
	 * and return success.
	 */
	if (error == ENOENT)
		return (0);
	if (error != 0)
		return (error);

	bzero(&vsa, sizeof (vsa));
	vsa.vsa_aclentp = entries;
	vsa.vsa_aclcnt = lr->lr_aclcnt;
	vsa.vsa_mask = VSA_ACE | VSA_ACECNT;

	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);

	VN_RELE(ZTOV(zp));

	return (error);
}
/*
 * Replaying ACLs is complicated by FUID support.
 * The log record may contain some optional data
 * to be used for replaying FUID's.  These pieces
 * are the actual FUIDs that were created initially.
 * The FUID table index may no longer be valid and
 * during zfs_create() a new index may be assigned.
 * Because of this the log will contain the original
 * domain+rid in order to create a new FUID.
 *
 * The individual ACEs may contain an ephemeral uid/gid which is no
 * longer valid and will need to be replaced with an actual FUID.
 *
 * A missing file (ENOENT from zfs_zget()) is not an error; see below.
 */
static int
zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
{
	ace_t *entries = (ace_t *)(lr + 1);
	vsecattr_t vsa;
	znode_t *zp;
	int error;

	if (byteswap) {
		byteswap_uint64_array(lr, sizeof (*lr));
		zfs_ace_byteswap(entries, lr->lr_acl_bytes, B_FALSE);

		/* the logged FUIDs sit right after the padded ACE area */
		if (lr->lr_fuidcnt) {
			byteswap_uint64_array((caddr_t)entries +
			    ZIL_ACE_LENGTH(lr->lr_acl_bytes),
			    lr->lr_fuidcnt * sizeof (uint64_t));
		}
	}

	error = zfs_zget(zfsvfs, lr->lr_foid, &zp);

	/*
	 * As we can log acls out of order, it's possible the
	 * file has been removed. In this case just drop the acl
	 * and return success.
	 */
	if (error == ENOENT)
		return (0);
	if (error != 0)
		return (error);

	bzero(&vsa, sizeof (vsa));
	vsa.vsa_aclentp = entries;
	vsa.vsa_aclcnt = lr->lr_aclcnt;
	vsa.vsa_aclentsz = lr->lr_acl_bytes;
	vsa.vsa_aclflags = lr->lr_acl_flags;
	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;

	if (lr->lr_fuidcnt) {
		void *fuid_area = (caddr_t)entries +
		    ZIL_ACE_LENGTH(lr->lr_acl_bytes);

		zfsvfs->z_fuid_replay =
		    zfs_replay_fuids(fuid_area, &fuid_area,
		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
	}

	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);

	/* fuid info only exists when the record carried FUIDs */
	if (zfsvfs->z_fuid_replay)
		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
	zfsvfs->z_fuid_replay = NULL;

	VN_RELE(ZTOV(zp));

	return (error);
}
/*
* Callback vectors for replaying records.
*
* The table is indexed by transaction type, so the position of each
* entry matters: the comment on every line names the transaction type
* that slot is meant to serve. Slot 0 is not a valid transaction type
* and maps to zfs_replay_error(), which always returns ENOTSUP.
* The create-style types are split between zfs_replay_create() and
* zfs_replay_create_acl(), the latter handling the *_ACL and *_ACL_ATTR
* variants.
*/
zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_error, /* 0 no such transaction type */
zfs_replay_create, /* TX_CREATE */
zfs_replay_create, /* TX_MKDIR */
zfs_replay_create, /* TX_MKXATTR */
zfs_replay_create, /* TX_SYMLINK */
zfs_replay_remove, /* TX_REMOVE */
zfs_replay_remove, /* TX_RMDIR */
zfs_replay_link, /* TX_LINK */
zfs_replay_rename, /* TX_RENAME */
zfs_replay_write, /* TX_WRITE */
zfs_replay_truncate, /* TX_TRUNCATE */
zfs_replay_setattr, /* TX_SETATTR */
zfs_replay_acl_v0, /* TX_ACL_V0 */
zfs_replay_acl, /* TX_ACL */
zfs_replay_create_acl, /* TX_CREATE_ACL */
zfs_replay_create, /* TX_CREATE_ATTR */
zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL */
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
};
+602
View File
@@ -0,0 +1,602 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "@(#)zfs_rlock.c 1.4 07/08/08 SMI"
/*
* This file contains the code to implement file range locking in
* ZFS, although there isn't much specific to ZFS (all that comes to mind
* is support for growing the blocksize).
*
* Interface
* ---------
* Defined in zfs_rlock.h but essentially:
* rl = zfs_range_lock(zp, off, len, lock_type);
* zfs_range_unlock(rl);
* zfs_range_reduce(rl, off, len);
*
* AVL tree
* --------
* An AVL tree is used to maintain the state of the existing ranges
* that are locked for exclusive (writer) or shared (reader) use.
* The starting range offset is used for searching and sorting the tree.
*
* Common case
* -----------
* The (hopefully) usual case is of no overlaps or contention for
* locks. On entry to zfs_range_lock() an rl_t is allocated; the tree is
* searched and, when no overlap is found, *this* rl_t is placed in the tree.
*
* Overlaps/Reference counting/Proxy locks
* ---------------------------------------
* The avl code only allows one node at a particular offset. Also it's very
* inefficient to search through all previous entries looking for overlaps
* (because the very 1st in the ordered list might be at offset 0 but
* cover the whole file).
* So this implementation uses reference counts and proxy range locks.
* Firstly, only reader locks use reference counts and proxy locks,
* because writer locks are exclusive.
* When a reader lock overlaps with another then a proxy lock is created
* for that range and replaces the original lock. If the overlap
* is exact then the reference count of the proxy is simply incremented.
* Otherwise, the proxy lock is split into smaller lock ranges and
* new proxy locks created for non overlapping ranges.
* The reference counts are adjusted accordingly.
* Meanwhile, the original lock is kept around (this is the caller's handle)
* and its offset and length are used when releasing the lock.
*
* Thread coordination
* -------------------
* In order to make wakeups efficient and to ensure multiple continuous
* readers on a range don't starve a writer for the same range lock,
* two condition variables are allocated in each rl_t.
* If a writer (or reader) can't get a range it initialises the writer
* (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
* and waits on that cv. When a thread unlocks that range it wakes up all
* writers then all readers before destroying the lock.
*
* Append mode writes
* ------------------
* Append mode writes need to lock a range at the end of a file.
* The offset of the end of the file is determined under the
* range locking mutex, and the lock type converted from RL_APPEND to
* RL_WRITER and the range locked.
*
* Grow block handling
* -------------------
* ZFS supports multiple block sizes, currently up to 128K. The smallest
* block size is used for the file which is grown as needed. During this
* growth all other writers and readers must be excluded.
* So if the block size needs to be grown then the whole file is
* exclusively locked, then later the caller will reduce the lock
* range to just the range to be written using zfs_range_reduce().
*/
#include <sys/zfs_rlock.h>
/*
* Check if a write lock can be grabbed, or wait and recheck until available.
*
* zp  - znode whose range tree (z_range_avl) is searched and updated
* new - the caller's lock request (RL_WRITER or RL_APPEND); on return it
*       is in the tree with r_type set to RL_WRITER. Its r_off/r_len may
*       have been changed by the append and grow handling below.
*
* zfs_range_lock() calls this with zp->z_range_lock held; that mutex is
* the one handed to cv_wait() when the request has to block.
*/
static void
zfs_range_lock_writer(znode_t *zp, rl_t *new)
{
avl_tree_t *tree = &zp->z_range_avl;
rl_t *rl;
avl_index_t where;
uint64_t end_size;
/* remember the requested range so it can be restored after a wait */
uint64_t off = new->r_off;
uint64_t len = new->r_len;
for (;;) {
/*
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
* a z_phys or z_zfsvfs - so skip that processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
* we could make the range locking code generically available
* to other non-zfs consumers.
*/
if (zp->z_vnode) { /* caller is ZPL */
/*
* If in append mode pick up the current end of file.
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
new->r_off = zp->z_phys->zp_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
new->r_off = 0;
new->r_len = UINT64_MAX;
}
}
/*
* First check for the usual case of no locks
*/
if (avl_numnodes(tree) == 0) {
new->r_type = RL_WRITER; /* convert to writer */
avl_add(tree, new);
return;
}
/*
* Look for any locks in the range.
*/
rl = avl_find(tree, new, &where);
if (rl)
goto wait; /* already locked at same offset */
/* the nearest lock after us starts before our range ends */
rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
if (rl && (rl->r_off < new->r_off + new->r_len))
goto wait;
/* the nearest lock before us extends past our start */
rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
if (rl && rl->r_off + rl->r_len > new->r_off)
goto wait;
new->r_type = RL_WRITER; /* convert possible RL_APPEND */
avl_insert(tree, new, where);
return;
wait:
/*
* rl is the conflicting lock. Its writer cv is initialised only
* by the first writer that blocks on it; r_write_wanted records
* that this has happened.
*/
if (!rl->r_write_wanted) {
cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
rl->r_write_wanted = B_TRUE;
}
cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
/* reset to original */
new->r_off = off;
new->r_len = len;
}
}
/*
 * Make sure the given reader lock is represented in the tree by a
 * proxy.
 *
 * If rl already is a proxy it is returned unchanged.  Otherwise rl is
 * removed from the tree with its r_cnt set to 0, and a newly allocated
 * proxy covering the same range (r_cnt 1, RL_READER) is added in its
 * place and returned.
 */
static rl_t *
zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
{
	rl_t *px;

	if (rl->r_proxy)
		return (rl); /* already a proxy */

	ASSERT3U(rl->r_cnt, ==, 1);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);

	/* take the original out of the tree; it stays with its owner */
	avl_remove(tree, rl);
	rl->r_cnt = 0;

	/* build the stand-in covering the same range */
	px = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	px->r_proxy = B_TRUE;
	px->r_type = RL_READER;
	px->r_cnt = 1;
	px->r_off = rl->r_off;
	px->r_len = rl->r_len;
	px->r_read_wanted = B_FALSE;
	px->r_write_wanted = B_FALSE;
	avl_add(tree, px);

	return (px);
}
/*
 * Split the range lock at the supplied offset into two proxies and
 * return the *front* one.
 *
 * off must lie strictly inside rl's range.  The rear proxy covers
 * [off, end of rl) and inherits rl's reference count; the front proxy
 * (rl itself if it already was a proxy, otherwise its replacement) is
 * shortened to end at off.
 */
static rl_t *
zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
{
	rl_t *head;
	rl_t *tail;

	ASSERT3U(rl->r_len, >, 1);
	ASSERT3U(off, >, rl->r_off);
	ASSERT3U(off, <, rl->r_off + rl->r_len);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);

	/*
	 * Build the rear proxy first: it needs rl's length and count as
	 * they are before rl gets proxified and shortened.
	 */
	tail = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	tail->r_proxy = B_TRUE;
	tail->r_type = RL_READER;
	tail->r_cnt = rl->r_cnt;
	tail->r_off = off;
	tail->r_len = rl->r_off + rl->r_len - off;
	tail->r_read_wanted = B_FALSE;
	tail->r_write_wanted = B_FALSE;

	head = zfs_range_proxify(tree, rl);
	head->r_len = off - rl->r_off;

	avl_insert_here(tree, tail, head, AVL_AFTER);

	return (head);
}
/*
 * Allocate a proxy reader lock for [off, off + len) with a reference
 * count of 1 and add it to the tree.  len must be non-zero.
 */
static void
zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	rl_t *px;

	ASSERT(len);

	px = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	px->r_proxy = B_TRUE;
	px->r_type = RL_READER;
	px->r_cnt = 1;
	px->r_off = off;
	px->r_len = len;
	px->r_read_wanted = B_FALSE;
	px->r_write_wanted = B_FALSE;

	avl_add(tree, px);
}
/*
* Add the reader lock 'new' to the tree.
*
* tree  - the range lock tree
* new   - the caller's reader lock
* prev  - see the comment at the top of the function body
* where - insertion point for new, as produced by the caller's avl_find()
*
* If new overlaps no existing entry it is inserted into the tree as is.
* Otherwise new stays out of the tree with r_cnt set to 0 and its range
* is represented by proxies: every overlapped entry is turned into a
* proxy (and split where new's start or end falls inside it) and has its
* r_cnt incremented, and fresh proxies with r_cnt 1 are created for any
* part of the range that no existing entry covers.
*/
static void
zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
{
rl_t *next;
uint64_t off = new->r_off;
uint64_t len = new->r_len;
/*
* prev arrives either:
* - pointing to an entry at the same offset
* - pointing to the entry with the closest previous offset whose
* range may overlap with the new range
* - null, if there were no ranges starting before the new one
*/
if (prev) {
if (prev->r_off + prev->r_len <= off) {
/* prev ends at or before our start: it does not overlap */
prev = NULL;
} else if (prev->r_off != off) {
/*
* convert to proxy if needed then
* split this entry and bump ref count
*/
prev = zfs_range_split(tree, prev, off);
prev = AVL_NEXT(tree, prev); /* move to rear range */
}
}
ASSERT((prev == NULL) || (prev->r_off == off));
/* next is the first existing entry that could overlap the new range */
if (prev)
next = prev;
else
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
if (next == NULL || off + len <= next->r_off) {
/* no overlaps, use the original new rl_t in the tree */
avl_insert(tree, new, where);
return;
}
if (off < next->r_off) {
/* Add a proxy for initial range before the overlap */
zfs_range_new_proxy(tree, off, next->r_off - off);
}
new->r_cnt = 0; /* will use proxies in tree */
/*
* We now search forward through the ranges, until we go past the end
* of the new range. For each entry we make it a proxy if it
* isn't already, then bump its reference count. If there's any
* gaps between the ranges then we create a new proxy range.
*/
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
if (off + len <= next->r_off)
break;
if (prev && prev->r_off + prev->r_len < next->r_off) {
/* there's a gap */
ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
next->r_off - (prev->r_off + prev->r_len));
}
if (off + len == next->r_off + next->r_len) {
/* exact overlap with end */
next = zfs_range_proxify(tree, next);
next->r_cnt++;
return;
}
if (off + len < next->r_off + next->r_len) {
/* new range ends in the middle of this block */
next = zfs_range_split(tree, next, off + len);
next->r_cnt++;
return;
}
/* new range extends beyond this entry: share it and keep going */
ASSERT3U(off + len, >, next->r_off + next->r_len);
next = zfs_range_proxify(tree, next);
next->r_cnt++;
}
/*
* Add the remaining end range. The no-overlap case returned earlier,
* so the loop body ran at least once and prev is the last entry that
* was shared; the new range extends past its end.
*/
zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
(off + len) - (prev->r_off + prev->r_len));
}
/*
* Check if a reader lock can be grabbed, or wait and recheck until available.
*
* zp  - znode whose range tree is searched and updated
* new - the caller's reader lock request
*
* A reader has to wait for any overlapping entry that is a writer lock
* or that already has a writer waiting on it (r_write_wanted). After
* each wait the whole search is restarted from 'retry'. Once no such
* entry overlaps, the lock is added via zfs_range_add_reader().
*
* zfs_range_lock() calls this with zp->z_range_lock held; that mutex is
* the one handed to cv_wait().
*/
static void
zfs_range_lock_reader(znode_t *zp, rl_t *new)
{
avl_tree_t *tree = &zp->z_range_avl;
rl_t *prev, *next;
avl_index_t where;
uint64_t off = new->r_off;
uint64_t len = new->r_len;
/*
* Look for any writer locks in the range.
*/
retry:
/* prev: entry at the same offset, else the nearest one before it */
prev = avl_find(tree, new, &where);
if (prev == NULL)
prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
/*
* Check the previous range for a writer lock overlap.
*/
if (prev && (off < prev->r_off + prev->r_len)) {
if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
/* the reader cv is initialised by the first waiting reader */
if (!prev->r_read_wanted) {
cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
prev->r_read_wanted = B_TRUE;
}
cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
goto retry;
}
/* our range ends inside prev, so no later entry can overlap */
if (off + len < prev->r_off + prev->r_len)
goto got_lock;
}
/*
* Search through the following ranges to see if there's
* any write lock overlap.
*/
if (prev)
next = AVL_NEXT(tree, prev);
else
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
for (; next; next = AVL_NEXT(tree, next)) {
/* this entry starts at or after our end: nothing more overlaps */
if (off + len <= next->r_off)
goto got_lock;
if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
if (!next->r_read_wanted) {
cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
next->r_read_wanted = B_TRUE;
}
cv_wait(&next->r_rd_cv, &zp->z_range_lock);
goto retry;
}
/* our range ends within this entry: stop searching */
if (off + len <= next->r_off + next->r_len)
goto got_lock;
}
got_lock:
/*
* Add the read lock, which may involve splitting existing
* locks and bumping ref counts (r_cnt).
*/
zfs_range_add_reader(tree, new, prev, where);
}
/*
 * Lock a range (offset, length) as either shared (RL_READER)
 * or exclusive (RL_WRITER or RL_APPEND).  Returns the newly allocated
 * range lock structure, which the caller later passes to the unlock
 * or reduce-range routine.
 *
 * The whole operation, including any waiting done by the reader and
 * writer helpers, happens under zp->z_range_lock.
 */
rl_t *
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
{
	rl_t *rl;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	rl->r_zp = zp;
	rl->r_type = type;
	rl->r_off = off;
	rl->r_len = len;
	rl->r_cnt = 1; /* assume it's going to be in the tree */
	rl->r_proxy = B_FALSE;
	rl->r_read_wanted = B_FALSE;
	rl->r_write_wanted = B_FALSE;

	mutex_enter(&zp->z_range_lock);
	if (type != RL_READER) {
		/* RL_WRITER or RL_APPEND */
		zfs_range_lock_writer(zp, rl);
	} else if (avl_numnodes(&zp->z_range_avl) == 0) {
		/* the usual case of no locks: nothing to check against */
		avl_add(&zp->z_range_avl, rl);
	} else {
		zfs_range_lock_reader(zp, rl);
	}
	mutex_exit(&zp->z_range_lock);

	return (rl);
}
/*
* Unlock a reader lock
*
* zp     - znode whose range tree holds the lock
* remove - the caller's reader lock, as returned when the lock was taken;
*          it is always freed here
*
* Any entry removed from the tree has its waiting writers and readers
* woken with cv_broadcast() and the corresponding cv destroyed.
*/
static void
zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
{
avl_tree_t *tree = &zp->z_range_avl;
rl_t *rl, *next;
uint64_t len;
/*
* The common case is when the remove entry is in the tree
* (cnt == 1) meaning there's been no other reader locks overlapping
* with this one. Otherwise the remove entry will have been
* removed from the tree and replaced by proxies (one or
* more ranges mapping to the entire range).
*/
if (remove->r_cnt == 1) {
avl_remove(tree, remove);
if (remove->r_write_wanted) {
cv_broadcast(&remove->r_wr_cv);
cv_destroy(&remove->r_wr_cv);
}
if (remove->r_read_wanted) {
cv_broadcast(&remove->r_rd_cv);
cv_destroy(&remove->r_rd_cv);
}
} else {
ASSERT3U(remove->r_cnt, ==, 0);
ASSERT3U(remove->r_write_wanted, ==, 0);
ASSERT3U(remove->r_read_wanted, ==, 0);
/*
* Find start proxy representing this reader lock,
* then decrement ref count on all proxies
* that make up this range, freeing them as needed.
*/
rl = avl_find(tree, remove, NULL);
ASSERT(rl);
ASSERT(rl->r_cnt);
ASSERT(rl->r_type == RL_READER);
/* len counts down the part of the range not yet released */
for (len = remove->r_len; len != 0; rl = next) {
len -= rl->r_len;
if (len) {
/* fetch the successor now: rl may be freed below */
next = AVL_NEXT(tree, rl);
ASSERT(next);
ASSERT(rl->r_off + rl->r_len == next->r_off);
ASSERT(next->r_cnt);
ASSERT(next->r_type == RL_READER);
}
rl->r_cnt--;
if (rl->r_cnt == 0) {
/* last reference: this proxy goes away */
avl_remove(tree, rl);
if (rl->r_write_wanted) {
cv_broadcast(&rl->r_wr_cv);
cv_destroy(&rl->r_wr_cv);
}
if (rl->r_read_wanted) {
cv_broadcast(&rl->r_rd_cv);
cv_destroy(&rl->r_rd_cv);
}
kmem_free(rl, sizeof (rl_t));
}
}
}
kmem_free(remove, sizeof (rl_t));
}
/*
 * Drop a range lock obtained from zfs_range_lock() and free its rl_t.
 *
 * rl must be a caller-owned (non-proxy) entry of type RL_READER or
 * RL_WRITER with a reference count of 0 or 1; these are asserted.
 * rl must not be used after this call returns.
 */
void
zfs_range_unlock(rl_t *rl)
{
	znode_t *zp = rl->r_zp;

	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
	ASSERT(!rl->r_proxy);

	mutex_enter(&zp->z_range_lock);

	if (rl->r_type != RL_WRITER) {
		/*
		 * A reader lock may be shared, so the bookkeeping (and
		 * the freeing of rl) is left to zfs_range_unlock_reader(),
		 * which runs with the mutex still held.
		 */
		zfs_range_unlock_reader(zp, rl);
		mutex_exit(&zp->z_range_lock);
		return;
	}

	/*
	 * Writer locks are never shared or split, so rl itself is the
	 * tree entry.  Waiters are woken after the mutex is released.
	 */
	avl_remove(&zp->z_range_avl, rl);
	mutex_exit(&zp->z_range_lock);

	if (rl->r_write_wanted) {
		cv_broadcast(&rl->r_wr_cv);
		cv_destroy(&rl->r_wr_cv);
	}
	if (rl->r_read_wanted) {
		cv_broadcast(&rl->r_rd_cv);
		cv_destroy(&rl->r_rd_cv);
	}

	kmem_free(rl, sizeof (rl_t));
}
/*
 * Shrink an RL_WRITER lock that currently covers the whole file down to
 * the range (off, len).
 *
 * Asserts that the whole file is exclusively locked -- offset 0, length
 * UINT64_MAX, not a proxy, reference count 1 -- and that it is therefore
 * the only entry in the znode's range tree.
 *
 * The lock stays in the tree; only its bounds change.  Threads that
 * flagged themselves as waiting on this lock are woken afterwards.
 */
void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
	znode_t *zp = rl->r_zp;

	/* Ensure there are no other locks */
	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
	ASSERT(rl->r_off == 0);
	ASSERT(rl->r_type == RL_WRITER);
	ASSERT(!rl->r_proxy);
	ASSERT3U(rl->r_len, ==, UINT64_MAX);
	ASSERT3U(rl->r_cnt, ==, 1);

	/* Publish the new bounds under the range mutex. */
	mutex_enter(&zp->z_range_lock);
	rl->r_len = len;
	rl->r_off = off;
	mutex_exit(&zp->z_range_lock);

	/* Wake waiters; the condition variables are not destroyed here. */
	if (rl->r_write_wanted) {
		cv_broadcast(&rl->r_wr_cv);
	}
	if (rl->r_read_wanted) {
		cv_broadcast(&rl->r_rd_cv);
	}
}
/*
 * AVL comparison callback that orders range locks by the start offset
 * (r_off) of their range.
 *
 * arg1 and arg2 each point to an rl_t.  Returns 1 if arg1 starts after
 * arg2, -1 if it starts before, and 0 if both start at the same offset.
 */
int
zfs_range_compare(const void *arg1, const void *arg2)
{
	const rl_t *lhs = arg1;
	const rl_t *rhs = arg2;

	/* Each relational test yields 0 or 1, so this is -1, 0 or 1. */
	return ((lhs->r_off > rhs->r_off) - (lhs->r_off < rhs->r_off));
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff