mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 11:47:43 +03:00
Initial Linux ZFS GIT Repo
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
# NOTE: dctl_client.c, dctl_common.c, dctl_server.c, dctl_thrpool.c unused
|
||||
# by kernel port. Potentially they should just be removed if we don't care
|
||||
# about user space Lustre integration from this source base.
|
||||
|
||||
# NOTE: For clarity this directory should simply be renamed libzpl and
|
||||
# the full kernel implementation should be minimally stubbed out.
|
||||
|
||||
subdir-m += include
|
||||
DISTFILES = dctl_client.c dctl_common.c dctl_server.c dctl_thrpool.c
|
||||
DISTFILES += dmu_send.c rrwlock.c zfs_acl.c zfs_ctldir.c
|
||||
DISTFILES += zfs_dir.c zfs_fuid.c zfs_ioctl.c zfs_log.c zfs_replay.c
|
||||
DISTFILES += zfs_rlock.c zfs_vfsops.c zfs_vnops.c zvol.c
|
||||
|
||||
MODULE := zctl
|
||||
|
||||
EXTRA_CFLAGS = @KERNELCPPFLAGS@
|
||||
EXTRA_CFLAGS += -I@LIBDIR@/libzcommon/include
|
||||
EXTRA_CFLAGS += -I@LIBDIR@/libdmu-ctl/include
|
||||
EXTRA_CFLAGS += -I@LIBDIR@/libavl/include
|
||||
EXTRA_CFLAGS += -I@LIBDIR@/libport/include
|
||||
EXTRA_CFLAGS += -I@LIBDIR@/libnvpair/include
|
||||
|
||||
obj-m := ${MODULE}.o
|
||||
|
||||
${MODULE}-objs += zvol.o # Volume emulation interface
|
||||
${MODULE}-objs += zfs_ioctl.o # /dev/zfs_ioctl interface
|
||||
${MODULE}-objs += zfs_vfsops.o
|
||||
${MODULE}-objs += dmu_send.o
|
||||
@@ -0,0 +1,263 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ftw.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <sys/debug.h>
|
||||
|
||||
#include <sys/dmu_ctl.h>
|
||||
#include <sys/dmu_ctl_impl.h>
|
||||
|
||||
/*
 * Try to connect to the UNIX-domain stream socket at `path`.
 *
 * Returns the connected socket descriptor on success, or 0 on any failure
 * (path too long for sun_path, socket() error, connect() error). Zero is
 * the failure value so the result can be returned directly from an nftw()
 * callback, where 0 means "keep walking".
 */
static int try_connect(const char *path)
{
	struct sockaddr_un name;
	size_t path_len = strlen(path);
	int sock;

	/*
	 * A path that does not fit in sun_path would be silently truncated
	 * and we would end up dialling a different file; refuse it instead.
	 */
	if (path_len >= sizeof(name.sun_path))
		return 0;

	sock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (sock == -1) {
		perror("socket");
		return 0;
	}

	/*
	 * The socket fd cannot be 0, otherwise nftw() will not interpret the
	 * return code correctly.
	 */
	VERIFY(sock != 0);

	/* Zero the address first so no stack garbage is handed to connect(). */
	memset(&name, 0, sizeof(name));
	name.sun_family = AF_UNIX;
	memcpy(name.sun_path, path, path_len + 1);

	if (connect(sock, (struct sockaddr *) &name, sizeof(name)) == -1) {
		close(sock);
		return 0;
	}

	return sock;
}
|
||||
|
||||
/*
|
||||
* nftw() callback.
|
||||
*/
|
||||
static int nftw_cb(const char *fpath, const struct stat *sb, int typeflag,
|
||||
struct FTW *ftwbuf)
|
||||
{
|
||||
if (!S_ISSOCK(sb->st_mode))
|
||||
return 0;
|
||||
|
||||
if (strcmp(&fpath[ftwbuf->base], SOCKNAME) != 0)
|
||||
return 0;
|
||||
|
||||
return try_connect(fpath);
|
||||
}
|
||||
|
||||
/*
|
||||
* For convenience, if check_subdirs is true we walk the directory tree to
|
||||
* find a good socket.
|
||||
*/
|
||||
int dctlc_connect(const char *dir, boolean_t check_subdirs)
|
||||
{
|
||||
char *fpath;
|
||||
int fd;
|
||||
|
||||
if (check_subdirs)
|
||||
fd = nftw(dir, nftw_cb, 10, FTW_PHYS);
|
||||
else {
|
||||
fpath = malloc(strlen(dir) + strlen(SOCKNAME) + 2);
|
||||
if (fpath == NULL)
|
||||
return -1;
|
||||
|
||||
strcpy(fpath, dir);
|
||||
strcat(fpath, "/" SOCKNAME);
|
||||
|
||||
fd = try_connect(fpath);
|
||||
|
||||
free(fpath);
|
||||
}
|
||||
|
||||
return fd == 0 ? -1 : fd;
|
||||
}
|
||||
|
||||
/*
 * Tear down a connection obtained from dctlc_connect().
 *
 * The socket is shut down in both directions and then closed, so the
 * descriptor is released and must not be used (or closed again) by the
 * caller afterwards.
 */
void dctlc_disconnect(int fd)
{
	(void) shutdown(fd, SHUT_RDWR);
	/* shutdown() alone does not release the descriptor. */
	(void) close(fd);
}
|
||||
|
||||
/*
 * Answer a DCTL_COPYIN request: send the bytes at the local address named
 * in the command back over fd. Returns the result of dctl_send_data().
 */
static int dctl_reply_copyin(int fd, dctl_cmd_t *cmd)
{
	void *local_buf = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;

	return dctl_send_data(fd, local_buf, cmd->u.dcmd_copy.size);
}
|
||||
|
||||
/*
 * Answer a DCTL_COPYINSTR request.
 *
 * The command names a local string and the size of the destination buffer
 * on the other side. A DCTL_GEN_REPLY is sent carrying the number of string
 * bytes that follow (never more than size - 1) and a status of 0, or
 * ENAMETOOLONG when the string does not fit; the string bytes (without the
 * terminating NUL) are sent afterwards.
 *
 * Returns 0 on success or the error from the send helpers.
 */
static int dctl_reply_copyinstr(int fd, dctl_cmd_t *cmd)
{
	dctl_cmd_t reply;
	const char *from = (const char *)(uintptr_t) cmd->u.dcmd_copy.ptr;
	size_t buflen = cmd->u.dcmd_copy.size;
	size_t to_copy;
	int error;

	/* Zero the reply so no uninitialized stack bytes are transmitted. */
	memset(&reply, 0, sizeof(reply));
	reply.dcmd_msg = DCTL_GEN_REPLY;

	if (buflen == 0) {
		/*
		 * No room even for the terminator; also avoids the
		 * buflen - 1 underflow below.
		 */
		to_copy = 0;
		reply.u.dcmd_reply.rc = ENAMETOOLONG;
	} else {
		to_copy = strnlen(from, buflen - 1);
		reply.u.dcmd_reply.rc =
		    from[to_copy] == '\0' ? 0 : ENAMETOOLONG;
	}
	reply.u.dcmd_reply.size = to_copy;

	error = dctl_send_msg(fd, &reply);

	if (!error && to_copy > 0)
		error = dctl_send_data(fd, from, to_copy);

	return error;
}
|
||||
|
||||
/*
 * Answer a DCTL_COPYOUT request: read the announced number of bytes from
 * fd into the local address named in the command. Returns the result of
 * dctl_read_data().
 */
static int dctl_reply_copyout(int fd, dctl_cmd_t *cmd)
{
	void *local_buf = (void *)(uintptr_t) cmd->u.dcmd_copy.ptr;

	return dctl_read_data(fd, local_buf, cmd->u.dcmd_copy.size);
}
|
||||
|
||||
/*
 * Answer a DCTL_FD_READ request: read() up to the requested number of bytes
 * from the local descriptor named in the command, send a DCTL_GEN_REPLY
 * with the read status and byte count, then send the bytes themselves.
 *
 * Returns 0 on success, EINVAL for a negative request size, ENOMEM if the
 * bounce buffer cannot be allocated, or the error from the send helpers.
 */
static int dctl_reply_fd_read(int fd, dctl_cmd_t *cmd)
{
	dctl_cmd_t reply;
	ssize_t size = cmd->u.dcmd_fd_io.size;
	ssize_t rrc;
	void *buf;
	int read_errno;
	int error;

	if (size < 0)
		return EINVAL;

	/* malloc(0) may return NULL without being out of memory. */
	buf = malloc(size > 0 ? (size_t) size : 1);
	if (buf == NULL)
		return ENOMEM;

	rrc = read(cmd->u.dcmd_fd_io.fd, buf, size);
	read_errno = errno;	/* capture before anything can clobber it */

	/* Zero the reply so no uninitialized stack bytes are transmitted. */
	memset(&reply, 0, sizeof(reply));
	reply.dcmd_msg = DCTL_GEN_REPLY;
	reply.u.dcmd_reply.rc = rrc == -1 ? read_errno : 0;
	reply.u.dcmd_reply.size = rrc == -1 ? 0 : rrc;

	error = dctl_send_msg(fd, &reply);

	if (!error && rrc > 0)
		error = dctl_send_data(fd, buf, rrc);

	free(buf);

	return error;
}
|
||||
|
||||
/*
 * Answer a DCTL_FD_WRITE request: receive the announced number of bytes
 * from fd, write() them to the local descriptor named in the command and
 * send a DCTL_GEN_REPLY with the write status and byte count.
 *
 * Returns 0 on success, EINVAL for a negative request size, ENOMEM if the
 * bounce buffer cannot be allocated, or the error from the socket helpers.
 */
static int dctl_reply_fd_write(int fd, dctl_cmd_t *cmd)
{
	dctl_cmd_t reply;
	ssize_t size = cmd->u.dcmd_fd_io.size;
	ssize_t wrc;
	void *buf;
	int write_errno;
	int error;

	if (size < 0)
		return EINVAL;

	/* malloc(0) may return NULL without being out of memory. */
	buf = malloc(size > 0 ? (size_t) size : 1);
	if (buf == NULL)
		return ENOMEM;

	error = dctl_read_data(fd, buf, size);
	if (error)
		goto out;

	wrc = write(cmd->u.dcmd_fd_io.fd, buf, size);
	write_errno = errno;	/* capture before anything can clobber it */

	/* Zero the reply so no uninitialized stack bytes are transmitted. */
	memset(&reply, 0, sizeof(reply));
	reply.dcmd_msg = DCTL_GEN_REPLY;
	reply.u.dcmd_reply.rc = wrc == -1 ? write_errno : 0;
	reply.u.dcmd_reply.size = wrc == -1 ? 0 : wrc;

	error = dctl_send_msg(fd, &reply);

out:
	free(buf);

	return error;
}
|
||||
|
||||
int dctlc_ioctl(int fd, int32_t request, void *arg)
|
||||
{
|
||||
int error;
|
||||
dctl_cmd_t cmd;
|
||||
|
||||
ASSERT(fd != 0);
|
||||
|
||||
cmd.dcmd_msg = DCTL_IOCTL;
|
||||
|
||||
cmd.u.dcmd_ioctl.cmd = request;
|
||||
cmd.u.dcmd_ioctl.arg = (uintptr_t) arg;
|
||||
|
||||
error = dctl_send_msg(fd, &cmd);
|
||||
|
||||
while (!error && (error = dctl_read_msg(fd, &cmd)) == 0) {
|
||||
switch (cmd.dcmd_msg) {
|
||||
case DCTL_IOCTL_REPLY:
|
||||
error = cmd.u.dcmd_reply.rc;
|
||||
goto out;
|
||||
case DCTL_COPYIN:
|
||||
error = dctl_reply_copyin(fd, &cmd);
|
||||
break;
|
||||
case DCTL_COPYINSTR:
|
||||
error = dctl_reply_copyinstr(fd, &cmd);
|
||||
break;
|
||||
case DCTL_COPYOUT:
|
||||
error = dctl_reply_copyout(fd, &cmd);
|
||||
break;
|
||||
case DCTL_FD_READ:
|
||||
error = dctl_reply_fd_read(fd, &cmd);
|
||||
break;
|
||||
case DCTL_FD_WRITE:
|
||||
error = dctl_reply_fd_write(fd, &cmd);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s(): invalid message "
|
||||
"received.\n", __func__);
|
||||
error = EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
errno = error;
|
||||
return error ? -1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include <sys/dmu_ctl.h>
|
||||
#include <sys/dmu_ctl_impl.h>
|
||||
|
||||
/*
 * Read one dctl_cmd_t message from fd into *cmd.
 *
 * Returns 0 on success, EIO on a bad magic number, ENOTSUP on a protocol
 * version mismatch, or the error from dctl_read_data().
 */
int dctl_read_msg(int fd, dctl_cmd_t *cmd)
{
	int rc;

	/*
	 * Only the magic number and protocol version are fetched first, so
	 * that we do not block forever should dctl_cmd_t shrink in a future
	 * protocol version.
	 */
	rc = dctl_read_data(fd, cmd, DCTL_CMD_HEADER_SIZE);
	if (rc != 0)
		return rc;

	if (cmd->dcmd_magic != DCTL_MAGIC) {
		fprintf(stderr, "%s(): invalid magic number\n", __func__);
		return EIO;
	}

	if (cmd->dcmd_version != DCTL_PROTOCOL_VER) {
		fprintf(stderr, "%s(): invalid protocol version\n", __func__);
		return ENOTSUP;
	}

	/* Header is valid: fetch the remainder of the command. */
	return dctl_read_data(fd, (caddr_t) cmd + DCTL_CMD_HEADER_SIZE,
	    sizeof(dctl_cmd_t) - DCTL_CMD_HEADER_SIZE);
}
|
||||
|
||||
/*
 * Stamp *cmd with the protocol magic number and version, then send the
 * whole structure over fd. Returns the result of dctl_send_data().
 */
int dctl_send_msg(int fd, dctl_cmd_t *cmd)
{
	const size_t msg_len = sizeof(*cmd);

	cmd->dcmd_version = DCTL_PROTOCOL_VER;
	cmd->dcmd_magic = DCTL_MAGIC;

	return dctl_send_data(fd, cmd, msg_len);
}
|
||||
|
||||
/*
 * Receive exactly `size` bytes from fd into ptr.
 *
 * recv() is retried on EINTR and called repeatedly until the whole buffer
 * is filled. Returns 0 on success, ECONNRESET if the peer closed the
 * connection before `size` bytes arrived, or the errno from recv().
 */
int dctl_read_data(int fd, void *ptr, size_t size)
{
	char *cursor = ptr;
	size_t left = size;

	while (left > 0) {
		ssize_t rc = recv(fd, cursor, left, 0);

		/* File descriptor closed */
		if (rc == 0)
			return ECONNRESET;

		if (rc == -1) {
			if (errno == EINTR)
				continue;
			return errno;
		}

		cursor += rc;
		left -= (size_t) rc;
	}

	return 0;
}
|
||||
|
||||
/*
 * Send exactly `size` bytes from ptr over fd.
 *
 * send() is called with MSG_NOSIGNAL, retried on EINTR and repeated after
 * a short send until the whole buffer has been sent. Returns 0 on success
 * and EIO on any failure.
 */
int dctl_send_data(int fd, const void *ptr, size_t size)
{
	const char *cursor = ptr;
	size_t left = size;

	while (left > 0) {
		ssize_t rc = send(fd, cursor, left, MSG_NOSIGNAL);

		if (rc == -1) {
			if (errno == EINTR)
				continue;
			return EIO;
		}

		/* Guard against spinning if nothing can be sent. */
		if (rc == 0)
			return EIO;

		cursor += rc;
		left -= (size_t) rc;
	}

	return 0;
}
|
||||
|
||||
@@ -0,0 +1,476 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <signal.h>
|
||||
#include <limits.h>
|
||||
#include <errno.h>
|
||||
#include <poll.h>
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/debug.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/un.h>
|
||||
#include <sys/list.h>
|
||||
#include <sys/cred.h>
|
||||
|
||||
#include <sys/dmu_ctl.h>
|
||||
#include <sys/dmu_ctl_impl.h>
|
||||
|
||||
static dctl_sock_info_t ctl_sock = {
|
||||
.dsi_mtx = PTHREAD_MUTEX_INITIALIZER,
|
||||
.dsi_fd = -1
|
||||
};
|
||||
|
||||
static int dctl_create_socket_common();
|
||||
|
||||
/*
|
||||
* Routines from zfs_ioctl.c
|
||||
*/
|
||||
extern int zfs_ioctl_init();
|
||||
extern int zfs_ioctl_fini();
|
||||
extern int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
|
||||
int *rvalp);
|
||||
|
||||
/*
|
||||
* We can't simply put the client file descriptor in wthr_info_t because we
|
||||
* have no way of accessing it from the DMU code without extensive
|
||||
* modifications.
|
||||
*
|
||||
* Therefore each worker thread will have its own global thread-specific
|
||||
* client_fd variable.
|
||||
*/
|
||||
static __thread int client_fd = -1;
|
||||
|
||||
int dctls_copyin(const void *src, void *dest, size_t size)
|
||||
{
|
||||
dctl_cmd_t cmd;
|
||||
|
||||
VERIFY(client_fd >= 0);
|
||||
|
||||
cmd.dcmd_msg = DCTL_COPYIN;
|
||||
cmd.u.dcmd_copy.ptr = (uintptr_t) src;
|
||||
cmd.u.dcmd_copy.size = size;
|
||||
|
||||
if (dctl_send_msg(client_fd, &cmd) != 0)
|
||||
return EFAULT;
|
||||
|
||||
if (dctl_read_data(client_fd, dest, size) != 0)
|
||||
return EFAULT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dctls_copyinstr(const char *from, char *to, size_t max, size_t *len)
|
||||
{
|
||||
dctl_cmd_t msg;
|
||||
size_t copied;
|
||||
|
||||
VERIFY(client_fd >= 0);
|
||||
|
||||
if (max == 0)
|
||||
return ENAMETOOLONG;
|
||||
if (max < 0)
|
||||
return EFAULT;
|
||||
|
||||
msg.dcmd_msg = DCTL_COPYINSTR;
|
||||
msg.u.dcmd_copy.ptr = (uintptr_t) from;
|
||||
msg.u.dcmd_copy.size = max;
|
||||
|
||||
if (dctl_send_msg(client_fd, &msg) != 0)
|
||||
return EFAULT;
|
||||
|
||||
if (dctl_read_msg(client_fd, &msg) != 0)
|
||||
return EFAULT;
|
||||
|
||||
if (msg.dcmd_msg != DCTL_GEN_REPLY)
|
||||
return EFAULT;
|
||||
|
||||
copied = msg.u.dcmd_reply.size;
|
||||
|
||||
if (copied >= max)
|
||||
return EFAULT;
|
||||
|
||||
if (copied > 0)
|
||||
if (dctl_read_data(client_fd, to, copied) != 0)
|
||||
return EFAULT;
|
||||
|
||||
to[copied] = '\0';
|
||||
|
||||
if (len != NULL)
|
||||
*len = copied + 1;
|
||||
|
||||
return msg.u.dcmd_reply.rc;
|
||||
}
|
||||
|
||||
int dctls_copyout(const void *src, void *dest, size_t size)
|
||||
{
|
||||
dctl_cmd_t cmd;
|
||||
|
||||
VERIFY(client_fd >= 0);
|
||||
|
||||
cmd.dcmd_msg = DCTL_COPYOUT;
|
||||
cmd.u.dcmd_copy.ptr = (uintptr_t) dest;
|
||||
cmd.u.dcmd_copy.size = size;
|
||||
|
||||
if (dctl_send_msg(client_fd, &cmd) != 0)
|
||||
return EFAULT;
|
||||
|
||||
if (dctl_send_data(client_fd, src, size) != 0)
|
||||
return EFAULT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp)
|
||||
{
|
||||
dctl_cmd_t msg;
|
||||
uint64_t dsize;
|
||||
int error;
|
||||
|
||||
VERIFY(client_fd >= 0);
|
||||
|
||||
msg.dcmd_msg = DCTL_FD_READ;
|
||||
msg.u.dcmd_fd_io.fd = fd;
|
||||
msg.u.dcmd_fd_io.size = len;
|
||||
|
||||
if ((error = dctl_send_msg(client_fd, &msg)) != 0)
|
||||
return error;
|
||||
|
||||
if ((error = dctl_read_msg(client_fd, &msg)) != 0)
|
||||
return error;
|
||||
|
||||
if (msg.dcmd_msg != DCTL_GEN_REPLY)
|
||||
return EIO;
|
||||
|
||||
if (msg.u.dcmd_reply.rc != 0)
|
||||
return msg.u.dcmd_reply.rc;
|
||||
|
||||
dsize = msg.u.dcmd_reply.size;
|
||||
|
||||
if (dsize > 0)
|
||||
error = dctl_read_data(client_fd, buf, dsize);
|
||||
|
||||
*residp = len - dsize;
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
int dctls_fd_write(int fd, const void *src, ssize_t len)
|
||||
{
|
||||
dctl_cmd_t msg;
|
||||
int error;
|
||||
|
||||
VERIFY(client_fd >= 0);
|
||||
|
||||
msg.dcmd_msg = DCTL_FD_WRITE;
|
||||
msg.u.dcmd_fd_io.fd = fd;
|
||||
msg.u.dcmd_fd_io.size = len;
|
||||
|
||||
error = dctl_send_msg(client_fd, &msg);
|
||||
|
||||
if (!error)
|
||||
error = dctl_send_data(client_fd, src, len);
|
||||
|
||||
if (!error)
|
||||
error = dctl_read_msg(client_fd, &msg);
|
||||
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
if (msg.dcmd_msg != DCTL_GEN_REPLY)
|
||||
return EIO;
|
||||
|
||||
if (msg.u.dcmd_reply.rc != 0)
|
||||
return msg.u.dcmd_reply.rc;
|
||||
|
||||
/*
|
||||
* We have to do this because the original upstream code
|
||||
* does not check if residp == len.
|
||||
*/
|
||||
if (msg.u.dcmd_reply.size != len)
|
||||
return EIO;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Handle a new connection */
|
||||
static void dctl_handle_conn(int sock_fd)
|
||||
{
|
||||
dctl_cmd_t cmd;
|
||||
dev_t dev = { 0 };
|
||||
int rc;
|
||||
|
||||
client_fd = sock_fd;
|
||||
|
||||
while (dctl_read_msg(sock_fd, &cmd) == 0) {
|
||||
if (cmd.dcmd_msg != DCTL_IOCTL) {
|
||||
fprintf(stderr, "%s(): unexpected message type.\n",
|
||||
__func__);
|
||||
break;
|
||||
}
|
||||
|
||||
rc = zfsdev_ioctl(dev, cmd.u.dcmd_ioctl.cmd,
|
||||
(intptr_t) cmd.u.dcmd_ioctl.arg, 0, NULL, NULL);
|
||||
|
||||
cmd.dcmd_msg = DCTL_IOCTL_REPLY;
|
||||
cmd.u.dcmd_reply.rc = rc;
|
||||
|
||||
if (dctl_send_msg(sock_fd, &cmd) != 0)
|
||||
break;
|
||||
}
|
||||
close(sock_fd);
|
||||
|
||||
client_fd = -1;
|
||||
}
|
||||
|
||||
/* Main worker thread loop */
|
||||
static void *dctl_thread(void *arg)
|
||||
{
|
||||
wthr_info_t *thr = arg;
|
||||
struct pollfd fds[1];
|
||||
|
||||
fds[0].events = POLLIN;
|
||||
|
||||
pthread_mutex_lock(&ctl_sock.dsi_mtx);
|
||||
|
||||
while (!thr->wthr_exit) {
|
||||
/* Clean-up dead threads */
|
||||
dctl_thr_join();
|
||||
|
||||
/* The file descriptor might change in the thread lifetime */
|
||||
fds[0].fd = ctl_sock.dsi_fd;
|
||||
|
||||
/* Poll socket with 1-second timeout */
|
||||
int rc = poll(fds, 1, 1000);
|
||||
if (rc == 0 || (rc == -1 && errno == EINTR))
|
||||
continue;
|
||||
|
||||
/* Recheck the exit flag */
|
||||
if (thr->wthr_exit)
|
||||
break;
|
||||
|
||||
if (rc == -1) {
|
||||
/* Unknown error, let's try to recreate the socket */
|
||||
close(ctl_sock.dsi_fd);
|
||||
ctl_sock.dsi_fd = -1;
|
||||
|
||||
if (dctl_create_socket_common() != 0)
|
||||
break;
|
||||
|
||||
continue;
|
||||
}
|
||||
ASSERT(rc == 1);
|
||||
|
||||
short rev = fds[0].revents;
|
||||
if (rev == 0)
|
||||
continue;
|
||||
ASSERT(rev == POLLIN);
|
||||
|
||||
/*
|
||||
* At this point there should be a connection ready to be
|
||||
* accepted.
|
||||
*/
|
||||
int client_fd = accept(ctl_sock.dsi_fd, NULL, NULL);
|
||||
/* Many possible errors here, we'll just retry */
|
||||
if (client_fd == -1)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Now lets handle the request. This can take a very
|
||||
* long time (hours even), so we'll let other threads
|
||||
* handle new connections.
|
||||
*/
|
||||
pthread_mutex_unlock(&ctl_sock.dsi_mtx);
|
||||
|
||||
dctl_thr_rebalance(thr, B_FALSE);
|
||||
dctl_handle_conn(client_fd);
|
||||
dctl_thr_rebalance(thr, B_TRUE);
|
||||
|
||||
pthread_mutex_lock(&ctl_sock.dsi_mtx);
|
||||
}
|
||||
pthread_mutex_unlock(&ctl_sock.dsi_mtx);
|
||||
|
||||
dctl_thr_die(thr);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int dctl_create_socket_common()
|
||||
{
|
||||
dctl_sock_info_t *s = &ctl_sock;
|
||||
size_t size;
|
||||
int error;
|
||||
|
||||
ASSERT(s->dsi_fd == -1);
|
||||
|
||||
/*
|
||||
* Unlink old socket, in case it exists.
|
||||
* We don't care about errors here.
|
||||
*/
|
||||
unlink(s->dsi_path);
|
||||
|
||||
/* Create the socket */
|
||||
s->dsi_fd = socket(PF_UNIX, SOCK_STREAM, 0);
|
||||
if (s->dsi_fd == -1) {
|
||||
error = errno;
|
||||
perror("socket");
|
||||
return error;
|
||||
}
|
||||
|
||||
s->dsi_addr.sun_family = AF_UNIX;
|
||||
|
||||
size = sizeof(s->dsi_addr.sun_path) - 1;
|
||||
strncpy(s->dsi_addr.sun_path, s->dsi_path, size);
|
||||
|
||||
s->dsi_addr.sun_path[size] = '\0';
|
||||
|
||||
if (bind(s->dsi_fd, (struct sockaddr *) &s->dsi_addr,
|
||||
sizeof(s->dsi_addr)) != 0) {
|
||||
error = errno;
|
||||
perror("bind");
|
||||
return error;
|
||||
}
|
||||
|
||||
if (listen(s->dsi_fd, LISTEN_BACKLOG) != 0) {
|
||||
error = errno;
|
||||
perror("listen");
|
||||
unlink(s->dsi_path);
|
||||
return error;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dctl_create_socket(const char *cfg_dir)
|
||||
{
|
||||
int error;
|
||||
dctl_sock_info_t *s = &ctl_sock;
|
||||
|
||||
ASSERT(s->dsi_path == NULL);
|
||||
ASSERT(s->dsi_fd == -1);
|
||||
|
||||
int pathsize = strlen(cfg_dir) + strlen(SOCKNAME) + 2;
|
||||
if (pathsize > sizeof(s->dsi_addr.sun_path))
|
||||
return ENAMETOOLONG;
|
||||
|
||||
s->dsi_path = malloc(pathsize);
|
||||
if (s->dsi_path == NULL)
|
||||
return ENOMEM;
|
||||
|
||||
strcpy(s->dsi_path, cfg_dir);
|
||||
strcat(s->dsi_path, "/" SOCKNAME);
|
||||
|
||||
/*
|
||||
* For convenience, create the directory in case it doesn't exist.
|
||||
* We don't care about errors here.
|
||||
*/
|
||||
mkdir(cfg_dir, 0770);
|
||||
|
||||
error = dctl_create_socket_common();
|
||||
|
||||
if (error) {
|
||||
free(s->dsi_path);
|
||||
|
||||
if (s->dsi_fd != -1) {
|
||||
close(s->dsi_fd);
|
||||
s->dsi_fd = -1;
|
||||
}
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static void dctl_destroy_socket()
|
||||
{
|
||||
dctl_sock_info_t *s = &ctl_sock;
|
||||
|
||||
ASSERT(s->dsi_path != NULL);
|
||||
ASSERT(s->dsi_fd != -1);
|
||||
|
||||
close(s->dsi_fd);
|
||||
s->dsi_fd = -1;
|
||||
|
||||
unlink(s->dsi_path);
|
||||
free(s->dsi_path);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the DMU userspace control interface.
|
||||
* This should be called after kernel_init().
|
||||
*
|
||||
* Note that only very rarely we have more than a couple of simultaneous
|
||||
* lzfs/lzpool connections. Since the thread pool grows automatically when all
|
||||
* threads are busy, a good value for min_thr and max_free_thr is 2.
|
||||
*/
|
||||
int dctl_server_init(const char *cfg_dir, int min_thr, int max_free_thr)
|
||||
{
|
||||
int error;
|
||||
|
||||
ASSERT(min_thr > 0);
|
||||
ASSERT(max_free_thr >= min_thr);
|
||||
|
||||
error = zfs_ioctl_init();
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
error = dctl_create_socket(cfg_dir);
|
||||
if (error) {
|
||||
(void) zfs_ioctl_fini();
|
||||
return error;
|
||||
}
|
||||
|
||||
error = dctl_thr_pool_create(min_thr, max_free_thr, dctl_thread);
|
||||
if (error) {
|
||||
(void) zfs_ioctl_fini();
|
||||
dctl_destroy_socket();
|
||||
return error;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Terminate control interface.
 * This should be called after closing all objsets, but before calling
 * kernel_fini().
 * May return EBUSY if the SPA is busy.
 *
 * The thread pool is stopped first, then the socket is destroyed; the
 * return value is whatever zfs_ioctl_fini() reports.
 *
 * Thread pool destruction can take a while due to poll()
 * timeout or due to a thread being busy (e.g. a backup is being taken).
 */
int dctl_server_fini()
{
	int rc;

	dctl_thr_pool_stop();
	dctl_destroy_socket();

	rc = zfs_ioctl_fini();

	return rc;
}
|
||||
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
#include <time.h>
|
||||
#include <pthread.h>
|
||||
#include <errno.h>
|
||||
#include <sys/list.h>
|
||||
#include <sys/debug.h>
|
||||
|
||||
#include <sys/dmu_ctl.h>
|
||||
#include <sys/dmu_ctl_impl.h>
|
||||
|
||||
static dctl_thr_info_t thr_pool = {
|
||||
.dti_mtx = PTHREAD_MUTEX_INITIALIZER
|
||||
};
|
||||
|
||||
/*
|
||||
* Create n threads.
|
||||
* Callers must acquire thr_pool.dti_mtx first.
|
||||
*/
|
||||
static int dctl_thr_create(int n)
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
int error;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
wthr_info_t *thr = malloc(sizeof(wthr_info_t));
|
||||
if (thr == NULL)
|
||||
return ENOMEM;
|
||||
|
||||
thr->wthr_exit = B_FALSE;
|
||||
thr->wthr_free = B_TRUE;
|
||||
|
||||
error = pthread_create(&thr->wthr_id, NULL, p->dti_thr_func,
|
||||
thr);
|
||||
if (error) {
|
||||
free(thr);
|
||||
return error;
|
||||
}
|
||||
|
||||
p->dti_free++;
|
||||
|
||||
list_insert_tail(&p->dti_list, thr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the thread as dead.
|
||||
* Must be called right before exiting the main thread function.
|
||||
*/
|
||||
void dctl_thr_die(wthr_info_t *thr)
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
|
||||
thr->wthr_exit = B_TRUE;
|
||||
dctl_thr_rebalance(thr, B_FALSE);
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
|
||||
list_remove(&p->dti_list, thr);
|
||||
list_insert_tail(&p->dti_join_list, thr);
|
||||
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean-up dead threads.
|
||||
*/
|
||||
void dctl_thr_join()
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
wthr_info_t *thr;
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
|
||||
while ((thr = list_head(&p->dti_join_list))) {
|
||||
list_remove(&p->dti_join_list, thr);
|
||||
|
||||
ASSERT(!pthread_equal(thr->wthr_id, pthread_self()));
|
||||
|
||||
/*
|
||||
* This should not block because all the threads
|
||||
* on this list should have died already.
|
||||
*
|
||||
* pthread_join() can only return an error if
|
||||
* we made a programming mistake.
|
||||
*/
|
||||
VERIFY(pthread_join(thr->wthr_id, NULL) == 0);
|
||||
|
||||
ASSERT(thr->wthr_exit);
|
||||
ASSERT(!thr->wthr_free);
|
||||
|
||||
free(thr);
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust the number of free threads in the pool and the thread status.
|
||||
*
|
||||
* Callers must acquire thr_pool.dti_mtx first.
|
||||
*/
|
||||
static void dctl_thr_adjust_free(wthr_info_t *thr, boolean_t set_free)
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
|
||||
ASSERT(p->dti_free >= 0);
|
||||
|
||||
if (!thr->wthr_free && set_free)
|
||||
p->dti_free++;
|
||||
else if (thr->wthr_free && !set_free)
|
||||
p->dti_free--;
|
||||
|
||||
ASSERT(p->dti_free >= 0);
|
||||
|
||||
thr->wthr_free = set_free;
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebalance threads. Also adjusts the free status of the thread.
|
||||
* Will set the thread exit flag if the number of free threads is above
|
||||
* the limit.
|
||||
*/
|
||||
void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free)
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
|
||||
if (p->dti_exit || p->dti_free > p->dti_max_free)
|
||||
thr->wthr_exit = B_TRUE;
|
||||
|
||||
if (thr->wthr_exit)
|
||||
set_free = B_FALSE;
|
||||
|
||||
dctl_thr_adjust_free(thr, set_free);
|
||||
|
||||
if (!p->dti_exit && p->dti_free == 0)
|
||||
dctl_thr_create(1);
|
||||
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Stop the thread pool.
|
||||
*
|
||||
* This can take a while since it actually waits for all threads to exit.
|
||||
*/
|
||||
void dctl_thr_pool_stop()
|
||||
{
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
wthr_info_t *thr;
|
||||
struct timespec ts;
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
|
||||
ASSERT(!p->dti_exit);
|
||||
p->dti_exit = B_TRUE;
|
||||
|
||||
/* Let's flag the threads first */
|
||||
thr = list_head(&p->dti_list);
|
||||
while (thr != NULL) {
|
||||
thr->wthr_exit = B_TRUE;
|
||||
dctl_thr_adjust_free(thr, B_FALSE);
|
||||
|
||||
thr = list_next(&p->dti_list, thr);
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
|
||||
/* Now let's wait for them to exit */
|
||||
ts.tv_sec = 0;
|
||||
ts.tv_nsec = 50000000; /* 50ms */
|
||||
do {
|
||||
nanosleep(&ts, NULL);
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
thr = list_head(&p->dti_list);
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
|
||||
dctl_thr_join();
|
||||
} while(thr != NULL);
|
||||
|
||||
ASSERT(p->dti_free == 0);
|
||||
|
||||
ASSERT(list_is_empty(&p->dti_list));
|
||||
ASSERT(list_is_empty(&p->dti_join_list));
|
||||
|
||||
list_destroy(&p->dti_list);
|
||||
list_destroy(&p->dti_join_list);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create thread pool.
|
||||
*
|
||||
* If at least one thread creation fails, it will stop all previous
|
||||
* threads and return a non-zero value.
|
||||
*/
|
||||
int dctl_thr_pool_create(int min_thr, int max_free_thr,
|
||||
thr_func_t *thr_func)
|
||||
{
|
||||
int error;
|
||||
dctl_thr_info_t *p = &thr_pool;
|
||||
|
||||
ASSERT(p->dti_free == 0);
|
||||
|
||||
/* Initialize global variables */
|
||||
p->dti_min = min_thr;
|
||||
p->dti_max_free = max_free_thr;
|
||||
p->dti_exit = B_FALSE;
|
||||
p->dti_thr_func = thr_func;
|
||||
|
||||
list_create(&p->dti_list, sizeof(wthr_info_t), offsetof(wthr_info_t,
|
||||
wthr_node));
|
||||
list_create(&p->dti_join_list, sizeof(wthr_info_t),
|
||||
offsetof(wthr_info_t, wthr_node));
|
||||
|
||||
pthread_mutex_lock(&p->dti_mtx);
|
||||
error = dctl_thr_create(min_thr);
|
||||
pthread_mutex_unlock(&p->dti_mtx);
|
||||
|
||||
if (error)
|
||||
dctl_thr_pool_stop();
|
||||
|
||||
return error;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1 @@
|
||||
subdir-m += sys
|
||||
@@ -0,0 +1 @@
|
||||
DISTFILES = dmu_ctl.h dmu_ctl_impl.h
|
||||
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_DMU_CTL_H
|
||||
#define _SYS_DMU_CTL_H
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
/* Default directory where the clients search for sockets to connect */
|
||||
#define DMU_CTL_DEFAULT_DIR "/var/run/zfs/udmu"
|
||||
|
||||
/*
|
||||
* These functions are called by the server process.
|
||||
*
|
||||
* kernel_init() must be called before dctl_server_init().
|
||||
* kernel_fini() must not be called before dctl_server_fini().
|
||||
*
|
||||
* All objsets must be closed and object references be released before calling
|
||||
* dctl_server_fini(), otherwise it will return EBUSY.
|
||||
*
|
||||
* Note: On Solaris, it is highly recommended to either catch or ignore the
|
||||
* SIGPIPE signal, otherwise the server process will die if the client is
|
||||
* killed.
|
||||
*/
|
||||
int dctl_server_init(const char *cfg_dir, int min_threads,
|
||||
int max_free_threads);
|
||||
int dctl_server_fini();
|
||||
|
||||
/*
|
||||
* The following functions are called by the DMU from the server process context
|
||||
* (in the worker threads).
|
||||
*/
|
||||
int dctls_copyin(const void *src, void *dest, size_t size);
|
||||
int dctls_copyinstr(const char *from, char *to, size_t max,
|
||||
size_t *len);
|
||||
int dctls_copyout(const void *src, void *dest, size_t size);
|
||||
int dctls_fd_read(int fd, void *buf, ssize_t len, ssize_t *residp);
|
||||
int dctls_fd_write(int fd, const void *src, ssize_t len);
|
||||
|
||||
/*
|
||||
* These functions are called by the client process (libzfs).
|
||||
*/
|
||||
int dctlc_connect(const char *dir, boolean_t check_subdirs);
|
||||
void dctlc_disconnect(int fd);
|
||||
|
||||
int dctlc_ioctl(int fd, int32_t request, void *arg);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License, Version 1.0 only
|
||||
* (the "License"). You may not use this file except in compliance
|
||||
* with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_DMU_CTL_IMPL_H
#define _SYS_DMU_CTL_IMPL_H

#include <sys/list.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <pthread.h>

#define SOCKNAME "dmu_socket"

#define DCTL_PROTOCOL_VER 1
#define DCTL_MAGIC 0xdc71b1070c01dc71ll

/* Message types */
enum {
	DCTL_IOCTL,
	DCTL_IOCTL_REPLY,
	DCTL_COPYIN,
	DCTL_COPYINSTR,
	DCTL_COPYOUT,
	DCTL_FD_READ,
	DCTL_FD_WRITE,
	DCTL_GEN_REPLY /* generic reply */
};

/*
 * On-the-wire message.
 *
 * The fixed part (magic, version, message type, padding) is followed by a
 * union holding the per-message payload; every payload variant is laid out
 * with explicit padding.
 */
typedef struct dctl_cmd {
	uint64_t dcmd_magic;
	int8_t dcmd_version;
	int8_t dcmd_msg;	/* one of the DCTL_* message types above */
	uint8_t dcmd_pad[6];
	union {
		struct dcmd_ioctl {
			uint64_t arg;
			int32_t cmd;
			uint8_t pad[4];
		} dcmd_ioctl;

		struct dcmd_copy_req {
			uint64_t ptr;
			uint64_t size;
		} dcmd_copy;

		struct dcmd_fd_req {
			int64_t size;
			int32_t fd;
			uint8_t pad[4];
		} dcmd_fd_io;

		struct dcmd_reply {
			uint64_t size;	/* used by reply to DCTL_COPYINSTR,
					   DCTL_FD_READ and DCTL_FD_WRITE */
			int32_t rc;	/* return code */
			uint8_t pad[4];
		} dcmd_reply;
	} u;
} dctl_cmd_t;

/*
 * Size of one uint64_t plus one uint8_t; presumably the dcmd_magic and
 * dcmd_version fields at the start of dctl_cmd_t -- confirm against the
 * message reading code.
 */
#define DCTL_CMD_HEADER_SIZE (sizeof(uint64_t) + sizeof(uint8_t))

/*
 * The following definitions are only used by the server code.
 */

#define LISTEN_BACKLOG 5

/* Worker thread data */
typedef struct wthr_info {
	list_node_t wthr_node;
	pthread_t wthr_id;
	boolean_t wthr_exit; /* termination flag */
	boolean_t wthr_free;
} wthr_info_t;

/* Control socket data */
typedef struct dctl_sock_info {
	pthread_mutex_t dsi_mtx;
	char *dsi_path;
	struct sockaddr_un dsi_addr;
	int dsi_fd;
} dctl_sock_info_t;

/* Entry point run by each worker thread */
typedef void *thr_func_t(void *);

/* Thread pool data */
typedef struct dctl_thr_info {
	thr_func_t *dti_thr_func;

	pthread_mutex_t dti_mtx; /* protects the thread lists and dti_free */
	list_t dti_list; /* list of threads in the thread pool */
	list_t dti_join_list; /* list of threads that are waiting to be
	                         joined */
	int dti_free; /* number of free worker threads */

	int dti_min;
	int dti_max_free;

	boolean_t dti_exit; /* global termination flag */
} dctl_thr_info_t;

/* Messaging functions */
int dctl_read_msg(int fd, dctl_cmd_t *cmd);
int dctl_send_msg(int fd, dctl_cmd_t *cmd);

int dctl_read_data(int fd, void *ptr, size_t size);
int dctl_send_data(int fd, const void *ptr, size_t size);

/*
 * Thread pool functions.  The argument-less ones are declared with (void)
 * so that they are real prototypes.
 */
int dctl_thr_pool_create(int min_thr, int max_free_thr,
    thr_func_t *thr_func);
void dctl_thr_pool_stop(void);

void dctl_thr_join(void);
void dctl_thr_die(wthr_info_t *thr);
void dctl_thr_rebalance(wthr_info_t *thr, boolean_t set_free);

#endif /* _SYS_DMU_CTL_IMPL_H */
|
||||
@@ -0,0 +1,249 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)rrwlock.c 1.1 07/10/24 SMI"
|
||||
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/rrwlock.h>
|
||||
|
||||
/*
|
||||
* This file contains the implementation of a re-entrant read
|
||||
* reader/writer lock (aka "rrwlock").
|
||||
*
|
||||
* This is a normal reader/writer lock with the additional feature
|
||||
* of allowing threads who have already obtained a read lock to
|
||||
* re-enter another read lock (re-entrant read) - even if there are
|
||||
* waiting writers.
|
||||
*
|
||||
* Callers who have not obtained a read lock give waiting writers priority.
|
||||
*
|
||||
* The rrwlock_t lock does not allow re-entrant writers, nor does it
|
||||
* allow a re-entrant mix of reads and writes (that is, it does not
|
||||
* allow a caller who has already obtained a read lock to be able to
|
||||
* then grab a write lock without first dropping all read locks, and
|
||||
* vice versa).
|
||||
*
|
||||
* The rrwlock_t uses tsd (thread specific data) to keep a list of
|
||||
* nodes (rrw_node_t), where each node keeps track of which specific
|
||||
* lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
|
||||
* should be rare, a thread that grabs multiple reads on the same rrwlock_t
|
||||
* will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
|
||||
* tsd list can represent a different rrwlock_t. This allows a thread
|
||||
* to enter multiple and unique rrwlock_ts for read locks at the same time.
|
||||
*
|
||||
* Since using tsd exposes some overhead, the rrwlock_t only needs to
|
||||
* keep tsd data when writers are waiting. If no writers are waiting, then
|
||||
* a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
|
||||
* is needed. Once a writer attempts to grab the lock, readers then
|
||||
* keep tsd data and bump the linked readers count (rr_linked_rcount).
|
||||
*
|
||||
* If there are waiting writers and there are anonymous readers, then a
|
||||
* reader doesn't know if it is a re-entrant lock. But since it may be one,
|
||||
* we allow the read to proceed (otherwise it could deadlock). Since once
|
||||
* waiting writers are active, readers no longer bump the anonymous count,
|
||||
* the anonymous readers will eventually flush themselves out. At this point,
|
||||
* readers will be able to tell if they are a re-entrant lock (have a
|
||||
* rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
|
||||
* we must let them proceed. If they are not, then the reader blocks for the
|
||||
* waiting writers. Hence, we do not starve writers.
|
||||
*/
|
||||
|
||||
/* global key for TSD */
|
||||
uint_t rrw_tsd_key;
|
||||
|
||||
typedef struct rrw_node {
|
||||
struct rrw_node *rn_next;
|
||||
rrwlock_t *rn_rrl;
|
||||
} rrw_node_t;
|
||||
|
||||
static rrw_node_t *
|
||||
rrn_find(rrwlock_t *rrl)
|
||||
{
|
||||
rrw_node_t *rn;
|
||||
|
||||
if (refcount_count(&rrl->rr_linked_rcount) == 0)
|
||||
return (NULL);
|
||||
|
||||
for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
|
||||
if (rn->rn_rrl == rrl)
|
||||
return (rn);
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a node to the head of the singly linked list.
|
||||
*/
|
||||
static void
|
||||
rrn_add(rrwlock_t *rrl)
|
||||
{
|
||||
rrw_node_t *rn;
|
||||
|
||||
rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
|
||||
rn->rn_rrl = rrl;
|
||||
rn->rn_next = tsd_get(rrw_tsd_key);
|
||||
VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* If a node is found for 'rrl', then remove the node from this
|
||||
* thread's list and return TRUE; otherwise return FALSE.
|
||||
*/
|
||||
static boolean_t
|
||||
rrn_find_and_remove(rrwlock_t *rrl)
|
||||
{
|
||||
rrw_node_t *rn;
|
||||
rrw_node_t *prev = NULL;
|
||||
|
||||
if (refcount_count(&rrl->rr_linked_rcount) == 0)
|
||||
return (NULL);
|
||||
|
||||
for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
|
||||
if (rn->rn_rrl == rrl) {
|
||||
if (prev)
|
||||
prev->rn_next = rn->rn_next;
|
||||
else
|
||||
VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
|
||||
kmem_free(rn, sizeof (*rn));
|
||||
return (B_TRUE);
|
||||
}
|
||||
prev = rn;
|
||||
}
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
rrw_init(rrwlock_t *rrl)
|
||||
{
|
||||
mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
|
||||
rrl->rr_writer = NULL;
|
||||
refcount_create(&rrl->rr_anon_rcount);
|
||||
refcount_create(&rrl->rr_linked_rcount);
|
||||
rrl->rr_writer_wanted = B_FALSE;
|
||||
}
|
||||
|
||||
void
|
||||
rrw_destroy(rrwlock_t *rrl)
|
||||
{
|
||||
mutex_destroy(&rrl->rr_lock);
|
||||
cv_destroy(&rrl->rr_cv);
|
||||
ASSERT(rrl->rr_writer == NULL);
|
||||
refcount_destroy(&rrl->rr_anon_rcount);
|
||||
refcount_destroy(&rrl->rr_linked_rcount);
|
||||
}
|
||||
|
||||
static void
|
||||
rrw_enter_read(rrwlock_t *rrl, void *tag)
|
||||
{
|
||||
mutex_enter(&rrl->rr_lock);
|
||||
ASSERT(rrl->rr_writer != curthread);
|
||||
ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
|
||||
|
||||
while (rrl->rr_writer || (rrl->rr_writer_wanted &&
|
||||
refcount_is_zero(&rrl->rr_anon_rcount) &&
|
||||
rrn_find(rrl) == NULL))
|
||||
cv_wait(&rrl->rr_cv, &rrl->rr_lock);
|
||||
|
||||
if (rrl->rr_writer_wanted) {
|
||||
/* may or may not be a re-entrant enter */
|
||||
rrn_add(rrl);
|
||||
(void) refcount_add(&rrl->rr_linked_rcount, tag);
|
||||
} else {
|
||||
(void) refcount_add(&rrl->rr_anon_rcount, tag);
|
||||
}
|
||||
ASSERT(rrl->rr_writer == NULL);
|
||||
mutex_exit(&rrl->rr_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
rrw_enter_write(rrwlock_t *rrl)
|
||||
{
|
||||
mutex_enter(&rrl->rr_lock);
|
||||
ASSERT(rrl->rr_writer != curthread);
|
||||
|
||||
while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
|
||||
refcount_count(&rrl->rr_linked_rcount) > 0 ||
|
||||
rrl->rr_writer != NULL) {
|
||||
rrl->rr_writer_wanted = B_TRUE;
|
||||
cv_wait(&rrl->rr_cv, &rrl->rr_lock);
|
||||
}
|
||||
rrl->rr_writer_wanted = B_FALSE;
|
||||
rrl->rr_writer = curthread;
|
||||
mutex_exit(&rrl->rr_lock);
|
||||
}
|
||||
|
||||
void
|
||||
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
|
||||
{
|
||||
if (rw == RW_READER)
|
||||
rrw_enter_read(rrl, tag);
|
||||
else
|
||||
rrw_enter_write(rrl);
|
||||
}
|
||||
|
||||
void
|
||||
rrw_exit(rrwlock_t *rrl, void *tag)
|
||||
{
|
||||
mutex_enter(&rrl->rr_lock);
|
||||
ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
|
||||
!refcount_is_zero(&rrl->rr_linked_rcount) ||
|
||||
rrl->rr_writer != NULL);
|
||||
|
||||
if (rrl->rr_writer == NULL) {
|
||||
if (rrn_find_and_remove(rrl)) {
|
||||
if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
|
||||
cv_broadcast(&rrl->rr_cv);
|
||||
|
||||
} else {
|
||||
if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
|
||||
cv_broadcast(&rrl->rr_cv);
|
||||
}
|
||||
} else {
|
||||
ASSERT(rrl->rr_writer == curthread);
|
||||
ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
|
||||
refcount_is_zero(&rrl->rr_linked_rcount));
|
||||
rrl->rr_writer = NULL;
|
||||
cv_broadcast(&rrl->rr_cv);
|
||||
}
|
||||
mutex_exit(&rrl->rr_lock);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
rrw_held(rrwlock_t *rrl, krw_t rw)
|
||||
{
|
||||
boolean_t held;
|
||||
|
||||
mutex_enter(&rrl->rr_lock);
|
||||
if (rw == RW_WRITER) {
|
||||
held = (rrl->rr_writer == curthread);
|
||||
} else {
|
||||
held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
|
||||
!refcount_is_zero(&rrl->rr_linked_rcount));
|
||||
}
|
||||
mutex_exit(&rrl->rr_lock);
|
||||
|
||||
return (held);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,968 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)zfs_dir.c 1.25 08/04/27 SMI"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/vfs.h>
|
||||
#include <sys/vnode.h>
|
||||
#include <sys/file.h>
|
||||
#include <sys/mode.h>
|
||||
#include <sys/kmem.h>
|
||||
#include <sys/uio.h>
|
||||
#include <sys/pathname.h>
|
||||
#include <sys/cmn_err.h>
|
||||
#include <sys/errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/unistd.h>
|
||||
#include <sys/sunddi.h>
|
||||
#include <sys/random.h>
|
||||
#include <sys/policy.h>
|
||||
#include <sys/zfs_dir.h>
|
||||
#include <sys/zfs_acl.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
#include "fs/fs_subr.h"
|
||||
#include <sys/zap.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/atomic.h>
|
||||
#include <sys/zfs_ctldir.h>
|
||||
#include <sys/zfs_fuid.h>
|
||||
#include <sys/dnlc.h>
|
||||
#include <sys/extdirent.h>
|
||||
|
||||
/*
|
||||
* zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
|
||||
* of names after deciding which is the appropriate lookup interface.
|
||||
*/
|
||||
static int
|
||||
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
|
||||
boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (zfsvfs->z_norm) {
|
||||
matchtype_t mt = MT_FIRST;
|
||||
boolean_t conflict = B_FALSE;
|
||||
size_t bufsz = 0;
|
||||
char *buf = NULL;
|
||||
|
||||
if (rpnp) {
|
||||
buf = rpnp->pn_buf;
|
||||
bufsz = rpnp->pn_bufsize;
|
||||
}
|
||||
if (exact)
|
||||
mt = MT_EXACT;
|
||||
/*
|
||||
* In the non-mixed case we only expect there would ever
|
||||
* be one match, but we need to use the normalizing lookup.
|
||||
*/
|
||||
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
|
||||
zoid, mt, buf, bufsz, &conflict);
|
||||
if (!error && deflags)
|
||||
*deflags = conflict ? ED_CASE_CONFLICT : 0;
|
||||
} else {
|
||||
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
|
||||
}
|
||||
*zoid = ZFS_DIRENT_OBJ(*zoid);
|
||||
|
||||
if (error == ENOENT && update)
|
||||
dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock a directory entry. A dirlock on <dzp, name> protects that name
|
||||
* in dzp's directory zap object. As long as you hold a dirlock, you can
|
||||
* assume two things: (1) dzp cannot be reaped, and (2) no other thread
|
||||
* can change the zap entry for (i.e. link or unlink) this name.
|
||||
*
|
||||
* Input arguments:
|
||||
* dzp - znode for directory
|
||||
* name - name of entry to lock
|
||||
* flag - ZNEW: if the entry already exists, fail with EEXIST.
|
||||
* ZEXISTS: if the entry does not exist, fail with ENOENT.
|
||||
* ZSHARED: allow concurrent access with other ZSHARED callers.
|
||||
* ZXATTR: we want dzp's xattr directory
|
||||
* ZCILOOK: On a mixed sensitivity file system,
|
||||
* this lookup should be case-insensitive.
|
||||
* ZCIEXACT: On a purely case-insensitive file system,
|
||||
* this lookup should be case-sensitive.
|
||||
* ZRENAMING: we are locking for renaming, force narrow locks
|
||||
*
|
||||
* Output arguments:
|
||||
* zpp - pointer to the znode for the entry (NULL if there isn't one)
|
||||
* dlpp - pointer to the dirlock for this entry (NULL on error)
|
||||
* direntflags - (case-insensitive lookup only)
|
||||
* flags if multiple case-sensitive matches exist in directory
|
||||
* realpnp - (case-insensitive lookup only)
|
||||
* actual name matched within the directory
|
||||
*
|
||||
* Return value: 0 on success or errno on failure.
|
||||
*
|
||||
* NOTE: Always checks for, and rejects, '.' and '..'.
|
||||
* NOTE: For case-insensitive file systems we take wide locks (see below),
|
||||
* but return znode pointers to a single match.
|
||||
*/
|
||||
int
|
||||
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
|
||||
int flag, int *direntflags, pathname_t *realpnp)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
|
||||
zfs_dirlock_t *dl;
|
||||
boolean_t update;
|
||||
boolean_t exact;
|
||||
uint64_t zoid;
|
||||
vnode_t *vp = NULL;
|
||||
int error = 0;
|
||||
int cmpflags;
|
||||
|
||||
*zpp = NULL;
|
||||
*dlpp = NULL;
|
||||
|
||||
/*
|
||||
* Verify that we are not trying to lock '.', '..', or '.zfs'
|
||||
*/
|
||||
if (name[0] == '.' &&
|
||||
(name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
|
||||
zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
|
||||
return (EEXIST);
|
||||
|
||||
/*
|
||||
* Case sensitivity and normalization preferences are set when
|
||||
* the file system is created. These are stored in the
|
||||
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
|
||||
* affect what vnodes can be cached in the DNLC, how we
|
||||
* perform zap lookups, and the "width" of our dirlocks.
|
||||
*
|
||||
* A normal dirlock locks a single name. Note that with
|
||||
* normalization a name can be composed multiple ways, but
|
||||
* when normalized, these names all compare equal. A wide
|
||||
* dirlock locks multiple names. We need these when the file
|
||||
* system is supporting mixed-mode access. It is sometimes
|
||||
* necessary to lock all case permutations of file name at
|
||||
* once so that simultaneous case-insensitive/case-sensitive
|
||||
* behaves as rationally as possible.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Decide if exact matches should be requested when performing
|
||||
* a zap lookup on file systems supporting case-insensitive
|
||||
* access.
|
||||
*/
|
||||
exact =
|
||||
((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
|
||||
((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
|
||||
|
||||
/*
|
||||
* Only look in or update the DNLC if we are looking for the
|
||||
* name on a file system that does not require normalization
|
||||
* or case folding. We can also look there if we happen to be
|
||||
* on a non-normalizing, mixed sensitivity file system IF we
|
||||
* are looking for the exact name.
|
||||
*
|
||||
* Maybe can add TO-UPPERed version of name to dnlc in ci-only
|
||||
* case for performance improvement?
|
||||
*/
|
||||
update = !zfsvfs->z_norm ||
|
||||
((zfsvfs->z_case == ZFS_CASE_MIXED) &&
|
||||
!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
|
||||
|
||||
/*
|
||||
* ZRENAMING indicates we are in a situation where we should
|
||||
* take narrow locks regardless of the file system's
|
||||
* preferences for normalizing and case folding. This will
|
||||
* prevent us deadlocking trying to grab the same wide lock
|
||||
* twice if the two names happen to be case-insensitive
|
||||
* matches.
|
||||
*/
|
||||
if (flag & ZRENAMING)
|
||||
cmpflags = 0;
|
||||
else
|
||||
cmpflags = zfsvfs->z_norm;
|
||||
|
||||
/*
|
||||
* Wait until there are no locks on this name.
|
||||
*/
|
||||
rw_enter(&dzp->z_name_lock, RW_READER);
|
||||
mutex_enter(&dzp->z_lock);
|
||||
for (;;) {
|
||||
if (dzp->z_unlinked) {
|
||||
mutex_exit(&dzp->z_lock);
|
||||
rw_exit(&dzp->z_name_lock);
|
||||
return (ENOENT);
|
||||
}
|
||||
for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
|
||||
if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
|
||||
U8_UNICODE_LATEST, &error) == 0) || error != 0)
|
||||
break;
|
||||
}
|
||||
if (error != 0) {
|
||||
mutex_exit(&dzp->z_lock);
|
||||
rw_exit(&dzp->z_name_lock);
|
||||
return (ENOENT);
|
||||
}
|
||||
if (dl == NULL) {
|
||||
/*
|
||||
* Allocate a new dirlock and add it to the list.
|
||||
*/
|
||||
dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
|
||||
cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
|
||||
dl->dl_name = name;
|
||||
dl->dl_sharecnt = 0;
|
||||
dl->dl_namesize = 0;
|
||||
dl->dl_dzp = dzp;
|
||||
dl->dl_next = dzp->z_dirlocks;
|
||||
dzp->z_dirlocks = dl;
|
||||
break;
|
||||
}
|
||||
if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
|
||||
break;
|
||||
cv_wait(&dl->dl_cv, &dzp->z_lock);
|
||||
}
|
||||
|
||||
if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
|
||||
/*
|
||||
* We're the second shared reference to dl. Make a copy of
|
||||
* dl_name in case the first thread goes away before we do.
|
||||
* Note that we initialize the new name before storing its
|
||||
* pointer into dl_name, because the first thread may load
|
||||
* dl->dl_name at any time. He'll either see the old value,
|
||||
* which is his, or the new shared copy; either is OK.
|
||||
*/
|
||||
dl->dl_namesize = strlen(dl->dl_name) + 1;
|
||||
name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
|
||||
bcopy(dl->dl_name, name, dl->dl_namesize);
|
||||
dl->dl_name = name;
|
||||
}
|
||||
|
||||
mutex_exit(&dzp->z_lock);
|
||||
|
||||
/*
|
||||
* We have a dirlock on the name. (Note that it is the dirlock,
|
||||
* not the dzp's z_lock, that protects the name in the zap object.)
|
||||
* See if there's an object by this name; if so, put a hold on it.
|
||||
*/
|
||||
if (flag & ZXATTR) {
|
||||
zoid = dzp->z_phys->zp_xattr;
|
||||
error = (zoid == 0 ? ENOENT : 0);
|
||||
} else {
|
||||
if (update)
|
||||
vp = dnlc_lookup(ZTOV(dzp), name);
|
||||
if (vp == DNLC_NO_VNODE) {
|
||||
VN_RELE(vp);
|
||||
error = ENOENT;
|
||||
} else if (vp) {
|
||||
if (flag & ZNEW) {
|
||||
zfs_dirent_unlock(dl);
|
||||
VN_RELE(vp);
|
||||
return (EEXIST);
|
||||
}
|
||||
*dlpp = dl;
|
||||
*zpp = VTOZ(vp);
|
||||
return (0);
|
||||
} else {
|
||||
error = zfs_match_find(zfsvfs, dzp, name, exact,
|
||||
update, direntflags, realpnp, &zoid);
|
||||
}
|
||||
}
|
||||
if (error) {
|
||||
if (error != ENOENT || (flag & ZEXISTS)) {
|
||||
zfs_dirent_unlock(dl);
|
||||
return (error);
|
||||
}
|
||||
} else {
|
||||
if (flag & ZNEW) {
|
||||
zfs_dirent_unlock(dl);
|
||||
return (EEXIST);
|
||||
}
|
||||
error = zfs_zget(zfsvfs, zoid, zpp);
|
||||
if (error) {
|
||||
zfs_dirent_unlock(dl);
|
||||
return (error);
|
||||
}
|
||||
if (!(flag & ZXATTR) && update)
|
||||
dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
|
||||
}
|
||||
|
||||
*dlpp = dl;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlock this directory entry and wake anyone who was waiting for it.
|
||||
*/
|
||||
void
|
||||
zfs_dirent_unlock(zfs_dirlock_t *dl)
|
||||
{
|
||||
znode_t *dzp = dl->dl_dzp;
|
||||
zfs_dirlock_t **prev_dl, *cur_dl;
|
||||
|
||||
mutex_enter(&dzp->z_lock);
|
||||
rw_exit(&dzp->z_name_lock);
|
||||
if (dl->dl_sharecnt > 1) {
|
||||
dl->dl_sharecnt--;
|
||||
mutex_exit(&dzp->z_lock);
|
||||
return;
|
||||
}
|
||||
prev_dl = &dzp->z_dirlocks;
|
||||
while ((cur_dl = *prev_dl) != dl)
|
||||
prev_dl = &cur_dl->dl_next;
|
||||
*prev_dl = dl->dl_next;
|
||||
cv_broadcast(&dl->dl_cv);
|
||||
mutex_exit(&dzp->z_lock);
|
||||
|
||||
if (dl->dl_namesize != 0)
|
||||
kmem_free(dl->dl_name, dl->dl_namesize);
|
||||
cv_destroy(&dl->dl_cv);
|
||||
kmem_free(dl, sizeof (*dl));
|
||||
}
|
||||
|
||||
/*
|
||||
* Look up an entry in a directory.
|
||||
*
|
||||
* NOTE: '.' and '..' are handled as special cases because
|
||||
* no directory entries are actually stored for them. If this is
|
||||
* the root of a filesystem, then '.zfs' is also treated as a
|
||||
* special pseudo-directory.
|
||||
*/
|
||||
int
|
||||
zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
|
||||
int *deflg, pathname_t *rpnp)
|
||||
{
|
||||
zfs_dirlock_t *dl;
|
||||
znode_t *zp;
|
||||
int error = 0;
|
||||
|
||||
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
|
||||
*vpp = ZTOV(dzp);
|
||||
VN_HOLD(*vpp);
|
||||
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
|
||||
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
|
||||
/*
|
||||
* If we are a snapshot mounted under .zfs, return
|
||||
* the vp for the snapshot directory.
|
||||
*/
|
||||
if (dzp->z_phys->zp_parent == dzp->z_id &&
|
||||
zfsvfs->z_parent != zfsvfs) {
|
||||
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
|
||||
"snapshot", vpp, NULL, 0, NULL, kcred,
|
||||
NULL, NULL, NULL);
|
||||
return (error);
|
||||
}
|
||||
rw_enter(&dzp->z_parent_lock, RW_READER);
|
||||
error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
|
||||
if (error == 0)
|
||||
*vpp = ZTOV(zp);
|
||||
rw_exit(&dzp->z_parent_lock);
|
||||
} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
|
||||
*vpp = zfsctl_root(dzp);
|
||||
} else {
|
||||
int zf;
|
||||
|
||||
zf = ZEXISTS | ZSHARED;
|
||||
if (flags & FIGNORECASE)
|
||||
zf |= ZCILOOK;
|
||||
|
||||
error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
|
||||
if (error == 0) {
|
||||
*vpp = ZTOV(zp);
|
||||
zfs_dirent_unlock(dl);
|
||||
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
|
||||
}
|
||||
rpnp = NULL;
|
||||
}
|
||||
|
||||
if ((flags & FIGNORECASE) && rpnp && !error)
|
||||
(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
 * Format 'x' as lower-case hexadecimal without leading zeros.
 *
 * The digits are written right-aligned into the caller's 17-byte buffer
 * (16 hex digits for a 64-bit value plus the terminating NUL).  The return
 * value points at the first digit inside namebuf, which is generally not
 * the start of the buffer.  A value of 0 yields "0".
 */
static char *
zfs_unlinked_hexname(char namebuf[17], uint64_t x)
{
	/* Shared read-only table; no need to rebuild it on every call. */
	static const char digits[] = "0123456789abcdef";
	char *name = &namebuf[16];

	*name = '\0';
	do {
		*--name = digits[x & 0xf];
		x >>= 4;
	} while (x != 0);

	return (name);
}
|
||||
|
||||
/*
|
||||
* unlinked Set (formerly known as the "delete queue") Error Handling
|
||||
*
|
||||
* When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
|
||||
* don't specify the name of the entry that we will be manipulating. We
|
||||
* also fib and say that we won't be adding any new entries to the
|
||||
* unlinked set, even though we might (this is to lower the minimum file
|
||||
* size that can be deleted in a full filesystem). So on the small
|
||||
* chance that the nlink list is using a fat zap (ie. has more than
|
||||
* 2000 entries), we *may* not pre-read a block that's needed.
|
||||
* Therefore it is remotely possible for some of the assertions
|
||||
* regarding the unlinked set below to fail due to i/o error. On a
|
||||
* nondebug system, this will result in the space being leaked.
|
||||
*/
|
||||
void
|
||||
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
|
||||
char obj_name[17];
|
||||
int error;
|
||||
|
||||
ASSERT(zp->z_unlinked);
|
||||
ASSERT3U(zp->z_phys->zp_links, ==, 0);
|
||||
|
||||
error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
|
||||
zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
|
||||
ASSERT3U(error, ==, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean up any znodes that had no links when we either crashed or
|
||||
* (force) umounted the file system.
|
||||
*/
|
||||
void
|
||||
zfs_unlinked_drain(zfsvfs_t *zfsvfs)
|
||||
{
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t zap;
|
||||
dmu_object_info_t doi;
|
||||
znode_t *zp;
|
||||
int error;
|
||||
|
||||
/*
|
||||
* Interate over the contents of the unlinked set.
|
||||
*/
|
||||
for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
|
||||
zap_cursor_retrieve(&zc, &zap) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
|
||||
/*
|
||||
* See what kind of object we have in list
|
||||
*/
|
||||
|
||||
error = dmu_object_info(zfsvfs->z_os,
|
||||
zap.za_first_integer, &doi);
|
||||
if (error != 0)
|
||||
continue;
|
||||
|
||||
ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
|
||||
(doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
|
||||
/*
|
||||
* We need to re-mark these list entries for deletion,
|
||||
* so we pull them back into core and set zp->z_unlinked.
|
||||
*/
|
||||
error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
|
||||
|
||||
/*
|
||||
* We may pick up znodes that are already marked for deletion.
|
||||
* This could happen during the purge of an extended attribute
|
||||
* directory. All we need to do is skip over them, since they
|
||||
* are already in the system marked z_unlinked.
|
||||
*/
|
||||
if (error != 0)
|
||||
continue;
|
||||
|
||||
zp->z_unlinked = B_TRUE;
|
||||
VN_RELE(ZTOV(zp));
|
||||
}
|
||||
zap_cursor_fini(&zc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Delete the entire contents of a directory. Return a count
|
||||
* of the number of entries that could not be deleted. If we encounter
|
||||
* an error, return a count of at least one so that the directory stays
|
||||
* in the unlinked set.
|
||||
*
|
||||
* NOTE: this function assumes that the directory is inactive,
|
||||
* so there is no need to lock its entries before deletion.
|
||||
* Also, it assumes the directory contents is *only* regular
|
||||
* files.
|
||||
*/
|
||||
static int
|
||||
zfs_purgedir(znode_t *dzp)
|
||||
{
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t zap;
|
||||
znode_t *xzp;
|
||||
dmu_tx_t *tx;
|
||||
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
|
||||
zfs_dirlock_t dl;
|
||||
int skipped = 0;
|
||||
int error;
|
||||
|
||||
for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
|
||||
(error = zap_cursor_retrieve(&zc, &zap)) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
error = zfs_zget(zfsvfs,
|
||||
ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
|
||||
if (error) {
|
||||
skipped += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
ASSERT((ZTOV(xzp)->v_type == VREG) ||
|
||||
(ZTOV(xzp)->v_type == VLNK));
|
||||
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, dzp->z_id);
|
||||
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
|
||||
dmu_tx_hold_bonus(tx, xzp->z_id);
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
||||
error = dmu_tx_assign(tx, TXG_WAIT);
|
||||
if (error) {
|
||||
dmu_tx_abort(tx);
|
||||
VN_RELE(ZTOV(xzp));
|
||||
skipped += 1;
|
||||
continue;
|
||||
}
|
||||
bzero(&dl, sizeof (dl));
|
||||
dl.dl_dzp = dzp;
|
||||
dl.dl_name = zap.za_name;
|
||||
|
||||
error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
|
||||
if (error)
|
||||
skipped += 1;
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
VN_RELE(ZTOV(xzp));
|
||||
}
|
||||
zap_cursor_fini(&zc);
|
||||
if (error != ENOENT)
|
||||
skipped += 1;
|
||||
return (skipped);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_rmnode(znode_t *zp)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
|
||||
objset_t *os = zfsvfs->z_os;
|
||||
znode_t *xzp = NULL;
|
||||
char obj_name[17];
|
||||
dmu_tx_t *tx;
|
||||
uint64_t acl_obj;
|
||||
int error;
|
||||
|
||||
ASSERT(ZTOV(zp)->v_count == 0);
|
||||
ASSERT(zp->z_phys->zp_links == 0);
|
||||
|
||||
/*
|
||||
* If this is an attribute directory, purge its contents.
|
||||
*/
|
||||
if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
|
||||
if (zfs_purgedir(zp) != 0) {
|
||||
/*
|
||||
* Not enough space to delete some xattrs.
|
||||
* Leave it on the unlinked set.
|
||||
*/
|
||||
zfs_znode_dmu_fini(zp);
|
||||
zfs_znode_free(zp);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the file has extended attributes, we're going to unlink
|
||||
* the xattr dir.
|
||||
*/
|
||||
if (zp->z_phys->zp_xattr) {
|
||||
error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
|
||||
ASSERT(error == 0);
|
||||
}
|
||||
|
||||
acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
|
||||
|
||||
/*
|
||||
* Set up the transaction.
|
||||
*/
|
||||
tx = dmu_tx_create(os);
|
||||
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
||||
if (xzp) {
|
||||
dmu_tx_hold_bonus(tx, xzp->z_id);
|
||||
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
|
||||
}
|
||||
if (acl_obj)
|
||||
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
|
||||
error = dmu_tx_assign(tx, TXG_WAIT);
|
||||
if (error) {
|
||||
/*
|
||||
* Not enough space to delete the file. Leave it in the
|
||||
* unlinked set, leaking it until the fs is remounted (at
|
||||
* which point we'll call zfs_unlinked_drain() to process it).
|
||||
*/
|
||||
dmu_tx_abort(tx);
|
||||
zfs_znode_dmu_fini(zp);
|
||||
zfs_znode_free(zp);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (xzp) {
|
||||
dmu_buf_will_dirty(xzp->z_dbuf, tx);
|
||||
mutex_enter(&xzp->z_lock);
|
||||
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
|
||||
xzp->z_phys->zp_links = 0; /* no more links to it */
|
||||
mutex_exit(&xzp->z_lock);
|
||||
zfs_unlinked_add(xzp, tx);
|
||||
}
|
||||
|
||||
/* Remove this znode from the unlinked set */
|
||||
error = zap_remove(os, zfsvfs->z_unlinkedobj,
|
||||
zfs_unlinked_hexname(obj_name, zp->z_id), tx);
|
||||
ASSERT3U(error, ==, 0);
|
||||
|
||||
zfs_znode_delete(zp, tx);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
out:
|
||||
if (xzp)
|
||||
VN_RELE(ZTOV(xzp));
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
zfs_dirent(znode_t *zp)
|
||||
{
|
||||
uint64_t de = zp->z_id;
|
||||
if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
|
||||
de |= IFTODT((zp)->z_phys->zp_mode) << 60;
|
||||
return (de);
|
||||
}
|
||||
|
||||
/*
|
||||
* Link zp into dl. Can only fail if zp has been unlinked.
|
||||
*/
|
||||
int
|
||||
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
|
||||
{
|
||||
znode_t *dzp = dl->dl_dzp;
|
||||
vnode_t *vp = ZTOV(zp);
|
||||
uint64_t value;
|
||||
int zp_is_dir = (vp->v_type == VDIR);
|
||||
int error;
|
||||
|
||||
dmu_buf_will_dirty(zp->z_dbuf, tx);
|
||||
mutex_enter(&zp->z_lock);
|
||||
|
||||
if (!(flag & ZRENAMING)) {
|
||||
if (zp->z_unlinked) { /* no new links to unlinked zp */
|
||||
ASSERT(!(flag & (ZNEW | ZEXISTS)));
|
||||
mutex_exit(&zp->z_lock);
|
||||
return (ENOENT);
|
||||
}
|
||||
zp->z_phys->zp_links++;
|
||||
}
|
||||
zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
|
||||
|
||||
if (!(flag & ZNEW))
|
||||
zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
|
||||
mutex_exit(&zp->z_lock);
|
||||
|
||||
dmu_buf_will_dirty(dzp->z_dbuf, tx);
|
||||
mutex_enter(&dzp->z_lock);
|
||||
dzp->z_phys->zp_size++; /* one dirent added */
|
||||
dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
|
||||
zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
|
||||
mutex_exit(&dzp->z_lock);
|
||||
|
||||
value = zfs_dirent(zp);
|
||||
error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
|
||||
8, 1, &value, tx);
|
||||
ASSERT(error == 0);
|
||||
|
||||
dnlc_update(ZTOV(dzp), dl->dl_name, vp);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlink zp from dl, and mark zp for deletion if this was the last link.
|
||||
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
|
||||
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
|
||||
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
|
||||
* and it's the caller's job to do it.
|
||||
*/
|
||||
int
|
||||
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
|
||||
boolean_t *unlinkedp)
|
||||
{
|
||||
znode_t *dzp = dl->dl_dzp;
|
||||
vnode_t *vp = ZTOV(zp);
|
||||
int zp_is_dir = (vp->v_type == VDIR);
|
||||
boolean_t unlinked = B_FALSE;
|
||||
int error;
|
||||
|
||||
dnlc_remove(ZTOV(dzp), dl->dl_name);
|
||||
|
||||
if (!(flag & ZRENAMING)) {
|
||||
dmu_buf_will_dirty(zp->z_dbuf, tx);
|
||||
|
||||
if (vn_vfswlock(vp)) /* prevent new mounts on zp */
|
||||
return (EBUSY);
|
||||
|
||||
if (vn_ismntpt(vp)) { /* don't remove mount point */
|
||||
vn_vfsunlock(vp);
|
||||
return (EBUSY);
|
||||
}
|
||||
|
||||
mutex_enter(&zp->z_lock);
|
||||
if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
|
||||
mutex_exit(&zp->z_lock);
|
||||
vn_vfsunlock(vp);
|
||||
return (EEXIST);
|
||||
}
|
||||
if (zp->z_phys->zp_links <= zp_is_dir) {
|
||||
zfs_panic_recover("zfs: link count on %s is %u, "
|
||||
"should be at least %u",
|
||||
zp->z_vnode->v_path ? zp->z_vnode->v_path :
|
||||
"<unknown>", (int)zp->z_phys->zp_links,
|
||||
zp_is_dir + 1);
|
||||
zp->z_phys->zp_links = zp_is_dir + 1;
|
||||
}
|
||||
if (--zp->z_phys->zp_links == zp_is_dir) {
|
||||
zp->z_unlinked = B_TRUE;
|
||||
zp->z_phys->zp_links = 0;
|
||||
unlinked = B_TRUE;
|
||||
} else {
|
||||
zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
|
||||
}
|
||||
mutex_exit(&zp->z_lock);
|
||||
vn_vfsunlock(vp);
|
||||
}
|
||||
|
||||
dmu_buf_will_dirty(dzp->z_dbuf, tx);
|
||||
mutex_enter(&dzp->z_lock);
|
||||
dzp->z_phys->zp_size--; /* one dirent removed */
|
||||
dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
|
||||
zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
|
||||
mutex_exit(&dzp->z_lock);
|
||||
|
||||
if (zp->z_zfsvfs->z_norm) {
|
||||
if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
|
||||
(flag & ZCIEXACT)) ||
|
||||
((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
|
||||
!(flag & ZCILOOK)))
|
||||
error = zap_remove_norm(zp->z_zfsvfs->z_os,
|
||||
dzp->z_id, dl->dl_name, MT_EXACT, tx);
|
||||
else
|
||||
error = zap_remove_norm(zp->z_zfsvfs->z_os,
|
||||
dzp->z_id, dl->dl_name, MT_FIRST, tx);
|
||||
} else {
|
||||
error = zap_remove(zp->z_zfsvfs->z_os,
|
||||
dzp->z_id, dl->dl_name, tx);
|
||||
}
|
||||
ASSERT(error == 0);
|
||||
|
||||
if (unlinkedp != NULL)
|
||||
*unlinkedp = unlinked;
|
||||
else if (unlinked)
|
||||
zfs_unlinked_add(zp, tx);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Indicate whether the directory is empty. Works with or without z_lock
|
||||
* held, but can only be consider a hint in the latter case. Returns true
|
||||
* if only "." and ".." remain and there's no work in progress.
|
||||
*/
|
||||
boolean_t
|
||||
zfs_dirempty(znode_t *dzp)
|
||||
{
|
||||
return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
|
||||
}
|
||||
|
||||
int
|
||||
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
|
||||
znode_t *xzp;
|
||||
dmu_tx_t *tx;
|
||||
int error;
|
||||
zfs_fuid_info_t *fuidp = NULL;
|
||||
|
||||
*xvpp = NULL;
|
||||
|
||||
if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
|
||||
return (error);
|
||||
|
||||
tx = dmu_tx_create(zfsvfs->z_os);
|
||||
dmu_tx_hold_bonus(tx, zp->z_id);
|
||||
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
|
||||
if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
|
||||
if (zfsvfs->z_fuid_obj == 0) {
|
||||
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
|
||||
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
|
||||
FUID_SIZE_ESTIMATE(zfsvfs));
|
||||
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
|
||||
} else {
|
||||
dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
|
||||
dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
|
||||
FUID_SIZE_ESTIMATE(zfsvfs));
|
||||
}
|
||||
}
|
||||
error = dmu_tx_assign(tx, zfsvfs->z_assign);
|
||||
if (error) {
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
|
||||
dmu_tx_wait(tx);
|
||||
dmu_tx_abort(tx);
|
||||
return (error);
|
||||
}
|
||||
zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
|
||||
ASSERT(xzp->z_phys->zp_parent == zp->z_id);
|
||||
dmu_buf_will_dirty(zp->z_dbuf, tx);
|
||||
zp->z_phys->zp_xattr = xzp->z_id;
|
||||
|
||||
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
|
||||
xzp, "", NULL, fuidp, vap);
|
||||
if (fuidp)
|
||||
zfs_fuid_info_free(fuidp);
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
*xvpp = ZTOV(xzp);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a znode for the extended attribute directory for zp.
|
||||
* ** If the directory does not already exist, it is created **
|
||||
*
|
||||
* IN: zp - znode to obtain attribute directory from
|
||||
* cr - credentials of caller
|
||||
* flags - flags from the VOP_LOOKUP call
|
||||
*
|
||||
* OUT: xzpp - pointer to extended attribute znode
|
||||
*
|
||||
* RETURN: 0 on success
|
||||
* error number on failure
|
||||
*/
|
||||
int
|
||||
zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
|
||||
znode_t *xzp;
|
||||
zfs_dirlock_t *dl;
|
||||
vattr_t va;
|
||||
int error;
|
||||
top:
|
||||
error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
if (xzp != NULL) {
|
||||
*xvpp = ZTOV(xzp);
|
||||
zfs_dirent_unlock(dl);
|
||||
return (0);
|
||||
}
|
||||
|
||||
ASSERT(zp->z_phys->zp_xattr == 0);
|
||||
|
||||
if (!(flags & CREATE_XATTR_DIR)) {
|
||||
zfs_dirent_unlock(dl);
|
||||
return (ENOENT);
|
||||
}
|
||||
|
||||
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
|
||||
zfs_dirent_unlock(dl);
|
||||
return (EROFS);
|
||||
}
|
||||
|
||||
/*
|
||||
* The ability to 'create' files in an attribute
|
||||
* directory comes from the write_xattr permission on the base file.
|
||||
*
|
||||
* The ability to 'search' an attribute directory requires
|
||||
* read_xattr permission on the base file.
|
||||
*
|
||||
* Once in a directory the ability to read/write attributes
|
||||
* is controlled by the permissions on the attribute file.
|
||||
*/
|
||||
va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
|
||||
va.va_type = VDIR;
|
||||
va.va_mode = S_IFDIR | S_ISVTX | 0777;
|
||||
zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
|
||||
|
||||
error = zfs_make_xattrdir(zp, &va, xvpp, cr);
|
||||
zfs_dirent_unlock(dl);
|
||||
|
||||
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
|
||||
/* NB: we already did dmu_tx_wait() if necessary */
|
||||
goto top;
|
||||
}
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decide whether it is okay to remove within a sticky directory.
|
||||
*
|
||||
* In sticky directories, write access is not sufficient;
|
||||
* you can remove entries from a directory only if:
|
||||
*
|
||||
* you own the directory,
|
||||
* you own the entry,
|
||||
* the entry is a plain file and you have write access,
|
||||
* or you are privileged (checked in secpolicy...).
|
||||
*
|
||||
* The function returns 0 if remove access is granted.
|
||||
*/
|
||||
int
|
||||
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
|
||||
{
|
||||
uid_t uid;
|
||||
uid_t downer;
|
||||
uid_t fowner;
|
||||
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
|
||||
|
||||
if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
|
||||
return (0);
|
||||
|
||||
if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
|
||||
return (0);
|
||||
|
||||
downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
|
||||
fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
|
||||
|
||||
if ((uid = crgetuid(cr)) == downer || uid == fowner ||
|
||||
(ZTOV(zp)->v_type == VREG &&
|
||||
zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
|
||||
return (0);
|
||||
else
|
||||
return (secpolicy_vnode_remove(cr));
|
||||
}
|
||||
@@ -0,0 +1,688 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)zfs_fuid.c 1.5 08/01/31 SMI"
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/sunddi.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/avl.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/refcount.h>
|
||||
#include <sys/nvpair.h>
|
||||
#ifdef _KERNEL
|
||||
#include <sys/kidmap.h>
|
||||
#include <sys/sid.h>
|
||||
#include <sys/zfs_vfsops.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#endif
|
||||
#include <sys/zfs_fuid.h>
|
||||
|
||||
/*
|
||||
* FUID Domain table(s).
|
||||
*
|
||||
* The FUID table is stored as a packed nvlist of an array
|
||||
* of nvlists which contain an index, domain string and offset
|
||||
*
|
||||
* During file system initialization the nvlist(s) are read and
|
||||
* two AVL trees are created. One tree is keyed by the index number
|
||||
* and the other by the domain string. Nodes are never removed from
|
||||
* trees, but new entries may be added. If a new entry is added then the
|
||||
* on-disk packed nvlist will also be updated.
|
||||
*/
|
||||
|
||||
#define FUID_IDX "fuid_idx"
|
||||
#define FUID_DOMAIN "fuid_domain"
|
||||
#define FUID_OFFSET "fuid_offset"
|
||||
#define FUID_NVP_ARRAY "fuid_nvlist"
|
||||
|
||||
/*
 * One FUID domain.  The same node is linked into two AVL trees at once:
 * one ordered by domain name and one ordered by index.
 */
typedef struct fuid_domain {
|
||||
avl_node_t f_domnode; /* linkage in the tree ordered by domain_compare() */
|
||||
avl_node_t f_idxnode; /* linkage in the tree ordered by idx_compare() */
|
||||
ksiddomain_t *f_ksid; /* from ksid_lookupdomain(); kd_name is the domain string */
|
||||
uint64_t f_idx; /* index number of this domain in the FUID table */
|
||||
} fuid_domain_t;
|
||||
|
||||
/*
|
||||
* Compare two indexes.
|
||||
*/
|
||||
static int
|
||||
idx_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
const fuid_domain_t *node1 = arg1;
|
||||
const fuid_domain_t *node2 = arg2;
|
||||
|
||||
if (node1->f_idx < node2->f_idx)
|
||||
return (-1);
|
||||
else if (node1->f_idx > node2->f_idx)
|
||||
return (1);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare two domain strings.
|
||||
*/
|
||||
static int
|
||||
domain_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
const fuid_domain_t *node1 = arg1;
|
||||
const fuid_domain_t *node2 = arg2;
|
||||
int val;
|
||||
|
||||
val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
|
||||
if (val == 0)
|
||||
return (0);
|
||||
return (val > 0 ? 1 : -1);
|
||||
}
|
||||
|
||||
/*
|
||||
* load initial fuid domain and idx trees. This function is used by
|
||||
* both the kernel and zdb.
|
||||
*/
|
||||
uint64_t
|
||||
zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
|
||||
avl_tree_t *domain_tree)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
uint64_t fuid_size;
|
||||
|
||||
avl_create(idx_tree, idx_compare,
|
||||
sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
|
||||
avl_create(domain_tree, domain_compare,
|
||||
sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
|
||||
|
||||
VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db));
|
||||
fuid_size = *(uint64_t *)db->db_data;
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
if (fuid_size) {
|
||||
nvlist_t **fuidnvp;
|
||||
nvlist_t *nvp = NULL;
|
||||
uint_t count;
|
||||
char *packed;
|
||||
int i;
|
||||
|
||||
packed = kmem_alloc(fuid_size, KM_SLEEP);
|
||||
VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
|
||||
VERIFY(nvlist_unpack(packed, fuid_size,
|
||||
&nvp, 0) == 0);
|
||||
VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
|
||||
&fuidnvp, &count) == 0);
|
||||
|
||||
for (i = 0; i != count; i++) {
|
||||
fuid_domain_t *domnode;
|
||||
char *domain;
|
||||
uint64_t idx;
|
||||
|
||||
VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
|
||||
&domain) == 0);
|
||||
VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
|
||||
&idx) == 0);
|
||||
|
||||
domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
|
||||
|
||||
domnode->f_idx = idx;
|
||||
domnode->f_ksid = ksid_lookupdomain(domain);
|
||||
avl_add(idx_tree, domnode);
|
||||
avl_add(domain_tree, domnode);
|
||||
}
|
||||
nvlist_free(nvp);
|
||||
kmem_free(packed, fuid_size);
|
||||
}
|
||||
return (fuid_size);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
|
||||
{
|
||||
fuid_domain_t *domnode;
|
||||
void *cookie;
|
||||
|
||||
cookie = NULL;
|
||||
while (domnode = avl_destroy_nodes(domain_tree, &cookie))
|
||||
ksiddomain_rele(domnode->f_ksid);
|
||||
|
||||
avl_destroy(domain_tree);
|
||||
cookie = NULL;
|
||||
while (domnode = avl_destroy_nodes(idx_tree, &cookie))
|
||||
kmem_free(domnode, sizeof (fuid_domain_t));
|
||||
avl_destroy(idx_tree);
|
||||
}
|
||||
|
||||
char *
|
||||
zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
|
||||
{
|
||||
fuid_domain_t searchnode, *findnode;
|
||||
avl_index_t loc;
|
||||
|
||||
searchnode.f_idx = idx;
|
||||
|
||||
findnode = avl_find(idx_tree, &searchnode, &loc);
|
||||
|
||||
return (findnode->f_ksid->kd_name);
|
||||
}
|
||||
|
||||
#ifdef _KERNEL
|
||||
/*
|
||||
* Load the fuid table(s) into memory.
|
||||
*/
|
||||
static void
|
||||
zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
|
||||
{
|
||||
int error = 0;
|
||||
|
||||
rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
|
||||
|
||||
if (zfsvfs->z_fuid_loaded) {
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
if (zfsvfs->z_fuid_obj == 0) {
|
||||
|
||||
/* first make sure we need to allocate object */
|
||||
|
||||
error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
|
||||
ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
|
||||
if (error == ENOENT && tx != NULL) {
|
||||
zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
|
||||
DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
|
||||
sizeof (uint64_t), tx);
|
||||
VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
|
||||
ZFS_FUID_TABLES, sizeof (uint64_t), 1,
|
||||
&zfsvfs->z_fuid_obj, tx) == 0);
|
||||
}
|
||||
}
|
||||
|
||||
zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
|
||||
zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
|
||||
|
||||
zfsvfs->z_fuid_loaded = B_TRUE;
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Query domain table for a given domain.
|
||||
*
|
||||
* If domain isn't found it is added to AVL trees and
|
||||
* the results are pushed out to disk.
|
||||
*/
|
||||
int
|
||||
zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
|
||||
dmu_tx_t *tx)
|
||||
{
|
||||
fuid_domain_t searchnode, *findnode;
|
||||
avl_index_t loc;
|
||||
|
||||
/*
|
||||
* If the dummy "nobody" domain then return an index of 0
|
||||
* to cause the created FUID to be a standard POSIX id
|
||||
* for the user nobody.
|
||||
*/
|
||||
if (domain[0] == '\0') {
|
||||
*retdomain = "";
|
||||
return (0);
|
||||
}
|
||||
|
||||
searchnode.f_ksid = ksid_lookupdomain(domain);
|
||||
if (retdomain) {
|
||||
*retdomain = searchnode.f_ksid->kd_name;
|
||||
}
|
||||
if (!zfsvfs->z_fuid_loaded)
|
||||
zfs_fuid_init(zfsvfs, tx);
|
||||
|
||||
rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
|
||||
findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
|
||||
if (findnode) {
|
||||
ksiddomain_rele(searchnode.f_ksid);
|
||||
return (findnode->f_idx);
|
||||
} else {
|
||||
fuid_domain_t *domnode;
|
||||
nvlist_t *nvp;
|
||||
nvlist_t **fuids;
|
||||
uint64_t retidx;
|
||||
size_t nvsize = 0;
|
||||
char *packed;
|
||||
dmu_buf_t *db;
|
||||
int i = 0;
|
||||
|
||||
domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
|
||||
domnode->f_ksid = searchnode.f_ksid;
|
||||
|
||||
rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
|
||||
retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
|
||||
|
||||
avl_add(&zfsvfs->z_fuid_domain, domnode);
|
||||
avl_add(&zfsvfs->z_fuid_idx, domnode);
|
||||
/*
|
||||
* Now resync the on-disk nvlist.
|
||||
*/
|
||||
VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
||||
|
||||
domnode = avl_first(&zfsvfs->z_fuid_domain);
|
||||
fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
|
||||
while (domnode) {
|
||||
VERIFY(nvlist_alloc(&fuids[i],
|
||||
NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
||||
VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
|
||||
domnode->f_idx) == 0);
|
||||
VERIFY(nvlist_add_uint64(fuids[i],
|
||||
FUID_OFFSET, 0) == 0);
|
||||
VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
|
||||
domnode->f_ksid->kd_name) == 0);
|
||||
domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
|
||||
}
|
||||
VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
|
||||
fuids, retidx) == 0);
|
||||
for (i = 0; i != retidx; i++)
|
||||
nvlist_free(fuids[i]);
|
||||
kmem_free(fuids, retidx * sizeof (void *));
|
||||
VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
|
||||
packed = kmem_alloc(nvsize, KM_SLEEP);
|
||||
VERIFY(nvlist_pack(nvp, &packed, &nvsize,
|
||||
NV_ENCODE_XDR, KM_SLEEP) == 0);
|
||||
nvlist_free(nvp);
|
||||
zfsvfs->z_fuid_size = nvsize;
|
||||
dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
|
||||
zfsvfs->z_fuid_size, packed, tx);
|
||||
kmem_free(packed, zfsvfs->z_fuid_size);
|
||||
VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
|
||||
FTAG, &db));
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
|
||||
dmu_buf_rele(db, FTAG);
|
||||
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
return (retidx);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Query domain table by index, returning domain string
|
||||
*
|
||||
* Returns a pointer from an avl node of the domain string.
|
||||
*
|
||||
*/
|
||||
static char *
|
||||
zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
|
||||
{
|
||||
char *domain;
|
||||
|
||||
if (idx == 0 || !zfsvfs->z_use_fuids)
|
||||
return (NULL);
|
||||
|
||||
if (!zfsvfs->z_fuid_loaded)
|
||||
zfs_fuid_init(zfsvfs, NULL);
|
||||
|
||||
rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
|
||||
domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
|
||||
ASSERT(domain);
|
||||
return (domain);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
|
||||
{
|
||||
*uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
|
||||
cr, ZFS_OWNER);
|
||||
*gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
|
||||
cr, ZFS_GROUP);
|
||||
}
|
||||
|
||||
uid_t
|
||||
zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
|
||||
cred_t *cr, zfs_fuid_type_t type)
|
||||
{
|
||||
uint32_t index = FUID_INDEX(fuid);
|
||||
char *domain;
|
||||
uid_t id;
|
||||
|
||||
if (index == 0)
|
||||
return (fuid);
|
||||
|
||||
domain = zfs_fuid_find_by_idx(zfsvfs, index);
|
||||
ASSERT(domain != NULL);
|
||||
|
||||
if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
|
||||
(void) kidmap_getuidbysid(crgetzone(cr), domain,
|
||||
FUID_RID(fuid), &id);
|
||||
} else {
|
||||
(void) kidmap_getgidbysid(crgetzone(cr), domain,
|
||||
FUID_RID(fuid), &id);
|
||||
}
|
||||
return (id);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a FUID node to the list of fuid's being created for this
|
||||
* ACL
|
||||
*
|
||||
* If ACL has multiple domains, then keep only one copy of each unique
|
||||
* domain.
|
||||
*/
|
||||
static void
|
||||
zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
|
||||
uint64_t idx, uint64_t id, zfs_fuid_type_t type)
|
||||
{
|
||||
zfs_fuid_t *fuid;
|
||||
zfs_fuid_domain_t *fuid_domain;
|
||||
zfs_fuid_info_t *fuidp;
|
||||
uint64_t fuididx;
|
||||
boolean_t found = B_FALSE;
|
||||
|
||||
if (*fuidpp == NULL)
|
||||
*fuidpp = zfs_fuid_info_alloc();
|
||||
|
||||
fuidp = *fuidpp;
|
||||
/*
|
||||
* First find fuid domain index in linked list
|
||||
*
|
||||
* If one isn't found then create an entry.
|
||||
*/
|
||||
|
||||
for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
|
||||
fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
|
||||
fuid_domain), fuididx++) {
|
||||
if (idx == fuid_domain->z_domidx) {
|
||||
found = B_TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
|
||||
fuid_domain->z_domain = domain;
|
||||
fuid_domain->z_domidx = idx;
|
||||
list_insert_tail(&fuidp->z_domains, fuid_domain);
|
||||
fuidp->z_domain_str_sz += strlen(domain) + 1;
|
||||
fuidp->z_domain_cnt++;
|
||||
}
|
||||
|
||||
if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
|
||||
/*
|
||||
* Now allocate fuid entry and add it on the end of the list
|
||||
*/
|
||||
|
||||
fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
|
||||
fuid->z_id = id;
|
||||
fuid->z_domidx = idx;
|
||||
fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
|
||||
|
||||
list_insert_tail(&fuidp->z_fuids, fuid);
|
||||
fuidp->z_fuid_cnt++;
|
||||
} else {
|
||||
if (type == ZFS_OWNER)
|
||||
fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
|
||||
else
|
||||
fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a file system FUID, based on information in the users cred
|
||||
*/
|
||||
uint64_t
|
||||
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
|
||||
dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
|
||||
{
|
||||
uint64_t idx;
|
||||
ksid_t *ksid;
|
||||
uint32_t rid;
|
||||
char *kdomain;
|
||||
const char *domain;
|
||||
uid_t id;
|
||||
|
||||
VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
|
||||
|
||||
if (type == ZFS_OWNER)
|
||||
id = crgetuid(cr);
|
||||
else
|
||||
id = crgetgid(cr);
|
||||
|
||||
if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id))
|
||||
return ((uint64_t)id);
|
||||
|
||||
ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
|
||||
|
||||
VERIFY(ksid != NULL);
|
||||
rid = ksid_getrid(ksid);
|
||||
domain = ksid_getdomain(ksid);
|
||||
|
||||
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
|
||||
|
||||
zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
|
||||
|
||||
return (FUID_ENCODE(idx, rid));
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a file system FUID for an ACL ace
|
||||
* or a chown/chgrp of the file.
|
||||
* This is similar to zfs_fuid_create_cred, except that
|
||||
* we can't find the domain + rid information in the
|
||||
* cred. Instead we have to query Winchester for the
|
||||
* domain and rid.
|
||||
*
|
||||
* During replay operations the domain+rid information is
|
||||
* found in the zfs_fuid_info_t that the replay code has
|
||||
* attached to the zfsvfs of the file system.
|
||||
*/
|
||||
uint64_t
|
||||
zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
|
||||
zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
|
||||
{
|
||||
const char *domain;
|
||||
char *kdomain;
|
||||
uint32_t fuid_idx = FUID_INDEX(id);
|
||||
uint32_t rid;
|
||||
idmap_stat status;
|
||||
uint64_t idx;
|
||||
boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
|
||||
zfs_fuid_t *zfuid = NULL;
|
||||
zfs_fuid_info_t *fuidp;
|
||||
|
||||
/*
|
||||
* If POSIX ID, or entry is already a FUID then
|
||||
* just return the id
|
||||
*
|
||||
* We may also be handed an already FUID'ized id via
|
||||
* chmod.
|
||||
*/
|
||||
|
||||
if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
|
||||
return (id);
|
||||
|
||||
if (is_replay) {
|
||||
fuidp = zfsvfs->z_fuid_replay;
|
||||
|
||||
/*
|
||||
* If we are passed an ephemeral id, but no
|
||||
* fuid_info was logged then return NOBODY.
|
||||
* This is most likely a result of idmap service
|
||||
* not being available.
|
||||
*/
|
||||
if (fuidp == NULL)
|
||||
return (UID_NOBODY);
|
||||
|
||||
switch (type) {
|
||||
case ZFS_ACE_USER:
|
||||
case ZFS_ACE_GROUP:
|
||||
zfuid = list_head(&fuidp->z_fuids);
|
||||
rid = FUID_RID(zfuid->z_logfuid);
|
||||
idx = FUID_INDEX(zfuid->z_logfuid);
|
||||
break;
|
||||
case ZFS_OWNER:
|
||||
rid = FUID_RID(fuidp->z_fuid_owner);
|
||||
idx = FUID_INDEX(fuidp->z_fuid_owner);
|
||||
break;
|
||||
case ZFS_GROUP:
|
||||
rid = FUID_RID(fuidp->z_fuid_group);
|
||||
idx = FUID_INDEX(fuidp->z_fuid_group);
|
||||
break;
|
||||
};
|
||||
domain = fuidp->z_domain_table[idx -1];
|
||||
} else {
|
||||
if (type == ZFS_OWNER || type == ZFS_ACE_USER)
|
||||
status = kidmap_getsidbyuid(crgetzone(cr), id,
|
||||
&domain, &rid);
|
||||
else
|
||||
status = kidmap_getsidbygid(crgetzone(cr), id,
|
||||
&domain, &rid);
|
||||
|
||||
if (status != 0) {
|
||||
/*
|
||||
* When returning nobody we will need to
|
||||
* make a dummy fuid table entry for logging
|
||||
* purposes.
|
||||
*/
|
||||
rid = UID_NOBODY;
|
||||
domain = "";
|
||||
}
|
||||
}
|
||||
|
||||
idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
|
||||
|
||||
if (!is_replay)
|
||||
zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
|
||||
else if (zfuid != NULL) {
|
||||
list_remove(&fuidp->z_fuids, zfuid);
|
||||
kmem_free(zfuid, sizeof (zfs_fuid_t));
|
||||
}
|
||||
return (FUID_ENCODE(idx, rid));
|
||||
}
|
||||
|
||||
void
|
||||
zfs_fuid_destroy(zfsvfs_t *zfsvfs)
|
||||
{
|
||||
rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
|
||||
if (!zfsvfs->z_fuid_loaded) {
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
return;
|
||||
}
|
||||
zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
|
||||
rw_exit(&zfsvfs->z_fuid_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate zfs_fuid_info for tracking FUIDs created during
|
||||
* zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR()
|
||||
*/
|
||||
zfs_fuid_info_t *
|
||||
zfs_fuid_info_alloc(void)
|
||||
{
|
||||
zfs_fuid_info_t *fuidp;
|
||||
|
||||
fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
|
||||
list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
|
||||
offsetof(zfs_fuid_domain_t, z_next));
|
||||
list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
|
||||
offsetof(zfs_fuid_t, z_next));
|
||||
return (fuidp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Release all memory associated with zfs_fuid_info_t
|
||||
*/
|
||||
void
|
||||
zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
|
||||
{
|
||||
zfs_fuid_t *zfuid;
|
||||
zfs_fuid_domain_t *zdomain;
|
||||
|
||||
while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
|
||||
list_remove(&fuidp->z_fuids, zfuid);
|
||||
kmem_free(zfuid, sizeof (zfs_fuid_t));
|
||||
}
|
||||
|
||||
if (fuidp->z_domain_table != NULL)
|
||||
kmem_free(fuidp->z_domain_table,
|
||||
(sizeof (char **)) * fuidp->z_domain_cnt);
|
||||
|
||||
while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
|
||||
list_remove(&fuidp->z_domains, zdomain);
|
||||
kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
|
||||
}
|
||||
|
||||
kmem_free(fuidp, sizeof (zfs_fuid_info_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to see if id is a groupmember. If cred
|
||||
* has ksid info then sidlist is checked first
|
||||
* and if still not found then POSIX groups are checked
|
||||
*
|
||||
* Will use a straight FUID compare when possible.
|
||||
*/
|
||||
boolean_t
|
||||
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
|
||||
{
|
||||
ksid_t *ksid = crgetsid(cr, KSID_GROUP);
|
||||
uid_t gid;
|
||||
|
||||
if (ksid) {
|
||||
int i;
|
||||
ksid_t *ksid_groups;
|
||||
ksidlist_t *ksidlist = crgetsidlist(cr);
|
||||
uint32_t idx = FUID_INDEX(id);
|
||||
uint32_t rid = FUID_RID(id);
|
||||
|
||||
ASSERT(ksidlist);
|
||||
ksid_groups = ksidlist->ksl_sids;
|
||||
|
||||
for (i = 0; i != ksidlist->ksl_nsid; i++) {
|
||||
if (idx == 0) {
|
||||
if (id != IDMAP_WK_CREATOR_GROUP_GID &&
|
||||
id == ksid_groups[i].ks_id) {
|
||||
return (B_TRUE);
|
||||
}
|
||||
} else {
|
||||
char *domain;
|
||||
|
||||
domain = zfs_fuid_find_by_idx(zfsvfs, idx);
|
||||
ASSERT(domain != NULL);
|
||||
|
||||
if (strcmp(domain,
|
||||
IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
|
||||
return (B_FALSE);
|
||||
|
||||
if ((strcmp(domain,
|
||||
ksid_groups[i].ks_domain->kd_name) == 0) &&
|
||||
rid == ksid_groups[i].ks_rid)
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Not found in ksidlist, check posix groups
|
||||
*/
|
||||
gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
|
||||
return (groupmember(gid, cr));
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,693 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)zfs_log.c 1.13 08/04/09 SMI"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <sys/cmn_err.h>
|
||||
#include <sys/kmem.h>
|
||||
#include <sys/thread.h>
|
||||
#include <sys/file.h>
|
||||
#include <sys/vfs.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_dir.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/zil_impl.h>
|
||||
#include <sys/byteorder.h>
|
||||
#include <sys/policy.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/mode.h>
|
||||
#include <sys/acl.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/spa.h>
|
||||
#include <sys/zfs_fuid.h>
|
||||
#include <sys/ddi.h>
|
||||
|
||||
/*
|
||||
* All the functions in this file are used to construct the log entries
|
||||
* to record transactions. They allocate an intent log transaction
|
||||
* structure (itx_t) and save within it all the information necessary to
|
||||
* possibly replay the transaction. The itx is then assigned a sequence
|
||||
* number and inserted in the in-memory list anchored in the zilog.
|
||||
*/
|
||||
|
||||
int
|
||||
zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
|
||||
{
|
||||
int isxvattr = (vap->va_mask & AT_XVATTR);
|
||||
switch (type) {
|
||||
case Z_FILE:
|
||||
if (vsecp == NULL && !isxvattr)
|
||||
return (TX_CREATE);
|
||||
if (vsecp && isxvattr)
|
||||
return (TX_CREATE_ACL_ATTR);
|
||||
if (vsecp)
|
||||
return (TX_CREATE_ACL);
|
||||
else
|
||||
return (TX_CREATE_ATTR);
|
||||
/*NOTREACHED*/
|
||||
case Z_DIR:
|
||||
if (vsecp == NULL && !isxvattr)
|
||||
return (TX_MKDIR);
|
||||
if (vsecp && isxvattr)
|
||||
return (TX_MKDIR_ACL_ATTR);
|
||||
if (vsecp)
|
||||
return (TX_MKDIR_ACL);
|
||||
else
|
||||
return (TX_MKDIR_ATTR);
|
||||
case Z_XATTRDIR:
|
||||
return (TX_MKXATTR);
|
||||
}
|
||||
ASSERT(0);
|
||||
return (TX_MAX_TYPE);
|
||||
}
|
||||
|
||||
/*
|
||||
* build up the log data necessary for logging xvattr_t
|
||||
* First lr_attr_t is initialized. following the lr_attr_t
|
||||
* is the mapsize and attribute bitmap copied from the xvattr_t.
|
||||
* Following the bitmap and bitmapsize two 64 bit words are reserved
|
||||
* for the create time which may be set. Following the create time
|
||||
* records a single 64 bit integer which has the bits to set on
|
||||
* replay for the xvattr.
|
||||
*/
|
||||
static void
|
||||
zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
|
||||
{
|
||||
uint32_t *bitmap;
|
||||
uint64_t *attrs;
|
||||
uint64_t *crtime;
|
||||
xoptattr_t *xoap;
|
||||
void *scanstamp;
|
||||
int i;
|
||||
|
||||
xoap = xva_getxoptattr(xvap);
|
||||
ASSERT(xoap);
|
||||
|
||||
lrattr->lr_attr_masksize = xvap->xva_mapsize;
|
||||
bitmap = &lrattr->lr_attr_bitmap;
|
||||
for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
|
||||
*bitmap = xvap->xva_reqattrmap[i];
|
||||
}
|
||||
|
||||
/* Now pack the attributes up in a single uint64_t */
|
||||
attrs = (uint64_t *)bitmap;
|
||||
crtime = attrs + 1;
|
||||
scanstamp = (caddr_t)(crtime + 2);
|
||||
*attrs = 0;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_READONLY))
|
||||
*attrs |= (xoap->xoa_readonly == 0) ? 0 :
|
||||
XAT0_READONLY;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
|
||||
*attrs |= (xoap->xoa_hidden == 0) ? 0 :
|
||||
XAT0_HIDDEN;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
|
||||
*attrs |= (xoap->xoa_system == 0) ? 0 :
|
||||
XAT0_SYSTEM;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
|
||||
*attrs |= (xoap->xoa_archive == 0) ? 0 :
|
||||
XAT0_ARCHIVE;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
|
||||
*attrs |= (xoap->xoa_immutable == 0) ? 0 :
|
||||
XAT0_IMMUTABLE;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
|
||||
*attrs |= (xoap->xoa_nounlink == 0) ? 0 :
|
||||
XAT0_NOUNLINK;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
|
||||
*attrs |= (xoap->xoa_appendonly == 0) ? 0 :
|
||||
XAT0_APPENDONLY;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
|
||||
*attrs |= (xoap->xoa_opaque == 0) ? 0 :
|
||||
XAT0_APPENDONLY;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
|
||||
*attrs |= (xoap->xoa_nodump == 0) ? 0 :
|
||||
XAT0_NODUMP;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
|
||||
*attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
|
||||
XAT0_AV_QUARANTINED;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
|
||||
*attrs |= (xoap->xoa_av_modified == 0) ? 0 :
|
||||
XAT0_AV_MODIFIED;
|
||||
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
|
||||
ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
|
||||
bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
|
||||
}
|
||||
|
||||
static void *
|
||||
zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
|
||||
{
|
||||
zfs_fuid_t *zfuid;
|
||||
uint64_t *fuidloc = start;
|
||||
|
||||
/* First copy in the ACE FUIDs */
|
||||
for (zfuid = list_head(&fuidp->z_fuids); zfuid;
|
||||
zfuid = list_next(&fuidp->z_fuids, zfuid)) {
|
||||
*fuidloc++ = zfuid->z_logfuid;
|
||||
}
|
||||
return (fuidloc);
|
||||
}
|
||||
|
||||
|
||||
static void *
|
||||
zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
|
||||
{
|
||||
zfs_fuid_domain_t *zdomain;
|
||||
|
||||
/* now copy in the domain info, if any */
|
||||
if (fuidp->z_domain_str_sz != 0) {
|
||||
for (zdomain = list_head(&fuidp->z_domains); zdomain;
|
||||
zdomain = list_next(&fuidp->z_domains, zdomain)) {
|
||||
bcopy((void *)zdomain->z_domain, start,
|
||||
strlen(zdomain->z_domain) + 1);
|
||||
start = (caddr_t)start +
|
||||
strlen(zdomain->z_domain) + 1;
|
||||
}
|
||||
}
|
||||
return (start);
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
|
||||
* TX_MKDIR_ATTR and TX_MKXATTR
|
||||
* transactions.
|
||||
*
|
||||
* TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
|
||||
* domain information appended prior to the name. In this case the
|
||||
* uid/gid in the log record will be a log centric FUID.
|
||||
*
|
||||
* TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
|
||||
* may contain attributes, ACL and optional fuid information.
|
||||
*
|
||||
* TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
|
||||
* and ACL and normal users/groups in the ACEs.
|
||||
*
|
||||
* There may be an optional xvattr attribute information similar
|
||||
* to zfs_log_setattr.
|
||||
*
|
||||
* Also, after the file name "domain" strings may be appended.
|
||||
*/
|
||||
void
|
||||
zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
|
||||
zfs_fuid_info_t *fuidp, vattr_t *vap)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_create_t *lr;
|
||||
lr_acl_create_t *lracl;
|
||||
size_t aclsize;
|
||||
size_t xvatsize = 0;
|
||||
size_t txsize;
|
||||
xvattr_t *xvap = (xvattr_t *)vap;
|
||||
void *end;
|
||||
size_t lrsize;
|
||||
|
||||
size_t namesize = strlen(name) + 1;
|
||||
size_t fuidsz = 0;
|
||||
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we have FUIDs present then add in space for
|
||||
* domains and ACE fuid's if any.
|
||||
*/
|
||||
if (fuidp) {
|
||||
fuidsz += fuidp->z_domain_str_sz;
|
||||
fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
|
||||
}
|
||||
|
||||
if (vap->va_mask & AT_XVATTR)
|
||||
xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
|
||||
|
||||
if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
|
||||
(int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
|
||||
(int)txtype == TX_MKXATTR) {
|
||||
txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
|
||||
lrsize = sizeof (*lr);
|
||||
} else {
|
||||
aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
|
||||
txsize =
|
||||
sizeof (lr_acl_create_t) + namesize + fuidsz +
|
||||
ZIL_ACE_LENGTH(aclsize) + xvatsize;
|
||||
lrsize = sizeof (lr_acl_create_t);
|
||||
}
|
||||
|
||||
itx = zil_itx_create(txtype, txsize);
|
||||
|
||||
lr = (lr_create_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
lr->lr_foid = zp->z_id;
|
||||
lr->lr_mode = zp->z_phys->zp_mode;
|
||||
if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
|
||||
lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
|
||||
} else {
|
||||
lr->lr_uid = fuidp->z_fuid_owner;
|
||||
}
|
||||
if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
|
||||
lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
|
||||
} else {
|
||||
lr->lr_gid = fuidp->z_fuid_group;
|
||||
}
|
||||
lr->lr_gen = zp->z_phys->zp_gen;
|
||||
lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
|
||||
lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
|
||||
lr->lr_rdev = zp->z_phys->zp_rdev;
|
||||
|
||||
/*
|
||||
* Fill in xvattr info if any
|
||||
*/
|
||||
if (vap->va_mask & AT_XVATTR) {
|
||||
zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
|
||||
end = (caddr_t)lr + lrsize + xvatsize;
|
||||
} else {
|
||||
end = (caddr_t)lr + lrsize;
|
||||
}
|
||||
|
||||
/* Now fill in any ACL info */
|
||||
|
||||
if (vsecp) {
|
||||
lracl = (lr_acl_create_t *)&itx->itx_lr;
|
||||
lracl->lr_aclcnt = vsecp->vsa_aclcnt;
|
||||
lracl->lr_acl_bytes = aclsize;
|
||||
lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
|
||||
lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
|
||||
if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
|
||||
lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
|
||||
else
|
||||
lracl->lr_acl_flags = 0;
|
||||
|
||||
bcopy(vsecp->vsa_aclentp, end, aclsize);
|
||||
end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
|
||||
}
|
||||
|
||||
/* drop in FUID info */
|
||||
if (fuidp) {
|
||||
end = zfs_log_fuid_ids(fuidp, end);
|
||||
end = zfs_log_fuid_domains(fuidp, end);
|
||||
}
|
||||
/*
|
||||
* Now place file name in log record
|
||||
*/
|
||||
bcopy(name, end, namesize);
|
||||
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
dzp->z_last_itx = seq;
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
znode_t *dzp, char *name)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_remove_t *lr;
|
||||
size_t namesize = strlen(name) + 1;
|
||||
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
|
||||
lr = (lr_remove_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
bcopy(name, (char *)(lr + 1), namesize);
|
||||
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
dzp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_link() handles TX_LINK transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
znode_t *dzp, znode_t *zp, char *name)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_link_t *lr;
|
||||
size_t namesize = strlen(name) + 1;
|
||||
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
|
||||
lr = (lr_link_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
lr->lr_link_obj = zp->z_id;
|
||||
bcopy(name, (char *)(lr + 1), namesize);
|
||||
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
dzp->z_last_itx = seq;
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_symlink() handles TX_SYMLINK transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
znode_t *dzp, znode_t *zp, char *name, char *link)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_create_t *lr;
|
||||
size_t namesize = strlen(name) + 1;
|
||||
size_t linksize = strlen(link) + 1;
|
||||
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
|
||||
lr = (lr_create_t *)&itx->itx_lr;
|
||||
lr->lr_doid = dzp->z_id;
|
||||
lr->lr_foid = zp->z_id;
|
||||
lr->lr_mode = zp->z_phys->zp_mode;
|
||||
lr->lr_uid = zp->z_phys->zp_uid;
|
||||
lr->lr_gid = zp->z_phys->zp_gid;
|
||||
lr->lr_gen = zp->z_phys->zp_gen;
|
||||
lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
|
||||
lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
|
||||
bcopy(name, (char *)(lr + 1), namesize);
|
||||
bcopy(link, (char *)(lr + 1) + namesize, linksize);
|
||||
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
dzp->z_last_itx = seq;
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_rename() handles TX_RENAME transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
|
||||
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_rename_t *lr;
|
||||
size_t snamesize = strlen(sname) + 1;
|
||||
size_t dnamesize = strlen(dname) + 1;
|
||||
|
||||
if (zilog == NULL)
|
||||
return;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
|
||||
lr = (lr_rename_t *)&itx->itx_lr;
|
||||
lr->lr_sdoid = sdzp->z_id;
|
||||
lr->lr_tdoid = tdzp->z_id;
|
||||
bcopy(sname, (char *)(lr + 1), snamesize);
|
||||
bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
|
||||
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
sdzp->z_last_itx = seq;
|
||||
tdzp->z_last_itx = seq;
|
||||
szp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_write() handles TX_WRITE transactions.
|
||||
*/
|
||||
ssize_t zfs_immediate_write_sz = 32768;
|
||||
|
||||
#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
|
||||
sizeof (lr_write_t))
|
||||
|
||||
void
|
||||
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
|
||||
{
|
||||
itx_wr_state_t write_state;
|
||||
boolean_t slogging;
|
||||
uintptr_t fsync_cnt;
|
||||
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Writes are handled in three different ways:
|
||||
*
|
||||
* WR_INDIRECT:
|
||||
* If the write is greater than zfs_immediate_write_sz and there are
|
||||
* no separate logs in this pool then later *if* we need to log the
|
||||
* write then dmu_sync() is used to immediately write the block and
|
||||
* its block pointer is put in the log record.
|
||||
* WR_COPIED:
|
||||
* If we know we'll immediately be committing the
|
||||
* transaction (FSYNC or FDSYNC), the we allocate a larger
|
||||
* log record here for the data and copy the data in.
|
||||
* WR_NEED_COPY:
|
||||
* Otherwise we don't allocate a buffer, and *if* we need to
|
||||
* flush the write later then a buffer is allocated and
|
||||
* we retrieve the data using the dmu.
|
||||
*/
|
||||
slogging = spa_has_slogs(zilog->zl_spa);
|
||||
if (resid > zfs_immediate_write_sz && !slogging)
|
||||
write_state = WR_INDIRECT;
|
||||
else if (ioflag & (FSYNC | FDSYNC))
|
||||
write_state = WR_COPIED;
|
||||
else
|
||||
write_state = WR_NEED_COPY;
|
||||
|
||||
if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
|
||||
(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
|
||||
}
|
||||
|
||||
while (resid) {
|
||||
itx_t *itx;
|
||||
lr_write_t *lr;
|
||||
ssize_t len;
|
||||
|
||||
/*
|
||||
* If there are slogs and the write would overflow the largest
|
||||
* block, then because we don't want to use the main pool
|
||||
* to dmu_sync, we have to split the write.
|
||||
*/
|
||||
if (slogging && resid > ZIL_MAX_LOG_DATA)
|
||||
len = SPA_MAXBLOCKSIZE >> 1;
|
||||
else
|
||||
len = resid;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr) +
|
||||
(write_state == WR_COPIED ? len : 0));
|
||||
lr = (lr_write_t *)&itx->itx_lr;
|
||||
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
|
||||
zp->z_id, off, len, lr + 1) != 0) {
|
||||
kmem_free(itx, offsetof(itx_t, itx_lr) +
|
||||
itx->itx_lr.lrc_reclen);
|
||||
itx = zil_itx_create(txtype, sizeof (*lr));
|
||||
lr = (lr_write_t *)&itx->itx_lr;
|
||||
write_state = WR_NEED_COPY;
|
||||
}
|
||||
|
||||
itx->itx_wr_state = write_state;
|
||||
if (write_state == WR_NEED_COPY)
|
||||
itx->itx_sod += len;
|
||||
lr->lr_foid = zp->z_id;
|
||||
lr->lr_offset = off;
|
||||
lr->lr_length = len;
|
||||
lr->lr_blkoff = 0;
|
||||
BP_ZERO(&lr->lr_blkptr);
|
||||
|
||||
itx->itx_private = zp->z_zfsvfs;
|
||||
|
||||
if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
|
||||
(ioflag & (FSYNC | FDSYNC)))
|
||||
itx->itx_sync = B_TRUE;
|
||||
else
|
||||
itx->itx_sync = B_FALSE;
|
||||
|
||||
zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
|
||||
|
||||
off += len;
|
||||
resid -= len;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_truncate() handles TX_TRUNCATE transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, uint64_t off, uint64_t len)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_truncate_t *lr;
|
||||
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
itx = zil_itx_create(txtype, sizeof (*lr));
|
||||
lr = (lr_truncate_t *)&itx->itx_lr;
|
||||
lr->lr_foid = zp->z_id;
|
||||
lr->lr_offset = off;
|
||||
lr->lr_length = len;
|
||||
|
||||
itx->itx_sync = (zp->z_sync_cnt != 0);
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_setattr() handles TX_SETATTR transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_setattr_t *lr;
|
||||
xvattr_t *xvap = (xvattr_t *)vap;
|
||||
size_t recsize = sizeof (lr_setattr_t);
|
||||
void *start;
|
||||
|
||||
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If XVATTR set, then log record size needs to allow
|
||||
* for lr_attr_t + xvattr mask, mapsize and create time
|
||||
* plus actual attribute values
|
||||
*/
|
||||
if (vap->va_mask & AT_XVATTR)
|
||||
recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
|
||||
|
||||
if (fuidp)
|
||||
recsize += fuidp->z_domain_str_sz;
|
||||
|
||||
itx = zil_itx_create(txtype, recsize);
|
||||
lr = (lr_setattr_t *)&itx->itx_lr;
|
||||
lr->lr_foid = zp->z_id;
|
||||
lr->lr_mask = (uint64_t)mask_applied;
|
||||
lr->lr_mode = (uint64_t)vap->va_mode;
|
||||
if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
|
||||
lr->lr_uid = fuidp->z_fuid_owner;
|
||||
else
|
||||
lr->lr_uid = (uint64_t)vap->va_uid;
|
||||
|
||||
if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
|
||||
lr->lr_gid = fuidp->z_fuid_group;
|
||||
else
|
||||
lr->lr_gid = (uint64_t)vap->va_gid;
|
||||
|
||||
lr->lr_size = (uint64_t)vap->va_size;
|
||||
ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
|
||||
ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
|
||||
start = (lr_setattr_t *)(lr + 1);
|
||||
if (vap->va_mask & AT_XVATTR) {
|
||||
zfs_log_xvattr((lr_attr_t *)start, xvap);
|
||||
start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
|
||||
}
|
||||
|
||||
/*
|
||||
* Now stick on domain information if any on end
|
||||
*/
|
||||
|
||||
if (fuidp)
|
||||
(void) zfs_log_fuid_domains(fuidp, start);
|
||||
|
||||
itx->itx_sync = (zp->z_sync_cnt != 0);
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_log_acl() handles TX_ACL transactions.
|
||||
*/
|
||||
void
|
||||
zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
|
||||
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
|
||||
{
|
||||
itx_t *itx;
|
||||
uint64_t seq;
|
||||
lr_acl_v0_t *lrv0;
|
||||
lr_acl_t *lr;
|
||||
int txtype;
|
||||
int lrsize;
|
||||
size_t txsize;
|
||||
size_t aclbytes = vsecp->vsa_aclentsz;
|
||||
|
||||
txtype = (zp->z_zfsvfs->z_version == ZPL_VERSION_INITIAL) ?
|
||||
TX_ACL_V0 : TX_ACL;
|
||||
|
||||
if (txtype == TX_ACL)
|
||||
lrsize = sizeof (*lr);
|
||||
else
|
||||
lrsize = sizeof (*lrv0);
|
||||
|
||||
if (zilog == NULL || zp->z_unlinked)
|
||||
return;
|
||||
|
||||
txsize = lrsize +
|
||||
((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
|
||||
(fuidp ? fuidp->z_domain_str_sz : 0) +
|
||||
sizeof (uint64) * (fuidp ? fuidp->z_fuid_cnt : 0);
|
||||
|
||||
itx = zil_itx_create(txtype, txsize);
|
||||
|
||||
lr = (lr_acl_t *)&itx->itx_lr;
|
||||
lr->lr_foid = zp->z_id;
|
||||
if (txtype == TX_ACL) {
|
||||
lr->lr_acl_bytes = aclbytes;
|
||||
lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
|
||||
lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
|
||||
if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
|
||||
lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
|
||||
else
|
||||
lr->lr_acl_flags = 0;
|
||||
}
|
||||
lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
|
||||
|
||||
if (txtype == TX_ACL_V0) {
|
||||
lrv0 = (lr_acl_v0_t *)lr;
|
||||
bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
|
||||
} else {
|
||||
void *start = (ace_t *)(lr + 1);
|
||||
|
||||
bcopy(vsecp->vsa_aclentp, start, aclbytes);
|
||||
|
||||
start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
|
||||
|
||||
if (fuidp) {
|
||||
start = zfs_log_fuid_ids(fuidp, start);
|
||||
(void) zfs_log_fuid_domains(fuidp, start);
|
||||
}
|
||||
}
|
||||
|
||||
itx->itx_sync = (zp->z_sync_cnt != 0);
|
||||
seq = zil_itx_assign(zilog, itx, tx);
|
||||
zp->z_last_itx = seq;
|
||||
}
|
||||
@@ -0,0 +1,876 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)zfs_replay.c 1.7 08/01/14 SMI"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <sys/cmn_err.h>
|
||||
#include <sys/kmem.h>
|
||||
#include <sys/thread.h>
|
||||
#include <sys/file.h>
|
||||
#include <sys/fcntl.h>
|
||||
#include <sys/vfs.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_dir.h>
|
||||
#include <sys/zfs_acl.h>
|
||||
#include <sys/zfs_fuid.h>
|
||||
#include <sys/spa.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/byteorder.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/mode.h>
|
||||
#include <sys/acl.h>
|
||||
#include <sys/atomic.h>
|
||||
#include <sys/cred.h>
|
||||
|
||||
/*
|
||||
* Functions to replay ZFS intent log (ZIL) records
|
||||
* The functions are called through a function vector (zfs_replay_vector)
|
||||
* which is indexed by the transaction type.
|
||||
*/
|
||||
|
||||
static void
|
||||
zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
|
||||
uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
|
||||
{
|
||||
bzero(vap, sizeof (*vap));
|
||||
vap->va_mask = (uint_t)mask;
|
||||
vap->va_type = IFTOVT(mode);
|
||||
vap->va_mode = mode & MODEMASK;
|
||||
vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
|
||||
vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
|
||||
vap->va_rdev = zfs_cmpldev(rdev);
|
||||
vap->va_nodeid = nodeid;
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
static int
|
||||
zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
|
||||
{
|
||||
return (ENOTSUP);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
|
||||
{
|
||||
xoptattr_t *xoap = NULL;
|
||||
uint64_t *attrs;
|
||||
uint64_t *crtime;
|
||||
uint32_t *bitmap;
|
||||
void *scanstamp;
|
||||
int i;
|
||||
|
||||
xvap->xva_vattr.va_mask |= AT_XVATTR;
|
||||
if ((xoap = xva_getxoptattr(xvap)) == NULL) {
|
||||
xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
|
||||
|
||||
bitmap = &lrattr->lr_attr_bitmap;
|
||||
for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
|
||||
xvap->xva_reqattrmap[i] = *bitmap;
|
||||
|
||||
attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
|
||||
crtime = attrs + 1;
|
||||
scanstamp = (caddr_t)(crtime + 2);
|
||||
|
||||
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
|
||||
xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
|
||||
xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
|
||||
xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_READONLY))
|
||||
xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
|
||||
xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
|
||||
xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
|
||||
xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
|
||||
xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
|
||||
xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
|
||||
xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
|
||||
xoap->xoa_av_quarantined =
|
||||
((*attrs & XAT0_AV_QUARANTINED) != 0);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
|
||||
ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
|
||||
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
|
||||
bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
|
||||
}
|
||||
|
||||
/*
 * Count how many distinct FUID domains a uid/gid pair refers to.
 *
 * The domain index of each id is extracted with FUID_INDEX(); an index of
 * zero contributes nothing.  Owner and group are counted only once when
 * both carry the same non-zero index, so the result is 0, 1 or 2.
 */
static int
zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
{
	const uint64_t owner_idx = FUID_INDEX(uid);
	const uint64_t group_idx = FUID_INDEX(gid);
	int ndomains = (owner_idx != 0) ? 1 : 0;

	if (group_idx != 0 && group_idx != owner_idx)
		ndomains++;

	return (ndomains);
}
|
||||
|
||||
static void *
|
||||
zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
|
||||
int domcnt)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i != domcnt; i++) {
|
||||
fuid_infop->z_domain_table[i] = start;
|
||||
start = (caddr_t)start + strlen(start) + 1;
|
||||
}
|
||||
|
||||
return (start);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the uid/gid in the fuid_info structure.
|
||||
*/
|
||||
static void
|
||||
zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
|
||||
{
|
||||
/*
|
||||
* If owner or group are log specific FUIDs then slurp up
|
||||
* domain information and build zfs_fuid_info_t
|
||||
*/
|
||||
if (IS_EPHEMERAL(uid))
|
||||
fuid_infop->z_fuid_owner = uid;
|
||||
|
||||
if (IS_EPHEMERAL(gid))
|
||||
fuid_infop->z_fuid_group = gid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Load fuid domains into fuid_info_t
|
||||
*/
|
||||
static zfs_fuid_info_t *
|
||||
zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
|
||||
{
|
||||
int domcnt;
|
||||
|
||||
zfs_fuid_info_t *fuid_infop;
|
||||
|
||||
fuid_infop = zfs_fuid_info_alloc();
|
||||
|
||||
domcnt = zfs_replay_domain_cnt(uid, gid);
|
||||
|
||||
if (domcnt == 0)
|
||||
return (fuid_infop);
|
||||
|
||||
fuid_infop->z_domain_table =
|
||||
kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
|
||||
|
||||
zfs_replay_fuid_ugid(fuid_infop, uid, gid);
|
||||
|
||||
fuid_infop->z_domain_cnt = domcnt;
|
||||
*end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
|
||||
return (fuid_infop);
|
||||
}
|
||||
|
||||
/*
|
||||
* load zfs_fuid_t's and fuid_domains into fuid_info_t
|
||||
*/
|
||||
static zfs_fuid_info_t *
|
||||
zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
|
||||
uint64_t gid)
|
||||
{
|
||||
uint64_t *log_fuid = (uint64_t *)start;
|
||||
zfs_fuid_info_t *fuid_infop;
|
||||
int i;
|
||||
|
||||
fuid_infop = zfs_fuid_info_alloc();
|
||||
fuid_infop->z_domain_cnt = domcnt;
|
||||
|
||||
fuid_infop->z_domain_table =
|
||||
kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
|
||||
|
||||
for (i = 0; i != idcnt; i++) {
|
||||
zfs_fuid_t *zfuid;
|
||||
|
||||
zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
|
||||
zfuid->z_logfuid = *log_fuid;
|
||||
zfuid->z_id = -1;
|
||||
zfuid->z_domidx = 0;
|
||||
list_insert_tail(&fuid_infop->z_fuids, zfuid);
|
||||
log_fuid++;
|
||||
}
|
||||
|
||||
zfs_replay_fuid_ugid(fuid_infop, uid, gid);
|
||||
|
||||
*end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
|
||||
return (fuid_infop);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_replay_swap_attrs(lr_attr_t *lrattr)
|
||||
{
|
||||
/* swap the lr_attr structure */
|
||||
byteswap_uint32_array(lrattr, sizeof (*lrattr));
|
||||
/* swap the bitmap */
|
||||
byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
|
||||
sizeof (uint32_t));
|
||||
/* swap the attributes, create time + 64 bit word for attributes */
|
||||
byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
|
||||
(lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Replay file create with optional ACL, xvattr information as well
|
||||
* as option FUID information.
|
||||
*/
|
||||
static int
|
||||
zfs_replay_create_acl(zfsvfs_t *zfsvfs,
|
||||
lr_acl_create_t *lracl, boolean_t byteswap)
|
||||
{
|
||||
char *name = NULL; /* location determined later */
|
||||
lr_create_t *lr = (lr_create_t *)lracl;
|
||||
znode_t *dzp;
|
||||
vnode_t *vp = NULL;
|
||||
xvattr_t xva;
|
||||
int vflg = 0;
|
||||
vsecattr_t vsec = { 0 };
|
||||
lr_attr_t *lrattr;
|
||||
void *aclstart;
|
||||
void *fuidstart;
|
||||
size_t xvatlen = 0;
|
||||
uint64_t txtype;
|
||||
int error;
|
||||
|
||||
if (byteswap) {
|
||||
byteswap_uint64_array(lracl, sizeof (*lracl));
|
||||
txtype = (int)lr->lr_common.lrc_txtype;
|
||||
if (txtype == TX_CREATE_ACL_ATTR ||
|
||||
txtype == TX_MKDIR_ACL_ATTR) {
|
||||
lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
|
||||
zfs_replay_swap_attrs(lrattr);
|
||||
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
||||
}
|
||||
|
||||
aclstart = (caddr_t)(lracl + 1) + xvatlen;
|
||||
zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
|
||||
/* swap fuids */
|
||||
if (lracl->lr_fuidcnt) {
|
||||
byteswap_uint64_array((caddr_t)aclstart +
|
||||
ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
|
||||
lracl->lr_fuidcnt * sizeof (uint64_t));
|
||||
}
|
||||
}
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
|
||||
return (error);
|
||||
|
||||
xva_init(&xva);
|
||||
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
|
||||
lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
|
||||
|
||||
/*
|
||||
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
|
||||
* eventually end up in zfs_mknode(), which assigns the object's
|
||||
* creation time and generation number. The generic VOP_CREATE()
|
||||
* doesn't have either concept, so we smuggle the values inside
|
||||
* the vattr's otherwise unused va_ctime and va_nblocks fields.
|
||||
*/
|
||||
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
|
||||
xva.xva_vattr.va_nblocks = lr->lr_gen;
|
||||
|
||||
error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
|
||||
if (error != ENOENT)
|
||||
goto bail;
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
vflg |= FIGNORECASE;
|
||||
switch ((int)lr->lr_common.lrc_txtype) {
|
||||
case TX_CREATE_ACL:
|
||||
aclstart = (caddr_t)(lracl + 1);
|
||||
fuidstart = (caddr_t)aclstart +
|
||||
ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
|
||||
zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
|
||||
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
/*FALLTHROUGH*/
|
||||
case TX_CREATE_ACL_ATTR:
|
||||
if (name == NULL) {
|
||||
lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
|
||||
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
||||
xva.xva_vattr.va_mask |= AT_XVATTR;
|
||||
zfs_replay_xvattr(lrattr, &xva);
|
||||
}
|
||||
vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
|
||||
vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
|
||||
vsec.vsa_aclcnt = lracl->lr_aclcnt;
|
||||
vsec.vsa_aclentsz = lracl->lr_acl_bytes;
|
||||
vsec.vsa_aclflags = lracl->lr_acl_flags;
|
||||
if (zfsvfs->z_fuid_replay == NULL) {
|
||||
fuidstart = (caddr_t)(lracl + 1) + xvatlen +
|
||||
ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuids(fuidstart,
|
||||
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
}
|
||||
|
||||
error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
|
||||
0, 0, &vp, kcred, vflg, NULL, &vsec);
|
||||
break;
|
||||
case TX_MKDIR_ACL:
|
||||
aclstart = (caddr_t)(lracl + 1);
|
||||
fuidstart = (caddr_t)aclstart +
|
||||
ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
|
||||
zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
|
||||
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
/*FALLTHROUGH*/
|
||||
case TX_MKDIR_ACL_ATTR:
|
||||
if (name == NULL) {
|
||||
lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
|
||||
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
||||
zfs_replay_xvattr(lrattr, &xva);
|
||||
}
|
||||
vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
|
||||
vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
|
||||
vsec.vsa_aclcnt = lracl->lr_aclcnt;
|
||||
vsec.vsa_aclentsz = lracl->lr_acl_bytes;
|
||||
vsec.vsa_aclflags = lracl->lr_acl_flags;
|
||||
if (zfsvfs->z_fuid_replay == NULL) {
|
||||
fuidstart = (caddr_t)(lracl + 1) + xvatlen +
|
||||
ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuids(fuidstart,
|
||||
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
}
|
||||
error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
|
||||
&vp, kcred, NULL, vflg, &vsec);
|
||||
break;
|
||||
default:
|
||||
error = ENOTSUP;
|
||||
}
|
||||
|
||||
bail:
|
||||
if (error == 0 && vp != NULL)
|
||||
VN_RELE(vp);
|
||||
|
||||
VN_RELE(ZTOV(dzp));
|
||||
|
||||
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
|
||||
zfsvfs->z_fuid_replay = NULL;
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
|
||||
{
|
||||
char *name = NULL; /* location determined later */
|
||||
char *link; /* symlink content follows name */
|
||||
znode_t *dzp;
|
||||
vnode_t *vp = NULL;
|
||||
xvattr_t xva;
|
||||
int vflg = 0;
|
||||
size_t lrsize = sizeof (lr_create_t);
|
||||
lr_attr_t *lrattr;
|
||||
void *start;
|
||||
size_t xvatlen;
|
||||
uint64_t txtype;
|
||||
int error;
|
||||
|
||||
if (byteswap) {
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
txtype = (int)lr->lr_common.lrc_txtype;
|
||||
if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
|
||||
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
|
||||
}
|
||||
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
|
||||
return (error);
|
||||
|
||||
xva_init(&xva);
|
||||
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
|
||||
lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
|
||||
|
||||
/*
|
||||
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
|
||||
* eventually end up in zfs_mknode(), which assigns the object's
|
||||
* creation time and generation number. The generic VOP_CREATE()
|
||||
* doesn't have either concept, so we smuggle the values inside
|
||||
* the vattr's otherwise unused va_ctime and va_nblocks fields.
|
||||
*/
|
||||
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
|
||||
xva.xva_vattr.va_nblocks = lr->lr_gen;
|
||||
|
||||
error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
|
||||
if (error != ENOENT)
|
||||
goto out;
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
vflg |= FIGNORECASE;
|
||||
|
||||
/*
|
||||
* Symlinks don't have fuid info, and CIFS never creates
|
||||
* symlinks.
|
||||
*
|
||||
* The _ATTR versions will grab the fuid info in their subcases.
|
||||
*/
|
||||
if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
|
||||
(int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
|
||||
(int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
|
||||
start = (lr + 1);
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuid_domain(start, &start,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
}
|
||||
|
||||
switch ((int)lr->lr_common.lrc_txtype) {
|
||||
case TX_CREATE_ATTR:
|
||||
lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
|
||||
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
||||
zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
|
||||
start = (caddr_t)(lr + 1) + xvatlen;
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuid_domain(start, &start,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
name = (char *)start;
|
||||
|
||||
/*FALLTHROUGH*/
|
||||
case TX_CREATE:
|
||||
if (name == NULL)
|
||||
name = (char *)start;
|
||||
|
||||
error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
|
||||
0, 0, &vp, kcred, vflg, NULL, NULL);
|
||||
break;
|
||||
case TX_MKDIR_ATTR:
|
||||
lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
|
||||
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
|
||||
zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
|
||||
start = (caddr_t)(lr + 1) + xvatlen;
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuid_domain(start, &start,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
name = (char *)start;
|
||||
|
||||
/*FALLTHROUGH*/
|
||||
case TX_MKDIR:
|
||||
if (name == NULL)
|
||||
name = (char *)(lr + 1);
|
||||
|
||||
error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
|
||||
&vp, kcred, NULL, vflg, NULL);
|
||||
break;
|
||||
case TX_MKXATTR:
|
||||
name = (char *)(lr + 1);
|
||||
error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
|
||||
break;
|
||||
case TX_SYMLINK:
|
||||
name = (char *)(lr + 1);
|
||||
link = name + strlen(name) + 1;
|
||||
error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr,
|
||||
link, kcred, NULL, vflg);
|
||||
break;
|
||||
default:
|
||||
error = ENOTSUP;
|
||||
}
|
||||
|
||||
out:
|
||||
if (error == 0 && vp != NULL)
|
||||
VN_RELE(vp);
|
||||
|
||||
VN_RELE(ZTOV(dzp));
|
||||
|
||||
if (zfsvfs->z_fuid_replay)
|
||||
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
|
||||
zfsvfs->z_fuid_replay = NULL;
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
|
||||
{
|
||||
char *name = (char *)(lr + 1); /* name follows lr_remove_t */
|
||||
znode_t *dzp;
|
||||
int error;
|
||||
int vflg = 0;
|
||||
|
||||
if (byteswap)
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
|
||||
return (error);
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
vflg |= FIGNORECASE;
|
||||
|
||||
switch ((int)lr->lr_common.lrc_txtype) {
|
||||
case TX_REMOVE:
|
||||
error = VOP_REMOVE(ZTOV(dzp), name, kcred, NULL, vflg);
|
||||
break;
|
||||
case TX_RMDIR:
|
||||
error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred, NULL, vflg);
|
||||
break;
|
||||
default:
|
||||
error = ENOTSUP;
|
||||
}
|
||||
|
||||
VN_RELE(ZTOV(dzp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
|
||||
{
|
||||
char *name = (char *)(lr + 1); /* name follows lr_link_t */
|
||||
znode_t *dzp, *zp;
|
||||
int error;
|
||||
int vflg = 0;
|
||||
|
||||
if (byteswap)
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
|
||||
return (error);
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
|
||||
VN_RELE(ZTOV(dzp));
|
||||
return (error);
|
||||
}
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
vflg |= FIGNORECASE;
|
||||
|
||||
error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred, NULL, vflg);
|
||||
|
||||
VN_RELE(ZTOV(zp));
|
||||
VN_RELE(ZTOV(dzp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
|
||||
{
|
||||
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
|
||||
char *tname = sname + strlen(sname) + 1;
|
||||
znode_t *sdzp, *tdzp;
|
||||
int error;
|
||||
int vflg = 0;
|
||||
|
||||
if (byteswap)
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
|
||||
return (error);
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
|
||||
VN_RELE(ZTOV(sdzp));
|
||||
return (error);
|
||||
}
|
||||
|
||||
if (lr->lr_common.lrc_txtype & TX_CI)
|
||||
vflg |= FIGNORECASE;
|
||||
|
||||
error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred,
|
||||
NULL, vflg);
|
||||
|
||||
VN_RELE(ZTOV(tdzp));
|
||||
VN_RELE(ZTOV(sdzp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
|
||||
{
|
||||
char *data = (char *)(lr + 1); /* data follows lr_write_t */
|
||||
znode_t *zp;
|
||||
int error;
|
||||
ssize_t resid;
|
||||
|
||||
if (byteswap)
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
|
||||
/*
|
||||
* As we can log writes out of order, it's possible the
|
||||
* file has been removed. In this case just drop the write
|
||||
* and return success.
|
||||
*/
|
||||
if (error == ENOENT)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
|
||||
lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
|
||||
|
||||
VN_RELE(ZTOV(zp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
|
||||
{
|
||||
znode_t *zp;
|
||||
flock64_t fl;
|
||||
int error;
|
||||
|
||||
if (byteswap)
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
|
||||
/*
|
||||
* As we can log truncates out of order, it's possible the
|
||||
* file has been removed. In this case just drop the truncate
|
||||
* and return success.
|
||||
*/
|
||||
if (error == ENOENT)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
bzero(&fl, sizeof (fl));
|
||||
fl.l_type = F_WRLCK;
|
||||
fl.l_whence = 0;
|
||||
fl.l_start = lr->lr_offset;
|
||||
fl.l_len = lr->lr_length;
|
||||
|
||||
error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
|
||||
lr->lr_offset, kcred, NULL);
|
||||
|
||||
VN_RELE(ZTOV(zp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
|
||||
{
|
||||
znode_t *zp;
|
||||
xvattr_t xva;
|
||||
vattr_t *vap = &xva.xva_vattr;
|
||||
int error;
|
||||
void *start;
|
||||
|
||||
xva_init(&xva);
|
||||
if (byteswap) {
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
|
||||
if ((lr->lr_mask & AT_XVATTR) &&
|
||||
zfsvfs->z_version >= ZPL_VERSION_INITIAL)
|
||||
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
|
||||
}
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
|
||||
/*
|
||||
* As we can log setattrs out of order, it's possible the
|
||||
* file has been removed. In this case just drop the setattr
|
||||
* and return success.
|
||||
*/
|
||||
if (error == ENOENT)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
|
||||
lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
|
||||
|
||||
vap->va_size = lr->lr_size;
|
||||
ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
|
||||
ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
|
||||
|
||||
/*
|
||||
* Fill in xvattr_t portions if necessary.
|
||||
*/
|
||||
|
||||
start = (lr_setattr_t *)(lr + 1);
|
||||
if (vap->va_mask & AT_XVATTR) {
|
||||
zfs_replay_xvattr((lr_attr_t *)start, &xva);
|
||||
start = (caddr_t)start +
|
||||
ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
|
||||
} else
|
||||
xva.xva_vattr.va_mask &= ~AT_XVATTR;
|
||||
|
||||
zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
|
||||
lr->lr_uid, lr->lr_gid);
|
||||
|
||||
error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);
|
||||
|
||||
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
|
||||
zfsvfs->z_fuid_replay = NULL;
|
||||
VN_RELE(ZTOV(zp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
|
||||
{
|
||||
ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
|
||||
vsecattr_t vsa;
|
||||
znode_t *zp;
|
||||
int error;
|
||||
|
||||
if (byteswap) {
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
|
||||
}
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
|
||||
/*
|
||||
* As we can log acls out of order, it's possible the
|
||||
* file has been removed. In this case just drop the acl
|
||||
* and return success.
|
||||
*/
|
||||
if (error == ENOENT)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
bzero(&vsa, sizeof (vsa));
|
||||
vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
|
||||
vsa.vsa_aclcnt = lr->lr_aclcnt;
|
||||
vsa.vsa_aclentp = ace;
|
||||
|
||||
error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
|
||||
|
||||
VN_RELE(ZTOV(zp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Replaying ACLs is complicated by FUID support.
|
||||
* The log record may contain some optional data
|
||||
* to be used for replaying FUID's. These pieces
|
||||
* are the actual FUIDs that were created initially.
|
||||
* The FUID table index may no longer be valid and
|
||||
* during zfs_create() a new index may be assigned.
|
||||
* Because of this the log will contain the original
|
||||
* doman+rid in order to create a new FUID.
|
||||
*
|
||||
* The individual ACEs may contain an ephemeral uid/gid which is no
|
||||
* longer valid and will need to be replaced with an actual FUID.
|
||||
*
|
||||
*/
|
||||
static int
|
||||
zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
|
||||
{
|
||||
ace_t *ace = (ace_t *)(lr + 1);
|
||||
vsecattr_t vsa;
|
||||
znode_t *zp;
|
||||
int error;
|
||||
|
||||
if (byteswap) {
|
||||
byteswap_uint64_array(lr, sizeof (*lr));
|
||||
zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
|
||||
if (lr->lr_fuidcnt) {
|
||||
byteswap_uint64_array((caddr_t)ace +
|
||||
ZIL_ACE_LENGTH(lr->lr_acl_bytes),
|
||||
lr->lr_fuidcnt * sizeof (uint64_t));
|
||||
}
|
||||
}
|
||||
|
||||
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
|
||||
/*
|
||||
* As we can log acls out of order, it's possible the
|
||||
* file has been removed. In this case just drop the acl
|
||||
* and return success.
|
||||
*/
|
||||
if (error == ENOENT)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
bzero(&vsa, sizeof (vsa));
|
||||
vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
|
||||
vsa.vsa_aclcnt = lr->lr_aclcnt;
|
||||
vsa.vsa_aclentp = ace;
|
||||
vsa.vsa_aclentsz = lr->lr_acl_bytes;
|
||||
vsa.vsa_aclflags = lr->lr_acl_flags;
|
||||
|
||||
if (lr->lr_fuidcnt) {
|
||||
void *fuidstart = (caddr_t)ace +
|
||||
ZIL_ACE_LENGTH(lr->lr_acl_bytes);
|
||||
|
||||
zfsvfs->z_fuid_replay =
|
||||
zfs_replay_fuids(fuidstart, &fuidstart,
|
||||
lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
|
||||
}
|
||||
|
||||
error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
|
||||
|
||||
if (zfsvfs->z_fuid_replay)
|
||||
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
|
||||
|
||||
zfsvfs->z_fuid_replay = NULL;
|
||||
VN_RELE(ZTOV(zp));
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Callback vectors for replaying records
|
||||
*/
|
||||
zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
|
||||
zfs_replay_error, /* 0 no such transaction type */
|
||||
zfs_replay_create, /* TX_CREATE */
|
||||
zfs_replay_create, /* TX_MKDIR */
|
||||
zfs_replay_create, /* TX_MKXATTR */
|
||||
zfs_replay_create, /* TX_SYMLINK */
|
||||
zfs_replay_remove, /* TX_REMOVE */
|
||||
zfs_replay_remove, /* TX_RMDIR */
|
||||
zfs_replay_link, /* TX_LINK */
|
||||
zfs_replay_rename, /* TX_RENAME */
|
||||
zfs_replay_write, /* TX_WRITE */
|
||||
zfs_replay_truncate, /* TX_TRUNCATE */
|
||||
zfs_replay_setattr, /* TX_SETATTR */
|
||||
zfs_replay_acl_v0, /* TX_ACL_V0 */
|
||||
zfs_replay_acl, /* TX_ACL */
|
||||
zfs_replay_create_acl, /* TX_CREATE_ACL */
|
||||
zfs_replay_create, /* TX_CREATE_ATTR */
|
||||
zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
|
||||
zfs_replay_create_acl, /* TX_MKDIR_ACL */
|
||||
zfs_replay_create, /* TX_MKDIR_ATTR */
|
||||
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
|
||||
};
|
||||
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or http://www.opensolaris.org/os/licensing.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#pragma ident "@(#)zfs_rlock.c 1.4 07/08/08 SMI"
|
||||
|
||||
/*
|
||||
* This file contains the code to implement file range locking in
|
||||
* ZFS, although there isn't much specific to ZFS (all that comes to mind
|
||||
* is support for growing the blocksize).
|
||||
*
|
||||
* Interface
|
||||
* ---------
|
||||
* Defined in zfs_rlock.h but essentially:
|
||||
* rl = zfs_range_lock(zp, off, len, lock_type);
|
||||
* zfs_range_unlock(rl);
|
||||
* zfs_range_reduce(rl, off, len);
|
||||
*
|
||||
* AVL tree
|
||||
* --------
|
||||
* An AVL tree is used to maintain the state of the existing ranges
|
||||
* that are locked for exclusive (writer) or shared (reader) use.
|
||||
* The starting range offset is used for searching and sorting the tree.
|
||||
*
|
||||
* Common case
|
||||
* -----------
|
||||
* The (hopefully) usual case is of no overlaps or contention for
|
||||
* locks. On entry to zfs_range_lock() a rl_t is allocated; the tree
|
||||
* searched that finds no overlap, and *this* rl_t is placed in the tree.
|
||||
*
|
||||
* Overlaps/Reference counting/Proxy locks
|
||||
* ---------------------------------------
|
||||
* The avl code only allows one node at a particular offset. Also it's very
|
||||
* inefficient to search through all previous entries looking for overlaps
|
||||
* (because the very 1st in the ordered list might be at offset 0 but
|
||||
* cover the whole file).
|
||||
* So this implementation uses reference counts and proxy range locks.
|
||||
* Firstly, only reader locks use reference counts and proxy locks,
|
||||
* because writer locks are exclusive.
|
||||
* When a reader lock overlaps with another then a proxy lock is created
|
||||
* for that range and replaces the original lock. If the overlap
|
||||
* is exact then the reference count of the proxy is simply incremented.
|
||||
* Otherwise, the proxy lock is split into smaller lock ranges and
|
||||
* new proxy locks created for non overlapping ranges.
|
||||
* The reference counts are adjusted accordingly.
|
||||
* Meanwhile, the original lock is kept around (this is the caller's handle)
|
||||
* and its offset and length are used when releasing the lock.
|
||||
*
|
||||
* Thread coordination
|
||||
* -------------------
|
||||
* In order to make wakeups efficient and to ensure multiple continuous
|
||||
* readers on a range don't starve a writer for the same range lock,
|
||||
* two condition variables are allocated in each rl_t.
|
||||
* If a writer (or reader) can't get a range it initialises the writer
|
||||
* (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
|
||||
* and waits on that cv. When a thread unlocks that range it wakes up all
|
||||
* writers then all readers before destroying the lock.
|
||||
*
|
||||
* Append mode writes
|
||||
* ------------------
|
||||
* Append mode writes need to lock a range at the end of a file.
|
||||
* The offset of the end of the file is determined under the
|
||||
* range locking mutex, and the lock type converted from RL_APPEND to
|
||||
* RL_WRITER and the range locked.
|
||||
*
|
||||
* Grow block handling
|
||||
* -------------------
|
||||
* ZFS supports multiple block sizes currently up to 128K. The smallest
|
||||
* block size is used for the file which is grown as needed. During this
|
||||
* growth all other writers and readers must be excluded.
|
||||
* So if the block size needs to be grown then the whole file is
|
||||
* exclusively locked, then later the caller will reduce the lock
|
||||
* range to just the range to be written using zfs_range_reduce.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_rlock.h>
|
||||
|
||||
/*
|
||||
* Check if a write lock can be grabbed, or wait and recheck until available.
|
||||
*/
|
||||
static void
|
||||
zfs_range_lock_writer(znode_t *zp, rl_t *new)
|
||||
{
|
||||
avl_tree_t *tree = &zp->z_range_avl;
|
||||
rl_t *rl;
|
||||
avl_index_t where;
|
||||
uint64_t end_size;
|
||||
uint64_t off = new->r_off;
|
||||
uint64_t len = new->r_len;
|
||||
|
||||
for (;;) {
|
||||
/*
|
||||
* Range locking is also used by zvol and uses a
|
||||
* dummied up znode. However, for zvol, we don't need to
|
||||
* append or grow blocksize, and besides we don't have
|
||||
* a z_phys or z_zfsvfs - so skip that processing.
|
||||
*
|
||||
* Yes, this is ugly, and would be solved by not handling
|
||||
* grow or append in range lock code. If that was done then
|
||||
* we could make the range locking code generically available
|
||||
* to other non-zfs consumers.
|
||||
*/
|
||||
if (zp->z_vnode) { /* caller is ZPL */
|
||||
/*
|
||||
* If in append mode pick up the current end of file.
|
||||
* This is done under z_range_lock to avoid races.
|
||||
*/
|
||||
if (new->r_type == RL_APPEND)
|
||||
new->r_off = zp->z_phys->zp_size;
|
||||
|
||||
/*
|
||||
* If we need to grow the block size then grab the whole
|
||||
* file range. This is also done under z_range_lock to
|
||||
* avoid races.
|
||||
*/
|
||||
end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
|
||||
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
|
||||
zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
|
||||
new->r_off = 0;
|
||||
new->r_len = UINT64_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* First check for the usual case of no locks
|
||||
*/
|
||||
if (avl_numnodes(tree) == 0) {
|
||||
new->r_type = RL_WRITER; /* convert to writer */
|
||||
avl_add(tree, new);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Look for any locks in the range.
|
||||
*/
|
||||
rl = avl_find(tree, new, &where);
|
||||
if (rl)
|
||||
goto wait; /* already locked at same offset */
|
||||
|
||||
rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
|
||||
if (rl && (rl->r_off < new->r_off + new->r_len))
|
||||
goto wait;
|
||||
|
||||
rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
|
||||
if (rl && rl->r_off + rl->r_len > new->r_off)
|
||||
goto wait;
|
||||
|
||||
new->r_type = RL_WRITER; /* convert possible RL_APPEND */
|
||||
avl_insert(tree, new, where);
|
||||
return;
|
||||
wait:
|
||||
if (!rl->r_write_wanted) {
|
||||
cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
|
||||
rl->r_write_wanted = B_TRUE;
|
||||
}
|
||||
cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
|
||||
|
||||
/* reset to original */
|
||||
new->r_off = off;
|
||||
new->r_len = len;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is an original (non-proxy) lock then replace it by
|
||||
* a proxy and return the proxy.
|
||||
*/
|
||||
static rl_t *
|
||||
zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
|
||||
{
|
||||
rl_t *proxy;
|
||||
|
||||
if (rl->r_proxy)
|
||||
return (rl); /* already a proxy */
|
||||
|
||||
ASSERT3U(rl->r_cnt, ==, 1);
|
||||
ASSERT(rl->r_write_wanted == B_FALSE);
|
||||
ASSERT(rl->r_read_wanted == B_FALSE);
|
||||
avl_remove(tree, rl);
|
||||
rl->r_cnt = 0;
|
||||
|
||||
/* create a proxy range lock */
|
||||
proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
|
||||
proxy->r_off = rl->r_off;
|
||||
proxy->r_len = rl->r_len;
|
||||
proxy->r_cnt = 1;
|
||||
proxy->r_type = RL_READER;
|
||||
proxy->r_proxy = B_TRUE;
|
||||
proxy->r_write_wanted = B_FALSE;
|
||||
proxy->r_read_wanted = B_FALSE;
|
||||
avl_add(tree, proxy);
|
||||
|
||||
return (proxy);
|
||||
}
|
||||
|
||||
/*
|
||||
* Split the range lock at the supplied offset
|
||||
* returning the *front* proxy.
|
||||
*/
|
||||
static rl_t *
|
||||
zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
|
||||
{
|
||||
rl_t *front, *rear;
|
||||
|
||||
ASSERT3U(rl->r_len, >, 1);
|
||||
ASSERT3U(off, >, rl->r_off);
|
||||
ASSERT3U(off, <, rl->r_off + rl->r_len);
|
||||
ASSERT(rl->r_write_wanted == B_FALSE);
|
||||
ASSERT(rl->r_read_wanted == B_FALSE);
|
||||
|
||||
/* create the rear proxy range lock */
|
||||
rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
|
||||
rear->r_off = off;
|
||||
rear->r_len = rl->r_off + rl->r_len - off;
|
||||
rear->r_cnt = rl->r_cnt;
|
||||
rear->r_type = RL_READER;
|
||||
rear->r_proxy = B_TRUE;
|
||||
rear->r_write_wanted = B_FALSE;
|
||||
rear->r_read_wanted = B_FALSE;
|
||||
|
||||
front = zfs_range_proxify(tree, rl);
|
||||
front->r_len = off - rl->r_off;
|
||||
|
||||
avl_insert_here(tree, rear, front, AVL_AFTER);
|
||||
return (front);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create and add a new proxy range lock for the supplied range.
|
||||
*/
|
||||
static void
|
||||
zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
|
||||
{
|
||||
rl_t *rl;
|
||||
|
||||
ASSERT(len);
|
||||
rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
|
||||
rl->r_off = off;
|
||||
rl->r_len = len;
|
||||
rl->r_cnt = 1;
|
||||
rl->r_type = RL_READER;
|
||||
rl->r_proxy = B_TRUE;
|
||||
rl->r_write_wanted = B_FALSE;
|
||||
rl->r_read_wanted = B_FALSE;
|
||||
avl_add(tree, rl);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
|
||||
{
|
||||
rl_t *next;
|
||||
uint64_t off = new->r_off;
|
||||
uint64_t len = new->r_len;
|
||||
|
||||
/*
|
||||
* prev arrives either:
|
||||
* - pointing to an entry at the same offset
|
||||
* - pointing to the entry with the closest previous offset whose
|
||||
* range may overlap with the new range
|
||||
* - null, if there were no ranges starting before the new one
|
||||
*/
|
||||
if (prev) {
|
||||
if (prev->r_off + prev->r_len <= off) {
|
||||
prev = NULL;
|
||||
} else if (prev->r_off != off) {
|
||||
/*
|
||||
* convert to proxy if needed then
|
||||
* split this entry and bump ref count
|
||||
*/
|
||||
prev = zfs_range_split(tree, prev, off);
|
||||
prev = AVL_NEXT(tree, prev); /* move to rear range */
|
||||
}
|
||||
}
|
||||
ASSERT((prev == NULL) || (prev->r_off == off));
|
||||
|
||||
if (prev)
|
||||
next = prev;
|
||||
else
|
||||
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
|
||||
|
||||
if (next == NULL || off + len <= next->r_off) {
|
||||
/* no overlaps, use the original new rl_t in the tree */
|
||||
avl_insert(tree, new, where);
|
||||
return;
|
||||
}
|
||||
|
||||
if (off < next->r_off) {
|
||||
/* Add a proxy for initial range before the overlap */
|
||||
zfs_range_new_proxy(tree, off, next->r_off - off);
|
||||
}
|
||||
|
||||
new->r_cnt = 0; /* will use proxies in tree */
|
||||
/*
|
||||
* We now search forward through the ranges, until we go past the end
|
||||
* of the new range. For each entry we make it a proxy if it
|
||||
* isn't already, then bump its reference count. If there's any
|
||||
* gaps between the ranges then we create a new proxy range.
|
||||
*/
|
||||
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
|
||||
if (off + len <= next->r_off)
|
||||
break;
|
||||
if (prev && prev->r_off + prev->r_len < next->r_off) {
|
||||
/* there's a gap */
|
||||
ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
|
||||
zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
|
||||
next->r_off - (prev->r_off + prev->r_len));
|
||||
}
|
||||
if (off + len == next->r_off + next->r_len) {
|
||||
/* exact overlap with end */
|
||||
next = zfs_range_proxify(tree, next);
|
||||
next->r_cnt++;
|
||||
return;
|
||||
}
|
||||
if (off + len < next->r_off + next->r_len) {
|
||||
/* new range ends in the middle of this block */
|
||||
next = zfs_range_split(tree, next, off + len);
|
||||
next->r_cnt++;
|
||||
return;
|
||||
}
|
||||
ASSERT3U(off + len, >, next->r_off + next->r_len);
|
||||
next = zfs_range_proxify(tree, next);
|
||||
next->r_cnt++;
|
||||
}
|
||||
|
||||
/* Add the remaining end range. */
|
||||
zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
|
||||
(off + len) - (prev->r_off + prev->r_len));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a reader lock can be grabbed, or wait and recheck until available.
|
||||
*/
|
||||
static void
|
||||
zfs_range_lock_reader(znode_t *zp, rl_t *new)
|
||||
{
|
||||
avl_tree_t *tree = &zp->z_range_avl;
|
||||
rl_t *prev, *next;
|
||||
avl_index_t where;
|
||||
uint64_t off = new->r_off;
|
||||
uint64_t len = new->r_len;
|
||||
|
||||
/*
|
||||
* Look for any writer locks in the range.
|
||||
*/
|
||||
retry:
|
||||
prev = avl_find(tree, new, &where);
|
||||
if (prev == NULL)
|
||||
prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
|
||||
|
||||
/*
|
||||
* Check the previous range for a writer lock overlap.
|
||||
*/
|
||||
if (prev && (off < prev->r_off + prev->r_len)) {
|
||||
if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
|
||||
if (!prev->r_read_wanted) {
|
||||
cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
|
||||
prev->r_read_wanted = B_TRUE;
|
||||
}
|
||||
cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
|
||||
goto retry;
|
||||
}
|
||||
if (off + len < prev->r_off + prev->r_len)
|
||||
goto got_lock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Search through the following ranges to see if there's
|
||||
* write lock any overlap.
|
||||
*/
|
||||
if (prev)
|
||||
next = AVL_NEXT(tree, prev);
|
||||
else
|
||||
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
|
||||
for (; next; next = AVL_NEXT(tree, next)) {
|
||||
if (off + len <= next->r_off)
|
||||
goto got_lock;
|
||||
if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
|
||||
if (!next->r_read_wanted) {
|
||||
cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
|
||||
next->r_read_wanted = B_TRUE;
|
||||
}
|
||||
cv_wait(&next->r_rd_cv, &zp->z_range_lock);
|
||||
goto retry;
|
||||
}
|
||||
if (off + len <= next->r_off + next->r_len)
|
||||
goto got_lock;
|
||||
}
|
||||
|
||||
got_lock:
|
||||
/*
|
||||
* Add the read lock, which may involve splitting existing
|
||||
* locks and bumping ref counts (r_cnt).
|
||||
*/
|
||||
zfs_range_add_reader(tree, new, prev, where);
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock a range (offset, length) as either shared (RL_READER)
|
||||
* or exclusive (RL_WRITER). Returns the range lock structure
|
||||
* for later unlocking or reduce range (if entire file
|
||||
* previously locked as RL_WRITER).
|
||||
*/
|
||||
rl_t *
|
||||
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
|
||||
{
|
||||
rl_t *new;
|
||||
|
||||
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
|
||||
|
||||
new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
|
||||
new->r_zp = zp;
|
||||
new->r_off = off;
|
||||
new->r_len = len;
|
||||
new->r_cnt = 1; /* assume it's going to be in the tree */
|
||||
new->r_type = type;
|
||||
new->r_proxy = B_FALSE;
|
||||
new->r_write_wanted = B_FALSE;
|
||||
new->r_read_wanted = B_FALSE;
|
||||
|
||||
mutex_enter(&zp->z_range_lock);
|
||||
if (type == RL_READER) {
|
||||
/*
|
||||
* First check for the usual case of no locks
|
||||
*/
|
||||
if (avl_numnodes(&zp->z_range_avl) == 0)
|
||||
avl_add(&zp->z_range_avl, new);
|
||||
else
|
||||
zfs_range_lock_reader(zp, new);
|
||||
} else
|
||||
zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
|
||||
mutex_exit(&zp->z_range_lock);
|
||||
return (new);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlock a reader lock
|
||||
*/
|
||||
static void
|
||||
zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
|
||||
{
|
||||
avl_tree_t *tree = &zp->z_range_avl;
|
||||
rl_t *rl, *next;
|
||||
uint64_t len;
|
||||
|
||||
/*
|
||||
* The common case is when the remove entry is in the tree
|
||||
* (cnt == 1) meaning there's been no other reader locks overlapping
|
||||
* with this one. Otherwise the remove entry will have been
|
||||
* removed from the tree and replaced by proxies (one or
|
||||
* more ranges mapping to the entire range).
|
||||
*/
|
||||
if (remove->r_cnt == 1) {
|
||||
avl_remove(tree, remove);
|
||||
if (remove->r_write_wanted) {
|
||||
cv_broadcast(&remove->r_wr_cv);
|
||||
cv_destroy(&remove->r_wr_cv);
|
||||
}
|
||||
if (remove->r_read_wanted) {
|
||||
cv_broadcast(&remove->r_rd_cv);
|
||||
cv_destroy(&remove->r_rd_cv);
|
||||
}
|
||||
} else {
|
||||
ASSERT3U(remove->r_cnt, ==, 0);
|
||||
ASSERT3U(remove->r_write_wanted, ==, 0);
|
||||
ASSERT3U(remove->r_read_wanted, ==, 0);
|
||||
/*
|
||||
* Find start proxy representing this reader lock,
|
||||
* then decrement ref count on all proxies
|
||||
* that make up this range, freeing them as needed.
|
||||
*/
|
||||
rl = avl_find(tree, remove, NULL);
|
||||
ASSERT(rl);
|
||||
ASSERT(rl->r_cnt);
|
||||
ASSERT(rl->r_type == RL_READER);
|
||||
for (len = remove->r_len; len != 0; rl = next) {
|
||||
len -= rl->r_len;
|
||||
if (len) {
|
||||
next = AVL_NEXT(tree, rl);
|
||||
ASSERT(next);
|
||||
ASSERT(rl->r_off + rl->r_len == next->r_off);
|
||||
ASSERT(next->r_cnt);
|
||||
ASSERT(next->r_type == RL_READER);
|
||||
}
|
||||
rl->r_cnt--;
|
||||
if (rl->r_cnt == 0) {
|
||||
avl_remove(tree, rl);
|
||||
if (rl->r_write_wanted) {
|
||||
cv_broadcast(&rl->r_wr_cv);
|
||||
cv_destroy(&rl->r_wr_cv);
|
||||
}
|
||||
if (rl->r_read_wanted) {
|
||||
cv_broadcast(&rl->r_rd_cv);
|
||||
cv_destroy(&rl->r_rd_cv);
|
||||
}
|
||||
kmem_free(rl, sizeof (rl_t));
|
||||
}
|
||||
}
|
||||
}
|
||||
kmem_free(remove, sizeof (rl_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlock range and destroy range lock structure.
|
||||
*/
|
||||
void
|
||||
zfs_range_unlock(rl_t *rl)
|
||||
{
|
||||
znode_t *zp = rl->r_zp;
|
||||
|
||||
ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
|
||||
ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
|
||||
ASSERT(!rl->r_proxy);
|
||||
|
||||
mutex_enter(&zp->z_range_lock);
|
||||
if (rl->r_type == RL_WRITER) {
|
||||
/* writer locks can't be shared or split */
|
||||
avl_remove(&zp->z_range_avl, rl);
|
||||
mutex_exit(&zp->z_range_lock);
|
||||
if (rl->r_write_wanted) {
|
||||
cv_broadcast(&rl->r_wr_cv);
|
||||
cv_destroy(&rl->r_wr_cv);
|
||||
}
|
||||
if (rl->r_read_wanted) {
|
||||
cv_broadcast(&rl->r_rd_cv);
|
||||
cv_destroy(&rl->r_rd_cv);
|
||||
}
|
||||
kmem_free(rl, sizeof (rl_t));
|
||||
} else {
|
||||
/*
|
||||
* lock may be shared, let zfs_range_unlock_reader()
|
||||
* release the lock and free the rl_t
|
||||
*/
|
||||
zfs_range_unlock_reader(zp, rl);
|
||||
mutex_exit(&zp->z_range_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Reduce range locked as RL_WRITER from whole file to specified range.
|
||||
* Asserts the whole file is exclusivly locked and so there's only one
|
||||
* entry in the tree.
|
||||
*/
|
||||
void
|
||||
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
|
||||
{
|
||||
znode_t *zp = rl->r_zp;
|
||||
|
||||
/* Ensure there are no other locks */
|
||||
ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
|
||||
ASSERT(rl->r_off == 0);
|
||||
ASSERT(rl->r_type == RL_WRITER);
|
||||
ASSERT(!rl->r_proxy);
|
||||
ASSERT3U(rl->r_len, ==, UINT64_MAX);
|
||||
ASSERT3U(rl->r_cnt, ==, 1);
|
||||
|
||||
mutex_enter(&zp->z_range_lock);
|
||||
rl->r_off = off;
|
||||
rl->r_len = len;
|
||||
mutex_exit(&zp->z_range_lock);
|
||||
if (rl->r_write_wanted)
|
||||
cv_broadcast(&rl->r_wr_cv);
|
||||
if (rl->r_read_wanted)
|
||||
cv_broadcast(&rl->r_rd_cv);
|
||||
}
|
||||
|
||||
/*
|
||||
* AVL comparison function used to order range locks
|
||||
* Locks are ordered on the start offset of the range.
|
||||
*/
|
||||
int
|
||||
zfs_range_compare(const void *arg1, const void *arg2)
|
||||
{
|
||||
const rl_t *rl1 = arg1;
|
||||
const rl_t *rl2 = arg2;
|
||||
|
||||
if (rl1->r_off > rl2->r_off)
|
||||
return (1);
|
||||
if (rl1->r_off < rl2->r_off)
|
||||
return (-1);
|
||||
return (0);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user