Remove dummy znode from zvol_state

struct zvol_state contains a dummy znode, which is around 1KB on x64,
solely so that it can be passed to zfs_range_lock. In reality, apart
from z_range_lock and z_range_avl, zfs_range_lock only needs the znode
for regular files, which means we add 1KB to the structure and gain
nothing.

In this patch, we remove the dummy znode from zvol_state. In order to
do that, we also need to refactor zfs_range_lock a bit. We move the
z_range_lock and z_range_avl pair out of znode_t to form zfs_rlock_t.
This new struct replaces znode_t as the main handle inside the range
lock functions.

We also add pointers to z_size, z_blksz, and z_max_blksz (zr_size,
zr_blksz, and zr_max_blksz in zfs_rlock_t) so the range lock code
doesn't depend on znode_t.  This allows non-ZPL consumers like Lustre
to use the range locks with their own equivalent of the znode_t
structure.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #4510
This commit is contained in:
Chunwei Chen
2016-04-11 14:53:48 -07:00
committed by Brian Behlendorf
parent 61a3d06f84
commit d88895a069
8 changed files with 105 additions and 76 deletions
-1
View File
@@ -485,7 +485,6 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
zp->z_gid = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
zp->z_is_zvol = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_TRUE;
zp->z_is_sa = B_FALSE;
+43 -36
View File
@@ -101,9 +101,9 @@
* Check if a write lock can be grabbed, or wait and recheck until available.
*/
static void
zfs_range_lock_writer(znode_t *zp, rl_t *new)
zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *rl;
avl_index_t where;
uint64_t end_size;
@@ -112,32 +112,32 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
for (;;) {
/*
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
* a "sa" data or zfs_sb_t - so skip that processing.
* Range locking is also used by zvol. However, for zvol, we
* don't need to append or grow blocksize, so skip that
* processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
* we could make the range locking code generically available
* to other non-zfs consumers.
*/
if (!zp->z_is_zvol) { /* caller is ZPL */
if (zrl->zr_size) { /* caller is ZPL */
/*
* If in append mode pick up the current end of file.
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
new->r_off = zp->z_size;
new->r_off = *zrl->zr_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
end_size = MAX(zp->z_size, new->r_off + len);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
end_size = MAX(*zrl->zr_size, new->r_off + len);
if (end_size > *zrl->zr_blksz &&
(!ISP2(*zrl->zr_blksz) ||
*zrl->zr_blksz < *zrl->zr_max_blksz)) {
new->r_off = 0;
new->r_len = UINT64_MAX;
}
@@ -175,7 +175,7 @@ wait:
cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
rl->r_write_wanted = B_TRUE;
}
cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);
/* reset to original */
new->r_off = off;
@@ -353,9 +353,9 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* Check if a reader lock can be grabbed, or wait and recheck until available.
*/
static void
zfs_range_lock_reader(znode_t *zp, rl_t *new)
zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *prev, *next;
avl_index_t where;
uint64_t off = new->r_off;
@@ -378,7 +378,7 @@ retry:
cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
prev->r_read_wanted = B_TRUE;
}
cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
goto retry;
}
if (off + len < prev->r_off + prev->r_len)
@@ -401,7 +401,7 @@ retry:
cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
next->r_read_wanted = B_TRUE;
}
cv_wait(&next->r_rd_cv, &zp->z_range_lock);
cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
goto retry;
}
if (off + len <= next->r_off + next->r_len)
@@ -423,14 +423,14 @@ got_lock:
* previously locked as RL_WRITER).
*/
rl_t *
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type)
{
rl_t *new;
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
new->r_zp = zp;
new->r_zrl = zrl;
new->r_off = off;
if (len + off < off) /* overflow */
len = UINT64_MAX - off;
@@ -441,18 +441,18 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
new->r_write_wanted = B_FALSE;
new->r_read_wanted = B_FALSE;
mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
if (type == RL_READER) {
/*
* First check for the usual case of no locks
*/
if (avl_numnodes(&zp->z_range_avl) == 0)
avl_add(&zp->z_range_avl, new);
if (avl_numnodes(&zrl->zr_avl) == 0)
avl_add(&zrl->zr_avl, new);
else
zfs_range_lock_reader(zp, new);
} else
zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
mutex_exit(&zp->z_range_lock);
zfs_range_lock_reader(zrl, new);
} else /* RL_WRITER or RL_APPEND */
zfs_range_lock_writer(zrl, new);
mutex_exit(&zrl->zr_mutex);
return (new);
}
@@ -474,9 +474,9 @@ zfs_range_free(void *arg)
* Unlock a reader lock
*/
static void
zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *rl, *next = NULL;
uint64_t len;
@@ -543,7 +543,7 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
void
zfs_range_unlock(rl_t *rl)
{
znode_t *zp = rl->r_zp;
zfs_rlock_t *zrl = rl->r_zrl;
list_t free_list;
rl_t *free_rl;
@@ -552,10 +552,10 @@ zfs_range_unlock(rl_t *rl)
ASSERT(!rl->r_proxy);
list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
if (rl->r_type == RL_WRITER) {
/* writer locks can't be shared or split */
avl_remove(&zp->z_range_avl, rl);
avl_remove(&zrl->zr_avl, rl);
if (rl->r_write_wanted)
cv_broadcast(&rl->r_wr_cv);
@@ -568,9 +568,9 @@ zfs_range_unlock(rl_t *rl)
* lock may be shared, let zfs_range_unlock_reader()
* release the zp->z_range_lock lock and free the rl_t
*/
zfs_range_unlock_reader(zp, rl, &free_list);
zfs_range_unlock_reader(zrl, rl, &free_list);
}
mutex_exit(&zp->z_range_lock);
mutex_exit(&zrl->zr_mutex);
while ((free_rl = list_head(&free_list)) != NULL) {
list_remove(&free_list, free_rl);
@@ -588,17 +588,17 @@ zfs_range_unlock(rl_t *rl)
void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
znode_t *zp = rl->r_zp;
zfs_rlock_t *zrl = rl->r_zrl;
/* Ensure there are no other locks */
ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
ASSERT(rl->r_off == 0);
ASSERT(rl->r_type == RL_WRITER);
ASSERT(!rl->r_proxy);
ASSERT3U(rl->r_len, ==, UINT64_MAX);
ASSERT3U(rl->r_cnt, ==, 1);
mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
rl->r_off = off;
rl->r_len = len;
@@ -607,7 +607,7 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
if (rl->r_read_wanted)
cv_broadcast(&rl->r_rd_cv);
mutex_exit(&zp->z_range_lock);
mutex_exit(&zrl->zr_mutex);
}
/*
@@ -626,3 +626,10 @@ zfs_range_compare(const void *arg1, const void *arg2)
return (-1);
return (0);
}
#ifdef _KERNEL
EXPORT_SYMBOL(zfs_range_lock);
EXPORT_SYMBOL(zfs_range_unlock);
EXPORT_SYMBOL(zfs_range_reduce);
EXPORT_SYMBOL(zfs_range_compare);
#endif
+9 -7
View File
@@ -483,7 +483,8 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
/*
* Lock the range against changes.
*/
rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
rl = zfs_range_lock(&zp->z_range_lock, uio->uio_loffset, uio->uio_resid,
RL_READER);
/*
* If we are reading past end-of-file we can skip
@@ -673,7 +674,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
rl = zfs_range_lock(zp, 0, n, RL_APPEND);
rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND);
woff = rl->r_off;
if (rl->r_len == UINT64_MAX) {
/*
@@ -690,7 +691,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
rl = zfs_range_lock(zp, woff, n, RL_WRITER);
rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER);
}
if (woff >= limit) {
@@ -1016,7 +1017,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size,
RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
@@ -1037,8 +1039,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_rl = zfs_range_lock(zp, offset, size,
RL_READER);
zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset,
size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
@@ -4007,7 +4009,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
redirty_page_for_writepage(wbc, pp);
unlock_page(pp);
rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER);
lock_page(pp);
/* Page mapping changed or it was no longer dirty, we're done */
+8 -9
View File
@@ -113,9 +113,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
zfs_rlock_init(&zp->z_range_lock);
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
@@ -137,8 +135,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
zfs_rlock_destroy(&zp->z_range_lock);
ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
@@ -615,10 +612,12 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_is_zvol = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_FALSE;
zp->z_is_stale = B_FALSE;
zp->z_range_lock.zr_size = &zp->z_size;
zp->z_range_lock.zr_blksz = &zp->z_blksz;
zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
zfs_znode_sa_init(zsb, zp, db, obj_type, hdl);
@@ -1403,7 +1402,7 @@ zfs_extend(znode_t *zp, uint64_t end)
/*
* We will change zp_size, lock the whole file.
*/
rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
/*
* Nothing to do if file already at desired length.
@@ -1520,7 +1519,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
/*
* Lock the range being freed.
*/
rl = zfs_range_lock(zp, off, len, RL_WRITER);
rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
/*
* Nothing to do if file already at desired length.
@@ -1602,7 +1601,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
/*
* We will change zp_size, lock the whole file.
*/
rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
/*
* Nothing to do if file already at desired length.
+10 -13
View File
@@ -75,7 +75,7 @@ typedef struct zvol_state {
uint32_t zv_open_count; /* open counts */
uint32_t zv_changed; /* disk changed */
zilog_t *zv_zilog; /* ZIL handle */
znode_t zv_znode; /* for range locking */
zfs_rlock_t zv_range_lock; /* range lock */
dmu_buf_t *zv_dbuf; /* bonus handle */
dev_t zv_dev; /* device id */
struct gendisk *zv_disk; /* generic disk */
@@ -633,8 +633,8 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
ASSERT(zv && zv->zv_open_count > 0);
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_WRITER);
rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset,
uio->uio_resid, RL_WRITER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -725,7 +725,7 @@ zvol_discard(struct bio *bio)
if (start >= end)
return (0);
rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -752,8 +752,8 @@ zvol_read(zvol_state_t *zv, uio_t *uio)
ASSERT(zv && zv->zv_open_count > 0);
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset,
uio->uio_resid, RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -879,7 +879,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -1305,10 +1306,7 @@ zvol_alloc(dev_t dev, const char *name)
zv->zv_open_count = 0;
strlcpy(zv->zv_name, name, MAXNAMELEN);
mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
zv->zv_znode.z_is_zvol = TRUE;
zfs_rlock_init(&zv->zv_range_lock);
zv->zv_disk->major = zvol_major;
zv->zv_disk->first_minor = (dev & MINORMASK);
@@ -1337,8 +1335,7 @@ zvol_free(zvol_state_t *zv)
ASSERT(MUTEX_HELD(&zvol_state_lock));
ASSERT(zv->zv_open_count == 0);
avl_destroy(&zv->zv_znode.z_range_avl);
mutex_destroy(&zv->zv_znode.z_range_lock);
zfs_rlock_destroy(&zv->zv_range_lock);
zv->zv_disk->private_data = NULL;