OpenZFS 9689 - zfs range lock code should not be zpl-specific

The ZFS range locking code in zfs_rlock.c/h depends on ZPL-specific
data structures, specifically znode_t.  However, it's also used by
the ZVOL code, which uses a "dummy" znode_t to pass to the range
locking code.

We should clean this up so that the range locking code is generic
and can be used equally by ZPL and ZVOL, and also can be used by
future consumers that may need to run in userland (libzpool) as
well as the kernel.

Porting notes:
* Added missing sys/avl.h include to sys/zfs_rlock.h.
* Removed 'dbuf is within the locked range' ASSERTs from dmu_sync().
  This was needed because ztest does not yet use a locked_range_t.
* Removed "Approved by:" tag requirement from OpenZFS commit
  check to prevent needless warnings when integrating changes
  which have not been merged to illumos.
* Reverted free_list range lock changes which were originally
  needed to defer the cv_destroy() which was called immediately
  after cv_broadcast().  With d2733258 this should be safe but
  if not we may need to reintroduce this logic.
* Reverts: The following two commits were reverted and squashed
  into this change in order to make it easier to apply OpenZFS 9689.
  - d88895a0, which removed the dummy znode from zvol_state
  - e3a07cd0, which updated ztest to use range locks
* Preserved optimized rangelock comparison function.  Preserved the
  rangelock free list.  The cv_destroy() function will block waiting
  for all processes in cv_wait() to be scheduled and drop their
  reference.  This is done to ensure it's safe to free the condition
  variable.  However, blocking while holding the rl->rl_lock mutex
  can result in a deadlock on Linux.  A free list is introduced to
  defer the cv_destroy() and kmem_free() until after the mutex is
  released.

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://illumos.org/issues/9689
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/680
External-issue: DLPX-58662
Closes #7980
This commit is contained in:
Matt Ahrens 2018-10-01 15:13:12 -07:00 committed by Brian Behlendorf
parent 50a343d85c
commit 5d43cc9a59
10 changed files with 484 additions and 595 deletions

View File

@ -104,7 +104,6 @@
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/zil_impl.h> #include <sys/zil_impl.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/vdev_file.h> #include <sys/vdev_file.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
@ -258,6 +257,17 @@ typedef struct bufwad {
uint64_t bw_data; uint64_t bw_data;
} bufwad_t; } bufwad_t;
/*
* It would be better to use a rangelock_t per object. Unfortunately
* the rangelock_t is not a drop-in replacement for rl_t, because we
* still need to map from object ID to rangelock_t.
*/
typedef enum {
RL_READER,
RL_WRITER,
RL_APPEND
} rl_type_t;
typedef struct rll { typedef struct rll {
void *rll_writer; void *rll_writer;
int rll_readers; int rll_readers;
@ -265,10 +275,12 @@ typedef struct rll {
kcondvar_t rll_cv; kcondvar_t rll_cv;
} rll_t; } rll_t;
typedef struct zll { typedef struct rl {
list_t z_list; uint64_t rl_object;
kmutex_t z_lock; uint64_t rl_offset;
} zll_t; uint64_t rl_size;
rll_t *rl_lock;
} rl_t;
#define ZTEST_RANGE_LOCKS 64 #define ZTEST_RANGE_LOCKS 64
#define ZTEST_OBJECT_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64
@ -301,7 +313,7 @@ typedef struct ztest_ds {
char zd_name[ZFS_MAX_DATASET_NAME_LEN]; char zd_name[ZFS_MAX_DATASET_NAME_LEN];
kmutex_t zd_dirobj_lock; kmutex_t zd_dirobj_lock;
rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
zll_t zd_range_lock[ZTEST_RANGE_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t; } ztest_ds_t;
/* /*
@ -1318,100 +1330,6 @@ ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
return (err); return (err);
} }
/*
* Object and range lock mechanics
*/
typedef struct {
list_node_t z_lnode;
zfs_refcount_t z_refcnt;
uint64_t z_object;
zfs_rlock_t z_range_lock;
} ztest_znode_t;
typedef struct {
rl_t *z_rl;
ztest_znode_t *z_ztznode;
} ztest_zrl_t;
static ztest_znode_t *
ztest_znode_init(uint64_t object)
{
ztest_znode_t *zp = umem_alloc(sizeof (*zp), UMEM_NOFAIL);
list_link_init(&zp->z_lnode);
zfs_refcount_create(&zp->z_refcnt);
zp->z_object = object;
zfs_rlock_init(&zp->z_range_lock);
return (zp);
}
static void
ztest_znode_fini(ztest_znode_t *zp)
{
ASSERT(zfs_refcount_is_zero(&zp->z_refcnt));
zfs_rlock_destroy(&zp->z_range_lock);
zp->z_object = 0;
zfs_refcount_destroy(&zp->z_refcnt);
list_link_init(&zp->z_lnode);
umem_free(zp, sizeof (*zp));
}
static void
ztest_zll_init(zll_t *zll)
{
mutex_init(&zll->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zll->z_list, sizeof (ztest_znode_t),
offsetof(ztest_znode_t, z_lnode));
}
static void
ztest_zll_destroy(zll_t *zll)
{
list_destroy(&zll->z_list);
mutex_destroy(&zll->z_lock);
}
#define RL_TAG "range_lock"
static ztest_znode_t *
ztest_znode_get(ztest_ds_t *zd, uint64_t object)
{
zll_t *zll = &zd->zd_range_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
ztest_znode_t *zp = NULL;
mutex_enter(&zll->z_lock);
for (zp = list_head(&zll->z_list); (zp);
zp = list_next(&zll->z_list, zp)) {
if (zp->z_object == object) {
zfs_refcount_add(&zp->z_refcnt, RL_TAG);
break;
}
}
if (zp == NULL) {
zp = ztest_znode_init(object);
zfs_refcount_add(&zp->z_refcnt, RL_TAG);
list_insert_head(&zll->z_list, zp);
}
mutex_exit(&zll->z_lock);
return (zp);
}
static void
ztest_znode_put(ztest_ds_t *zd, ztest_znode_t *zp)
{
zll_t *zll = NULL;
ASSERT3U(zp->z_object, !=, 0);
zll = &zd->zd_range_lock[zp->z_object & (ZTEST_OBJECT_LOCKS - 1)];
mutex_enter(&zll->z_lock);
zfs_refcount_remove(&zp->z_refcnt, RL_TAG);
if (zfs_refcount_is_zero(&zp->z_refcnt)) {
list_remove(&zll->z_list, zp);
ztest_znode_fini(zp);
}
mutex_exit(&zll->z_lock);
}
static void static void
ztest_rll_init(rll_t *rll) ztest_rll_init(rll_t *rll)
{ {
@ -1484,37 +1402,33 @@ ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
ztest_rll_unlock(rll); ztest_rll_unlock(rll);
} }
static ztest_zrl_t * static rl_t *
ztest_zrl_init(rl_t *rl, ztest_znode_t *zp)
{
ztest_zrl_t *zrl = umem_alloc(sizeof (*zrl), UMEM_NOFAIL);
zrl->z_rl = rl;
zrl->z_ztznode = zp;
return (zrl);
}
static void
ztest_zrl_fini(ztest_zrl_t *zrl)
{
umem_free(zrl, sizeof (*zrl));
}
static ztest_zrl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
uint64_t size, rl_type_t type) uint64_t size, rl_type_t type)
{ {
ztest_znode_t *zp = ztest_znode_get(zd, object); uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
rl_t *rl = zfs_range_lock(&zp->z_range_lock, offset, rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
size, type); rl_t *rl;
return (ztest_zrl_init(rl, zp));
rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
rl->rl_object = object;
rl->rl_offset = offset;
rl->rl_size = size;
rl->rl_lock = rll;
ztest_rll_lock(rll, type);
return (rl);
} }
static void static void
ztest_range_unlock(ztest_ds_t *zd, ztest_zrl_t *zrl) ztest_range_unlock(rl_t *rl)
{ {
zfs_range_unlock(zrl->z_rl); rll_t *rll = rl->rl_lock;
ztest_znode_put(zd, zrl->z_ztznode);
ztest_zrl_fini(zrl); ztest_rll_unlock(rll);
umem_free(rl, sizeof (*rl));
} }
static void static void
@ -1536,7 +1450,7 @@ ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
ztest_rll_init(&zd->zd_object_lock[l]); ztest_rll_init(&zd->zd_object_lock[l]);
for (l = 0; l < ZTEST_RANGE_LOCKS; l++) for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
ztest_zll_init(&zd->zd_range_lock[l]); ztest_rll_init(&zd->zd_range_lock[l]);
} }
static void static void
@ -1551,7 +1465,7 @@ ztest_zd_fini(ztest_ds_t *zd)
ztest_rll_destroy(&zd->zd_object_lock[l]); ztest_rll_destroy(&zd->zd_object_lock[l]);
for (l = 0; l < ZTEST_RANGE_LOCKS; l++) for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
ztest_zll_destroy(&zd->zd_range_lock[l]); ztest_rll_destroy(&zd->zd_range_lock[l]);
} }
#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
@ -1967,7 +1881,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_t *tx; dmu_tx_t *tx;
dmu_buf_t *db; dmu_buf_t *db;
arc_buf_t *abuf = NULL; arc_buf_t *abuf = NULL;
ztest_zrl_t *rl; rl_t *rl;
if (byteswap) if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr)); byteswap_uint64_array(lr, sizeof (*lr));
@ -2016,7 +1930,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
if (abuf != NULL) if (abuf != NULL)
dmu_return_arcbuf(abuf); dmu_return_arcbuf(abuf);
dmu_buf_rele(db, FTAG); dmu_buf_rele(db, FTAG);
ztest_range_unlock(zd, rl); ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid); ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC); return (ENOSPC);
} }
@ -2074,7 +1988,7 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_commit(tx); dmu_tx_commit(tx);
ztest_range_unlock(zd, rl); ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid); ztest_object_unlock(zd, lr->lr_foid);
return (0); return (0);
@ -2088,7 +2002,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
objset_t *os = zd->zd_os; objset_t *os = zd->zd_os;
dmu_tx_t *tx; dmu_tx_t *tx;
uint64_t txg; uint64_t txg;
ztest_zrl_t *rl; rl_t *rl;
if (byteswap) if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr)); byteswap_uint64_array(lr, sizeof (*lr));
@ -2103,7 +2017,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
if (txg == 0) { if (txg == 0) {
ztest_range_unlock(zd, rl); ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid); ztest_object_unlock(zd, lr->lr_foid);
return (ENOSPC); return (ENOSPC);
} }
@ -2115,7 +2029,7 @@ ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_commit(tx); dmu_tx_commit(tx);
ztest_range_unlock(zd, rl); ztest_range_unlock(rl);
ztest_object_unlock(zd, lr->lr_foid); ztest_object_unlock(zd, lr->lr_foid);
return (0); return (0);
@ -2222,30 +2136,23 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
/* /*
* ZIL get_data callbacks * ZIL get_data callbacks
*/ */
typedef struct ztest_zgd_private {
ztest_ds_t *z_zd;
ztest_zrl_t *z_rl;
uint64_t z_object;
} ztest_zgd_private_t;
static void static void
ztest_get_done(zgd_t *zgd, int error) ztest_get_done(zgd_t *zgd, int error)
{ {
ztest_zgd_private_t *zzp = zgd->zgd_private; ztest_ds_t *zd = zgd->zgd_private;
ztest_ds_t *zd = zzp->z_zd; uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;
uint64_t object = zzp->z_object;
if (zgd->zgd_db) if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd); dmu_buf_rele(zgd->zgd_db, zgd);
ztest_range_unlock(zd, zzp->z_rl); ztest_range_unlock((rl_t *)zgd->zgd_lr);
ztest_object_unlock(zd, object); ztest_object_unlock(zd, object);
if (error == 0 && zgd->zgd_bp) if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
umem_free(zgd, sizeof (*zgd)); umem_free(zgd, sizeof (*zgd));
umem_free(zzp, sizeof (*zzp));
} }
static int static int
@ -2263,7 +2170,6 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
dmu_buf_t *db; dmu_buf_t *db;
zgd_t *zgd; zgd_t *zgd;
int error; int error;
ztest_zgd_private_t *zgd_private;
ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL); ASSERT3P(zio, !=, NULL);
@ -2290,15 +2196,11 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
zgd->zgd_lwb = lwb; zgd->zgd_lwb = lwb;
zgd_private = umem_zalloc(sizeof (ztest_zgd_private_t), UMEM_NOFAIL); zgd->zgd_private = zd;
zgd_private->z_zd = zd;
zgd_private->z_object = object;
zgd->zgd_private = zgd_private;
if (buf != NULL) { /* immediate write */ if (buf != NULL) { /* immediate write */
zgd_private->z_rl = ztest_range_lock(zd, object, offset, size, zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
RL_READER); object, offset, size, RL_READER);
zgd->zgd_rl = zgd_private->z_rl->z_rl;
error = dmu_read(os, object, offset, size, buf, error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
@ -2312,9 +2214,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
offset = 0; offset = 0;
} }
zgd_private->z_rl = ztest_range_lock(zd, object, offset, size, zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
RL_READER); object, offset, size, RL_READER);
zgd->zgd_rl = zgd_private->z_rl->z_rl;
error = dmu_buf_hold(os, object, offset, zgd, &db, error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
@ -2560,7 +2461,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
objset_t *os = zd->zd_os; objset_t *os = zd->zd_os;
dmu_tx_t *tx; dmu_tx_t *tx;
uint64_t txg; uint64_t txg;
ztest_zrl_t *rl; rl_t *rl;
txg_wait_synced(dmu_objset_pool(os), 0); txg_wait_synced(dmu_objset_pool(os), 0);
@ -2581,7 +2482,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
(void) dmu_free_long_range(os, object, offset, size); (void) dmu_free_long_range(os, object, offset, size);
} }
ztest_range_unlock(zd, rl); ztest_range_unlock(rl);
ztest_object_unlock(zd, object); ztest_object_unlock(zd, object);
} }

View File

@ -73,6 +73,7 @@ struct arc_buf;
struct zio_prop; struct zio_prop;
struct sa_handle; struct sa_handle;
struct dsl_crypto_params; struct dsl_crypto_params;
struct locked_range;
typedef struct objset objset_t; typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t; typedef struct dmu_tx dmu_tx_t;
@ -1034,7 +1035,7 @@ typedef struct zgd {
struct lwb *zgd_lwb; struct lwb *zgd_lwb;
struct blkptr *zgd_bp; struct blkptr *zgd_bp;
dmu_buf_t *zgd_db; dmu_buf_t *zgd_db;
struct rl *zgd_rl; struct locked_range *zgd_lr;
void *zgd_private; void *zgd_private;
} zgd_t; } zgd_t;

View File

@ -22,6 +22,9 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright (c) 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_FS_ZFS_RLOCK_H #ifndef _SYS_FS_ZFS_RLOCK_H
#define _SYS_FS_ZFS_RLOCK_H #define _SYS_FS_ZFS_RLOCK_H
@ -30,85 +33,46 @@
extern "C" { extern "C" {
#endif #endif
#include <sys/list.h>
#include <sys/avl.h> #include <sys/avl.h>
#ifdef _KERNEL
#include <sys/condvar.h>
#else
#include <sys/zfs_context.h>
#endif
typedef enum { typedef enum {
RL_READER, RL_READER,
RL_WRITER, RL_WRITER,
RL_APPEND RL_APPEND
} rl_type_t; } rangelock_type_t;
typedef struct zfs_rlock { struct locked_range;
kmutex_t zr_mutex; /* protects changes to zr_avl */
avl_tree_t zr_avl; /* avl tree of range locks */
uint64_t *zr_size; /* points to znode->z_size */
uint_t *zr_blksz; /* points to znode->z_blksz */
uint64_t *zr_max_blksz; /* points to zfsvfs->z_max_blksz */
} zfs_rlock_t;
typedef struct rl { typedef void (rangelock_cb_t)(struct locked_range *, void *);
zfs_rlock_t *r_zrl;
avl_node_t r_node; /* avl node link */
uint64_t r_off; /* file range offset */
uint64_t r_len; /* file range length */
uint_t r_cnt; /* range reference count in tree */
rl_type_t r_type; /* range type */
kcondvar_t r_wr_cv; /* cv for waiting writers */
kcondvar_t r_rd_cv; /* cv for waiting readers */
uint8_t r_proxy; /* acting for original range */
uint8_t r_write_wanted; /* writer wants to lock this range */
uint8_t r_read_wanted; /* reader wants to lock this range */
list_node_t rl_node; /* used for deferred release */
} rl_t;
/* typedef struct rangelock {
* Lock a range (offset, length) as either shared (RL_READER) avl_tree_t rl_tree; /* contains locked_range_t */
* or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that kmutex_t rl_lock;
* is converted to RL_WRITER that specified to lock from the start of the rangelock_cb_t *rl_cb;
* end of file. Returns the range lock structure. void *rl_arg;
*/ } rangelock_t;
rl_t *zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
rl_type_t type);
/* Unlock range and destroy range lock structure. */ typedef struct locked_range {
void zfs_range_unlock(rl_t *rl); rangelock_t *lr_rangelock; /* rangelock that this lock applies to */
avl_node_t lr_node; /* avl node link */
uint64_t lr_offset; /* file range offset */
uint64_t lr_length; /* file range length */
uint_t lr_count; /* range reference count in tree */
rangelock_type_t lr_type; /* range type */
kcondvar_t lr_write_cv; /* cv for waiting writers */
kcondvar_t lr_read_cv; /* cv for waiting readers */
uint8_t lr_proxy; /* acting for original range */
uint8_t lr_write_wanted; /* writer wants to lock this range */
uint8_t lr_read_wanted; /* reader wants to lock this range */
} locked_range_t;
/* void rangelock_init(rangelock_t *, rangelock_cb_t *, void *);
* Reduce range locked as RW_WRITER from whole file to specified range. void rangelock_fini(rangelock_t *);
* Asserts the whole file was previously locked.
*/
void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
/* locked_range_t *rangelock_enter(rangelock_t *,
* AVL comparison function used to order range locks uint64_t, uint64_t, rangelock_type_t);
* Locks are ordered on the start offset of the range. void rangelock_exit(locked_range_t *);
*/ void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
int zfs_range_compare(const void *arg1, const void *arg2);
static inline void
zfs_rlock_init(zfs_rlock_t *zrl)
{
mutex_init(&zrl->zr_mutex, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zrl->zr_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
zrl->zr_size = NULL;
zrl->zr_blksz = NULL;
zrl->zr_max_blksz = NULL;
}
static inline void
zfs_rlock_destroy(zfs_rlock_t *zrl)
{
avl_destroy(&zrl->zr_avl);
mutex_destroy(&zrl->zr_mutex);
}
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/ */
@ -191,7 +191,7 @@ typedef struct znode {
krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */ krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
zfs_rlock_t z_range_lock; /* file range lock */ rangelock_t z_rangelock; /* file range locks */
uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */ uint8_t z_zn_prefetch; /* Prefetch znodes? */

View File

@ -1924,11 +1924,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
ASSERT(pio != NULL); ASSERT(pio != NULL);
ASSERT(txg != 0); ASSERT(txg != 0);
/* dbuf is within the locked range */
ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
ASSERT3U(db->db.db_offset + db->db.db_size, <=,
zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
SET_BOOKMARK(&zb, ds->ds_object, SET_BOOKMARK(&zb, ds->ds_object,
db->db.db_object, db->db_level, db->db_blkid); db->db.db_object, db->db_level, db->db_blkid);

View File

@ -23,7 +23,7 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/* /*
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/ */
/* /*
@ -34,9 +34,9 @@
* Interface * Interface
* --------- * ---------
* Defined in zfs_rlock.h but essentially: * Defined in zfs_rlock.h but essentially:
* rl = zfs_range_lock(zp, off, len, lock_type); * lr = rangelock_enter(zp, off, len, lock_type);
* zfs_range_unlock(rl); * rangelock_reduce(lr, off, len); // optional
* zfs_range_reduce(rl, off, len); * rangelock_exit(lr);
* *
* AVL tree * AVL tree
* -------- * --------
@ -46,9 +46,10 @@
* *
* Common case * Common case
* ----------- * -----------
* The (hopefully) usual case is of no overlaps or contention for * The (hopefully) usual case is of no overlaps or contention for locks. On
* locks. On entry to zfs_lock_range() a rl_t is allocated; the tree * entry to rangelock_enter(), a locked_range_t is allocated; the tree
* searched that finds no overlap, and *this* rl_t is placed in the tree. * searched that finds no overlap, and *this* locked_range_t is placed in the
* tree.
* *
* Overlaps/Reference counting/Proxy locks * Overlaps/Reference counting/Proxy locks
* --------------------------------------- * ---------------------------------------
@ -87,68 +88,85 @@
* *
* Grow block handling * Grow block handling
* ------------------- * -------------------
* ZFS supports multiple block sizes currently up to 128K. The smallest * ZFS supports multiple block sizes, up to 16MB. The smallest
* block size is used for the file which is grown as needed. During this * block size is used for the file which is grown as needed. During this
* growth all other writers and readers must be excluded. * growth all other writers and readers must be excluded.
* So if the block size needs to be grown then the whole file is * So if the block size needs to be grown then the whole file is
* exclusively locked, then later the caller will reduce the lock * exclusively locked, then later the caller will reduce the lock
* range to just the range to be written using zfs_reduce_range. * range to just the range to be written using rangelock_reduce().
*/ */
#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h> #include <sys/zfs_rlock.h>
#include <sys/sysmacros.h>
/*
* AVL comparison function used to order range locks
* Locks are ordered on the start offset of the range.
*/
static int
rangelock_compare(const void *arg1, const void *arg2)
{
const locked_range_t *rl1 = (const locked_range_t *)arg1;
const locked_range_t *rl2 = (const locked_range_t *)arg2;
return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
}
/*
* The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
* It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
* and may increase the range that's locked for RL_WRITER.
*/
void
rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
{
mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&rl->rl_tree, rangelock_compare,
sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
rl->rl_cb = cb;
rl->rl_arg = arg;
}
void
rangelock_fini(rangelock_t *rl)
{
mutex_destroy(&rl->rl_lock);
avl_destroy(&rl->rl_tree);
}
/* /*
* Check if a write lock can be grabbed, or wait and recheck until available. * Check if a write lock can be grabbed, or wait and recheck until available.
*/ */
static void static void
zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new) rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{ {
avl_tree_t *tree = &zrl->zr_avl; avl_tree_t *tree = &rl->rl_tree;
rl_t *rl; locked_range_t *lr;
avl_index_t where; avl_index_t where;
uint64_t end_size; uint64_t orig_off = new->lr_offset;
uint64_t off = new->r_off; uint64_t orig_len = new->lr_length;
uint64_t len = new->r_len; rangelock_type_t orig_type = new->lr_type;
for (;;) { for (;;) {
/* /*
* Range locking is also used by zvol. However, for zvol, we * Call callback which can modify new->r_off,len,type.
* don't need to append or grow blocksize, so skip that * Note, the callback is used by the ZPL to handle appending
* processing. * and changing blocksizes. It isn't needed for zvols.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
* we could make the range locking code generically available
* to other non-zfs consumers.
*/ */
if (zrl->zr_size) { /* caller is ZPL */ if (rl->rl_cb != NULL) {
/* rl->rl_cb(new, rl->rl_arg);
* If in append mode pick up the current end of file.
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
new->r_off = *zrl->zr_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
end_size = MAX(*zrl->zr_size, new->r_off + len);
if (end_size > *zrl->zr_blksz &&
(!ISP2(*zrl->zr_blksz) ||
*zrl->zr_blksz < *zrl->zr_max_blksz)) {
new->r_off = 0;
new->r_len = UINT64_MAX;
}
} }
/*
* If the type was APPEND, the callback must convert it to
* WRITER.
*/
ASSERT3U(new->lr_type, ==, RL_WRITER);
/* /*
* First check for the usual case of no locks * First check for the usual case of no locks
*/ */
if (avl_numnodes(tree) == 0) { if (avl_numnodes(tree) == 0) {
new->r_type = RL_WRITER; /* convert to writer */
avl_add(tree, new); avl_add(tree, new);
return; return;
} }
@ -156,31 +174,33 @@ zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new)
/* /*
* Look for any locks in the range. * Look for any locks in the range.
*/ */
rl = avl_find(tree, new, &where); lr = avl_find(tree, new, &where);
if (rl) if (lr != NULL)
goto wait; /* already locked at same offset */ goto wait; /* already locked at same offset */
rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
if (rl && (rl->r_off < new->r_off + new->r_len)) if (lr != NULL &&
lr->lr_offset < new->lr_offset + new->lr_length)
goto wait; goto wait;
rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
if (rl && rl->r_off + rl->r_len > new->r_off) if (lr != NULL &&
lr->lr_offset + lr->lr_length > new->lr_offset)
goto wait; goto wait;
new->r_type = RL_WRITER; /* convert possible RL_APPEND */
avl_insert(tree, new, where); avl_insert(tree, new, where);
return; return;
wait: wait:
if (!rl->r_write_wanted) { if (!lr->lr_write_wanted) {
cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
rl->r_write_wanted = B_TRUE; lr->lr_write_wanted = B_TRUE;
} }
cv_wait(&rl->r_wr_cv, &zrl->zr_mutex); cv_wait(&lr->lr_write_cv, &rl->rl_lock);
/* reset to original */ /* reset to original */
new->r_off = off; new->lr_offset = orig_off;
new->r_len = len; new->lr_length = orig_len;
new->lr_type = orig_type;
} }
} }
@ -188,29 +208,29 @@ wait:
* If this is an original (non-proxy) lock then replace it by * If this is an original (non-proxy) lock then replace it by
* a proxy and return the proxy. * a proxy and return the proxy.
*/ */
static rl_t * static locked_range_t *
zfs_range_proxify(avl_tree_t *tree, rl_t *rl) rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
{ {
rl_t *proxy; locked_range_t *proxy;
if (rl->r_proxy) if (lr->lr_proxy)
return (rl); /* already a proxy */ return (lr); /* already a proxy */
ASSERT3U(rl->r_cnt, ==, 1); ASSERT3U(lr->lr_count, ==, 1);
ASSERT(rl->r_write_wanted == B_FALSE); ASSERT(lr->lr_write_wanted == B_FALSE);
ASSERT(rl->r_read_wanted == B_FALSE); ASSERT(lr->lr_read_wanted == B_FALSE);
avl_remove(tree, rl); avl_remove(tree, lr);
rl->r_cnt = 0; lr->lr_count = 0;
/* create a proxy range lock */ /* create a proxy range lock */
proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
proxy->r_off = rl->r_off; proxy->lr_offset = lr->lr_offset;
proxy->r_len = rl->r_len; proxy->lr_length = lr->lr_length;
proxy->r_cnt = 1; proxy->lr_count = 1;
proxy->r_type = RL_READER; proxy->lr_type = RL_READER;
proxy->r_proxy = B_TRUE; proxy->lr_proxy = B_TRUE;
proxy->r_write_wanted = B_FALSE; proxy->lr_write_wanted = B_FALSE;
proxy->r_read_wanted = B_FALSE; proxy->lr_read_wanted = B_FALSE;
avl_add(tree, proxy); avl_add(tree, proxy);
return (proxy); return (proxy);
@ -220,29 +240,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
* Split the range lock at the supplied offset * Split the range lock at the supplied offset
* returning the *front* proxy. * returning the *front* proxy.
*/ */
static rl_t * static locked_range_t *
zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{ {
rl_t *front, *rear; ASSERT3U(lr->lr_length, >, 1);
ASSERT3U(off, >, lr->lr_offset);
ASSERT3U(rl->r_len, >, 1); ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
ASSERT3U(off, >, rl->r_off); ASSERT(lr->lr_write_wanted == B_FALSE);
ASSERT3U(off, <, rl->r_off + rl->r_len); ASSERT(lr->lr_read_wanted == B_FALSE);
ASSERT(rl->r_write_wanted == B_FALSE);
ASSERT(rl->r_read_wanted == B_FALSE);
/* create the rear proxy range lock */ /* create the rear proxy range lock */
rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
rear->r_off = off; rear->lr_offset = off;
rear->r_len = rl->r_off + rl->r_len - off; rear->lr_length = lr->lr_offset + lr->lr_length - off;
rear->r_cnt = rl->r_cnt; rear->lr_count = lr->lr_count;
rear->r_type = RL_READER; rear->lr_type = RL_READER;
rear->r_proxy = B_TRUE; rear->lr_proxy = B_TRUE;
rear->r_write_wanted = B_FALSE; rear->lr_write_wanted = B_FALSE;
rear->r_read_wanted = B_FALSE; rear->lr_read_wanted = B_FALSE;
front = zfs_range_proxify(tree, rl); locked_range_t *front = rangelock_proxify(tree, lr);
front->r_len = off - rl->r_off; front->lr_length = off - lr->lr_offset;
avl_insert_here(tree, rear, front, AVL_AFTER); avl_insert_here(tree, rear, front, AVL_AFTER);
return (front); return (front);
@ -252,28 +270,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
* Create and add a new proxy range lock for the supplied range. * Create and add a new proxy range lock for the supplied range.
*/ */
static void static void
zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{ {
rl_t *rl; ASSERT(len != 0);
locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
ASSERT(len); lr->lr_offset = off;
rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); lr->lr_length = len;
rl->r_off = off; lr->lr_count = 1;
rl->r_len = len; lr->lr_type = RL_READER;
rl->r_cnt = 1; lr->lr_proxy = B_TRUE;
rl->r_type = RL_READER; lr->lr_write_wanted = B_FALSE;
rl->r_proxy = B_TRUE; lr->lr_read_wanted = B_FALSE;
rl->r_write_wanted = B_FALSE; avl_add(tree, lr);
rl->r_read_wanted = B_FALSE;
avl_add(tree, rl);
} }
static void static void
zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
locked_range_t *prev, avl_index_t where)
{ {
rl_t *next; locked_range_t *next;
uint64_t off = new->r_off; uint64_t off = new->lr_offset;
uint64_t len = new->r_len; uint64_t len = new->lr_length;
/* /*
* prev arrives either: * prev arrives either:
@ -282,37 +299,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* range may overlap with the new range * range may overlap with the new range
* - null, if there were no ranges starting before the new one * - null, if there were no ranges starting before the new one
*/ */
if (prev) { if (prev != NULL) {
if (prev->r_off + prev->r_len <= off) { if (prev->lr_offset + prev->lr_length <= off) {
prev = NULL; prev = NULL;
} else if (prev->r_off != off) { } else if (prev->lr_offset != off) {
/* /*
* convert to proxy if needed then * convert to proxy if needed then
* split this entry and bump ref count * split this entry and bump ref count
*/ */
prev = zfs_range_split(tree, prev, off); prev = rangelock_split(tree, prev, off);
prev = AVL_NEXT(tree, prev); /* move to rear range */ prev = AVL_NEXT(tree, prev); /* move to rear range */
} }
} }
ASSERT((prev == NULL) || (prev->r_off == off)); ASSERT((prev == NULL) || (prev->lr_offset == off));
if (prev) if (prev != NULL)
next = prev; next = prev;
else else
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); next = avl_nearest(tree, where, AVL_AFTER);
if (next == NULL || off + len <= next->r_off) { if (next == NULL || off + len <= next->lr_offset) {
/* no overlaps, use the original new rl_t in the tree */ /* no overlaps, use the original new locked_range_t in the tree */
avl_insert(tree, new, where); avl_insert(tree, new, where);
return; return;
} }
if (off < next->r_off) { if (off < next->lr_offset) {
/* Add a proxy for initial range before the overlap */ /* Add a proxy for initial range before the overlap */
zfs_range_new_proxy(tree, off, next->r_off - off); rangelock_new_proxy(tree, off, next->lr_offset - off);
} }
new->r_cnt = 0; /* will use proxies in tree */ new->lr_count = 0; /* will use proxies in tree */
/* /*
* We now search forward through the ranges, until we go past the end * We now search forward through the ranges, until we go past the end
* of the new range. For each entry we make it a proxy if it * of the new range. For each entry we make it a proxy if it
@ -320,47 +337,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* gaps between the ranges then we create a new proxy range. * gaps between the ranges then we create a new proxy range.
*/ */
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
if (off + len <= next->r_off) if (off + len <= next->lr_offset)
break; break;
if (prev && prev->r_off + prev->r_len < next->r_off) { if (prev != NULL && prev->lr_offset + prev->lr_length <
next->lr_offset) {
/* there's a gap */ /* there's a gap */
ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); ASSERT3U(next->lr_offset, >,
zfs_range_new_proxy(tree, prev->r_off + prev->r_len, prev->lr_offset + prev->lr_length);
next->r_off - (prev->r_off + prev->r_len)); rangelock_new_proxy(tree,
prev->lr_offset + prev->lr_length,
next->lr_offset -
(prev->lr_offset + prev->lr_length));
} }
if (off + len == next->r_off + next->r_len) { if (off + len == next->lr_offset + next->lr_length) {
/* exact overlap with end */ /* exact overlap with end */
next = zfs_range_proxify(tree, next); next = rangelock_proxify(tree, next);
next->r_cnt++; next->lr_count++;
return; return;
} }
if (off + len < next->r_off + next->r_len) { if (off + len < next->lr_offset + next->lr_length) {
/* new range ends in the middle of this block */ /* new range ends in the middle of this block */
next = zfs_range_split(tree, next, off + len); next = rangelock_split(tree, next, off + len);
next->r_cnt++; next->lr_count++;
return; return;
} }
ASSERT3U(off + len, >, next->r_off + next->r_len); ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
next = zfs_range_proxify(tree, next); next = rangelock_proxify(tree, next);
next->r_cnt++; next->lr_count++;
} }
/* Add the remaining end range. */ /* Add the remaining end range. */
zfs_range_new_proxy(tree, prev->r_off + prev->r_len, rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
(off + len) - (prev->r_off + prev->r_len)); (off + len) - (prev->lr_offset + prev->lr_length));
} }
/* /*
* Check if a reader lock can be grabbed, or wait and recheck until available. * Check if a reader lock can be grabbed, or wait and recheck until available.
*/ */
static void static void
zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new) rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{ {
avl_tree_t *tree = &zrl->zr_avl; avl_tree_t *tree = &rl->rl_tree;
rl_t *prev, *next; locked_range_t *prev, *next;
avl_index_t where; avl_index_t where;
uint64_t off = new->r_off; uint64_t off = new->lr_offset;
uint64_t len = new->r_len; uint64_t len = new->lr_length;
/* /*
* Look for any writer locks in the range. * Look for any writer locks in the range.
@ -368,21 +389,22 @@ zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
retry: retry:
prev = avl_find(tree, new, &where); prev = avl_find(tree, new, &where);
if (prev == NULL) if (prev == NULL)
prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
/* /*
* Check the previous range for a writer lock overlap. * Check the previous range for a writer lock overlap.
*/ */
if (prev && (off < prev->r_off + prev->r_len)) { if (prev && (off < prev->lr_offset + prev->lr_length)) {
if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
if (!prev->r_read_wanted) { if (!prev->lr_read_wanted) {
cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); cv_init(&prev->lr_read_cv,
prev->r_read_wanted = B_TRUE; NULL, CV_DEFAULT, NULL);
prev->lr_read_wanted = B_TRUE;
} }
cv_wait(&prev->r_rd_cv, &zrl->zr_mutex); cv_wait(&prev->lr_read_cv, &rl->rl_lock);
goto retry; goto retry;
} }
if (off + len < prev->r_off + prev->r_len) if (off + len < prev->lr_offset + prev->lr_length)
goto got_lock; goto got_lock;
} }
@ -390,95 +412,97 @@ retry:
* Search through the following ranges to see if there's * Search through the following ranges to see if there's
* any write lock overlap. * any write lock overlap.
*/ */
if (prev) if (prev != NULL)
next = AVL_NEXT(tree, prev); next = AVL_NEXT(tree, prev);
else else
next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
for (; next; next = AVL_NEXT(tree, next)) { for (; next != NULL; next = AVL_NEXT(tree, next)) {
if (off + len <= next->r_off) if (off + len <= next->lr_offset)
goto got_lock; goto got_lock;
if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
if (!next->r_read_wanted) { if (!next->lr_read_wanted) {
cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); cv_init(&next->lr_read_cv,
next->r_read_wanted = B_TRUE; NULL, CV_DEFAULT, NULL);
next->lr_read_wanted = B_TRUE;
} }
cv_wait(&next->r_rd_cv, &zrl->zr_mutex); cv_wait(&next->lr_read_cv, &rl->rl_lock);
goto retry; goto retry;
} }
if (off + len <= next->r_off + next->r_len) if (off + len <= next->lr_offset + next->lr_length)
goto got_lock; goto got_lock;
} }
got_lock: got_lock:
/* /*
* Add the read lock, which may involve splitting existing * Add the read lock, which may involve splitting existing
* locks and bumping ref counts (r_cnt). * locks and bumping ref counts (lr_count).
*/ */
zfs_range_add_reader(tree, new, prev, where); rangelock_add_reader(tree, new, prev, where);
} }
/* /*
* Lock a range (offset, length) as either shared (RL_READER) * Lock a range (offset, length) as either shared (RL_READER) or exclusive
* or exclusive (RL_WRITER). Returns the range lock structure * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
* for later unlocking or reduce range (if entire file * it to a RL_WRITER lock (with the offset at the end of the file). Returns
* previously locked as RL_WRITER). * the range lock structure for later unlocking (or reduce range if the
* entire file is locked as RL_WRITER).
*/ */
rl_t * locked_range_t *
zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type) rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
rangelock_type_t type)
{ {
rl_t *new;
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
/*
* The entry is allocated and fully initialized before the mutex is
* taken; it records the lock it belongs to so that the unlock
* routine can find that lock from the entry alone.
*/
new = kmem_alloc(sizeof (rl_t), KM_SLEEP); locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
new->r_zrl = zrl; new->lr_rangelock = rl;
new->r_off = off; new->lr_offset = off;
/*
* If off + len would wrap past UINT64_MAX, shorten len so that the
* range ends at UINT64_MAX instead.
*/
if (len + off < off) /* overflow */ if (len + off < off) /* overflow */
len = UINT64_MAX - off; len = UINT64_MAX - off;
new->r_len = len; new->lr_length = len;
new->r_cnt = 1; /* assume it's going to be in the tree */ new->lr_count = 1; /* assume it's going to be in the tree */
new->r_type = type; new->lr_type = type;
new->r_proxy = B_FALSE; new->lr_proxy = B_FALSE;
new->r_write_wanted = B_FALSE; new->lr_write_wanted = B_FALSE;
new->r_read_wanted = B_FALSE; new->lr_read_wanted = B_FALSE;
/* The tree is only inspected and modified with the mutex held. */
mutex_enter(&zrl->zr_mutex); mutex_enter(&rl->rl_lock);
if (type == RL_READER) { if (type == RL_READER) {
/* /*
* First check for the usual case of no locks * First check for the usual case of no locks
*/ */
if (avl_numnodes(&zrl->zr_avl) == 0) if (avl_numnodes(&rl->rl_tree) == 0)
avl_add(&zrl->zr_avl, new); avl_add(&rl->rl_tree, new);
else else
zfs_range_lock_reader(zrl, new); rangelock_enter_reader(rl, new);
} else /* RL_WRITER or RL_APPEND */ } else
zfs_range_lock_writer(zrl, new); rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
mutex_exit(&zrl->zr_mutex); mutex_exit(&rl->rl_lock);
/* The caller keeps this entry and hands it back when unlocking. */
return (new); return (new);
} }
/*
* Safely free the locked_range_t. A condition variable is destroyed only
* when its matching *_wanted flag is set, because the wait paths call
* cv_init() lazily at the moment they first set that flag; an entry that
* nobody waited on has no initialized condition variable to destroy.
*/
static void static void
zfs_range_free(void *arg) rangelock_free(locked_range_t *lr)
{ {
rl_t *rl = arg; if (lr->lr_write_wanted)
cv_destroy(&lr->lr_write_cv);
if (rl->r_write_wanted) if (lr->lr_read_wanted)
cv_destroy(&rl->r_wr_cv); cv_destroy(&lr->lr_read_cv);
if (rl->r_read_wanted) kmem_free(lr, sizeof (locked_range_t));
cv_destroy(&rl->r_rd_cv);
kmem_free(rl, sizeof (rl_t));
} }
/* /*
* Unlock a reader lock * Unlock a reader lock
*/ */
static void static void
zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list) rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
list_t *free_list)
{ {
avl_tree_t *tree = &zrl->zr_avl; avl_tree_t *tree = &rl->rl_tree;
rl_t *rl, *next = NULL;
uint64_t len; uint64_t len;
/* /*
@ -488,53 +512,48 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
* removed from the tree and replaced by proxies (one or * removed from the tree and replaced by proxies (one or
* more ranges mapping to the entire range). * more ranges mapping to the entire range).
*/ */
if (remove->r_cnt == 1) { if (remove->lr_count == 1) {
avl_remove(tree, remove); avl_remove(tree, remove);
if (remove->lr_write_wanted)
if (remove->r_write_wanted) cv_broadcast(&remove->lr_write_cv);
cv_broadcast(&remove->r_wr_cv); if (remove->lr_read_wanted)
cv_broadcast(&remove->lr_read_cv);
if (remove->r_read_wanted)
cv_broadcast(&remove->r_rd_cv);
list_insert_tail(free_list, remove); list_insert_tail(free_list, remove);
} else { } else {
ASSERT0(remove->r_cnt); ASSERT0(remove->lr_count);
ASSERT0(remove->r_write_wanted); ASSERT0(remove->lr_write_wanted);
ASSERT0(remove->r_read_wanted); ASSERT0(remove->lr_read_wanted);
/* /*
* Find start proxy representing this reader lock, * Find start proxy representing this reader lock,
* then decrement ref count on all proxies * then decrement ref count on all proxies
* that make up this range, freeing them as needed. * that make up this range, freeing them as needed.
*/ */
rl = avl_find(tree, remove, NULL); locked_range_t *lr = avl_find(tree, remove, NULL);
ASSERT(rl); ASSERT3P(lr, !=, NULL);
ASSERT(rl->r_cnt); ASSERT3U(lr->lr_count, !=, 0);
ASSERT(rl->r_type == RL_READER); ASSERT3U(lr->lr_type, ==, RL_READER);
for (len = remove->r_len; len != 0; rl = next) { locked_range_t *next = NULL;
len -= rl->r_len; for (len = remove->lr_length; len != 0; lr = next) {
if (len) { len -= lr->lr_length;
next = AVL_NEXT(tree, rl); if (len != 0) {
ASSERT(next); next = AVL_NEXT(tree, lr);
ASSERT(rl->r_off + rl->r_len == next->r_off); ASSERT3P(next, !=, NULL);
ASSERT(next->r_cnt); ASSERT3U(lr->lr_offset + lr->lr_length, ==,
ASSERT(next->r_type == RL_READER); next->lr_offset);
ASSERT3U(next->lr_count, !=, 0);
ASSERT3U(next->lr_type, ==, RL_READER);
} }
rl->r_cnt--; lr->lr_count--;
if (rl->r_cnt == 0) { if (lr->lr_count == 0) {
avl_remove(tree, rl); avl_remove(tree, lr);
if (lr->lr_write_wanted)
if (rl->r_write_wanted) cv_broadcast(&lr->lr_write_cv);
cv_broadcast(&rl->r_wr_cv); if (lr->lr_read_wanted)
cv_broadcast(&lr->lr_read_cv);
if (rl->r_read_wanted) list_insert_tail(free_list, lr);
cv_broadcast(&rl->r_rd_cv);
list_insert_tail(free_list, rl);
} }
} }
kmem_free(remove, sizeof (locked_range_t));
kmem_free(remove, sizeof (rl_t));
} }
} }
@ -542,91 +561,79 @@ zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
* Unlock range and destroy range lock structure. * Unlock range and destroy range lock structure.
*/ */
void void
zfs_range_unlock(rl_t *rl) rangelock_exit(locked_range_t *lr)
{ {
/* The owning lock is recovered from the entry itself. */
zfs_rlock_t *zrl = rl->r_zrl; rangelock_t *rl = lr->lr_rangelock;
list_t free_list; list_t free_list;
rl_t *free_rl; locked_range_t *free_lr;
/*
* Only a caller-held entry may be passed in: never a proxy, never
* still typed as an append request, and with a count of 1 (still in
* the tree) or 0 (replaced by proxies).
*/
ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
ASSERT(!rl->r_proxy); ASSERT(!lr->lr_proxy);
list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
mutex_enter(&zrl->zr_mutex); /*
if (rl->r_type == RL_WRITER) { * The free list is used to defer the cv_destroy() and
* subsequent kmem_free until after the mutex is dropped.
*/
list_create(&free_list, sizeof (locked_range_t),
offsetof(locked_range_t, lr_node));
mutex_enter(&rl->rl_lock);
if (lr->lr_type == RL_WRITER) {
/* writer locks can't be shared or split */ /* writer locks can't be shared or split */
avl_remove(&zrl->zr_avl, rl); avl_remove(&rl->rl_tree, lr);
if (rl->r_write_wanted) if (lr->lr_write_wanted)
cv_broadcast(&rl->r_wr_cv); cv_broadcast(&lr->lr_write_cv);
if (lr->lr_read_wanted)
if (rl->r_read_wanted) cv_broadcast(&lr->lr_read_cv);
cv_broadcast(&rl->r_rd_cv); list_insert_tail(&free_list, lr);
list_insert_tail(&free_list, rl);
} else { } else {
/* /*
* lock may be shared, let zfs_range_unlock_reader() * lock may be shared, let rangelock_exit_reader()
* release the zp->z_range_lock lock and free the rl_t * release the lock and free the locked_range_t.
*/ */
zfs_range_unlock_reader(zrl, rl, &free_list); rangelock_exit_reader(rl, lr, &free_list);
} }
mutex_exit(&zrl->zr_mutex); mutex_exit(&rl->rl_lock);
/*
* With the mutex released, drain the deferred entries: each one is
* taken off the list and passed to the free routine. The new version
* folds the head lookup and removal into list_remove_head().
*/
while ((free_rl = list_head(&free_list)) != NULL) { while ((free_lr = list_remove_head(&free_list)) != NULL)
list_remove(&free_list, free_rl); rangelock_free(free_lr);
zfs_range_free(free_rl);
}
list_destroy(&free_list); list_destroy(&free_list);
} }
/* /*
* Reduce range locked as RL_WRITER from whole file to specified range. * Reduce range locked as RL_WRITER from whole file to specified range.
* Asserts the whole file is exclusivly locked and so there's only one * Asserts the whole file is exclusively locked and so there's only one
* entry in the tree. * entry in the tree.
*/ */
void void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
{ {
zfs_rlock_t *zrl = rl->r_zrl; rangelock_t *rl = lr->lr_rangelock;
/* Ensure there are no other locks */ /* Ensure there are no other locks */
ASSERT(avl_numnodes(&zrl->zr_avl) == 1); ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
ASSERT(rl->r_off == 0); ASSERT3U(lr->lr_offset, ==, 0);
ASSERT(rl->r_type == RL_WRITER); ASSERT3U(lr->lr_type, ==, RL_WRITER);
ASSERT(!rl->r_proxy); ASSERT(!lr->lr_proxy);
ASSERT3U(rl->r_len, ==, UINT64_MAX); ASSERT3U(lr->lr_length, ==, UINT64_MAX);
ASSERT3U(rl->r_cnt, ==, 1); ASSERT3U(lr->lr_count, ==, 1);
/* The entry's range is rewritten in place while holding the mutex. */
mutex_enter(&zrl->zr_mutex); mutex_enter(&rl->rl_lock);
rl->r_off = off; lr->lr_offset = off;
rl->r_len = len; lr->lr_length = len;
mutex_exit(&rl->rl_lock);
/*
* Wake any waiters recorded on this entry so they can re-check
* against the reduced range.
* NOTE(review): the old version tested the wanted flags and broadcast
* while still holding the mutex; the new version drops rl_lock first.
* Confirm that reading the flags unlocked here is intended.
*/
if (rl->r_write_wanted) if (lr->lr_write_wanted)
cv_broadcast(&rl->r_wr_cv); cv_broadcast(&lr->lr_write_cv);
if (rl->r_read_wanted) if (lr->lr_read_wanted)
cv_broadcast(&rl->r_rd_cv); cv_broadcast(&lr->lr_read_cv);
mutex_exit(&zrl->zr_mutex);
} }
/* #if defined(_KERNEL)
* AVL comparison function used to order range locks EXPORT_SYMBOL(rangelock_init);
* Locks are ordered on the start offset of the range. EXPORT_SYMBOL(rangelock_fini);
*/ EXPORT_SYMBOL(rangelock_enter);
int EXPORT_SYMBOL(rangelock_exit);
zfs_range_compare(const void *arg1, const void *arg2) EXPORT_SYMBOL(rangelock_reduce);
{
const rl_t *rl1 = (const rl_t *)arg1;
const rl_t *rl2 = (const rl_t *)arg2;
return (AVL_CMP(rl1->r_off, rl2->r_off));
}
#ifdef _KERNEL
EXPORT_SYMBOL(zfs_range_lock);
EXPORT_SYMBOL(zfs_range_unlock);
EXPORT_SYMBOL(zfs_range_reduce);
EXPORT_SYMBOL(zfs_range_compare);
#endif #endif

View File

@ -477,7 +477,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
/* /*
* Lock the range against changes. * Lock the range against changes.
*/ */
rl_t *rl = zfs_range_lock(&zp->z_range_lock, locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
uio->uio_loffset, uio->uio_resid, RL_READER); uio->uio_loffset, uio->uio_resid, RL_READER);
/* /*
@ -550,7 +550,7 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
task_io_account_read(nread); task_io_account_read(nread);
out: out:
zfs_range_unlock(rl); rangelock_exit(lr);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (error); return (error);
@ -652,19 +652,18 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
#endif #endif
uio_prefaultpages(MIN(n, max_blksz), uio); uio_prefaultpages(MIN(n, max_blksz), uio);
rl_t *rl;
/* /*
* If in append mode, set the io offset pointer to eof. * If in append mode, set the io offset pointer to eof.
*/ */
locked_range_t *lr;
if (ioflag & FAPPEND) { if (ioflag & FAPPEND) {
/* /*
* Obtain an appending range lock to guarantee file append * Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock. * semantics. We reset the write offset once we have the lock.
*/ */
rl = zfs_range_lock(&zp->z_range_lock, 0, n, RL_APPEND); lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
woff = rl->r_off; woff = lr->lr_offset;
if (rl->r_len == UINT64_MAX) { if (lr->lr_length == UINT64_MAX) {
/* /*
* We overlocked the file because this write will cause * We overlocked the file because this write will cause
* the file block size to increase. * the file block size to increase.
@ -679,11 +678,11 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* this write, then this range lock will lock the entire file * this write, then this range lock will lock the entire file
* so that we can re-write the block safely. * so that we can re-write the block safely.
*/ */
rl = zfs_range_lock(&zp->z_range_lock, woff, n, RL_WRITER); lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
} }
if (woff >= limit) { if (woff >= limit) {
zfs_range_unlock(rl); rangelock_exit(lr);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFBIG)); return (SET_ERROR(EFBIG));
} }
@ -776,12 +775,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
} }
/* /*
* If zfs_range_lock() over-locked we grow the blocksize * If rangelock_enter() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen * and then reduce the lock range. This will only happen
* on the first iteration since zfs_range_reduce() will * on the first iteration since rangelock_reduce() will
* shrink down r_len to the appropriate size. * shrink down lr_length to the appropriate size.
*/ */
if (rl->r_len == UINT64_MAX) { if (lr->lr_length == UINT64_MAX) {
uint64_t new_blksz; uint64_t new_blksz;
if (zp->z_blksz > max_blksz) { if (zp->z_blksz > max_blksz) {
@ -797,7 +796,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
new_blksz = MIN(end_size, max_blksz); new_blksz = MIN(end_size, max_blksz);
} }
zfs_grow_blocksize(zp, new_blksz, tx); zfs_grow_blocksize(zp, new_blksz, tx);
zfs_range_reduce(rl, woff, n); rangelock_reduce(lr, woff, n);
} }
/* /*
@ -915,7 +914,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
} }
zfs_inode_update(zp); zfs_inode_update(zp);
zfs_range_unlock(rl); rangelock_exit(lr);
/* /*
* If we're in replay mode, or we made no progress, return error. * If we're in replay mode, or we made no progress, return error.
@ -967,7 +966,7 @@ zfs_get_done(zgd_t *zgd, int error)
if (zgd->zgd_db) if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd); dmu_buf_rele(zgd->zgd_db, zgd);
zfs_range_unlock(zgd->zgd_rl); rangelock_exit(zgd->zgd_lr);
/* /*
* Release the vnode asynchronously as we currently have the * Release the vnode asynchronously as we currently have the
@ -1031,8 +1030,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
* we don't have to write the data twice. * we don't have to write the data twice.
*/ */
if (buf != NULL) { /* immediate write */ if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, size, zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
RL_READER); offset, size, RL_READER);
/* test for truncation needs to be done while range locked */ /* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) { if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT); error = SET_ERROR(ENOENT);
@ -1053,12 +1052,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
size = zp->z_blksz; size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff; offset -= blkoff;
zgd->zgd_rl = zfs_range_lock(&zp->z_range_lock, offset, zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
size, RL_READER); offset, size, RL_READER);
if (zp->z_blksz == size) if (zp->z_blksz == size)
break; break;
offset += blkoff; offset += blkoff;
zfs_range_unlock(zgd->zgd_rl); rangelock_exit(zgd->zgd_lr);
} }
/* test for truncation needs to be done while range locked */ /* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size) if (lr->lr_offset >= zp->z_size)
@ -4432,7 +4431,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
loff_t offset; loff_t offset;
loff_t pgoff; loff_t pgoff;
unsigned int pglen; unsigned int pglen;
rl_t *rl;
dmu_tx_t *tx; dmu_tx_t *tx;
caddr_t va; caddr_t va;
int err = 0; int err = 0;
@ -4506,13 +4504,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
redirty_page_for_writepage(wbc, pp); redirty_page_for_writepage(wbc, pp);
unlock_page(pp); unlock_page(pp);
rl = zfs_range_lock(&zp->z_range_lock, pgoff, pglen, RL_WRITER); locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
pgoff, pglen, RL_WRITER);
lock_page(pp); lock_page(pp);
/* Page mapping changed or it was no longer dirty, we're done */ /* Page mapping changed or it was no longer dirty, we're done */
if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
unlock_page(pp); unlock_page(pp);
zfs_range_unlock(rl); rangelock_exit(lr);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (0); return (0);
} }
@ -4520,7 +4519,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
/* Another process started writeback; wait for it if required */ /* Another process started writeback; wait for it if required */
if (PageWriteback(pp)) { if (PageWriteback(pp)) {
unlock_page(pp); unlock_page(pp);
zfs_range_unlock(rl); rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(pp); wait_on_page_writeback(pp);
@ -4532,7 +4531,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
/* Clear the dirty flag now that the required locks are held */ /* Clear the dirty flag now that the required locks are held */
if (!clear_page_dirty_for_io(pp)) { if (!clear_page_dirty_for_io(pp)) {
unlock_page(pp); unlock_page(pp);
zfs_range_unlock(rl); rangelock_exit(lr);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (0); return (0);
} }
@ -4559,7 +4558,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
__set_page_dirty_nobuffers(pp); __set_page_dirty_nobuffers(pp);
ClearPageError(pp); ClearPageError(pp);
end_page_writeback(pp); end_page_writeback(pp);
zfs_range_unlock(rl); rangelock_exit(lr);
ZFS_EXIT(zfsvfs); ZFS_EXIT(zfsvfs);
return (err); return (err);
} }
@ -4586,7 +4585,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_putpage_commit_cb, pp); zfs_putpage_commit_cb, pp);
dmu_tx_commit(tx); dmu_tx_commit(tx);
zfs_range_unlock(rl); rangelock_exit(lr);
if (wbc->sync_mode != WB_SYNC_NONE) { if (wbc->sync_mode != WB_SYNC_NONE) {
/* /*

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/ */
/* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2007 Jeremy Teo */
@ -91,6 +91,37 @@ static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
/*
 * Range lock callback for a znode's z_rangelock, invoked for RL_WRITER and
 * RL_APPEND requests.  It rewrites the requested range using znode state:
 * an RL_APPEND request becomes an RL_WRITER request that starts at the
 * current file size, and a request that would force the block size to grow
 * is widened to cover the whole file.  The rangelock_t's rl_lock is held
 * when this is called, which avoids races.
 */
static void
zfs_rangelock_cb(locked_range_t *new, void *arg)
{
	znode_t *znode = arg;

	/* Appends turn into writer locks positioned at the current EOF. */
	if (new->lr_type == RL_APPEND) {
		new->lr_type = RL_WRITER;
		new->lr_offset = znode->z_size;
	}

	/* Size the file would have once the locked range has been written. */
	uint64_t end_size = MAX(znode->z_size,
	    new->lr_offset + new->lr_length);

	/* The data still fits in the current block: nothing to widen. */
	if (end_size <= znode->z_blksz)
		return;

	/*
	 * A power-of-two block size that has already reached the
	 * filesystem's maximum cannot grow any further.
	 */
	if (ISP2(znode->z_blksz) &&
	    znode->z_blksz >= ZTOZSB(znode)->z_max_blksz)
		return;

	/* The block size will have to grow, so lock the whole file range. */
	new->lr_offset = 0;
	new->lr_length = UINT64_MAX;
}
/*ARGSUSED*/ /*ARGSUSED*/
static int static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
@ -106,7 +137,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
zfs_rlock_init(&zp->z_range_lock); rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
zp->z_dirlocks = NULL; zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL; zp->z_acl_cached = NULL;
@ -128,7 +159,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_name_lock); rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock); mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock); rw_destroy(&zp->z_xattr_lock);
zfs_rlock_destroy(&zp->z_range_lock); rangelock_fini(&zp->z_rangelock);
ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL); ASSERT(zp->z_acl_cached == NULL);
@ -577,9 +608,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_is_mapped = B_FALSE; zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_FALSE; zp->z_is_ctldir = B_FALSE;
zp->z_is_stale = B_FALSE; zp->z_is_stale = B_FALSE;
zp->z_range_lock.zr_size = &zp->z_size;
zp->z_range_lock.zr_blksz = &zp->z_blksz;
zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
@ -1475,20 +1503,20 @@ zfs_extend(znode_t *zp, uint64_t end)
{ {
zfsvfs_t *zfsvfs = ZTOZSB(zp); zfsvfs_t *zfsvfs = ZTOZSB(zp);
dmu_tx_t *tx; dmu_tx_t *tx;
rl_t *rl; locked_range_t *lr;
uint64_t newblksz; uint64_t newblksz;
int error; int error;
/* /*
* We will change zp_size, lock the whole file. * We will change zp_size, lock the whole file.
*/ */
rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER); lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
/* /*
* Nothing to do if file already at desired length. * Nothing to do if file already at desired length.
*/ */
if (end <= zp->z_size) { if (end <= zp->z_size) {
zfs_range_unlock(rl); rangelock_exit(lr);
return (0); return (0);
} }
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
@ -1518,7 +1546,7 @@ zfs_extend(znode_t *zp, uint64_t end)
error = dmu_tx_assign(tx, TXG_WAIT); error = dmu_tx_assign(tx, TXG_WAIT);
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
zfs_range_unlock(rl); rangelock_exit(lr);
return (error); return (error);
} }
@ -1530,7 +1558,7 @@ zfs_extend(znode_t *zp, uint64_t end)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
&zp->z_size, sizeof (zp->z_size), tx)); &zp->z_size, sizeof (zp->z_size), tx));
zfs_range_unlock(rl); rangelock_exit(lr);
dmu_tx_commit(tx); dmu_tx_commit(tx);
@ -1593,19 +1621,19 @@ static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{ {
zfsvfs_t *zfsvfs = ZTOZSB(zp); zfsvfs_t *zfsvfs = ZTOZSB(zp);
rl_t *rl; locked_range_t *lr;
int error; int error;
/* /*
* Lock the range being freed. * Lock the range being freed.
*/ */
rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER); lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
/* /*
* Nothing to do if file already at desired length. * Nothing to do if file already at desired length.
*/ */
if (off >= zp->z_size) { if (off >= zp->z_size) {
zfs_range_unlock(rl); rangelock_exit(lr);
return (0); return (0);
} }
@ -1655,7 +1683,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
page_len); page_len);
} }
} }
zfs_range_unlock(rl); rangelock_exit(lr);
return (error); return (error);
} }
@ -1673,7 +1701,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
{ {
zfsvfs_t *zfsvfs = ZTOZSB(zp); zfsvfs_t *zfsvfs = ZTOZSB(zp);
dmu_tx_t *tx; dmu_tx_t *tx;
rl_t *rl; locked_range_t *lr;
int error; int error;
sa_bulk_attr_t bulk[2]; sa_bulk_attr_t bulk[2];
int count = 0; int count = 0;
@ -1681,20 +1709,20 @@ zfs_trunc(znode_t *zp, uint64_t end)
/* /*
* We will change zp_size, lock the whole file. * We will change zp_size, lock the whole file.
*/ */
rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER); lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
/* /*
* Nothing to do if file already at desired length. * Nothing to do if file already at desired length.
*/ */
if (end >= zp->z_size) { if (end >= zp->z_size) {
zfs_range_unlock(rl); rangelock_exit(lr);
return (0); return (0);
} }
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
DMU_OBJECT_END); DMU_OBJECT_END);
if (error) { if (error) {
zfs_range_unlock(rl); rangelock_exit(lr);
return (error); return (error);
} }
tx = dmu_tx_create(zfsvfs->z_os); tx = dmu_tx_create(zfsvfs->z_os);
@ -1704,7 +1732,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
error = dmu_tx_assign(tx, TXG_WAIT); error = dmu_tx_assign(tx, TXG_WAIT);
if (error) { if (error) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
zfs_range_unlock(rl); rangelock_exit(lr);
return (error); return (error);
} }
@ -1720,8 +1748,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
dmu_tx_commit(tx); dmu_tx_commit(tx);
rangelock_exit(lr);
zfs_range_unlock(rl);
return (0); return (0);
} }

View File

@ -86,7 +86,6 @@
#include <sys/dmu_tx.h> #include <sys/dmu_tx.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zfs_rlock.h> #include <sys/zfs_rlock.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
#include <sys/zvol.h> #include <sys/zvol.h>
@ -123,7 +122,7 @@ struct zvol_state {
uint32_t zv_open_count; /* open counts */ uint32_t zv_open_count; /* open counts */
uint32_t zv_changed; /* disk changed */ uint32_t zv_changed; /* disk changed */
zilog_t *zv_zilog; /* ZIL handle */ zilog_t *zv_zilog; /* ZIL handle */
zfs_rlock_t zv_range_lock; /* range lock */ rangelock_t zv_rangelock; /* for range locking */
dnode_t *zv_dn; /* dnode hold */ dnode_t *zv_dn; /* dnode hold */
dev_t zv_dev; /* device id */ dev_t zv_dev; /* device id */
struct gendisk *zv_disk; /* generic disk */ struct gendisk *zv_disk; /* generic disk */
@ -716,7 +715,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
typedef struct zv_request { typedef struct zv_request {
zvol_state_t *zv; zvol_state_t *zv;
struct bio *bio; struct bio *bio;
rl_t *rl; locked_range_t *lr;
} zv_request_t; } zv_request_t;
static void static void
@ -778,7 +777,7 @@ zvol_write(void *arg)
if (error) if (error)
break; break;
} }
zfs_range_unlock(zvr->rl); rangelock_exit(zvr->lr);
int64_t nwritten = start_resid - uio.uio_resid; int64_t nwritten = start_resid - uio.uio_resid;
dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
@ -872,7 +871,8 @@ zvol_discard(void *arg)
ZVOL_OBJ, start, size); ZVOL_OBJ, start, size);
} }
unlock: unlock:
zfs_range_unlock(zvr->rl); rangelock_exit(zvr->lr);
if (error == 0 && sync) if (error == 0 && sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ); zil_commit(zv->zv_zilog, ZVOL_OBJ);
@ -917,7 +917,7 @@ zvol_read(void *arg)
break; break;
} }
} }
zfs_range_unlock(zvr->rl); rangelock_exit(zvr->lr);
int64_t nread = start_resid - uio.uio_resid; int64_t nread = start_resid - uio.uio_resid;
dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
@ -985,7 +985,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
* are asynchronous, we take it here synchronously to make * are asynchronous, we take it here synchronously to make
* sure overlapped I/Os are properly ordered. * sure overlapped I/Os are properly ordered.
*/ */
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size, zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_WRITER); RL_WRITER);
/* /*
* Sync writes and discards execute zil_commit() which may need * Sync writes and discards execute zil_commit() which may need
@ -1014,7 +1014,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
rw_enter(&zv->zv_suspend_lock, RW_READER); rw_enter(&zv->zv_suspend_lock, RW_READER);
zvr->rl = zfs_range_lock(&zv->zv_range_lock, offset, size, zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_READER); RL_READER);
if (zvol_request_sync || taskq_dispatch(zvol_taskq, if (zvol_request_sync || taskq_dispatch(zvol_taskq,
zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID)
@ -1036,7 +1036,7 @@ zvol_get_done(zgd_t *zgd, int error)
if (zgd->zgd_db) if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd); dmu_buf_rele(zgd->zgd_db, zgd);
zfs_range_unlock(zgd->zgd_rl); rangelock_exit(zgd->zgd_lr);
if (error == 0 && zgd->zgd_bp) if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
@ -1072,7 +1072,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
* we don't have to write the data twice. * we don't have to write the data twice.
*/ */
if (buf != NULL) { /* immediate write */ if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_READER); RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
@ -1085,7 +1085,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
*/ */
size = zv->zv_volblocksize; size = zv->zv_volblocksize;
offset = P2ALIGN_TYPED(offset, size, uint64_t); offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_READER); RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH); DMU_READ_NO_PREFETCH);
@ -1687,7 +1687,7 @@ zvol_alloc(dev_t dev, const char *name)
zv->zv_open_count = 0; zv->zv_open_count = 0;
strlcpy(zv->zv_name, name, MAXNAMELEN); strlcpy(zv->zv_name, name, MAXNAMELEN);
zfs_rlock_init(&zv->zv_range_lock); rangelock_init(&zv->zv_rangelock, NULL, NULL);
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zv->zv_disk->major = zvol_major; zv->zv_disk->major = zvol_major;
@ -1745,7 +1745,7 @@ zvol_free(void *arg)
ASSERT(zv->zv_disk->private_data == NULL); ASSERT(zv->zv_disk->private_data == NULL);
rw_destroy(&zv->zv_suspend_lock); rw_destroy(&zv->zv_suspend_lock);
zfs_rlock_destroy(&zv->zv_range_lock); rangelock_fini(&zv->zv_rangelock);
del_gendisk(zv->zv_disk); del_gendisk(zv->zv_disk);
blk_cleanup_queue(zv->zv_queue); blk_cleanup_queue(zv->zv_queue);

View File

@ -121,11 +121,6 @@ function openzfs_port_commit()
error=1 error=1
fi fi
# need an approved by line
if ! check_tagged_line "Approved by" ; then
error=1
fi
# need ported by line # need ported by line
if ! check_tagged_line "Ported-by" ; then if ! check_tagged_line "Ported-by" ; then
error=1 error=1