log xattr=sa create/remove/update to ZIL

As such, there are no specific synchronous semantics defined for
the xattrs. But for xattr=on, it does log to ZIL and zil_commit() is
done, if sync=always is set on dataset. This provides sync semantics
for xattr=on with sync=always set on dataset.

For the xattr=sa implementation, it doesn't log to ZIL, so, even with
sync=always, xattrs are not guaranteed to be synced before xattr call
returns to caller. So, xattr can be lost if system crash happens, before
txg carrying xattr transaction is synced.

This change adds xattr=sa logging to ZIL on xattr create/remove/update
and xattrs are synced to ZIL (zil_commit() done) for sync=always.
This makes xattr=sa behavior similar to xattr=on.

Implementation notes:
The actual logging is fairly straight-forward and does not warrant
additional explanation.
However, it has been 14 years since we last added new TX types
to the ZIL [1], hence this is the first time we do it after the
introduction of zpool features. Therefore, here is an overview of the
feature activation and deactivation workflow:

1. The feature must be enabled. Otherwise, we don't log the new
    record type. This ensures compatibility with older software.
2. The feature is activated per-dataset, since the ZIL is per-dataset.
3. If the feature is enabled and dataset is not for zvol, any append to
    the ZIL chain will activate the feature for the dataset. Likewise
    for starting a new ZIL chain.
4. A dataset that doesn't have a ZIL chain has the feature deactivated.

We ensure (3) by activating on the first zil_commit() after the feature
was enabled. Since activating the features requires waiting for txg
sync, the first zil_commit() after enabling the feature will be slower
than usual. The downside is that this is really a conservative
approximation: even if we never append a 'TX_SETSAXATTR' to the ZIL
chain, we pay the penalty for feature activation. The upside is that the
user is in control of when we pay the penalty, i.e., upon enabling the
feature.

We ensure (4) by hooking into zil_sync(), where ZIL destroy actually
happens.

One more piece on feature activation, since it's spread across
multiple functions:

zil_commit()
  zil_process_commit_list()
    if lwb == NULL // first zil_commit since zil_open
      zil_create()
        if no log block pointer in ZIL header:
          if feature enabled and not active:
	    // CASE 1
            enable, COALESCE txg wait with dmu_tx that allocated the
	    log block
         else // log block was allocated earlier than this zil_open
          if feature enabled and not active:
	    // CASE 2
            enable, EXPLICIT txg wait
    else // already have an in-DRAM LWB
      if feature enabled and not active:
        // this happens when we enable the feature after zil_create
	// CASE 3
        enable, EXPLICIT txg wait

[1] https://github.com/illumos/illumos-gate/commit/da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0

Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Christian Schwarz <christian.schwarz@nutanix.com>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Ryan Moeller <freqlabs@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Jitendra Patidar <jitendra.patidar@nutanix.com>
Closes #8768 
Closes #9078
This commit is contained in:
Jitendra Patidar
2022-02-23 02:36:43 +05:30
committed by GitHub
parent ccdcc1dbe8
commit 361a7e8211
21 changed files with 470 additions and 15 deletions
+3 -3
View File
@@ -5549,7 +5549,7 @@ zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
if (error != 0)
error = SET_ERROR(error);
else
error = zfs_sa_set_xattr(zp);
error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
if (error != 0) {
zp->z_xattr_cached = NULL;
nvlist_free(nvl);
@@ -5706,9 +5706,9 @@ zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
if (error != 0)
error = SET_ERROR(error);
}
kmem_free(buf, entry_size);
if (error == 0)
error = zfs_sa_set_xattr(zp);
error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
kmem_free(buf, entry_size);
if (error != 0) {
zp->z_xattr_cached = NULL;
nvlist_free(nvl);
+1 -1
View File
@@ -578,7 +578,7 @@ zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
* will be reconstructed from the ARC when next accessed.
*/
if (error == 0)
error = -zfs_sa_set_xattr(zp);
error = -zfs_sa_set_xattr(zp, name, value, size);
if (error) {
nvlist_free(nvl);
+12
View File
@@ -695,6 +695,18 @@ zpool_feature_init(void)
"org.openzfs:draid", "draid", "Support for distributed spare RAID",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
{
static const spa_feature_t zilsaxattr_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_ZILSAXATTR,
"org.openzfs:zilsaxattr", "zilsaxattr",
"Support for xattr=sa extended attribute logging in ZIL.",
ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures);
}
zfs_mod_list_supported_free(sfeatures);
}
+34
View File
@@ -720,6 +720,40 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
zil_itx_assign(zilog, itx, tx);
}
/*
* Handles TX_SETSAXATTR transactions.
*/
void
zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, const char *name, const void *value, size_t size)
{
itx_t *itx;
lr_setsaxattr_t *lr;
size_t recsize = sizeof (lr_setsaxattr_t);
void *xattrstart;
int namelen;
if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
namelen = strlen(name) + 1;
recsize += (namelen + size);
itx = zil_itx_create(txtype, recsize);
lr = (lr_setsaxattr_t *)&itx->itx_lr;
lr->lr_foid = zp->z_id;
xattrstart = (char *)(lr + 1);
bcopy(name, xattrstart, namelen);
if (value != NULL) {
bcopy(value, (char *)xattrstart + namelen, size);
lr->lr_size = size;
} else {
lr->lr_size = 0;
}
itx->itx_sync = (zp->z_sync_cnt != 0);
zil_itx_assign(zilog, itx, tx);
}
/*
* Handles TX_ACL transactions.
*/
+83
View File
@@ -47,6 +47,8 @@
#include <sys/atomic.h>
#include <sys/cred.h>
#include <sys/zpl.h>
#include <sys/dmu_objset.h>
#include <sys/zfeature.h>
/*
* NB: FreeBSD expects to be able to do vnode locking in lookup and
@@ -868,6 +870,86 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
static int
zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_setsaxattr_t *lr = arg2;
znode_t *zp;
nvlist_t *nvl;
size_t sa_size;
char *name;
char *value;
size_t size;
int error = 0;
ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa,
SPA_FEATURE_ZILSAXATTR));
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
rw_enter(&zp->z_xattr_lock, RW_WRITER);
mutex_enter(&zp->z_lock);
if (zp->z_xattr_cached == NULL)
error = zfs_sa_get_xattr(zp);
mutex_exit(&zp->z_lock);
if (error)
goto out;
ASSERT(zp->z_xattr_cached);
nvl = zp->z_xattr_cached;
/* Get xattr name, value and size from log record */
size = lr->lr_size;
name = (char *)(lr + 1);
if (size == 0) {
value = NULL;
error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
} else {
value = name + strlen(name) + 1;
/* Limited to 32k to keep nvpair memory allocations small */
if (size > DXATTR_MAX_ENTRY_SIZE) {
error = SET_ERROR(EFBIG);
goto out;
}
/* Prevent the DXATTR SA from consuming the entire SA region */
error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
if (error)
goto out;
if (sa_size > DXATTR_MAX_SA_SIZE) {
error = SET_ERROR(EFBIG);
goto out;
}
error = nvlist_add_byte_array(nvl, name, (uchar_t *)value,
size);
}
/*
* Update the SA for additions, modifications, and removals. On
* error drop the inconsistent cached version of the nvlist, it
* will be reconstructed from the ARC when next accessed.
*/
if (error == 0)
error = zfs_sa_set_xattr(zp, name, value, size);
if (error) {
nvlist_free(nvl);
zp->z_xattr_cached = NULL;
}
out:
rw_exit(&zp->z_xattr_lock);
zrele(zp);
return (error);
}
static int
zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
{
@@ -989,4 +1071,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
zfs_replay_setsaxattr, /* TX_SETSAXATTR */
};
+27 -2
View File
@@ -29,6 +29,7 @@
#include <sys/zfs_sa.h>
#include <sys/dmu_objset.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
/*
* ZPL attribute registration table.
@@ -69,7 +70,10 @@ const sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
{NULL, 0, 0, 0}
};
#ifdef _KERNEL
static int zfs_zil_saxattr = 1;
int
zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio)
{
@@ -219,13 +223,14 @@ zfs_sa_get_xattr(znode_t *zp)
}
int
zfs_sa_set_xattr(znode_t *zp)
zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
zilog_t *zilog;
dmu_tx_t *tx;
char *obj;
size_t size;
int error;
int error, logsaxattr = 0;
ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
ASSERT(zp->z_xattr_cached);
@@ -244,6 +249,17 @@ zfs_sa_set_xattr(znode_t *zp)
if (error)
goto out_free;
zilog = zfsvfs->z_log;
/*
* Users enable ZIL logging of xattr=sa operations by enabling the
* SPA_FEATURE_ZILSAXATTR feature on the pool. Feature is activated
* during zil_process_commit_list/zil_create, if enabled.
*/
if (spa_feature_is_enabled(zfsvfs->z_os->os_spa,
SPA_FEATURE_ZILSAXATTR) && zfs_zil_saxattr)
logsaxattr = 1;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa_create(tx, size);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
@@ -256,6 +272,10 @@ zfs_sa_set_xattr(znode_t *zp)
sa_bulk_attr_t bulk[2];
uint64_t ctime[2];
if (logsaxattr)
zfs_log_setsaxattr(zilog, tx, TX_SETSAXATTR, zp, name,
value, vsize);
zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs),
NULL, obj, size);
@@ -264,6 +284,8 @@ zfs_sa_set_xattr(znode_t *zp)
VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
dmu_tx_commit(tx);
if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
}
out_free:
vmem_free(obj, size);
@@ -433,6 +455,9 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
}
}
ZFS_MODULE_PARAM(zfs, zfs_, zil_saxattr, INT, ZMOD_RW,
"Disable xattr=sa extended attribute logging in ZIL by settng 0.");
EXPORT_SYMBOL(zfs_attr_table);
EXPORT_SYMBOL(zfs_sa_readlink);
EXPORT_SYMBOL(zfs_sa_symlink);
+74
View File
@@ -663,6 +663,38 @@ zilog_is_dirty(zilog_t *zilog)
return (B_FALSE);
}
/*
* Its called in zil_commit context (zil_process_commit_list()/zil_create()).
* It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled.
* Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every
* zil_commit.
*/
static void
zil_commit_activate_saxattr_feature(zilog_t *zilog)
{
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
if (spa_feature_is_enabled(zilog->zl_spa,
SPA_FEATURE_ZILSAXATTR) &&
dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
!dsl_dataset_feature_is_active(ds,
SPA_FEATURE_ZILSAXATTR)) {
tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
dsl_dataset_dirty(ds, tx);
txg = dmu_tx_get_txg(tx);
mutex_enter(&ds->ds_lock);
ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
(void *)B_TRUE;
mutex_exit(&ds->ds_lock);
dmu_tx_commit(tx);
txg_wait_synced(zilog->zl_dmu_pool, txg);
}
}
/*
* Create an on-disk intent log.
*/
@@ -677,6 +709,8 @@ zil_create(zilog_t *zilog)
int error = 0;
boolean_t fastwrite = FALSE;
boolean_t slog = FALSE;
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
/*
* Wait for any previous destroy to complete.
@@ -724,9 +758,33 @@ zil_create(zilog_t *zilog)
* (zh is part of the MOS, so we cannot modify it in open context.)
*/
if (tx != NULL) {
/*
* If "zilsaxattr" feature is enabled on zpool, then activate
* it now when we're creating the ZIL chain. We can't wait with
* this until we write the first xattr log record because we
* need to wait for the feature activation to sync out.
*/
if (spa_feature_is_enabled(zilog->zl_spa,
SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
DMU_OST_ZVOL) {
mutex_enter(&ds->ds_lock);
ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
(void *)B_TRUE;
mutex_exit(&ds->ds_lock);
}
dmu_tx_commit(tx);
txg_wait_synced(zilog->zl_dmu_pool, txg);
} else {
/*
* This branch covers the case where we enable the feature on a
* zpool that has existing ZIL headers.
*/
zil_commit_activate_saxattr_feature(zilog);
}
IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
IMPLY(error == 0, lwb != NULL);
@@ -2297,6 +2355,11 @@ zil_process_commit_list(zilog_t *zilog)
if (lwb == NULL) {
lwb = zil_create(zilog);
} else {
/*
* Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
* have already been created (zl_lwb_list not empty).
*/
zil_commit_activate_saxattr_feature(zilog);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
@@ -3075,6 +3138,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
@@ -3092,6 +3156,16 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
*/
zil_init_log_chain(zilog, &blk);
zh->zh_log = blk;
} else {
/*
* A destroyed ZIL chain can't contain any TX_SETSAXATTR
* records. So, deactivate the feature for this dataset.
* We activate it again when we start a new ZIL chain.
*/
if (dsl_dataset_feature_is_active(ds,
SPA_FEATURE_ZILSAXATTR))
dsl_dataset_deactivate_feature(ds,
SPA_FEATURE_ZILSAXATTR, tx);
}
}