diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2ac84f645..2b6abbf1b 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -309,6 +309,11 @@ typedef enum { ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; +typedef enum { + ZFS_XATTR_OFF = 0, + ZFS_XATTR_DIR = 1, + ZFS_XATTR_SA = 2 +} zfs_xattr_type_t; /* * On-disk version number. diff --git a/include/sys/sa.h b/include/sys/sa.h index c8b924771..718cbfbd5 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -149,6 +149,8 @@ int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, boolean_t sa_enabled(objset_t *); void sa_cache_init(void); void sa_cache_fini(void); +void *sa_spill_alloc(int); +void sa_spill_free(void *); int sa_set_sa_object(objset_t *, uint64_t); int sa_hdrsize(void *); void sa_handle_lock(sa_handle_t *); diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h index 90bb9972b..0bac7808a 100644 --- a/include/sys/zfs_sa.h +++ b/include/sys/zfs_sa.h @@ -73,6 +73,7 @@ typedef enum zpl_attr { ZPL_SYMLINK, ZPL_SCANSTAMP, ZPL_DACL_ACES, + ZPL_DXATTR, ZPL_END } zpl_attr_t; @@ -126,12 +127,20 @@ typedef struct znode_phys { } znode_phys_t; #ifdef _KERNEL + +#define DXATTR_MAX_ENTRY_SIZE (32768) +#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) + int zfs_sa_readlink(struct znode *, uio_t *); void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +int zfs_sa_get_xattr(struct znode *); +int zfs_sa_set_xattr(struct znode *); void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +void zfs_sa_init(void); +void zfs_sa_fini(void); #endif #ifdef __cplusplus diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index fc8be58bc..6d4d713ce 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -76,6 +76,7 @@ typedef struct zfs_sb { boolean_t z_use_fuids; /* version 
allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_xattr_sa; /* allow xattrs to be stored as SA */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 6a0c6a4df..6903ad4cc 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -105,6 +105,7 @@ extern "C" { #define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] #define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] #define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR] #define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] /* @@ -206,6 +207,8 @@ typedef struct znode { uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ + krwlock_t z_xattr_lock; /* xattr data lock */ + nvlist_t *z_xattr_cached;/* cached xattrs */ list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? 
*/ diff --git a/module/nvpair/nvpair_alloc_spl.c b/module/nvpair/nvpair_alloc_spl.c index d26d26913..63d57a19a 100644 --- a/module/nvpair/nvpair_alloc_spl.c +++ b/module/nvpair/nvpair_alloc_spl.c @@ -30,7 +30,7 @@ static void * nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size) { - return (kmem_alloc(size, KM_SLEEP)); + return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } static void * diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 9d65e35de..9afe3d900 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -186,6 +186,14 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t xattr_table[] = { + { "off", ZFS_XATTR_OFF }, + { "on", ZFS_XATTR_DIR }, + { "sa", ZFS_XATTR_SA }, + { "dir", ZFS_XATTR_DIR }, + { NULL } + }; + /* inherit index properties */ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, @@ -226,6 +234,9 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); + zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off | dir | sa", "XATTR", xattr_table); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, @@ -244,12 +255,8 @@ zfs_prop_init(void) boolean_table); zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); - zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", - boolean_table); zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", - boolean_table); + ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, 
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 4278ed7e4..bcef7d1fb 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -201,6 +201,7 @@ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = 16; static kmem_cache_t *sa_cache = NULL; +static kmem_cache_t *spill_cache = NULL; /*ARGSUSED*/ static int @@ -232,6 +233,8 @@ sa_cache_init(void) sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); + spill_cache = kmem_cache_create("spill_cache", + SPA_MAXBLOCKSIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void @@ -239,6 +242,21 @@ sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); + + if (spill_cache) + kmem_cache_destroy(spill_cache); +} + +void * +sa_spill_alloc(int flags) +{ + return kmem_cache_alloc(spill_cache, flags); +} + +void +sa_spill_free(void *obj) +{ + kmem_cache_free(spill_cache, obj); } static int @@ -1618,7 +1636,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; - int bonus_data_size = 0, spill_data_size = 0; + int bonus_data_size = 0; int spill_attr_count = 0; int error; uint16_t length; @@ -1648,8 +1666,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { - spill_data_size = hdl->sa_spill->db_size; - old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); + ASSERT3U(hdl->sa_spill->db_size, <=, SPA_MAXBLOCKSIZE); + old_data[1] = sa_spill_alloc(KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = @@ -1729,7 +1747,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) - kmem_free(old_data[1], spill_data_size); + sa_spill_free(old_data[1]); 
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); @@ -1998,6 +2016,8 @@ EXPORT_SYMBOL(sa_replace_all_by_template_locked); EXPORT_SYMBOL(sa_enabled); EXPORT_SYMBOL(sa_cache_init); EXPORT_SYMBOL(sa_cache_fini); +EXPORT_SYMBOL(sa_spill_alloc); +EXPORT_SYMBOL(sa_spill_free); EXPORT_SYMBOL(sa_set_sa_object); EXPORT_SYMBOL(sa_hdrsize); EXPORT_SYMBOL(sa_handle_lock); diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 324e9b96c..3ec6f0d70 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -63,6 +63,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0}, {NULL, 0, 0, 0} }; @@ -183,6 +184,83 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) } } +int +zfs_sa_get_xattr(znode_t *zp) +{ + zfs_sb_t *zsb = ZTOZSB(zp); + char *obj; + int size; + int error; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + ASSERT(!zp->z_xattr_cached); + ASSERT(zp->z_is_sa); + + error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size); + if (error) { + if (error == ENOENT) + return nvlist_alloc(&zp->z_xattr_cached, + NV_UNIQUE_NAME, KM_SLEEP); + else + return (error); + } + + obj = sa_spill_alloc(KM_SLEEP); + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size); + if (error == 0) + error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP); + + sa_spill_free(obj); + + return (error); +} + +int +zfs_sa_set_xattr(znode_t *zp) +{ + zfs_sb_t *zsb = ZTOZSB(zp); + dmu_tx_t *tx; + char *obj; + size_t size; + int error; + + ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); + ASSERT(zp->z_xattr_cached); + ASSERT(zp->z_is_sa); + + error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR); + if (error) + goto out; + + obj = sa_spill_alloc(KM_SLEEP); + + error = nvlist_pack(zp->z_xattr_cached, &obj, &size, + NV_ENCODE_XDR, KM_SLEEP); + if (error) + goto out_free; + + tx = 
dmu_tx_create(zsb->z_os); + dmu_tx_hold_sa_create(tx, size); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), + obj, size, tx); + /* + * An assigned tx must be committed, never aborted. + */ + dmu_tx_commit(tx); + } +out_free: + sa_spill_free(obj); +out: + return (error); +} + /* * I'm not convinced we should do any of this upgrade. * since the SA code can read both old/new znode formats @@ -338,6 +416,8 @@ EXPORT_SYMBOL(zfs_sa_readlink); EXPORT_SYMBOL(zfs_sa_symlink); EXPORT_SYMBOL(zfs_sa_get_scanstamp); EXPORT_SYMBOL(zfs_sa_set_scanstamp); +EXPORT_SYMBOL(zfs_sa_get_xattr); +EXPORT_SYMBOL(zfs_sa_set_xattr); EXPORT_SYMBOL(zfs_sa_upgrade); EXPORT_SYMBOL(zfs_sa_upgrade_txholds); diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 920d87e4f..a0726e117 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -140,10 +140,16 @@ xattr_changed_cb(void *arg, uint64_t newval) { zfs_sb_t *zsb = arg; - if (newval == TRUE) - zsb->z_flags |= ZSB_XATTR; - else + if (newval == ZFS_XATTR_OFF) { zsb->z_flags &= ~ZSB_XATTR; + } else { + zsb->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zsb->z_xattr_sa = B_TRUE; + else + zsb->z_xattr_sa = B_FALSE; + } } static void @@ -641,6 +647,10 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) &sa_obj); if (error) goto out; + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &zval); + if ((error == 0) && (zval == ZFS_XATTR_SA)) + zsb->z_xattr_sa = B_TRUE; } else { /* * Pre SA versions file systems should never touch diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 0443b3065..1edbd7e2e 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -106,6 +106,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, 
MUTEX_DEFAULT, NULL); + rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zp->z_range_avl, zfs_range_compare, @@ -113,6 +114,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; + zp->z_xattr_cached = NULL; zp->z_moved = 0; return (0); } @@ -128,11 +130,13 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); + rw_destroy(&zp->z_xattr_lock); avl_destroy(&zp->z_range_avl); mutex_destroy(&zp->z_range_lock); ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); + ASSERT(zp->z_xattr_cached == NULL); } void @@ -272,6 +276,11 @@ zfs_inode_destroy(struct inode *ip) zp->z_acl_cached = NULL; } + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + kmem_cache_free(znode_cache, zp); } diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c index cf52e720d..9117b7bc1 100644 --- a/module/zfs/zpl_xattr.c +++ b/module/zfs/zpl_xattr.c @@ -29,40 +29,54 @@ * as practically no size limit on the file, and the extended * attributes permissions may differ from those of the parent file. * This interface is really quite clever, but it's also completely - * different than what is supported on Linux. + * different than what is supported on Linux. It also comes with a + * steep performance penalty when accessing small xattrs because they + * are not stored with the parent file. * * Under Linux extended attributes are manipulated by the system * calls getxattr(2), setxattr(2), and listxattr(2). They consider * extended attributes to be name/value pairs where the name is a * NULL terminated string. The name must also include one of the - * following name space prefixes: + * following namespace prefixes: * * user - No restrictions and is available to user applications. 
* trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. * system - Used for access control lists (system.nfs4_acl, etc). * security - Used by SELinux to store a files security context. * - * This Linux interface is implemented internally using the more - * flexible Solaris style extended attributes. Every extended - * attribute is store as a file in a hidden directory associated - * with the parent file. This ensures on disk compatibility with - * zfs implementations on other platforms (Solaris, FreeBSD, MacOS). + * The value under Linux is limited to 65536 bytes of binary data. + * In practice, individual xattrs tend to be much smaller than this + * and are typically less than 100 bytes. A good example of this + * are the security.selinux xattrs which are less than 100 bytes and + * exist for every file when xattr labeling is enabled. * - * One consequence of this implementation is that when an extended - * attribute is manipulated an inode is created. This inode will - * exist in the Linux inode cache but there will be no associated - * entry in the dentry cache which references it. This is safe - * but it may result in some confusion. + * The Linux xattr implementation has been written to take advantage of + * this typical usage. When the dataset property 'xattr=sa' is set, + * then xattrs will be preferentially stored as System Attributes (SA). + * This allows tiny xattrs (~100 bytes) to be stored with the dnode and + * up to 64k of xattrs to be stored in the spill block. If additional + * xattr space is required, which is unlikely under Linux, they will + * be stored using the traditional directory approach. * - * Longer term I would like to see the 'security.selinux' extended - * attribute moved to a SA. This should significantly improve - * performance on a SELinux enabled system by minimizing the - * number of seeks required to access a file. 
However, for now - * this xattr is still stored in a file because I'm pretty sure - * adding a new SA will break on-disk compatibility. + * This optimization results in roughly a 3x performance improvement + * when accessing xattrs because it avoids the need to perform a seek + * for every xattr value. When multiple xattrs are stored per-file + * the performance improvements are even greater because all of the + * xattrs stored in the spill block will be cached. + * + * However, by default SA based xattrs are disabled in the Linux port + * to maximize compatibility with other implementations. If you do + * enable SA based xattrs then they will not be visible on platforms + * which do not support this feature. + * + * NOTE: One additional consequence of the xattr directory implementation + * is that when an extended attribute is manipulated an inode is created. + * This inode will exist in the Linux inode cache but there will be no + * associated entry in the dentry cache which references it. This is + * safe but it may result in some confusion. Enabling SA based xattrs + * largely avoids the issue except in the overflow case. */ - #include #include #include @@ -104,17 +118,13 @@ zpl_xattr_filldir(void *arg, const char *name, int name_len, return (0); } -ssize_t -zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +static ssize_t +zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) { - struct inode *ip = dentry->d_inode; + struct inode *ip = xf->inode; struct inode *dxip = NULL; loff_t pos = 3; /* skip '.', '..', and '.zfs' entries. 
*/ - cred_t *cr = CRED(); int error; - xattr_filldir_t xf = { buffer_size, 0, buffer, ip }; - - crhold(cr); /* Lookup the xattr directory */ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); @@ -122,34 +132,84 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (error == -ENOENT) error = 0; - goto out; + return (error); } /* Fill provided buffer via zpl_zattr_filldir helper */ - error = -zfs_readdir(dxip, (void *)&xf, zpl_xattr_filldir, &pos, cr); + error = -zfs_readdir(dxip, (void *)xf, zpl_xattr_filldir, &pos, cr); + iput(dxip); + + return (error); +} + +static ssize_t +zpl_xattr_list_sa(xattr_filldir_t *xf) +{ + znode_t *zp = ITOZ(xf->inode); + nvpair_t *nvp = NULL; + int error = 0; + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + + while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { + ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); + + error = zpl_xattr_filldir((void *)xf, nvpair_name(nvp), + strlen(nvpair_name(nvp)), 0, 0, 0); + if (error) + return (error); + } + + return (0); +} + +ssize_t +zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + znode_t *zp = ITOZ(dentry->d_inode); + zfs_sb_t *zsb = ZTOZSB(zp); + xattr_filldir_t xf = { buffer_size, 0, buffer, dentry->d_inode }; + cred_t *cr = CRED(); + int error = 0; + + crhold(cr); + rw_enter(&zp->z_xattr_lock, RW_READER); + + if (zsb->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_list_sa(&xf); + if (error) + goto out; + } + + error = zpl_xattr_list_dir(&xf, cr); if (error) goto out; error = xf.offset; out: - if (dxip) - iput(dxip); + rw_exit(&zp->z_xattr_lock); crfree(cr); return (error); } static int -zpl_xattr_get(struct inode *ip, const char *name, void *buf, size_t size) +zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, + size_t size, cred_t *cr) { struct 
inode *dxip = NULL; struct inode *xip = NULL; - cred_t *cr = CRED(); int error; - crhold(cr); - /* Lookup the xattr directory */ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); if (error) @@ -165,7 +225,7 @@ zpl_xattr_get(struct inode *ip, const char *name, void *buf, size_t size) goto out; } - error = zpl_read_common(xip, buf, size, 0, UIO_SYSSPACE, 0, cr); + error = zpl_read_common(xip, value, size, 0, UIO_SYSSPACE, 0, cr); out: if (xip) iput(xip); @@ -173,8 +233,59 @@ out: if (dxip) iput(dxip); - crfree(cr); + return (error); +} +static int +zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + uchar_t *nv_value; + uint_t nv_size; + int error = 0; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, + &nv_value, &nv_size); + if (error) + return (error); + + if (!size) + return (nv_size); + if (size < nv_size) /* getxattr(2): buffer too small, never truncate */ + return (-ERANGE); + memcpy(value, nv_value, nv_size); + return (nv_size); +} + +static int +__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, + cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ZTOZSB(zp); + int error; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + if (zsb->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_get_sa(ip, name, value, size); + if (error != -ENOENT) /* only a missing SA entry falls back to the dir */ + goto out; + } + + error = zpl_xattr_get_dir(ip, name, value, size, cr); +out: + if (error == -ENOENT) + error = -ENODATA; @@ -182,42 +293,43 @@ out: } static int -zpl_xattr_set(struct inode *ip, const char *name, const void *value, - size_t size, int flags) +zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + cred_t *cr = CRED(); + int error; + + crhold(cr); + rw_enter(&zp->z_xattr_lock, RW_READER); + 
error = __zpl_xattr_get(ip, name, value, size, cr); + rw_exit(&zp->z_xattr_lock); + crfree(cr); + + return (error); +} + +static int +zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) { struct inode *dxip = NULL; struct inode *xip = NULL; vattr_t *vap = NULL; - cred_t *cr = CRED(); ssize_t wrote; int error; const int xattr_mode = S_IFREG | 0644; - crhold(cr); - /* Lookup the xattr directory and create it if required. */ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR, cr, NULL, NULL); if (error) goto out; - /* - * Lookup a specific xattr name in the directory, two failure modes: - * XATTR_CREATE: fail if xattr already exists - * XATTR_REMOVE: fail if xattr does not exist - */ + /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); - if (error) { - if (error != -ENOENT) - goto out; + if (error && (error != -ENOENT)) + goto out; - if ((error == -ENOENT) && (flags & XATTR_REPLACE)) - goto out; - } else { - error = -EEXIST; - if (flags & XATTR_CREATE) - goto out; - } error = 0; /* Remove a specific name xattr when value is set to NULL. 
*/ @@ -262,7 +374,6 @@ out: if (dxip) iput(dxip); - crfree(cr); if (error == -ENOENT) error = -ENODATA; @@ -271,9 +382,101 @@ out: return (error); } +static int +zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + nvlist_t *nvl; + size_t sa_size; + int error; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + if (value == NULL) { + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + if (error == -ENOENT) + error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); + } else { + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) + return (-EFBIG); + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + return (error); + + if (sa_size > DXATTR_MAX_SA_SIZE) + return (-EFBIG); + + error = -nvlist_add_byte_array(nvl, name, + (uchar_t *)value, size); + if (error) + return (error); + } + + /* Update the SA for additions, modifications, and removals. */ + if (!error) + error = -zfs_sa_set_xattr(zp); + + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_xattr_set(struct inode *ip, const char *name, const void *value, + size_t size, int flags) +{ + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ZTOZSB(zp); + cred_t *cr = CRED(); + int error; + + crhold(cr); + rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); + + /* + * Before setting the xattr check to see if it already exists. + * This is done to ensure the following optional flags are honored. 
+ * + * XATTR_CREATE: fail if xattr already exists + * XATTR_REPLACE: fail if xattr does not exist + */ + error = __zpl_xattr_get(ip, name, NULL, 0, cr); + if (error < 0) { + if (error != -ENODATA) + goto out; + + if ((error == -ENODATA) && (flags & XATTR_REPLACE)) + goto out; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto out; + } + + /* Preferentially store the xattr as a SA for better performance */ + if (zsb->z_use_sa && zsb->z_xattr_sa && zp->z_is_sa) { + error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); + if (error == 0) + goto out; + } + + error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); +out: + rw_exit(&ITOZ(ip)->z_xattr_lock); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + static int __zpl_xattr_user_get(struct inode *ip, const char *name, - void *buffer, size_t size) + void *value, size_t size) { char *xattr_name; int error; @@ -285,7 +488,7 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, return -EOPNOTSUPP; xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, buffer, size); + error = zpl_xattr_get(ip, xattr_name, value, size); strfree(xattr_name); return (error); @@ -321,7 +524,7 @@ xattr_handler_t zpl_xattr_user_handler = { static int __zpl_xattr_trusted_get(struct inode *ip, const char *name, - void *buffer, size_t size) + void *value, size_t size) { char *xattr_name; int error; @@ -333,7 +536,7 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, return -EINVAL; xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, buffer, size); + error = zpl_xattr_get(ip, xattr_name, value, size); strfree(xattr_name); return (error); @@ -369,7 +572,7 @@ xattr_handler_t zpl_xattr_trusted_handler = { static int __zpl_xattr_security_get(struct inode *ip, const char *name, - void *buffer, size_t size) + void *value, size_t size) { char *xattr_name; int error; @@ -378,7 +581,7 @@ 
__zpl_xattr_security_get(struct inode *ip, const char *name, return -EINVAL; xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); - error = zpl_xattr_get(ip, xattr_name, buffer, size); + error = zpl_xattr_get(ip, xattr_name, value, size); strfree(xattr_name); return (error);