mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-13 11:40:25 +03:00
82a37189aa
The current ZFS implementation stores xattrs on disk using a hidden directory. In this directory a file name represents the xattr name and the file contents are the xattr binary data. This approach is very flexible and allows for arbitrarily large xattrs. However, it also suffers from a significant performance penalty. Accessing a single xattr can require up to three disk seeks. 1) Lookup the dnode object. 2) Lookup the dnode's xattr directory object. 3) Lookup the xattr object in the directory. To avoid this performance penalty Linux filesystems such as ext3 and xfs try to store the xattr as part of the inode on disk. When the xattr is too large to store in the inode then a single external block is allocated for them. In practice most xattrs are small and this approach works well. The addition of System Attributes (SA) to zfs provides us a clean way to make this optimization. When the dataset property 'xattr=sa' is set then xattrs will be preferentially stored as System Attributes. This allows tiny xattrs (~100 bytes) to be stored with the dnode and up to 64k of xattrs to be stored in the spill block. If additional xattr space is required, which is unlikely under Linux, they will be stored using the traditional directory approach. This optimization results in roughly a 3x performance improvement when accessing xattrs which brings zfs roughly to parity with ext4 and xfs (see table below). When multiple xattrs are stored per-file the performance improvements are even greater because all of the xattrs stored in the spill block will be cached. However, by default SA based xattrs are disabled in the Linux port to maximize compatibility with other implementations. If you do enable SA based xattrs then they will not be visible on platforms which do not support this feature.
----------------------------------------------------------------------
   Time in seconds to get/set one xattr of N bytes on 100,000 files
------+--------------------------------+------------------------------
      |            setxattr            |            getxattr
bytes |  ext4     xfs zfs-dir  zfs-sa  |  ext4     xfs zfs-dir  zfs-sa
------+--------------------------------+------------------------------
1     |  2.33   31.88   21.50    4.57  |  2.35    2.64    6.29    2.43
32    |  2.79   30.68   21.98    4.60  |  2.44    2.59    6.78    2.48
256   |  3.25   31.99   21.36    5.92  |  2.32    2.71    6.22    3.14
1024  |  3.30   32.61   22.83    8.45  |  2.40    2.79    6.24    3.27
4096  |  3.57  317.46   22.52   10.73  |  2.78   28.62    6.90    3.94
16384 |   n/a 2342.39   34.30   19.20  |   n/a   45.44  145.90    7.55
65536 |   n/a 2941.39  128.15  131.32* |   n/a  141.92  256.85  262.12*

Legend:
* ext4 - Stock RHEL6.1 ext4 mounted with '-o user_xattr'.
* xfs - Stock RHEL6.1 xfs mounted with default options.
* zfs-dir - Directory based xattrs only.
* zfs-sa - Prefer SAs but spill into directories as needed, a trailing * indicates overflow into directories occurred.

NOTE: Ext4 supports 4096 bytes of xattr name/value pairs per file.
NOTE: XFS and ZFS have no limit on xattr name/value pairs per file.
NOTE: Linux limits individual name/value pairs to 65536 bytes.
NOTE: All setattr/getattr's were done after dropping the cache.
NOTE: All tests were run against a single hard drive.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #443
173 lines
5.1 KiB
C
173 lines
5.1 KiB
C
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
|
|
|
|
#ifndef _SYS_SA_H
|
|
#define _SYS_SA_H
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
/*
 * Currently available byteswap functions.
 * If at all possible new attributes should use
 * one of the already defined byteswap functions.
 * If a new byteswap function is added then the
 * ZPL/Pool version will need to be bumped.
 *
 * The enumerators carry no explicit values, so they are numbered
 * from zero in declaration order.
 */
typedef enum sa_bswap_type {
	SA_UINT64_ARRAY,
	SA_UINT32_ARRAY,
	SA_UINT16_ARRAY,
	SA_UINT8_ARRAY,
	SA_ACL,
} sa_bswap_type_t;
|
|
|
|
/* Numeric identifier for a single system attribute; 16 bits wide. */
typedef uint16_t sa_attr_type_t;
|
|
|
|
/*
|
|
* Attribute to register support for.
|
|
*/
|
|
typedef struct sa_attr_reg {
|
|
char *sa_name; /* attribute name */
|
|
uint16_t sa_length;
|
|
sa_bswap_type_t sa_byteswap; /* bswap functon enum */
|
|
sa_attr_type_t sa_attr; /* filled in during registration */
|
|
} sa_attr_reg_t;
|
|
|
|
|
|
typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
|
|
boolean_t, void *userptr);
|
|
|
|
/*
|
|
* array of attributes to store.
|
|
*
|
|
* This array should be treated as opaque/private data.
|
|
* The SA_BULK_ADD_ATTR() macro should be used for manipulating
|
|
* the array.
|
|
*
|
|
* When sa_replace_all_by_template() is used the attributes
|
|
* will be stored in the order defined in the array, except that
|
|
* the attributes may be split between the bonus and the spill buffer
|
|
*
|
|
*/
|
|
typedef struct sa_bulk_attr {
|
|
void *sa_data;
|
|
sa_data_locator_t *sa_data_func;
|
|
uint16_t sa_length;
|
|
sa_attr_type_t sa_attr;
|
|
/* the following are private to the sa framework */
|
|
void *sa_addr;
|
|
uint16_t sa_buftype;
|
|
uint16_t sa_size;
|
|
} sa_bulk_attr_t;
|
|
|
|
|
|
/*
 * special macro for adding entries for bulk attr support
 * b    - sa_bulk_attr_t array
 * idx  - integer lvalue used as the array index; incremented by one
 *        after the entry has been filled in
 * attr - attribute to manipulate
 * func - function for accessing data.
 * data - pointer to data.
 * len  - length of data
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon is exactly one statement and can be used as the
 * unbraced body of an if/else.  Every argument is parenthesized so that
 * expressions such as "*countp" expand correctly.  idx is expanded
 * several times, so it must not have side effects of its own.
 */
#define	SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
do {								\
	(b)[(idx)].sa_attr = (attr);				\
	(b)[(idx)].sa_data_func = (func);			\
	(b)[(idx)].sa_data = (data);				\
	(b)[(idx)++].sa_length = (len);				\
} while (0)
|
|
|
|
/* Opaque type: struct sa_os is not defined in this header. */
typedef struct sa_os sa_os_t;
|
|
|
|
/*
 * Kind of handle requested from sa_handle_get() and
 * sa_handle_get_from_db().
 * NOTE(review): the behavioral difference between a shared and a
 * private handle is defined by the SA implementation -- confirm there.
 */
typedef enum sa_handle_type {
	SA_HDL_SHARED,
	SA_HDL_PRIVATE
} sa_handle_type_t;
|
|
|
|
/*
 * struct sa_handle is only forward-declared in this header, so users of
 * this interface work with sa_handle_t through pointers.
 * sa_lookup_tab_t is an untyped (void *) pointer.
 */
struct sa_handle;
typedef void *sa_lookup_tab_t;
typedef struct sa_handle sa_handle_t;
|
|
|
|
/* Update callback type; installed with sa_register_update_callback(). */
typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
|
|
|
|
int sa_handle_get(objset_t *, uint64_t, void *userp,
|
|
sa_handle_type_t, sa_handle_t **);
|
|
int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
|
|
sa_handle_type_t, sa_handle_t **);
|
|
void sa_handle_destroy(sa_handle_t *);
|
|
int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
|
|
void sa_buf_rele(dmu_buf_t *, void *);
|
|
int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
|
|
int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
|
|
uint32_t buflen, dmu_tx_t *);
|
|
int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
|
|
int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
|
|
int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
|
|
int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
|
|
int sa_size(sa_handle_t *, sa_attr_type_t, int *);
|
|
int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
|
|
uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
|
|
void sa_object_info(sa_handle_t *, dmu_object_info_t *);
|
|
void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
|
|
void sa_update_user(sa_handle_t *, sa_handle_t *);
|
|
void *sa_get_userdata(sa_handle_t *);
|
|
void sa_set_userp(sa_handle_t *, void *);
|
|
dmu_buf_t *sa_get_db(sa_handle_t *);
|
|
uint64_t sa_handle_object(sa_handle_t *);
|
|
boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
|
|
void sa_register_update_callback(objset_t *, sa_update_cb_t *);
|
|
int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
|
|
void sa_tear_down(objset_t *);
|
|
int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
|
|
int, dmu_tx_t *);
|
|
int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
|
|
int, dmu_tx_t *);
|
|
boolean_t sa_enabled(objset_t *);
|
|
void sa_cache_init(void);
|
|
void sa_cache_fini(void);
|
|
void *sa_spill_alloc(int);
|
|
void sa_spill_free(void *);
|
|
int sa_set_sa_object(objset_t *, uint64_t);
|
|
int sa_hdrsize(void *);
|
|
void sa_handle_lock(sa_handle_t *);
|
|
void sa_handle_unlock(sa_handle_t *);
|
|
|
|
#ifdef _KERNEL
|
|
int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif /* _SYS_SA_H */
|