mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
Enhance comments for large dnode project
Fix a few nits in the comments from large dnodes. Also import some of the commit message as a comment in the code, making it more accessible. Reviewed-by: @rottegift Reviewed-by: George Melikov <mail@gmelikov.ru> Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: George Wilson <george.wilson@delphix.com> Signed-off-by: Matt Ahrens <mahrens@delphix.com> Closes #6551
This commit is contained in:
parent
2209e40981
commit
1e0457e7f5
@ -145,6 +145,57 @@ enum dnode_dirtycontext {
|
||||
|
||||
#define DNODE_CRYPT_PORTABLE_FLAGS_MASK (DNODE_FLAG_SPILL_BLKPTR)
|
||||
|
||||
/*
|
||||
* VARIABLE-LENGTH (LARGE) DNODES
|
||||
*
|
||||
* The motivation for variable-length dnodes is to eliminate the overhead
|
||||
* associated with using spill blocks. Spill blocks are used to store
|
||||
* system attribute data (i.e. file metadata) that does not fit in the
|
||||
* dnode's bonus buffer. By allowing a larger bonus buffer area the use of
|
||||
* a spill block can be avoided. Spill blocks potentially incur an
|
||||
* additional read I/O for every dnode in a dnode block. As a worst case
|
||||
* example, reading 32 dnodes from a 16k dnode block and all of the spill
|
||||
* blocks could issue 33 separate reads. Now suppose those dnodes have size
|
||||
* 1024 and therefore don't need spill blocks. Then the worst case number
|
||||
* of blocks read is reduced from 33 to two--one per dnode block.
|
||||
*
|
||||
* ZFS-on-Linux systems that make heavy use of extended attributes benefit
|
||||
* from this feature. In particular, ZFS-on-Linux supports the xattr=sa
|
||||
* dataset property which allows file extended attribute data to be stored
|
||||
* in the dnode bonus buffer as an alternative to the traditional
|
||||
* directory-based format. Workloads such as SELinux and the Lustre
|
||||
* distributed filesystem often store enough xattr data to force spill
|
||||
* blocks when xattr=sa is in effect. Large dnodes may therefore provide a
|
||||
* performance benefit to such systems. Other use cases that benefit from
|
||||
* this feature include files with large ACLs and symbolic links with long
|
||||
* target names.
|
||||
*
|
||||
* The size of a dnode may be a multiple of 512 bytes up to the size of a
|
||||
* dnode block (currently 16384 bytes). The dn_extra_slots field of the
|
||||
* on-disk dnode_phys_t structure describes the size of the physical dnode
|
||||
* on disk. The field represents how many "extra" dnode_phys_t slots a
|
||||
* dnode consumes in its dnode block. This convention results in a value of
|
||||
* 0 for 512 byte dnodes which preserves on-disk format compatibility with
|
||||
* older software which doesn't support large dnodes.
|
||||
*
|
||||
* Similarly, the in-memory dnode_t structure has a dn_num_slots field
|
||||
* to represent the total number of dnode_phys_t slots consumed on disk.
|
||||
* Thus dn->dn_num_slots is 1 greater than the corresponding
|
||||
* dnp->dn_extra_slots. This difference in convention was adopted
|
||||
* because, unlike on-disk structures, backward compatibility is not a
|
||||
* concern for in-memory objects, so we used a more natural way to
|
||||
* represent size for a dnode_t.
|
||||
*
|
||||
* The default size for newly created dnodes is determined by the value of
|
||||
* the "dnodesize" dataset property. By default the property is set to
|
||||
* "legacy" which is compatible with older software. Setting the property
|
||||
* to "auto" will allow the filesystem to choose the most suitable dnode
|
||||
* size. Currently this just sets the default dnode size to 1k, but future
|
||||
* code improvements could dynamically choose a size based on observed
|
||||
* workload patterns. Dnodes of varying sizes can coexist within the same
|
||||
* dataset and even within the same dnode block.
|
||||
*/
|
||||
|
||||
typedef struct dnode_phys {
|
||||
uint8_t dn_type; /* dmu_object_type_t */
|
||||
uint8_t dn_indblkshift; /* ln2(indirect block size) */
|
||||
|
@ -318,7 +318,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
|
||||
dmu_object_info_t doi;
|
||||
|
||||
error = dmu_object_info(os, i, &doi);
|
||||
if (error)
|
||||
if (error != 0)
|
||||
skip = 1;
|
||||
else
|
||||
skip = doi.doi_dnodesize >> DNODE_SHIFT;
|
||||
|
@ -1176,6 +1176,18 @@ dnode_rele_slots(dnode_children_t *children, int idx, int slots)
|
||||
}
|
||||
|
||||
/*
|
||||
* When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
|
||||
* to ensure the hole at the specified object offset is large enough to
|
||||
* hold the dnode being created. The slots parameter is also used to ensure
|
||||
* a dnode does not span multiple dnode blocks. In both of these cases, if
|
||||
* a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
|
||||
* are only possible when using DNODE_MUST_BE_FREE.
|
||||
*
|
||||
* If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
|
||||
* dnode_hold_impl() will check if the requested dnode is already consumed
|
||||
* as an extra dnode slot by a large dnode, in which case it returns
|
||||
* ENOENT.
|
||||
*
|
||||
* errors:
|
||||
* EINVAL - invalid object number.
|
||||
* ENOSPC - hole too small to fulfill "slots" request
|
||||
|
@ -21,7 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 Cyril Plisko. All rights reserved.
|
||||
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
@ -453,8 +453,8 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
|
||||
* eventually end up in zfs_mknode(), which assigns the object's
|
||||
* creation time, generation number, and dnode slot count. The
|
||||
* generic zfs_create() has no concept of these attributes, so
|
||||
* we smuggle the values inside * the vattr's otherwise unused
|
||||
* va_ctime, va_nblocks, and va_nlink fields.
|
||||
* we smuggle the values inside the vattr's otherwise unused
|
||||
* va_ctime, va_nblocks, and va_fsid fields.
|
||||
*/
|
||||
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
|
||||
xva.xva_vattr.va_nblocks = lr->lr_gen;
|
||||
|
Loading…
Reference in New Issue
Block a user