mirror_zfs/include/sys/zap.h

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
#ifndef _SYS_ZAP_H
#define _SYS_ZAP_H
/*
* ZAP - ZFS Attribute Processor
*
* The ZAP is a module which sits on top of the DMU (Data Management
* Unit) and implements a higher-level storage primitive using DMU
* objects. Its primary consumer is the ZPL (ZFS Posix Layer).
*
* A "zapobj" is a DMU object which the ZAP uses to stores attributes.
* Users should use only zap routines to access a zapobj - they should
* not access the DMU object directly using DMU routines.
*
* The attributes stored in a zapobj are name-value pairs. The name is
* a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
* terminating NULL). The value is an array of integers, which may be
* 1, 2, 4, or 8 bytes long. The total space used by the array (number
* of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
* Note that an 8-byte integer value can be used to store the location
* (object number) of another dmu object (which may be itself a zapobj).
* Note that you can use a zero-length attribute to store a single bit
* of information - the attribute is present or not.
*
* The ZAP routines are thread-safe. However, you must observe the
* DMU's restriction that a transaction may not be operated on
* concurrently.
*
* Any of the routines that return an int may return an I/O error (EIO
* or ECHECKSUM).
*
*
* Implementation / Performance Notes:
*
* The ZAP is intended to operate most efficiently on attributes with
* short (49 bytes or less) names and single 8-byte values, for which
* the microzap will be used. The ZAP should be efficient enough so
* that the user does not need to cache these attributes.
*
* The ZAP's locking scheme makes its routines thread-safe. Operations
* on different zapobjs will be processed concurrently. Operations on
* the same zapobj which only read data will be processed concurrently.
* Operations on the same zapobj which modify data will be processed
* concurrently when there are many attributes in the zapobj (because
* the ZAP uses per-block locking - more than 128 * (number of cpus)
* small attributes will suffice).
*/
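/*
 * Illustrative usage sketch (not part of the interface): create a zapobj,
 * store a single 8-byte value under a string name, and read it back.
 * zap_add() and zap_lookup() are declared later in this header; "os" and
 * "tx" are assumed to be a held objset and an assigned transaction, and
 * error handling is elided.
 *
 *	uint64_t zapobj, val = 42, out;
 *
 *	zapobj = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
 *	(void) zap_add(os, zapobj, "my-attr", sizeof (uint64_t), 1, &val, tx);
 *	(void) zap_lookup(os, zapobj, "my-attr", sizeof (uint64_t), 1, &out);
 */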
/*
* We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
* strings) for the names of attributes, rather than a byte string
* bounded by an explicit length. If some day we want to support names
* in character sets which have embedded zeros (eg. UTF-16, UTF-32),
* we'll have to add routines for using length-bounded strings.
*/
#include <sys/dmu.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Specifies matching criteria for ZAP lookups.
* MT_NORMALIZE Use ZAP normalization flags, which can include both
* unicode normalization and case-insensitivity.
* MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is
* specified and ZAP normalization flags include
* U8_TEXTPREP_TOUPPER.
*/
typedef enum matchtype {
MT_NORMALIZE = 1 << 0,
MT_MATCH_CASE = 1 << 1,
} matchtype_t;
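/*
 * For example, a normalized (and, if the zapobj's normalization flags
 * request it, case-insensitive) lookup might look like the sketch below.
 * zap_lookup_norm() is declared later in this header; "os" and "zapobj"
 * are assumed to be in scope and error handling is elided.
 *
 *	uint64_t value;
 *	char realname[ZAP_MAXNAMELEN];
 *	boolean_t conflict;
 *
 *	(void) zap_lookup_norm(os, zapobj, "ReadMe", sizeof (uint64_t), 1,
 *	    &value, MT_NORMALIZE, realname, sizeof (realname), &conflict);
 */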
typedef enum zap_flags {
/* Use 64-bit hash value (serialized cursors will always use 64-bits) */
ZAP_FLAG_HASH64 = 1 << 0,
/* Key is binary, not string (zap_add_uint64() can be used) */
ZAP_FLAG_UINT64_KEY = 1 << 1,
/*
* First word of key (which must be an array of uint64) is
* already randomly distributed.
*/
ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
} zap_flags_t;
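/*
 * Sketch of a binary-keyed ZAP (ZAP_FLAG_UINT64_KEY above): the key is an
 * array of uint64_t rather than a string, so the _uint64 routines declared
 * later in this header are used. The block-shift values (14, i.e. 16K
 * blocks) are illustrative; "os" and "tx" are assumed to be in scope.
 *
 *	uint64_t key[1] = { 0xc0ffee };
 *	uint64_t val = 1, out, zapobj;
 *
 *	zapobj = zap_create_flags(os, 0, ZAP_FLAG_UINT64_KEY,
 *	    DMU_OT_ZAP_OTHER, 14, 14, DMU_OT_NONE, 0, tx);
 *	(void) zap_add_uint64(os, zapobj, key, 1, sizeof (uint64_t), 1,
 *	    &val, tx);
 *	(void) zap_lookup_uint64(os, zapobj, key, 1, sizeof (uint64_t), 1,
 *	    &out);
 */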
/*
* Create a new zapobj with no attributes and return its object number.
*/
uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, int dnodesize,
dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx);
uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
/*
* Initialize an already-allocated object.
*/
void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags,
dmu_tx_t *tx);
/*
* Create a new zapobj with no attributes from the given (unallocated)
* object number.
*/
int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
int zap_create_claim_norm(objset_t *ds, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
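/*
 * Sketch: claiming a caller-chosen, currently-unallocated object number
 * "obj" as a zapobj (useful when the object number must be known in
 * advance). "os", "obj", and "tx" are assumed to be in scope.
 *
 *	int error;
 *
 *	error = zap_create_claim(os, obj, DMU_OT_ZAP_OTHER,
 *	    DMU_OT_NONE, 0, tx);
 *	if (error != 0)
 *		return (error);
 */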
int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
/*
* The zapobj passed in must be a valid ZAP object for all of the
* following routines.
*/
/*
* Destroy this zapobj and all its attributes.
*
* Frees the object number using dmu_object_free.
*/
int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
/*
* Manipulate attributes.
*
* 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
*/
/*
* Retrieve the contents of the attribute with the given name.
*
* If the requested attribute does not exist, the call will fail and
* return ENOENT.
*
* If 'integer_size' is smaller than the attribute's integer size, the
* call will fail and return EINVAL.
*
* If 'integer_size' is equal to or larger than the attribute's integer
* size, the call will succeed and return 0.
*
* When converting to a larger integer size, the integers will be treated as
* unsigned (ie. no sign-extension will be performed).
*
* 'num_integers' is the length (in integers) of 'buf'.
*
* If the attribute is longer than the buffer, as many integers as will
* fit will be transferred to 'buf'. If the entire attribute was not
* transferred, the call will return EOVERFLOW.
*/
int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
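/*
* Illustrative sketch (not part of this header): read a single 8-byte
* value stored under the name "refcount" in 'zapobj'.  The identifiers
* 'os', 'zapobj', and "refcount" are placeholders assumed to be
* supplied by the caller.
*
*	uint64_t refcount;
*	int err = zap_lookup(os, zapobj, "refcount",
*	    sizeof (uint64_t), 1, &refcount);
*	if (err == ENOENT)
*		refcount = 0;		(attribute simply not present)
*	else if (err != 0)
*		return (err);		(I/O error, EINVAL, EOVERFLOW, ...)
*/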
/*
* If rn_len is nonzero, realname will be set to the name of the found
* entry (which may be different from the requested name if matchtype is
* not MT_EXACT).
*
* If normalization_conflictp is not NULL, it will be set if there is
* another name with the same case/unicode normalized form.
*/
int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
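/*
* Illustrative sketch of a normalization-insensitive lookup that also
* reports the stored spelling of the name.  MT_NORMALIZE is assumed to
* be the matchtype flag provided by dmu.h in this tree; 'os' and
* 'zapobj' are placeholders.
*
*	uint64_t value;
*	char realname[ZAP_MAXNAMELEN];
*	boolean_t conflict;
*	int err = zap_lookup_norm(os, zapobj, "ReadMe.TXT",
*	    sizeof (uint64_t), 1, &value, MT_NORMALIZE,
*	    realname, sizeof (realname), &conflict);
*
* On success, 'realname' holds the name as actually stored (which may
* differ in case from "ReadMe.TXT") and 'conflict' reports whether
* another entry shares the same normalized form.
*/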
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp);
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite);
/*
* Create an attribute with the given name and value.
*
* If an attribute with the given name already exists, the call will
* fail and return EEXIST.
*/
int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_by_dnode(dnode_t *dn, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
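/*
* Illustrative sketch of creating an attribute inside a transaction.
* The dmu_tx_*() calls come from dmu.h and show the usual pattern
* rather than anything required by this header; 'os' and 'zapobj' are
* placeholders.
*
*	uint64_t val = 1;
*	dmu_tx_t *tx = dmu_tx_create(os);
*	dmu_tx_hold_zap(tx, zapobj, B_TRUE, "refcount");
*	int err = dmu_tx_assign(tx, TXG_WAIT);
*	if (err != 0) {
*		dmu_tx_abort(tx);
*		return (err);
*	}
*	err = zap_add(os, zapobj, "refcount",
*	    sizeof (uint64_t), 1, &val, tx);
*	dmu_tx_commit(tx);
*
* zap_add() returns EEXIST if "refcount" already exists; zap_update()
* below overwrites unconditionally.
*/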
/*
* Set the attribute with the given name to the given value. If an
* attribute with the given name does not exist, it will be created. If
* an attribute with the given name already exists, the previous value
* will be overwritten. The integer_size may be different from the
* existing attribute's integer size, in which case the attribute's
* integer size will be updated to the new value.
*/
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
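/*
* Illustrative sketch: store a NUL-terminated string as an array of
* 1-byte integers, overwriting any previous value.  'os', 'zapobj', and
* 'tx' are placeholders; 'tx' is assumed to be an assigned transaction
* that holds this zapobj via dmu_tx_hold_zap().
*
*	const char *target = "some value";
*	int err = zap_update(os, zapobj, "myattr",
*	    1, strlen(target) + 1, target, tx);
*/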
/*
* Get the length (in integers) and the integer size of the specified
* attribute.
*
* If the requested attribute does not exist, the call will fail and
* return ENOENT.
*/
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers);
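/*
* Illustrative sketch of reading an attribute whose size is not known
* in advance: query its shape with zap_length(), then size the buffer
* passed to zap_lookup().  kmem_alloc()/kmem_free() are the usual
* kernel allocators; all identifiers are placeholders.
*
*	uint64_t intsz, nints;
*	int err = zap_length(os, zapobj, "myattr", &intsz, &nints);
*	if (err == 0) {
*		void *buf = kmem_alloc(intsz * nints, KM_SLEEP);
*		err = zap_lookup(os, zapobj, "myattr", intsz, nints, buf);
*		(consume buf here)
*		kmem_free(buf, intsz * nints);
*	}
*/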
/*
* Remove the specified attribute.
*
* If the specified attribute does not exist, the call will fail and
* return ENOENT.
*/
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
/*
* Returns (in *count) the number of attributes in the specified zap
* object.
*/
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
/*
* Returns (in name) the name of the entry whose first integer
* (za_first_integer), masked with 'mask', equals 'value', or ENOENT if
* no such entry is found.  The string
* pointed to by name must be at least 256 bytes long. If mask==0, the
* match must be exact (ie, same as mask=-1ULL).
*/
int zap_value_search(objset_t *os, uint64_t zapobj,
uint64_t value, uint64_t mask, char *name);
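/*
* Illustrative sketch: find the name of the entry whose first 8-byte
* integer equals 'target'.  A mask of 0 requests an exact match on the
* full 64-bit value; 'os', 'zapobj', and 'target' are placeholders.
*
*	char name[ZAP_MAXNAMELEN];
*	int err = zap_value_search(os, zapobj, target, 0, name);
*/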
/*
* Transfer all the entries from fromobj into intoobj. Only works on
* int_size=8 num_integers=1 values. Fails if there are any duplicated
* entries.
*/
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
/* Same as zap_join, but set the values to 'value'. */
int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
uint64_t value, dmu_tx_t *tx);
/* Same as zap_join, but add together any duplicated entries. */
int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
dmu_tx_t *tx);
/*
* Manipulate entries where the name + value are the "same" (the name is
* a stringified version of the value).
*/
int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
dmu_tx_t *tx);
/* Here the key is an int and the value is a different int. */
int zap_add_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_update_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t *valuep);
int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
dmu_tx_t *tx);
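/*
* Illustrative sketch using the int-keyed wrappers to maintain a count
* per object number.  'os', 'countobj', 'obj', and 'tx' are
* placeholders; zap_update_int_key() creates the entry if it does not
* yet exist.
*
*	uint64_t count = 0;
*	int err = zap_lookup_int_key(os, countobj, obj, &count);
*	if (err != 0 && err != ENOENT)
*		return (err);
*	VERIFY0(zap_update_int_key(os, countobj, obj, count + 1, tx));
*
* zap_increment_int(os, countobj, obj, 1, tx) folds the same
* read-modify-write into a single call.
*/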
struct zap;
struct zap_leaf;
typedef struct zap_cursor {
/* This structure is opaque! */
objset_t *zc_objset;
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
boolean_t zc_prefetch;
} zap_cursor_t;
typedef struct {
int za_integer_length;
/*
* za_normalization_conflict will be set if there are additional
* entries with this normalized form (eg, "foo" and "Foo").
*/
boolean_t za_normalization_conflict;
uint64_t za_num_integers;
uint64_t za_first_integer; /* no sign extension for <8byte ints */
char za_name[ZAP_MAXNAMELEN];
} zap_attribute_t;
/*
* The interface for listing all the attributes of a zapobj can be
* thought of as cursor moving down a list of the attributes one by
* one. The cookie returned by the zap_cursor_serialize routine is
* persistent across system calls (and across reboot, even).
*/
/*
* Initialize a zap cursor, pointing to the "first" attribute of the
* zapobj. You must _fini the cursor when you are done with it.
*/
void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
void zap_cursor_fini(zap_cursor_t *zc);
/*
* Get the attribute currently pointed to by the cursor. Returns
* ENOENT if at the end of the attributes.
*/
int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
/*
* Advance the cursor to the next attribute.
*/
void zap_cursor_advance(zap_cursor_t *zc);
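/*
* Illustrative sketch of walking every attribute of a zapobj with the
* cursor interface declared above; 'os' and 'zapobj' are placeholders.
* zap_cursor_retrieve() returns ENOENT once the attributes are
* exhausted, which ends the loop.
*
*	zap_cursor_t zc;
*	zap_attribute_t za;
*
*	for (zap_cursor_init(&zc, os, zapobj);
*	    zap_cursor_retrieve(&zc, &za) == 0;
*	    zap_cursor_advance(&zc)) {
*		dprintf("%s = %llu\n", za.za_name,
*		    (u_longlong_t)za.za_first_integer);
*	}
*	zap_cursor_fini(&zc);
*/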
/*
* Get a persistent cookie pointing to the current position of the zap
* cursor. The low 4 bits in the cookie are always zero, and thus can
* be used to differentiate a serialized cookie from a different type
* of value. The cookie will be less than 2^32 as long as there are
* fewer than 2^22 (4.2 million) entries in the zap object.
*/
uint64_t zap_cursor_serialize(zap_cursor_t *zc);
/*
* Initialize a zap cursor pointing to the position recorded by
* zap_cursor_serialize (in the "serialized" argument). You can also
* use a "serialized" argument of 0 to start at the beginning of the
* zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
* zap_cursor_init(...).)
*/
void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
uint64_t zapobj, uint64_t serialized);
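/*
* Illustrative sketch of suspending and resuming iteration across
* calls, e.g. for a readdir-style interface.  'cookie' is a uint64_t
* the caller persists between calls; consume() stands in for whatever
* the caller does with each entry; 'os' and 'zapobj' are placeholders.
*
*	zap_cursor_t zc;
*	zap_attribute_t za;
*
*	zap_cursor_init_serialized(&zc, os, zapobj, cookie);
*	if (zap_cursor_retrieve(&zc, &za) == 0) {
*		consume(&za);
*		zap_cursor_advance(&zc);
*		cookie = zap_cursor_serialize(&zc);
*	}
*	zap_cursor_fini(&zc);
*/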
#define ZAP_HISTOGRAM_SIZE 10
typedef struct zap_stats {
/*
* Size of the pointer table (in number of entries).
* This is always a power of 2, or zero if it's a microzap.
* In general, it should be considerably greater than zs_num_leafs.
*/
uint64_t zs_ptrtbl_len;
uint64_t zs_blocksize; /* size of zap blocks */
/*
* The number of blocks used. Note that some blocks may be
* wasted because old ptrtbl's and large name/value blocks are
* not reused. (Although their space is reclaimed, we don't
* reuse those offsets in the object.)
*/
uint64_t zs_num_blocks;
/*
* Pointer table values from zap_ptrtbl in the zap_phys_t
*/
uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
uint64_t zs_ptrtbl_blks_copied; /* number of source blocks copied */
uint64_t zs_ptrtbl_zt_blk; /* starting block number */
uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
/*
* Values of the other members of the zap_phys_t
*/
uint64_t zs_block_type; /* ZBT_HEADER */
uint64_t zs_magic; /* ZAP_MAGIC */
uint64_t zs_num_leafs; /* The number of leaf blocks */
uint64_t zs_num_entries; /* The number of zap entries */
uint64_t zs_salt; /* salt to stir into hash function */
/*
* Histograms. For all histograms, the last index
* (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
* than what can be represented. For example
* zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
* of leafs with more than 45 entries.
*/
/*
* zs_leafs_with_n_pointers[n] is the number of leafs with
* 2^n pointers to it.
*/
uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
/*
* zs_leafs_with_n_entries[n] is the number of leafs with
* [n*5, (n+1)*5) entries. In the current implementation, there
* can be at most 55 entries in any block, but there may be
* fewer if the name or value is large, or the block is not
* completely full.
*/
uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
/*
* zs_leafs_n_tenths_full[n] is the number of leafs whose
* fullness is in the range [n/10, (n+1)/10).
*/
uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
/*
* zs_entries_using_n_chunks[n] is the number of entries which
* consume n 24-byte chunks. (Note, large names/values only use
* one chunk, but contribute to zs_num_blocks_large.)
*/
uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
/*
* zs_buckets_with_n_entries[n] is the number of buckets (each
* leaf has 64 buckets) with n entries.
* zs_buckets_with_n_entries[1] should be very close to
* zs_num_entries.
*/
uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
} zap_stats_t;
/*
* Get statistics about a ZAP object. Note: you need to be aware of the
* internal implementation of the ZAP to correctly interpret some of the
* statistics. This interface shouldn't be relied on unless you really
* know what you're doing.
*/
int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
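/*
* Illustrative sketch: report a couple of coarse figures for debugging;
* 'os' and 'zapobj' are placeholders.  Interpreting the finer-grained
* histograms requires knowledge of the ZAP internals, as noted above.
*
*	zap_stats_t zs;
*	if (zap_get_stats(os, zapobj, &zs) == 0)
*		dprintf("%llu entries in %llu leaf blocks\n",
*		    (u_longlong_t)zs.zs_num_entries,
*		    (u_longlong_t)zs.zs_num_leafs);
*/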
#ifdef __cplusplus
}
#endif
#endif /* _SYS_ZAP_H */