mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 10:54:35 +03:00
Illumos 5027 - zfs large block support
5027 zfs large block support Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Reviewed by: Richard Elling <richard.elling@richardelling.com> Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/5027 https://github.com/illumos/illumos-gate/commit/b515258 Porting Notes: * Included in this patch is a tiny ISP2() cleanup in zio_init() from Illumos 5255. * Unlike the upstream Illumos commit this patch does not impose an arbitrary 128K block size limit on volumes. Volumes, like filesystems, are limited by the zfs_max_recordsize=1M module option. * By default the maximum record size is limited to 1M by the module option zfs_max_recordsize. This value may be safely increased up to 16M which is the largest block size supported by the on-disk format. At the moment, 1M blocks clearly offer a significant performance improvement but the benefits of going beyond this for the majority of workloads are less clear. * The illumos version of this patch increased DMU_MAX_ACCESS to 32M. This was determined not to be large enough when using 16M blocks because the zfs_make_xattrdir() function will fail (EFBIG) when assigning a TX. This was immediately observed under Linux because all newly created files must have a security xattr created and that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M. * On 32-bit platforms a hard limit of 1M is set for blocks due to the limited virtual address space. We should be able to relax this one the ABD patches are merged. Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #354
This commit is contained in:
committed by
Brian Behlendorf
parent
3df293404a
commit
f1512ee61e
+2
-1
@@ -245,7 +245,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
|
||||
* The maximum number of bytes that can be accessed as part of one
|
||||
* operation, including metadata.
|
||||
*/
|
||||
#define DMU_MAX_ACCESS (10<<20) /* 10MB */
|
||||
#define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
|
||||
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
|
||||
|
||||
#define DMU_USERUSED_OBJECT (-1ULL)
|
||||
@@ -732,6 +732,7 @@ void xuio_stat_wbuf_copied(void);
|
||||
void xuio_stat_wbuf_nocopy(void);
|
||||
|
||||
extern int zfs_prefetch_disable;
|
||||
extern int zfs_max_recordsize;
|
||||
|
||||
/*
|
||||
* Asynchronously try to read in the data.
|
||||
|
||||
@@ -99,6 +99,7 @@ struct objset {
|
||||
zfs_cache_type_t os_secondary_cache;
|
||||
zfs_sync_type_t os_sync;
|
||||
zfs_redundant_metadata_type_t os_redundant_metadata;
|
||||
int os_recordsize;
|
||||
|
||||
/* no lock needed: */
|
||||
struct dmu_tx *os_synctx; /* XXX sketchy */
|
||||
|
||||
@@ -37,12 +37,14 @@ struct dsl_dataset;
|
||||
struct drr_begin;
|
||||
struct avl_tree;
|
||||
|
||||
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
|
||||
int dmu_send(const char *tosnap, const char *fromsnap,
|
||||
boolean_t embedok, boolean_t large_block_ok,
|
||||
int outfd, struct vnode *vp, offset_t *off);
|
||||
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
|
||||
uint64_t *sizep);
|
||||
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
|
||||
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
|
||||
boolean_t embedok, boolean_t large_block_ok,
|
||||
int outfd, struct vnode *vp, offset_t *off);
|
||||
|
||||
typedef struct dmu_recv_cookie {
|
||||
struct dsl_dataset *drc_ds;
|
||||
|
||||
@@ -83,6 +83,13 @@ struct dsl_pool;
|
||||
*/
|
||||
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
|
||||
|
||||
/*
|
||||
* This field is present (with value=0) if this dataset may contain large
|
||||
* blocks (>128KB). If it is present, then this dataset
|
||||
* is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
|
||||
*/
|
||||
#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
|
||||
|
||||
/*
|
||||
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
|
||||
* name lookups should be performed case-insensitively.
|
||||
@@ -138,6 +145,8 @@ typedef struct dsl_dataset {
|
||||
/* only used in syncing context, only valid for non-snapshots: */
|
||||
struct dsl_dataset *ds_prev;
|
||||
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
|
||||
boolean_t ds_large_blocks;
|
||||
boolean_t ds_need_large_blocks;
|
||||
|
||||
/* has internal locking: */
|
||||
dsl_deadlist_t ds_deadlist;
|
||||
@@ -252,6 +261,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
|
||||
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
|
||||
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
|
||||
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
|
||||
int dsl_dataset_activate_large_blocks(const char *dsname);
|
||||
void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
|
||||
|
||||
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
|
||||
|
||||
|
||||
@@ -200,6 +200,7 @@ typedef enum {
|
||||
ZPOOL_PROP_FREEING,
|
||||
ZPOOL_PROP_FRAGMENTATION,
|
||||
ZPOOL_PROP_LEAKED,
|
||||
ZPOOL_PROP_MAXBLOCKSIZE,
|
||||
ZPOOL_PROP_TNAME,
|
||||
ZPOOL_NUM_PROPS
|
||||
} zpool_prop_t;
|
||||
|
||||
+16
-6
@@ -98,17 +98,26 @@ _NOTE(CONSTCOND) } while (0)
|
||||
_NOTE(CONSTCOND) } while (0)
|
||||
|
||||
/*
|
||||
* We currently support nine block sizes, from 512 bytes to 128K.
|
||||
* We could go higher, but the benefits are near-zero and the cost
|
||||
* of COWing a giant block to modify one byte would become excessive.
|
||||
* We currently support block sizes from 512 bytes to 16MB.
|
||||
* The benefits of larger blocks, and thus larger IO, need to be weighed
|
||||
* against the cost of COWing a giant block to modify one byte, and the
|
||||
* large latency of reading or writing a large block.
|
||||
*
|
||||
* Note that although blocks up to 16MB are supported, the recordsize
|
||||
* property can not be set larger than zfs_max_recordsize (default 1MB).
|
||||
* See the comment near zfs_max_recordsize in dsl_dataset.c for details.
|
||||
*
|
||||
* Note that although the LSIZE field of the blkptr_t can store sizes up
|
||||
* to 32MB, the dnode's dn_datablkszsec can only store sizes up to
|
||||
* 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
|
||||
*/
|
||||
#define SPA_MINBLOCKSHIFT 9
|
||||
#define SPA_MAXBLOCKSHIFT 17
|
||||
#define SPA_OLD_MAXBLOCKSHIFT 17
|
||||
#define SPA_MAXBLOCKSHIFT 24
|
||||
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
|
||||
#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
|
||||
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
|
||||
|
||||
#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
|
||||
|
||||
/*
|
||||
* Size of block to hold the configuration data (a packed nvlist)
|
||||
*/
|
||||
@@ -830,6 +839,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
|
||||
extern boolean_t spa_is_root(spa_t *spa);
|
||||
extern boolean_t spa_writeable(spa_t *spa);
|
||||
extern boolean_t spa_has_pending_synctask(spa_t *spa);
|
||||
extern int spa_maxblocksize(spa_t *spa);
|
||||
extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
|
||||
|
||||
extern int spa_mode(spa_t *spa);
|
||||
|
||||
@@ -42,8 +42,7 @@ extern int fzap_default_block_shift;
|
||||
|
||||
#define MZAP_ENT_LEN 64
|
||||
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
|
||||
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
|
||||
#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
|
||||
#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
|
||||
|
||||
#define ZAP_NEED_CD (-1U)
|
||||
|
||||
|
||||
@@ -96,13 +96,16 @@ typedef enum drr_headertype {
|
||||
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
|
||||
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
|
||||
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
|
||||
/* flag #18 is reserved for a Delphix feature */
|
||||
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
|
||||
|
||||
/*
|
||||
* Mask of all supported backup features
|
||||
*/
|
||||
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
|
||||
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
|
||||
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
|
||||
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
|
||||
DMU_BACKUP_FEATURE_LARGE_BLOCKS)
|
||||
|
||||
/* Are all features in the given flag word currently supported? */
|
||||
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
|
||||
|
||||
@@ -129,7 +129,7 @@ typedef struct znode_phys {
|
||||
#ifdef _KERNEL
|
||||
|
||||
#define DXATTR_MAX_ENTRY_SIZE (32768)
|
||||
#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1)
|
||||
#define DXATTR_MAX_SA_SIZE (SPA_OLD_MAXBLOCKSIZE >> 1)
|
||||
|
||||
int zfs_sa_readlink(struct znode *, uio_t *);
|
||||
void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
|
||||
|
||||
@@ -137,8 +137,6 @@ extern "C" {
|
||||
#define ZFS_SHARES_DIR "SHARES"
|
||||
#define ZFS_SA_ATTRS "SA_ATTRS"
|
||||
|
||||
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
|
||||
|
||||
/*
|
||||
* Path component length
|
||||
*
|
||||
|
||||
@@ -90,7 +90,6 @@ typedef struct zil_chain {
|
||||
} zil_chain_t;
|
||||
|
||||
#define ZIL_MIN_BLKSZ 4096ULL
|
||||
#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
|
||||
|
||||
/*
|
||||
* The words of a log block checksum.
|
||||
|
||||
@@ -140,7 +140,7 @@ typedef struct zil_bp_node {
|
||||
avl_node_t zn_node;
|
||||
} zil_bp_node_t;
|
||||
|
||||
#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
|
||||
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
|
||||
sizeof (lr_write_t))
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Reference in New Issue
Block a user