mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-26 04:07:45 +03:00
Illumos 5027 - zfs large block support
5027 zfs large block support Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Reviewed by: Richard Elling <richard.elling@richardelling.com> Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/5027 https://github.com/illumos/illumos-gate/commit/b515258 Porting Notes: * Included in this patch is a tiny ISP2() cleanup in zio_init() from Illumos 5255. * Unlike the upstream Illumos commit this patch does not impose an arbitrary 128K block size limit on volumes. Volumes, like filesystems, are limited by the zfs_max_recordsize=1M module option. * By default the maximum record size is limited to 1M by the module option zfs_max_recordsize. This value may be safely increased up to 16M which is the largest block size supported by the on-disk format. At the moment, 1M blocks clearly offer a significant performance improvement but the benefits of going beyond this for the majority of workloads are less clear. * The illumos version of this patch increased DMU_MAX_ACCESS to 32M. This was determined not to be large enough when using 16M blocks because the zfs_make_xattrdir() function will fail (EFBIG) when assigning a TX. This was immediately observed under Linux because all newly created files must have a security xattr created and that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M. * On 32-bit platforms a hard limit of 1M is set for blocks due to the limited virtual address space. We should be able to relax this one the ABD patches are merged. Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #354
This commit is contained in:
committed by
Brian Behlendorf
parent
3df293404a
commit
f1512ee61e
+112
-2
@@ -51,6 +51,17 @@
|
||||
#include <sys/dsl_userhold.h>
|
||||
#include <sys/dsl_bookmark.h>
|
||||
|
||||
/*
|
||||
* The SPA supports block sizes up to 16MB. However, very large blocks
|
||||
* can have an impact on i/o latency (e.g. tying up a spinning disk for
|
||||
* ~300ms), and also potentially on the memory allocator. Therefore,
|
||||
* we do not allow the recordsize to be set larger than zfs_max_recordsize
|
||||
* (default 1MB). Larger blocks can be created by changing this tunable,
|
||||
* and pools with larger blocks can always be imported and used, regardless
|
||||
* of this setting.
|
||||
*/
|
||||
int zfs_max_recordsize = 1 * 1024 * 1024;
|
||||
|
||||
#define SWITCH64(x, y) \
|
||||
{ \
|
||||
uint64_t __tmp = (x); \
|
||||
@@ -60,8 +71,6 @@
|
||||
|
||||
#define DS_REF_MAX (1ULL << 62)
|
||||
|
||||
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
|
||||
|
||||
extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
|
||||
|
||||
/*
|
||||
@@ -117,6 +126,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
|
||||
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
|
||||
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
|
||||
dsl_dataset_phys(ds)->ds_unique_bytes += used;
|
||||
if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
|
||||
ds->ds_need_large_blocks = B_TRUE;
|
||||
mutex_exit(&ds->ds_lock);
|
||||
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
|
||||
compressed, uncompressed, tx);
|
||||
@@ -414,6 +425,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
|
||||
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
|
||||
offsetof(dmu_sendarg_t, dsa_link));
|
||||
|
||||
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
|
||||
err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
|
||||
if (err == 0)
|
||||
ds->ds_large_blocks = B_TRUE;
|
||||
else
|
||||
ASSERT3U(err, ==, ENOENT);
|
||||
}
|
||||
|
||||
if (err == 0) {
|
||||
err = dsl_dir_hold_obj(dp,
|
||||
dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
|
||||
@@ -730,6 +749,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
|
||||
dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
|
||||
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
|
||||
|
||||
if (origin->ds_large_blocks)
|
||||
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
|
||||
|
||||
dmu_buf_will_dirty(origin->ds_dbuf, tx);
|
||||
dsl_dataset_phys(origin)->ds_num_children++;
|
||||
|
||||
@@ -1253,6 +1275,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
|
||||
dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
|
||||
if (ds->ds_large_blocks)
|
||||
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
|
||||
|
||||
ASSERT3U(ds->ds_prev != 0, ==,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
|
||||
if (ds->ds_prev) {
|
||||
@@ -1541,6 +1566,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
|
||||
dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
|
||||
|
||||
dmu_objset_sync(ds->ds_objset, zio, tx);
|
||||
|
||||
if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
|
||||
dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
|
||||
ds->ds_large_blocks = B_TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -3222,6 +3252,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
|
||||
return (err);
|
||||
}
|
||||
|
||||
static int
|
||||
dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
const char *dsname = arg;
|
||||
dsl_dataset_t *ds;
|
||||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
int error = 0;
|
||||
|
||||
if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
|
||||
return (SET_ERROR(ENOTSUP));
|
||||
|
||||
ASSERT(spa_feature_is_enabled(dp->dp_spa,
|
||||
SPA_FEATURE_EXTENSIBLE_DATASET));
|
||||
|
||||
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
|
||||
if (error != 0)
|
||||
return (error);
|
||||
|
||||
if (ds->ds_large_blocks)
|
||||
error = EALREADY;
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
void
|
||||
dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
|
||||
{
|
||||
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
|
||||
uint64_t zero = 0;
|
||||
|
||||
spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
|
||||
dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
|
||||
|
||||
VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
|
||||
sizeof (zero), 1, &zero, tx));
|
||||
}
|
||||
|
||||
static void
|
||||
dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
const char *dsname = arg;
|
||||
dsl_dataset_t *ds;
|
||||
|
||||
VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
|
||||
|
||||
dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
|
||||
ASSERT(!ds->ds_large_blocks);
|
||||
ds->ds_large_blocks = B_TRUE;
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
}
|
||||
|
||||
int
|
||||
dsl_dataset_activate_large_blocks(const char *dsname)
|
||||
{
|
||||
int error;
|
||||
|
||||
error = dsl_sync_task(dsname,
|
||||
dsl_dataset_activate_large_blocks_check,
|
||||
dsl_dataset_activate_large_blocks_sync, (void *)dsname,
|
||||
1, ZFS_SPACE_CHECK_RESERVED);
|
||||
|
||||
/*
|
||||
* EALREADY indicates that this dataset already supports large blocks.
|
||||
*/
|
||||
if (error == EALREADY)
|
||||
error = 0;
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
|
||||
* For example, they could both be snapshots of the same filesystem, and
|
||||
@@ -3275,6 +3376,15 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
#if defined(_LP64)
|
||||
module_param(zfs_max_recordsize, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
|
||||
#else
|
||||
/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
|
||||
module_param(zfs_max_recordsize, int, 0444);
|
||||
MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
|
||||
#endif
|
||||
|
||||
EXPORT_SYMBOL(dsl_dataset_hold);
|
||||
EXPORT_SYMBOL(dsl_dataset_hold_obj);
|
||||
EXPORT_SYMBOL(dsl_dataset_own);
|
||||
|
||||
Reference in New Issue
Block a user