Illumos 5027 - zfs large block support

5027 zfs large block support
Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Richard Elling <richard.elling@richardelling.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5027
  https://github.com/illumos/illumos-gate/commit/b515258

Porting Notes:

* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255.

* Unlike the upstream Illumos commit this patch does not impose an
arbitrary 128K block size limit on volumes.  Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.

* By default the maximum record size is limited to 1M by the module
option zfs_max_recordsize.  This value may be safely increased up to
16M which is the largest block size supported by the on-disk format.
At the moment, 1M blocks clearly offer a significant performance
improvement but the benefits of going beyond this for the majority
of workloads are less clear.

* The illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX.  This was immediately observed under Linux because
all newly created files must have a security xattr created and
that was failing.  Therefore, we've set DMU_MAX_ACCESS to 64M.

* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space.  We should be able to relax
this once the ABD patches are merged.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #354
This commit is contained in:
Matthew Ahrens
2014-11-03 12:15:08 -08:00
committed by Brian Behlendorf
parent 3df293404a
commit f1512ee61e
55 changed files with 613 additions and 155 deletions
+2 -2
View File
@@ -1329,7 +1329,7 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
VERIFY3U(size, <=, spa_maxblocksize(spa));
hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
@@ -3289,7 +3289,7 @@ top:
* Gracefully handle a damaged logical block size as a
* checksum error by passing a dummy zio to the done callback.
*/
if (size > SPA_MAXBLOCKSIZE) {
if (size > spa_maxblocksize(spa)) {
if (done) {
rzio = zio_null(pio, spa, NULL,
NULL, NULL, zio_flags);
+3 -2
View File
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -399,7 +399,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
DMU_OT_NONE, 0, tx);
}
ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
+1 -1
View File
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
bptree_phys_t *bt;
obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
sizeof (bptree_phys_t), tx);
/*
+2 -4
View File
@@ -2216,10 +2216,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
if (blksz > SPA_MAXBLOCKSIZE)
blksz = SPA_MAXBLOCKSIZE;
else
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
+16
View File
@@ -256,6 +256,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
zil_set_logbias(os->os_zil, newval);
}
/*
 * Property-change callback for the "recordsize" property.
 *
 * arg    - the objset_t the callback was registered with.
 * newval - the new recordsize value to cache in that objset.
 */
static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
	((objset_t *)arg)->os_recordsize = newval;
}
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -385,6 +393,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -660,6 +673,9 @@ dmu_objset_evict(objset_t *os)
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
+63 -20
View File
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
if (BP_IS_EMBEDDED(bp)) {
if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
* There's no pre-computed checksum of embedded BP's, so
* (like fletcher4-checkummed blocks) userland will have
* to compute a dedup-capable checksum itself.
* There's no pre-computed checksum for partial-block
* writes or embedded BP's, so (like
* fletcher4-checksummed blocks) userland will have to
* compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
uint64_t offset;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data);
offset = zb->zb_blkid * blksz;
if (!(dsp->dsa_featureflags &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
blksz > SPA_OLD_MAXBLOCKSIZE) {
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
err = dump_write(dsp, type, zb->zb_object,
offset, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
err = dump_write(dsp, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
int outfd, vnode_t *vp, offset_t *off)
boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
if (large_block_ok && ds->ds_large_blocks)
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -684,7 +708,8 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dmu_send(const char *tosnap, const char *fromsnap,
boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
* feature enabled if the stream has LARGE_BLOCKS.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!newds->ds_large_blocks) {
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
newds->ds_large_blocks = B_TRUE;
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > SPA_MAXBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
drrs->drr_length > SPA_MAXBLOCKSIZE)
drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
@@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.cksum = drc->drc_cksum;
ra.vp = vp;
ra.voff = *voffp;
ra.bufsize = 1<<20;
ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
+16 -8
View File
@@ -241,7 +241,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
return;
min_bs = SPA_MINBLOCKSHIFT;
max_bs = SPA_MAXBLOCKSHIFT;
max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
@@ -310,6 +310,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
*/
ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
} else {
/*
* The blocksize can increase up to the recordsize,
* or if it is already more than the recordsize,
* up to the next power of 2.
*/
min_bs = highbit64(dn->dn_datablksz - 1);
max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
}
/*
@@ -745,11 +753,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
bp = &dn->dn_phys->dn_blkptr[0];
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
else
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
txh->txh_space_towrite += MZAP_MAX_BLKSZ;
if (!BP_IS_HOLE(bp))
txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
txh->txh_space_tounref += MZAP_MAX_BLKSZ;
return;
}
@@ -1546,18 +1554,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
/* If blkptr doesn't exist then add space to towrite */
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
} else {
blkptr_t *bp;
bp = &dn->dn_phys->dn_spill;
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
else
txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
if (!BP_IS_HOLE(bp))
txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
}
}
+5 -5
View File
@@ -540,10 +540,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
{
int i;
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
else if (blocksize > SPA_MAXBLOCKSIZE)
blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
@@ -624,7 +624,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
@@ -1377,10 +1378,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dmu_buf_impl_t *db;
int err;
ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
if (size > SPA_MAXBLOCKSIZE)
size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+112 -2
View File
@@ -51,6 +51,17 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
/*
* The SPA supports block sizes up to 16MB. However, very large blocks
* can have an impact on i/o latency (e.g. tying up a spinning disk for
* ~300ms), and also potentially on the memory allocator. Therefore,
* we do not allow the recordsize to be set larger than zfs_max_recordsize
* (default 1MB). Larger blocks can be created by changing this tunable,
* and pools with larger blocks can always be imported and used, regardless
* of this setting.
*/
int zfs_max_recordsize = 1 * 1024 * 1024;
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -60,8 +71,6 @@
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
/*
@@ -117,6 +126,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
dsl_dataset_phys(ds)->ds_unique_bytes += used;
if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
ds->ds_need_large_blocks = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -414,6 +425,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
if (err == 0)
ds->ds_large_blocks = B_TRUE;
else
ASSERT3U(err, ==, ENOENT);
}
if (err == 0) {
err = dsl_dir_hold_obj(dp,
dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
@@ -730,6 +749,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
if (origin->ds_large_blocks)
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_dataset_phys(origin)->ds_num_children++;
@@ -1253,6 +1275,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
dmu_buf_rele(dbuf, FTAG);
if (ds->ds_large_blocks)
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
ASSERT3U(ds->ds_prev != 0, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
@@ -1541,6 +1566,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
dmu_objset_sync(ds->ds_objset, zio, tx);
if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
ds->ds_large_blocks = B_TRUE;
}
}
static void
@@ -3222,6 +3252,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
/*
 * Sync-task check function for activating large blocks on a dataset.
 *
 * arg - name of the dataset (const char *).
 * tx  - transaction the sync task is running in.
 *
 * Returns ENOTSUP if the pool does not have the large_blocks feature
 * enabled, the error from dsl_dataset_hold() if the dataset cannot be
 * held, EALREADY if the dataset already has ds_large_blocks set, and
 * 0 otherwise.
 */
static int
dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
{
	dsl_pool_t *pool = dmu_tx_pool(tx);
	const char *name = arg;
	dsl_dataset_t *ds;
	int rc;

	if (!spa_feature_is_enabled(pool->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));

	ASSERT(spa_feature_is_enabled(pool->dp_spa,
	    SPA_FEATURE_EXTENSIBLE_DATASET));

	rc = dsl_dataset_hold(pool, name, FTAG, &ds);
	if (rc != 0)
		return (rc);

	/* Nothing to do if this dataset is already marked. */
	rc = ds->ds_large_blocks ? EALREADY : 0;

	dsl_dataset_rele(ds, FTAG);
	return (rc);
}
/*
 * Record on disk that dataset object 'dsobj' uses large blocks.
 *
 * Increments the SPA_FEATURE_LARGE_BLOCKS count, zapifies the dataset
 * object in the pool's meta objset, and adds a DS_FIELD_LARGE_BLOCKS
 * entry (a single uint64_t zero) to it.  The zap_add() must succeed.
 * Callers are responsible for updating any in-core ds_large_blocks flag.
 */
void
dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	uint64_t value = 0;

	spa_feature_incr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
	dmu_object_zapify(dp->dp_meta_objset, dsobj, DMU_OT_DSL_DATASET, tx);

	VERIFY0(zap_add(dp->dp_meta_objset, dsobj, DS_FIELD_LARGE_BLOCKS,
	    sizeof (value), 1, &value, tx));
}
/*
 * Sync-task sync function for activating large blocks on a dataset.
 *
 * arg - name of the dataset (const char *).
 * tx  - transaction the sync task is running in.
 *
 * Holds the named dataset (the hold must succeed), writes the on-disk
 * marker via dsl_dataset_activate_large_blocks_sync_impl(), then sets
 * the in-core ds_large_blocks flag before releasing the hold.
 */
static void
dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_t *dataset;

	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), (const char *)arg,
	    FTAG, &dataset));

	dsl_dataset_activate_large_blocks_sync_impl(dataset->ds_object, tx);

	/* The flag is expected to still be clear at this point. */
	ASSERT(!dataset->ds_large_blocks);
	dataset->ds_large_blocks = B_TRUE;

	dsl_dataset_rele(dataset, FTAG);
}
/*
 * Activate large block support on the dataset named 'dsname' by running
 * the check/sync pair above as a sync task.
 *
 * Returns 0 on success, or if the dataset already supports large blocks
 * (the sync task reports that case as EALREADY); otherwise returns the
 * error from dsl_sync_task().
 */
int
dsl_dataset_activate_large_blocks(const char *dsname)
{
	int rc = dsl_sync_task(dsname,
	    dsl_dataset_activate_large_blocks_check,
	    dsl_dataset_activate_large_blocks_sync, (void *)dsname,
	    1, ZFS_SPACE_CHECK_RESERVED);

	/* EALREADY is not a failure from the caller's point of view. */
	return (rc == EALREADY ? 0 : rc);
}
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
@@ -3275,6 +3376,15 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
}
#if defined(_KERNEL) && defined(HAVE_SPL)
#if defined(_LP64)
module_param(zfs_max_recordsize, int, 0644);
MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
#else
/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
module_param(zfs_max_recordsize, int, 0444);
MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
#endif
EXPORT_SYMBOL(dsl_dataset_hold);
EXPORT_SYMBOL(dsl_dataset_hold_obj);
EXPORT_SYMBOL(dsl_dataset_own);
+4 -4
View File
@@ -148,7 +148,7 @@ uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
@@ -185,7 +185,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
{
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -259,7 +259,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
@@ -344,7 +344,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
+7
View File
@@ -277,6 +277,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
if (ds->ds_large_blocks) {
ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
}
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
@@ -738,6 +742,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
if (ds->ds_large_blocks)
spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
+2 -2
View File
@@ -372,7 +372,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -804,7 +804,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
+3 -3
View File
@@ -504,7 +504,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
if (size == 0) {
blocksize = SPA_MINBLOCKSIZE;
} else if (size > SPA_MAXBLOCKSIZE) {
} else if (size > SPA_OLD_MAXBLOCKSIZE) {
ASSERT(0);
return (SET_ERROR(EFBIG));
} else {
@@ -693,7 +693,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, &spill_idx, &used, &spilling);
if (used > SPA_MAXBLOCKSIZE)
if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -717,7 +717,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
if (spill_used > SPA_MAXBLOCKSIZE)
if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+22 -4
View File
@@ -266,6 +266,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
}
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -482,7 +490,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (!error) {
objset_t *os;
uint64_t compress;
uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
@@ -494,15 +502,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (error)
break;
/* Must be ZPL and not gzip compressed. */
/*
* Must be ZPL, and its property settings
* must be supported by GRUB (compression
* is not gzip, and large blocks are not used).
*/
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
&compress)) == 0 &&
!BOOTFS_COMPRESS_VALID(compress)) {
&propval)) == 0 &&
!BOOTFS_COMPRESS_VALID(propval)) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
&propval)) == 0 &&
propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
+1 -1
View File
@@ -89,7 +89,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
ASSERT(spa->spa_history == 0);
spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
sizeof (spa_history_phys_t), tx);
VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+10
View File
@@ -1985,6 +1985,15 @@ spa_debug_enabled(spa_t *spa)
return (spa->spa_debug);
}
/*
 * Return the largest block size this pool allows: SPA_MAXBLOCKSIZE when
 * the large_blocks feature is enabled on the pool, otherwise the legacy
 * limit SPA_OLD_MAXBLOCKSIZE.
 */
int
spa_maxblocksize(spa_t *spa)
{
	return (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS) ?
	    SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
@@ -2040,6 +2049,7 @@ EXPORT_SYMBOL(spa_suspended);
EXPORT_SYMBOL(spa_bootfs);
EXPORT_SYMBOL(spa_delegation);
EXPORT_SYMBOL(spa_meta_objset);
EXPORT_SYMBOL(spa_maxblocksize);
/* Miscellaneous support routines */
EXPORT_SYMBOL(spa_rename);
+3 -3
View File
@@ -847,9 +847,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
* Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
* or we will inconsistently account for existing bp's.
* in 128k (1 << 17) because it is the "typical" blocksize.
* Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
* otherwise it would inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+2 -2
View File
@@ -552,9 +552,9 @@ retry:
goto retry;
}
dr->dr_bio[i] = bio_alloc(GFP_NOIO,
bio_nr_pages(bio_ptr, bio_size));
/* bio_alloc() with __GFP_WAIT never returns NULL */
dr->dr_bio[i] = bio_alloc(GFP_NOIO,
MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_disk_dio_free(dr);
return (ENOMEM);
+1 -1
View File
@@ -167,7 +167,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
+8 -8
View File
@@ -34,6 +34,7 @@
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
@@ -654,9 +655,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
indirect_blockshift <= SPA_MAXBLOCKSHIFT);
indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1347,7 +1348,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_t *zap;
int err = 0;
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
@@ -1360,7 +1360,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
return (err);
}
@@ -1384,7 +1384,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
/*
* We treat this case as similar to (name == NULL)
*/
*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
*towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
}
} else {
/*
@@ -1403,12 +1403,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf))
*tooverwrite += SPA_MAXBLOCKSIZE;
*tooverwrite += MZAP_MAX_BLKSZ;
else
*towrite += SPA_MAXBLOCKSIZE;
*towrite += MZAP_MAX_BLKSZ;
if (add) {
*towrite += 4 * SPA_MAXBLOCKSIZE;
*towrite += 4 * MZAP_MAX_BLKSZ;
}
}
+13 -1
View File
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon)
{
return ((c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
c == (after_colon ? '_' : '.'));
(after_colon && c == '_') ||
(!after_colon && (c == '.' || c == '-')));
}
/*
@@ -230,4 +231,15 @@ zpool_feature_init(void)
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
B_FALSE, B_TRUE, B_TRUE, NULL);
{
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
"org.open-zfs:large_blocks", "large_blocks",
"Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
large_blocks_deps);
}
}
+48 -6
View File
@@ -2392,7 +2392,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
int err;
int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
@@ -3790,8 +3790,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
nvpair_value_uint64(pair, &intval) == 0) {
if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
@@ -3842,6 +3841,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
break;
case ZFS_PROP_RECORDSIZE:
/* Record sizes above 128k need the feature to be enabled */
if (nvpair_value_uint64(pair, &intval) == 0 &&
intval > SPA_OLD_MAXBLOCKSIZE) {
spa_t *spa;
/*
* If this is a bootable dataset then
* we don't allow large (>128K) blocks,
* because GRUB doesn't support them.
*/
if (zfs_is_bootfs(dsname) &&
intval > SPA_OLD_MAXBLOCKSIZE) {
return (SET_ERROR(EDOM));
}
/*
* We don't allow setting the property above 1MB,
* unless the tunable has been changed.
*/
if (intval > zfs_max_recordsize ||
intval > SPA_MAXBLOCKSIZE)
return (SET_ERROR(EDOM));
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
if (!spa_feature_is_enabled(spa,
SPA_FEATURE_LARGE_BLOCKS)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
spa_close(spa, FTAG);
}
break;
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
@@ -4221,7 +4256,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
* zc_flags if =1, WRITE_EMBEDDED records are permitted
* zc_flags lzc_send_flags
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4233,6 +4268,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
boolean_t large_block_ok = (zc->zc_flags & 0x2);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4294,7 +4330,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
zc->zc_fromobj, embedok, large_block_ok,
zc->zc_cookie, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
@@ -5160,6 +5197,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
* (optional) "largeblockok" -> (value ignored)
* presence indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
@@ -5175,6 +5214,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
char *fromname = NULL;
int fd;
file_t *fp;
boolean_t largeblockok;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5183,13 +5223,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
if ((fp = getf(fd)) == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
error = dmu_send(snapname, fromname, embedok, largeblockok,
fd, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;
+1 -1
View File
@@ -492,7 +492,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
* If the write would overflow the largest block then split it.
*/
if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
len = SPA_MAXBLOCKSIZE >> 1;
len = SPA_OLD_MAXBLOCKSIZE >> 1;
else
len = resid;
+4 -5
View File
@@ -188,10 +188,9 @@ static void
blksz_changed_cb(void *arg, uint64_t newval)
{
zfs_sb_t *zsb = arg;
if (newval < SPA_MINBLOCKSIZE ||
newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
newval = SPA_MAXBLOCKSIZE;
ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zsb->z_os)));
ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
ASSERT(ISP2(newval));
zsb->z_max_blksz = newval;
}
@@ -672,7 +671,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
*/
zsb->z_sb = NULL;
zsb->z_parent = zsb;
zsb->z_max_blksz = SPA_MAXBLOCKSIZE;
zsb->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zsb->z_os = os;
+7 -1
View File
@@ -771,8 +771,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
+7 -1
View File
@@ -61,6 +61,7 @@
#endif /* _KERNEL */
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -1304,8 +1305,13 @@ zfs_extend(znode_t *zp, uint64_t end)
* We are growing the file past the current block size.
*/
if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
newblksz = MIN(end, SPA_MAXBLOCKSIZE);
newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
} else {
newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
}
+7 -4
View File
@@ -243,6 +243,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
@@ -257,6 +258,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(zilc->zc_nused, <=,
SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
@@ -342,7 +345,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -389,7 +392,7 @@ done:
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -941,7 +944,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
* allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
@@ -1023,7 +1026,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
zil_blksz = SPA_MAXBLOCKSIZE;
zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+22 -7
View File
@@ -24,6 +24,7 @@
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
@@ -107,9 +108,8 @@ zio_init(void)
/*
* For small buffers, we want a cache for each multiple of
* SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
* for each quarter-power of 2. For large buffers, we want
* a cache for each multiple of PAGESIZE.
* SPA_MINBLOCKSIZE. For larger buffers, we want a cache
* for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -117,7 +117,16 @@ zio_init(void)
size_t align = 0;
size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
while (p2 & (p2 - 1))
#ifdef _ILP32
/*
* Cache size limited to 1M on 32-bit platforms until ARC
* buffers no longer require virtual address space.
*/
if (size > zfs_max_recordsize)
break;
#endif
while (!ISP2(p2))
p2 &= p2 - 1;
#ifndef _KERNEL
@@ -132,10 +141,8 @@ zio_init(void)
#endif
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
} else if (IS_P2ALIGNED(size, PAGESIZE)) {
align = PAGESIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
align = p2 >> 2;
align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
@@ -174,6 +181,14 @@ zio_fini(void)
kmem_cache_t *last_data_cache = NULL;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
#ifdef _ILP32
/*
* Cache size limited to 1M on 32-bit platforms until ARC
* buffers no longer require virtual address space.
*/
if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
break;
#endif
if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c];
kmem_cache_destroy(zio_buf_cache[c]);