Illumos 5027 - zfs large block support

5027 zfs large block support
Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Richard Elling <richard.elling@richardelling.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5027
  https://github.com/illumos/illumos-gate/commit/b515258

Porting Notes:

* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255.

* Unlike the upstream Illumos commit, this patch does not impose an
arbitrary 128K block size limit on volumes.  Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.

* By default the maximum record size is limited to 1M by the module
option zfs_max_recordsize.  This value may be safely increased up to
16M which is the largest block size supported by the on-disk format.
At the moment, 1M blocks clearly offer a significant performance
improvement but the benefits of going beyond this for the majority
of workloads are less clear.

* The illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX.  This was immediately observed under Linux because
all newly created files must have a security xattr created and
that was failing.  Therefore, we've set DMU_MAX_ACCESS to 64M.

* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space.  We should be able to relax
this once the ABD patches are merged.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #354
This commit is contained in:
Matthew Ahrens
2014-11-03 12:15:08 -08:00
committed by Brian Behlendorf
parent 3df293404a
commit f1512ee61e
55 changed files with 613 additions and 155 deletions
+63 -20
View File
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
if (BP_IS_EMBEDDED(bp)) {
if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
* There's no pre-computed checksum of embedded BP's, so
* (like fletcher4-checkummed blocks) userland will have
* to compute a dedup-capable checksum itself.
* There's no pre-computed checksum for partial-block
* writes or embedded BP's, so (like
* fletcher4-checkummed blocks) userland will have to
* compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zb->zb_blkid * blksz, blksz, bp);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
uint64_t offset;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
blksz, bp, abuf->b_data);
offset = zb->zb_blkid * blksz;
if (!(dsp->dsa_featureflags &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
blksz > SPA_OLD_MAXBLOCKSIZE) {
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
err = dump_write(dsp, type, zb->zb_object,
offset, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
err = dump_write(dsp, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
int outfd, vnode_t *vp, offset_t *off)
boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
if (large_block_ok && ds->ds_large_blocks)
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -684,7 +708,8 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dmu_send(const char *tosnap, const char *fromsnap,
boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
@@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
outfd, vp, off);
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
* feature enabled if the stream has LARGE_BLOCKS.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!newds->ds_large_blocks) {
dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
newds->ds_large_blocks = B_TRUE;
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > SPA_MAXBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
drrs->drr_length > SPA_MAXBLOCKSIZE)
drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
@@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra.cksum = drc->drc_cksum;
ra.vp = vp;
ra.voff = *voffp;
ra.bufsize = 1<<20;
ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */