mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 10:54:35 +03:00
Illumos 5027 - zfs large block support
5027 zfs large block support Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Reviewed by: Richard Elling <richard.elling@richardelling.com> Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/5027 https://github.com/illumos/illumos-gate/commit/b515258 Porting Notes: * Included in this patch is a tiny ISP2() cleanup in zio_init() from Illumos 5255. * Unlike the upstream Illumos commit this patch does not impose an arbitrary 128K block size limit on volumes. Volumes, like filesystems, are limited by the zfs_max_recordsize=1M module option. * By default the maximum record size is limited to 1M by the module option zfs_max_recordsize. This value may be safely increased up to 16M which is the largest block size supported by the on-disk format. At the moment, 1M blocks clearly offer a significant performance improvement but the benefits of going beyond this for the majority of workloads are less clear. * The illumos version of this patch increased DMU_MAX_ACCESS to 32M. This was determined not to be large enough when using 16M blocks because the zfs_make_xattrdir() function will fail (EFBIG) when assigning a TX. This was immediately observed under Linux because all newly created files must have a security xattr created and that was failing. Therefore, we've set DMU_MAX_ACCESS to 64M. * On 32-bit platforms a hard limit of 1M is set for blocks due to the limited virtual address space. We should be able to relax this one the ABD patches are merged. Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #354
This commit is contained in:
committed by
Brian Behlendorf
parent
3df293404a
commit
f1512ee61e
+29
-3
@@ -2185,6 +2185,8 @@ dump_label(const char *dev)
|
||||
(void) close(fd);
|
||||
}
|
||||
|
||||
static uint64_t num_large_blocks;
|
||||
|
||||
/*ARGSUSED*/
|
||||
static int
|
||||
dump_one_dir(const char *dsname, void *arg)
|
||||
@@ -2197,6 +2199,8 @@ dump_one_dir(const char *dsname, void *arg)
|
||||
(void) printf("Could not open %s, error %d\n", dsname, error);
|
||||
return (0);
|
||||
}
|
||||
if (dmu_objset_ds(os)->ds_large_blocks)
|
||||
num_large_blocks++;
|
||||
dump_dir(os);
|
||||
dmu_objset_disown(os, FTAG);
|
||||
fuid_table_destroy();
|
||||
@@ -2207,7 +2211,7 @@ dump_one_dir(const char *dsname, void *arg)
|
||||
/*
|
||||
* Block statistics.
|
||||
*/
|
||||
#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
|
||||
#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
|
||||
typedef struct zdb_blkstats {
|
||||
uint64_t zb_asize;
|
||||
uint64_t zb_lsize;
|
||||
@@ -2273,7 +2277,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
|
||||
zb->zb_lsize += BP_GET_LSIZE(bp);
|
||||
zb->zb_psize += BP_GET_PSIZE(bp);
|
||||
zb->zb_count++;
|
||||
zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
|
||||
|
||||
/*
|
||||
* The histogram is only big enough to record blocks up to
|
||||
* SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
|
||||
* "other", bucket.
|
||||
*/
|
||||
int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
|
||||
idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
|
||||
zb->zb_psize_histogram[idx]++;
|
||||
|
||||
zb->zb_gangs += BP_COUNT_GANG(bp);
|
||||
|
||||
@@ -2979,6 +2991,7 @@ dump_zpool(spa_t *spa)
|
||||
dump_metaslab_groups(spa);
|
||||
|
||||
if (dump_opt['d'] || dump_opt['i']) {
|
||||
uint64_t refcount;
|
||||
dump_dir(dp->dp_meta_objset);
|
||||
if (dump_opt['d'] >= 3) {
|
||||
dump_bpobj(&spa->spa_deferred_bpobj,
|
||||
@@ -2998,8 +3011,21 @@ dump_zpool(spa_t *spa)
|
||||
}
|
||||
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
|
||||
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
|
||||
|
||||
(void) feature_get_refcount(spa,
|
||||
&spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
|
||||
if (num_large_blocks != refcount) {
|
||||
(void) printf("large_blocks feature refcount mismatch: "
|
||||
"expected %lld != actual %lld\n",
|
||||
(longlong_t)num_large_blocks,
|
||||
(longlong_t)refcount);
|
||||
rc = 2;
|
||||
} else {
|
||||
(void) printf("Verified large_blocks feature refcount "
|
||||
"is correct (%llu)\n", (longlong_t)refcount);
|
||||
}
|
||||
}
|
||||
if (dump_opt['b'] || dump_opt['c'])
|
||||
if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
|
||||
rc = dump_block_stats(spa);
|
||||
|
||||
if (rc == 0)
|
||||
|
||||
+8
-3
@@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
|
||||
case HELP_ROLLBACK:
|
||||
return (gettext("\trollback [-rRf] <snapshot>\n"));
|
||||
case HELP_SEND:
|
||||
return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
|
||||
return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
|
||||
"<snapshot>\n"
|
||||
"\tsend [-e] [-i snapshot|bookmark] "
|
||||
"\tsend [-Le] [-i snapshot|bookmark] "
|
||||
"<filesystem|volume|snapshot>\n"));
|
||||
case HELP_SET:
|
||||
return (gettext("\tset <property=value> "
|
||||
@@ -3683,7 +3683,7 @@ zfs_do_send(int argc, char **argv)
|
||||
boolean_t extraverbose = B_FALSE;
|
||||
|
||||
/* check options */
|
||||
while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
|
||||
while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
if (fromname)
|
||||
@@ -3718,6 +3718,9 @@ zfs_do_send(int argc, char **argv)
|
||||
case 'n':
|
||||
flags.dryrun = B_TRUE;
|
||||
break;
|
||||
case 'L':
|
||||
flags.largeblock = B_TRUE;
|
||||
break;
|
||||
case 'e':
|
||||
flags.embed_data = B_TRUE;
|
||||
break;
|
||||
@@ -3774,6 +3777,8 @@ zfs_do_send(int argc, char **argv)
|
||||
if (zhp == NULL)
|
||||
return (1);
|
||||
|
||||
if (flags.largeblock)
|
||||
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
|
||||
if (flags.embed_data)
|
||||
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
|
||||
|
||||
|
||||
@@ -56,7 +56,6 @@ uint64_t total_stream_len = 0;
|
||||
FILE *send_stream = 0;
|
||||
boolean_t do_byteswap = B_FALSE;
|
||||
boolean_t do_cksum = B_TRUE;
|
||||
#define INITIAL_BUFLEN (1<<20)
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
@@ -69,6 +68,18 @@ usage(void)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static void *
|
||||
safe_malloc(size_t size)
|
||||
{
|
||||
void *rv = malloc(size);
|
||||
if (rv == NULL) {
|
||||
(void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n",
|
||||
(unsigned)size);
|
||||
abort();
|
||||
}
|
||||
return (rv);
|
||||
}
|
||||
|
||||
/*
|
||||
* ssread - send stream read.
|
||||
*
|
||||
@@ -160,7 +171,7 @@ print_block(char *buf, int length)
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
char *buf = malloc(INITIAL_BUFLEN);
|
||||
char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
|
||||
uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
|
||||
uint64_t total_records = 0;
|
||||
dmu_replay_record_t thedrr;
|
||||
@@ -308,9 +319,9 @@ main(int argc, char *argv[])
|
||||
nvlist_t *nv;
|
||||
int sz = drr->drr_payloadlen;
|
||||
|
||||
if (sz > INITIAL_BUFLEN) {
|
||||
if (sz > SPA_MAXBLOCKSIZE) {
|
||||
free(buf);
|
||||
buf = malloc(sz);
|
||||
buf = safe_malloc(sz);
|
||||
}
|
||||
(void) ssread(buf, sz, &zc);
|
||||
if (ferror(send_stream))
|
||||
|
||||
+9
-4
@@ -1040,9 +1040,14 @@ ztest_spa_get_ashift(void) {
|
||||
static int
|
||||
ztest_random_blocksize(void)
|
||||
{
|
||||
// Choose a block size >= the ashift.
|
||||
uint64_t block_shift =
|
||||
ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
|
||||
/*
|
||||
* Choose a block size >= the ashift.
|
||||
* If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
|
||||
*/
|
||||
int maxbs = SPA_OLD_MAXBLOCKSHIFT;
|
||||
if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
|
||||
maxbs = 20;
|
||||
uint64_t block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
|
||||
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
|
||||
}
|
||||
|
||||
@@ -4972,7 +4977,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
|
||||
char *path0;
|
||||
char *pathrand;
|
||||
size_t fsize;
|
||||
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
|
||||
int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
|
||||
int iters = 1000;
|
||||
int maxfaults;
|
||||
int mirror_save;
|
||||
|
||||
Reference in New Issue
Block a user