mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 03:09:34 +03:00
Illumos 3835 zfs need not store 2 copies of all metadata
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

Description from Matt Ahrens's bug report at Delphix:

Add a new zfs property, "redundant_metadata", which can have values
"all" or "most". The default will be "all", which is the current
behavior. Setting to "most" will cause us to only store 1 copy of
level-1 indirect blocks of user data files.

Additional notes:

The new man page section for this property states "The exact behavior
of which metadata blocks are stored redundantly may change in future
releases." and: "When set to most, ZFS stores an extra copy of most
types of metadata. This can improve performance of random writes,
because less metadata must be written."

The current implementation is as described above in Matt's bug report.
It is controlled by a new global integer
"zfs_redundant_metadata_most_ditto_level", currently initialized to 2.
When "redundant_metadata" is set to "most", only indirect blocks of the
specified level and higher will have additional ditto blocks created.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2542
This commit is contained in:
parent
603cb25ca5
commit
faf0f58c69
@ -20,7 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
*/
|
*/
|
||||||
@ -739,8 +739,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
|
|||||||
extern void dmu_objset_name(objset_t *os, char *buf);
|
extern void dmu_objset_name(objset_t *os, char *buf);
|
||||||
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
|
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
|
||||||
extern uint64_t dmu_objset_id(objset_t *os);
|
extern uint64_t dmu_objset_id(objset_t *os);
|
||||||
extern uint64_t dmu_objset_syncprop(objset_t *os);
|
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
|
||||||
extern uint64_t dmu_objset_logbias(objset_t *os);
|
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
|
||||||
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
|
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
|
||||||
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
|
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
|
||||||
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
|
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Portions Copyright 2010 Robert Milkowski */
|
/* Portions Copyright 2010 Robert Milkowski */
|
||||||
@ -85,15 +85,16 @@ struct objset {
|
|||||||
zilog_t *os_zil;
|
zilog_t *os_zil;
|
||||||
|
|
||||||
/* can change, under dsl_dir's locks: */
|
/* can change, under dsl_dir's locks: */
|
||||||
uint8_t os_checksum;
|
enum zio_checksum os_checksum;
|
||||||
uint8_t os_compress;
|
enum zio_compress os_compress;
|
||||||
uint8_t os_copies;
|
uint8_t os_copies;
|
||||||
uint8_t os_dedup_checksum;
|
enum zio_checksum os_dedup_checksum;
|
||||||
uint8_t os_dedup_verify;
|
boolean_t os_dedup_verify;
|
||||||
uint8_t os_logbias;
|
zfs_logbias_op_t os_logbias;
|
||||||
uint8_t os_primary_cache;
|
zfs_cache_type_t os_primary_cache;
|
||||||
uint8_t os_secondary_cache;
|
zfs_cache_type_t os_secondary_cache;
|
||||||
uint8_t os_sync;
|
zfs_sync_type_t os_sync;
|
||||||
|
zfs_redundant_metadata_type_t os_redundant_metadata;
|
||||||
|
|
||||||
/* no lock needed: */
|
/* no lock needed: */
|
||||||
struct dmu_tx *os_synctx; /* XXX sketchy */
|
struct dmu_tx *os_synctx; /* XXX sketchy */
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
*/
|
*/
|
||||||
@ -149,6 +149,7 @@ typedef enum {
|
|||||||
ZFS_PROP_SELINUX_DEFCONTEXT,
|
ZFS_PROP_SELINUX_DEFCONTEXT,
|
||||||
ZFS_PROP_SELINUX_ROOTCONTEXT,
|
ZFS_PROP_SELINUX_ROOTCONTEXT,
|
||||||
ZFS_PROP_RELATIME,
|
ZFS_PROP_RELATIME,
|
||||||
|
ZFS_PROP_REDUNDANT_METADATA,
|
||||||
ZFS_NUM_PROPS
|
ZFS_NUM_PROPS
|
||||||
} zfs_prop_t;
|
} zfs_prop_t;
|
||||||
|
|
||||||
@ -349,6 +350,11 @@ typedef enum {
|
|||||||
ZFS_XATTR_SA = 2
|
ZFS_XATTR_SA = 2
|
||||||
} zfs_xattr_type_t;
|
} zfs_xattr_type_t;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
ZFS_REDUNDANT_METADATA_ALL,
|
||||||
|
ZFS_REDUNDANT_METADATA_MOST
|
||||||
|
} zfs_redundant_metadata_type_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* On-disk version number.
|
* On-disk version number.
|
||||||
*/
|
*/
|
||||||
|
@ -22,7 +22,7 @@
|
|||||||
.\"
|
.\"
|
||||||
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
|
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
|
||||||
.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
|
.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
|
||||||
.\" Copyright (c) 2013 by Delphix. All rights reserved.
|
.\" Copyright (c) 2014 by Delphix. All rights reserved.
|
||||||
.\" Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
.\" Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
|
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
|
||||||
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
@ -1014,6 +1014,37 @@ This property can also be referred to by its shortened column name, \fBrecsize\f
|
|||||||
.ne 2
|
.ne 2
|
||||||
.mk
|
.mk
|
||||||
.na
|
.na
|
||||||
|
\fB\fBredundant_metadata\fR=\fBall\fR | \fBmost\fR\fR
|
||||||
|
.ad
|
||||||
|
.sp .6
|
||||||
|
.RS 4n
|
||||||
|
Controls what types of metadata are stored redundantly. ZFS stores an
|
||||||
|
extra copy of metadata, so that if a single block is corrupted, the
|
||||||
|
amount of user data lost is limited. This extra copy is in addition to
|
||||||
|
any redundancy provided at the pool level (e.g. by mirroring or RAID-Z),
|
||||||
|
and is in addition to an extra copy specified by the \fBcopies\fR
|
||||||
|
property (up to a total of 3 copies). For example, if the pool is
|
||||||
|
mirrored, \fBcopies\fR=2, and \fBredundant_metadata\fR=most, then ZFS
|
||||||
|
stores 6 copies of most metadata, and 4 copies of data and some
|
||||||
|
metadata.
|
||||||
|
.sp
|
||||||
|
When set to \fBall\fR, ZFS stores an extra copy of all metadata. If a
|
||||||
|
single on-disk block is corrupt, at worst a single block of user data
|
||||||
|
(which is \fBrecordsize\fR bytes long) can be lost.
|
||||||
|
.sp
|
||||||
|
When set to \fBmost\fR, ZFS stores an extra copy of most types of
|
||||||
|
metadata. This can improve performance of random writes, because less
|
||||||
|
metadata must be written. In practice, at worst about 100 blocks (of
|
||||||
|
\fBrecordsize\fR bytes each) of user data can be lost if a single
|
||||||
|
on-disk block is corrupt. The exact behavior of which metadata blocks
|
||||||
|
are stored redundantly may change in future releases.
|
||||||
|
.sp
|
||||||
|
The default value is \fBall\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
\fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR
|
\fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR
|
||||||
.ad
|
.ad
|
||||||
.sp .6
|
.sp .6
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -210,7 +210,18 @@ zfs_prop_init(void)
|
|||||||
{ NULL }
|
{ NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static zprop_index_t redundant_metadata_table[] = {
|
||||||
|
{ "all", ZFS_REDUNDANT_METADATA_ALL },
|
||||||
|
{ "most", ZFS_REDUNDANT_METADATA_MOST },
|
||||||
|
{ NULL }
|
||||||
|
};
|
||||||
|
|
||||||
/* inherit index properties */
|
/* inherit index properties */
|
||||||
|
zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
|
||||||
|
ZFS_REDUNDANT_METADATA_ALL,
|
||||||
|
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
|
||||||
|
"all | most", "REDUND_MD",
|
||||||
|
redundant_metadata_table);
|
||||||
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
|
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
|
||||||
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
|
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
|
||||||
"standard | always | disabled", "SYNC",
|
"standard | always | disabled", "SYNC",
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -1688,6 +1688,12 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
|
|||||||
|
|
||||||
int zfs_mdcomp_disable = 0;
|
int zfs_mdcomp_disable = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When the "redundant_metadata" property is set to "most", only indirect
|
||||||
|
* blocks of this level and higher will have an additional ditto block.
|
||||||
|
*/
|
||||||
|
int zfs_redundant_metadata_most_ditto_level = 2;
|
||||||
|
|
||||||
void
|
void
|
||||||
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||||
{
|
{
|
||||||
@ -1727,6 +1733,13 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||||||
if (zio_checksum_table[checksum].ci_correctable < 1 ||
|
if (zio_checksum_table[checksum].ci_correctable < 1 ||
|
||||||
zio_checksum_table[checksum].ci_eck)
|
zio_checksum_table[checksum].ci_eck)
|
||||||
checksum = ZIO_CHECKSUM_FLETCHER_4;
|
checksum = ZIO_CHECKSUM_FLETCHER_4;
|
||||||
|
|
||||||
|
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
|
||||||
|
(os->os_redundant_metadata ==
|
||||||
|
ZFS_REDUNDANT_METADATA_MOST &&
|
||||||
|
(level >= zfs_redundant_metadata_most_ditto_level ||
|
||||||
|
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
|
||||||
|
copies++;
|
||||||
} else if (wp & WP_NOFILL) {
|
} else if (wp & WP_NOFILL) {
|
||||||
ASSERT(level == 0);
|
ASSERT(level == 0);
|
||||||
|
|
||||||
@ -1774,7 +1787,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||||||
zp->zp_compress = compress;
|
zp->zp_compress = compress;
|
||||||
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
|
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
|
||||||
zp->zp_level = level;
|
zp->zp_level = level;
|
||||||
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
|
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
|
||||||
zp->zp_dedup = dedup;
|
zp->zp_dedup = dedup;
|
||||||
zp->zp_dedup_verify = dedup && dedup_verify;
|
zp->zp_dedup_verify = dedup && dedup_verify;
|
||||||
zp->zp_nopwrite = nopwrite;
|
zp->zp_nopwrite = nopwrite;
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -114,13 +114,13 @@ dmu_objset_id(objset_t *os)
|
|||||||
return (ds ? ds->ds_object : 0);
|
return (ds ? ds->ds_object : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t
|
zfs_sync_type_t
|
||||||
dmu_objset_syncprop(objset_t *os)
|
dmu_objset_syncprop(objset_t *os)
|
||||||
{
|
{
|
||||||
return (os->os_sync);
|
return (os->os_sync);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t
|
zfs_logbias_op_t
|
||||||
dmu_objset_logbias(objset_t *os)
|
dmu_objset_logbias(objset_t *os)
|
||||||
{
|
{
|
||||||
return (os->os_logbias);
|
return (os->os_logbias);
|
||||||
@ -228,6 +228,20 @@ sync_changed_cb(void *arg, uint64_t newval)
|
|||||||
zil_set_sync(os->os_zil, newval);
|
zil_set_sync(os->os_zil, newval);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
redundant_metadata_changed_cb(void *arg, uint64_t newval)
|
||||||
|
{
|
||||||
|
objset_t *os = arg;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Inheritance and range checking should have been done by now.
|
||||||
|
*/
|
||||||
|
ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
|
||||||
|
newval == ZFS_REDUNDANT_METADATA_MOST);
|
||||||
|
|
||||||
|
os->os_redundant_metadata = newval;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
logbias_changed_cb(void *arg, uint64_t newval)
|
logbias_changed_cb(void *arg, uint64_t newval)
|
||||||
{
|
{
|
||||||
@ -363,6 +377,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
|||||||
zfs_prop_to_name(ZFS_PROP_SYNC),
|
zfs_prop_to_name(ZFS_PROP_SYNC),
|
||||||
sync_changed_cb, os);
|
sync_changed_cb, os);
|
||||||
}
|
}
|
||||||
|
if (err == 0) {
|
||||||
|
err = dsl_prop_register(ds,
|
||||||
|
zfs_prop_to_name(
|
||||||
|
ZFS_PROP_REDUNDANT_METADATA),
|
||||||
|
redundant_metadata_changed_cb, os);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
|
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
|
||||||
@ -376,9 +396,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
|||||||
os->os_compress = ZIO_COMPRESS_LZJB;
|
os->os_compress = ZIO_COMPRESS_LZJB;
|
||||||
os->os_copies = spa_max_replication(spa);
|
os->os_copies = spa_max_replication(spa);
|
||||||
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
|
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
|
||||||
os->os_dedup_verify = 0;
|
os->os_dedup_verify = B_FALSE;
|
||||||
os->os_logbias = 0;
|
os->os_logbias = ZFS_LOGBIAS_LATENCY;
|
||||||
os->os_sync = 0;
|
os->os_sync = ZFS_SYNC_STANDARD;
|
||||||
os->os_primary_cache = ZFS_CACHE_ALL;
|
os->os_primary_cache = ZFS_CACHE_ALL;
|
||||||
os->os_secondary_cache = ZFS_CACHE_ALL;
|
os->os_secondary_cache = ZFS_CACHE_ALL;
|
||||||
}
|
}
|
||||||
@ -623,6 +643,9 @@ dmu_objset_evict(objset_t *os)
|
|||||||
VERIFY0(dsl_prop_unregister(ds,
|
VERIFY0(dsl_prop_unregister(ds,
|
||||||
zfs_prop_to_name(ZFS_PROP_SYNC),
|
zfs_prop_to_name(ZFS_PROP_SYNC),
|
||||||
sync_changed_cb, os));
|
sync_changed_cb, os));
|
||||||
|
VERIFY0(dsl_prop_unregister(ds,
|
||||||
|
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
|
||||||
|
redundant_metadata_changed_cb, os));
|
||||||
}
|
}
|
||||||
VERIFY0(dsl_prop_unregister(ds,
|
VERIFY0(dsl_prop_unregister(ds,
|
||||||
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
|
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
|
||||||
|
Loading…
Reference in New Issue
Block a user