mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-25 18:59:33 +03:00
Illumos #3236
3236 zio nop-write Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org> References: illumos/illumos-gate@80901aea8e https://www.illumos.org/issues/3236 Porting Notes 1. This patch is being merged despite an increased instance of https://www.illumos.org/issues/3113 being triggered by ztest. Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1489
This commit is contained in:
parent
831baf06ef
commit
03c6040bee
@ -208,6 +208,7 @@ enum ztest_io_type {
|
|||||||
ZTEST_IO_WRITE_ZEROES,
|
ZTEST_IO_WRITE_ZEROES,
|
||||||
ZTEST_IO_TRUNCATE,
|
ZTEST_IO_TRUNCATE,
|
||||||
ZTEST_IO_SETATTR,
|
ZTEST_IO_SETATTR,
|
||||||
|
ZTEST_IO_REWRITE,
|
||||||
ZTEST_IO_TYPES
|
ZTEST_IO_TYPES
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1900,6 +1901,12 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
|||||||
DMU_READ_NO_PREFETCH);
|
DMU_READ_NO_PREFETCH);
|
||||||
|
|
||||||
if (error == 0) {
|
if (error == 0) {
|
||||||
|
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||||
|
if (obp) {
|
||||||
|
ASSERT(BP_IS_HOLE(bp));
|
||||||
|
*bp = *obp;
|
||||||
|
}
|
||||||
|
|
||||||
zgd->zgd_db = db;
|
zgd->zgd_db = db;
|
||||||
zgd->zgd_bp = bp;
|
zgd->zgd_bp = bp;
|
||||||
|
|
||||||
@ -2048,6 +2055,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* No object was found.
|
||||||
|
*/
|
||||||
if (od->od_object == 0)
|
if (od->od_object == 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
@ -2163,6 +2173,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
|
|||||||
static void
|
static void
|
||||||
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
|
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
|
||||||
{
|
{
|
||||||
|
int err;
|
||||||
ztest_block_tag_t wbt;
|
ztest_block_tag_t wbt;
|
||||||
dmu_object_info_t doi;
|
dmu_object_info_t doi;
|
||||||
enum ztest_io_type io_type;
|
enum ztest_io_type io_type;
|
||||||
@ -2217,6 +2228,25 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
|
|||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ZTEST_IO_REWRITE:
|
||||||
|
(void) rw_enter(&ztest_name_lock, RW_READER);
|
||||||
|
err = ztest_dsl_prop_set_uint64(zd->zd_name,
|
||||||
|
ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
|
||||||
|
B_FALSE);
|
||||||
|
VERIFY(err == 0 || err == ENOSPC);
|
||||||
|
err = ztest_dsl_prop_set_uint64(zd->zd_name,
|
||||||
|
ZFS_PROP_COMPRESSION,
|
||||||
|
ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
|
||||||
|
B_FALSE);
|
||||||
|
VERIFY(err == 0 || err == ENOSPC);
|
||||||
|
(void) rw_exit(&ztest_name_lock);
|
||||||
|
|
||||||
|
VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
|
||||||
|
DMU_READ_NO_PREFETCH));
|
||||||
|
|
||||||
|
(void) ztest_write(zd, object, offset, blocksize, data);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) rw_exit(&zd->zd_zilog_lock);
|
(void) rw_exit(&zd->zd_zilog_lock);
|
||||||
@ -2304,6 +2334,11 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
|
|||||||
{
|
{
|
||||||
objset_t *os = zd->zd_os;
|
objset_t *os = zd->zd_os;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We grab the zd_dirobj_lock to ensure that no other thread is
|
||||||
|
* updating the zil (i.e. adding in-memory log records) and the
|
||||||
|
* zd_zilog_lock to block any I/O.
|
||||||
|
*/
|
||||||
mutex_enter(&zd->zd_dirobj_lock);
|
mutex_enter(&zd->zd_dirobj_lock);
|
||||||
(void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);
|
(void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);
|
||||||
|
|
||||||
@ -5121,8 +5156,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
|
|||||||
/*
|
/*
|
||||||
* Find out what block we got.
|
* Find out what block we got.
|
||||||
*/
|
*/
|
||||||
VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
|
VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
|
||||||
DMU_READ_NO_PREFETCH) == 0);
|
DMU_READ_NO_PREFETCH));
|
||||||
blk = *((dmu_buf_impl_t *)db)->db_blkptr;
|
blk = *((dmu_buf_impl_t *)db)->db_blkptr;
|
||||||
dmu_buf_rele(db, FTAG);
|
dmu_buf_rele(db, FTAG);
|
||||||
|
|
||||||
@ -5824,6 +5859,8 @@ ztest_freeze(void)
|
|||||||
kernel_init(FREAD | FWRITE);
|
kernel_init(FREAD | FWRITE);
|
||||||
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
|
VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
|
||||||
VERIFY3U(0, ==, ztest_dataset_open(0));
|
VERIFY3U(0, ==, ztest_dataset_open(0));
|
||||||
|
spa->spa_debug = B_TRUE;
|
||||||
|
ztest_spa = spa;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Force the first log block to be transactionally allocated.
|
* Force the first log block to be transactionally allocated.
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -131,6 +132,7 @@ typedef struct dbuf_dirty_record {
|
|||||||
blkptr_t dr_overridden_by;
|
blkptr_t dr_overridden_by;
|
||||||
override_states_t dr_override_state;
|
override_states_t dr_override_state;
|
||||||
uint8_t dr_copies;
|
uint8_t dr_copies;
|
||||||
|
boolean_t dr_nopwrite;
|
||||||
} dl;
|
} dl;
|
||||||
} dt;
|
} dt;
|
||||||
} dbuf_dirty_record_t;
|
} dbuf_dirty_record_t;
|
||||||
|
@ -497,6 +497,11 @@ void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
|
|||||||
*/
|
*/
|
||||||
void *dmu_buf_get_user(dmu_buf_t *db);
|
void *dmu_buf_get_user(dmu_buf_t *db);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the blkptr associated with this dbuf, or NULL if not set.
|
||||||
|
*/
|
||||||
|
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Indicate that you are going to modify the buffer's data (db_data).
|
* Indicate that you are going to modify the buffer's data (db_data).
|
||||||
*
|
*
|
||||||
|
@ -196,7 +196,9 @@ enum zio_flag {
|
|||||||
ZIO_FLAG_GANG_CHILD = 1 << 22,
|
ZIO_FLAG_GANG_CHILD = 1 << 22,
|
||||||
ZIO_FLAG_DDT_CHILD = 1 << 23,
|
ZIO_FLAG_DDT_CHILD = 1 << 23,
|
||||||
ZIO_FLAG_GODFATHER = 1 << 24,
|
ZIO_FLAG_GODFATHER = 1 << 24,
|
||||||
ZIO_FLAG_FASTWRITE = 1 << 25
|
ZIO_FLAG_NOPWRITE = 1 << 25,
|
||||||
|
ZIO_FLAG_REEXECUTED = 1 << 26,
|
||||||
|
ZIO_FLAG_FASTWRITE = 1 << 27
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ZIO_FLAG_MUSTSUCCEED 0
|
#define ZIO_FLAG_MUSTSUCCEED 0
|
||||||
@ -296,8 +298,9 @@ typedef struct zio_prop {
|
|||||||
dmu_object_type_t zp_type;
|
dmu_object_type_t zp_type;
|
||||||
uint8_t zp_level;
|
uint8_t zp_level;
|
||||||
uint8_t zp_copies;
|
uint8_t zp_copies;
|
||||||
uint8_t zp_dedup;
|
boolean_t zp_dedup;
|
||||||
uint8_t zp_dedup_verify;
|
boolean_t zp_dedup_verify;
|
||||||
|
boolean_t zp_nopwrite;
|
||||||
} zio_prop_t;
|
} zio_prop_t;
|
||||||
|
|
||||||
typedef struct zio_cksum_report zio_cksum_report_t;
|
typedef struct zio_cksum_report zio_cksum_report_t;
|
||||||
@ -466,7 +469,8 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
|||||||
void *data, uint64_t size, zio_done_func_t *done, void *private,
|
void *data, uint64_t size, zio_done_func_t *done, void *private,
|
||||||
int priority, enum zio_flag flags, zbookmark_t *zb);
|
int priority, enum zio_flag flags, zbookmark_t *zb);
|
||||||
|
|
||||||
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
|
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
|
||||||
|
boolean_t nopwrite);
|
||||||
|
|
||||||
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
|
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
|
||||||
|
|
||||||
|
@ -37,6 +37,70 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
|
||||||
|
*
|
||||||
|
* The ZFS I/O pipeline is comprised of various stages which are defined
|
||||||
|
* in the zio_stage enum below. The individual stages are used to construct
|
||||||
|
* these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
|
||||||
|
*
|
||||||
|
* I/O operations: (XXX - provide detail for each of the operations)
|
||||||
|
*
|
||||||
|
* Read:
|
||||||
|
* Write:
|
||||||
|
* Free:
|
||||||
|
* Claim:
|
||||||
|
* Ioctl:
|
||||||
|
*
|
||||||
|
* Although the most common pipelines are used by the basic I/O operations
|
||||||
|
* above, there are some helper pipelines (one could consider them
|
||||||
|
* sub-pipelines) which are used internally by the ZIO module and are
|
||||||
|
* explained below:
|
||||||
|
*
|
||||||
|
* Interlock Pipeline:
|
||||||
|
* The interlock pipeline is the most basic pipeline and is used by all
|
||||||
|
* of the I/O operations. The interlock pipeline does not perform any I/O
|
||||||
|
* and is used to coordinate the dependencies between I/Os that are being
|
||||||
|
* issued (i.e. the parent/child relationship).
|
||||||
|
*
|
||||||
|
* Vdev child Pipeline:
|
||||||
|
* The vdev child pipeline is responsible for performing the physical I/O.
|
||||||
|
* It is in this pipeline where the I/Os are queued and possibly cached.
|
||||||
|
*
|
||||||
|
* In addition to performing I/O, the pipeline is also responsible for
|
||||||
|
* data transformations. The transformations performed are based on the
|
||||||
|
* specific properties that the user may have selected and modify the
|
||||||
|
* behavior of the pipeline. Examples of supported transformations are
|
||||||
|
* compression, dedup, and nop writes. Transformations will either modify
|
||||||
|
* the data or the pipeline. The list below further describes each of
|
||||||
|
* the supported transformations:
|
||||||
|
*
|
||||||
|
* Compression:
|
||||||
|
* ZFS supports three different flavors of compression -- gzip, lzjb, and
|
||||||
|
* zle. Compression occurs as part of the write pipeline and is performed
|
||||||
|
* in the ZIO_STAGE_WRITE_BP_INIT stage.
|
||||||
|
*
|
||||||
|
* Dedup:
|
||||||
|
* Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
|
||||||
|
* ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
|
||||||
|
* read pipeline if the dedup bit is set on the block pointer.
|
||||||
|
* Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
|
||||||
|
* and added to a write pipeline if a user has enabled dedup on that
|
||||||
|
* particular dataset.
|
||||||
|
*
|
||||||
|
* NOP Write:
|
||||||
|
* The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
|
||||||
|
* and is added to an existing write pipeline if a cryptographically
|
||||||
|
* secure checksum (i.e. SHA256) is enabled and compression is turned on.
|
||||||
|
* The NOP write stage will compare the checksums of the current data
|
||||||
|
* on-disk (level-0 blocks only) and the data that is currently being written.
|
||||||
|
* If the checksum values are identical then the pipeline is converted to
|
||||||
|
* an interlock pipeline skipping block allocation and bypassing the
|
||||||
|
* physical I/O. The nop write feature can handle writes in either
|
||||||
|
* syncing or open context (i.e. zil writes) and as a result is mutually
|
||||||
|
* exclusive with dedup.
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* zio pipeline stage definitions
|
* zio pipeline stage definitions
|
||||||
*/
|
*/
|
||||||
@ -50,27 +114,29 @@ enum zio_stage {
|
|||||||
|
|
||||||
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
|
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
|
||||||
|
|
||||||
ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
|
ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
|
||||||
ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
|
|
||||||
ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
|
|
||||||
ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
|
|
||||||
|
|
||||||
ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
|
ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
|
||||||
ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
|
ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
|
||||||
|
ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
|
||||||
|
ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
|
||||||
|
|
||||||
ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
|
ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
|
||||||
ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
|
ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
|
||||||
ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
|
|
||||||
|
|
||||||
ZIO_STAGE_READY = 1 << 15, /* RWFCI */
|
ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
|
||||||
|
ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
|
||||||
|
ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
|
||||||
|
|
||||||
ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
|
ZIO_STAGE_READY = 1 << 16, /* RWFCI */
|
||||||
ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
|
|
||||||
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
|
|
||||||
|
|
||||||
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
|
ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */
|
||||||
|
ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */
|
||||||
|
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */
|
||||||
|
|
||||||
ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
|
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
|
||||||
|
|
||||||
|
ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ZIO_INTERLOCK_STAGES \
|
#define ZIO_INTERLOCK_STAGES \
|
||||||
|
@ -3708,6 +3708,12 @@ arc_write_done(zio_t *zio)
|
|||||||
arc_hdr_destroy(exists);
|
arc_hdr_destroy(exists);
|
||||||
exists = buf_hash_insert(hdr, &hash_lock);
|
exists = buf_hash_insert(hdr, &hash_lock);
|
||||||
ASSERT3P(exists, ==, NULL);
|
ASSERT3P(exists, ==, NULL);
|
||||||
|
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
|
||||||
|
/* nopwrite */
|
||||||
|
ASSERT(zio->io_prop.zp_nopwrite);
|
||||||
|
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
|
||||||
|
panic("bad nopwrite, hdr=%p exists=%p",
|
||||||
|
(void *)hdr, (void *)exists);
|
||||||
} else {
|
} else {
|
||||||
/* Dedup */
|
/* Dedup */
|
||||||
ASSERT(hdr->b_datacnt == 1);
|
ASSERT(hdr->b_datacnt == 1);
|
||||||
|
@ -823,13 +823,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|||||||
ASSERT(db->db_data_pending != dr);
|
ASSERT(db->db_data_pending != dr);
|
||||||
|
|
||||||
/* free this block */
|
/* free this block */
|
||||||
if (!BP_IS_HOLE(bp)) {
|
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
|
||||||
spa_t *spa;
|
spa_t *spa;
|
||||||
|
|
||||||
DB_GET_SPA(&spa, db);
|
DB_GET_SPA(&spa, db);
|
||||||
zio_free(spa, txg, bp);
|
zio_free(spa, txg, bp);
|
||||||
}
|
}
|
||||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||||
|
dr->dt.dl.dr_nopwrite = B_FALSE;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Release the already-written buffer, so we leave it in
|
* Release the already-written buffer, so we leave it in
|
||||||
* a consistent dirty state. Note that all callers are
|
* a consistent dirty state. Note that all callers are
|
||||||
@ -2269,6 +2271,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
|
|||||||
return (res);
|
return (res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
blkptr_t *
|
||||||
|
dmu_buf_get_blkptr(dmu_buf_t *db)
|
||||||
|
{
|
||||||
|
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
||||||
|
return (dbi->db_blkptr);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
||||||
{
|
{
|
||||||
@ -2622,7 +2631,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|||||||
ASSERT0(zio->io_error);
|
ASSERT0(zio->io_error);
|
||||||
ASSERT(db->db_blkptr == bp);
|
ASSERT(db->db_blkptr == bp);
|
||||||
|
|
||||||
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
|
/*
|
||||||
|
* For nopwrites and rewrites we ensure that the bp matches our
|
||||||
|
* original and bypass all the accounting.
|
||||||
|
*/
|
||||||
|
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
|
||||||
ASSERT(BP_EQUAL(bp, bp_orig));
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
||||||
} else {
|
} else {
|
||||||
objset_t *os;
|
objset_t *os;
|
||||||
@ -2822,7 +2835,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|||||||
mutex_enter(&db->db_mtx);
|
mutex_enter(&db->db_mtx);
|
||||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||||
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
||||||
dr->dt.dl.dr_copies);
|
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
|
||||||
mutex_exit(&db->db_mtx);
|
mutex_exit(&db->db_mtx);
|
||||||
} else if (db->db_state == DB_NOFILL) {
|
} else if (db->db_state == DB_NOFILL) {
|
||||||
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
|
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
|
||||||
|
138
module/zfs/dmu.c
138
module/zfs/dmu.c
@ -41,12 +41,18 @@
|
|||||||
#include <sys/zfs_ioctl.h>
|
#include <sys/zfs_ioctl.h>
|
||||||
#include <sys/zap.h>
|
#include <sys/zap.h>
|
||||||
#include <sys/zio_checksum.h>
|
#include <sys/zio_checksum.h>
|
||||||
|
#include <sys/zio_compress.h>
|
||||||
#include <sys/sa.h>
|
#include <sys/sa.h>
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
#include <sys/vmsystm.h>
|
#include <sys/vmsystm.h>
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Enable/disable nopwrite feature.
|
||||||
|
*/
|
||||||
|
int zfs_nopwrite_enabled = 1;
|
||||||
|
|
||||||
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
||||||
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
|
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
|
||||||
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
|
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
|
||||||
@ -1473,6 +1479,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
|||||||
mutex_enter(&db->db_mtx);
|
mutex_enter(&db->db_mtx);
|
||||||
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
|
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
|
||||||
if (zio->io_error == 0) {
|
if (zio->io_error == 0) {
|
||||||
|
dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
|
||||||
|
if (dr->dt.dl.dr_nopwrite) {
|
||||||
|
ASSERTV(blkptr_t *bp = zio->io_bp);
|
||||||
|
ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
|
||||||
|
ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig));
|
||||||
|
|
||||||
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
||||||
|
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
|
||||||
|
ASSERT(zio_checksum_table[chksum].ci_dedup);
|
||||||
|
}
|
||||||
dr->dt.dl.dr_overridden_by = *zio->io_bp;
|
dr->dt.dl.dr_overridden_by = *zio->io_bp;
|
||||||
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
|
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
|
||||||
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
|
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
|
||||||
@ -1494,11 +1510,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
|
|||||||
{
|
{
|
||||||
blkptr_t *bp = zio->io_bp;
|
blkptr_t *bp = zio->io_bp;
|
||||||
dmu_sync_arg_t *dsa = zio->io_private;
|
dmu_sync_arg_t *dsa = zio->io_private;
|
||||||
|
ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
|
||||||
|
|
||||||
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
|
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
|
||||||
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
|
/*
|
||||||
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
|
* If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
|
||||||
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
|
* then there is nothing to do here. Otherwise, free the
|
||||||
|
* newly allocated block in this txg.
|
||||||
|
*/
|
||||||
|
if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
|
||||||
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
||||||
|
} else {
|
||||||
|
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
|
||||||
|
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
|
||||||
|
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
|
||||||
|
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dmu_tx_commit(dsa->dsa_tx);
|
dmu_tx_commit(dsa->dsa_tx);
|
||||||
@ -1544,7 +1571,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
|
|||||||
*
|
*
|
||||||
* Return values:
|
* Return values:
|
||||||
*
|
*
|
||||||
* EEXIST: this txg has already been synced, so there's nothing to to.
|
* EEXIST: this txg has already been synced, so there's nothing to do.
|
||||||
* The caller should not log the write.
|
* The caller should not log the write.
|
||||||
*
|
*
|
||||||
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
|
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
|
||||||
@ -1576,7 +1603,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
|||||||
dnode_t *dn;
|
dnode_t *dn;
|
||||||
|
|
||||||
ASSERT(pio != NULL);
|
ASSERT(pio != NULL);
|
||||||
ASSERT(BP_IS_HOLE(bp));
|
|
||||||
ASSERT(txg != 0);
|
ASSERT(txg != 0);
|
||||||
|
|
||||||
SET_BOOKMARK(&zb, ds->ds_object,
|
SET_BOOKMARK(&zb, ds->ds_object,
|
||||||
@ -1631,6 +1657,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
|||||||
return (SET_ERROR(ENOENT));
|
return (SET_ERROR(ENOENT));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assume the on-disk data is X, the current syncing data is Y,
|
||||||
|
* and the current in-memory data is Z (currently in dmu_sync).
|
||||||
|
* X and Z are identical but Y has been modified. Normally,
|
||||||
|
* when X and Z are the same we will perform a nopwrite but if Y
|
||||||
|
* is different we must disable nopwrite since the resulting write
|
||||||
|
* of Y to disk can free the block containing X. If we allowed a
|
||||||
|
* nopwrite to occur the block pointing to Z would reference a freed
|
||||||
|
* block. Since this is a rare case we simplify this by disabling
|
||||||
|
* nopwrite if the current dmu_sync-ing dbuf has been modified in
|
||||||
|
* a previous transaction.
|
||||||
|
*/
|
||||||
|
if (dr->dr_next)
|
||||||
|
zp.zp_nopwrite = B_FALSE;
|
||||||
|
|
||||||
ASSERT(dr->dr_txg == txg);
|
ASSERT(dr->dr_txg == txg);
|
||||||
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
|
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
|
||||||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
||||||
@ -1715,14 +1758,26 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||||||
enum zio_checksum checksum = os->os_checksum;
|
enum zio_checksum checksum = os->os_checksum;
|
||||||
enum zio_compress compress = os->os_compress;
|
enum zio_compress compress = os->os_compress;
|
||||||
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
|
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
|
||||||
boolean_t dedup;
|
boolean_t dedup = B_FALSE;
|
||||||
|
boolean_t nopwrite = B_FALSE;
|
||||||
boolean_t dedup_verify = os->os_dedup_verify;
|
boolean_t dedup_verify = os->os_dedup_verify;
|
||||||
int copies = os->os_copies;
|
int copies = os->os_copies;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Determine checksum setting.
|
* We maintain different write policies for each of the following
|
||||||
|
* types of data:
|
||||||
|
* 1. metadata
|
||||||
|
* 2. preallocated blocks (i.e. level-0 blocks of a dump device)
|
||||||
|
* 3. all other level 0 blocks
|
||||||
*/
|
*/
|
||||||
if (ismd) {
|
if (ismd) {
|
||||||
|
/*
|
||||||
|
* XXX -- we should design a compression algorithm
|
||||||
|
* that specializes in arrays of bps.
|
||||||
|
*/
|
||||||
|
compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
|
||||||
|
ZIO_COMPRESS_LZJB;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Metadata always gets checksummed. If the data
|
* Metadata always gets checksummed. If the data
|
||||||
* checksum is multi-bit correctable, and it's not a
|
* checksum is multi-bit correctable, and it's not a
|
||||||
@ -1733,45 +1788,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||||||
if (zio_checksum_table[checksum].ci_correctable < 1 ||
|
if (zio_checksum_table[checksum].ci_correctable < 1 ||
|
||||||
zio_checksum_table[checksum].ci_eck)
|
zio_checksum_table[checksum].ci_eck)
|
||||||
checksum = ZIO_CHECKSUM_FLETCHER_4;
|
checksum = ZIO_CHECKSUM_FLETCHER_4;
|
||||||
} else {
|
} else if (wp & WP_NOFILL) {
|
||||||
checksum = zio_checksum_select(dn->dn_checksum, checksum);
|
ASSERT(level == 0);
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Determine compression setting.
|
|
||||||
*/
|
|
||||||
if (ismd) {
|
|
||||||
/*
|
/*
|
||||||
* XXX -- we should design a compression algorithm
|
* If we're writing preallocated blocks, we aren't actually
|
||||||
* that specializes in arrays of bps.
|
* writing them so don't set any policy properties. These
|
||||||
|
* blocks are currently only used by an external subsystem
|
||||||
|
* outside of zfs (i.e. dump) and not written by the zio
|
||||||
|
* pipeline.
|
||||||
*/
|
*/
|
||||||
compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
|
compress = ZIO_COMPRESS_OFF;
|
||||||
ZIO_COMPRESS_LZJB;
|
checksum = ZIO_CHECKSUM_OFF;
|
||||||
} else {
|
} else {
|
||||||
compress = zio_compress_select(dn->dn_compress, compress);
|
compress = zio_compress_select(dn->dn_compress, compress);
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
|
||||||
* Determine dedup setting. If we are in dmu_sync(), we won't
|
zio_checksum_select(dn->dn_checksum, checksum) :
|
||||||
* actually dedup now because that's all done in syncing context;
|
dedup_checksum;
|
||||||
* but we do want to use the dedup checkum. If the checksum is not
|
|
||||||
* strong enough to ensure unique signatures, force dedup_verify.
|
|
||||||
*/
|
|
||||||
dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
|
|
||||||
if (dedup) {
|
|
||||||
checksum = dedup_checksum;
|
|
||||||
if (!zio_checksum_table[checksum].ci_dedup)
|
|
||||||
dedup_verify = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wp & WP_DMU_SYNC)
|
/*
|
||||||
dedup = 0;
|
* Determine dedup setting. If we are in dmu_sync(),
|
||||||
|
* we won't actually dedup now because that's all
|
||||||
|
* done in syncing context; but we do want to use the
|
||||||
|
* dedup checksum. If the checksum is not strong
|
||||||
|
* enough to ensure unique signatures, force
|
||||||
|
* dedup_verify.
|
||||||
|
*/
|
||||||
|
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
|
||||||
|
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
|
||||||
|
if (!zio_checksum_table[checksum].ci_dedup)
|
||||||
|
dedup_verify = B_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
if (wp & WP_NOFILL) {
|
/*
|
||||||
ASSERT(!ismd && level == 0);
|
* Enable nopwrite if we have a cryptographically secure
|
||||||
checksum = ZIO_CHECKSUM_OFF;
|
* checksum that has no known collisions (i.e. SHA-256)
|
||||||
compress = ZIO_COMPRESS_OFF;
|
* and compression is enabled. We don't enable nopwrite if
|
||||||
dedup = B_FALSE;
|
* dedup is enabled as the two features are mutually exclusive.
|
||||||
|
*/
|
||||||
|
nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
|
||||||
|
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
|
||||||
}
|
}
|
||||||
|
|
||||||
zp->zp_checksum = checksum;
|
zp->zp_checksum = checksum;
|
||||||
@ -1781,6 +1838,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||||||
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
|
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
|
||||||
zp->zp_dedup = dedup;
|
zp->zp_dedup = dedup;
|
||||||
zp->zp_dedup_verify = dedup && dedup_verify;
|
zp->zp_dedup_verify = dedup && dedup_verify;
|
||||||
|
zp->zp_nopwrite = nopwrite;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
@ -2005,4 +2063,8 @@ EXPORT_SYMBOL(dmu_ot);
|
|||||||
|
|
||||||
module_param(zfs_mdcomp_disable, int, 0644);
|
module_param(zfs_mdcomp_disable, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
|
MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
|
||||||
|
|
||||||
|
module_param(zfs_nopwrite_enabled, int, 0644);
|
||||||
|
MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -421,7 +421,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
|
|||||||
* clean up our in-memory structures accumulated while syncing:
|
* clean up our in-memory structures accumulated while syncing:
|
||||||
*
|
*
|
||||||
* - move dead blocks from the pending deadlist to the on-disk deadlist
|
* - move dead blocks from the pending deadlist to the on-disk deadlist
|
||||||
* - clean up zil records
|
|
||||||
* - release hold from dsl_dataset_dirty()
|
* - release hold from dsl_dataset_dirty()
|
||||||
*/
|
*/
|
||||||
while ((ds = list_remove_head(&synced_datasets))) {
|
while ((ds = list_remove_head(&synced_datasets))) {
|
||||||
|
@ -245,7 +245,6 @@ int spa_mode_global;
|
|||||||
* Secondly, the value determines if an I/O is considered "hung".
|
* Secondly, the value determines if an I/O is considered "hung".
|
||||||
* Any I/O that has not completed in zfs_deadman_synctime is considered
|
* Any I/O that has not completed in zfs_deadman_synctime is considered
|
||||||
* "hung" resulting in a zevent being posted.
|
* "hung" resulting in a zevent being posted.
|
||||||
* 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
|
|
||||||
*/
|
*/
|
||||||
unsigned long zfs_deadman_synctime = 1000ULL;
|
unsigned long zfs_deadman_synctime = 1000ULL;
|
||||||
|
|
||||||
|
@ -1050,6 +1050,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
|||||||
DMU_READ_NO_PREFETCH);
|
DMU_READ_NO_PREFETCH);
|
||||||
|
|
||||||
if (error == 0) {
|
if (error == 0) {
|
||||||
|
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||||
|
if (obp) {
|
||||||
|
ASSERT(BP_IS_HOLE(bp));
|
||||||
|
*bp = *obp;
|
||||||
|
}
|
||||||
|
|
||||||
zgd->zgd_db = db;
|
zgd->zgd_db = db;
|
||||||
zgd->zgd_bp = bp;
|
zgd->zgd_bp = bp;
|
||||||
|
|
||||||
|
105
module/zfs/zio.c
105
module/zfs/zio.c
@ -723,9 +723,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
|||||||
DMU_OT_IS_VALID(zp->zp_type) &&
|
DMU_OT_IS_VALID(zp->zp_type) &&
|
||||||
zp->zp_level < 32 &&
|
zp->zp_level < 32 &&
|
||||||
zp->zp_copies > 0 &&
|
zp->zp_copies > 0 &&
|
||||||
zp->zp_copies <= spa_max_replication(spa) &&
|
zp->zp_copies <= spa_max_replication(spa));
|
||||||
zp->zp_dedup <= 1 &&
|
|
||||||
zp->zp_dedup_verify <= 1);
|
|
||||||
|
|
||||||
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
|
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
|
||||||
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
||||||
@ -753,13 +751,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
|
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
|
||||||
{
|
{
|
||||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||||
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||||
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
|
||||||
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
|
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We must reset the io_prop to match the values that existed
|
||||||
|
* when the bp was first written by dmu_sync() keeping in mind
|
||||||
|
* that nopwrite and dedup are mutually exclusive.
|
||||||
|
*/
|
||||||
|
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
|
||||||
|
zio->io_prop.zp_nopwrite = nopwrite;
|
||||||
zio->io_prop.zp_copies = copies;
|
zio->io_prop.zp_copies = copies;
|
||||||
zio->io_bp_override = bp;
|
zio->io_bp_override = bp;
|
||||||
}
|
}
|
||||||
@ -1051,6 +1056,19 @@ zio_write_bp_init(zio_t *zio)
|
|||||||
*bp = *zio->io_bp_override;
|
*bp = *zio->io_bp_override;
|
||||||
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we've been overridden and nopwrite is set then
|
||||||
|
* set the flag accordingly to indicate that a nopwrite
|
||||||
|
* has already occurred.
|
||||||
|
*/
|
||||||
|
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
|
||||||
|
ASSERT(!zp->zp_dedup);
|
||||||
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
||||||
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT(!zp->zp_nopwrite);
|
||||||
|
|
||||||
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
|
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
|
||||||
return (ZIO_PIPELINE_CONTINUE);
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
|
|
||||||
@ -1138,6 +1156,11 @@ zio_write_bp_init(zio_t *zio)
|
|||||||
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
||||||
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
|
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
|
||||||
}
|
}
|
||||||
|
if (zp->zp_nopwrite) {
|
||||||
|
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||||
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
||||||
|
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (ZIO_PIPELINE_CONTINUE);
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
@ -1404,6 +1427,7 @@ zio_reexecute(zio_t *pio)
|
|||||||
pio->io_stage = pio->io_orig_stage;
|
pio->io_stage = pio->io_orig_stage;
|
||||||
pio->io_pipeline = pio->io_orig_pipeline;
|
pio->io_pipeline = pio->io_orig_pipeline;
|
||||||
pio->io_reexecute = 0;
|
pio->io_reexecute = 0;
|
||||||
|
pio->io_flags |= ZIO_FLAG_REEXECUTED;
|
||||||
pio->io_error = 0;
|
pio->io_error = 0;
|
||||||
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
for (w = 0; w < ZIO_WAIT_TYPES; w++)
|
||||||
pio->io_state[w] = 0;
|
pio->io_state[w] = 0;
|
||||||
@ -1887,8 +1911,9 @@ zio_write_gang_block(zio_t *pio)
|
|||||||
zp.zp_type = DMU_OT_NONE;
|
zp.zp_type = DMU_OT_NONE;
|
||||||
zp.zp_level = 0;
|
zp.zp_level = 0;
|
||||||
zp.zp_copies = gio->io_prop.zp_copies;
|
zp.zp_copies = gio->io_prop.zp_copies;
|
||||||
zp.zp_dedup = 0;
|
zp.zp_dedup = B_FALSE;
|
||||||
zp.zp_dedup_verify = 0;
|
zp.zp_dedup_verify = B_FALSE;
|
||||||
|
zp.zp_nopwrite = B_FALSE;
|
||||||
|
|
||||||
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
|
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
|
||||||
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
|
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
|
||||||
@ -1912,6 +1937,62 @@ zio_write_gang_block(zio_t *pio)
|
|||||||
return (ZIO_PIPELINE_CONTINUE);
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The zio_nop_write stage in the pipeline determines if allocating
|
||||||
|
* a new bp is necessary. By leveraging a cryptographically secure checksum,
|
||||||
|
* such as SHA256, we can compare the checksums of the new data and the old
|
||||||
|
* to determine if allocating a new block is required. The nopwrite
|
||||||
|
* feature can handle writes in either syncing or open context (i.e. zil
|
||||||
|
* writes) and as a result is mutually exclusive with dedup.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
zio_nop_write(zio_t *zio)
|
||||||
|
{
|
||||||
|
blkptr_t *bp = zio->io_bp;
|
||||||
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
||||||
|
zio_prop_t *zp = &zio->io_prop;
|
||||||
|
|
||||||
|
ASSERT(BP_GET_LEVEL(bp) == 0);
|
||||||
|
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
|
||||||
|
ASSERT(zp->zp_nopwrite);
|
||||||
|
ASSERT(!zp->zp_dedup);
|
||||||
|
ASSERT(zio->io_bp_override == NULL);
|
||||||
|
ASSERT(IO_IS_ALLOCATING(zio));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check to see if the original bp and the new bp have matching
|
||||||
|
* characteristics (i.e. same checksum, compression algorithms, etc).
|
||||||
|
* If they don't then just continue with the pipeline which will
|
||||||
|
* allocate a new bp.
|
||||||
|
*/
|
||||||
|
if (BP_IS_HOLE(bp_orig) ||
|
||||||
|
!zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
|
||||||
|
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
|
||||||
|
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
|
||||||
|
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
|
||||||
|
zp->zp_copies != BP_GET_NDVAS(bp_orig))
|
||||||
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the checksums match then reset the pipeline so that we
|
||||||
|
* avoid allocating a new bp and issuing any I/O.
|
||||||
|
*/
|
||||||
|
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
|
||||||
|
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
|
||||||
|
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
|
||||||
|
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
|
||||||
|
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
|
||||||
|
ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
|
||||||
|
sizeof (uint64_t)) == 0);
|
||||||
|
|
||||||
|
*bp = *bp_orig;
|
||||||
|
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||||
|
zio->io_flags |= ZIO_FLAG_NOPWRITE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (ZIO_PIPELINE_CONTINUE);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ==========================================================================
|
* ==========================================================================
|
||||||
* Dedup
|
* Dedup
|
||||||
@ -2186,7 +2267,7 @@ zio_ddt_write(zio_t *zio)
|
|||||||
zio->io_stage = ZIO_STAGE_OPEN;
|
zio->io_stage = ZIO_STAGE_OPEN;
|
||||||
BP_ZERO(bp);
|
BP_ZERO(bp);
|
||||||
} else {
|
} else {
|
||||||
zp->zp_dedup = 0;
|
zp->zp_dedup = B_FALSE;
|
||||||
}
|
}
|
||||||
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
zio->io_pipeline = ZIO_WRITE_PIPELINE;
|
||||||
ddt_exit(ddt);
|
ddt_exit(ddt);
|
||||||
@ -2815,7 +2896,8 @@ zio_ready(zio_t *zio)
|
|||||||
|
|
||||||
if (zio->io_ready) {
|
if (zio->io_ready) {
|
||||||
ASSERT(IO_IS_ALLOCATING(zio));
|
ASSERT(IO_IS_ALLOCATING(zio));
|
||||||
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
|
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
|
||||||
|
(zio->io_flags & ZIO_FLAG_NOPWRITE));
|
||||||
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
|
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
|
||||||
|
|
||||||
zio->io_ready(zio);
|
zio->io_ready(zio);
|
||||||
@ -2893,6 +2975,8 @@ zio_done(zio_t *zio)
|
|||||||
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
|
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
|
||||||
(BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
|
(BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp)));
|
||||||
}
|
}
|
||||||
|
if (zio->io_flags & ZIO_FLAG_NOPWRITE)
|
||||||
|
VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -3015,7 +3099,7 @@ zio_done(zio_t *zio)
|
|||||||
|
|
||||||
if ((zio->io_error || zio->io_reexecute) &&
|
if ((zio->io_error || zio->io_reexecute) &&
|
||||||
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
|
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
|
||||||
!(zio->io_flags & ZIO_FLAG_IO_REWRITE))
|
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
|
||||||
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
|
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
|
||||||
|
|
||||||
zio_gang_tree_free(&zio->io_gang_tree);
|
zio_gang_tree_free(&zio->io_gang_tree);
|
||||||
@ -3112,7 +3196,7 @@ zio_done(zio_t *zio)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
|
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
|
||||||
!BP_IS_HOLE(zio->io_bp)) {
|
!BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
|
||||||
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
|
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3159,6 +3243,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
|
|||||||
zio_issue_async,
|
zio_issue_async,
|
||||||
zio_write_bp_init,
|
zio_write_bp_init,
|
||||||
zio_checksum_generate,
|
zio_checksum_generate,
|
||||||
|
zio_nop_write,
|
||||||
zio_ddt_read_start,
|
zio_ddt_read_start,
|
||||||
zio_ddt_read_done,
|
zio_ddt_read_done,
|
||||||
zio_ddt_write,
|
zio_ddt_write,
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
* needs to be run before opening and using a device.
|
* needs to be run before opening and using a device.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <sys/dbuf.h>
|
||||||
#include <sys/dmu_traverse.h>
|
#include <sys/dmu_traverse.h>
|
||||||
#include <sys/dsl_dataset.h>
|
#include <sys/dsl_dataset.h>
|
||||||
#include <sys/dsl_prop.h>
|
#include <sys/dsl_prop.h>
|
||||||
@ -815,8 +816,10 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
|||||||
{
|
{
|
||||||
zvol_state_t *zv = arg;
|
zvol_state_t *zv = arg;
|
||||||
objset_t *os = zv->zv_objset;
|
objset_t *os = zv->zv_objset;
|
||||||
|
uint64_t object = ZVOL_OBJ;
|
||||||
uint64_t offset = lr->lr_offset;
|
uint64_t offset = lr->lr_offset;
|
||||||
uint64_t size = lr->lr_length;
|
uint64_t size = lr->lr_length;
|
||||||
|
blkptr_t *bp = &lr->lr_blkptr;
|
||||||
dmu_buf_t *db;
|
dmu_buf_t *db;
|
||||||
zgd_t *zgd;
|
zgd_t *zgd;
|
||||||
int error;
|
int error;
|
||||||
@ -836,14 +839,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
|
|||||||
* we don't have to write the data twice.
|
* we don't have to write the data twice.
|
||||||
*/
|
*/
|
||||||
if (buf != NULL) { /* immediate write */
|
if (buf != NULL) { /* immediate write */
|
||||||
error = dmu_read(os, ZVOL_OBJ, offset, size, buf,
|
error = dmu_read(os, object, offset, size, buf,
|
||||||
DMU_READ_NO_PREFETCH);
|
DMU_READ_NO_PREFETCH);
|
||||||
} else {
|
} else {
|
||||||
size = zv->zv_volblocksize;
|
size = zv->zv_volblocksize;
|
||||||
offset = P2ALIGN_TYPED(offset, size, uint64_t);
|
offset = P2ALIGN_TYPED(offset, size, uint64_t);
|
||||||
error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db,
|
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||||
DMU_READ_NO_PREFETCH);
|
DMU_READ_NO_PREFETCH);
|
||||||
if (error == 0) {
|
if (error == 0) {
|
||||||
|
blkptr_t *obp = dmu_buf_get_blkptr(db);
|
||||||
|
if (obp) {
|
||||||
|
ASSERT(BP_IS_HOLE(bp));
|
||||||
|
*bp = *obp;
|
||||||
|
}
|
||||||
|
|
||||||
zgd->zgd_db = db;
|
zgd->zgd_db = db;
|
||||||
zgd->zgd_bp = &lr->lr_blkptr;
|
zgd->zgd_bp = &lr->lr_blkptr;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user