From 03c6040bee6c87a9413b7da41d9f580f79a8ab62 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 10 May 2013 12:47:54 -0700 Subject: [PATCH] Illumos #3236 3236 zio nop-write Reviewed by: Matt Ahrens Reviewed by: Adam Leventhal Reviewed by: Christopher Siden Approved by: Garrett D'Amore References: illumos/illumos-gate@80901aea8e78a2c20751f61f01bebd1d5b5c2ba5 https://www.illumos.org/issues/3236 Porting Notes 1. This patch is being merged dispite an increased instance of https://www.illumos.org/issues/3113 being triggered by ztest. Ported-by: Brian Behlendorf Issue #1489 --- cmd/ztest/ztest.c | 41 +++++++++++- include/sys/dbuf.h | 2 + include/sys/dmu.h | 5 ++ include/sys/zio.h | 12 ++-- include/sys/zio_impl.h | 96 +++++++++++++++++++++++----- module/zfs/arc.c | 6 ++ module/zfs/dbuf.c | 19 +++++- module/zfs/dmu.c | 138 +++++++++++++++++++++++++++++------------ module/zfs/dsl_pool.c | 1 - module/zfs/spa_misc.c | 1 - module/zfs/zfs_vnops.c | 6 ++ module/zfs/zio.c | 105 ++++++++++++++++++++++++++++--- module/zfs/zvol.c | 13 +++- 13 files changed, 369 insertions(+), 76 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 48da5b206..7b09532c0 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -208,6 +208,7 @@ enum ztest_io_type { ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, + ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; @@ -1900,6 +1901,12 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; @@ -2048,6 +2055,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) continue; } + /* + * No object was found. + */ if (od->od_object == 0) continue; @@ -2163,6 +2173,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { + int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; @@ -2217,6 +2228,25 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) break; default: break; + + case ZTEST_IO_REWRITE: + (void) rw_enter(&ztest_name_lock, RW_READER); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_COMPRESSION, + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + (void) rw_exit(&ztest_name_lock); + + VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, + DMU_READ_NO_PREFETCH)); + + (void) ztest_write(zd, object, offset, blocksize, data); + break; } (void) rw_exit(&zd->zd_zilog_lock); @@ -2304,6 +2334,11 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; + /* + * We grab the zd_dirobj_lock to ensure that no other thread is + * updating the zil (i.e. adding in-memory log records) and the + * zd_zilog_lock to block any I/O. + */ mutex_enter(&zd->zd_dirobj_lock); (void) rw_enter(&zd->zd_zilog_lock, RW_WRITER); @@ -5121,8 +5156,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) /* * Find out what block we got. */ - VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, - DMU_READ_NO_PREFETCH) == 0); + VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, + DMU_READ_NO_PREFETCH)); blk = *((dmu_buf_impl_t *)db)->db_blkptr; dmu_buf_rele(db, FTAG); @@ -5824,6 +5859,8 @@ ztest_freeze(void) kernel_init(FREAD | FWRITE); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY3U(0, ==, ztest_dataset_open(0)); + spa->spa_debug = B_TRUE; + ztest_spa = spa; /* * Force the first log block to be transactionally allocated. diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 85e967d0f..3140665ab 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -131,6 +132,7 @@ typedef struct dbuf_dirty_record { blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; + boolean_t dr_nopwrite; } dl; } dt; } dbuf_dirty_record_t; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0dd7e2835..fdd9923e6 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -497,6 +497,11 @@ void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); */ void *dmu_buf_get_user(dmu_buf_t *db); +/* + * Returns the blkptr associated with this dbuf, or NULL if not set. + */ +struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); + /* * Indicate that you are going to modify the buffer's data (db_data). * diff --git a/include/sys/zio.h b/include/sys/zio.h index f5a128e0b..b505ca1e6 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -196,7 +196,9 @@ enum zio_flag { ZIO_FLAG_GANG_CHILD = 1 << 22, ZIO_FLAG_DDT_CHILD = 1 << 23, ZIO_FLAG_GODFATHER = 1 << 24, - ZIO_FLAG_FASTWRITE = 1 << 25 + ZIO_FLAG_NOPWRITE = 1 << 25, + ZIO_FLAG_REEXECUTED = 1 << 26, + ZIO_FLAG_FASTWRITE = 1 << 27 }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -296,8 +298,9 @@ typedef struct zio_prop { dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; - uint8_t zp_dedup; - uint8_t zp_dedup_verify; + boolean_t zp_dedup; + boolean_t zp_dedup_verify; + boolean_t zp_nopwrite; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; @@ -466,7 +469,8 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, enum zio_flag flags, zbookmark_t *zb); -extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, + boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2d062d091..787079d2c 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -37,6 +37,70 @@ extern "C" { #endif +/* + * XXX -- Describe ZFS I/O pipleine here. Fill in as needed. + * + * The ZFS I/O pipeline is comprised of various stages which are defined + * in the zio_stage enum below. The individual stages are used to construct + * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. + * + * I/O operations: (XXX - provide detail for each of the operations) + * + * Read: + * Write: + * Free: + * Claim: + * Ioctl: + * + * Although the most common pipeline are used by the basic I/O operations + * above, there are some helper pipelines (one could consider them + * sub-pipelines) which are used internally by the ZIO module and are + * explained below: + * + * Interlock Pipeline: + * The interlock pipeline is the most basic pipeline and is used by all + * of the I/O operations. The interlock pipeline does not perform any I/O + * and is used to coordinate the dependencies between I/Os that are being + * issued (i.e. the parent/child relationship). + * + * Vdev child Pipeline: + * The vdev child pipeline is responsible for performing the physical I/O. + * It is in this pipeline where the I/O are queued and possibly cached. + * + * In addition to performing I/O, the pipeline is also responsible for + * data transformations. The transformations performed are based on the + * specific properties that user may have selected and modify the + * behavior of the pipeline. Examples of supported transformations are + * compression, dedup, and nop writes. Transformations will either modify + * the data or the pipeline. This list below further describes each of + * the supported transformations: + * + * Compression: + * ZFS supports three different flavors of compression -- gzip, lzjb, and + * zle. Compression occurs as part of the write pipeline and is performed + * in the ZIO_STAGE_WRITE_BP_INIT stage. + * + * Dedup: + * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and + * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing + * read pipeline if the dedup bit is set on the block pointer. + * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage + * and added to a write pipeline if a user has enabled dedup on that + * particular dataset. + * + * NOP Write: + * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage + * and is added to an existing write pipeline if a crypographically + * secure checksum (i.e. SHA256) is enabled and compression is turned on. + * The NOP write stage will compare the checksums of the current data + * on-disk (level-0 blocks only) and the data that is currently being written. + * If the checksum values are identical then the pipeline is converted to + * an interlock pipeline skipping block allocation and bypassing the + * physical I/O. The nop write feature can handle writes in either + * syncing or open context (i.e. zil writes) and as a result is mutually + * exclusive with dedup. + */ + /* * zio pipeline stage definitions */ @@ -50,27 +114,29 @@ enum zio_stage { ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ + ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ + ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */ - ZIO_STAGE_READY = 1 << 15, /* RWFCI */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */ - ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ + ZIO_STAGE_READY = 1 << 16, /* RWFCI */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ + ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */ - ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ + + ZIO_STAGE_DONE = 1 << 21 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index c66ff009d..04f87a810 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3708,6 +3708,12 @@ arc_write_done(zio_t *zio) arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); + } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + /* nopwrite */ + ASSERT(zio->io_prop.zp_nopwrite); + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad nopwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_datacnt == 1); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 95c7b3297..cfa16b7c0 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -823,13 +823,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp)) { + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { spa_t *spa; DB_GET_SPA(&spa, db); zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + dr->dt.dl.dr_nopwrite = B_FALSE; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -2269,6 +2271,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf) return (res); } +blkptr_t * +dmu_buf_get_blkptr(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_blkptr); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -2622,7 +2631,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + /* + * For nopwrites and rewrites we ensure that the bp matches our + * original and bypass all the accounting. + */ + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { objset_t *os; @@ -2822,7 +2835,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 34f3eeef9..1e16b1eb0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -41,12 +41,18 @@ #include #include #include +#include #include #ifdef _KERNEL #include #include #endif +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, @@ -1473,6 +1479,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + ASSERTV(blkptr_t *bp = zio->io_bp); + ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); + ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig)); + + ASSERT(BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + ASSERT(zio_checksum_table[chksum].ci_dedup); + } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; @@ -1494,11 +1510,22 @@ dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; + ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + /* + * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) + * then there is nothing to do here. Otherwise, free the + * newly allocated block in this txg. + */ + if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } } dmu_tx_commit(dsa->dsa_tx); @@ -1544,7 +1571,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, * * Return values: * - * EEXIST: this txg has already been synced, so there's nothing to to. + * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. @@ -1576,7 +1603,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dnode_t *dn; ASSERT(pio != NULL); - ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, @@ -1631,6 +1657,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (SET_ERROR(ENOENT)); } + ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + + /* + * Assume the on-disk data is X, the current syncing data is Y, + * and the current in-memory data is Z (currently in dmu_sync). + * X and Z are identical but Y is has been modified. Normally, + * when X and Z are the same we will perform a nopwrite but if Y + * is different we must disable nopwrite since the resulting write + * of Y to disk can free the block containing X. If we allowed a + * nopwrite to occur the block pointing to Z would reference a freed + * block. Since this is a rare case we simplify this by disabling + * nopwrite if the current dmu_sync-ing dbuf has been modified in + * a previous transaction. + */ + if (dr->dr_next) + zp.zp_nopwrite = B_FALSE; + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { @@ -1715,14 +1758,26 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup; + boolean_t dedup = B_FALSE; + boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* - * Determine checksum setting. + * We maintain different write policies for each of the following + * types of data: + * 1. metadata + * 2. preallocated blocks (i.e. level-0 blocks of a dump device) + * 3. all other level 0 blocks */ if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a @@ -1733,45 +1788,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - checksum = zio_checksum_select(dn->dn_checksum, checksum); - } + } else if (wp & WP_NOFILL) { + ASSERT(level == 0); - /* - * Determine compression setting. - */ - if (ismd) { /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. + * If we're writing preallocated blocks, we aren't actually + * writing them so don't set any policy properties. These + * blocks are currently only used by an external subsystem + * outside of zfs (i.e. dump) and not written by the zio + * pipeline. */ - compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; + compress = ZIO_COMPRESS_OFF; + checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(dn->dn_compress, compress); - } - /* - * Determine dedup setting. If we are in dmu_sync(), we won't - * actually dedup now because that's all done in syncing context; - * but we do want to use the dedup checkum. If the checksum is not - * strong enough to ensure unique signatures, force dedup_verify. - */ - dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); - if (dedup) { - checksum = dedup_checksum; - if (!zio_checksum_table[checksum].ci_dedup) - dedup_verify = 1; - } + checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? + zio_checksum_select(dn->dn_checksum, checksum) : + dedup_checksum; - if (wp & WP_DMU_SYNC) - dedup = 0; + /* + * Determine dedup setting. If we are in dmu_sync(), + * we won't actually dedup now because that's all + * done in syncing context; but we do want to use the + * dedup checkum. If the checksum is not strong + * enough to ensure unique signatures, force + * dedup_verify. + */ + if (dedup_checksum != ZIO_CHECKSUM_OFF) { + dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = B_TRUE; + } - if (wp & WP_NOFILL) { - ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; - compress = ZIO_COMPRESS_OFF; - dedup = B_FALSE; + /* + * Enable nopwrite if we have a cryptographically secure + * checksum that has no known collisions (i.e. SHA-256) + * and compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually exclusive. + */ + nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && + compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; @@ -1781,6 +1838,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; } int @@ -2005,4 +2063,8 @@ EXPORT_SYMBOL(dmu_ot); module_param(zfs_mdcomp_disable, int, 0644); MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); + +module_param(zfs_nopwrite_enabled, int, 0644); +MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); + #endif diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 72e819f6b..e7127c535 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -421,7 +421,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist to the on-disk deadlist - * - clean up zil records * - release hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets))) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 3b7922b6c..7bdfbf738 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -245,7 +245,6 @@ int spa_mode_global; * Secondly, the value determines if an I/O is considered "hung". * Any I/O that has not completed in zfs_deadman_synctime is considered * "hung" resulting in a zevent being posted. - * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). */ unsigned long zfs_deadman_synctime = 1000ULL; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index e6c1711ac..84b4fe81f 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1050,6 +1050,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a24f2bef3..1f803a44b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -723,9 +723,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa) && - zp->zp_dedup <= 1 && - zp->zp_dedup_verify <= 1); + zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, @@ -753,13 +751,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + /* + * We must reset the io_prop to match the values that existed + * when the bp was first written by dmu_sync() keeping in mind + * that nopwrite and dedup are mutually exclusive. + */ + zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; + zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } @@ -1051,6 +1056,19 @@ zio_write_bp_init(zio_t *zio) *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * If we've been overridden and nopwrite is set then + * set the flag accordingly to indicate that a nopwrite + * has already occurred. + */ + if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { + ASSERT(!zp->zp_dedup); + zio->io_flags |= ZIO_FLAG_NOPWRITE; + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(!zp->zp_nopwrite); + if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); @@ -1138,6 +1156,11 @@ zio_write_bp_init(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } + if (zp->zp_nopwrite) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; + } } return (ZIO_PIPELINE_CONTINUE); @@ -1404,6 +1427,7 @@ zio_reexecute(zio_t *pio) pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; + pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_error = 0; for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; @@ -1887,8 +1911,9 @@ zio_write_gang_block(zio_t *pio) zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; - zp.zp_dedup = 0; - zp.zp_dedup_verify = 0; + zp.zp_dedup = B_FALSE; + zp.zp_dedup_verify = B_FALSE; + zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1912,6 +1937,62 @@ zio_write_gang_block(zio_t *pio) return (ZIO_PIPELINE_CONTINUE); } +/* + * The zio_nop_write stage in the pipeline determines if allocating + * a new bp is necessary. By leveraging a cryptographically secure checksum, + * such as SHA256, we can compare the checksums of the new data and the old + * to determine if allocating a new block is required. The nopwrite + * feature can handle writes in either syncing or open context (i.e. zil + * writes) and as a result is mutually exclusive with dedup. + */ +static int +zio_nop_write(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(zp->zp_nopwrite); + ASSERT(!zp->zp_dedup); + ASSERT(zio->io_bp_override == NULL); + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Check to see if the original bp and the new bp have matching + * characteristics (i.e. same checksum, compression algorithms, etc). + * If they don't then just continue with the pipeline which will + * allocate a new bp. + */ + if (BP_IS_HOLE(bp_orig) || + !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || + BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || + BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || + BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || + zp->zp_copies != BP_GET_NDVAS(bp_orig)) + return (ZIO_PIPELINE_CONTINUE); + + /* + * If the checksums match then reset the pipeline so that we + * avoid allocating a new bp and issuing any I/O. + */ + if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { + ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); + ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); + ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); + ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, + sizeof (uint64_t)) == 0); + + *bp = *bp_orig; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + zio->io_flags |= ZIO_FLAG_NOPWRITE; + } + + return (ZIO_PIPELINE_CONTINUE); +} + /* * ========================================================================== * Dedup @@ -2186,7 +2267,7 @@ zio_ddt_write(zio_t *zio) zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { - zp->zp_dedup = 0; + zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); @@ -2815,7 +2896,8 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || + (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); @@ -2893,6 +2975,8 @@ zio_done(zio_t *zio) ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } + if (zio->io_flags & ZIO_FLAG_NOPWRITE) + VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* @@ -3015,7 +3099,7 @@ zio_done(zio_t *zio) if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && - !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) + !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); @@ -3112,7 +3196,7 @@ zio_done(zio_t *zio) } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp)) { + !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } @@ -3159,6 +3243,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_issue_async, zio_write_bp_init, zio_checksum_generate, + zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 59822a6cd..79c56cd78 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -35,6 +35,7 @@ * needs to be run before opening and using a device. */ +#include #include #include #include @@ -815,8 +816,10 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; + uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error; @@ -836,14 +839,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - error = dmu_read(os, ZVOL_OBJ, offset, size, buf, + error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } else { size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); - error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db, + error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = &lr->lr_blkptr;