diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 60d4aa58a..0dd7e2835 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -407,6 +407,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). + * + * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); @@ -662,8 +664,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +/* + * Like dmu_object_info_from_db, but faster still when you only care about + * the size. This is specifically optimized for zfs_getattr(). + */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index be23382b5..ede0d91f8 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -4791,6 +4791,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) return (err); } +/* + * Convert the zvol's volume size to an appropriate reservation. + * Note: If this routine is updated, it is necessary to update the ZFS test + * suite's shell version in reservation.kshlib. + */ uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a521501bd..2ae4c37a3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -260,7 +260,18 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; kstat_named_t arcstat_recycle_miss; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; @@ -3174,6 +3185,10 @@ top: mutex_exit(hash_lock); + /* + * At this point, we have a level 1 cache miss. Try again in + * L2ARC if possible. + */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_t *, zb); @@ -3445,8 +3460,8 @@ arc_buf_evict(arc_buf_t *buf) } /* - * Release this buffer from the cache. This must be done - * after a read and prior to modifying the buffer contents. + * Release this buffer from the cache, making it an anonymous buffer. This + * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. 
*/ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9c60ec55a..95c7b3297 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -691,6 +691,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (!havepzio) err = zio_wait(zio); } else { + /* + * Another reader came in while the dbuf was in flight + * between UNCACHED and CACHED. Either a writer will finish + * writing the buffer (sending the dbuf to CACHED) or the + * first reader's request will reach the read_done callback + * and send the dbuf to CACHED. Otherwise, a failure + * occurred and the dbuf went to UNCACHED. + */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, @@ -699,6 +707,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); + /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || @@ -1313,7 +1322,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } /* - * Return TRUE if this evicted the dbuf. + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -2324,6 +2334,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db_level > 0); DBUF_VERIFY(db); + /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); @@ -2334,10 +2345,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); + /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); @@ -2728,6 +2741,7 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); } +/* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { @@ -2762,11 +2776,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. + */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { + /* Our parent is the dnode itself. 
*/ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 9223b907b..34f3eeef9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1965,7 +1965,7 @@ dmu_init(void) void dmu_fini(void) { - arc_fini(); + arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1cad8d20e..caac60193 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1040,6 +1040,10 @@ dmu_tx_unassign(dmu_tx_t *tx) txg_rele_to_quiesce(&tx->tx_txgh); + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1157,6 +1161,10 @@ dmu_tx_commit(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); + /* + * Go through the transaction's hold list and remove holds on + * associated dnodes, notifying waiters if no holds remain. + */ while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 1763bae51..705478c82 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -48,11 +48,11 @@ unsigned int zfetch_block_cap = 256; unsigned long zfetch_array_rd_sz = 1024 * 1024; /* forward decls for static routines */ -static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); @@ -104,9 +104,9 @@ kstat_t *zfetch_ksp; * last stream, then we are probably in a strided access pattern. So * combine the two sequential streams into a single strided stream. * - * If no co-linear streams are found, return NULL. + * Returns whether co-linear streams were found. */ -static int +static boolean_t dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) { zstream_t *z_walk; @@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) * for this block read. If so, it starts a prefetch for the stream it * located and returns true, otherwise it returns false */ -static int +static boolean_t dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) { zstream_t *zs; @@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) { zstream_t zst; zstream_t *newstream; - int fetched; + boolean_t fetched; int inserted; unsigned int blkshft; uint64_t blksz; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bc9bf2cc3..c30107771 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -26,6 +26,8 @@ */ /* + * SPA: Storage Pool Allocator + * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. 
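The comments added above describe two halves of a single consumer-side contract: dmu_bonus_hold() expects the object to be held in an assigned transaction before dmu_buf_will_dirty() is called, and dmu_tx_commit() later walks the transaction's hold list, dropping dnode holds and waking waiters. A minimal sketch of that pattern follows; example_touch_bonus() is a hypothetical helper for illustration, not code from this patch, though the DMU calls themselves are the standard API:

	/*
	 * Illustrative sketch (not part of this change): the consumer-side
	 * usage pattern implied by the comments above.
	 */
	static int
	example_touch_bonus(objset_t *os, uint64_t object)
	{
		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_buf_t *db;
		int err;

		dmu_tx_hold_bonus(tx, object);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);	/* abort only before assignment succeeds */
			return (err);
		}

		/* Per the new header comment: returns ENOENT, EIO, or 0. */
		err = dmu_bonus_hold(os, object, FTAG, &db);
		if (err != 0) {
			dmu_tx_commit(tx);	/* assigned tx must be committed */
			return (err);
		}

		/* Legal: the object is held in an assigned transaction. */
		dmu_buf_will_dirty(db, tx);
		/* ... modify db->db_data here ... */

		/* Release what we hold. */
		dmu_buf_rele(db, FTAG);

		/*
		 * Walks the tx's hold list, releasing dnode holds and
		 * notifying waiters, as documented above.
		 */
		dmu_tx_commit(tx);
		return (0);
	}

Note that once dmu_tx_assign() succeeds, even error paths must end in dmu_tx_commit(); dmu_tx_abort() is only valid for an unassigned transaction.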
diff --git a/module/zfs/txg.c b/module/zfs/txg.c index c8a29e14f..697aa0905 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -354,6 +354,12 @@ txg_rele_to_sync(txg_handle_t *th) th->th_cpu = NULL; /* defensive */ } +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. + */ static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { @@ -409,6 +415,9 @@ txg_do_callbacks(list_t *cb_list) /* * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) @@ -419,7 +428,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; - /* No need to lock tx_cpu_t at this point */ + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. + */ int g = txg & TXG_MASK; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 07e66ebdc..0405608c2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1035,6 +1035,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } +/* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 130ec575e..d2dfd5b43 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -431,23 +431,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_cksum_report }; +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. + */ static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { @@ -1521,6 +1548,23 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. 
Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ static int vdev_raidz_io_start(zio_t *zio) { @@ -1864,6 +1908,27 @@ done: return (ret); } +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors. + */ static void vdev_raidz_io_done(zio_t *zio) { diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index ce084fff1..c08e9dd9b 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -368,6 +368,11 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) return (0); } +/* + * Gets the full dataset name that corresponds to the given snapshot name + * Example: + * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" + */ static int zfsctl_snapshot_zpath(struct path *path, int len, char *zpath) {
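The vdev_raidz_map_alloc() annotations above reduce the stripe geometry to a handful of formulas (q, r, bc, tot). A standalone sketch, assuming a hypothetical RAID-Z1 vdev with dcols = 5 children (4 data + 1 parity), a 512-byte minimum sector, and a 6-sector write at offset 0, evaluates those same expressions; none of this is ZFS code, just the arithmetic from the comments:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* Hypothetical geometry: RAID-Z1, 5 children, 512-byte sectors. */
		uint64_t unit_shift = 9;
		uint64_t dcols = 5, nparity = 1;
		uint64_t io_offset = 0, io_size = 6 << unit_shift;

		uint64_t b = io_offset >> unit_shift;	/* starting parent sector */
		uint64_t s = io_size >> unit_shift;	/* I/O size in sectors */
		uint64_t f = b % dcols;			/* first column of stripe */
		uint64_t q = s / (dcols - nparity);	/* full data rows */
		uint64_t r = s - q * (dcols - nparity);	/* remainder data sectors */
		uint64_t bc = (r == 0 ? 0 : r + nparity);	/* "big columns" */
		uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

		printf("first_col=%ju q=%ju r=%ju bc=%ju tot=%ju\n",
		    (uintmax_t)f, (uintmax_t)q, (uintmax_t)r,
		    (uintmax_t)bc, (uintmax_t)tot);
		return (0);
	}

For this geometry the sketch prints q=1 (one full 4-sector data row), r=2 (two leftover data sectors), bc=3 (the two remainder data columns plus one parity column), and tot=8, matching tot = s + nparity * (q + 1) for the total data and parity sectors issued to the children.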