3741 zfs needs better comments
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Approved by: Christopher Siden <christopher.siden@delphix.com>

References:
  https://www.illumos.org/issues/3741
  illumos/illumos-gate@3e30c24aee

Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775
Will Andrews 2013-06-11 09:12:34 -08:00 committed by Brian Behlendorf
parent b1118acbb1
commit e49f1e20a0
12 changed files with 154 additions and 11 deletions

include/sys/dmu.h

@@ -407,6 +407,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
  * buffer as well. You must release what you hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
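
To illustrate the hold/release contract the new comment documents, here is a minimal hedged sketch of a bonus-buffer consumer (the function name and error handling are illustrative, not part of this commit):

    static int
    example_read_bonus(objset_t *os, uint64_t object)
    {
            dmu_buf_t *db;
            int err;

            /* Take a hold on the bonus buffer; FTAG identifies the holder. */
            err = dmu_bonus_hold(os, object, FTAG, &db);
            if (err != 0)
                    return (err);   /* ENOENT or EIO, per the comment above */

            /* ... read db->db_data here ... */

            /* Release what we hold, as the comment requires. */
            dmu_buf_rele(db, FTAG);
            return (0);
    }
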
@@ -662,8 +664,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size. This is specifically optimized for zfs_getattr().
+ */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
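
And a hedged sketch of the size-only fast path described above, assuming a dbuf already held (for instance via dmu_bonus_hold() as shown earlier):

    uint32_t blksize;
    u_longlong_t nblk512;

    /* db must already be held; nblk512 is a count of 512-byte blocks. */
    dmu_object_size_from_db(db, &blksize, &nblk512);
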

lib/libzfs/libzfs_dataset.c

@@ -4791,6 +4791,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 	return (err);
 }
+/*
+ * Convert the zvol's volume size to an appropriate reservation.
+ * Note: If this routine is updated, it is necessary to update the ZFS test
+ * suite's shell version in reservation.kshlib.
+ */
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
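
A hedged usage sketch of the call documented above (illustrative values; props would come from the zvol's effective creation properties):

    uint64_t volsize = 10ULL << 30;         /* a 10 GiB zvol */
    uint64_t resv = zvol_volsize_to_reservation(volsize, props);
    /* resv covers volsize plus estimated metadata overhead. */
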

module/zfs/arc.c

@@ -260,7 +260,18 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_deleted;
 	kstat_named_t arcstat_recycle_miss;
+	/*
+	 * Number of buffers that could not be evicted because the hash lock
+	 * was held by another thread. The lock may not necessarily be held
+	 * by something using the same buffer, since hash locks are shared
+	 * by multiple buffers.
+	 */
 	kstat_named_t arcstat_mutex_miss;
+	/*
+	 * Number of buffers skipped because they have I/O in progress, are
+	 * indirect prefetch buffers that have not lived long enough, or are
+	 * not from the spa we're trying to evict from.
+	 */
 	kstat_named_t arcstat_evict_skip;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
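
The following self-contained userspace analogy (not ZFS code; all names are illustrative) shows the try-lock-or-skip pattern that arcstat_mutex_miss counts:

    #include <pthread.h>
    #include <stdint.h>

    typedef struct buf {
            pthread_mutex_t *b_hash_lock;   /* shared by several buffers */
            struct buf      *b_next;
    } buf_t;

    static uint64_t mutex_miss;             /* analog of arcstat_mutex_miss */

    static void
    evict_scan(buf_t *list)
    {
            for (buf_t *b = list; b != NULL; b = b->b_next) {
                    if (pthread_mutex_trylock(b->b_hash_lock) != 0) {
                            mutex_miss++;   /* lock busy: skip, don't block */
                            continue;
                    }
                    /* ... evict the buffer here ... */
                    pthread_mutex_unlock(b->b_hash_lock);
            }
    }
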
@@ -3174,6 +3185,10 @@ top:
 		mutex_exit(hash_lock);
+		/*
+		 * At this point, we have a level 1 cache miss. Try again in
+		 * L2ARC if possible.
+		 */
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 		    uint64_t, size, zbookmark_t *, zb);
@@ -3445,8 +3460,8 @@ arc_buf_evict(arc_buf_t *buf)
 }
 /*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
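
A hedged fragment of the calling discipline that comment implies (simplified; in practice the dbuf layer is the caller):

    /* After the read completes, detach the buffer from the cache... */
    arc_release(db->db_buf, db);
    /* ...and only then modify its contents. */
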

module/zfs/dbuf.c

@@ -691,6 +691,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED. Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED. Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -699,6 +707,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
+		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
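
For context, a condensed hedged paraphrase of the wait loop those lines begin (the states are the ones named in the new comment above; assertions omitted):

    mutex_enter(&db->db_mtx);
    if ((flags & DB_RF_NEVERWAIT) == 0) {
            /* Block until the in-flight read reaches CACHED or fails. */
            while (db->db_state == DB_READ || db->db_state == DB_FILL)
                    cv_wait(&db->db_changed, &db->db_mtx);
            if (db->db_state == DB_UNCACHED)
                    err = EIO;      /* the read failed */
    }
    mutex_exit(&db->db_mtx);
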
@@ -1313,7 +1322,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 /*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2324,6 +2334,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
+	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2334,10 +2345,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
+	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 	mutex_exit(&db->db_mtx);
@@ -2728,6 +2741,7 @@ dbuf_write_override_done(zio_t *zio)
 	dbuf_write_done(zio, NULL, db);
 }
+/* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
@@ -2762,11 +2776,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}
 	if (parent != dn->dn_dbuf) {
+		/* Our parent is an indirect block. */
+		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
+		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
+		/*
+		 * We're about to modify our parent's db_data by modifying
+		 * our block pointer, so the parent must be released.
+		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
+		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));

module/zfs/dmu.c

@@ -1965,7 +1965,7 @@ dmu_init(void)
 void
 dmu_fini(void)
 {
-	arc_fini();
+	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();

module/zfs/dmu_tx.c

@@ -1040,6 +1040,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
 	txg_rele_to_quiesce(&tx->tx_txgh);
+	/*
+	 * Walk the transaction's hold list, removing the hold on the
+	 * associated dnode, and notifying waiters if the refcount drops to 0.
+	 */
 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -1157,6 +1161,10 @@ dmu_tx_commit(dmu_tx_t *tx)
 	ASSERT(tx->tx_txg != 0);
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+	 */
 	while ((txh = list_head(&tx->tx_holds))) {
 		dnode_t *dn = txh->txh_dnode;
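
A hedged, simplified rendering of the loop body both comments describe (close to dmu_tx_commit(); assertions and edge cases omitted):

    while ((txh = list_head(&tx->tx_holds)) != NULL) {
            dnode_t *dn = txh->txh_dnode;

            list_remove(&tx->tx_holds, txh);
            kmem_free(txh, sizeof (dmu_tx_hold_t));
            if (dn == NULL)
                    continue;
            mutex_enter(&dn->dn_mtx);
            /* Last hold gone: wake anyone waiting for this dnode. */
            if (refcount_remove(&dn->dn_tx_holds, tx) == 0)
                    cv_broadcast(&dn->dn_notxholds);
            mutex_exit(&dn->dn_mtx);
            dnode_rele(dn, tx);
    }
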

module/zfs/dmu_zfetch.c

@@ -48,11 +48,11 @@ unsigned int zfetch_block_cap = 256;
 unsigned long zfetch_array_rd_sz = 1024 * 1024;
 /* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
 static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
 static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
 static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
 static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
 static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
 static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -104,9 +104,9 @@ kstat_t *zfetch_ksp;
  * last stream, then we are probably in a strided access pattern. So
  * combine the two sequential streams into a single strided stream.
  *
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
  */
-static int
+static boolean_t
 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
 {
 	zstream_t *z_walk;
@@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
  * for this block read. If so, it starts a prefetch for the stream it
  * located and returns true, otherwise it returns false.
  */
-static int
+static boolean_t
 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
 {
 	zstream_t *zs;
@@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 {
 	zstream_t zst;
 	zstream_t *newstream;
-	int fetched;
+	boolean_t fetched;
 	int inserted;
 	unsigned int blkshft;
 	uint64_t blksz;

module/zfs/spa.c

@@ -26,6 +26,8 @@
  */
 /*
+ * SPA: Storage Pool Allocator
+ *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.

module/zfs/txg.c

@@ -354,6 +354,12 @@ txg_rele_to_sync(txg_handle_t *th)
 	th->th_cpu = NULL;	/* defensive */
 }
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
@@ -409,6 +415,9 @@ txg_do_callbacks(list_t *cb_list)
 /*
  * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
  */
 static void
 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -419,7 +428,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
-		/* No need to lock tx_cpu_t at this point */
+		/*
+		 * No need to lock tx_cpu_t at this point, since this can
+		 * only be called once a txg has been synced.
+		 */
 		int g = txg & TXG_MASK;
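
For callers, a hedged sketch of how a commit callback ends up on the per-txg list this code dispatches (my_commit_cb and my_arg are illustrative names):

    static void
    my_commit_cb(void *arg, int error)
    {
            /* error == 0: the txg synced; nonzero: the tx was aborted. */
    }

    /* ... after dmu_tx_assign() succeeds ... */
    dmu_tx_callback_register(tx, my_commit_cb, my_arg);
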

module/zfs/vdev_label.c

@@ -1035,6 +1035,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 }
+/* Sync the uberblocks to all vdevs in svd[] */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {

module/zfs/vdev_raidz.c

@@ -431,23 +431,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	vdev_raidz_cksum_report
 };
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
 static raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_map_t *rm;
+	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> unit_shift;
+	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> unit_shift;
+	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
+	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+	/*
+	 * "Quotient": The number of data sectors for this stripe on all but
+	 * the "big column" child vdevs that also contain "remainder" data.
+	 */
 	q = s / (dcols - nparity);
+	/*
+	 * "Remainder": The number of partial stripe data sectors in this I/O.
+	 * This will add a sector to some, but not all, child vdevs.
+	 */
 	r = s - q * (dcols - nparity);
+	/* The number of "big columns" - those which contain remainder data. */
 	bc = (r == 0 ? 0 : r + nparity);
+	/*
+	 * The total number of data and parity sectors associated with
+	 * this I/O.
+	 */
 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
+	/* acols: The columns that will be accessed. */
+	/* scols: The columns that will be accessed or skipped. */
 	if (q == 0) {
+		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
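
A hedged worked example of this geometry, assuming a hypothetical 5-disk raidz1 (dcols = 5, nparity = 1) with 512-byte sectors (unit_shift = 9) and a 5120-byte I/O:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t dcols = 5, nparity = 1, unit_shift = 9;
            uint64_t io_size = 5120;
            uint64_t s = io_size >> unit_shift;          /* 10 sectors */
            uint64_t q = s / (dcols - nparity);          /* 2 full rows */
            uint64_t r = s - q * (dcols - nparity);      /* 2 leftover */
            uint64_t bc = (r == 0 ? 0 : r + nparity);    /* 3 big columns */
            uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* 13 */

            printf("q=%llu r=%llu bc=%llu tot=%llu\n",
                (unsigned long long)q, (unsigned long long)r,
                (unsigned long long)bc, (unsigned long long)tot);
            return (0);
    }

Since q > 0 in this example, the I/O spans every child, so acols = scols = dcols = 5.
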
@@ -1521,6 +1548,23 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_skipped = 0;
 }
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Generate the parity data
+ *   2. Create child zio write operations to each column's vdev, for both
+ *      data and parity.
+ *   3. If the column skips any sectors for padding, create optional dummy
+ *      write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ *   1. Create child zio read operations to each data column's vdev to read
+ *      the range of data required for zio.
+ *   2. If this is a scrub or resilver operation, or if any of the data
+ *      vdevs have had errors, then create zio read operations to the parity
+ *      columns' VDevs as well.
+ */
 static int
 vdev_raidz_io_start(zio_t *zio)
 {
@@ -1864,6 +1908,27 @@ done:
 	return (ret);
 }
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Check for errors on the child IOs.
+ *   2. Return, setting an error code if too few child VDevs were written
+ *      to reconstruct the data later. Note that partial writes are
+ *      considered successful if they can be reconstructed at all.
+ * - For read operations:
+ *   1. Check for errors on the child IOs.
+ *   2. If data errors occurred:
+ *      a. Try to reassemble the data from the parity available.
+ *      b. If we haven't yet read the parity drives, read them now.
+ *      c. If all parity drives have been read but the data still doesn't
+ *         reassemble with a correct checksum, then try combinatorial
+ *         reconstruction.
+ *      d. If that doesn't work, return an error.
+ *   3. If there were unexpected errors or this is a resilver operation,
+ *      rewrite the vdevs that had errors.
+ */
 static void
 vdev_raidz_io_done(zio_t *zio)
 {

module/zfs/zfs_ctldir.c

@@ -368,6 +368,11 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
 	return (0);
 }
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name
+ * Example:
+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
 static int
 zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
 {