From 4bda3bd0e72d582a785b6552ce16b99e04414fbe Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Thu, 2 Jul 2015 18:23:20 +0200
Subject: [PATCH] Illumos 5911 - ZFS "hangs" while deleting file

5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell
Reviewed by: Alek Pinchuk
Reviewed by: Simon Klinkert
Reviewed by: Dan McDonald
Approved by: Richard Lowe

References:
  https://www.illumos.org/issues/5911
  https://github.com/illumos/illumos-gate/commit/46e1baa

Porting notes:
Resolved an "ISO C90 forbids mixed declarations and code" warning
in the dnode_free_range() function.

Ported-by: kernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf
Closes #3554
---
 include/sys/dbuf.h      |  4 +--
 module/zfs/dbuf.c       | 34 ++++++++++--------
 module/zfs/dmu_tx.c     |  4 +--
 module/zfs/dnode.c      | 79 +++++++++++++++++++++++++++++++++--------
 module/zfs/dnode_sync.c |  4 +--
 5 files changed, 91 insertions(+), 34 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index dd45261a3..94d326d57 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -287,7 +287,7 @@ void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 4e0f857c5..c7b6a5d9a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -1455,6 +1455,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
+
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1477,11 +1487,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	ASSERT(db->db.db_size != 0);
 
-	/*
-	 * Any space we accounted for in dp_dirty_* will be cleaned up by
-	 * dsl_pool_sync().  This is relatively rare so the discrepancy
-	 * is not a big deal.
-	 */
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
@@ -1496,7 +1503,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
-	    db->db_level+1 == dn->dn_nlevels) {
+	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1513,11 +1520,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 
-	if (db->db_level != 0) {
-		mutex_destroy(&dr->dt.di.dr_mtx);
-		list_destroy(&dr->dt.di.dr_children);
-	}
-
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
@@ -2603,7 +2605,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
-	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
@@ -2754,7 +2756,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
@@ -2771,6 +2773,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 62a8d471e..5ae429f70 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */
 
 #include <sys/dmu.h>
@@ -679,7 +679,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH)
+			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 6865cd01e..2858bbfb4 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -1517,6 +1517,16 @@ out:
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
+	}
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1637,27 +1647,68 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		nblks += 1;
 
 	/*
-	 * Dirty the first and last indirect blocks, as they (and/or their
-	 * parents) will need to be written out if they were only
-	 * partially freed.  Interior indirect blocks will be themselves freed,
-	 * by free_children(), so they need not be dirtied.  Note that these
-	 * interior blocks have already been prefetched by dmu_tx_hold_free().
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
-		uint64_t first, last;
+		uint64_t first, last, i, ibyte;
+		int shift, err;
 
 		first = blkid >> epbs;
-		if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
-		}
+		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
-		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
+
+		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
+			ibyte = i << shift;
+			err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (i >= last)
+				break;
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
 		}
 	}
 
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 6f668364b..a8fa9a952 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -718,7 +718,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	dbuf_sync_list(list, tx);
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);