diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index b5ab7dbae..620244556 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -353,7 +353,8 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag); + znode_t *zp, offset_t off, ssize_t len, int ioflag, + zil_callback_t callback, void *callback_data); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index d3e4b8ec6..b6718b93c 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -361,11 +361,15 @@ typedef enum { WR_NUM_STATES /* number of states */ } itx_wr_state_t; +typedef void (*zil_callback_t)(void *data); + typedef struct itx { list_node_t itx_node; /* linkage on zl_itx_list */ void *itx_private; /* type-specific opaque data */ itx_wr_state_t itx_wr_state; /* write state */ uint8_t itx_sync; /* synchronous transaction */ + zil_callback_t itx_callback; /* Called when the itx is persistent */ + void *itx_callback_data; /* User data for the callback */ uint64_t itx_sod; /* record size on disk */ uint64_t itx_oid; /* object id */ lr_t itx_lr; /* common part of log record */ diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 0bb44234d..cfce83138 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -445,21 +445,27 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * Handles TX_WRITE transactions. + * zfs_log_write() handles TX_WRITE transactions. The specified callback is + * called as soon as the write is on stable storage (be it via a DMU sync or a + * ZIL commit). */ long zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) + znode_t *zp, offset_t off, ssize_t resid, int ioflag, + zil_callback_t callback, void *callback_data) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; ssize_t immediate_write_sz; - if (zil_replaying(zilog, tx) || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) { + if (callback != NULL) + callback(callback_data); return; + } immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ? 0 : (ssize_t)zfs_immediate_write_sz; @@ -516,6 +522,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (fsync_cnt == 0)) itx->itx_sync = B_FALSE; + itx->itx_callback = callback; + itx->itx_callback_data = callback_data; zil_itx_assign(zilog, itx, tx); off += len; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 84b4fe81f..abf3747db 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -892,7 +892,8 @@ again: error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); dmu_tx_commit(tx); if (error != 0) @@ -3822,19 +3823,11 @@ top: EXPORT_SYMBOL(zfs_link); static void -zfs_putpage_commit_cb(void *arg, int error) +zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; - if (error) { - __set_page_dirty_nobuffers(pp); - - if (error != ECANCELED) - SetPageError(pp); - } else { - ClearPageError(pp); - } - + ClearPageError(pp); end_page_writeback(pp); } @@ -3868,7 +3861,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; - int sync; ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); @@ -3909,11 +3901,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER); tx = dmu_tx_create(zsb->z_os); - sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) || - (wbc->sync_mode == WB_SYNC_ALL)); - if (!sync) - dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp); - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); @@ -3923,16 +3910,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) if (err == ERESTART) dmu_tx_wait(tx); - /* Will call all registered commit callbacks */ dmu_tx_abort(tx); - - /* - * For the synchronous case the commit callback must be - * explicitly called because there is no registered callback. - */ - if (sync) - zfs_putpage_commit_cb(pp, ECANCELED); - + __set_page_dirty_nobuffers(pp); + ClearPageError(pp); + end_page_writeback(pp); zfs_range_unlock(rl); ZFS_EXIT(zsb); return (err); @@ -3955,14 +3936,19 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0); + zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, + zfs_putpage_commit_cb, pp); dmu_tx_commit(tx); zfs_range_unlock(rl); - if (sync) { + if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * Note that this is rarely called under writepages(), because + * writepages() normally handles the entire commit for + * performance reasons. + */ zil_commit(zsb->z_log, zp->z_id); - zfs_putpage_commit_cb(pp, err); } ZFS_EXIT(zsb); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3688c8a16..839afa956 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1184,6 +1184,8 @@ zil_itx_create(uint64_t txtype, size_t lrsize) itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ itx->itx_sync = B_TRUE; /* default is synchronous */ + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; return (itx); } @@ -1209,6 +1211,8 @@ zil_itxg_clean(itxs_t *itxs) list = &itxs->i_sync_list; while ((itx = list_head(list)) != NULL) { + if (itx->itx_callback != NULL) + itx->itx_callback(itx->itx_callback_data); list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); @@ -1219,6 +1223,8 @@ zil_itxg_clean(itxs_t *itxs) while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_head(list)) != NULL) { + if (itx->itx_callback != NULL) + itx->itx_callback(itx->itx_callback_data); list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); @@ -1285,6 +1291,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) mutex_exit(&itxg->itxg_lock); } while ((itx = list_head(&clean_list)) != NULL) { + if (itx->itx_callback != NULL) + itx->itx_callback(itx->itx_callback_data); list_remove(&clean_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); @@ -1530,15 +1538,13 @@ zil_commit_writer(zilog_t *zilog) } DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); - while ((itx = list_head(&zilog->zl_itx_commit_list))) { + for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL; + itx = list_next(&zilog->zl_itx_commit_list, itx)) { txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); - list_remove(&zilog->zl_itx_commit_list, itx); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); @@ -1560,6 +1566,17 @@ zil_commit_writer(zilog_t *zilog) if (error || lwb == NULL) txg_wait_synced(zilog->zl_dmu_pool, 0); + while ((itx = list_head(&zilog->zl_itx_commit_list))) { + txg = itx->itx_lr.lrc_txg; + ASSERT(txg); + + if (itx->itx_callback != NULL) + itx->itx_callback(itx->itx_callback_data); + list_remove(&zilog->zl_itx_commit_list, itx); + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + } + mutex_enter(&zilog->zl_lock); /* diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c index 8054645c1..0d46eee02 100644 --- a/module/zfs/zpl_file.c +++ b/module/zfs/zpl_file.c @@ -23,6 +23,7 @@ */ +#include #include #include #include @@ -420,7 +421,43 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return write_cache_pages(mapping, wbc, zpl_putpage, mapping); + znode_t *zp = ITOZ(mapping->host); + zfs_sb_t *zsb = ITOZSB(mapping->host); + enum writeback_sync_modes sync_mode; + int result; + + ZFS_ENTER(zsb); + if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + ZFS_EXIT(zsb); + sync_mode = wbc->sync_mode; + + /* + * We don't want to run write_cache_pages() in SYNC mode here, because + * that would make putpage() wait for a single page to be committed to + * disk every single time, resulting in atrocious performance. Instead + * we run it once in non-SYNC mode so that the ZIL gets all the data, + * and then we commit it all in one go. + */ + wbc->sync_mode = WB_SYNC_NONE; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + if (sync_mode != wbc->sync_mode) { + ZFS_ENTER(zsb); + ZFS_VERIFY_ZP(zp); + zil_commit(zsb->z_log, zp->z_id); + ZFS_EXIT(zsb); + + /* + * We need to call write_cache_pages() again (we can't just + * return after the commit) because the previous call in + * non-SYNC mode does not guarantee that we got all the dirty + * pages (see the implementation of write_cache_pages() for + * details). That being said, this is a no-op in most cases. + */ + wbc->sync_mode = sync_mode; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + } + return (result); } /* @@ -432,7 +469,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { - return zpl_putpage(pp, wbc, pp->mapping); + if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + + return (zpl_putpage(pp, wbc, pp->mapping)); } /*