OpenZFS 7578 - Fix/improve some aspects of ZIL writing

- After some ZIL changes 6 years ago zil_slog_limit got partially broken
due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
Actually because of other changes about that time zl_itx_list_sz is not
really required to implement the functionality, so this patch removes
some unneeded broken code and variables.

 - Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
single heavy logger, that increased latency for other (more latency critical)
loggers, by pushing heavy log out into the main pool instead of SLOG.  Beside
huge latency increase for heavy writers, this implementation caused double
write of all data, since the log records were explicitly prepared for SLOG.
Since we now have I/O scheduler, I've found it can be much more efficient
to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.

 - Existing ZIL implementation had problem with space efficiency when it
has to write large chunks of data into log blocks of limited size.  In some
cases efficiency stopped to almost as low as 50%.  In case of ZIL stored on
spinning rust, that also reduced log write speed in half, since head had to
uselessly fly over allocated but not written areas.  This change improves
the situation by offloading problematic operations from z*_log_write() to
zil_lwb_commit(), which knows real situation of log blocks allocation and
can split large requests into pieces much more efficiently.  Also as side
effect it removes one of two data copy operations done by ZIL code WR_COPIED
case.

 - While there, untangle and unify code of z*_log_write() functions.
Also zfs_log_write() alike to zvol_log_write() can now handle writes crossing
block boundary, that may also improve efficiency if ZPL is made to do that.

Sponsored by:   iXsystems, Inc.

Authored by: Alexander Motin <mav@FreeBSD.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Steven Hartland <steven.hartland@multiplay.co.uk>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Ported-by: Giuseppe Di Natale <dinatale2@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/7578
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/aeb13ac
Closes #6191
This commit is contained in:
Giuseppe Di Natale
2017-06-09 09:15:37 -07:00
committed by Brian Behlendorf
parent 82644107c4
commit 1b7c1e5ce9
10 changed files with 133 additions and 128 deletions
+2 -6
View File
@@ -63,7 +63,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
__field(uint64_t, zl_parse_lr_count)
__field(uint64_t, zl_next_batch)
__field(uint64_t, zl_com_batch)
__field(uint64_t, zl_itx_list_sz)
__field(uint64_t, zl_cur_used)
__field(clock_t, zl_replay_time)
__field(uint64_t, zl_replay_blks)
@@ -88,7 +87,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
__entry->zl_parse_lr_count = zilog->zl_parse_lr_count;
__entry->zl_next_batch = zilog->zl_next_batch;
__entry->zl_com_batch = zilog->zl_com_batch;
__entry->zl_itx_list_sz = zilog->zl_itx_list_sz;
__entry->zl_cur_used = zilog->zl_cur_used;
__entry->zl_replay_time = zilog->zl_replay_time;
__entry->zl_replay_blks = zilog->zl_replay_blks;
@@ -98,8 +96,7 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
"replay %u stop_sync %u writer %u logbias %u sync %u "
"parse_error %u parse_blk_seq %llu parse_lr_seq %llu "
"parse_blk_count %llu parse_lr_count %llu next_batch %llu "
"com_batch %llu itx_list_sz %llu cur_used %llu replay_time %lu "
"replay_blks %llu }",
"com_batch %llu cur_used %llu replay_time %lu replay_blks %llu }",
__entry->zl_lr_seq, __entry->zl_commit_lr_seq,
__entry->zl_destroy_txg, __entry->zl_replaying_seq,
__entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first,
@@ -107,8 +104,7 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
__entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error,
__entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq,
__entry->zl_parse_blk_count, __entry->zl_parse_lr_count,
__entry->zl_next_batch, __entry->zl_com_batch,
__entry->zl_itx_list_sz, __entry->zl_cur_used,
__entry->zl_next_batch, __entry->zl_com_batch, __entry->zl_cur_used,
__entry->zl_replay_time, __entry->zl_replay_blks)
);
/* END CSTYLED */
-1
View File
@@ -394,7 +394,6 @@ typedef struct itx {
uint8_t itx_sync; /* synchronous transaction */
zil_callback_t itx_callback; /* Called when the itx is persistent */
void *itx_callback_data; /* User data for the callback */
uint64_t itx_sod; /* record size on disk */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
+18 -2
View File
@@ -42,6 +42,7 @@ typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
int lwb_nused; /* # used bytes in buffer */
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
@@ -62,7 +63,6 @@ typedef struct itxs {
typedef struct itxg {
kmutex_t itxg_lock; /* lock for this structure */
uint64_t itxg_txg; /* txg for this chain */
uint64_t itxg_sod; /* total size on disk for this txg */
itxs_t *itxg_itxs; /* sync and async itxs */
} itxg_t;
@@ -120,7 +120,6 @@ struct zilog {
kcondvar_t zl_cv_batch[2]; /* batch condition variables */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
@@ -140,9 +139,26 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
/*
* Maximum amount of write data that can be put into single log block.
*/
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
/*
* Maximum amount of log space we agree to waste to reduce number of
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
*/
#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
/*
* Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY
* as more space efficient if we can't fit at least two log records into
* maximum sized log block.
*/
#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
#ifdef __cplusplus
}
#endif
+1 -1
View File
@@ -515,7 +515,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t use_slog);
uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);