From 142e6dd100eb70ef06f39015a2e54cbd74172f8b Mon Sep 17 00:00:00 2001
From: Etienne Dechamps <etienne.dechamps@ovh.net>
Date: Wed, 27 Jun 2012 10:26:49 +0200
Subject: [PATCH 1/3] Add atomic_sub_* functions to libspl.

Both the SPL and the ZFS libspl export most of the atomic_* functions,
except atomic_sub_* functions which are only exported by the SPL, not by
libspl. This patch remedies that by implementing atomic_sub_* functions
in libspl.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
---
 lib/libspl/asm-generic/atomic.c |  56 +++++++++++++++++
 lib/libspl/asm-i386/atomic.S    | 106 ++++++++++++++++++++++++++++++++
 lib/libspl/asm-x86_64/atomic.S  |  92 +++++++++++++++++++++++++++
 lib/libspl/include/atomic.h     |  30 +++++++++
 4 files changed, 284 insertions(+)

diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c
index de4430f9f..a3223eadc 100644
--- a/lib/libspl/asm-generic/atomic.c
+++ b/lib/libspl/asm-generic/atomic.c
@@ -103,6 +103,31 @@ void atomic_add_ptr(volatile void *target, ssize_t bits)
 }
 
 
+#define ATOMIC_SUB(name, type1, type2) \
+	void atomic_sub_##name(volatile type1 *target, type2 bits)	\
+	{								\
+		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);	\
+		*target -= bits;					\
+		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);	\
+	}
+
+ATOMIC_SUB(8, uint8_t, int8_t)
+ATOMIC_SUB(char, uchar_t, signed char)
+ATOMIC_SUB(16, uint16_t, int16_t)
+ATOMIC_SUB(short, ushort_t, short)
+ATOMIC_SUB(32, uint32_t, int32_t)
+ATOMIC_SUB(int, uint_t, int)
+ATOMIC_SUB(long, ulong_t, long)
+ATOMIC_SUB(64, uint64_t, int64_t)
+
+void atomic_sub_ptr(volatile void *target, ssize_t bits)
+{
+	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
+	*(caddr_t *)target -= bits;
+	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
+}
+
+
 #define ATOMIC_OR(name, type) \
 	void atomic_or_##name(volatile type *target, type bits)		\
 	{								\
@@ -216,6 +241,37 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits)
 }
 
 
+#define ATOMIC_SUB_NV(name, type1, type2) \
+	type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\
+	{								\
+		type1 rc;						\
+		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);	\
+		rc = (*target -= bits);					\
+		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);	\
+		return rc;						\
+	}
+
+ATOMIC_SUB_NV(8, uint8_t, int8_t)
+ATOMIC_SUB_NV(char, uchar_t, signed char)
+ATOMIC_SUB_NV(16, uint16_t, int16_t)
+ATOMIC_SUB_NV(short, ushort_t, short)
+ATOMIC_SUB_NV(32, uint32_t, int32_t)
+ATOMIC_SUB_NV(int, uint_t, int)
+ATOMIC_SUB_NV(long, ulong_t, long)
+ATOMIC_SUB_NV(64, uint64_t, int64_t)
+
+void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits)
+{
+	void *ptr;
+
+	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
+	ptr = (*(caddr_t *)target -= bits);
+	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
+
+	return ptr;
+}
+
+
 #define ATOMIC_OR_NV(name, type) \
 	type atomic_or_##name##_nv(volatile type *target, type bits)	\
 	{								\
diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S
index 93c04bfb8..d3d425090 100644
--- a/lib/libspl/asm-i386/atomic.S
+++ b/lib/libspl/asm-i386/atomic.S
@@ -271,6 +271,40 @@
 	SET_SIZE(atomic_add_int)
 	SET_SIZE(atomic_add_32)
 
+	ENTRY(atomic_sub_8)
+	ALTENTRY(atomic_sub_char)
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	lock
+	subb	%cl, (%eax)
+	ret
+	SET_SIZE(atomic_sub_char)
+	SET_SIZE(atomic_sub_8)
+
+	ENTRY(atomic_sub_16)
+	ALTENTRY(atomic_sub_short)
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	lock
+	subw	%cx, (%eax)
+	ret
+	SET_SIZE(atomic_sub_short)
+	SET_SIZE(atomic_sub_16)
+
+	ENTRY(atomic_sub_32)
+	ALTENTRY(atomic_sub_int)
+	ALTENTRY(atomic_sub_ptr)
+	ALTENTRY(atomic_sub_long)
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	lock
+	subl	%ecx, (%eax)
+	ret
+	SET_SIZE(atomic_sub_long)
+	SET_SIZE(atomic_sub_ptr)
+	SET_SIZE(atomic_sub_int)
+	SET_SIZE(atomic_sub_32)
+
 	ENTRY(atomic_or_8)
 	ALTENTRY(atomic_or_uchar)
 	movl	4(%esp), %eax
@@ -384,6 +418,55 @@
 	SET_SIZE(atomic_add_int_nv)
 	SET_SIZE(atomic_add_32_nv)
 
+	ENTRY(atomic_sub_8_nv)		/* i386: *target -= delta; return new */
+	ALTENTRY(atomic_sub_char_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movb	(%edx), %al		/* %al = current value of *target */
+1:
+	movb	%al, %cl		/* start from the old value ... */
+	subb	8(%esp), %cl		/* ... %cl = old - delta (dst -= src) */
+	lock
+	cmpxchgb %cl, (%edx)		/* store %cl iff *target is still %al */
+	jne	1b			/* raced: %al was reloaded, recompute */
+	movzbl	%cl, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_char_nv)
+	SET_SIZE(atomic_sub_8_nv)
+
+	ENTRY(atomic_sub_16_nv)		/* i386: *target -= delta; return new */
+	ALTENTRY(atomic_sub_short_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movw	(%edx), %ax		/* %ax = current value of *target */
+1:
+	movw	%ax, %cx		/* start from the old value ... */
+	subw	8(%esp), %cx		/* ... %cx = old - delta (dst -= src) */
+	lock
+	cmpxchgw %cx, (%edx)		/* store %cx iff *target is still %ax */
+	jne	1b			/* raced: %ax was reloaded, recompute */
+	movzwl	%cx, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_short_nv)
+	SET_SIZE(atomic_sub_16_nv)
+
+	ENTRY(atomic_sub_32_nv)		/* i386: *target -= delta; return new */
+	ALTENTRY(atomic_sub_int_nv)
+	ALTENTRY(atomic_sub_ptr_nv)
+	ALTENTRY(atomic_sub_long_nv)
+	movl	4(%esp), %edx		/* %edx = target address */
+	movl	(%edx), %eax		/* %eax = current value of *target */
+1:
+	movl	%eax, %ecx		/* start from the old value ... */
+	subl	8(%esp), %ecx		/* ... %ecx = old - delta (dst -= src) */
+	lock
+	cmpxchgl %ecx, (%edx)		/* store %ecx iff *target is still %eax */
+	jne	1b			/* raced: %eax was reloaded, recompute */
+	movl	%ecx, %eax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_long_nv)
+	SET_SIZE(atomic_sub_ptr_nv)
+	SET_SIZE(atomic_sub_int_nv)
+	SET_SIZE(atomic_sub_32_nv)
+
 	/*
 	 * NOTE: If atomic_add_64 and atomic_add_64_nv are ever
 	 * separated, it is important to edit the libc i386 platform
@@ -413,6 +496,29 @@
 	SET_SIZE(atomic_add_64_nv)
 	SET_SIZE(atomic_add_64)
 
+	ENTRY(atomic_sub_64)		/* i386: 64-bit *target -= delta */
+	ALTENTRY(atomic_sub_64_nv)	/* shared body; new value in %edx:%eax */
+	pushl	%edi
+	pushl	%ebx
+	movl	12(%esp), %edi		/* %edi = target address */
+	movl	(%edi), %eax		/* %edx:%eax = current 64-bit value */
+	movl	4(%edi), %edx
+1:
+	movl	%eax, %ebx		/* %ecx:%ebx = old value (mov keeps flags) */
+	movl	%edx, %ecx
+	subl	16(%esp), %ebx		/* low word:  old.lo - delta.lo */
+	sbbl	20(%esp), %ecx		/* high word: old.hi - delta.hi - borrow */
+	lock
+	cmpxchg8b (%edi)		/* store %ecx:%ebx iff *target == %edx:%eax */
+	jne	1b			/* raced: %edx:%eax reloaded, recompute */
+	movl	%ebx, %eax		/* return the new value in %edx:%eax */
+	movl	%ecx, %edx
+	popl	%ebx
+	popl	%edi
+	ret
+	SET_SIZE(atomic_sub_64_nv)
+	SET_SIZE(atomic_sub_64)
+
 	ENTRY(atomic_or_8_nv)
 	ALTENTRY(atomic_or_uchar_nv)
 	movl	4(%esp), %edx
diff --git a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S
index e321bf732..49c9b2ad1 100644
--- a/lib/libspl/asm-x86_64/atomic.S
+++ b/lib/libspl/asm-x86_64/atomic.S
@@ -232,6 +232,40 @@
 	SET_SIZE(atomic_add_ptr)
 	SET_SIZE(atomic_add_64)
 
+	ENTRY(atomic_sub_8)
+	ALTENTRY(atomic_sub_char)
+	lock
+	subb	%sil, (%rdi)
+	ret
+	SET_SIZE(atomic_sub_char)
+	SET_SIZE(atomic_sub_8)
+
+	ENTRY(atomic_sub_16)
+	ALTENTRY(atomic_sub_short)
+	lock
+	subw	%si, (%rdi)
+	ret
+	SET_SIZE(atomic_sub_short)
+	SET_SIZE(atomic_sub_16)
+
+	ENTRY(atomic_sub_32)
+	ALTENTRY(atomic_sub_int)
+	lock
+	subl	%esi, (%rdi)
+	ret
+	SET_SIZE(atomic_sub_int)
+	SET_SIZE(atomic_sub_32)
+
+	ENTRY(atomic_sub_64)
+	ALTENTRY(atomic_sub_ptr)
+	ALTENTRY(atomic_sub_long)
+	lock
+	subq	%rsi, (%rdi)
+	ret
+	SET_SIZE(atomic_sub_long)
+	SET_SIZE(atomic_sub_ptr)
+	SET_SIZE(atomic_sub_64)
+
 	ENTRY(atomic_or_8)
 	ALTENTRY(atomic_or_uchar)
 	lock
@@ -354,6 +388,64 @@
 	SET_SIZE(atomic_add_ptr_nv)
 	SET_SIZE(atomic_add_64_nv)
 
+	ENTRY(atomic_sub_8_nv)		/* x86_64: *target -= delta; return new */
+	ALTENTRY(atomic_sub_char_nv)
+	movb	(%rdi), %al		/* %al = current value of *target */
+1:
+	movb	%al, %cl		/* start from the old value ... */
+	subb	%sil, %cl		/* ... %cl = old - delta (dst -= src) */
+	lock
+	cmpxchgb %cl, (%rdi)		/* store %cl iff *target is still %al */
+	jne	1b			/* raced: %al was reloaded, recompute */
+	movzbl	%cl, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_char_nv)
+	SET_SIZE(atomic_sub_8_nv)
+
+	ENTRY(atomic_sub_16_nv)		/* x86_64: *target -= delta; return new */
+	ALTENTRY(atomic_sub_short_nv)
+	movw	(%rdi), %ax		/* %ax = current value of *target */
+1:
+	movw	%ax, %cx		/* start from the old value ... */
+	subw	%si, %cx		/* ... %cx = old - delta (dst -= src) */
+	lock
+	cmpxchgw %cx, (%rdi)		/* store %cx iff *target is still %ax */
+	jne	1b			/* raced: %ax was reloaded, recompute */
+	movzwl	%cx, %eax		/* return the new value, zero-extended */
+	ret
+	SET_SIZE(atomic_sub_short_nv)
+	SET_SIZE(atomic_sub_16_nv)
+
+	ENTRY(atomic_sub_32_nv)		/* x86_64: *target -= delta; return new */
+	ALTENTRY(atomic_sub_int_nv)
+	movl	(%rdi), %eax		/* %eax = current value of *target */
+1:
+	movl	%eax, %ecx		/* start from the old value ... */
+	subl	%esi, %ecx		/* ... %ecx = old - delta (dst -= src) */
+	lock
+	cmpxchgl %ecx, (%rdi)		/* store %ecx iff *target is still %eax */
+	jne	1b			/* raced: %eax was reloaded, recompute */
+	movl	%ecx, %eax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_int_nv)
+	SET_SIZE(atomic_sub_32_nv)
+
+	ENTRY(atomic_sub_64_nv)		/* x86_64: *target -= delta; return new */
+	ALTENTRY(atomic_sub_ptr_nv)
+	ALTENTRY(atomic_sub_long_nv)
+	movq	(%rdi), %rax		/* %rax = current value of *target */
+1:
+	movq	%rax, %rcx		/* start from the old value ... */
+	subq	%rsi, %rcx		/* ... %rcx = old - delta (dst -= src) */
+	lock
+	cmpxchgq %rcx, (%rdi)		/* store %rcx iff *target is still %rax */
+	jne	1b			/* raced: %rax was reloaded, recompute */
+	movq	%rcx, %rax		/* return the new value */
+	ret
+	SET_SIZE(atomic_sub_long_nv)
+	SET_SIZE(atomic_sub_ptr_nv)
+	SET_SIZE(atomic_sub_64_nv)
+
 	ENTRY(atomic_and_8_nv)
 	ALTENTRY(atomic_and_uchar_nv)
 	movb	(%rdi), %al
diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h
index 508000152..9b0775bb9 100644
--- a/lib/libspl/include/atomic.h
+++ b/lib/libspl/include/atomic.h
@@ -78,6 +78,21 @@ extern void atomic_add_long(volatile ulong_t *, long);
 extern void atomic_add_64(volatile uint64_t *, int64_t);
 #endif
 
+/*
+ * Subtract delta from target
+ */
+extern void atomic_sub_8(volatile uint8_t *, int8_t);
+extern void atomic_sub_char(volatile uchar_t *, signed char);
+extern void atomic_sub_16(volatile uint16_t *, int16_t);
+extern void atomic_sub_short(volatile ushort_t *, short);
+extern void atomic_sub_32(volatile uint32_t *, int32_t);
+extern void atomic_sub_int(volatile uint_t *, int);
+extern void atomic_sub_ptr(volatile void *, ssize_t);
+extern void atomic_sub_long(volatile ulong_t *, long);
+#if defined(_INT64_TYPE)
+extern void atomic_sub_64(volatile uint64_t *, int64_t);
+#endif
+
 /*
  * logical OR bits with target
  */
@@ -157,6 +172,21 @@ extern ulong_t atomic_add_long_nv(volatile ulong_t *, long);
 extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t);
 #endif
 
+/*
+ * Subtract delta from target and return new value.
+ */
+extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t);
+extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char);
+extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t);
+extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short);
+extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t);
+extern uint_t atomic_sub_int_nv(volatile uint_t *, int);
+extern void *atomic_sub_ptr_nv(volatile void *, ssize_t);
+extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long);
+#if defined(_INT64_TYPE)
+extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t);
+#endif
+
 /*
  * logical OR bits with target and return new value.
  */

From 920dd524fb2997225d4b1ac180bcbc14b045fda6 Mon Sep 17 00:00:00 2001
From: Etienne Dechamps <etienne.dechamps@ovh.net>
Date: Wed, 27 Jun 2012 15:20:20 +0200
Subject: [PATCH 2/3] Add FASTWRITE algorithm for synchronous writes.

Currently, ZIL blocks are spread over vdevs using hint block pointers
managed by the ZIL commit code and passed to metaslab_alloc(). Spreading
log blocks across vdevs is important for performance: indeed, using
multiple disks in parallel decreases the ZIL commit latency, which is
the main performance metric for synchronous writes. However, the current
implementation suffers from the following issues:

1) It would be best if the ZIL module was not aware of such low-level
details. They should be handled by the ZIO and metaslab modules;

2) Because the hint block pointer is managed per log, simultaneous
commits from multiple logs might use the same vdevs at the same time,
which is inefficient;

3) Because dmu_write() does not honor the block pointer hint, indirect
writes are not spread.

The naive solution of rotating the metaslab rotor each time a block is
allocated for the ZIL or dmu_sync() doesn't work in practice because the
first ZIL block to be written is actually allocated during the previous
commit. Consequently, when metaslab_alloc() decides the vdev for this
block, it will do so while a bunch of other allocations are happening at
the same time (from dmu_sync() and other ZILs). This means the vdev for
this block is chosen more or less at random. When the next commit
happens, there is a high chance (especially when the number of blocks
per commit is slightly less than the number of the disks) that one disk
will have to write two blocks (with a potential seek) while other disks
are sitting idle, which defeats spreading and increases the commit
latency.

This commit introduces a new concept in the metaslab allocator:
fastwrites. Basically, each top-level vdev maintains a counter
indicating the number of synchronous writes (from dmu_sync() and the
ZIL) which have been allocated but not yet completed. When the metaslab
is called with the FASTWRITE flag, it will choose the vdev with the
least amount of pending synchronous writes. If there are multiple vdevs
with the same value, the first matching vdev (starting from the rotor)
is used. Once metaslab_alloc() has decided which vdev the block is
allocated to, it updates the fastwrite counter for this vdev.

The rationale goes like this: when an allocation is done with
FASTWRITE, it "reserves" the vdev until the data is written. Until then,
all future allocations will naturally avoid this vdev, even after a full
rotation of the rotor. As a result, pending synchronous writes at a
given point in time will be nicely spread over all vdevs. This contrasts
with the previous algorithm, which is based on the implicit assumption
that blocks are written instantaneously after they're allocated.

metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to
manually increase or decrease fastwrite counters, respectively. They
should be used with caution, as there is no per-BP tracking of fastwrite
information, so leaks and "double-unmarks" are possible. There is,
however, an assert in the vdev teardown code which will fire if the
fastwrite counters are not zero when the pool is exported or the vdev
removed. Note that as stated above, marking is also done implicitly by
metaslab_alloc().

ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to
the metaslab when allocating (assuming ZIO does the allocation, which is
only true in the case of dmu_sync). This flag will also trigger an
unmark when zio_done() fires.

A side-effect of the new algorithm is that when a ZIL stops being used,
its last block can stay in the pending state (allocated but not yet
written) for a long time, polluting the fastwrite counters. To avoid
that, I've implemented a somewhat crude but working solution which
unmarks these pending blocks in zil_sync(), thus guaranteeing that
lingering fastwrites will get pruned at each sync event.

The best performance improvements are observed with pools using a large
number of top-level vdevs and heavy synchronous write workflows
(especially indirect writes and concurrent writes from multiple ZILs).
Real-life testing shows a 200% to 300% performance increase with
indirect writes and various commit sizes.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
---
 include/sys/metaslab.h      |  3 ++
 include/sys/metaslab_impl.h |  1 +
 include/sys/vdev_impl.h     |  1 +
 include/sys/zil_impl.h      |  1 +
 include/sys/zio.h           |  5 +--
 module/zfs/dmu.c            |  4 +--
 module/zfs/metaslab.c       | 72 +++++++++++++++++++++++++++++++++++--
 module/zfs/vdev.c           |  2 ++
 module/zfs/zil.c            | 52 ++++++++++++++++++++++-----
 module/zfs/zio.c            | 23 ++++++++----
 10 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 2cf4d2b48..99912424b 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -50,12 +50,15 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
 #define	METASLAB_GANG_HEADER	0x2
 #define	METASLAB_GANG_CHILD	0x4
 #define	METASLAB_GANG_AVOID	0x8
+#define	METASLAB_FASTWRITE	0x10
 
 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
 extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
+extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
 
 extern metaslab_class_t *metaslab_class_create(spa_t *spa,
     space_map_ops_t *ops);
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 6c670a162..658359478 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -46,6 +46,7 @@ struct metaslab_class {
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
 	uint64_t		mc_dspace;	/* total deflated space */
+	kmutex_t		mc_fastwrite_lock;
 };
 
 struct metaslab_group {
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 5bd432beb..0b532dcdd 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -156,6 +156,7 @@ struct vdev {
 	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
 	metaslab_group_t *vdev_mg;	/* metaslab group		*/
 	metaslab_t	**vdev_ms;	/* metaslab array		*/
+	uint64_t	vdev_pending_fastwrite; /* allocated fastwrites */
 	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 1d4c0cc6c..6c37c1ac2 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -40,6 +40,7 @@ extern "C" {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t       lwb_fastwrite;  /* is blk marked for fastwrite? */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 4f20cab65..289238c36 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -193,7 +193,8 @@ enum zio_flag {
 	ZIO_FLAG_RAW		= 1 << 21,
 	ZIO_FLAG_GANG_CHILD	= 1 << 22,
 	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24
+	ZIO_FLAG_GODFATHER	= 1 << 24,
+	ZIO_FLAG_FASTWRITE      = 1 << 25
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -475,7 +476,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    uint64_t size, boolean_t use_slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 1d4d1257d..e2abf8cf2 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1440,7 +1440,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb));
 
 	return (0);
 }
@@ -1564,7 +1564,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
 	    dmu_sync_ready, dmu_sync_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb));
 
 	return (0);
 }
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d06012ffb..d199921b7 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -107,6 +107,7 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
+	mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (mc);
 }
@@ -120,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
+	mutex_destroy(&mc->mc_fastwrite_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -1307,7 +1309,7 @@ static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
 {
-	metaslab_group_t *mg, *rotor;
+	metaslab_group_t *mg, *fast_mg, *rotor;
 	vdev_t *vd;
 	int dshift = 3;
 	int all_zero;
@@ -1325,6 +1327,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
 		return (ENOSPC);
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_enter(&mc->mc_fastwrite_lock);
+
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_aliquot because
@@ -1367,6 +1372,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
+	} else if (flags & METASLAB_FASTWRITE) {
+		mg = fast_mg = mc->mc_rotor;
+
+		do {
+			if (fast_mg->mg_vd->vdev_pending_fastwrite <
+			    mg->mg_vd->vdev_pending_fastwrite)
+				mg = fast_mg;
+		} while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+
 	} else {
 		mg = mc->mc_rotor;
 	}
@@ -1453,7 +1467,8 @@ top:
 				    (int64_t)mg->mg_aliquot) / 100;
 			}
 
-			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+			if ((flags & METASLAB_FASTWRITE) ||
+			    atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
 				mc->mc_aliquot = 0;
@@ -1464,6 +1479,12 @@ top:
 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 			DVA_SET_ASIZE(&dva[d], asize);
 
+			if (flags & METASLAB_FASTWRITE) {
+				atomic_add_64(&vd->vdev_pending_fastwrite,
+				    psize);
+				mutex_exit(&mc->mc_fastwrite_lock);
+			}
+
 			return (0);
 		}
 next:
@@ -1485,6 +1506,8 @@ next:
 
 	bzero(&dva[d], sizeof (dva_t));
 
+	if (flags & METASLAB_FASTWRITE)
+		mutex_exit(&mc->mc_fastwrite_lock);
 	return (ENOSPC);
 }
 
@@ -1678,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 
 	return (error);
 }
+
+void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;
+		atomic_add_64(&vd->vdev_pending_fastwrite, psize);
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+	uint64_t psize = BP_GET_PSIZE(bp);
+	int d;
+	vdev_t *vd;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(psize > 0);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (d = 0; d < ndvas; d++) {
+		if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+			continue;
+		ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
+		atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 205a1d1aa..7d6d5278a 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -904,6 +904,8 @@ vdev_metaslab_fini(vdev_t *vd)
 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 	}
+
+	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
 }
 
 typedef struct vdev_probe_stats {
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index e76e5ecf1..6492dbc1c 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -38,6 +38,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/metaslab.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -451,13 +452,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_fastwrite = fastwrite;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -489,6 +491,7 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t fastwrite = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -516,8 +519,9 @@ zil_create(zilog_t *zilog)
 			BP_ZERO(&blk);
 		}
 
-		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+		error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
 		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		fastwrite = TRUE;
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -527,7 +531,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -586,6 +590,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+			ASSERT(lwb->lwb_zio == NULL);
+			if (lwb->lwb_fastwrite)
+				metaslab_fastwrite_unmark(zilog->zl_spa,
+				    &lwb->lwb_blk);
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@@ -826,6 +834,8 @@ zil_lwb_write_done(zio_t *zio)
 	 */
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
+	lwb->lwb_zio = NULL;
+	lwb->lwb_fastwrite = FALSE;
 	lwb->lwb_buf = NULL;
 	lwb->lwb_tx = NULL;
 	mutex_exit(&zilog->zl_lock);
@@ -854,12 +864,21 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
+
+	/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
+	mutex_enter(&zilog->zl_lock);
 	if (lwb->lwb_zio == NULL) {
+		if (!lwb->lwb_fastwrite) {
+			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+			lwb->lwb_fastwrite = 1;
+		}
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+		    ZIO_FLAG_FASTWRITE, &zb);
 	}
+	mutex_exit(&zilog->zl_lock);
 }
 
 /*
@@ -956,10 +975,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
-	/* pass the old blkptr in order to spread log blocks across devs */
 	use_slog = USE_SLOG(zilog);
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    use_slog);
+	error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog));
 	if (use_slog)
 	{
 		ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
@@ -978,7 +995,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1625,6 +1642,9 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
 			break;
+
+		ASSERT(lwb->lwb_zio == NULL);
+
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free_zil(spa, txg, &lwb->lwb_blk);
 		kmem_cache_free(zil_lwb_cache, lwb);
@@ -1638,6 +1658,19 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 		if (list_head(&zilog->zl_lwb_list) == NULL)
 			BP_ZERO(&zh->zh_log);
 	}
+
+	/*
+	 * Remove fastwrite on any blocks that have been pre-allocated for
+	 * the next commit. This prevents fastwrite counter pollution by
+	 * unused, long-lived LWBs.
+	 */
+	for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
+		if (lwb->lwb_fastwrite && !lwb->lwb_zio) {
+			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+			lwb->lwb_fastwrite = 0;
+		}
+	}
+
 	mutex_exit(&zilog->zl_lock);
 }
 
@@ -1817,6 +1850,9 @@ zil_close(zilog_t *zilog)
 	lwb = list_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
+		ASSERT(lwb->lwb_zio == NULL);
+		if (lwb->lwb_fastwrite)
+			metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		kmem_cache_free(zil_lwb_cache, lwb);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index ace72a087..ce76e010c 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1861,6 +1861,11 @@ zio_write_gang_block(zio_t *pio)
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+	/*
+	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
+	 */
+	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
+
 	zio_nowait(zio);
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -2270,6 +2275,7 @@ zio_dva_allocate(zio_t *zio)
 	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
 	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
 	    METASLAB_GANG_CHILD : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
@@ -2333,8 +2339,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
+    boolean_t use_slog)
 {
 	int error = 1;
 
@@ -2347,14 +2353,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
 	 */
 	if (use_slog) {
 		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
 	}
 
 	if (error) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
 	}
 
 	if (error == 0) {
@@ -3066,6 +3072,11 @@ zio_done(zio_t *zio)
 		zfs_ereport_free_checksum(zcr);
 	}
 
+	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
+	    !BP_IS_HOLE(zio->io_bp)) {
+		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
+	}
+
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as

From 5d7a86d114c2706a8d14d94b71f81ad5cdf066c5 Mon Sep 17 00:00:00 2001
From: Etienne Dechamps <etienne.dechamps@ovh.net>
Date: Thu, 28 Jun 2012 12:30:07 +0200
Subject: [PATCH 3/3] Use the slog even with logbias=throughput.

In the current code, logbias=throughput implies the following:
 1) All synchronous writes are logged in indirect mode.
 2) The slog is not used.

(1) makes sense because it avoids writing the data twice, which is
obviously a good thing when the user wants maximum pool throughput.

(2), however, is a surprising decision. Considering all writes are
indirect, the log record doesn't contain the actual data, only pointers
to DMU blocks. As a result, log records written in logbias=throughput
mode are quite small, and as such, it doesn't make any sense to write
them to the main pool since slogs are usually optimized for small
synchronous writes.

In fact, the current behavior is actually harmful for performance,
because log blocks and data blocks from dmu_sync() seldom have the same
allocation size and as a result are usually allocated from different
metaslabs. This means that if a spindle has to write both log blocks and
DMU blocks (which is likely to happen under heavy load), it will have to
seek between the two. Allocating the log blocks from the slog pool
instead of the main pool avoids these unnecessary seeks.

This commit makes ZFS use the slog on datasets with logbias=throughput.
Real-life performance testing shows a 50% synchronous write performance
increase with some large commit sizes, and no negative effect in other
cases.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1013
---
 module/zfs/zil.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 6492dbc1c..220f2d79e 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -520,7 +520,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		    ZIL_MIN_BLKSZ, B_TRUE);
 		fastwrite = TRUE;
 
 		if (error == 0)
@@ -895,14 +895,13 @@ uint64_t zil_block_buckets[] = {
 };
 
 /*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
+ * Use the slog as long as the current commit size is less than the
+ * limit or the total list size is less than 2X the limit.  Limit
+ * checking is disabled by setting zil_slog_limit to UINT64_MAX.
  */
 unsigned long zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
+#define	USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \
+	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))
 
 /*
  * Start a log block write and advance to the next log block.