From 142e6dd100eb70ef06f39015a2e54cbd74172f8b Mon Sep 17 00:00:00 2001 From: Etienne Dechamps Date: Wed, 27 Jun 2012 10:26:49 +0200 Subject: [PATCH 1/3] Add atomic_sub_* functions to libspl. Both the SPL and the ZFS libspl export most of the atomic_* functions, except atomic_sub_* functions which are only exported by the SPL, not by libspl. This patch remedies that by implementing atomic_sub_* functions in libspl. Signed-off-by: Brian Behlendorf Issue #1013 --- lib/libspl/asm-generic/atomic.c | 56 +++++++++++++++++ lib/libspl/asm-i386/atomic.S | 106 ++++++++++++++++++++++++++++++++ lib/libspl/asm-x86_64/atomic.S | 92 +++++++++++++++++++++++++++ lib/libspl/include/atomic.h | 30 +++++++++ 4 files changed, 284 insertions(+) diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c index de4430f9f..a3223eadc 100644 --- a/lib/libspl/asm-generic/atomic.c +++ b/lib/libspl/asm-generic/atomic.c @@ -103,6 +103,31 @@ void atomic_add_ptr(volatile void *target, ssize_t bits) } +#define ATOMIC_SUB(name, type1, type2) \ + void atomic_sub_##name(volatile type1 *target, type2 bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target -= bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_SUB(8, uint8_t, int8_t) +ATOMIC_SUB(char, uchar_t, signed char) +ATOMIC_SUB(16, uint16_t, int16_t) +ATOMIC_SUB(short, ushort_t, short) +ATOMIC_SUB(32, uint32_t, int32_t) +ATOMIC_SUB(int, uint_t, int) +ATOMIC_SUB(long, ulong_t, long) +ATOMIC_SUB(64, uint64_t, int64_t) + +void atomic_sub_ptr(volatile void *target, ssize_t bits) +{ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + *(caddr_t *)target -= bits; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); +} + + #define ATOMIC_OR(name, type) \ void atomic_or_##name(volatile type *target, type bits) \ { \ @@ -216,6 +241,37 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) } +#define ATOMIC_SUB_NV(name, type1, type2) \ + type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ + { \ + type1 rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target -= bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return rc; \ + } + +ATOMIC_SUB_NV(8, uint8_t, int8_t) +ATOMIC_SUB_NV(char, uchar_t, signed char) +ATOMIC_SUB_NV(16, uint16_t, int16_t) +ATOMIC_SUB_NV(short, ushort_t, short) +ATOMIC_SUB_NV(32, uint32_t, int32_t) +ATOMIC_SUB_NV(int, uint_t, int) +ATOMIC_SUB_NV(long, ulong_t, long) +ATOMIC_SUB_NV(64, uint64_t, int64_t) + +void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) +{ + void *ptr; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + ptr = (*(caddr_t *)target -= bits); + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return ptr; +} + + #define ATOMIC_OR_NV(name, type) \ type atomic_or_##name##_nv(volatile type *target, type bits) \ { \ diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S index 93c04bfb8..d3d425090 100644 --- a/lib/libspl/asm-i386/atomic.S +++ b/lib/libspl/asm-i386/atomic.S @@ -271,6 +271,40 @@ SET_SIZE(atomic_add_int) SET_SIZE(atomic_add_32) + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subb %cl, (%eax) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subw %cx, (%eax) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) + ALTENTRY(atomic_sub_int) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subl %ecx, (%eax) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + ENTRY(atomic_or_8) ALTENTRY(atomic_or_uchar) movl 4(%esp), %eax @@ -384,6 +418,55 @@ SET_SIZE(atomic_add_int_nv) SET_SIZE(atomic_add_32_nv) + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + subb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + subw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + /* * NOTE: If atomic_add_64 and atomic_add_64_nv are ever * separated, it is important to edit the libc i386 platform @@ -413,6 +496,29 @@ SET_SIZE(atomic_add_64_nv) SET_SIZE(atomic_add_64) + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + subl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_sub_64_nv) + SET_SIZE(atomic_sub_64) + ENTRY(atomic_or_8_nv) ALTENTRY(atomic_or_uchar_nv) movl 4(%esp), %edx diff --git a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S index e321bf732..49c9b2ad1 100644 --- a/lib/libspl/asm-x86_64/atomic.S +++ b/lib/libspl/asm-x86_64/atomic.S @@ -232,6 +232,40 @@ SET_SIZE(atomic_add_ptr) SET_SIZE(atomic_add_64) + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + lock + subb %sil, (%rdi) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + lock + subw %si, (%rdi) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) + ALTENTRY(atomic_sub_int) + lock + subl %esi, (%rdi) + ret + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + lock + subq %rsi, (%rdi) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_64) + ENTRY(atomic_or_8) ALTENTRY(atomic_or_uchar) lock @@ -354,6 +388,64 @@ SET_SIZE(atomic_add_ptr_nv) SET_SIZE(atomic_add_64_nv) + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movb (%rdi), %al +1: + movb %sil, %cl + subb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movw (%rdi), %ax +1: + movw %si, %cx + subw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + + ENTRY(atomic_sub_64_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + subq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_64_nv) + ENTRY(atomic_and_8_nv) ALTENTRY(atomic_and_uchar_nv) movb (%rdi), %al diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h index 508000152..9b0775bb9 100644 --- a/lib/libspl/include/atomic.h +++ b/lib/libspl/include/atomic.h @@ -78,6 +78,21 @@ extern void atomic_add_long(volatile ulong_t *, long); extern void atomic_add_64(volatile uint64_t *, int64_t); #endif +/* + * Substract delta from target + */ +extern void atomic_sub_8(volatile uint8_t *, int8_t); +extern void atomic_sub_char(volatile uchar_t *, signed char); +extern void atomic_sub_16(volatile uint16_t *, int16_t); +extern void atomic_sub_short(volatile ushort_t *, short); +extern void atomic_sub_32(volatile uint32_t *, int32_t); +extern void atomic_sub_int(volatile uint_t *, int); +extern void atomic_sub_ptr(volatile void *, ssize_t); +extern void atomic_sub_long(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern void atomic_sub_64(volatile uint64_t *, int64_t); +#endif + /* * logical OR bits with target */ @@ -157,6 +172,21 @@ extern ulong_t atomic_add_long_nv(volatile ulong_t *, long); extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t); #endif +/* + * Substract delta from target + */ +extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t); +extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char); +extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t); +extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short); +extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t); +extern uint_t atomic_sub_int_nv(volatile uint_t *, int); +extern void *atomic_sub_ptr_nv(volatile void *, ssize_t); +extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t); +#endif + /* * logical OR bits with target and return new value. */ From 920dd524fb2997225d4b1ac180bcbc14b045fda6 Mon Sep 17 00:00:00 2001 From: Etienne Dechamps Date: Wed, 27 Jun 2012 15:20:20 +0200 Subject: [PATCH 2/3] Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks accross vdevs is important for performance: indeed, using mutliple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implictly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that linguering fastwrites will get pruned at each sync event. The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf Issue #1013 --- include/sys/metaslab.h | 3 ++ include/sys/metaslab_impl.h | 1 + include/sys/vdev_impl.h | 1 + include/sys/zil_impl.h | 1 + include/sys/zio.h | 5 +-- module/zfs/dmu.c | 4 +-- module/zfs/metaslab.c | 72 +++++++++++++++++++++++++++++++++++-- module/zfs/vdev.c | 2 ++ module/zfs/zil.c | 52 ++++++++++++++++++++++----- module/zfs/zio.c | 23 ++++++++---- 10 files changed, 144 insertions(+), 20 deletions(-) diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 2cf4d2b48..99912424b 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -50,12 +50,15 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_GANG_AVOID 0x8 +#define METASLAB_FASTWRITE 0x10 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); +extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); +extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp); extern metaslab_class_t *metaslab_class_create(spa_t *spa, space_map_ops_t *ops); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 6c670a162..658359478 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -46,6 +46,7 @@ struct metaslab_class { uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ + kmutex_t mc_fastwrite_lock; }; struct metaslab_group { diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 5bd432beb..0b532dcdd 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -156,6 +156,7 @@ struct vdev { uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ + uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 1d4c0cc6c..6c37c1ac2 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -40,6 +40,7 @@ extern "C" { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ + boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ int lwb_nused; /* # used bytes in buffer */ int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 4f20cab65..289238c36 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -193,7 +193,8 @@ enum zio_flag { ZIO_FLAG_RAW = 1 << 21, ZIO_FLAG_GANG_CHILD = 1 << 22, ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24 + ZIO_FLAG_GODFATHER = 1 << 24, + ZIO_FLAG_FASTWRITE = 1 << 25 }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -475,7 +476,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t use_slog); + uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 1d4d1257d..e2abf8cf2 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1440,7 +1440,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb)); return (0); } @@ -1564,7 +1564,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb)); return (0); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d06012ffb..d199921b7 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -107,6 +107,7 @@ metaslab_class_create(spa_t *spa, space_map_ops_t *ops) mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); return (mc); } @@ -120,6 +121,7 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + mutex_destroy(&mc->mc_fastwrite_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -1307,7 +1309,7 @@ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) { - metaslab_group_t *mg, *rotor; + metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; int dshift = 3; int all_zero; @@ -1325,6 +1327,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); + if (flags & METASLAB_FASTWRITE) + mutex_enter(&mc->mc_fastwrite_lock); + /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_aliquot because @@ -1367,6 +1372,15 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; + } else if (flags & METASLAB_FASTWRITE) { + mg = fast_mg = mc->mc_rotor; + + do { + if (fast_mg->mg_vd->vdev_pending_fastwrite < + mg->mg_vd->vdev_pending_fastwrite) + mg = fast_mg; + } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } else { mg = mc->mc_rotor; } @@ -1453,7 +1467,8 @@ top: (int64_t)mg->mg_aliquot) / 100; } - if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= + if ((flags & METASLAB_FASTWRITE) || + atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; @@ -1464,6 +1479,12 @@ top: DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); + if (flags & METASLAB_FASTWRITE) { + atomic_add_64(&vd->vdev_pending_fastwrite, + psize); + mutex_exit(&mc->mc_fastwrite_lock); + } + return (0); } next: @@ -1485,6 +1506,8 @@ next: bzero(&dva[d], sizeof (dva_t)); + if (flags & METASLAB_FASTWRITE) + mutex_exit(&mc->mc_fastwrite_lock); return (ENOSPC); } @@ -1678,3 +1701,48 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } + +void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + atomic_add_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); + atomic_sub_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 205a1d1aa..7d6d5278a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -904,6 +904,8 @@ vdev_metaslab_fini(vdev_t *vd) kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } + + ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e76e5ecf1..6492dbc1c 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -38,6 +38,7 @@ #include #include #include +#include /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -451,13 +452,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) } static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; + lwb->lwb_fastwrite = fastwrite; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_zio = NULL; @@ -489,6 +491,7 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; + boolean_t fastwrite = FALSE; /* * Wait for any previous destroy to complete. @@ -516,8 +519,9 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, + error = zio_alloc_zil(zilog->zl_spa, txg, &blk, ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + fastwrite = TRUE; if (error == 0) zil_init_log_chain(zilog, &blk); @@ -527,7 +531,7 @@ zil_create(zilog_t *zilog) * Allocate a log write buffer (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, txg); + lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite); /* * If we just allocated the first log block, commit our transaction @@ -586,6 +590,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + ASSERT(lwb->lwb_zio == NULL); + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, + &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); @@ -826,6 +834,8 @@ zil_lwb_write_done(zio_t *zio) */ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); + lwb->lwb_zio = NULL; + lwb->lwb_fastwrite = FALSE; lwb->lwb_buf = NULL; lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); @@ -854,12 +864,21 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } + + /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ + mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; + } lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_FASTWRITE, &zb); } + mutex_exit(&zilog->zl_lock); } /* @@ -956,10 +975,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); - /* pass the old blkptr in order to spread log blocks across devs */ use_slog = USE_SLOG(zilog); - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, - use_slog); + error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); if (use_slog) { ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); @@ -978,7 +995,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) /* * Allocate a new log write buffer (lwb). */ - nlwb = zil_alloc_lwb(zilog, bp, txg); + nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE); /* Record the block for later vdev flushing */ zil_add_block(zilog, &lwb->lwb_blk); @@ -1625,6 +1642,9 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; + + ASSERT(lwb->lwb_zio == NULL); + list_remove(&zilog->zl_lwb_list, lwb); zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); @@ -1638,6 +1658,19 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } + + /* + * Remove fastwrite on any blocks that have been pre-allocated for + * the next commit. This prevents fastwrite counter pollution by + * unused, long-lived LWBs. + */ + for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { + if (lwb->lwb_fastwrite && !lwb->lwb_zio) { + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 0; + } + } + mutex_exit(&zilog->zl_lock); } @@ -1817,6 +1850,9 @@ zil_close(zilog_t *zilog) lwb = list_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); + ASSERT(lwb->lwb_zio == NULL); + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); kmem_cache_free(zil_lwb_cache, lwb); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ace72a087..ce76e010c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1861,6 +1861,11 @@ zio_write_gang_block(zio_t *pio) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * We didn't allocate this bp, so make sure it doesn't get unmarked. + */ + pio->io_flags &= ~ZIO_FLAG_FASTWRITE; + zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); @@ -2270,6 +2275,7 @@ zio_dva_allocate(zio_t *zio) flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? METASLAB_GANG_CHILD : 0; + flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); @@ -2333,8 +2339,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t use_slog) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, + boolean_t use_slog) { int error = 1; @@ -2347,14 +2353,14 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, old_bp, - METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); + new_bp, 1, txg, NULL, + METASLAB_FASTWRITE | METASLAB_GANG_AVOID); } if (error == 0) { @@ -3066,6 +3072,11 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } + if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && + !BP_IS_HOLE(zio->io_bp)) { + metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); + } + /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as From 5d7a86d114c2706a8d14d94b71f81ad5cdf066c5 Mon Sep 17 00:00:00 2001 From: Etienne Dechamps Date: Thu, 28 Jun 2012 12:30:07 +0200 Subject: [PATCH 3/3] Use the slog even with logbias=throughput. In the current code, logbias=throughput implies the following: 1) All synchronous writes are logged in indirect mode. 2) The slog is not used. (1) makes sense because it avoids writing the data twice, which is obviously a good thing when the user wants maximum pool throughput. (2), however, is a surprising decision. Considering all writes are indirect, the log record doesn't contain the actual data, only pointers to DMU blocks. As a result, log records written in logbias=throughput mode are quite small, and as such, it doesn't make any sense to write them to the main pool since slogs are usually optimized for small synchronous writes. In fact, the current behavior is actually harmful for performance, because log blocks and data blocks from dmu_sync() seldom have the same allocation size and as a result are usually allocated from different metaslabs. This means that if a spindle has to write both log blocks and DMU blocks (which is likely to happen under heavy load), it will have to seek between the two. Allocating the log blocks from the slog pool instead of the main pool avoids these unnecessary seeks. This commit makes ZFS use the slog on datasets with logbias=throughput. Real-life performance testing shows a 50% synchronous write performance increase with some large commit sizes, and no negative effect in other cases. Signed-off-by: Brian Behlendorf Issue #1013 --- module/zfs/zil.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 6492dbc1c..220f2d79e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -520,7 +520,7 @@ zil_create(zilog_t *zilog) } error = zio_alloc_zil(zilog->zl_spa, txg, &blk, - ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + ZIL_MIN_BLKSZ, B_TRUE); fastwrite = TRUE; if (error == 0) @@ -895,14 +895,13 @@ uint64_t zil_block_buckets[] = { }; /* - * Use the slog as long as the logbias is 'latency' and the current commit size - * is less than the limit or the total list size is less than 2X the limit. - * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. + * Use the slog as long as the current commit size is less than the + * limit or the total list size is less than 2X the limit. Limit + * checking is disabled by setting zil_slog_limit to UINT64_MAX. */ unsigned long zil_slog_limit = 1024 * 1024; -#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ - (((zilog)->zl_cur_used < zil_slog_limit) || \ - ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) +#define USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \ + ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))) /* * Start a log block write and advance to the next log block.