Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-10-26 18:05:04 +03:00)

	More aggsum optimizations

- Avoid atomic_add() when updating as_lower_bound/as_upper_bound.
  The previous code was excessively strong on 64-bit systems while not
  strong enough on 32-bit ones.  Instead, introduce and use real
  atomic_load() and atomic_store() operations: plain assignments on
  64-bit machines, but proper atomics on 32-bit ones to avoid torn
  reads/writes.

- Reduce the number of buckets on large systems.  Extra buckets do not
  improve add speed as much as they hurt reads.  Unlike with wmsum,
  reads are still important for aggsum.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12145

commit e76373de7b (parent b05ae1a82a)

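As a reader's aid, the gist of the first change can be sketched in a few lines: on 64-bit (_LP64) builds the new atomic_load_64()/atomic_store_64() boil down to plain volatile accesses, while 32-bit builds use real relaxed atomics so a concurrent reader can never observe a half-updated 64-bit value. The sketch below paraphrases the libspl changes in the diff; the sketch_* names are illustrative and not part of the commit.

	#include <stdint.h>

	/*
	 * Minimal sketch of the new 64-bit load/store primitives
	 * (paraphrasing the libspl changes below, not the verbatim code).
	 */
	#ifdef _LP64
	/* A naturally aligned 64-bit access is already atomic on 64-bit CPUs. */
	#define	sketch_load_64(p)	(*(volatile uint64_t *)(p))
	#define	sketch_store_64(p, v)	(*(volatile uint64_t *)(p) = (uint64_t)(v))
	#else
	/*
	 * On 32-bit CPUs a plain 64-bit access may be split into two 32-bit
	 * ones, so use real (relaxed) atomics to avoid torn reads/writes.
	 */
	static inline uint64_t
	sketch_load_64(volatile uint64_t *p)
	{
		return (__atomic_load_n(p, __ATOMIC_RELAXED));
	}

	static inline void
	sketch_store_64(volatile uint64_t *p, uint64_t v)
	{
		__atomic_store_n(p, v, __ATOMIC_RELAXED);
	}
	#endif
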
@@ -48,6 +48,8 @@
#define	atomic_sub_32_nv(v, i)	atomic_sub_return((i), (atomic_t *)(v))
#define	atomic_cas_32(v, x, y)	atomic_cmpxchg((atomic_t *)(v), x, y)
#define	atomic_swap_32(v, x)	atomic_xchg((atomic_t *)(v), x)
#define	atomic_load_32(v)	atomic_read((atomic_t *)(v))
#define	atomic_store_32(v, x)	atomic_set((atomic_t *)(v), x)
#define	atomic_inc_64(v)	atomic64_inc((atomic64_t *)(v))
#define	atomic_dec_64(v)	atomic64_dec((atomic64_t *)(v))
#define	atomic_add_64(v, i)	atomic64_add((i), (atomic64_t *)(v))
@@ -58,6 +60,8 @@
#define	atomic_sub_64_nv(v, i)	atomic64_sub_return((i), (atomic64_t *)(v))
#define	atomic_cas_64(v, x, y)	atomic64_cmpxchg((atomic64_t *)(v), x, y)
#define	atomic_swap_64(v, x)	atomic64_xchg((atomic64_t *)(v), x)
#define	atomic_load_64(v)	atomic64_read((atomic64_t *)(v))
#define	atomic_store_64(v, x)	atomic64_set((atomic64_t *)(v), x)

#ifdef _LP64
static __inline__ void *

@@ -39,15 +39,16 @@ struct aggsum_bucket {
typedef struct aggsum {
	kmutex_t as_lock;
	int64_t as_lower_bound;
	int64_t as_upper_bound;
	uint64_t as_upper_bound;
	aggsum_bucket_t *as_buckets ____cacheline_aligned;
	uint_t as_numbuckets;
	aggsum_bucket_t *as_buckets;
	uint_t as_bucketshift;
} aggsum_t;

void aggsum_init(aggsum_t *, uint64_t);
void aggsum_fini(aggsum_t *);
int64_t aggsum_lower_bound(aggsum_t *);
int64_t aggsum_upper_bound(aggsum_t *);
uint64_t aggsum_upper_bound(aggsum_t *);
int aggsum_compare(aggsum_t *, uint64_t);
uint64_t aggsum_value(aggsum_t *);
void aggsum_add(aggsum_t *, int64_t);

@@ -390,6 +390,19 @@ atomic_swap_ptr(volatile void *target, void *bits)
	return (old);
}

#ifndef _LP64
uint64_t
atomic_load_64(volatile uint64_t *target)
{
	return (__atomic_load_n(target, __ATOMIC_RELAXED));
}

void
atomic_store_64(volatile uint64_t *target, uint64_t bits)
{
	return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
}
#endif

int
atomic_set_long_excl(volatile ulong_t *target, uint_t value)

@@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
#endif

/*
 * Atomically read variable.
 */
#define	atomic_load_char(p)	(*(volatile uchar_t *)(p))
#define	atomic_load_short(p)	(*(volatile ushort_t *)(p))
#define	atomic_load_int(p)	(*(volatile uint_t *)(p))
#define	atomic_load_long(p)	(*(volatile ulong_t *)(p))
#define	atomic_load_ptr(p)	(*(volatile __typeof(*p) *)(p))
#define	atomic_load_8(p)	(*(volatile uint8_t *)(p))
#define	atomic_load_16(p)	(*(volatile uint16_t *)(p))
#define	atomic_load_32(p)	(*(volatile uint32_t *)(p))
#ifdef _LP64
#define	atomic_load_64(p)	(*(volatile uint64_t *)(p))
#elif defined(_INT64_TYPE)
extern uint64_t atomic_load_64(volatile uint64_t *);
#endif

/*
 * Atomically write variable.
 */
#define	atomic_store_char(p, v)		\
	(*(volatile uchar_t *)(p) = (uchar_t)(v))
#define	atomic_store_short(p, v)	\
	(*(volatile ushort_t *)(p) = (ushort_t)(v))
#define	atomic_store_int(p, v)		\
	(*(volatile uint_t *)(p) = (uint_t)(v))
#define	atomic_store_long(p, v)		\
	(*(volatile ulong_t *)(p) = (ulong_t)(v))
#define	atomic_store_ptr(p, v)		\
	(*(volatile __typeof(*p) *)(p) = (v))
#define	atomic_store_8(p, v)		\
	(*(volatile uint8_t *)(p) = (uint8_t)(v))
#define	atomic_store_16(p, v)		\
	(*(volatile uint16_t *)(p) = (uint16_t)(v))
#define	atomic_store_32(p, v)		\
	(*(volatile uint32_t *)(p) = (uint32_t)(v))
#ifdef _LP64
#define	atomic_store_64(p, v)		\
	(*(volatile uint64_t *)(p) = (uint64_t)(v))
#elif defined(_INT64_TYPE)
extern void atomic_store_64(volatile uint64_t *, uint64_t);
#endif

/*
 * Perform an exclusive atomic bit set/clear on a target.
 * Returns 0 if bit was successfully set/cleared, or -1

@@ -78,11 +78,11 @@
 */

/*
 * We will borrow aggsum_borrow_multiplier times the current request, so we will
 * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
 * aggsum_delta().
 * We will borrow 2^aggsum_borrow_shift times the current request, so we will
 * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
 * aggsum_add().
 */
static uint_t aggsum_borrow_multiplier = 10;
static uint_t aggsum_borrow_shift = 4;

void
aggsum_init(aggsum_t *as, uint64_t value)
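
To put rough numbers on the borrow comment above: the reworked aggsum_add() later in this diff computes the borrow as |delta| << (aggsum_borrow_shift + as_bucketshift). A tiny stand-alone illustration with hypothetical values follows; the exact fast-path check is outside this diff.

	#include <stdio.h>
	#include <stdint.h>

	/* Illustrative only: mirrors the borrow arithmetic in aggsum_add(). */
	int
	main(void)
	{
		int64_t delta = 100;			/* requested change */
		unsigned aggsum_borrow_shift = 4;	/* value set above */
		unsigned as_bucketshift = 1;		/* e.g. a 12..47-CPU system */

		int64_t borrow = (delta < 0 ? -delta : delta);
		borrow <<= aggsum_borrow_shift + as_bucketshift;

		printf("borrow = %lld\n", (long long)borrow);	/* prints 3200 */
		return (0);
	}
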
| @ -90,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value) | ||||
| 	bzero(as, sizeof (*as)); | ||||
| 	as->as_lower_bound = as->as_upper_bound = value; | ||||
| 	mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	as->as_numbuckets = boot_ncpus; | ||||
| 	as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), | ||||
| 	    KM_SLEEP); | ||||
| 	/*
 | ||||
| 	 * Too many buckets may hurt read performance without improving | ||||
| 	 * write.  From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc. | ||||
| 	 */ | ||||
| 	as->as_bucketshift = highbit64(boot_ncpus / 6) / 2; | ||||
| 	as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1; | ||||
| 	as->as_buckets = kmem_zalloc(as->as_numbuckets * | ||||
| 	    sizeof (aggsum_bucket_t), KM_SLEEP); | ||||
| 	for (int i = 0; i < as->as_numbuckets; i++) { | ||||
| 		mutex_init(&as->as_buckets[i].asc_lock, | ||||
| 		    NULL, MUTEX_DEFAULT, NULL); | ||||
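
To make the bucket-scaling comment concrete, the following stand-alone snippet evaluates the same formula as aggsum_init() for a few CPU counts. The local highbit64() is only a stand-in for the kernel helper, assumed to return the 1-based index of the highest set bit (and 0 for 0).

	#include <stdio.h>
	#include <stdint.h>

	/* Stand-in for the kernel's highbit64(). */
	static unsigned
	highbit64(uint64_t v)
	{
		unsigned h = 0;

		while (v != 0) {
			v >>= 1;
			h++;
		}
		return (h);
	}

	int
	main(void)
	{
		const unsigned cpus[] = { 4, 12, 48, 192 };

		for (unsigned i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++) {
			unsigned ncpus = cpus[i];
			unsigned shift = highbit64(ncpus / 6) / 2;
			unsigned buckets = ((ncpus - 1) >> shift) + 1;

			/* 4 CPUs -> 4 buckets, 12 -> 6, 48 -> 12, 192 -> 24 */
			printf("%3u CPUs: bucketshift=%u, buckets=%u\n",
			    ncpus, shift, buckets);
		}
		return (0);
	}
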
@@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as)
int64_t
aggsum_lower_bound(aggsum_t *as)
{
	return (as->as_lower_bound);
	return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
}

int64_t
uint64_t
aggsum_upper_bound(aggsum_t *as)
{
	return (as->as_upper_bound);
}

static void
aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
{
	ASSERT(MUTEX_HELD(&as->as_lock));
	ASSERT(MUTEX_HELD(&asb->asc_lock));

	/*
	 * We use atomic instructions for this because we read the upper and
	 * lower bounds without the lock, so we need stores to be atomic.
	 */
	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
	    asb->asc_delta + asb->asc_borrowed);
	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
	    asb->asc_delta - asb->asc_borrowed);
	asb->asc_delta = 0;
	asb->asc_borrowed = 0;
	return (atomic_load_64(&as->as_upper_bound));
}

uint64_t
aggsum_value(aggsum_t *as)
{
	int64_t rv;
	int64_t lb;
	uint64_t ub;

	mutex_enter(&as->as_lock);
	if (as->as_lower_bound == as->as_upper_bound) {
		rv = as->as_lower_bound;
	lb = as->as_lower_bound;
	ub = as->as_upper_bound;
	if (lb == ub) {
		for (int i = 0; i < as->as_numbuckets; i++) {
			ASSERT0(as->as_buckets[i].asc_delta);
			ASSERT0(as->as_buckets[i].asc_borrowed);
		}
		mutex_exit(&as->as_lock);
		return (rv);
		return (lb);
	}
	for (int i = 0; i < as->as_numbuckets; i++) {
		struct aggsum_bucket *asb = &as->as_buckets[i];
		if (asb->asc_borrowed == 0)
			continue;
		mutex_enter(&asb->asc_lock);
		aggsum_flush_bucket(as, asb);
		lb += asb->asc_delta + asb->asc_borrowed;
		ub += asb->asc_delta - asb->asc_borrowed;
		asb->asc_delta = 0;
		asb->asc_borrowed = 0;
		mutex_exit(&asb->asc_lock);
	}
	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
	rv = as->as_lower_bound;
	ASSERT3U(lb, ==, ub);
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
	atomic_store_64(&as->as_upper_bound, lb);
	mutex_exit(&as->as_lock);

	return (rv);
	return (lb);
}

void
@@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta)
	struct aggsum_bucket *asb;
	int64_t borrow;

	asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
	asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
	    as->as_numbuckets];

	/* Try fast path if we already borrowed enough before. */
	mutex_enter(&asb->asc_lock);
@@ -188,21 +184,22 @@ aggsum_add(aggsum_t *as, int64_t delta)
	 * We haven't borrowed enough.  Take the global lock and borrow
	 * considering what is requested now and what we borrowed before.
	 */
	borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
	borrow = (delta < 0 ? -delta : delta);
	borrow <<= aggsum_borrow_shift + as->as_bucketshift;
	mutex_enter(&as->as_lock);
	mutex_enter(&asb->asc_lock);
	delta += asb->asc_delta;
	asb->asc_delta = 0;
	if (borrow >= asb->asc_borrowed)
		borrow -= asb->asc_borrowed;
	else
		borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
	mutex_enter(&asb->asc_lock);
	delta += asb->asc_delta;
	asb->asc_delta = 0;
	asb->asc_borrowed += borrow;
	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
	    delta - borrow);
	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
	    delta + borrow);
	mutex_exit(&asb->asc_lock);
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
	    as->as_lower_bound + delta - borrow);
	atomic_store_64(&as->as_upper_bound,
	    as->as_upper_bound + delta + borrow);
	mutex_exit(&as->as_lock);
}

@@ -214,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta)
int
aggsum_compare(aggsum_t *as, uint64_t target)
{
	if (as->as_upper_bound < target)
	int64_t lb;
	uint64_t ub;
	int i;

	if (atomic_load_64(&as->as_upper_bound) < target)
		return (-1);
	if (as->as_lower_bound > target)
	lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound);
	if (lb > 0 && (uint64_t)lb > target)
		return (1);
	mutex_enter(&as->as_lock);
	for (int i = 0; i < as->as_numbuckets; i++) {
	lb = as->as_lower_bound;
	ub = as->as_upper_bound;
	for (i = 0; i < as->as_numbuckets; i++) {
		struct aggsum_bucket *asb = &as->as_buckets[i];
		if (asb->asc_borrowed == 0)
			continue;
		mutex_enter(&asb->asc_lock);
		aggsum_flush_bucket(as, asb);
		lb += asb->asc_delta + asb->asc_borrowed;
		ub += asb->asc_delta - asb->asc_borrowed;
		asb->asc_delta = 0;
		asb->asc_borrowed = 0;
		mutex_exit(&asb->asc_lock);
		if (as->as_upper_bound < target) {
			mutex_exit(&as->as_lock);
			return (-1);
		if (ub < target || (lb > 0 && (uint64_t)lb > target))
			break;
	}
		if (as->as_lower_bound > target) {
	if (i >= as->as_numbuckets)
		ASSERT3U(lb, ==, ub);
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
	atomic_store_64(&as->as_upper_bound, ub);
	mutex_exit(&as->as_lock);
			return (1);
		}
	}
	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
	ASSERT3U(as->as_lower_bound, ==, target);
	mutex_exit(&as->as_lock);
	return (0);
	return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
}

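Taken together, the aggsum.c changes follow one discipline: the bounds are modified only under as_lock and published with atomic_store_64(), while lock-free readers fetch them with atomic_load_64(), so even 32-bit machines never observe torn values. Below is a minimal stand-alone sketch of that pattern, using the GCC/Clang builtins that back the 32-bit libspl implementation; the names are illustrative, not the OpenZFS code.

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative bound shared between a locked writer and lock-free readers. */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static uint64_t upper_bound;

	static void
	bound_add(uint64_t delta)
	{
		/* Writers serialize on the lock, then publish atomically. */
		pthread_mutex_lock(&lock);
		uint64_t nv = __atomic_load_n(&upper_bound, __ATOMIC_RELAXED) + delta;
		__atomic_store_n(&upper_bound, nv, __ATOMIC_RELAXED);
		pthread_mutex_unlock(&lock);
	}

	static uint64_t
	bound_read(void)
	{
		/* Readers take no lock; the relaxed atomic load cannot tear. */
		return (__atomic_load_n(&upper_bound, __ATOMIC_RELAXED));
	}

	static void *
	writer(void *arg)
	{
		(void) arg;
		for (int i = 0; i < 1000000; i++)
			bound_add(1);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, writer, NULL);
		while (bound_read() < 1000000)
			;	/* spin as a lock-free reader */
		pthread_join(&t, NULL);
		printf("final upper bound: %llu\n",
		    (unsigned long long)bound_read());
		return (0);
	}

On 64-bit targets these builtins compile to ordinary loads and stores, which is why the in-kernel and _LP64 definitions earlier in the diff can be plain assignments.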