mirror_zfs/include/sys/refcount.h
Don Brady 3dfb57a35e OpenZFS 7090 - zfs should throttle allocations
OpenZFS 7090 - zfs should throttle allocations

Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Matthew Ahrens <mahrens@delphix.com>
Ported-by: Don Brady <don.brady@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>

When write I/Os are issued, they are issued in block order but the ZIO
pipeline will drive them asynchronously through the allocation stage
which can result in blocks being allocated out-of-order. It would be
nice to preserve as much of the logical order as possible.

In addition, the allocations are equally scattered across all top-level
VDEVs but not all top-level VDEVs are created equally. The pipeline
should be able to detect devices that are more capable of handling
allocations and should allocate more blocks to those devices. This
allows for dynamic allocation distribution when devices are imbalanced
as fuller devices will tend to be slower than empty devices.

The change includes a new pool-wide allocation queue which would
throttle and order allocations in the ZIO pipeline. The queue would be
ordered by issued time and offset and would provide an initial amount of
allocation of work to each top-level vdev. The allocation logic utilizes
a reservation system to reserve allocations that will be performed by
the allocator. Once an allocation is successfully completed it's
scheduled on a given top-level vdev. Each top-level vdev maintains a
maximum number of allocations that it can handle (mg_alloc_queue_depth).
The pool-wide reserved allocations (top-levels * mg_alloc_queue_depth)
are distributed across the top-level vdevs' metaslab groups, rotating
round-robin across all eligible metaslab groups to distribute the work. As
top-levels complete their work, they receive additional work from the
pool-wide allocation queue until the allocation queue is emptied.

OpenZFS-issue: https://www.illumos.org/issues/7090
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/4756c3d7
Closes #5258 

Porting Notes:
- Maintained minimal stack in zio_done
- Preserve linux-specific io sizes in zio_write_compress
- Added module params and documentation
- Updated to use optimized AVL cmp macros
2016-10-13 17:59:18 -07:00

120 lines
3.8 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
#define _SYS_REFCOUNT_H
#include <sys/inttypes.h>
#include <sys/list.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * If the reference is held only by the calling function and not any
 * particular object, use FTAG (which is a string) for the holder_tag.
 * Otherwise, use the object that holds the reference.
 *
 * FTAG expands to __func__, i.e. the name of the function in which it is
 * used.  __func__ is a const char array; the cast drops the const
 * qualifier so the tag can be passed where a plain (non-const) pointer is
 * expected.  The string must therefore never be written through.
 */
#define FTAG ((char *)__func__)
#ifdef ZFS_DEBUG
/*
 * Per-reference bookkeeping record; only defined when ZFS_DEBUG is set.
 *
 * NOTE(review): the code that fills in these fields is not part of this
 * header, so the field notes below are inferred from names and types and
 * should be confirmed against the implementation.
 */
typedef struct reference {
/* List linkage; presumably links this record into a refcount_t list. */
list_node_t ref_link;
/* Presumably the holder_tag supplied when the reference was added. */
void *ref_holder;
/* Presumably how many references this record stands for. */
uint64_t ref_number;
/* Purpose not visible from this header -- TODO confirm. */
uint8_t *ref_removed;
} reference_t;
/*
 * Reference count with optional holder bookkeeping; this is the layout
 * used when ZFS_DEBUG is defined.  It is considerably larger than the
 * non-debug variant, which carries only rc_count.
 *
 * NOTE(review): field roles are inferred from names and types; the
 * implementation is not visible here -- confirm before relying on them.
 */
typedef struct refcount {
/* Presumably serializes access to the fields below. */
kmutex_t rc_mtx;
/* Presumably selects whether per-holder records are kept. */
boolean_t rc_tracked;
/* Presumably the list of reference_t records currently held. */
list_t rc_list;
/* Presumably a list of reference_t records that were released. */
list_t rc_removed;
/* Reference count value. */
uint64_t rc_count;
/* Presumably the number of entries on rc_removed. */
uint64_t rc_removed_count;
} refcount_t;
/*
 * ZFS_DEBUG interface.  In this configuration the refcount operations are
 * real functions implemented outside this header; when ZFS_DEBUG is not
 * defined the same names are provided as macros further down in this file.
 *
 * Note: refcount_t must be initialized with refcount_create[_untracked]()
 *
 * Every parameter is named so that the prototypes read the same way as the
 * macro forms (rc, number, holder, current_holder, new_holder).
 */

/* Setup and teardown of a single refcount_t. */
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);

/* Queries. */
int refcount_is_zero(refcount_t *rc);
int64_t refcount_count(refcount_t *rc);

/*
 * Add or remove one reference (or `number` references) on behalf of
 * holder_tag; see the FTAG comment above for what to pass as the tag.
 */
int64_t refcount_add(refcount_t *rc, void *holder_tag);
int64_t refcount_remove(refcount_t *rc, void *holder_tag);
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);

/* Moving references between counts or between holders. */
void refcount_transfer(refcount_t *dst, refcount_t *src);
void refcount_transfer_ownership(refcount_t *rc, void *current_holder,
    void *new_holder);

/* Holder checks. */
boolean_t refcount_held(refcount_t *rc, void *holder);
boolean_t refcount_not_held(refcount_t *rc, void *holder);

/* Subsystem-wide setup and teardown. */
void refcount_init(void);
void refcount_fini(void);
#else /* ZFS_DEBUG */
/*
 * Non-debug reference count: just the 64-bit counter, with no holder
 * bookkeeping.  The macros that follow operate directly on rc_count.
 */
typedef struct refcount {
/* Reference count value. */
uint64_t rc_count;
} refcount_t;
/*
 * Non-debug creation and destruction: every variant simply stores zero in
 * rc_count.  The tracked/untracked distinction and the `number` argument
 * of refcount_destroy_many() have no effect in this configuration.
 */
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) refcount_create(rc)
#define refcount_create_tracked(rc) refcount_create(rc)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) refcount_destroy(rc)

/* Plain (non-atomic) reads of the counter. */
#define refcount_count(rc) ((rc)->rc_count)
#define refcount_is_zero(rc) (refcount_count(rc) == 0)

/*
 * Single-reference add/remove.  The holder argument is dropped and never
 * evaluated; the value of the expression is whatever the atomic_*_64_nv()
 * call yields.
 */
#define refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
#define refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
/*
 * Non-debug add/remove of `number` references in one step, implemented as
 * a single atomic_add_64_nv() on rc_count.  The holder argument is dropped
 * and never evaluated.
 *
 * `number` is parenthesized in both expansions.  This matters for
 * refcount_remove_many(): without the parentheses an argument such as
 * `a + b` would expand to `-a + b` and adjust the count by the wrong
 * amount.
 */
#define refcount_add_many(rc, number, holder) \
	atomic_add_64_nv(&(rc)->rc_count, (number))
#define refcount_remove_many(rc, number, holder) \
	atomic_add_64_nv(&(rc)->rc_count, -(number))
/*
 * Non-debug transfer: move the whole count of `src` onto `dst`.
 *
 * The count is sampled once into a local, subtracted from src and then
 * added to dst.  Each adjustment is an atomic_add_64() call, but the
 * sequence as a whole is not a single atomic step.  `src` is evaluated
 * more than once, so it must not have side effects.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as one
 * statement and can be used as the unbraced body of an if/else, exactly
 * like the function form used in ZFS_DEBUG builds.
 */
#define refcount_transfer(dst, src) do { \
	uint64_t __tmp = (src)->rc_count; \
	atomic_add_64(&(src)->rc_count, -__tmp); \
	atomic_add_64(&(dst)->rc_count, __tmp); \
} while (0)
/*
 * Non-debug forms of the holder-tracking and lifecycle entry points.
 *
 * No per-holder state exists in this configuration, so:
 *  - refcount_transfer_ownership() does nothing;
 *  - refcount_held() is true whenever the count is non-zero, whoever the
 *    holder is;
 *  - refcount_not_held() is always B_TRUE;
 *  - refcount_init() and refcount_fini() do nothing.
 *
 * None of the arguments other than `rc` in refcount_held() are evaluated.
 * The no-op macros expand to ((void)0) rather than to nothing, so they
 * remain valid void expressions wherever a call to the ZFS_DEBUG function
 * of the same name would be accepted.
 */
#define refcount_transfer_ownership(rc, current_holder, new_holder) \
	((void)0)
#define refcount_held(rc, holder) ((rc)->rc_count > 0)
#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_init() ((void)0)
#define refcount_fini() ((void)0)
#endif /* ZFS_DEBUG */
#ifdef __cplusplus
}
#endif
#endif /* _SYS_REFCOUNT_H */