mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-26 20:22:14 +03:00
OpenZFS 7090 - zfs should throttle allocations
OpenZFS 7090 - zfs should throttle allocations Authored by: George Wilson <george.wilson@delphix.com> Reviewed by: Alex Reece <alex@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com> Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Sebastien Roy <sebastien.roy@delphix.com> Approved by: Matthew Ahrens <mahrens@delphix.com> Ported-by: Don Brady <don.brady@intel.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> When write I/Os are issued, they are issued in block order but the ZIO pipeline will drive them asynchronously through the allocation stage which can result in blocks being allocated out-of-order. It would be nice to preserve as much of the logical order as possible. In addition, the allocations are equally scattered across all top-level VDEVs but not all top-level VDEVs are created equally. The pipeline should be able to detect devices that are more capable of handling allocations and should allocate more blocks to those devices. This allows for dynamic allocation distribution when devices are imbalanced as fuller devices will tend to be slower than empty devices. The change includes a new pool-wide allocation queue which would throttle and order allocations in the ZIO pipeline. The queue would be ordered by issued time and offset and would provide an initial amount of allocation work to each top-level vdev. The allocation logic utilizes a reservation system to reserve allocations that will be performed by the allocator. Once an allocation is successfully completed, it's scheduled on a given top-level vdev. Each top-level vdev maintains a maximum number of allocations that it can handle (mg_alloc_queue_depth). 
The pool-wide reserved allocations (top-levels * mg_alloc_queue_depth) are distributed across the top-level vdevs' metaslab groups and assigned round-robin across all eligible metaslab groups to distribute the work. As top-levels complete their work, they receive additional work from the pool-wide allocation queue until the allocation queue is emptied. OpenZFS-issue: https://www.illumos.org/issues/7090 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/4756c3d7 Closes #5258 Porting Notes: - Maintained minimal stack in zio_done - Preserve linux-specific io sizes in zio_write_compress - Added module params and documentation - Updated to use optimized AVL cmp macros
This commit is contained in:
committed by
Brian Behlendorf
parent
a85a90557d
commit
3dfb57a35e
+25
-19
@@ -157,6 +157,7 @@ enum zio_flag {
|
||||
ZIO_FLAG_DONT_CACHE = 1 << 11,
|
||||
ZIO_FLAG_NODATA = 1 << 12,
|
||||
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
|
||||
ZIO_FLAG_IO_ALLOCATING = 1 << 14,
|
||||
|
||||
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
|
||||
@@ -164,28 +165,28 @@ enum zio_flag {
|
||||
/*
|
||||
* Flags inherited by vdev children.
|
||||
*/
|
||||
ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
|
||||
ZIO_FLAG_PROBE = 1 << 15,
|
||||
ZIO_FLAG_TRYHARD = 1 << 16,
|
||||
ZIO_FLAG_OPTIONAL = 1 << 17,
|
||||
ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
|
||||
ZIO_FLAG_PROBE = 1 << 16,
|
||||
ZIO_FLAG_TRYHARD = 1 << 17,
|
||||
ZIO_FLAG_OPTIONAL = 1 << 18,
|
||||
|
||||
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
|
||||
|
||||
/*
|
||||
* Flags not inherited by any children.
|
||||
*/
|
||||
ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
|
||||
ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
|
||||
ZIO_FLAG_IO_BYPASS = 1 << 20,
|
||||
ZIO_FLAG_IO_REWRITE = 1 << 21,
|
||||
ZIO_FLAG_RAW = 1 << 22,
|
||||
ZIO_FLAG_GANG_CHILD = 1 << 23,
|
||||
ZIO_FLAG_DDT_CHILD = 1 << 24,
|
||||
ZIO_FLAG_GODFATHER = 1 << 25,
|
||||
ZIO_FLAG_NOPWRITE = 1 << 26,
|
||||
ZIO_FLAG_REEXECUTED = 1 << 27,
|
||||
ZIO_FLAG_DELEGATED = 1 << 28,
|
||||
ZIO_FLAG_FASTWRITE = 1 << 29,
|
||||
ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
|
||||
ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
|
||||
ZIO_FLAG_IO_BYPASS = 1 << 21,
|
||||
ZIO_FLAG_IO_REWRITE = 1 << 22,
|
||||
ZIO_FLAG_RAW = 1 << 23,
|
||||
ZIO_FLAG_GANG_CHILD = 1 << 24,
|
||||
ZIO_FLAG_DDT_CHILD = 1 << 25,
|
||||
ZIO_FLAG_GODFATHER = 1 << 26,
|
||||
ZIO_FLAG_NOPWRITE = 1 << 27,
|
||||
ZIO_FLAG_REEXECUTED = 1 << 28,
|
||||
ZIO_FLAG_DELEGATED = 1 << 29,
|
||||
ZIO_FLAG_FASTWRITE = 1 << 30
|
||||
};
|
||||
|
||||
#define ZIO_FLAG_MUSTSUCCEED 0
|
||||
@@ -225,6 +226,7 @@ enum zio_wait_type {
|
||||
|
||||
typedef void zio_done_func_t(zio_t *zio);
|
||||
|
||||
extern int zio_dva_throttle_enabled;
|
||||
extern const char *zio_type_name[ZIO_TYPES];
|
||||
|
||||
/*
|
||||
@@ -379,7 +381,6 @@ struct zio {
|
||||
blkptr_t io_bp_copy;
|
||||
list_t io_parent_list;
|
||||
list_t io_child_list;
|
||||
zio_link_t *io_walk_link;
|
||||
zio_t *io_logical;
|
||||
zio_transform_t *io_transform_stack;
|
||||
|
||||
@@ -407,12 +408,14 @@ struct zio {
|
||||
|
||||
uint64_t io_offset;
|
||||
hrtime_t io_timestamp; /* submitted at */
|
||||
hrtime_t io_queued_timestamp;
|
||||
hrtime_t io_target_timestamp;
|
||||
hrtime_t io_delta; /* vdev queue service delta */
|
||||
hrtime_t io_delay; /* Device access time (disk or */
|
||||
/* file). */
|
||||
avl_node_t io_queue_node;
|
||||
avl_node_t io_offset_node;
|
||||
avl_node_t io_alloc_node;
|
||||
|
||||
/* Internal pipeline state */
|
||||
enum zio_flag io_flags;
|
||||
@@ -421,6 +424,7 @@ struct zio {
|
||||
enum zio_flag io_orig_flags;
|
||||
enum zio_stage io_orig_stage;
|
||||
enum zio_stage io_orig_pipeline;
|
||||
enum zio_stage io_pipeline_trace;
|
||||
int io_error;
|
||||
int io_child_error[ZIO_CHILD_TYPES];
|
||||
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
|
||||
@@ -443,6 +447,8 @@ struct zio {
|
||||
taskq_ent_t io_tqent;
|
||||
};
|
||||
|
||||
extern int zio_timestamp_compare(const void *, const void *);
|
||||
|
||||
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
|
||||
zio_done_func_t *done, void *private, enum zio_flag flags);
|
||||
|
||||
@@ -502,8 +508,8 @@ extern void zio_interrupt(zio_t *zio);
|
||||
extern void zio_delay_init(zio_t *zio);
|
||||
extern void zio_delay_interrupt(zio_t *zio);
|
||||
|
||||
extern zio_t *zio_walk_parents(zio_t *cio);
|
||||
extern zio_t *zio_walk_children(zio_t *pio);
|
||||
extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
|
||||
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
|
||||
extern zio_t *zio_unique_parent(zio_t *cio);
|
||||
extern void zio_add_child(zio_t *pio, zio_t *cio);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user