2025-01-04 03:04:27 +03:00
|
|
|
// SPDX-License-Identifier: CDDL-1.0
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
2022-07-12 00:16:13 +03:00
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
2008-11-20 23:01:55 +03:00
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
|
2008-11-20 23:01:55 +03:00
|
|
|
* Use is subject to license terms.
|
|
|
|
*/
|
|
|
|
|
2015-12-27 00:10:31 +03:00
|
|
|
/*
|
2019-07-08 23:18:50 +03:00
|
|
|
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
|
2015-12-27 00:10:31 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _DMU_ZFETCH_H
|
|
|
|
#define _DMU_ZFETCH_H
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct dnode; /* so we can reference dnode */
|
|
|
|
|
2020-09-28 03:08:38 +03:00
|
|
|
typedef struct zfetch {
|
|
|
|
kmutex_t zf_lock; /* protects zfetch structure */
|
|
|
|
list_t zf_stream; /* list of zstream_t's */
|
|
|
|
struct dnode *zf_dnode; /* dnode that owns this zfetch */
|
|
|
|
int zf_numstreams; /* number of zstream_t's */
|
|
|
|
} zfetch_t;
|
|
|
|
|
2024-04-09 01:13:27 +03:00
|
|
|
typedef struct zsrange {
|
|
|
|
uint16_t start;
|
|
|
|
uint16_t end;
|
|
|
|
} zsrange_t;
|
|
|
|
|
|
|
|
#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
typedef struct zstream {
|
2024-04-09 01:13:27 +03:00
|
|
|
list_node_t zs_node; /* link for zf_stream */
|
2015-12-27 00:10:31 +03:00
|
|
|
uint64_t zs_blkid; /* expect next access at this blkid */
|
2024-04-09 01:13:27 +03:00
|
|
|
uint_t zs_atime; /* time last prefetch issued */
|
|
|
|
zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */
|
2022-05-25 20:12:52 +03:00
|
|
|
unsigned int zs_pf_dist; /* data prefetch distance in bytes */
|
|
|
|
unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */
|
|
|
|
uint64_t zs_pf_start; /* first data block to prefetch */
|
|
|
|
uint64_t zs_pf_end; /* data block to prefetch up to */
|
|
|
|
uint64_t zs_ipf_start; /* first data block to prefetch L1 */
|
|
|
|
uint64_t zs_ipf_end; /* data block to prefetch L1 up to */
|
Split dmu_zfetch() speculation and execution parts
To make better predictions on parallel workloads dmu_zfetch() should
be called as early as possible to reduce possible request reordering.
In particular, it should be called before dmu_buf_hold_array_by_dnode()
calls dbuf_hold(), which may sleep waiting for indirect blocks, waking
up multiple threads same time on completion, that can significantly
reorder the requests, making the stream look like random. But we
should not issue prefetch requests before the on-demand ones, since
they may get to the disks first despite the I/O scheduler, increasing
on-demand request latency.
This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare()
and dmu_zfetch_run(). The first can be executed as early as needed.
It only updates statistics and makes predictions without issuing any
I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be
called later when all on-demand I/Os are already issued. It even
tracks the activity of other concurrent threads, issuing the prefetch
only when _all_ on-demand requests are issued.
For many years it was a big problem for storage servers, handling
deeper request queues from their clients, having to either serialize
consequential reads to make ZFS prefetcher usable, or execute the
incoming requests as-is and get almost no prefetch from ZFS, relying
only on deep enough prefetch by the clients. Benefits of those ways
varied, but neither was perfect. With this patch deeper queue
sequential read benchmarks with CrystalDiskMark from Windows via
iSCSI to FreeBSD target show me much better throughput with almost
100% prefetcher hit rate, comparing to almost zero before.
While there, I also removed per-stream zs_lock as useless, completely
covered by parent zf_lock. Also I reused zs_blocks refcount to track
zf_stream linkage of the stream, since I believe previous zs_fetch ==
NULL check in dmu_zfetch_stream_done() was racy.
Delete prefetch streams when they reach ends of files. It saves up
to 1KB of RAM per file, plus reduces searches through the stream list.
Block data prefetch (speculation and indirect block prefetch is still
done since they are cheaper) if all dbufs of the stream are already
in DMU cache. First cache miss immediately fires all the prefetch
that would be done for the stream by that time. It saves some CPU
time if same files within DMU cache capacity are read over and over.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #11652
2021-03-20 08:56:11 +03:00
|
|
|
boolean_t zs_missed; /* stream saw cache misses */
|
2022-05-25 20:12:52 +03:00
|
|
|
boolean_t zs_more; /* need more distant prefetch */
|
Split dmu_zfetch() speculation and execution parts
To make better predictions on parallel workloads dmu_zfetch() should
be called as early as possible to reduce possible request reordering.
In particular, it should be called before dmu_buf_hold_array_by_dnode()
calls dbuf_hold(), which may sleep waiting for indirect blocks, waking
up multiple threads same time on completion, that can significantly
reorder the requests, making the stream look like random. But we
should not issue prefetch requests before the on-demand ones, since
they may get to the disks first despite the I/O scheduler, increasing
on-demand request latency.
This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare()
and dmu_zfetch_run(). The first can be executed as early as needed.
It only updates statistics and makes predictions without issuing any
I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be
called later when all on-demand I/Os are already issued. It even
tracks the activity of other concurrent threads, issuing the prefetch
only when _all_ on-demand requests are issued.
For many years it was a big problem for storage servers, handling
deeper request queues from their clients, having to either serialize
consequential reads to make ZFS prefetcher usable, or execute the
incoming requests as-is and get almost no prefetch from ZFS, relying
only on deep enough prefetch by the clients. Benefits of those ways
varied, but neither was perfect. With this patch deeper queue
sequential read benchmarks with CrystalDiskMark from Windows via
iSCSI to FreeBSD target show me much better throughput with almost
100% prefetcher hit rate, comparing to almost zero before.
While there, I also removed per-stream zs_lock as useless, completely
covered by parent zf_lock. Also I reused zs_blocks refcount to track
zf_stream linkage of the stream, since I believe previous zs_fetch ==
NULL check in dmu_zfetch_stream_done() was racy.
Delete prefetch streams when they reach ends of files. It saves up
to 1KB of RAM per file, plus reduces searches through the stream list.
Block data prefetch (speculation and indirect block prefetch is still
done since they are cheaper) if all dbufs of the stream are already
in DMU cache. First cache miss immediately fires all the prefetch
that would be done for the stream by that time. It saves some CPU
time if same files within DMU cache capacity are read over and over.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #11652
2021-03-20 08:56:11 +03:00
|
|
|
zfs_refcount_t zs_callers; /* number of pending callers */
|
|
|
|
/*
|
|
|
|
* Number of stream references: dnode, callers and pending blocks.
|
|
|
|
* The stream memory is freed when the number returns to zero.
|
|
|
|
*/
|
|
|
|
zfs_refcount_t zs_refs;
|
2008-11-20 23:01:55 +03:00
|
|
|
} zstream_t;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void zfetch_init(void);
|
|
|
|
void zfetch_fini(void);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void dmu_zfetch_init(zfetch_t *, struct dnode *);
|
2015-12-27 00:10:31 +03:00
|
|
|
void dmu_zfetch_fini(zfetch_t *);
|
Split dmu_zfetch() speculation and execution parts
To make better predictions on parallel workloads dmu_zfetch() should
be called as early as possible to reduce possible request reordering.
In particular, it should be called before dmu_buf_hold_array_by_dnode()
calls dbuf_hold(), which may sleep waiting for indirect blocks, waking
up multiple threads same time on completion, that can significantly
reorder the requests, making the stream look like random. But we
should not issue prefetch requests before the on-demand ones, since
they may get to the disks first despite the I/O scheduler, increasing
on-demand request latency.
This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare()
and dmu_zfetch_run(). The first can be executed as early as needed.
It only updates statistics and makes predictions without issuing any
I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be
called later when all on-demand I/Os are already issued. It even
tracks the activity of other concurrent threads, issuing the prefetch
only when _all_ on-demand requests are issued.
For many years it was a big problem for storage servers, handling
deeper request queues from their clients, having to either serialize
consequential reads to make ZFS prefetcher usable, or execute the
incoming requests as-is and get almost no prefetch from ZFS, relying
only on deep enough prefetch by the clients. Benefits of those ways
varied, but neither was perfect. With this patch deeper queue
sequential read benchmarks with CrystalDiskMark from Windows via
iSCSI to FreeBSD target show me much better throughput with almost
100% prefetcher hit rate, comparing to almost zero before.
While there, I also removed per-stream zs_lock as useless, completely
covered by parent zf_lock. Also I reused zs_blocks refcount to track
zf_stream linkage of the stream, since I believe previous zs_fetch ==
NULL check in dmu_zfetch_stream_done() was racy.
Delete prefetch streams when they reach ends of files. It saves up
to 1KB of RAM per file, plus reduces searches through the stream list.
Block data prefetch (speculation and indirect block prefetch is still
done since they are cheaper) if all dbufs of the stream are already
in DMU cache. First cache miss immediately fires all the prefetch
that would be done for the stream by that time. It saves some CPU
time if same files within DMU cache capacity are read over and over.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam Moss <c@yotes.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #11652
2021-03-20 08:56:11 +03:00
|
|
|
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
|
|
|
|
boolean_t);
|
Wire O_DIRECT also to Uncached I/O (#17218)
Before Direct I/O was implemented, I've implemented lighter version
I called Uncached I/O. It uses normal DMU/ARC data path with some
optimizations, but evicts data from caches as soon as possible and
reasonable. Originally I wired it only to a primarycache property,
but now completing the integration all the way up to the VFS.
While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations. It require I/Os
to be page aligned, does not allow speculative prefetch, etc. The
Uncached I/O does not have those limitations, but instead require
additional memory copy, though still one less than regular cached
I/O. As such it should fill the gap in between. Considering this
I've disabled annoying EINVAL errors on misaligned requests, adding
a tunable for those who wants to test their applications.
To pass the information between the layers I had to change a number
of APIs. But as side effect upper layers can now control not only
the caching, but also speculative prefetch. I haven't wired it to
VFS yet, since it require looking on some OS specifics. But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
2025-05-14 00:26:55 +03:00
|
|
|
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t,
|
2019-07-08 23:18:50 +03:00
|
|
|
boolean_t);
|
Wire O_DIRECT also to Uncached I/O (#17218)
Before Direct I/O was implemented, I've implemented lighter version
I called Uncached I/O. It uses normal DMU/ARC data path with some
optimizations, but evicts data from caches as soon as possible and
reasonable. Originally I wired it only to a primarycache property,
but now completing the integration all the way up to the VFS.
While Direct I/O has the lowest possible memory bandwidth usage,
it also has a significant number of limitations. It require I/Os
to be page aligned, does not allow speculative prefetch, etc. The
Uncached I/O does not have those limitations, but instead require
additional memory copy, though still one less than regular cached
I/O. As such it should fill the gap in between. Considering this
I've disabled annoying EINVAL errors on misaligned requests, adding
a tunable for those who wants to test their applications.
To pass the information between the layers I had to change a number
of APIs. But as side effect upper layers can now control not only
the caching, but also speculative prefetch. I haven't wired it to
VFS yet, since it require looking on some OS specifics. But while
there I've implemented speculative prefetch of indirect blocks for
Direct I/O, controllable via all the same mechanisms.
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Fixes #17027
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
2025-05-14 00:26:55 +03:00
|
|
|
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
|
|
|
|
boolean_t, boolean_t);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-12-27 00:10:31 +03:00
|
|
|
#endif /* _DMU_ZFETCH_H */
|