mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-04-06 17:49:11 +03:00
zfetch: Don't issue new streams when old have not completed
The current dmu_zfetch code implicitly assumes that I/Os complete within zfetch_min_sec_reap seconds. With async DMU and a read-only workload such as L2ARC rebuild (and thus no exponential backoff in operations from the "write throttle"), it is possible to saturate the drives with I/O requests. These are then effectively compounded with prefetch requests. This change reference-counts streams and prevents them from being recycled after their zfetch_min_sec_reap timeout if they still have outstanding I/Os.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matt Macy <mmacy@FreeBSD.org>
Closes #10900
This commit is contained in:
parent
cf2667759f
commit
af20b97078
@ -309,6 +309,8 @@ typedef struct dbuf_hash_table {
|
|||||||
kmutex_t hash_mutexes[DBUF_MUTEXES];
|
kmutex_t hash_mutexes[DBUF_MUTEXES];
|
||||||
} dbuf_hash_table_t;
|
} dbuf_hash_table_t;
|
||||||
|
|
||||||
|
typedef void (*dbuf_prefetch_fn)(void *, boolean_t);
|
||||||
|
|
||||||
uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
|
uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
|
||||||
const uint64_t offset);
|
const uint64_t offset);
|
||||||
|
|
||||||
@ -324,7 +326,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
|
|||||||
boolean_t fail_sparse, boolean_t fail_uncached,
|
boolean_t fail_sparse, boolean_t fail_uncached,
|
||||||
void *tag, dmu_buf_impl_t **dbp);
|
void *tag, dmu_buf_impl_t **dbp);
|
||||||
|
|
||||||
void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
|
int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
|
||||||
|
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
|
||||||
|
void *arg);
|
||||||
|
int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
|
||||||
zio_priority_t prio, arc_flags_t aflags);
|
zio_priority_t prio, arc_flags_t aflags);
|
||||||
|
|
||||||
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
|
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
|
||||||
|
@ -40,6 +40,13 @@ extern unsigned long zfetch_array_rd_sz;
|
|||||||
|
|
||||||
struct dnode; /* so we can reference dnode */
|
struct dnode; /* so we can reference dnode */
|
||||||
|
|
||||||
|
typedef struct zfetch {
|
||||||
|
kmutex_t zf_lock; /* protects zfetch structure */
|
||||||
|
list_t zf_stream; /* list of zstream_t's */
|
||||||
|
struct dnode *zf_dnode; /* dnode that owns this zfetch */
|
||||||
|
int zf_numstreams; /* number of zstream_t's */
|
||||||
|
} zfetch_t;
|
||||||
|
|
||||||
typedef struct zstream {
|
typedef struct zstream {
|
||||||
uint64_t zs_blkid; /* expect next access at this blkid */
|
uint64_t zs_blkid; /* expect next access at this blkid */
|
||||||
uint64_t zs_pf_blkid; /* next block to prefetch */
|
uint64_t zs_pf_blkid; /* next block to prefetch */
|
||||||
@ -52,15 +59,12 @@ typedef struct zstream {
|
|||||||
|
|
||||||
kmutex_t zs_lock; /* protects stream */
|
kmutex_t zs_lock; /* protects stream */
|
||||||
hrtime_t zs_atime; /* time last prefetch issued */
|
hrtime_t zs_atime; /* time last prefetch issued */
|
||||||
|
hrtime_t zs_start_time; /* start of last prefetch */
|
||||||
list_node_t zs_node; /* link for zf_stream */
|
list_node_t zs_node; /* link for zf_stream */
|
||||||
|
zfetch_t *zs_fetch; /* parent fetch */
|
||||||
|
zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
|
||||||
} zstream_t;
|
} zstream_t;
|
||||||
|
|
||||||
typedef struct zfetch {
|
|
||||||
kmutex_t zf_lock; /* protects zfetch structure */
|
|
||||||
list_t zf_stream; /* list of zstream_t's */
|
|
||||||
struct dnode *zf_dnode; /* dnode that owns this zfetch */
|
|
||||||
} zfetch_t;
|
|
||||||
|
|
||||||
void zfetch_init(void);
|
void zfetch_init(void);
|
||||||
void zfetch_fini(void);
|
void zfetch_fini(void);
|
||||||
|
|
||||||
|
@ -3003,8 +3003,29 @@ typedef struct dbuf_prefetch_arg {
|
|||||||
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
|
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
|
||||||
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
|
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
|
||||||
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
|
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
|
||||||
|
dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
|
||||||
|
void *dpa_arg; /* prefetch completion arg */
|
||||||
} dbuf_prefetch_arg_t;
|
} dbuf_prefetch_arg_t;
|
||||||
|
|
||||||
|
static void
|
||||||
|
dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
|
||||||
|
{
|
||||||
|
if (dpa->dpa_cb != NULL)
|
||||||
|
dpa->dpa_cb(dpa->dpa_arg, io_done);
|
||||||
|
kmem_free(dpa, sizeof (*dpa));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
|
||||||
|
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
|
||||||
|
{
|
||||||
|
dbuf_prefetch_arg_t *dpa = private;
|
||||||
|
|
||||||
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
||||||
|
if (abuf != NULL)
|
||||||
|
arc_buf_destroy(abuf, private);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Actually issue the prefetch read for the block given.
|
* Actually issue the prefetch read for the block given.
|
||||||
*/
|
*/
|
||||||
@ -3017,7 +3038,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
|||||||
SPA_FEATURE_REDACTED_DATASETS));
|
SPA_FEATURE_REDACTED_DATASETS));
|
||||||
|
|
||||||
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
|
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
|
||||||
return;
|
return (dbuf_prefetch_fini(dpa, B_FALSE));
|
||||||
|
|
||||||
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
|
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
|
||||||
arc_flags_t aflags =
|
arc_flags_t aflags =
|
||||||
@ -3031,7 +3052,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
|||||||
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
||||||
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
|
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
|
||||||
ASSERT(dpa->dpa_zio != NULL);
|
ASSERT(dpa->dpa_zio != NULL);
|
||||||
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
|
||||||
|
dbuf_issue_final_prefetch_done, dpa,
|
||||||
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
|
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3051,8 +3073,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|||||||
|
|
||||||
if (abuf == NULL) {
|
if (abuf == NULL) {
|
||||||
ASSERT(zio == NULL || zio->io_error != 0);
|
ASSERT(zio == NULL || zio->io_error != 0);
|
||||||
kmem_free(dpa, sizeof (*dpa));
|
return (dbuf_prefetch_fini(dpa, B_TRUE));
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
ASSERT(zio == NULL || zio->io_error == 0);
|
ASSERT(zio == NULL || zio->io_error == 0);
|
||||||
|
|
||||||
@ -3084,11 +3105,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|||||||
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
|
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
|
||||||
dpa->dpa_curlevel, curblkid, FTAG);
|
dpa->dpa_curlevel, curblkid, FTAG);
|
||||||
if (db == NULL) {
|
if (db == NULL) {
|
||||||
kmem_free(dpa, sizeof (*dpa));
|
|
||||||
arc_buf_destroy(abuf, private);
|
arc_buf_destroy(abuf, private);
|
||||||
return;
|
return (dbuf_prefetch_fini(dpa, B_TRUE));
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) dbuf_read(db, NULL,
|
(void) dbuf_read(db, NULL,
|
||||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
|
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
|
||||||
dbuf_rele(db, FTAG);
|
dbuf_rele(db, FTAG);
|
||||||
@ -3105,11 +3124,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|||||||
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
|
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
|
||||||
SPA_FEATURE_REDACTED_DATASETS));
|
SPA_FEATURE_REDACTED_DATASETS));
|
||||||
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
|
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
|
||||||
kmem_free(dpa, sizeof (*dpa));
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
||||||
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
||||||
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
||||||
dbuf_issue_final_prefetch(dpa, bp);
|
dbuf_issue_final_prefetch(dpa, bp);
|
||||||
kmem_free(dpa, sizeof (*dpa));
|
|
||||||
} else {
|
} else {
|
||||||
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
@ -3139,9 +3157,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|||||||
* complete. Note that the prefetch might fail if the dataset is encrypted and
|
* complete. Note that the prefetch might fail if the dataset is encrypted and
|
||||||
* the encryption key is unmapped before the IO completes.
|
* the encryption key is unmapped before the IO completes.
|
||||||
*/
|
*/
|
||||||
void
|
int
|
||||||
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
|
||||||
arc_flags_t aflags)
|
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
|
||||||
|
void *arg)
|
||||||
{
|
{
|
||||||
blkptr_t bp;
|
blkptr_t bp;
|
||||||
int epbs, nlevels, curlevel;
|
int epbs, nlevels, curlevel;
|
||||||
@ -3151,10 +3170,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
||||||
|
|
||||||
if (blkid > dn->dn_maxblkid)
|
if (blkid > dn->dn_maxblkid)
|
||||||
return;
|
goto no_issue;
|
||||||
|
|
||||||
if (level == 0 && dnode_block_freed(dn, blkid))
|
if (level == 0 && dnode_block_freed(dn, blkid))
|
||||||
return;
|
goto no_issue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This dnode hasn't been written to disk yet, so there's nothing to
|
* This dnode hasn't been written to disk yet, so there's nothing to
|
||||||
@ -3162,11 +3181,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
*/
|
*/
|
||||||
nlevels = dn->dn_phys->dn_nlevels;
|
nlevels = dn->dn_phys->dn_nlevels;
|
||||||
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
|
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
|
||||||
return;
|
goto no_issue;
|
||||||
|
|
||||||
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
||||||
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
|
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
|
||||||
return;
|
goto no_issue;
|
||||||
|
|
||||||
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
|
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
|
||||||
level, blkid);
|
level, blkid);
|
||||||
@ -3176,7 +3195,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
* This dbuf already exists. It is either CACHED, or
|
* This dbuf already exists. It is either CACHED, or
|
||||||
* (we assume) about to be read or filled.
|
* (we assume) about to be read or filled.
|
||||||
*/
|
*/
|
||||||
return;
|
goto no_issue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -3212,7 +3231,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
|
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
|
||||||
SPA_FEATURE_REDACTED_DATASETS));
|
SPA_FEATURE_REDACTED_DATASETS));
|
||||||
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
|
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
|
||||||
return;
|
goto no_issue;
|
||||||
|
|
||||||
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
|
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
|
||||||
|
|
||||||
@ -3230,6 +3249,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
dpa->dpa_dnode = dn;
|
dpa->dpa_dnode = dn;
|
||||||
dpa->dpa_epbs = epbs;
|
dpa->dpa_epbs = epbs;
|
||||||
dpa->dpa_zio = pio;
|
dpa->dpa_zio = pio;
|
||||||
|
dpa->dpa_cb = cb;
|
||||||
|
dpa->dpa_arg = arg;
|
||||||
|
|
||||||
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
|
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
|
||||||
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
|
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
|
||||||
@ -3245,7 +3266,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
if (curlevel == level) {
|
if (curlevel == level) {
|
||||||
ASSERT3U(curblkid, ==, blkid);
|
ASSERT3U(curblkid, ==, blkid);
|
||||||
dbuf_issue_final_prefetch(dpa, &bp);
|
dbuf_issue_final_prefetch(dpa, &bp);
|
||||||
kmem_free(dpa, sizeof (*dpa));
|
|
||||||
} else {
|
} else {
|
||||||
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
@ -3266,6 +3286,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|||||||
* dpa may have already been freed.
|
* dpa may have already been freed.
|
||||||
*/
|
*/
|
||||||
zio_nowait(pio);
|
zio_nowait(pio);
|
||||||
|
return (1);
|
||||||
|
no_issue:
|
||||||
|
if (cb != NULL)
|
||||||
|
cb(arg, B_FALSE);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
||||||
|
arc_flags_t aflags)
|
||||||
|
{
|
||||||
|
|
||||||
|
return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -59,16 +59,29 @@ typedef struct zfetch_stats {
|
|||||||
kstat_named_t zfetchstat_hits;
|
kstat_named_t zfetchstat_hits;
|
||||||
kstat_named_t zfetchstat_misses;
|
kstat_named_t zfetchstat_misses;
|
||||||
kstat_named_t zfetchstat_max_streams;
|
kstat_named_t zfetchstat_max_streams;
|
||||||
|
kstat_named_t zfetchstat_max_completion_us;
|
||||||
|
kstat_named_t zfetchstat_last_completion_us;
|
||||||
|
kstat_named_t zfetchstat_io_issued;
|
||||||
} zfetch_stats_t;
|
} zfetch_stats_t;
|
||||||
|
|
||||||
static zfetch_stats_t zfetch_stats = {
|
static zfetch_stats_t zfetch_stats = {
|
||||||
{ "hits", KSTAT_DATA_UINT64 },
|
{ "hits", KSTAT_DATA_UINT64 },
|
||||||
{ "misses", KSTAT_DATA_UINT64 },
|
{ "misses", KSTAT_DATA_UINT64 },
|
||||||
{ "max_streams", KSTAT_DATA_UINT64 },
|
{ "max_streams", KSTAT_DATA_UINT64 },
|
||||||
|
{ "max_completion_us", KSTAT_DATA_UINT64 },
|
||||||
|
{ "last_completion_us", KSTAT_DATA_UINT64 },
|
||||||
|
{ "io_issued", KSTAT_DATA_UINT64 },
|
||||||
};
|
};
|
||||||
|
|
||||||
#define ZFETCHSTAT_BUMP(stat) \
|
#define ZFETCHSTAT_BUMP(stat) \
|
||||||
atomic_inc_64(&zfetch_stats.stat.value.ui64);
|
atomic_inc_64(&zfetch_stats.stat.value.ui64)
|
||||||
|
#define ZFETCHSTAT_ADD(stat, val) \
|
||||||
|
atomic_add_64(&zfetch_stats.stat.value.ui64, val)
|
||||||
|
#define ZFETCHSTAT_SET(stat, val) \
|
||||||
|
zfetch_stats.stat.value.ui64 = val
|
||||||
|
#define ZFETCHSTAT_GET(stat) \
|
||||||
|
zfetch_stats.stat.value.ui64
|
||||||
|
|
||||||
|
|
||||||
kstat_t *zfetch_ksp;
|
kstat_t *zfetch_ksp;
|
||||||
|
|
||||||
@ -104,8 +117,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
|
|||||||
{
|
{
|
||||||
if (zf == NULL)
|
if (zf == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
zf->zf_dnode = dno;
|
zf->zf_dnode = dno;
|
||||||
|
zf->zf_numstreams = 0;
|
||||||
|
|
||||||
list_create(&zf->zf_stream, sizeof (zstream_t),
|
list_create(&zf->zf_stream, sizeof (zstream_t),
|
||||||
offsetof(zstream_t, zs_node));
|
offsetof(zstream_t, zs_node));
|
||||||
@ -113,13 +126,29 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
|
|||||||
mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
|
mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
dmu_zfetch_stream_fini(zstream_t *zs)
|
||||||
|
{
|
||||||
|
mutex_destroy(&zs->zs_lock);
|
||||||
|
kmem_free(zs, sizeof (*zs));
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
|
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
|
||||||
{
|
{
|
||||||
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
||||||
list_remove(&zf->zf_stream, zs);
|
list_remove(&zf->zf_stream, zs);
|
||||||
mutex_destroy(&zs->zs_lock);
|
dmu_zfetch_stream_fini(zs);
|
||||||
kmem_free(zs, sizeof (*zs));
|
zf->zf_numstreams--;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
|
||||||
|
{
|
||||||
|
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
||||||
|
list_remove(&zf->zf_stream, zs);
|
||||||
|
zs->zs_fetch = NULL;
|
||||||
|
zf->zf_numstreams--;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -133,7 +162,7 @@ dmu_zfetch_fini(zfetch_t *zf)
|
|||||||
|
|
||||||
mutex_enter(&zf->zf_lock);
|
mutex_enter(&zf->zf_lock);
|
||||||
while ((zs = list_head(&zf->zf_stream)) != NULL)
|
while ((zs = list_head(&zf->zf_stream)) != NULL)
|
||||||
dmu_zfetch_stream_remove(zf, zs);
|
dmu_zfetch_stream_orphan(zf, zs);
|
||||||
mutex_exit(&zf->zf_lock);
|
mutex_exit(&zf->zf_lock);
|
||||||
list_destroy(&zf->zf_stream);
|
list_destroy(&zf->zf_stream);
|
||||||
mutex_destroy(&zf->zf_lock);
|
mutex_destroy(&zf->zf_lock);
|
||||||
@ -151,7 +180,7 @@ static void
|
|||||||
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||||
{
|
{
|
||||||
zstream_t *zs_next;
|
zstream_t *zs_next;
|
||||||
int numstreams = 0;
|
hrtime_t now = gethrtime();
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
||||||
|
|
||||||
@ -161,11 +190,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
|||||||
for (zstream_t *zs = list_head(&zf->zf_stream);
|
for (zstream_t *zs = list_head(&zf->zf_stream);
|
||||||
zs != NULL; zs = zs_next) {
|
zs != NULL; zs = zs_next) {
|
||||||
zs_next = list_next(&zf->zf_stream, zs);
|
zs_next = list_next(&zf->zf_stream, zs);
|
||||||
if (((gethrtime() - zs->zs_atime) / NANOSEC) >
|
/*
|
||||||
|
* Skip streams that still have pending blocks (zs_blocks != 0); they must not be reaped
|
||||||
|
*/
|
||||||
|
if (zfs_refcount_count(&zs->zs_blocks) != 0)
|
||||||
|
continue;
|
||||||
|
if (((now - zs->zs_atime) / NANOSEC) >
|
||||||
zfetch_min_sec_reap)
|
zfetch_min_sec_reap)
|
||||||
dmu_zfetch_stream_remove(zf, zs);
|
dmu_zfetch_stream_remove(zf, zs);
|
||||||
else
|
|
||||||
numstreams++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -179,7 +211,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
|||||||
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
|
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
|
||||||
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
|
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
|
||||||
zfetch_max_distance));
|
zfetch_max_distance));
|
||||||
if (numstreams >= max_streams) {
|
if (zf->zf_numstreams >= max_streams) {
|
||||||
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
|
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -188,12 +220,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
|||||||
zs->zs_blkid = blkid;
|
zs->zs_blkid = blkid;
|
||||||
zs->zs_pf_blkid = blkid;
|
zs->zs_pf_blkid = blkid;
|
||||||
zs->zs_ipf_blkid = blkid;
|
zs->zs_ipf_blkid = blkid;
|
||||||
zs->zs_atime = gethrtime();
|
zs->zs_atime = now;
|
||||||
|
zs->zs_fetch = zf;
|
||||||
|
zfs_refcount_create(&zs->zs_blocks);
|
||||||
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
|
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||||
|
zf->zf_numstreams++;
|
||||||
list_insert_head(&zf->zf_stream, zs);
|
list_insert_head(&zf->zf_stream, zs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
|
||||||
|
{
|
||||||
|
zstream_t *zs = arg;
|
||||||
|
|
||||||
|
if (zs->zs_start_time && io_issued) {
|
||||||
|
hrtime_t now = gethrtime();
|
||||||
|
hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
|
||||||
|
|
||||||
|
zs->zs_start_time = 0;
|
||||||
|
ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
|
||||||
|
if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
|
||||||
|
ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The parent fetch structure has gone away
|
||||||
|
*/
|
||||||
|
if (zs->zs_fetch == NULL)
|
||||||
|
dmu_zfetch_stream_fini(zs);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the predictive prefetch entry point. It associates dnode access
|
* This is the predictive prefetch entry point. It associates dnode access
|
||||||
* specified with blkid and nblks arguments with prefetch stream, predicts
|
* specified with blkid and nblks arguments with prefetch stream, predicts
|
||||||
@ -209,7 +268,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
|||||||
zstream_t *zs;
|
zstream_t *zs;
|
||||||
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
|
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
|
||||||
int64_t pf_ahead_blks, max_blks;
|
int64_t pf_ahead_blks, max_blks;
|
||||||
int epbs, max_dist_blks, pf_nblks, ipf_nblks;
|
int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
|
||||||
uint64_t end_of_access_blkid;
|
uint64_t end_of_access_blkid;
|
||||||
end_of_access_blkid = blkid + nblks;
|
end_of_access_blkid = blkid + nblks;
|
||||||
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
|
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
|
||||||
@ -230,11 +289,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
|||||||
* As a fast path for small (single-block) files, ignore access
|
* As a fast path for small (single-block) files, ignore access
|
||||||
* to the first block.
|
* to the first block.
|
||||||
*/
|
*/
|
||||||
if (blkid == 0)
|
if (!have_lock && blkid == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!have_lock)
|
if (!have_lock)
|
||||||
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
|
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A fast path for small files for which no prefetch will
|
||||||
|
* happen.
|
||||||
|
*/
|
||||||
|
if (zf->zf_dnode->dn_maxblkid < 2) {
|
||||||
|
if (!have_lock)
|
||||||
|
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
mutex_enter(&zf->zf_lock);
|
mutex_enter(&zf->zf_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -343,9 +412,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
|||||||
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
|
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
|
||||||
|
|
||||||
zs->zs_atime = gethrtime();
|
zs->zs_atime = gethrtime();
|
||||||
|
/* no prior reads in progress */
|
||||||
|
if (zfs_refcount_count(&zs->zs_blocks) == 0)
|
||||||
|
zs->zs_start_time = zs->zs_atime;
|
||||||
zs->zs_blkid = end_of_access_blkid;
|
zs->zs_blkid = end_of_access_blkid;
|
||||||
|
zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
|
||||||
|
NULL);
|
||||||
mutex_exit(&zs->zs_lock);
|
mutex_exit(&zs->zs_lock);
|
||||||
mutex_exit(&zf->zf_lock);
|
mutex_exit(&zf->zf_lock);
|
||||||
|
issued = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* dbuf_prefetch() is asynchronous (even when it needs to read
|
* dbuf_prefetch() is asynchronous (even when it needs to read
|
||||||
@ -354,16 +429,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
for (int i = 0; i < pf_nblks; i++) {
|
for (int i = 0; i < pf_nblks; i++) {
|
||||||
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
|
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
|
||||||
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
|
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
|
||||||
|
dmu_zfetch_stream_done, zs);
|
||||||
}
|
}
|
||||||
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
|
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
|
||||||
dbuf_prefetch(zf->zf_dnode, 1, iblk,
|
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
|
||||||
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
|
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
|
||||||
|
dmu_zfetch_stream_done, zs);
|
||||||
}
|
}
|
||||||
if (!have_lock)
|
if (!have_lock)
|
||||||
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
||||||
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
||||||
|
|
||||||
|
if (issued)
|
||||||
|
ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* BEGIN CSTYLED */
|
/* BEGIN CSTYLED */
|
||||||
|
Loading…
Reference in New Issue
Block a user