OpenZFS 6322 - ZFS indirect block predictive prefetch

For quite some time I was thinking about the possibility of prefetching
ZFS indirection tables while doing sequential reads or writes.
Recent changes in the predictive prefetcher made that much easier to
do. My tests on a zvol with 16KB block size on a 5x striped and 2x
mirrored pool of 10 disks show almost double the throughput on sequential
read, and almost triple on sequential rewrite. While for reads a similar
effect can be achieved by increasing the maximal prefetch distance
(though at a higher memory cost), for rewrites there is no other
solution so far.

Authored by: Alexander Motin <mav@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/6322
OpenZFS-commit: https://github.com/illumos/illumos-gate/commit/cb92f413
Closes #5040

Porting notes:
- Change from upstream in module/zfs/dbuf.c in 'int dbuf_read' due
  to commit 5f6d0b6 'Handle block pointers with a corrupt logical size'

- Difference from upstream in module/zfs/dmu_zfetch.c,
  uint32_t zfetch_max_idistance -> unsigned int zfetch_max_idistance

- Variables have been initialized at the beginning of the function
 (void dmu_zfetch) to resemble the order of occurrence and account
 for C99, C11 mode errors.
This commit is contained in:
Alexander Motin 2016-08-29 23:36:39 +02:00 committed by Brian Behlendorf
parent 98ace739bd
commit 755065f3dc
5 changed files with 91 additions and 26 deletions

View File

@ -43,6 +43,13 @@ struct dnode; /* so we can reference dnode */
typedef struct zstream { typedef struct zstream {
uint64_t zs_blkid; /* expect next access at this blkid */ uint64_t zs_blkid; /* expect next access at this blkid */
uint64_t zs_pf_blkid; /* next block to prefetch */ uint64_t zs_pf_blkid; /* next block to prefetch */
/*
* We will next prefetch the L1 indirect block of this level-0
* block id.
*/
uint64_t zs_ipf_blkid;
kmutex_t zs_lock; /* protects stream */ kmutex_t zs_lock; /* protects stream */
hrtime_t zs_atime; /* time last prefetch issued */ hrtime_t zs_atime; /* time last prefetch issued */
list_node_t zs_node; /* link for zf_stream */ list_node_t zs_node; /* link for zf_stream */
@ -59,7 +66,7 @@ void zfetch_fini(void);
void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *); void dmu_zfetch_fini(zfetch_t *);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -349,6 +349,15 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
#define DNODE_META_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
/* /*

View File

@ -844,7 +844,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (db->db_state == DB_CACHED) { if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (prefetch) if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);
@ -859,7 +859,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
/* dbuf_read_impl has dropped db_mtx for us */ /* dbuf_read_impl has dropped db_mtx for us */
if (!err && prefetch) if (!err && prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
@ -878,7 +878,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/ */
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (prefetch) if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0) if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db); DB_DNODE_EXIT(db);

View File

@ -485,9 +485,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp[i] = &db->db; dbp[i] = &db->db;
} }
if ((flags & DMU_READ_NO_PREFETCH) == 0 && read && if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
length <= zfetch_array_rd_sz) { DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks); dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
read && DNODE_IS_CACHEABLE(dn));
} }
rw_exit(&dn->dn_struct_rwlock); rw_exit(&dn->dn_struct_rwlock);

View File

@ -50,6 +50,8 @@ unsigned int zfetch_max_streams = 8;
unsigned int zfetch_min_sec_reap = 2; unsigned int zfetch_min_sec_reap = 2;
/* max bytes to prefetch per stream (default 8MB) */ /* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024; unsigned int zfetch_max_distance = 8 * 1024 * 1024;
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
/* max number of bytes in an array_read in which we allow prefetching (1MB) */ /* max number of bytes in an array_read in which we allow prefetching (1MB) */
unsigned long zfetch_array_rd_sz = 1024 * 1024; unsigned long zfetch_array_rd_sz = 1024 * 1024;
@ -189,6 +191,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
zs->zs_blkid = blkid; zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid; zs->zs_pf_blkid = blkid;
zs->zs_ipf_blkid = blkid;
zs->zs_atime = gethrtime(); zs->zs_atime = gethrtime();
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
@ -196,16 +199,22 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
} }
/* /*
* This is the prefetch entry point. It calls all of the other dmu_zfetch * This is the predictive prefetch entry point. It associates dnode access
* routines to create, delete, find, or operate upon prefetch streams. * specified with blkid and nblks arguments with prefetch stream, predicts
* further accesses based on that stats and initiates speculative prefetch.
* fetch_data argument specifies whether actual data blocks should be fetched:
* FALSE -- prefetch only indirect blocks for predicted data blocks;
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/ */
void void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks) dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{ {
zstream_t *zs; zstream_t *zs;
int64_t pf_start; int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
int pf_nblks; int64_t pf_ahead_blks, max_blks, iblk;
int i; int epbs, max_dist_blks, pf_nblks, ipf_nblks, i;
uint64_t end_of_access_blkid;
end_of_access_blkid = blkid + nblks;
if (zfs_prefetch_disable) if (zfs_prefetch_disable)
return; return;
@ -242,7 +251,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
*/ */
ZFETCHSTAT_BUMP(zfetchstat_misses); ZFETCHSTAT_BUMP(zfetchstat_misses);
if (rw_tryupgrade(&zf->zf_rwlock)) if (rw_tryupgrade(&zf->zf_rwlock))
dmu_zfetch_stream_create(zf, blkid + nblks); dmu_zfetch_stream_create(zf, end_of_access_blkid);
rw_exit(&zf->zf_rwlock); rw_exit(&zf->zf_rwlock);
return; return;
} }
@ -254,36 +263,75 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
* Normally, we start prefetching where we stopped * Normally, we start prefetching where we stopped
* prefetching last (zs_pf_blkid). But when we get our first * prefetching last (zs_pf_blkid). But when we get our first
* hit on this stream, zs_pf_blkid == zs_blkid, we don't * hit on this stream, zs_pf_blkid == zs_blkid, we don't
* want to prefetch to block we just accessed. In this case, * want to prefetch the block we just accessed. In this case,
* start just after the block we just accessed. * start just after the block we just accessed.
*/ */
pf_start = MAX(zs->zs_pf_blkid, blkid + nblks); pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
/* /*
* Double our amount of prefetched data, but don't let the * Double our amount of prefetched data, but don't let the
* prefetch get further ahead than zfetch_max_distance. * prefetch get further ahead than zfetch_max_distance.
*/ */
pf_nblks = if (fetch_data) {
MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks, max_dist_blks =
zs->zs_blkid + nblks + zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
(zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start); /*
* Previously, we were (zs_pf_blkid - blkid) ahead. We
* want to now be double that, so read that amount again,
* plus the amount we are catching up by (i.e. the amount
* read just now).
*/
pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
pf_nblks = MIN(pf_ahead_blks, max_blks);
} else {
pf_nblks = 0;
}
zs->zs_pf_blkid = pf_start + pf_nblks; zs->zs_pf_blkid = pf_start + pf_nblks;
zs->zs_atime = gethrtime();
zs->zs_blkid = blkid + nblks;
/* /*
* dbuf_prefetch() issues the prefetch i/o * Do the same for indirects, starting from where we stopped last,
* asynchronously, but it may need to wait for an * or where we will stop reading data blocks (and the indirects
* indirect block to be read from disk. Therefore * that point to them).
* we do not want to hold any locks while we call it.
*/ */
ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
/*
* We want to double our distance ahead of the data prefetch
* (or reader, if we are not prefetching data). Previously, we
* were (zs_ipf_blkid - blkid) ahead. To double that, we read
* that amount again, plus the amount we are catching up by
* (i.e. the amount read now + the amount of data prefetched now).
*/
pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
ipf_nblks = MIN(pf_ahead_blks, max_blks);
zs->zs_ipf_blkid = ipf_start + ipf_nblks;
epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
zs->zs_atime = gethrtime();
zs->zs_blkid = end_of_access_blkid;
mutex_exit(&zs->zs_lock); mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock); rw_exit(&zf->zf_rwlock);
/*
* dbuf_prefetch() is asynchronous (even when it needs to read
* indirect blocks), but we still prefer to drop our locks before
* calling it to reduce the time we hold them.
*/
for (i = 0; i < pf_nblks; i++) { for (i = 0; i < pf_nblks; i++) {
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
} }
for (iblk = ipf_istart; iblk < ipf_iend; iblk++) {
dbuf_prefetch(zf->zf_dnode, 1, iblk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
ZFETCHSTAT_BUMP(zfetchstat_hits); ZFETCHSTAT_BUMP(zfetchstat_hits);
} }