2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com>
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 22:37:06 +04:00
|
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
2013-03-08 22:41:28 +04:00
|
|
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
2013-08-02 00:02:10 +04:00
|
|
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
2010-08-26 22:49:16 +04:00
|
|
|
#include <sys/arc.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/dmu.h>
|
2013-07-29 22:58:53 +04:00
|
|
|
#include <sys/dmu_send.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/dmu_impl.h>
|
|
|
|
#include <sys/dbuf.h>
|
|
|
|
#include <sys/dmu_objset.h>
|
|
|
|
#include <sys/dsl_dataset.h>
|
|
|
|
#include <sys/dsl_dir.h>
|
|
|
|
#include <sys/dmu_tx.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/dmu_zfetch.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/sa.h>
|
|
|
|
#include <sys/sa_impl.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
struct dbuf_hold_impl_data {
|
|
|
|
/* Function arguments */
|
|
|
|
dnode_t *dh_dn;
|
|
|
|
uint8_t dh_level;
|
|
|
|
uint64_t dh_blkid;
|
|
|
|
int dh_fail_sparse;
|
|
|
|
void *dh_tag;
|
|
|
|
dmu_buf_impl_t **dh_dbp;
|
|
|
|
/* Local variables */
|
|
|
|
dmu_buf_impl_t *dh_db;
|
|
|
|
dmu_buf_impl_t *dh_parent;
|
|
|
|
blkptr_t *dh_bp;
|
|
|
|
int dh_err;
|
|
|
|
dbuf_dirty_record_t *dh_dr;
|
|
|
|
arc_buf_contents_t dh_type;
|
|
|
|
int dh_depth;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
|
|
|
|
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
|
|
|
|
void *tag, dmu_buf_impl_t **dbp, int depth);
|
|
|
|
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
|
|
|
|
|
2013-08-21 08:11:52 +04:00
|
|
|
/*
 * Counter of how many times zfs_free_range() had to take its slow path
 * during a zfs receive.  Any nonzero value hints at a potential
 * performance problem.
 */
uint64_t zfs_free_range_recv_miss;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void dbuf_destroy(dmu_buf_impl_t *db);
|
2013-09-04 16:00:57 +04:00
|
|
|
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
2008-12-03 23:09:06 +03:00
|
|
|
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Global data structures and functions for the dbuf cache.
|
|
|
|
*/
|
|
|
|
static kmem_cache_t *dbuf_cache;
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
|
|
|
dbuf_cons(void *vdb, void *unused, int kmflag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
bzero(db, sizeof (dmu_buf_impl_t));
|
|
|
|
|
|
|
|
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
|
|
|
|
refcount_create(&db->db_holds);
|
2010-08-26 21:26:44 +04:00
|
|
|
list_link_init(&db->db_link);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_dest(void *vdb, void *unused)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
mutex_destroy(&db->db_mtx);
|
|
|
|
cv_destroy(&db->db_changed);
|
|
|
|
refcount_destroy(&db->db_holds);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dbuf hash table routines
|
|
|
|
*/
|
|
|
|
static dbuf_hash_table_t dbuf_hash_table;
|
|
|
|
|
|
|
|
static uint64_t dbuf_hash_count;
|
|
|
|
|
|
|
|
static uint64_t
|
|
|
|
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
|
|
|
|
{
|
|
|
|
uintptr_t osv = (uintptr_t)os;
|
|
|
|
uint64_t crc = -1ULL;
|
|
|
|
|
|
|
|
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
|
|
|
|
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
|
|
|
|
|
|
|
|
crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
|
|
|
|
|
|
|
|
return (crc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Hash of a dbuf identity; thin wrapper around dbuf_hash().  The expansion
 * deliberately carries no trailing semicolon so the macro can be used
 * anywhere an expression is expected (initializers, conditions, arguments).
 */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
|
|
|
|
|
|
|
|
/*
 * True when 'dbuf' has exactly the given identity.  The comparisons are
 * made in the order object, objset, level, block id.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)	\
	((dbuf)->db.db_object == (obj) &&	\
	(dbuf)->db_objset == (os) &&		\
	(dbuf)->db_level == (level) &&		\
	(dbuf)->db_blkid == (blkid))
|
|
|
|
|
|
|
|
dmu_buf_impl_t *
|
|
|
|
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-05-29 00:45:14 +04:00
|
|
|
objset_t *os = dn->dn_objset;
|
2010-08-26 20:52:39 +04:00
|
|
|
uint64_t obj;
|
|
|
|
uint64_t hv;
|
|
|
|
uint64_t idx;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
obj = dn->dn_object;
|
|
|
|
hv = DBUF_HASH(os, obj, level, blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
|
|
|
|
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (db->db_state != DB_EVICTING) {
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (db);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert an entry into the hash table. If there is already an element
|
|
|
|
* equal to elem in the hash table, then the already existing element
|
|
|
|
* will be returned and the new element will not be inserted.
|
|
|
|
* Otherwise returns NULL.
|
|
|
|
*/
|
|
|
|
static dmu_buf_impl_t *
|
|
|
|
dbuf_hash_insert(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-05-29 00:45:14 +04:00
|
|
|
objset_t *os = db->db_objset;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t obj = db->db.db_object;
|
|
|
|
int level = db->db_level;
|
2010-08-26 20:52:39 +04:00
|
|
|
uint64_t blkid, hv, idx;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *dbf;
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
blkid = db->db_blkid;
|
|
|
|
hv = DBUF_HASH(os, obj, level, blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
|
|
|
|
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
|
|
|
|
mutex_enter(&dbf->db_mtx);
|
|
|
|
if (dbf->db_state != DB_EVICTING) {
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
return (dbf);
|
|
|
|
}
|
|
|
|
mutex_exit(&dbf->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
db->db_hash_next = h->hash_table[idx];
|
|
|
|
h->hash_table[idx] = db;
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
atomic_add_64(&dbuf_hash_count, 1);
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove an entry from the hash table. This operation will
|
|
|
|
* fail if there are any existing holds on the db.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_hash_remove(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
2010-08-26 20:52:39 +04:00
|
|
|
uint64_t hv, idx;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *dbf, **dbp;
|
|
|
|
|
2010-08-26 20:52:39 +04:00
|
|
|
hv = DBUF_HASH(db->db_objset, db->db.db_object,
|
|
|
|
db->db_level, db->db_blkid);
|
|
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* We musn't hold db_mtx to maintin lock ordering:
|
|
|
|
* DBUF_HASH_MUTEX > db_mtx.
|
|
|
|
*/
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
ASSERT(db->db_state == DB_EVICTING);
|
|
|
|
ASSERT(!MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
dbp = &h->hash_table[idx];
|
|
|
|
while ((dbf = *dbp) != db) {
|
|
|
|
dbp = &dbf->db_hash_next;
|
|
|
|
ASSERT(dbf != NULL);
|
|
|
|
}
|
|
|
|
*dbp = db->db_hash_next;
|
|
|
|
db->db_hash_next = NULL;
|
|
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
|
|
atomic_add_64(&dbuf_hash_count, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARC eviction callback installed on dbuf data buffers by dbuf_set_data(). */
static arc_evict_func_t dbuf_do_evict;
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_evict_user(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
if (db->db_level != 0 || db->db_evict_func == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (db->db_user_data_ptr_ptr)
|
|
|
|
*db->db_user_data_ptr_ptr = db->db.db_data;
|
|
|
|
db->db_evict_func(&db->db, db->db_user_ptr);
|
|
|
|
db->db_user_ptr = NULL;
|
|
|
|
db->db_user_data_ptr_ptr = NULL;
|
|
|
|
db->db_evict_func = NULL;
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
boolean_t
|
|
|
|
dbuf_is_metadata(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
if (db->db_level > 0) {
|
|
|
|
return (B_TRUE);
|
|
|
|
} else {
|
|
|
|
boolean_t is_metadata;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
2012-12-14 03:24:15 +04:00
|
|
|
is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
|
|
|
return (is_metadata);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
|
|
|
dbuf_evict(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
dbuf_clear(db);
|
|
|
|
dbuf_destroy(db);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_init(void)
|
|
|
|
{
|
|
|
|
uint64_t hsize = 1ULL << 16;
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The hash table is big enough to fill all of physical memory
|
|
|
|
* with an average 4K block size. The table will take up
|
|
|
|
* totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
|
|
|
|
*/
|
|
|
|
while (hsize * 4096 < physmem * PAGESIZE)
|
|
|
|
hsize <<= 1;
|
|
|
|
|
|
|
|
retry:
|
|
|
|
h->hash_table_mask = hsize - 1;
|
2010-08-26 22:46:09 +04:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
|
|
|
/* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_alloc() in the linux kernel */
|
2012-05-07 21:49:51 +04:00
|
|
|
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
|
2010-08-26 22:46:09 +04:00
|
|
|
#else
|
2008-11-20 23:01:55 +03:00
|
|
|
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
|
2010-08-26 22:46:09 +04:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
if (h->hash_table == NULL) {
|
|
|
|
/* XXX - we should really return an error instead of assert */
|
|
|
|
ASSERT(hsize > (1ULL << 10));
|
|
|
|
hsize >>= 1;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
|
|
|
|
sizeof (dmu_buf_impl_t),
|
|
|
|
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
|
|
|
|
|
|
|
|
for (i = 0; i < DBUF_MUTEXES; i++)
|
|
|
|
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
|
2013-10-03 04:11:19 +04:00
|
|
|
|
|
|
|
dbuf_stats_init(h);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_fini(void)
|
|
|
|
{
|
|
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
int i;
|
|
|
|
|
2013-10-03 04:11:19 +04:00
|
|
|
dbuf_stats_destroy();
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
for (i = 0; i < DBUF_MUTEXES; i++)
|
|
|
|
mutex_destroy(&h->hash_mutexes[i]);
|
2010-08-26 22:46:09 +04:00
|
|
|
#if defined(_KERNEL) && defined(HAVE_SPL)
|
|
|
|
/* Large allocations which do not require contiguous pages
|
|
|
|
* should be using vmem_free() in the linux kernel */
|
|
|
|
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
|
|
|
#else
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
2010-08-26 22:46:09 +04:00
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_cache_destroy(dbuf_cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Other stuff.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf against its dnode, parent and
 * dirty records.  Returns immediately unless ZFS_DEBUG_DBUF_VERIFY is set
 * in zfs_flags.  The caller must hold db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* Identity must agree with the owning dnode, when there is one. */
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}

	/* Offset and size expectations depend on the kind of block. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record on either chain must point back at this dbuf. */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			ASSERTV(int epb = db->db_parent->db.db_size >>
			    SPA_BLKPTRSHIFT);
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			ASSERTV(uint64_t *buf = db->db.db_data);
			int i;

			/* Clean and unbacked: every 64-bit word must be 0. */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_update_data(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
*db->db_user_data_ptr_ptr = db->db.db_data;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
|
|
|
|
{
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
|
|
|
|
db->db_buf = buf;
|
|
|
|
if (buf != NULL) {
|
|
|
|
ASSERT(buf->b_data != NULL);
|
|
|
|
db->db.db_data = buf->b_data;
|
|
|
|
if (!arc_released(buf))
|
|
|
|
arc_set_callback(buf, dbuf_do_evict, db);
|
|
|
|
dbuf_update_data(db);
|
|
|
|
} else {
|
|
|
|
dbuf_evict_user(db);
|
|
|
|
db->db.db_data = NULL;
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_state != DB_NOFILL)
|
|
|
|
db->db_state = DB_UNCACHED;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Loan out an arc_buf for read. Return the loaned arc_buf.
|
|
|
|
*/
|
|
|
|
arc_buf_t *
|
|
|
|
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
arc_buf_t *abuf;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
|
|
|
|
int blksz = db->db.db_size;
|
2010-08-27 01:24:34 +04:00
|
|
|
spa_t *spa;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&db->db_mtx);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_GET_SPA(&spa, db);
|
|
|
|
abuf = arc_loan_buf(spa, blksz);
|
2010-05-29 00:45:14 +04:00
|
|
|
bcopy(db->db.db_data, abuf->b_data, blksz);
|
|
|
|
} else {
|
|
|
|
abuf = db->db_buf;
|
|
|
|
arc_loan_inuse_buf(abuf, db);
|
|
|
|
dbuf_set_data(db, NULL);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
return (abuf);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t
|
|
|
|
dbuf_whichblock(dnode_t *dn, uint64_t offset)
|
|
|
|
{
|
|
|
|
if (dn->dn_datablkshift) {
|
|
|
|
return (offset >> dn->dn_datablkshift);
|
|
|
|
} else {
|
|
|
|
ASSERT3U(offset, <, dn->dn_datablksz);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
ASSERT3U(db->db_state, ==, DB_READ);
|
|
|
|
/*
|
|
|
|
* All reads are synchronous, so we must have a hold on the dbuf
|
|
|
|
*/
|
|
|
|
ASSERT(refcount_count(&db->db_holds) > 0);
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
if (db->db_level == 0 && db->db_freed_in_flight) {
|
|
|
|
/* we were freed in flight; disregard any error */
|
|
|
|
arc_release(buf, db);
|
|
|
|
bzero(buf->b_data, db->db.db_size);
|
|
|
|
arc_buf_freeze(buf);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
} else if (zio == NULL || zio->io_error == 0) {
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
}
|
|
|
|
cv_broadcast(&db->db_changed);
|
2010-05-29 00:45:14 +04:00
|
|
|
dbuf_rele_and_unlock(db, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
spa_t *spa;
|
2008-11-20 23:01:55 +03:00
|
|
|
zbookmark_t zb;
|
|
|
|
uint32_t aflags = ARC_NOWAIT;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
/* We need the struct_rwlock to prevent db_blkptr from changing. */
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db_state == DB_UNCACHED);
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2009-07-03 02:44:48 +04:00
|
|
|
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT3U(bonuslen, <=, db->db.db_size);
|
|
|
|
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bonuslen < DN_MAX_BONUSLEN)
|
|
|
|
bzero(db->db.db_data, DN_MAX_BONUSLEN);
|
2009-07-03 02:44:48 +04:00
|
|
|
if (bonuslen)
|
|
|
|
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_update_data(db);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
|
|
|
|
* processes the delete record and clears the bp while we are waiting
|
|
|
|
* for the dn_mtx (resulting in a "no" from block_freed).
|
|
|
|
*/
|
|
|
|
if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
|
|
|
|
(db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
|
|
|
|
BP_IS_HOLE(db->db_blkptr)))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size, db, type));
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
*flags |= DB_RF_CACHED;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
spa = dn->dn_objset->os_spa;
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state = DB_READ;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (DBUF_IS_L2CACHEABLE(db))
|
|
|
|
aflags |= ARC_L2CACHE;
|
2013-08-02 00:02:10 +04:00
|
|
|
if (DBUF_IS_L2COMPRESSIBLE(db))
|
|
|
|
aflags |= ARC_L2COMPRESS;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
|
|
|
|
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
|
|
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dbuf_add_ref(db, NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
(void) arc_read(zio, spa, db->db_blkptr,
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
|
|
|
|
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
|
|
|
|
&aflags, &zb);
|
|
|
|
if (aflags & ARC_CACHED)
|
|
|
|
*flags |= DB_RF_CACHED;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
|
|
|
{
|
|
|
|
int err = 0;
|
|
|
|
int havepzio = (zio != NULL);
|
|
|
|
int prefetch;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't have to hold the mutex to check db_state because it
|
|
|
|
* can't be freed while we have a hold on the buffer.
|
|
|
|
*/
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_state == DB_NOFILL)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EIO));
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-27 01:24:34 +04:00
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
2010-08-27 01:24:34 +04:00
|
|
|
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
|
2008-12-03 23:09:06 +03:00
|
|
|
DBUF_IS_CACHEABLE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (prefetch)
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size, TRUE);
|
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-27 01:24:34 +04:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else if (db->db_state == DB_UNCACHED) {
|
2010-08-27 01:24:34 +04:00
|
|
|
spa_t *spa = dn->dn_objset->os_spa;
|
|
|
|
|
|
|
|
if (zio == NULL)
|
|
|
|
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_read_impl(db, zio, &flags);
|
|
|
|
|
|
|
|
/* dbuf_read_impl has dropped db_mtx for us */
|
|
|
|
|
|
|
|
if (prefetch)
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size, flags & DB_RF_CACHED);
|
|
|
|
|
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-27 01:24:34 +04:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (!havepzio)
|
|
|
|
err = zio_wait(zio);
|
|
|
|
} else {
|
2013-06-11 21:12:34 +04:00
|
|
|
/*
|
|
|
|
* Another reader came in while the dbuf was in flight
|
|
|
|
* between UNCACHED and CACHED. Either a writer will finish
|
|
|
|
* writing the buffer (sending the dbuf to CACHED) or the
|
|
|
|
* first reader's request will reach the read_done callback
|
|
|
|
* and send the dbuf to CACHED. Otherwise, a failure
|
|
|
|
* occurred and the dbuf went to UNCACHED.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (prefetch)
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size, TRUE);
|
|
|
|
if ((flags & DB_RF_HAVESTRUCT) == 0)
|
2010-08-27 01:24:34 +04:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Skip the wait per the caller's request. */
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if ((flags & DB_RF_NEVERWAIT) == 0) {
|
|
|
|
while (db->db_state == DB_READ ||
|
|
|
|
db->db_state == DB_FILL) {
|
|
|
|
ASSERT(db->db_state == DB_READ ||
|
|
|
|
(flags & DB_RF_HAVESTRUCT) == 0);
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
}
|
|
|
|
if (db->db_state == DB_UNCACHED)
|
2013-03-08 22:41:28 +04:00
|
|
|
err = SET_ERROR(EIO);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(err || havepzio || db->db_state == DB_CACHED);
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_noread(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
if (db->db_state == DB_UNCACHED) {
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2010-08-27 01:24:34 +04:00
|
|
|
spa_t *spa;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_GET_SPA(&spa, db);
|
|
|
|
dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state = DB_FILL;
|
2008-12-03 23:09:06 +03:00
|
|
|
} else if (db->db_state == DB_NOFILL) {
|
|
|
|
dbuf_set_data(db, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is our just-in-time copy function. It makes a copy of
|
|
|
|
* buffers, that have been modified in a previous transaction
|
|
|
|
* group, before we modify them in the current active group.
|
|
|
|
*
|
|
|
|
* This function is used in two places: when we are dirtying a
|
|
|
|
* buffer for the first time in a txg, and when we are freeing
|
|
|
|
* a range in a dnode that includes this buffer.
|
|
|
|
*
|
|
|
|
* Note that when we are called from dbuf_free_range() we do
|
|
|
|
* not put a hold on the buffer, we just traverse the active
|
|
|
|
* dbuf list for the dnode.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
|
|
|
|
|
|
|
if (dr == NULL ||
|
|
|
|
(dr->dt.dl.dr_data !=
|
2010-05-29 00:45:14 +04:00
|
|
|
((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the last dirty record for this dbuf has not yet synced
|
|
|
|
* and its referencing the dbuf data, either:
|
2010-08-27 01:24:34 +04:00
|
|
|
* reset the reference to point to a new copy,
|
2008-11-20 23:01:55 +03:00
|
|
|
* or (if there a no active holders)
|
|
|
|
* just null out the current db_data pointer.
|
|
|
|
*/
|
|
|
|
ASSERT(dr->dr_txg >= txg - 2);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/* Note that the data bufs here are zio_bufs */
|
|
|
|
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
|
|
|
|
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
|
|
|
|
int size = db->db.db_size;
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2010-08-27 01:24:34 +04:00
|
|
|
spa_t *spa;
|
|
|
|
|
|
|
|
DB_GET_SPA(&spa, db);
|
|
|
|
dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
|
2008-11-20 23:01:55 +03:00
|
|
|
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
|
|
|
|
} else {
|
|
|
|
dbuf_set_data(db, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t txg = dr->dr_txg;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
2008-11-20 23:01:55 +03:00
|
|
|
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
|
|
|
|
return;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_data_pending != dr);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* free this block */
|
2013-05-10 23:47:54 +04:00
|
|
|
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
|
2010-08-27 01:24:34 +04:00
|
|
|
spa_t *spa;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_GET_SPA(&spa, db);
|
|
|
|
zio_free(spa, txg, bp);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
2013-05-10 23:47:54 +04:00
|
|
|
dr->dt.dl.dr_nopwrite = B_FALSE;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Release the already-written buffer, so we leave it in
|
|
|
|
* a consistent dirty state. Note that all callers are
|
|
|
|
* modifying the buffer, so they will immediately do
|
|
|
|
* another (redundant) arc_release(). Therefore, leave
|
|
|
|
* the buf thawed to save the effort of freezing &
|
|
|
|
* immediately re-thawing it.
|
|
|
|
*/
|
|
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* Evict (if its unreferenced) or clear (if its referenced) any level-0
|
|
|
|
* data blocks in the free range, so that any future readers will find
|
2013-07-29 22:58:53 +04:00
|
|
|
* empty blocks. Also, if we happen across any level-1 dbufs in the
|
2008-12-03 23:09:06 +03:00
|
|
|
* range that have not already been marked dirty, mark them dirty so
|
|
|
|
* they stay in memory.
|
2013-07-29 22:58:53 +04:00
|
|
|
*
|
|
|
|
* This is a no-op if the dataset is in the middle of an incremental
|
|
|
|
* receive; see comment below for details.
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db, *db_next;
|
|
|
|
uint64_t txg = tx->tx_txg;
|
2008-12-03 23:09:06 +03:00
|
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
uint64_t first_l1 = start >> epbs;
|
|
|
|
uint64_t last_l1 = end >> epbs;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
end = dn->dn_maxblkid;
|
|
|
|
last_l1 = end >> epbs;
|
|
|
|
}
|
|
|
|
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
|
2013-07-29 22:58:53 +04:00
|
|
|
|
2013-08-21 08:11:52 +04:00
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
|
|
if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
|
|
|
|
/* There can't be any dbufs in this range; no need to search. */
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
|
|
return;
|
|
|
|
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
|
2013-07-29 22:58:53 +04:00
|
|
|
/*
|
2013-08-21 08:11:52 +04:00
|
|
|
* If we are receiving, we expect there to be no dbufs in
|
|
|
|
* the range to be freed, because receive modifies each
|
|
|
|
* block at most once, and in offset order. If this is
|
|
|
|
* not the case, it can lead to performance problems,
|
|
|
|
* so note that we unexpectedly took the slow path.
|
2013-07-29 22:58:53 +04:00
|
|
|
*/
|
2013-08-21 08:11:52 +04:00
|
|
|
atomic_inc_64(&zfs_free_range_recv_miss);
|
2013-07-29 22:58:53 +04:00
|
|
|
}
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
|
2008-11-20 23:01:55 +03:00
|
|
|
db_next = list_next(&dn->dn_dbufs, db);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (db->db_level == 1 &&
|
|
|
|
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (db->db_last_dirty &&
|
|
|
|
db->db_last_dirty->dr_txg < txg) {
|
|
|
|
dbuf_add_ref(db, FTAG);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_will_dirty(db, tx);
|
|
|
|
dbuf_rele(db, FTAG);
|
|
|
|
} else {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level != 0)
|
|
|
|
continue;
|
|
|
|
dprintf_dbuf(db, "found buf %s\n", "");
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_blkid < start || db->db_blkid > end)
|
2008-11-20 23:01:55 +03:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* found a level 0 buffer in the range */
|
2013-09-04 16:00:57 +04:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (dbuf_undirty(db, tx)) {
|
|
|
|
/* mutex has been dropped and dbuf destroyed */
|
2008-11-20 23:01:55 +03:00
|
|
|
continue;
|
2013-09-04 16:00:57 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (db->db_state == DB_UNCACHED ||
|
2008-12-03 23:09:06 +03:00
|
|
|
db->db_state == DB_NOFILL ||
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state == DB_EVICTING) {
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
|
|
|
/* will be handled in dbuf_read_done or dbuf_rele */
|
|
|
|
db->db_freed_in_flight = TRUE;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (refcount_count(&db->db_holds) == 0) {
|
|
|
|
ASSERT(db->db_buf);
|
|
|
|
dbuf_clear(db);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* The dbuf is referenced */
|
|
|
|
|
|
|
|
if (db->db_last_dirty != NULL) {
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
if (dr->dr_txg == txg) {
|
|
|
|
/*
|
|
|
|
* This buffer is "in-use", re-adjust the file
|
|
|
|
* size to reflect that this buffer may
|
|
|
|
* contain new data when we sync.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID &&
|
|
|
|
db->db_blkid > dn->dn_maxblkid)
|
2008-11-20 23:01:55 +03:00
|
|
|
dn->dn_maxblkid = db->db_blkid;
|
|
|
|
dbuf_unoverride(dr);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* This dbuf is not dirty in the open context.
|
|
|
|
* Either uncache it (if its not referenced in
|
|
|
|
* the open context) or reset its contents to
|
|
|
|
* empty.
|
|
|
|
*/
|
|
|
|
dbuf_fix_old_data(db, txg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* clear the contents if its cached */
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
arc_buf_freeze(db->db_buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
dbuf_block_freeable(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
|
|
|
|
uint64_t birth_txg = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't need any locking to protect db_blkptr:
|
|
|
|
* If it's syncing, then db_last_dirty will be set
|
|
|
|
* so we'll ignore db_blkptr.
|
|
|
|
*/
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
if (db->db_last_dirty)
|
|
|
|
birth_txg = db->db_last_dirty->dr_txg;
|
|
|
|
else if (db->db_blkptr)
|
|
|
|
birth_txg = db->db_blkptr->blk_birth;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* If we don't exist or are in a snapshot, we can't be freed.
|
|
|
|
* Don't pass the bp to dsl_dataset_block_freeable() since we
|
|
|
|
* are holding the db_mtx lock and might deadlock if we are
|
|
|
|
* prefetching a dedup-ed block.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
if (birth_txg)
|
|
|
|
return (ds == NULL ||
|
2010-08-27 01:24:34 +04:00
|
|
|
dsl_dataset_block_freeable(ds, NULL, birth_txg));
|
2008-11-20 23:01:55 +03:00
|
|
|
else
|
|
|
|
return (FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
arc_buf_t *buf, *obuf;
|
|
|
|
int osize = db->db.db_size;
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* XXX does *this* func really need the lock? */
|
2010-08-27 01:24:34 +04:00
|
|
|
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
|
|
|
|
* is OK, because there can be no other references to the db
|
|
|
|
* when we are changing its size, so no concurrent DB_FILL can
|
|
|
|
* be happening.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* XXX we should be doing a dbuf_read, checking the return
|
|
|
|
* value and returning that up to our callers
|
|
|
|
*/
|
|
|
|
dbuf_will_dirty(db, tx);
|
|
|
|
|
|
|
|
/* create the data buffer for the new block */
|
2010-08-27 01:24:34 +04:00
|
|
|
buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/* copy old block data to the new block */
|
|
|
|
obuf = db->db_buf;
|
|
|
|
bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
|
|
|
|
/* zero the remainder */
|
|
|
|
if (size > osize)
|
|
|
|
bzero((uint8_t *)buf->b_data + osize, size - osize);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dbuf_set_data(db, buf);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(obuf, db));
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size = size;
|
|
|
|
|
|
|
|
if (db->db_level == 0) {
|
|
|
|
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
|
|
|
|
db->db_last_dirty->dt.dl.dr_data = buf;
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_willuse_space(dn, size-osize, tx);
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
void
|
|
|
|
dbuf_release_bp(dmu_buf_impl_t *db)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
objset_t *os;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_GET_OBJSET(&os, db);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
|
|
|
ASSERT(arc_released(os->os_phys_buf) ||
|
|
|
|
list_link_active(&os->os_dsl_dataset->ds_synced_link));
|
|
|
|
ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
|
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
(void) arc_release(db->db_buf, db);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_dirty_record_t *
|
|
|
|
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_dirty_record_t **drp, *dr;
|
|
|
|
int drop_struct_lock = FALSE;
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t do_free_accounting = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
int txgoff = tx->tx_txg & TXG_MASK;
|
|
|
|
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
DMU_TX_DIRTY_BUF(tx, db);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Shouldn't dirty a regular buffer in syncing context. Private
|
|
|
|
* objects may be dirtied in syncing context, but only if they
|
|
|
|
* were already pre-dirtied in open context.
|
|
|
|
*/
|
|
|
|
ASSERT(!dmu_tx_is_syncing(tx) ||
|
|
|
|
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
|
2009-07-03 02:44:48 +04:00
|
|
|
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
|
|
dn->dn_objset->os_dsl_dataset == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* We make this assert for private objects as well, but after we
|
|
|
|
* check if we're already dirty. They are allowed to re-dirty
|
|
|
|
* in syncing context.
|
|
|
|
*/
|
|
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
/*
|
|
|
|
* XXX make this true for indirects too? The problem is that
|
|
|
|
* transactions created with dmu_tx_create_assigned() from
|
|
|
|
* syncing context don't bother holding ahead.
|
|
|
|
*/
|
|
|
|
ASSERT(db->db_level != 0 ||
|
2008-12-03 23:09:06 +03:00
|
|
|
db->db_state == DB_CACHED || db->db_state == DB_FILL ||
|
|
|
|
db->db_state == DB_NOFILL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
/*
|
|
|
|
* Don't set dirtyctx to SYNC if we're just modifying this as we
|
|
|
|
* initialize the objset.
|
|
|
|
*/
|
|
|
|
if (dn->dn_dirtyctx == DN_UNDIRTIED &&
|
|
|
|
!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
|
|
|
|
dn->dn_dirtyctx =
|
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
|
|
|
|
ASSERT(dn->dn_dirtyctx_firstset == NULL);
|
2011-07-23 00:55:27 +04:00
|
|
|
dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
|
|
dn->dn_have_spill = B_TRUE;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If this buffer is already dirty, we're done.
|
|
|
|
*/
|
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
|
|
|
|
db->db.db_object == DMU_META_DNODE_OBJECT);
|
|
|
|
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
|
|
|
|
drp = &dr->dr_next;
|
|
|
|
if (dr && dr->dr_txg == tx->tx_txg) {
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If this buffer has already been written out,
|
|
|
|
* we now need to reset its state.
|
|
|
|
*/
|
|
|
|
dbuf_unoverride(dr);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
db->db_state != DB_NOFILL)
|
2008-11-20 23:01:55 +03:00
|
|
|
arc_buf_thaw(db->db_buf);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return (dr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only valid if not already dirty.
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(dn->dn_object == 0 ||
|
|
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
2008-11-20 23:01:55 +03:00
|
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
|
|
|
|
ASSERT3U(dn->dn_nlevels, >, db->db_level);
|
|
|
|
ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
|
|
|
|
dn->dn_phys->dn_nlevels > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[txgoff] > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
|
|
|
|
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We should only be dirtying in syncing context if it's the
|
2009-07-03 02:44:48 +04:00
|
|
|
* mos or we're initializing the os or it's a special object.
|
|
|
|
* However, we are allowed to dirty in syncing context provided
|
|
|
|
* we already dirtied it in open context. Hence we must make
|
|
|
|
* this assertion only if we're not already dirty.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-08-27 01:24:34 +04:00
|
|
|
os = dn->dn_objset;
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
|
|
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
|
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Update the accounting.
|
2008-12-03 23:09:06 +03:00
|
|
|
* Note: we delay "free accounting" until after we drop
|
|
|
|
* the db_mtx. This keeps us from grabbing other locks
|
2010-05-29 00:45:14 +04:00
|
|
|
* (and possibly deadlocking) in bp_get_dsize() while
|
2008-12-03 23:09:06 +03:00
|
|
|
* also holding the db_mtx.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
dnode_willuse_space(dn, db->db.db_size, tx);
|
2008-12-03 23:09:06 +03:00
|
|
|
do_free_accounting = dbuf_block_freeable(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is dirty in an old transaction group we need
|
|
|
|
* to make a copy of it so that the changes we make in this
|
|
|
|
* transaction group won't leak out when we sync the older txg.
|
|
|
|
*/
|
2011-07-23 00:55:27 +04:00
|
|
|
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
|
2010-08-26 21:26:44 +04:00
|
|
|
list_link_init(&dr->dr_dirty_node);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
void *data_old = db->db_buf;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
|
|
data_old = db->db.db_data;
|
|
|
|
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
|
|
|
|
/*
|
|
|
|
* Release the data buffer from the cache so
|
|
|
|
* that we can modify it without impacting
|
|
|
|
* possible other users of this cached data
|
|
|
|
* block. Note that indirect blocks and
|
|
|
|
* private objects are not released until the
|
|
|
|
* syncing state (since they are only modified
|
|
|
|
* then).
|
|
|
|
*/
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
|
|
data_old = db->db_buf;
|
|
|
|
}
|
|
|
|
ASSERT(data_old != NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
dr->dt.dl.dr_data = data_old;
|
|
|
|
} else {
|
|
|
|
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
|
|
|
|
list_create(&dr->dt.di.dr_children,
|
|
|
|
sizeof (dbuf_dirty_record_t),
|
|
|
|
offsetof(dbuf_dirty_record_t, dr_dirty_node));
|
|
|
|
}
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
|
|
|
|
dr->dr_accounted = db->db.db_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
dr->dr_dbuf = db;
|
|
|
|
dr->dr_txg = tx->tx_txg;
|
|
|
|
dr->dr_next = *drp;
|
|
|
|
*drp = dr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We could have been freed_in_flight between the dbuf_noread
|
|
|
|
* and dbuf_dirty. We win, as though the dbuf_noread() had
|
|
|
|
* happened after the free.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
dnode_clear_range(dn, db->db_blkid, 1, tx);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This buffer is now part of this txg
|
|
|
|
*/
|
|
|
|
dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
|
|
|
|
db->db_dirtycnt += 1;
|
|
|
|
ASSERT3U(db->db_dirtycnt, <=, 3);
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
|
|
|
db->db_blkid == DMU_SPILL_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
dnode_setdirty(dn, tx);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (dr);
|
2008-12-03 23:09:06 +03:00
|
|
|
} else if (do_free_accounting) {
|
|
|
|
blkptr_t *bp = db->db_blkptr;
|
|
|
|
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
|
2010-05-29 00:45:14 +04:00
|
|
|
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* This is only a guess -- if the dbuf is dirty
|
|
|
|
* in a previous txg, we don't know how much
|
|
|
|
* space it will use on disk yet. We should
|
|
|
|
* really have the struct_rwlock to access
|
|
|
|
* db_blkptr, but since this is just a guess,
|
|
|
|
* it's OK if we get an odd answer.
|
|
|
|
*/
|
2010-08-27 01:24:34 +04:00
|
|
|
ddt_prefetch(os->os_spa, bp);
|
2008-12-03 23:09:06 +03:00
|
|
|
dnode_willuse_space(dn, -willfree, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
|
|
drop_struct_lock = TRUE;
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
|
|
|
|
ASSERT(dn->dn_maxblkid >= db->db_blkid);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level+1 < dn->dn_nlevels) {
|
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
dbuf_dirty_record_t *di;
|
|
|
|
int parent_held = FALSE;
|
|
|
|
|
|
|
|
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
|
|
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
parent = dbuf_hold_level(dn, db->db_level+1,
|
|
|
|
db->db_blkid >> epbs, FTAG);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(parent != NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
parent_held = TRUE;
|
|
|
|
}
|
|
|
|
if (drop_struct_lock)
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
ASSERT3U(db->db_level+1, ==, parent->db_level);
|
|
|
|
di = dbuf_dirty(parent, tx);
|
|
|
|
if (parent_held)
|
|
|
|
dbuf_rele(parent, FTAG);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
/*
|
|
|
|
* Since we've dropped the mutex, it's possible that
|
|
|
|
* dbuf_undirty() might have changed this out from under us.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_last_dirty == dr ||
|
|
|
|
dn->dn_object == DMU_META_DNODE_OBJECT) {
|
|
|
|
mutex_enter(&di->dt.di.dr_mtx);
|
|
|
|
ASSERT3U(di->dr_txg, ==, tx->tx_txg);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&di->dt.di.dr_children, dr);
|
|
|
|
mutex_exit(&di->dt.di.dr_mtx);
|
|
|
|
dr->dr_parent = di;
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
} else {
|
|
|
|
ASSERT(db->db_level+1 == dn->dn_nlevels);
|
|
|
|
ASSERT(db->db_blkid < dn->dn_nblkptr);
|
2010-08-27 01:24:34 +04:00
|
|
|
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
if (drop_struct_lock)
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
}
|
|
|
|
|
|
|
|
dnode_setdirty(dn, tx);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (dr);
|
|
|
|
}
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
/*
|
2013-06-11 21:12:34 +04:00
|
|
|
* Undirty a buffer in the transaction group referenced by the given
|
|
|
|
* transaction. Return whether this evicted the dbuf.
|
2013-09-04 16:00:57 +04:00
|
|
|
*/
|
|
|
|
static boolean_t
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
dbuf_dirty_record_t *dr, **drp;
|
|
|
|
|
|
|
|
ASSERT(txg != 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2013-09-04 16:00:57 +04:00
|
|
|
ASSERT0(db->db_level);
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is not dirty, we're done.
|
|
|
|
*/
|
|
|
|
for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
|
|
|
|
if (dr->dr_txg <= txg)
|
|
|
|
break;
|
2013-09-04 16:00:57 +04:00
|
|
|
if (dr == NULL || dr->dr_txg < txg)
|
|
|
|
return (B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(dr->dr_txg == txg);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2013-09-04 16:00:57 +04:00
|
|
|
* Note: This code will probably work even if there are concurrent
|
|
|
|
* holders, but it is untested in that scenerio, as the ZPL and
|
|
|
|
* ztest have additional locking (the range locks) that prevents
|
|
|
|
* that type of concurrent access.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2013-09-04 16:00:57 +04:00
|
|
|
ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
|
|
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
/*
|
|
|
|
* Any space we accounted for in dp_dirty_* will be cleaned up by
|
|
|
|
* dsl_pool_sync(). This is relatively rare so the discrepancy
|
|
|
|
* is not a big deal.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
*drp = dr->dr_next;
|
|
|
|
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 22:37:06 +04:00
|
|
|
/*
|
|
|
|
* Note that there are three places in dbuf_dirty()
|
|
|
|
* where this dirty record may be put on a list.
|
|
|
|
* Make sure to do a list_remove corresponding to
|
|
|
|
* every one of those list_insert calls.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dr->dr_parent) {
|
|
|
|
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
|
|
|
|
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
|
|
|
|
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
|
Illumos #764: panic in zfs:dbuf_sync_list
Hypothesis about what's going on here.
At some time in the past, something, i.e. dnode_reallocate()
calls one of:
dbuf_rm_spill(dn, tx);
These will do:
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx)
dbuf_undirty(db, tx)
Currently dbuf_undirty can leave a spill block in dn_dirty_records[],
(it having been put there previously by dbuf_dirty) and free it.
Sometime later, dbuf_sync_list trips over this reference to free'd
(and typically reused) memory.
Also, dbuf_undirty can call dnode_clear_range with a bogus
block ID. It needs to test for DMU_SPILL_BLKID, similar to
how dnode_clear_range is called in dbuf_dirty().
References to Illumos issue and patch:
- https://www.illumos.org/issues/764
- https://github.com/illumos/illumos-gate/commit/3f2366c2bb
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Mark.Maybe@oracle.com
Reviewed by: Albert Lee <trisk@nexenta.com
Approved by: Garrett D'Amore <garrett@nexenta.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
2011-07-26 22:37:06 +04:00
|
|
|
} else if (db->db_blkid == DMU_SPILL_BLKID ||
|
|
|
|
db->db_level+1 == dn->dn_nlevels) {
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
dbuf_unoverride(dr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT(db->db_buf != NULL);
|
2013-09-04 16:00:57 +04:00
|
|
|
ASSERT(dr->dt.dl.dr_data != NULL);
|
|
|
|
if (dr->dt.dl.dr_data != db->db_buf)
|
|
|
|
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
|
|
|
|
|
|
|
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
|
|
|
|
arc_buf_t *buf = db->db_buf;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_set_data(db, NULL);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_evict(db);
|
2013-09-04 16:00:57 +04:00
|
|
|
return (B_TRUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-09-04 16:00:57 +04:00
|
|
|
return (B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
|
|
|
|
void
|
|
|
|
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
|
|
|
|
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
|
2008-11-20 23:01:55 +03:00
|
|
|
rf |= DB_RF_HAVESTRUCT;
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) dbuf_read(db, NULL, rf);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
void
|
|
|
|
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
|
|
|
db->db_state = DB_NOFILL;
|
|
|
|
|
|
|
|
dmu_buf_will_fill(db_fake, tx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
|
|
|
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
|
|
|
|
dmu_tx_private_ok(tx));
|
|
|
|
|
|
|
|
dbuf_noread(db);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_fill_done = dbuf_fill_done
|
|
|
|
/* ARGSUSED */
|
|
|
|
void
|
|
|
|
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
|
|
|
if (db->db_state == DB_FILL) {
|
|
|
|
if (db->db_level == 0 && db->db_freed_in_flight) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
/* we were freed while filling */
|
|
|
|
/* XXX dbuf_undirty? */
|
|
|
|
bzero(db->db.db_data, db->db.db_size);
|
|
|
|
db->db_freed_in_flight = FALSE;
|
|
|
|
}
|
|
|
|
db->db_state = DB_CACHED;
|
|
|
|
cv_broadcast(&db->db_changed);
|
|
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* Directly assign a provided arc buf to a given dbuf if it's not referenced
|
|
|
|
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
|
|
|
|
ASSERT(buf != NULL);
|
|
|
|
ASSERT(arc_buf_size(buf) == db->db.db_size);
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
|
|
|
|
arc_return_buf(buf, db);
|
|
|
|
ASSERT(arc_released(buf));
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
|
|
|
|
|
|
|
|
if (db->db_state == DB_CACHED &&
|
|
|
|
refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2010-05-29 00:45:14 +04:00
|
|
|
xuio_stat_wbuf_copied();
|
2009-07-03 02:44:48 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
xuio_stat_wbuf_nocopy();
|
2009-07-03 02:44:48 +04:00
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
dbuf_dirty_record_t *dr = db->db_last_dirty;
|
|
|
|
|
|
|
|
ASSERT(db->db_buf != NULL);
|
|
|
|
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
|
|
|
|
ASSERT(dr->dt.dl.dr_data == db->db_buf);
|
|
|
|
if (!arc_released(db->db_buf)) {
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state ==
|
|
|
|
DR_OVERRIDDEN);
|
|
|
|
arc_release(db->db_buf, db);
|
|
|
|
}
|
|
|
|
dr->dt.dl.dr_data = buf;
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(db->db_buf, db));
|
2009-07-03 02:44:48 +04:00
|
|
|
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
|
|
|
|
arc_release(db->db_buf, db);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(db->db_buf, db));
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
db->db_buf = NULL;
|
|
|
|
}
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
|
|
dbuf_set_data(db, buf);
|
|
|
|
db->db_state = DB_FILL;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_dirty(db, tx);
|
|
|
|
dbuf_fill_done(db, tx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* "Clear" the contents of this dbuf. This will mark the dbuf
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
* EVICTING and clear *most* of its references. Unfortunately,
|
2008-11-20 23:01:55 +03:00
|
|
|
* when we are not holding the dn_dbufs_mtx, we can't clear the
|
|
|
|
* entry in the dn_dbufs list. We have to wait until dbuf_destroy()
|
|
|
|
* in this case. For callers from the DMU we will usually see:
|
|
|
|
* dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
|
|
|
|
* For the arc callback, we will usually see:
|
2010-08-27 01:24:34 +04:00
|
|
|
* dbuf_do_evict()->dbuf_clear();dbuf_destroy()
|
2008-11-20 23:01:55 +03:00
|
|
|
* Sometimes, though, we will get a mix of these two:
|
|
|
|
* DMU: dbuf_clear()->arc_buf_evict()
|
|
|
|
* ARC: dbuf_do_evict()->dbuf_destroy()
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_clear(dmu_buf_impl_t *db)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_buf_impl_t *dndb;
|
2008-11-20 23:01:55 +03:00
|
|
|
int dbuf_gone = FALSE;
|
|
|
|
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
dbuf_evict_user(db);
|
|
|
|
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
|
|
ASSERT(db->db.db_data != NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
db->db.db_data = NULL;
|
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
db->db_state = DB_EVICTING;
|
|
|
|
db->db_blkptr = NULL;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
dndb = dn->dn_dbuf;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
list_remove(&dn->dn_dbufs, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
|
|
|
|
membar_producer();
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
|
|
* Decrementing the dbuf count means that the hold corresponding
|
|
|
|
* to the removed dbuf is no longer discounted in dnode_move(),
|
|
|
|
* so the dnode cannot be moved until after we release the hold.
|
|
|
|
* The membar_producer() ensures visibility of the decremented
|
|
|
|
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
|
|
|
|
* release any lock.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
dnode_rele(dn, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
db->db_dnode_handle = NULL;
|
|
|
|
} else {
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (db->db_buf)
|
|
|
|
dbuf_gone = arc_buf_evict(db->db_buf);
|
|
|
|
|
|
|
|
if (!dbuf_gone)
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
|
|
|
/*
|
2010-08-27 01:24:34 +04:00
|
|
|
* If this dbuf is referenced from an indirect dbuf,
|
2008-11-20 23:01:55 +03:00
|
|
|
* decrement the ref count on the indirect dbuf.
|
|
|
|
*/
|
|
|
|
if (parent && parent != dndb)
|
|
|
|
dbuf_rele(parent, db);
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:58:00 +04:00
|
|
|
__attribute__((always_inline))
|
|
|
|
static inline int
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
2010-08-26 21:52:00 +04:00
|
|
|
dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
int nlevels, epbs;
|
|
|
|
|
|
|
|
*parentp = NULL;
|
|
|
|
*bpp = NULL;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
|
|
|
|
|
|
|
if (blkid == DMU_SPILL_BLKID) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
if (dn->dn_have_spill &&
|
|
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
|
|
|
|
*bpp = &dn->dn_phys->dn_spill;
|
|
|
|
else
|
|
|
|
*bpp = NULL;
|
|
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
|
|
*parentp = dn->dn_dbuf;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
return (0);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (dn->dn_phys->dn_nlevels == 0)
|
|
|
|
nlevels = 1;
|
|
|
|
else
|
|
|
|
nlevels = dn->dn_phys->dn_nlevels;
|
|
|
|
|
|
|
|
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
ASSERT3U(level * epbs, <, 64);
|
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
if (level >= nlevels ||
|
|
|
|
(blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
|
|
|
|
/* the buffer has no parent yet */
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
} else if (level < nlevels-1) {
|
|
|
|
/* this block is referenced from an indirect block */
|
2010-08-26 21:52:00 +04:00
|
|
|
int err;
|
|
|
|
if (dh == NULL) {
|
|
|
|
err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
|
|
|
|
fail_sparse, NULL, parentp);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
|
|
|
|
blkid >> epbs, fail_sparse, NULL,
|
|
|
|
parentp, dh->dh_depth + 1);
|
|
|
|
err = __dbuf_hold_impl(dh + 1);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
if (err)
|
|
|
|
return (err);
|
|
|
|
err = dbuf_read(*parentp, NULL,
|
|
|
|
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
|
|
|
if (err) {
|
|
|
|
dbuf_rele(*parentp, NULL);
|
|
|
|
*parentp = NULL;
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
|
|
|
|
(blkid & ((1ULL << epbs) - 1));
|
|
|
|
return (0);
|
|
|
|
} else {
|
|
|
|
/* the block is referenced from the dnode */
|
|
|
|
ASSERT3U(level, ==, nlevels-1);
|
|
|
|
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
|
|
|
|
blkid < dn->dn_phys->dn_nblkptr);
|
|
|
|
if (dn->dn_dbuf) {
|
|
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
|
|
*parentp = dn->dn_dbuf;
|
|
|
|
}
|
|
|
|
*bpp = &dn->dn_phys->dn_blkptr[blkid];
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static dmu_buf_impl_t *
|
|
|
|
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
|
|
dmu_buf_impl_t *parent, blkptr_t *blkptr)
|
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
objset_t *os = dn->dn_objset;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *db, *odb;
|
|
|
|
|
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
ASSERT(dn->dn_type != DMU_OT_NONE);
|
|
|
|
|
2012-05-07 21:49:51 +04:00
|
|
|
db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
db->db_objset = os;
|
|
|
|
db->db.db_object = dn->dn_object;
|
|
|
|
db->db_level = level;
|
|
|
|
db->db_blkid = blkid;
|
|
|
|
db->db_last_dirty = NULL;
|
|
|
|
db->db_dirtycnt = 0;
|
2010-08-27 01:24:34 +04:00
|
|
|
db->db_dnode_handle = dn->dn_handle;
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_parent = parent;
|
|
|
|
db->db_blkptr = blkptr;
|
|
|
|
|
|
|
|
db->db_user_ptr = NULL;
|
|
|
|
db->db_user_data_ptr_ptr = NULL;
|
|
|
|
db->db_evict_func = NULL;
|
|
|
|
db->db_immediate_evict = 0;
|
|
|
|
db->db_freed_in_flight = 0;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (blkid == DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3P(parent, ==, dn->dn_dbuf);
|
|
|
|
db->db.db_size = DN_MAX_BONUSLEN -
|
|
|
|
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
|
|
|
|
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
|
2010-05-29 00:45:14 +04:00
|
|
|
db->db.db_offset = DMU_BONUS_BLKID;
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
/* the bonus dbuf is not placed in the hash table */
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (db);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (blkid == DMU_SPILL_BLKID) {
|
|
|
|
db->db.db_size = (blkptr != NULL) ?
|
|
|
|
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
|
|
|
|
db->db.db_offset = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
int blocksize =
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db.db_size = blocksize;
|
|
|
|
db->db.db_offset = db->db_blkid * blocksize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hold the dn_dbufs_mtx while we get the new dbuf
|
|
|
|
* in the hash table *and* added to the dbufs list.
|
|
|
|
* This prevents a possible deadlock with someone
|
|
|
|
* trying to look up this dbuf before its added to the
|
|
|
|
* dn_dbufs list.
|
|
|
|
*/
|
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
|
|
db->db_state = DB_EVICTING;
|
|
|
|
if ((odb = dbuf_hash_insert(db)) != NULL) {
|
|
|
|
/* someone else inserted it first */
|
|
|
|
kmem_cache_free(dbuf_cache, db);
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
|
|
return (odb);
|
|
|
|
}
|
|
|
|
list_insert_head(&dn->dn_dbufs, db);
|
2013-08-21 08:11:52 +04:00
|
|
|
if (db->db_level == 0 && db->db_blkid >=
|
|
|
|
dn->dn_unlisted_l0_blkid)
|
|
|
|
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_state = DB_UNCACHED;
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (parent && parent != dn->dn_dbuf)
|
|
|
|
dbuf_add_ref(parent, db);
|
|
|
|
|
|
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
|
|
refcount_count(&dn->dn_holds) > 0);
|
|
|
|
(void) refcount_add(&dn->dn_holds, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dprintf_dbuf(db, "db=%p\n", db);
|
|
|
|
|
|
|
|
return (db);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
dbuf_do_evict(void *private)
|
|
|
|
{
|
|
|
|
arc_buf_t *buf = private;
|
|
|
|
dmu_buf_impl_t *db = buf->b_private;
|
|
|
|
|
|
|
|
if (!MUTEX_HELD(&db->db_mtx))
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
if (db->db_state != DB_EVICTING) {
|
|
|
|
ASSERT(db->db_state == DB_CACHED);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
db->db_buf = NULL;
|
|
|
|
dbuf_evict(db);
|
|
|
|
} else {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_destroy(db);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_destroy(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
ASSERT(refcount_is_zero(&db->db_holds));
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If this dbuf is still on the dn_dbufs list,
|
|
|
|
* remove it from that list.
|
|
|
|
*/
|
2010-08-27 01:24:34 +04:00
|
|
|
if (db->db_dnode_handle != NULL) {
|
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
|
|
list_remove(&dn->dn_dbufs, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
|
|
* Decrementing the dbuf count means that the hold
|
|
|
|
* corresponding to the removed dbuf is no longer
|
|
|
|
* discounted in dnode_move(), so the dnode cannot be
|
|
|
|
* moved until after we release the hold.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
dnode_rele(dn, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
db->db_dnode_handle = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
dbuf_hash_remove(db);
|
|
|
|
}
|
|
|
|
db->db_parent = NULL;
|
|
|
|
db->db_buf = NULL;
|
|
|
|
|
|
|
|
ASSERT(!list_link_active(&db->db_link));
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
ASSERT(db->db_hash_next == NULL);
|
|
|
|
ASSERT(db->db_blkptr == NULL);
|
|
|
|
ASSERT(db->db_data_pending == NULL);
|
|
|
|
|
|
|
|
kmem_cache_free(dbuf_cache, db);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = NULL;
|
|
|
|
blkptr_t *bp = NULL;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
|
|
|
|
if (dnode_block_freed(dn, blkid))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* dbuf_find() returns with db_mtx held */
|
2010-08-26 20:52:42 +04:00
|
|
|
if ((db = dbuf_find(dn, 0, blkid))) {
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* This dbuf is already in the cache. We assume that
|
|
|
|
* it is already CACHED, or else about to be either
|
|
|
|
* read or filled.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&db->db_mtx);
|
2010-08-27 01:24:34 +04:00
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bp && !BP_IS_HOLE(bp)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
|
|
|
|
zbookmark_t zb;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
|
|
|
|
dn->dn_object, 0, blkid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-07-03 00:26:24 +04:00
|
|
|
(void) arc_read(NULL, dn->dn_objset->os_spa,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
bp, NULL, NULL, prio,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
|
|
&aflags, &zb);
|
|
|
|
}
|
|
|
|
if (db)
|
|
|
|
dbuf_rele(db, NULL);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
#define DBUF_HOLD_IMPL_MAX_DEPTH 20
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Returns with db_holds incremented, and db_mtx not held.
|
|
|
|
* Note: dn_struct_rwlock must be held.
|
|
|
|
*/
|
2010-08-26 21:52:00 +04:00
|
|
|
static int
|
|
|
|
__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-08-26 21:52:00 +04:00
|
|
|
ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
|
|
|
|
dh->dh_parent = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
|
|
|
|
ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
|
|
|
|
ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
*(dh->dh_dbp) = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
top:
|
|
|
|
/* dbuf_find() returns with db_mtx held */
|
2010-08-26 21:52:00 +04:00
|
|
|
dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);
|
|
|
|
|
|
|
|
if (dh->dh_db == NULL) {
|
|
|
|
dh->dh_bp = NULL;
|
|
|
|
|
|
|
|
ASSERT3P(dh->dh_parent, ==, NULL);
|
|
|
|
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
|
|
|
dh->dh_fail_sparse, &dh->dh_parent,
|
|
|
|
&dh->dh_bp, dh);
|
|
|
|
if (dh->dh_fail_sparse) {
|
|
|
|
if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
|
2013-03-08 22:41:28 +04:00
|
|
|
dh->dh_err = SET_ERROR(ENOENT);
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dh->dh_err) {
|
|
|
|
if (dh->dh_parent)
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
|
|
|
return (dh->dh_err);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dh->dh_err && dh->dh_err != ENOENT)
|
|
|
|
return (dh->dh_err);
|
|
|
|
dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
|
|
|
dh->dh_parent, dh->dh_bp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
|
|
|
|
arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
|
|
|
|
if (dh->dh_db->db_buf->b_data == NULL) {
|
|
|
|
dbuf_clear(dh->dh_db);
|
|
|
|
if (dh->dh_parent) {
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
|
|
|
dh->dh_parent = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
goto top;
|
|
|
|
}
|
2010-08-26 21:52:00 +04:00
|
|
|
ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this buffer is currently syncing out, and we are are
|
|
|
|
* still referencing it from db_data, we need to make a copy
|
|
|
|
* of it in case we decide we want to dirty it again in this txg.
|
|
|
|
*/
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dh->dh_db->db_level == 0 &&
|
|
|
|
dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
|
|
|
|
dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
|
|
|
|
dh->dh_dr = dh->dh_db->db_data_pending;
|
|
|
|
|
|
|
|
if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
|
|
|
|
dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
|
|
|
|
|
|
|
|
dbuf_set_data(dh->dh_db,
|
|
|
|
arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
|
|
|
|
dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
|
|
|
|
bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
|
|
|
|
dh->dh_db->db.db_data, dh->dh_db->db.db_size);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
|
|
|
|
dbuf_update_data(dh->dh_db);
|
|
|
|
DBUF_VERIFY(dh->dh_db);
|
|
|
|
mutex_exit(&dh->dh_db->db_mtx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/* NOTE: we can't rele the parent until after we drop the db_mtx */
|
2010-08-26 21:52:00 +04:00
|
|
|
if (dh->dh_parent)
|
|
|
|
dbuf_rele(dh->dh_parent, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
|
|
|
|
ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
|
|
|
|
ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
|
|
|
|
*(dh->dh_dbp) = dh->dh_db;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:52:00 +04:00
|
|
|
/*
|
|
|
|
* The following code preserves the recursive function dbuf_hold_impl()
|
|
|
|
* but moves the local variables AND function arguments to the heap to
|
|
|
|
* minimize the stack frame size. Enough space is initially allocated
|
|
|
|
* on the stack for 20 levels of recursion.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
|
|
|
|
void *tag, dmu_buf_impl_t **dbp)
|
|
|
|
{
|
|
|
|
struct dbuf_hold_impl_data *dh;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) *
|
2012-05-07 21:49:51 +04:00
|
|
|
DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
|
2010-08-26 21:52:00 +04:00
|
|
|
__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
|
|
|
|
|
|
|
|
error = __dbuf_hold_impl(dh);
|
|
|
|
|
|
|
|
kmem_free(dh, sizeof(struct dbuf_hold_impl_data) *
|
|
|
|
DBUF_HOLD_IMPL_MAX_DEPTH);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
|
|
|
|
dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
|
|
|
|
void *tag, dmu_buf_impl_t **dbp, int depth)
|
|
|
|
{
|
|
|
|
dh->dh_dn = dn;
|
|
|
|
dh->dh_level = level;
|
|
|
|
dh->dh_blkid = blkid;
|
|
|
|
dh->dh_fail_sparse = fail_sparse;
|
|
|
|
dh->dh_tag = tag;
|
|
|
|
dh->dh_dbp = dbp;
|
|
|
|
dh->dh_depth = depth;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_impl_t *
|
|
|
|
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
|
|
|
|
return (err ? NULL : db);
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_buf_impl_t *
|
|
|
|
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db;
|
|
|
|
int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
|
|
|
|
return (err ? NULL : db);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_create_bonus(dnode_t *dn)
|
|
|
|
{
|
|
|
|
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
|
|
|
|
|
|
|
ASSERT(dn->dn_bonus == NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOTSUP));
|
2010-05-29 00:45:14 +04:00
|
|
|
if (blksz == 0)
|
|
|
|
blksz = SPA_MINBLOCKSIZE;
|
|
|
|
if (blksz > SPA_MAXBLOCKSIZE)
|
|
|
|
blksz = SPA_MAXBLOCKSIZE;
|
|
|
|
else
|
|
|
|
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
2010-05-29 00:45:14 +04:00
|
|
|
dbuf_new_size(db, blksz, tx);
|
2010-08-27 01:24:34 +04:00
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_add_ref = dbuf_add_ref
|
|
|
|
void
|
|
|
|
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
|
|
|
|
{
|
2010-08-26 20:53:00 +04:00
|
|
|
VERIFY(refcount_add(&db->db_holds, tag) > 1);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* If you call dbuf_rele() you had better not be referencing the dnode handle
|
|
|
|
* unless you have some other direct or indirect hold on the dnode. (An indirect
|
|
|
|
* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
|
|
|
|
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
|
|
|
|
* dnode's parent dbuf evicting its dnode handles.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
#pragma weak dmu_buf_rele = dbuf_rele
|
|
|
|
void
|
|
|
|
dbuf_rele(dmu_buf_impl_t *db, void *tag)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dbuf_rele_and_unlock(db, tag);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
|
|
|
|
* db_dirtycnt and db_holds to be updated atomically.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
int64_t holds;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
2008-11-20 23:01:55 +03:00
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* Remove the reference to the dbuf before removing its hold on the
|
|
|
|
* dnode so we can guarantee in dnode_move() that a referenced bonus
|
|
|
|
* buffer has a corresponding dnode hold.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
holds = refcount_remove(&db->db_holds, tag);
|
|
|
|
ASSERT(holds >= 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can't freeze indirects if there is a possibility that they
|
|
|
|
* may be modified in the current syncing context.
|
|
|
|
*/
|
|
|
|
if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
|
|
|
|
arc_buf_freeze(db->db_buf);
|
|
|
|
|
|
|
|
if (holds == db->db_dirtycnt &&
|
|
|
|
db->db_level == 0 && db->db_immediate_evict)
|
|
|
|
dbuf_evict_user(db);
|
|
|
|
|
|
|
|
if (holds == 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&db->db_mtx);
|
2010-08-27 01:24:34 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the dnode moves here, we cannot cross this barrier
|
|
|
|
* until the move completes.
|
|
|
|
*/
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
|
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
|
|
* The bonus buffer's dnode hold is no longer discounted
|
|
|
|
* in dnode_move(). The dnode cannot move until after
|
|
|
|
* the dnode_rele().
|
|
|
|
*/
|
|
|
|
dnode_rele(DB_DNODE(db), db);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else if (db->db_buf == NULL) {
|
|
|
|
/*
|
|
|
|
* This is a special case: we never associated this
|
|
|
|
* dbuf with any data allocated from the ARC.
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(db->db_state == DB_UNCACHED ||
|
|
|
|
db->db_state == DB_NOFILL);
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_evict(db);
|
|
|
|
} else if (arc_released(db->db_buf)) {
|
|
|
|
arc_buf_t *buf = db->db_buf;
|
|
|
|
/*
|
|
|
|
* This dbuf has anonymous data associated with it.
|
|
|
|
*/
|
|
|
|
dbuf_set_data(db, NULL);
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(arc_buf_remove_ref(buf, db));
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_evict(db);
|
|
|
|
} else {
|
2013-09-04 16:00:57 +04:00
|
|
|
VERIFY(!arc_buf_remove_ref(db->db_buf, db));
|
2012-12-22 02:57:09 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A dbuf will be eligible for eviction if either the
|
|
|
|
* 'primarycache' property is set or a duplicate
|
|
|
|
* copy of this buffer is already cached in the arc.
|
|
|
|
*
|
|
|
|
* In the case of the 'primarycache' a buffer
|
|
|
|
* is considered for eviction if it matches the
|
|
|
|
* criteria set in the property.
|
|
|
|
*
|
|
|
|
* To decide if our buffer is considered a
|
|
|
|
* duplicate, we must call into the arc to determine
|
|
|
|
* if multiple buffers are referencing the same
|
|
|
|
* block on-disk. If so, then we simply evict
|
|
|
|
* ourselves.
|
|
|
|
*/
|
|
|
|
if (!DBUF_IS_CACHEABLE(db) ||
|
|
|
|
arc_buf_eviction_needed(db->db_buf))
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_clear(db);
|
|
|
|
else
|
|
|
|
mutex_exit(&db->db_mtx);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#pragma weak dmu_buf_refcount = dbuf_refcount
|
|
|
|
uint64_t
|
|
|
|
dbuf_refcount(dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
return (refcount_count(&db->db_holds));
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
|
|
|
|
dmu_buf_evict_func_t *evict_func)
|
|
|
|
{
|
|
|
|
return (dmu_buf_update_user(db_fake, NULL, user_ptr,
|
|
|
|
user_data_ptr_ptr, evict_func));
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
|
|
|
|
dmu_buf_evict_func_t *evict_func)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
|
|
|
|
db->db_immediate_evict = TRUE;
|
|
|
|
return (dmu_buf_update_user(db_fake, NULL, user_ptr,
|
|
|
|
user_data_ptr_ptr, evict_func));
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
|
|
|
|
void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
ASSERT(db->db_level == 0);
|
|
|
|
|
|
|
|
ASSERT((user_ptr == NULL) == (evict_func == NULL));
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
if (db->db_user_ptr == old_user_ptr) {
|
|
|
|
db->db_user_ptr = user_ptr;
|
|
|
|
db->db_user_data_ptr_ptr = user_data_ptr_ptr;
|
|
|
|
db->db_evict_func = evict_func;
|
|
|
|
|
|
|
|
dbuf_update_data(db);
|
|
|
|
} else {
|
|
|
|
old_user_ptr = db->db_user_ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return (old_user_ptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
dmu_buf_get_user(dmu_buf_t *db_fake)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
ASSERT(!refcount_is_zero(&db->db_holds));
|
|
|
|
|
|
|
|
return (db->db_user_ptr);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
boolean_t
|
|
|
|
dmu_buf_freeable(dmu_buf_t *dbuf)
|
|
|
|
{
|
|
|
|
boolean_t res = B_FALSE;
|
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
|
|
|
|
|
|
if (db->db_blkptr)
|
|
|
|
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
|
2010-05-29 00:45:14 +04:00
|
|
|
db->db_blkptr, db->db_blkptr->blk_birth);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
return (res);
|
|
|
|
}
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
blkptr_t *
|
|
|
|
dmu_buf_get_blkptr(dmu_buf_t *db)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
|
|
|
return (dbi->db_blkptr);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
static void
|
|
|
|
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
|
|
|
{
|
|
|
|
/* ASSERT(dmu_tx_is_syncing(tx) */
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
|
|
|
|
if (db->db_blkptr != NULL)
|
|
|
|
return;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
|
|
db->db_blkptr = &dn->dn_phys->dn_spill;
|
|
|
|
BP_ZERO(db->db_blkptr);
|
|
|
|
return;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
|
|
|
|
/*
|
|
|
|
* This buffer was allocated at a time when there was
|
|
|
|
* no available blkptrs from the dnode, or it was
|
|
|
|
* inappropriate to hook it in (i.e., nlevels mis-match).
|
|
|
|
*/
|
|
|
|
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
|
|
|
|
ASSERT(db->db_parent == NULL);
|
|
|
|
db->db_parent = dn->dn_dbuf;
|
|
|
|
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
} else {
|
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
|
|
|
|
ASSERT(dn->dn_phys->dn_nlevels > 1);
|
|
|
|
if (parent == NULL) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
|
|
(void) dbuf_hold_impl(dn, db->db_level+1,
|
|
|
|
db->db_blkid >> epbs, FALSE, db, &parent);
|
|
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
db->db_parent = parent;
|
|
|
|
}
|
|
|
|
db->db_blkptr = (blkptr_t *)parent->db.db_data +
|
|
|
|
(db->db_blkid & ((1ULL << epbs) - 1));
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:58:36 +04:00
|
|
|
/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
|
|
|
|
* is critical the we not allow the compiler to inline this function in to
|
|
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
|
|
*/
|
|
|
|
noinline static void
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_t *zio;
|
|
|
|
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
|
|
|
ASSERT(db->db_level > 0);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Read the block if it hasn't been read yet. */
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_buf == NULL) {
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
}
|
|
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
ASSERT(db->db_buf != NULL);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Indirect block size must match what the dnode thinks it is. */
|
2010-08-27 01:24:34 +04:00
|
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_check_blkptr(dn, db);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Provide the pending dirty record to child dbufs */
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_data_pending = dr;
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_write(dr, db->db_buf, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zio = dr->dr_zio;
|
|
|
|
mutex_enter(&dr->dt.di.dr_mtx);
|
|
|
|
dbuf_sync_list(&dr->dt.di.dr_children, tx);
|
|
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
|
|
|
mutex_exit(&dr->dt.di.dr_mtx);
|
|
|
|
zio_nowait(zio);
|
|
|
|
}
|
|
|
|
|
2010-08-26 21:58:36 +04:00
|
|
|
/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
|
|
|
|
* critical the we not allow the compiler to inline this function in to
|
|
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
|
|
*/
|
|
|
|
noinline static void
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
arc_buf_t **datap = &dr->dt.dl.dr_data;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
/*
|
|
|
|
* To be synced, we must be dirtied. But we
|
|
|
|
* might have been freed after the dirty.
|
|
|
|
*/
|
|
|
|
if (db->db_state == DB_UNCACHED) {
|
|
|
|
/* This buffer has been freed since it was dirtied */
|
|
|
|
ASSERT(db->db.db_data == NULL);
|
|
|
|
} else if (db->db_state == DB_FILL) {
|
|
|
|
/* This buffer was freed and is now being re-filled */
|
|
|
|
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
|
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
|
|
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If this is a bonus buffer, simply copy the bonus data into the
|
|
|
|
* dnode. It will be written out when the dnode is synced (and it
|
|
|
|
* will be synced, since it must have been dirty for dbuf_sync to
|
|
|
|
* be called).
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
2008-11-20 23:01:55 +03:00
|
|
|
dbuf_dirty_record_t **drp;
|
|
|
|
|
|
|
|
ASSERT(*datap != NULL);
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(db->db_level);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
|
|
|
|
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (*datap != db->db.db_data) {
|
|
|
|
zio_buf_free(*datap, DN_MAX_BONUSLEN);
|
2009-02-18 23:51:31 +03:00
|
|
|
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
db->db_data_pending = NULL;
|
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
while (*drp != dr)
|
|
|
|
drp = &(*drp)->dr_next;
|
|
|
|
ASSERT(dr->dr_next == NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 23:01:55 +03:00
|
|
|
*drp = dr->dr_next;
|
2010-08-26 21:19:04 +04:00
|
|
|
if (dr->dr_dbuf->db_level != 0) {
|
|
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
2010-05-29 00:45:14 +04:00
|
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
os = dn->dn_objset;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* This function may have dropped the db_mtx lock allowing a dmu_sync
|
|
|
|
* operation to sneak in. As a result, we need to ensure that we
|
|
|
|
* don't check the dr_override_state until we have returned from
|
|
|
|
* dbuf_check_blkptr.
|
|
|
|
*/
|
|
|
|
dbuf_check_blkptr(dn, db);
|
|
|
|
|
|
|
|
/*
|
2010-08-27 01:24:34 +04:00
|
|
|
* If this buffer is in the middle of an immediate write,
|
2008-11-20 23:01:55 +03:00
|
|
|
* wait for the synchronous IO to complete.
|
|
|
|
*/
|
|
|
|
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
|
|
|
|
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
|
|
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (db->db_state != DB_NOFILL &&
|
|
|
|
dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
|
|
refcount_count(&db->db_holds) > 1 &&
|
2010-05-29 00:45:14 +04:00
|
|
|
dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
|
2009-07-03 02:44:48 +04:00
|
|
|
*datap == db->db_buf) {
|
|
|
|
/*
|
|
|
|
* If this buffer is currently "in use" (i.e., there
|
|
|
|
* are active holds and db_data still references it),
|
|
|
|
* then make a copy before we start the write so that
|
|
|
|
* any modifications from the open txg will not leak
|
|
|
|
* into this write.
|
|
|
|
*
|
|
|
|
* NOTE: this copy does not need to be made for
|
|
|
|
* objects only modified in the syncing context (e.g.
|
|
|
|
* DNONE_DNODE blocks).
|
|
|
|
*/
|
|
|
|
int blksz = arc_buf_size(*datap);
|
|
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
|
|
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
|
|
|
|
bcopy(db->db.db_data, (*datap)->b_data, blksz);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
db->db_data_pending = dr;
|
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
dbuf_write(dr, *datap, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
2010-08-27 01:24:34 +04:00
|
|
|
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
|
2008-11-20 23:01:55 +03:00
|
|
|
list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Although zio_nowait() does not "wait for an IO", it does
|
|
|
|
* initiate the IO. If this is an empty write it seems plausible
|
|
|
|
* that the IO could actually be completed before the nowait
|
|
|
|
* returns. We need to DB_DNODE_EXIT() first in case
|
|
|
|
* zio_nowait() invalidates the dbuf.
|
|
|
|
*/
|
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
zio_nowait(dr->dr_zio);
|
2010-08-27 01:24:34 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
|
2010-08-26 20:52:42 +04:00
|
|
|
while ((dr = list_head(list))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (dr->dr_zio != NULL) {
|
|
|
|
/*
|
|
|
|
* If we find an already initialized zio then we
|
|
|
|
* are processing the meta-dnode, and we have finished.
|
|
|
|
* The dbufs for all dnodes are put back on the list
|
|
|
|
* during processing, so that we can zio_wait()
|
|
|
|
* these IOs after initiating all child IOs.
|
|
|
|
*/
|
|
|
|
ASSERT3U(dr->dr_dbuf->db.db_object, ==,
|
|
|
|
DMU_META_DNODE_OBJECT);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
list_remove(list, dr);
|
|
|
|
if (dr->dr_dbuf->db_level > 0)
|
|
|
|
dbuf_sync_indirect(dr, tx);
|
|
|
|
else
|
|
|
|
dbuf_sync_leaf(dr, tx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
2008-11-20 23:01:55 +03:00
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
2010-05-29 00:45:14 +04:00
|
|
|
spa_t *spa = zio->io_spa;
|
|
|
|
int64_t delta;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t fill = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
int i;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(db->db_blkptr == bp);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
|
|
|
|
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
|
|
|
|
zio->io_prev_space_delta = delta;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (BP_IS_HOLE(bp)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(bp->blk_fill == 0);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
|
|
|
|
BP_GET_TYPE(bp) == dn->dn_type) ||
|
|
|
|
(db->db_blkid == DMU_SPILL_BLKID &&
|
|
|
|
BP_GET_TYPE(bp) == dn->dn_bonustype));
|
2008-12-03 23:09:06 +03:00
|
|
|
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
|
|
|
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
|
|
|
|
db->db_blkptr == &dn->dn_phys->dn_spill);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level == 0) {
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID)
|
2008-11-20 23:01:55 +03:00
|
|
|
dn->dn_phys->dn_maxblkid = db->db_blkid;
|
|
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
|
|
|
|
if (dn->dn_type == DMU_OT_DNODE) {
|
|
|
|
dnode_phys_t *dnp = db->db.db_data;
|
|
|
|
for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
|
|
|
|
i--, dnp++) {
|
|
|
|
if (dnp->dn_type != DMU_OT_NONE)
|
|
|
|
fill++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fill = 1;
|
|
|
|
}
|
|
|
|
} else {
|
2008-12-03 23:09:06 +03:00
|
|
|
blkptr_t *ibp = db->db.db_data;
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
2008-12-03 23:09:06 +03:00
|
|
|
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
|
|
|
|
if (BP_IS_HOLE(ibp))
|
2008-11-20 23:01:55 +03:00
|
|
|
continue;
|
2008-12-03 23:09:06 +03:00
|
|
|
fill += ibp->blk_fill;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
bp->blk_fill = fill;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
}
|
|
|
|
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
/*
|
|
|
|
* The SPA will call this callback several times for each zio - once
|
|
|
|
* for every physical child i/o (zio->io_phys_children times). This
|
|
|
|
* allows the DMU to monitor the progress of each logical i/o. For example,
|
|
|
|
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
|
|
|
|
* block. There may be a long delay before all copies/fragments are completed,
|
|
|
|
* so this callback allows us to retire dirty space gradually, as the physical
|
|
|
|
* i/os complete.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = arg;
|
|
|
|
objset_t *os = db->db_objset;
|
|
|
|
dsl_pool_t *dp = dmu_objset_pool(os);
|
|
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
int delta = 0;
|
|
|
|
|
|
|
|
dr = db->db_data_pending;
|
|
|
|
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The callback will be called io_phys_children times. Retire one
|
|
|
|
* portion of our dirty space each time we are called. Any rounding
|
|
|
|
* error will be cleaned up by dsl_pool_sync()'s call to
|
|
|
|
* dsl_pool_undirty_space().
|
|
|
|
*/
|
|
|
|
delta = dr->dr_accounted / zio->io_phys_children;
|
|
|
|
dsl_pool_undirty_space(dp, delta, zio->io_txg);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* ARGSUSED */
|
|
|
|
static void
|
|
|
|
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = vdb;
|
2010-05-29 00:45:14 +04:00
|
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t txg = zio->io_txg;
|
|
|
|
dbuf_dirty_record_t **drp, *dr;
|
|
|
|
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(zio->io_error);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkptr == bp);
|
|
|
|
|
2013-05-10 23:47:54 +04:00
|
|
|
/*
|
|
|
|
* For nopwrites and rewrites we ensure that the bp matches our
|
|
|
|
* original and bypass all the accounting.
|
|
|
|
*/
|
|
|
|
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
|
|
|
} else {
|
2010-08-27 01:24:34 +04:00
|
|
|
objset_t *os;
|
|
|
|
dsl_dataset_t *ds;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
|
|
|
|
DB_GET_OBJSET(&os, db);
|
|
|
|
ds = os->os_dsl_dataset;
|
|
|
|
tx = os->os_synctx;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
|
|
|
|
dsl_dataset_block_born(ds, bp, tx);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
DBUF_VERIFY(db);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
drp = &db->db_last_dirty;
|
|
|
|
while ((dr = *drp) != db->db_data_pending)
|
|
|
|
drp = &dr->dr_next;
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
|
|
ASSERT(dr->dr_txg == txg);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(dr->dr_dbuf == db);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(dr->dr_next == NULL);
|
|
|
|
*drp = dr->dr_next;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
#ifdef ZFS_DEBUG
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
|
|
|
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
|
|
|
|
db->db_blkptr == &dn->dn_phys->dn_spill);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (db->db_level == 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
|
2008-12-03 23:09:06 +03:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
if (dr->dt.dl.dr_data != db->db_buf)
|
|
|
|
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
|
2013-09-04 16:00:57 +04:00
|
|
|
db));
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (!arc_released(db->db_buf))
|
2008-12-03 23:09:06 +03:00
|
|
|
arc_set_callback(db->db_buf, dbuf_do_evict, db);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
|
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
|
|
|
if (!BP_IS_HOLE(db->db_blkptr)) {
|
2010-08-26 20:53:00 +04:00
|
|
|
ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
|
|
|
|
SPA_BLKPTRSHIFT);
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
|
|
|
|
db->db.db_size);
|
|
|
|
ASSERT3U(dn->dn_phys->dn_maxblkid
|
|
|
|
>> (db->db_level * epbs), >=, db->db_blkid);
|
|
|
|
arc_set_callback(db->db_buf, dbuf_do_evict, db);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
|
|
}
|
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
|
|
|
|
cv_broadcast(&db->db_changed);
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
|
|
db->db_dirtycnt -= 1;
|
|
|
|
db->db_data_pending = NULL;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_nofill_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_write_ready(zio, NULL, zio->io_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_nofill_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_write_done(zio, NULL, zio->io_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_override_ready(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
|
|
|
|
dbuf_write_ready(zio, NULL, db);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dbuf_write_override_done(zio_t *zio)
|
|
|
|
{
|
|
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
|
|
|
|
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
if (!BP_EQUAL(zio->io_bp, obp)) {
|
|
|
|
if (!BP_IS_HOLE(obp))
|
|
|
|
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
|
|
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
dbuf_write_done(zio, NULL, db);
|
|
|
|
}
|
|
|
|
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Issue I/O to commit a dirty buffer to disk. */
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|
|
|
{
|
|
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
2010-08-27 01:24:34 +04:00
|
|
|
dnode_t *dn;
|
|
|
|
objset_t *os;
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
zbookmark_t zb;
|
|
|
|
zio_prop_t zp;
|
|
|
|
zio_t *zio;
|
|
|
|
int wp_flag = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
|
|
os = dn->dn_objset;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
|
|
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
|
|
|
|
/*
|
|
|
|
* Private object buffers are released here rather
|
|
|
|
* than in dbuf_dirty() since they are only modified
|
|
|
|
* in the syncing context and we don't want the
|
|
|
|
* overhead of making multiple copies of the data.
|
|
|
|
*/
|
|
|
|
if (BP_IS_HOLE(db->db_blkptr)) {
|
|
|
|
arc_buf_thaw(data);
|
|
|
|
} else {
|
|
|
|
dbuf_release_bp(db);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (parent != dn->dn_dbuf) {
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Our parent is an indirect block. */
|
|
|
|
/* We have a dirty parent that has been scheduled for write. */
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(parent && parent->db_data_pending);
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Our parent's buffer is one level closer to the dnode. */
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(db->db_level == parent->db_level-1);
|
2013-06-11 21:12:34 +04:00
|
|
|
/*
|
|
|
|
* We're about to modify our parent's db_data by modifying
|
|
|
|
* our block pointer, so the parent must be released.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(arc_released(parent->db_buf));
|
|
|
|
zio = parent->db_data_pending->dr_zio;
|
|
|
|
} else {
|
2013-06-11 21:12:34 +04:00
|
|
|
/* Our parent is the dnode itself. */
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
|
|
|
|
db->db_blkid != DMU_SPILL_BLKID) ||
|
|
|
|
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
|
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
|
|
|
ASSERT3P(db->db_blkptr, ==,
|
|
|
|
&dn->dn_phys->dn_blkptr[db->db_blkid]);
|
|
|
|
zio = dn->dn_zio;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(db->db_level == 0 || data == db->db_buf);
|
|
|
|
ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
|
|
|
|
ASSERT(zio);
|
|
|
|
|
|
|
|
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
|
|
|
|
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
|
|
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
|
|
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
|
|
wp_flag = WP_SPILL;
|
|
|
|
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
|
|
|
|
|
|
|
|
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
|
2010-08-27 01:24:34 +04:00
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
|
|
|
ASSERT(db->db_state != DB_NOFILL);
|
|
|
|
dr->dr_zio = zio_write(zio, os->os_spa, txg,
|
|
|
|
db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
dbuf_write_override_ready, NULL, dbuf_write_override_done,
|
|
|
|
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
|
|
|
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
2013-05-10 23:47:54 +04:00
|
|
|
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
} else if (db->db_state == DB_NOFILL) {
|
|
|
|
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
|
|
|
|
dr->dr_zio = zio_write(zio, os->os_spa, txg,
|
|
|
|
db->db_blkptr, NULL, db->db.db_size, &zp,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
|
2010-05-29 00:45:14 +04:00
|
|
|
ZIO_PRIORITY_ASYNC_WRITE,
|
|
|
|
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
|
|
|
|
} else {
|
|
|
|
ASSERT(arc_released(data));
|
|
|
|
dr->dr_zio = arc_write(zio, os->os_spa, txg,
|
2013-08-02 00:02:10 +04:00
|
|
|
db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
|
|
|
|
DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
dbuf_write_physdone, dbuf_write_done, db,
|
|
|
|
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-08-26 22:49:16 +04:00
|
|
|
|
|
|
|
/*
 * Symbol exports for the dbuf/dmu_buf interfaces.  Compiled only when both
 * _KERNEL and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dbuf_find);
EXPORT_SYMBOL(dbuf_is_metadata);
EXPORT_SYMBOL(dbuf_evict);
EXPORT_SYMBOL(dbuf_loan_arcbuf);
EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_rele);
EXPORT_SYMBOL(dbuf_assign_arcbuf);
EXPORT_SYMBOL(dbuf_clear);
EXPORT_SYMBOL(dbuf_prefetch);
EXPORT_SYMBOL(dbuf_hold_impl);
EXPORT_SYMBOL(dbuf_hold);
EXPORT_SYMBOL(dbuf_hold_level);
EXPORT_SYMBOL(dbuf_create_bonus);
EXPORT_SYMBOL(dbuf_spill_set_blksz);
EXPORT_SYMBOL(dbuf_rm_spill);
EXPORT_SYMBOL(dbuf_add_ref);
EXPORT_SYMBOL(dbuf_rele);
EXPORT_SYMBOL(dbuf_rele_and_unlock);
EXPORT_SYMBOL(dbuf_refcount);
EXPORT_SYMBOL(dbuf_sync_list);
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_update_user);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_freeable);
#endif /* _KERNEL && HAVE_SPL */
|