ZIL: Make allocations more flexible

When the ZIL allocates space for new LWBs without knowing how much it
will require, it can use the new metaslab_alloc_range() function to
allocate slightly more or less than predicted.  This improves space
efficiency on RAIDZ/dRAID by allocating bigger LWBs instead of padding
them, possibly packing more ZIL records into each block.  It may also
reduce ganging in some cases by allowing smaller LWBs to be allocated
when we are not sure a bigger one will be needed.

On the opposite side, when we allocate space for already closed LWBs,
we know precisely how much space we need, so we can allocate exactly
that instead of relying on writing less than allocated, which does not
work on RAIDZ.

Space for LWBs in the open state (still being filled) is allocated the
same as before.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17613
4 changed files with 180 additions and 105 deletions
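
The zil.c changes below (in zil_lwb_write_issue()) pick the allocation range
from the next LWB's state.  A condensed, illustrative sketch of that mapping;
zil_alloc_for_lwb() is a made-up wrapper name, not part of the commit, and
locking, error handling and the Slim-flag details are omitted:

static int
zil_alloc_for_lwb(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *bp,
    lwb_t *nlwb, boolean_t slim, boolean_t *slog)
{
	uint64_t min_size, max_size;
	boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);

	if (nlwb->lwb_state == LWB_STATE_CLOSED && slim) {
		/* Closed Slim LWB: the exact used size is already known. */
		min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
		    ZIL_MIN_BLKSZ, uint64_t);
	} else if (flexible) {
		/* New LWB: anything in [lwb_min_sz, lwb_sz] is acceptable. */
		min_size = nlwb->lwb_min_sz;
		max_size = nlwb->lwb_sz;
	} else {
		/* Opened LWB: the buffer is sized, allocate exactly lwb_sz. */
		min_size = max_size = nlwb->lwb_sz;
	}
	return (zio_alloc_zil(spa, os, txg, bp, min_size, max_size, slog,
	    flexible /* allow_larger only for still-new LWBs */));
}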

include/sys/zil_impl.h

@ -41,8 +41,8 @@ extern "C" {
*
* An lwb will start out in the "new" state, and transition to the "opened"
* state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* held.
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" and
* LWB's "lwb_lock" must be held.
*
* After the lwb is "opened", it can be assigned number of itxs and transition
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
@ -115,6 +115,7 @@ typedef struct lwb {
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
int lwb_min_sz; /* min size for range allocation */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
@ -129,7 +130,7 @@ typedef struct lwb {
list_t lwb_itxs; /* list of itx's */
list_t lwb_waiters; /* list of zil_commit_waiter's */
avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
kmutex_t lwb_lock; /* protects lwb_vdev_tree and size */
} lwb_t;
/*

include/sys/zio.h

@ -622,7 +622,8 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, zio_flag_t flags);
extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
blkptr_t *new_bp, uint64_t size, boolean_t *slog);
blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog,
boolean_t allow_larger);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);

module/zfs/zil.c

@ -819,8 +819,8 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
* we choose them here and later make the block allocation match.
*/
static lwb_t *
zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
uint64_t txg, lwb_state_t state)
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz,
boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
@ -832,24 +832,24 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2)
lwb->lwb_flags |= LWB_FLAG_SLIM;
sz = BP_GET_LSIZE(bp);
lwb->lwb_min_sz = sz;
} else {
BP_ZERO(&lwb->lwb_blk);
if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL)
lwb->lwb_flags |= LWB_FLAG_SLIM;
lwb->lwb_min_sz = min_sz;
}
if (slog)
lwb->lwb_flags |= LWB_FLAG_SLOG;
lwb->lwb_error = 0;
if (lwb->lwb_flags & LWB_FLAG_SLIM) {
lwb->lwb_nmax = sz;
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
} else {
lwb->lwb_nmax = sz - sizeof (zil_chain_t);
lwb->lwb_nused = lwb->lwb_nfilled = 0;
}
/*
* Buffer allocation and capacity setup will be done in
* zil_lwb_write_open() when the LWB is opened for ITX assignment.
*/
lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0;
lwb->lwb_sz = sz;
lwb->lwb_state = state;
lwb->lwb_buf = zio_buf_alloc(sz);
lwb->lwb_buf = NULL;
lwb->lwb_state = LWB_STATE_NEW;
lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
@ -860,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
if (state != LWB_STATE_NEW)
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
return (lwb);
@ -881,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
VERIFY(list_is_empty(&lwb->lwb_itxs));
VERIFY(list_is_empty(&lwb->lwb_waiters));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
ASSERT(!MUTEX_HELD(&lwb->lwb_lock));
/*
* Clear the zilog's field to indicate this lwb is no longer
@ -1022,7 +1020,7 @@ zil_create(zilog_t *zilog)
}
error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
ZIL_MIN_BLKSZ, &slog);
ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE);
if (error == 0)
zil_init_log_chain(zilog, &blk);
}
@ -1031,7 +1029,7 @@ zil_create(zilog_t *zilog)
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
@ -1394,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
if (zil_nocacheflush)
return;
mutex_enter(&lwb->lwb_vdev_lock);
mutex_enter(&lwb->lwb_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
@ -1403,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
avl_insert(t, zv, where);
}
}
mutex_exit(&lwb->lwb_vdev_lock);
mutex_exit(&lwb->lwb_lock);
}
static void
@ -1420,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
/*
* While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
* not need the protection of lwb_vdev_lock (it will only be modified
* not need the protection of lwb_lock (it will only be modified
* while holding zilog->zl_lock) as its writes and those of its
* children have all completed. The younger 'nlwb' may be waiting on
* future writes to additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
mutex_enter(&nlwb->lwb_lock);
/*
* Tear down the 'lwb' vdev tree, ensuring that entries which do not
* exist in 'nlwb' are moved to it, freeing any would-be duplicates.
@ -1439,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
kmem_free(zv, sizeof (*zv));
}
}
mutex_exit(&nlwb->lwb_vdev_lock);
mutex_exit(&nlwb->lwb_lock);
}
void
@ -1743,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
return;
}
mutex_enter(&lwb->lwb_lock);
mutex_enter(&zilog->zl_lock);
lwb->lwb_state = LWB_STATE_OPENED;
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
mutex_exit(&lwb->lwb_lock);
/*
* Allocate buffer and set up LWB capacities.
*/
ASSERT0P(lwb->lwb_buf);
ASSERT3U(lwb->lwb_sz, >, 0);
lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
if (lwb->lwb_flags & LWB_FLAG_SLIM) {
lwb->lwb_nmax = lwb->lwb_sz;
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
} else {
lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t);
lwb->lwb_nused = lwb->lwb_nfilled = 0;
}
}
/*
@ -1763,6 +1777,8 @@ static uint_t
zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
{
uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (size <= md) {
/*
@ -1773,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
} else if (size > 8 * md) {
/*
* Big bursts use maximum blocks. The first block size
* is hard to predict, but it does not really matter.
* is hard to predict, but we need at least enough space
* to make reasonable progress.
*/
*minsize = 0;
*minsize = waste;
return (md);
}
@ -1788,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
uint_t s = size;
uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
uint_t chunk = DIV_ROUND_UP(s, n);
uint_t waste = zil_max_waste_space(zilog);
waste = MAX(waste, zilog->zl_cur_max);
if (chunk <= md - waste) {
*minsize = MAX(s - (md - waste) * (n - 1), waste);
return (chunk);
} else {
*minsize = 0;
*minsize = waste;
return (md);
}
}
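
zil_lwb_plan() above splits a medium burst evenly and reports how small the
first block may be while the remaining blocks can still absorb the rest even
with worst-case waste.  A standalone toy calculation with invented round
numbers (KiB units; record headers and the exact waste model are ignored for
simplicity):

#include <stdio.h>

#define DIV_ROUND_UP(a, b)	(((a) + (b) - 1) / (b))
#define MAX(a, b)		((a) > (b) ? (a) : (b))

int
main(void)
{
	unsigned md = 100;	/* usable payload per block, pretend 100 KiB */
	unsigned waste = 10;	/* worst-case unusable tail, pretend 10 KiB */
	unsigned s = 250;	/* burst size, pretend 250 KiB */

	unsigned n = DIV_ROUND_UP(s, md);		/* 3 blocks */
	unsigned chunk = DIV_ROUND_UP(s, n);		/* 84 KiB planned */
	unsigned minsize = MAX(s - (md - waste) * (n - 1), waste); /* 70 KiB */

	/* chunk (84) <= md - waste (90), so the even split is usable. */
	printf("n=%u chunk=%u minsize=%u\n", n, chunk, minsize);
	return (0);
}
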
/*
* Try to predict next block size based on previous history. Make prediction
* sufficient for 7 of 8 previous bursts. Don't try to save if the saving is
* less then 50%, extra writes may cost more, but we don't want single spike
* to badly affect our predictions.
* sufficient for 7 of 8 previous bursts, but don't try to save if the saving
* is less then 50%. Extra writes may cost more, but we don't want single
* spike to badly affect our predictions.
*/
static uint_t
zil_lwb_predict(zilog_t *zilog)
static void
zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict)
{
uint_t m, o;
uint_t m1 = 0, m2 = 0, o;
/* If we are in the middle of a burst, take it into account also. */
if (zilog->zl_cur_size > 0) {
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
} else {
/* If we are in the middle of a burst, take it as another data point. */
if (zilog->zl_cur_size > 0)
o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1);
else
o = UINT_MAX;
m = 0;
}
/* Find minimum optimal size. We don't need to go below that. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
/* Find two biggest minimal first block sizes above the optimal. */
uint_t m1 = MAX(m, o), m2 = o;
/* Find two largest minimal first block sizes. */
for (int i = 0; i < ZIL_BURSTS; i++) {
m = zilog->zl_prev_min[i];
if (m >= m1) {
uint_t cur = zilog->zl_prev_min[i];
if (cur >= m1) {
m2 = m1;
m1 = m;
} else if (m > m2) {
m2 = m;
m1 = cur;
} else if (cur > m2) {
m2 = cur;
}
}
/*
* If second minimum size gives 50% saving -- use it. It may cost us
* one additional write later, but the space saving is just too big.
*/
return ((m1 < m2 * 2) ? m1 : m2);
/* Minimum should guarantee progress in most cases. */
*min_predict = (m1 < m2 * 2) ? m1 : m2;
/* Maximum doesn't need to go below the minimum optimal size. */
for (int i = 0; i < ZIL_BURSTS; i++)
o = MIN(o, zilog->zl_prev_opt[i]);
m1 = MAX(m1, o);
m2 = MAX(m2, o);
*max_predict = (m1 < m2 * 2) ? m1 : m2;
}
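
zil_lwb_predict() above picks between the two largest minimal first-block
sizes seen recently: it only bets on the smaller one when that saves at least
half the space.  A standalone toy run with invented history values (the real
inputs are zilog->zl_prev_min[] and zl_prev_opt[]):

#include <stdio.h>

int
main(void)
{
	/* Minimal first-block sizes (KiB) of the last 8 bursts, invented. */
	unsigned prev_min[8] = { 12, 40, 96, 8, 16, 40, 20, 24 };
	unsigned m1 = 0, m2 = 0;

	for (int i = 0; i < 8; i++) {
		if (prev_min[i] >= m1) {
			m2 = m1;
			m1 = prev_min[i];
		} else if (prev_min[i] > m2) {
			m2 = prev_min[i];
		}
	}

	/* 96 >= 2 * 40: betting on 40 KiB saves over 50%, take the risk. */
	unsigned min_predict = (m1 < m2 * 2) ? m1 : m2;
	printf("m1=%u m2=%u min_predict=%u\n", m1, m2, min_predict);
	return (0);
}
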
/*
@ -1846,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog)
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
static lwb_t *
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
{
uint64_t blksz, plan, plan2;
uint64_t minbs, maxbs;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
membar_producer();
lwb->lwb_state = LWB_STATE_CLOSED;
/*
@ -1876,27 +1889,34 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
* Try to predict what can it be and plan for the worst case.
*/
uint_t m;
plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
minbs = m;
if (zilog->zl_parallel) {
plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
zil_lwb_predict(zilog), &m);
if (plan < plan2)
plan = plan2;
uint64_t minp, maxp;
zil_lwb_predict(zilog, &minp, &maxp);
maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp,
&m);
if (maxbs < maxp)
maxbs = maxp;
}
} else {
/*
* The previous burst is done and we can only predict what
* will come next.
*/
plan = zil_lwb_predict(zilog);
zil_lwb_predict(zilog, &minbs, &maxbs);
}
blksz = plan + sizeof (zil_chain_t);
blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
blksz = MIN(blksz, zilog->zl_max_block_size);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
uint64_t, plan);
return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
minbs += sizeof (zil_chain_t);
maxbs += sizeof (zil_chain_t);
minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t);
maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t);
maxbs = MIN(maxbs, zilog->zl_max_block_size);
minbs = MIN(minbs, maxbs);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs,
uint64_t, maxbs);
return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0));
}
/*
@ -1949,7 +1969,8 @@ next_lwb:
zilc = (zil_chain_t *)lwb->lwb_buf;
else
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
int wsz = lwb->lwb_sz;
uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk);
int wsz = alloc_size;
if (lwb->lwb_error == 0) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
if (!(lwb->lwb_flags & LWB_FLAG_SLOG) ||
@ -1961,7 +1982,7 @@ next_lwb:
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
&lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
&lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done,
lwb, prio, ZIO_FLAG_CANFAIL, &zb);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
@ -1969,8 +1990,9 @@ next_lwb:
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
int);
ASSERT3S(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
ASSERT3S(wsz, <=, alloc_size);
if (wsz < alloc_size)
zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size;
}
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
@ -2006,8 +2028,48 @@ next_lwb:
BP_ZERO(bp);
error = lwb->lwb_error;
if (error == 0) {
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
&slog);
/*
* Allocation flexibility depends on LWB state:
* if NEW: allow range allocation and larger sizes;
* if OPENED: use fixed predetermined allocation size;
* if CLOSED + Slim: allocate precisely for actual usage.
*/
boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);
if (flexible) {
/* We need to prevent opening till we update lwb_sz. */
mutex_enter(&nlwb->lwb_lock);
flexible = (nlwb->lwb_state == LWB_STATE_NEW);
if (!flexible)
mutex_exit(&nlwb->lwb_lock); /* We lost. */
}
boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED &&
(lwb->lwb_flags & LWB_FLAG_SLIM));
uint64_t min_size, max_size;
if (closed_slim) {
/* This transition is racy, but only one way. */
membar_consumer();
min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
ZIL_MIN_BLKSZ, uint64_t);
} else if (flexible) {
min_size = nlwb->lwb_min_sz;
max_size = nlwb->lwb_sz;
} else {
min_size = max_size = nlwb->lwb_sz;
}
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp,
min_size, max_size, &slog, flexible);
if (error == 0) {
if (closed_slim)
ASSERT3U(BP_GET_LSIZE(bp), ==, max_size);
else if (flexible)
nlwb->lwb_sz = BP_GET_LSIZE(bp);
else
ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz);
}
if (flexible)
mutex_exit(&nlwb->lwb_lock);
}
if (error == 0) {
ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
@ -2223,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
ASSERT3P(lwb->lwb_buf, !=, NULL);
zil_lwb_write_open(zilog, lwb);
@ -2265,9 +2326,10 @@ cont:
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
}
@ -3302,7 +3364,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
(!zilog->zl_parallel || zilog->zl_suspend > 0)) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL) {
int err = 0;
while ((lwb =
@ -3480,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* hasn't been issued.
*/
zil_burst_done(zilog);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
@ -4162,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@ -4171,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused)
{
(void) unused;
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
mutex_destroy(&lwb->lwb_lock);
avl_destroy(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
list_destroy(&lwb->lwb_itxs);
@ -4394,7 +4456,7 @@ zil_close(zilog_t *zilog)
if (lwb != NULL) {
ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
ASSERT0P(lwb->lwb_buf);
zil_free_lwb(zilog, lwb);
}
mutex_exit(&zilog->zl_lock);

module/zfs/zio.c

@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
*/
int
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t *slog)
uint64_t min_size, uint64_t max_size, boolean_t *slog,
boolean_t allow_larger)
{
int error;
zio_alloc_list_t io_alloc_list;
uint64_t alloc_size = 0;
ASSERT(txg > spa_syncing_txg(spa));
ASSERT3U(min_size, <=, max_size);
metaslab_trace_init(&io_alloc_list);
@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* Fill in the obvious ones before calling into metaslab_alloc().
*/
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_PSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, max_size);
BP_SET_LEVEL(new_bp, 0);
/*
@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
ZIOSTAT_BUMP(ziostat_total_allocations);
/* Try log class (dedicated slog devices) first */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, allocator, NULL);
error = metaslab_alloc_range(spa, spa_log_class(spa), min_size,
max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL, &alloc_size);
*slog = (error == 0);
/* Try special_embedded_log class (reserved on special vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
allocator, NULL);
error = metaslab_alloc_range(spa,
spa_special_embedded_log_class(spa), min_size, max_size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL, &alloc_size);
}
/* Try special class (general special vdev allocation) */
if (error != 0) {
error = metaslab_alloc(spa, spa_special_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_special_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
/* Try embedded_log class (reserved on normal vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_embedded_log_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
/* Finally fall back to normal class */
if (error != 0) {
ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
error = metaslab_alloc_range(spa, spa_normal_class(spa),
min_size, max_size, new_bp, 1, txg, NULL, flags,
&io_alloc_list, allocator, NULL, &alloc_size);
}
metaslab_trace_fini(&io_alloc_list);
if (error == 0) {
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
if (!allow_larger)
alloc_size = MIN(alloc_size, max_size);
else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t);
BP_SET_LSIZE(new_bp, alloc_size);
BP_SET_PSIZE(new_bp, alloc_size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
BP_SET_CHECKSUM(new_bp,
spa_version(spa) >= SPA_VERSION_SLIM_ZIL
@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
}
} else {
zfs_dbgmsg("%s: zil block allocation failure: "
"size %llu, error %d", spa_name(spa), (u_longlong_t)size,
error);
"min_size %llu, max_size %llu, error %d", spa_name(spa),
(u_longlong_t)min_size, (u_longlong_t)max_size, error);
}
return (error);
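
When the allocator returns more than requested, the block pointer size is
clamped and aligned as above.  A standalone toy showing the effect, with
illustrative sizes and assumed constant values for ZIL_MIN_BLKSZ and
SPA_OLD_MAXBLOCKSIZE:

#include <stdio.h>
#include <stdint.h>

#define ZIL_MIN_BLKSZ		4096ULL		/* assumed */
#define SPA_OLD_MAXBLOCKSIZE	(128ULL << 10)	/* assumed */
#define MIN(a, b)		((a) < (b) ? (a) : (b))
#define P2ALIGN(x, align)	((x) & ~((align) - 1))

int
main(void)
{
	uint64_t max_size = 68 * 1024;		/* what the ZIL asked for */
	uint64_t alloc_size = 96 * 1024 + 512;	/* what RAIDZ rounding gave */
	int allow_larger = 1;			/* next LWB was still "new" */

	if (!allow_larger)
		alloc_size = MIN(alloc_size, max_size);
	else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
		alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
	alloc_size = P2ALIGN(alloc_size, ZIL_MIN_BLKSZ);

	/* Prints 98304: the LWB may use 96 KiB instead of padding to 68. */
	printf("%llu\n", (unsigned long long)alloc_size);
	return (0);
}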