diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 4acaf8c9c..ea1364a7e 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -41,8 +41,8 @@ extern "C" {
  *
  * An lwb will start out in the "new" state, and transition to the "opened"
  * state via a call to zil_lwb_write_open() on first itx assignment.  When
- * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
- * held.
+ * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" and the
+ * LWB's "lwb_lock" must be held.
  *
  * After the lwb is "opened", it can be assigned number of itxs and transition
  * into the "closed" state via zil_lwb_write_close() when full or on timeout.
@@ -115,6 +115,7 @@ typedef struct lwb {
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_nfilled;	/* # filled bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
+	int		lwb_min_sz;	/* min size for range allocation */
 	lwb_state_t	lwb_state;	/* the state of this lwb */
 	char		*lwb_buf;	/* log write buffer */
 	zio_t		*lwb_child_zio;	/* parent zio for children */
@@ -129,7 +130,7 @@ typedef struct lwb {
 	list_t		lwb_itxs;	/* list of itx's */
 	list_t		lwb_waiters;	/* list of zil_commit_waiter's */
 	avl_tree_t	lwb_vdev_tree;	/* vdevs to flush after lwb write */
-	kmutex_t	lwb_vdev_lock;	/* protects lwb_vdev_tree */
+	kmutex_t	lwb_lock;	/* protects lwb_vdev_tree and size */
 } lwb_t;
 
 /*
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 56e16551d..211a7d0b0 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -622,7 +622,8 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, zio_flag_t flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
-    blkptr_t *new_bp, uint64_t size, boolean_t *slog);
+    blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog,
+    boolean_t allow_larger);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
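The zio.h change above is the heart of the patch: zio_alloc_zil() now takes a [min_size, max_size] range instead of one fixed size, and allow_larger lets a caller keep a block even bigger than max_size if the allocator happens to find one. As a rough sketch of that contract (not ZFS code: toy_alloc_range(), TOY_ZIL_MIN_BLKSZ, and the single free_chunk standing in for a metaslab's contiguous space are all invented here):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	TOY_ZIL_MIN_BLKSZ	4096

/*
 * Grant any size in [min_size, max_size], preferring larger; fail only
 * when even min_size cannot be satisfied.  Alignment handling is
 * simplified: real metaslabs track space at finer granularity.
 */
static int
toy_alloc_range(uint64_t free_chunk, uint64_t min_size, uint64_t max_size,
    uint64_t *alloc_size)
{
	assert(min_size <= max_size);
	if (free_chunk < min_size)
		return (-1);	/* ENOSPC: the caller cannot make progress */
	*alloc_size = free_chunk < max_size ? free_chunk : max_size;
	*alloc_size &= ~(uint64_t)(TOY_ZIL_MIN_BLKSZ - 1);	/* P2ALIGN */
	return (0);
}

int
main(void)
{
	uint64_t sz;
	int rc;

	/* Plenty of space: the caller gets the full max_size. */
	rc = toy_alloc_range(1 << 20, 8192, 131072, &sz);
	assert(rc == 0 && sz == 131072);
	/* Fragmented: the caller gets something in [min_size, max_size). */
	rc = toy_alloc_range(40000, 8192, 131072, &sz);
	assert(rc == 0 && sz == 36864);
	/* Below min_size the allocation fails outright. */
	rc = toy_alloc_range(4096, 8192, 131072, &sz);
	assert(rc == -1);
	(void) printf("range allocation contract holds\n");
	return (0);
}

The failure mode is the point: a successful allocation always leaves room for the minimum the ZIL planned for, which is what the new lwb_min_sz field records.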
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 975586833..0307df55a 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -819,8 +819,8 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
  * we choose them here and later make the block allocation match.
  */
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
-    uint64_t txg, lwb_state_t state)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz,
+    boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
@@ -832,24 +832,24 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2)
 			lwb->lwb_flags |= LWB_FLAG_SLIM;
 		sz = BP_GET_LSIZE(bp);
+		lwb->lwb_min_sz = sz;
 	} else {
 		BP_ZERO(&lwb->lwb_blk);
 		if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL)
 			lwb->lwb_flags |= LWB_FLAG_SLIM;
+		lwb->lwb_min_sz = min_sz;
 	}
 	if (slog)
 		lwb->lwb_flags |= LWB_FLAG_SLOG;
 	lwb->lwb_error = 0;
-	if (lwb->lwb_flags & LWB_FLAG_SLIM) {
-		lwb->lwb_nmax = sz;
-		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
-	} else {
-		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
-		lwb->lwb_nused = lwb->lwb_nfilled = 0;
-	}
+	/*
+	 * Buffer allocation and capacity setup will be done in
+	 * zil_lwb_write_open() when the LWB is opened for itx assignment.
+	 */
+	lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	lwb->lwb_sz = sz;
-	lwb->lwb_state = state;
-	lwb->lwb_buf = zio_buf_alloc(sz);
+	lwb->lwb_buf = NULL;
+	lwb->lwb_state = LWB_STATE_NEW;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
@@ -860,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
-	if (state != LWB_STATE_NEW)
-		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
@@ -881,7 +879,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
-	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+	ASSERT(!MUTEX_HELD(&lwb->lwb_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
@@ -1022,7 +1020,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
-		    ZIL_MIN_BLKSZ, &slog);
+		    ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
@@ -1031,7 +1029,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
+		lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -1394,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 	if (zil_nocacheflush)
 		return;
 
-	mutex_enter(&lwb->lwb_vdev_lock);
+	mutex_enter(&lwb->lwb_lock);
 	for (i = 0; i < ndvas; i++) {
 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (avl_find(t, &zvsearch, &where) == NULL) {
@@ -1403,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 			avl_insert(t, zv, where);
 		}
 	}
-	mutex_exit(&lwb->lwb_vdev_lock);
+	mutex_exit(&lwb->lwb_lock);
 }
 
 static void
@@ -1420,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 
 	/*
 	 * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
-	 * not need the protection of lwb_vdev_lock (it will only be modified
+	 * not need the protection of lwb_lock (it will only be modified
 	 * while holding zilog->zl_lock) as its writes and those of its
 	 * children have all completed.  The younger 'nlwb' may be waiting on
 	 * future writes to additional vdevs.
 	 */
-	mutex_enter(&nlwb->lwb_vdev_lock);
+	mutex_enter(&nlwb->lwb_lock);
 	/*
 	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
 	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
@@ -1439,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 			kmem_free(zv, sizeof (*zv));
 		}
 	}
-	mutex_exit(&nlwb->lwb_vdev_lock);
+	mutex_exit(&nlwb->lwb_lock);
 }
 
 void
@@ -1743,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 		return;
 	}
 
+	mutex_enter(&lwb->lwb_lock);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_OPENED;
 	zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
+	mutex_exit(&lwb->lwb_lock);
+
+	/*
+	 * Allocate buffer and set up LWB capacities.
+	 */
+	ASSERT0P(lwb->lwb_buf);
+	ASSERT3U(lwb->lwb_sz, >, 0);
+	lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+	if (lwb->lwb_flags & LWB_FLAG_SLIM) {
+		lwb->lwb_nmax = lwb->lwb_sz;
+		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
+	} else {
+		lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t);
+		lwb->lwb_nused = lwb->lwb_nfilled = 0;
+	}
 }
 
 /*
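With this hunk an lwb no longer owns a buffer while it is NEW: zil_lwb_write_open() sizes the buffer only after the NEW to OPENED transition, taken under both zl_issuer_lock and the new lwb_lock, has frozen lwb_sz against the renegotiation done by the issue path further below. A compressed sketch of that lazy-sizing pattern, with hypothetical toy_* names and pthreads standing in for the kernel primitives:

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

enum toy_state { TOY_NEW, TOY_OPENED };

struct toy_lwb {
	pthread_mutex_t	lock;	/* stands in for lwb_lock */
	enum toy_state	state;
	size_t		sz;	/* may still change while TOY_NEW */
	char		*buf;	/* NULL until opened */
};

static void
toy_open(struct toy_lwb *lwb)
{
	pthread_mutex_lock(&lwb->lock);
	if (lwb->state != TOY_NEW) {		/* already opened */
		pthread_mutex_unlock(&lwb->lock);
		return;
	}
	lwb->state = TOY_OPENED;	/* freezes sz for everybody else */
	pthread_mutex_unlock(&lwb->lock);

	/* Only now is sz final, so only now can the buffer be sized. */
	assert(lwb->buf == NULL && lwb->sz > 0);
	lwb->buf = malloc(lwb->sz);
}

int
main(void)
{
	struct toy_lwb lwb =
	    { PTHREAD_MUTEX_INITIALIZER, TOY_NEW, 8192, NULL };

	toy_open(&lwb);
	assert(lwb.state == TOY_OPENED && lwb.buf != NULL);
	free(lwb.buf);
	return (0);
}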
+ */ + ASSERT0P(lwb->lwb_buf); + ASSERT3U(lwb->lwb_sz, >, 0); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + if (lwb->lwb_flags & LWB_FLAG_SLIM) { + lwb->lwb_nmax = lwb->lwb_sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } } /* @@ -1763,6 +1777,8 @@ static uint_t zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) { uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); if (size <= md) { /* @@ -1773,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) } else if (size > 8 * md) { /* * Big bursts use maximum blocks. The first block size - * is hard to predict, but it does not really matter. + * is hard to predict, but we need at least enough space + * to make reasonable progress. */ - *minsize = 0; + *minsize = waste; return (md); } @@ -1788,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) uint_t s = size; uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); uint_t chunk = DIV_ROUND_UP(s, n); - uint_t waste = zil_max_waste_space(zilog); - waste = MAX(waste, zilog->zl_cur_max); if (chunk <= md - waste) { *minsize = MAX(s - (md - waste) * (n - 1), waste); return (chunk); } else { - *minsize = 0; + *minsize = waste; return (md); } } /* * Try to predict next block size based on previous history. Make prediction - * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is - * less then 50%, extra writes may cost more, but we don't want single spike - * to badly affect our predictions. + * sufficient for 7 of 8 previous bursts, but don't try to save if the saving + * is less then 50%. Extra writes may cost more, but we don't want single + * spike to badly affect our predictions. */ -static uint_t -zil_lwb_predict(zilog_t *zilog) +static void +zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict) { - uint_t m, o; + uint_t m1 = 0, m2 = 0, o; - /* If we are in the middle of a burst, take it into account also. */ - if (zilog->zl_cur_size > 0) { - o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); - } else { + /* If we are in the middle of a burst, take it as another data point. */ + if (zilog->zl_cur_size > 0) + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1); + else o = UINT_MAX; - m = 0; - } - /* Find minimum optimal size. We don't need to go below that. */ - for (int i = 0; i < ZIL_BURSTS; i++) - o = MIN(o, zilog->zl_prev_opt[i]); - - /* Find two biggest minimal first block sizes above the optimal. */ - uint_t m1 = MAX(m, o), m2 = o; + /* Find two largest minimal first block sizes. */ for (int i = 0; i < ZIL_BURSTS; i++) { - m = zilog->zl_prev_min[i]; - if (m >= m1) { + uint_t cur = zilog->zl_prev_min[i]; + if (cur >= m1) { m2 = m1; - m1 = m; - } else if (m > m2) { - m2 = m; + m1 = cur; + } else if (cur > m2) { + m2 = cur; } } - /* - * If second minimum size gives 50% saving -- use it. It may cost us - * one additional write later, but the space saving is just too big. - */ - return ((m1 < m2 * 2) ? m1 : m2); + /* Minimum should guarantee progress in most cases. */ + *min_predict = (m1 < m2 * 2) ? m1 : m2; + + /* Maximum doesn't need to go below the minimum optimal size. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + m1 = MAX(m1, o); + m2 = MAX(m2, o); + *max_predict = (m1 < m2 * 2) ? 
@@ -1846,12 +1858,13 @@
  * Has to be called under zl_issuer_lock to chain more lwbs.
  */
 static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
+zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
 {
-	uint64_t blksz, plan, plan2;
+	uint64_t minbs, maxbs;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+	membar_producer();
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
@@ -1876,27 +1889,34 @@
 		 * Try to predict what can it be and plan for the worst case.
 		 */
 		uint_t m;
-		plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+		maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+		minbs = m;
 		if (zilog->zl_parallel) {
-			plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
-			    zil_lwb_predict(zilog), &m);
-			if (plan < plan2)
-				plan = plan2;
+			uint64_t minp, maxp;
+			zil_lwb_predict(zilog, &minp, &maxp);
+			maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp,
+			    &m);
+			if (maxbs < maxp)
+				maxbs = maxp;
 		}
 	} else {
 		/*
 		 * The previous burst is done and we can only predict what
 		 * will come next.
 		 */
-		plan = zil_lwb_predict(zilog);
+		zil_lwb_predict(zilog, &minbs, &maxbs);
 	}
 
-	blksz = plan + sizeof (zil_chain_t);
-	blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
-	blksz = MIN(blksz, zilog->zl_max_block_size);
-	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
-	    uint64_t, plan);
-	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
+	minbs += sizeof (zil_chain_t);
+	maxbs += sizeof (zil_chain_t);
+	minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t);
+	maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t);
+	maxbs = MIN(maxbs, zilog->zl_max_block_size);
+	minbs = MIN(minbs, maxbs);
+	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs,
+	    uint64_t, maxbs);
+
+	return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0));
 }
 
 /*
@@ -1949,7 +1969,8 @@ next_lwb:
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
-	int wsz = lwb->lwb_sz;
+	uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk);
+	int wsz = alloc_size;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
 		if (!(lwb->lwb_flags & LWB_FLAG_SLOG) ||
@@ -1961,7 +1982,7 @@ next_lwb:
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
-		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
+		    &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
@@ -1969,8 +1990,9 @@ next_lwb:
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
-			ASSERT3S(wsz, <=, lwb->lwb_sz);
-			zio_shrink(lwb->lwb_write_zio, wsz);
+			ASSERT3S(wsz, <=, alloc_size);
+			if (wsz < alloc_size)
+				zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 
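The tail arithmetic of zil_lwb_write_close() above deserves a worked example: both planned payload sizes get the chain header added, are rounded up to ZIL_MIN_BLKSZ, then the maximum is capped by zl_max_block_size and the minimum clamped to the maximum. A check under assumed values (the TOY_* constants are stand-ins; in particular TOY_CHAIN_SZ is not the real sizeof (zil_chain_t)):

#include <assert.h>
#include <stdint.h>

#define	TOY_ZIL_MIN_BLKSZ	4096
#define	TOY_CHAIN_SZ		56	/* assumed header size */
#define	TOY_MAX_BLKSZ		131072	/* assumed zl_max_block_size */

#define	TOY_ROUNDUP(x, a)	(-(-(uint64_t)(x) & -(uint64_t)(a)))
#define	TOY_MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t minbs = 9000;		/* planned minimum payload */
	uint64_t maxbs = 130000;	/* planned maximum payload */

	minbs += TOY_CHAIN_SZ;					/* 9056 */
	maxbs += TOY_CHAIN_SZ;					/* 130056 */
	minbs = TOY_ROUNDUP(minbs, TOY_ZIL_MIN_BLKSZ);		/* 12288 */
	maxbs = TOY_ROUNDUP(maxbs, TOY_ZIL_MIN_BLKSZ);		/* 131072 */
	maxbs = TOY_MIN(maxbs, TOY_MAX_BLKSZ);
	minbs = TOY_MIN(minbs, maxbs);

	assert(minbs == 12288 && maxbs == 131072);
	return (0);
}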
@@ -2006,8 +2028,48 @@ next_lwb:
 	BP_ZERO(bp);
 	error = lwb->lwb_error;
 	if (error == 0) {
-		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
-		    &slog);
+		/*
+		 * Allocation flexibility depends on LWB state:
+		 * if NEW: allow range allocation and larger sizes;
+		 * if OPENED: use fixed predetermined allocation size;
+		 * if CLOSED + Slim: allocate precisely for actual usage.
+		 */
+		boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+		if (flexible) {
+			/* We need to prevent opening until we update lwb_sz. */
+			mutex_enter(&nlwb->lwb_lock);
+			flexible = (nlwb->lwb_state == LWB_STATE_NEW);
+			if (!flexible)
+				mutex_exit(&nlwb->lwb_lock); /* We lost. */
+		}
+		boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED &&
+		    (lwb->lwb_flags & LWB_FLAG_SLIM));
+
+		uint64_t min_size, max_size;
+		if (closed_slim) {
+			/* This transition is racy, but only in one direction. */
+			membar_consumer();
+			min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused,
+			    ZIL_MIN_BLKSZ, uint64_t);
+		} else if (flexible) {
+			min_size = nlwb->lwb_min_sz;
+			max_size = nlwb->lwb_sz;
+		} else {
+			min_size = max_size = nlwb->lwb_sz;
+		}
+
+		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp,
+		    min_size, max_size, &slog, flexible);
+		if (error == 0) {
+			if (closed_slim)
+				ASSERT3U(BP_GET_LSIZE(bp), ==, max_size);
+			else if (flexible)
+				nlwb->lwb_sz = BP_GET_LSIZE(bp);
+			else
+				ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz);
+		}
+		if (flexible)
+			mutex_exit(&nlwb->lwb_lock);
 	}
 	if (error == 0) {
 		ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
@@ -2223,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
@@ -2265,9 +2326,10 @@ cont:
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		list_insert_tail(ilwbs, lwb);
-		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
+		lwb = zil_lwb_write_close(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
+		zil_lwb_write_open(zilog, lwb);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	}
 
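The "flexible" logic above is a double-checked state test against the race with zil_lwb_write_open(): peek at lwb_state without the lock, take lwb_lock, re-check, and keep the lock only while the lwb is still NEW so its size can be rewritten before anyone opens it. The bare shape of that pattern (hypothetical toy_* names, pthreads instead of kmutex):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

enum toy_state { TOY_NEW, TOY_OPENED, TOY_CLOSED };

struct toy_lwb {
	pthread_mutex_t	lock;
	enum toy_state	state;
	unsigned long	sz;
};

/*
 * Return true with lwb->lock held iff the lwb is still NEW, i.e. its
 * size may still be renegotiated; otherwise return false unlocked.
 */
static bool
toy_try_flexible(struct toy_lwb *lwb)
{
	if (lwb->state != TOY_NEW)	/* cheap unlocked peek */
		return (false);
	pthread_mutex_lock(&lwb->lock);
	if (lwb->state != TOY_NEW) {	/* lost the race to an open */
		pthread_mutex_unlock(&lwb->lock);
		return (false);
	}
	return (true);		/* caller updates sz, then unlocks */
}

int
main(void)
{
	struct toy_lwb lwb = { PTHREAD_MUTEX_INITIALIZER, TOY_NEW, 0 };

	if (toy_try_flexible(&lwb)) {
		lwb.sz = 65536;	/* safe: still NEW and lock held */
		pthread_mutex_unlock(&lwb.lock);
	}
	assert(lwb.sz == 65536);
	return (0);
}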
@@ -3302,7 +3364,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		    (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
-			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+			lwb = zil_lwb_write_close(zilog, lwb);
 			if (lwb == NULL) {
 				int err = 0;
 				while ((lwb =
@@ -3480,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 		 * hasn't been issued.
 		 */
 		zil_burst_done(zilog);
-		lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+		lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
 
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
@@ -4162,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
-	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
@@ -4171,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused)
 {
 	(void) unused;
 	lwb_t *lwb = vbuf;
-	mutex_destroy(&lwb->lwb_vdev_lock);
+	mutex_destroy(&lwb->lwb_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
@@ -4394,7 +4456,7 @@ zil_close(zilog_t *zilog)
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
-		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		ASSERT0P(lwb->lwb_buf);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 3f0ddb632..4cf8912d4 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -4434,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
-    uint64_t size, boolean_t *slog)
+    uint64_t min_size, uint64_t max_size, boolean_t *slog,
+    boolean_t allow_larger)
 {
 	int error;
 	zio_alloc_list_t io_alloc_list;
+	uint64_t alloc_size = 0;
 
 	ASSERT(txg > spa_syncing_txg(spa));
+	ASSERT3U(min_size, <=, max_size);
 
 	metaslab_trace_init(&io_alloc_list);
 
@@ -4448,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * Fill in the obvious ones before calling into metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
-	BP_SET_PSIZE(new_bp, size);
+	BP_SET_PSIZE(new_bp, max_size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
@@ -4463,43 +4466,51 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 
 	/* Try log class (dedicated slog devices) first */
-	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
-	    txg, NULL, flags, &io_alloc_list, allocator, NULL);
+	error = metaslab_alloc_range(spa, spa_log_class(spa), min_size,
+	    max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+	    NULL, &alloc_size);
 	*slog = (error == 0);
 
 	/* Try special_embedded_log class (reserved on special vdevs) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
-		    size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
-		    allocator, NULL);
+		error = metaslab_alloc_range(spa,
+		    spa_special_embedded_log_class(spa), min_size, max_size,
+		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+		    NULL, &alloc_size);
 	}
 
 	/* Try special class (general special vdev allocation) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_special_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_special_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 
 	/* Try embedded_log class (reserved on normal vdevs) */
 	if (error != 0) {
-		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_embedded_log_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 
 	/* Finally fall back to normal class */
 	if (error != 0) {
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
-		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
-		    NULL);
+		error = metaslab_alloc_range(spa, spa_normal_class(spa),
+		    min_size, max_size, new_bp, 1, txg, NULL, flags,
+		    &io_alloc_list, allocator, NULL, &alloc_size);
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
-		BP_SET_LSIZE(new_bp, size);
-		BP_SET_PSIZE(new_bp, size);
+		if (!allow_larger)
+			alloc_size = MIN(alloc_size, max_size);
+		else if (max_size <= SPA_OLD_MAXBLOCKSIZE)
+			alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE);
+		alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t);
+
+		BP_SET_LSIZE(new_bp, alloc_size);
+		BP_SET_PSIZE(new_bp, alloc_size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 		BP_SET_CHECKSUM(new_bp,
 		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
@@ -4527,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 		}
 	} else {
 		zfs_dbgmsg("%s: zil block allocation failure: "
-		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
-		    error);
+		    "min_size %llu, max_size %llu, error %d", spa_name(spa),
+		    (u_longlong_t)min_size, (u_longlong_t)max_size, error);
 	}
 
 	return (error);
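The acceptance logic in the error == 0 branch is worth restating on its own: a fixed-size caller is trimmed back to max_size; a flexible caller may keep an oversized block, but if it asked for no more than SPA_OLD_MAXBLOCKSIZE it also stays within that cap; and the result is aligned down to ZIL_MIN_BLKSZ. A standalone restatement with invented TOY_* constants (toy_accept() is not a ZFS function):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define	TOY_ZIL_MIN_BLKSZ	4096
#define	TOY_OLD_MAXBLOCKSIZE	131072	/* plays SPA_OLD_MAXBLOCKSIZE */

#define	TOY_MIN(a, b)		((a) < (b) ? (a) : (b))
#define	TOY_P2ALIGN(x, a)	((uint64_t)(x) & -(uint64_t)(a))

/* Mirrors the clamping done after a successful allocation above. */
static uint64_t
toy_accept(uint64_t alloc_size, uint64_t max_size, bool allow_larger)
{
	if (!allow_larger)
		alloc_size = TOY_MIN(alloc_size, max_size);
	else if (max_size <= TOY_OLD_MAXBLOCKSIZE)
		alloc_size = TOY_MIN(alloc_size, TOY_OLD_MAXBLOCKSIZE);
	return (TOY_P2ALIGN(alloc_size, TOY_ZIL_MIN_BLKSZ));
}

int
main(void)
{
	/* Fixed-size callers never keep more than they asked for. */
	assert(toy_accept(196608, 131072, false) == 131072);
	/* Flexible callers may keep more, but within the old cap. */
	assert(toy_accept(196608, 65536, true) == 131072);
	/* Any unaligned tail is trimmed to ZIL block granularity. */
	assert(toy_accept(70000, 131072, false) == 69632);
	return (0);
}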