mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 18:34:22 +03:00
OpenZFS 6569 - large file delete can starve out write ops
Authored by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: George Melikov <mail@gmelikov.ru>
Tested-by: kernelOfTruth <kerneloftruth@gmail.com>
OpenZFS-issue: https://www.illumos.org/issues/6569
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/1bf4b6f2
Closes #5706
This commit is contained in:
parent
a873815b95
commit
539d33c791
@ -21,6 +21,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||||
|
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_DSL_POOL_H
|
#ifndef _SYS_DSL_POOL_H
|
||||||
@ -106,6 +107,7 @@ typedef struct dsl_pool {
|
|||||||
kcondvar_t dp_spaceavail_cv;
|
kcondvar_t dp_spaceavail_cv;
|
||||||
uint64_t dp_dirty_pertxg[TXG_SIZE];
|
uint64_t dp_dirty_pertxg[TXG_SIZE];
|
||||||
uint64_t dp_dirty_total;
|
uint64_t dp_dirty_total;
|
||||||
|
uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
|
||||||
uint64_t dp_mos_used_delta;
|
uint64_t dp_mos_used_delta;
|
||||||
uint64_t dp_mos_compressed_delta;
|
uint64_t dp_mos_compressed_delta;
|
||||||
uint64_t dp_mos_uncompressed_delta;
|
uint64_t dp_mos_uncompressed_delta;
|
||||||
|
@ -112,6 +112,36 @@ DEFINE_EVENT(zfs_delay_mintime_class, name, \
|
|||||||
/* END CSTYLED */
|
/* END CSTYLED */
|
||||||
DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime);
|
DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime);
|
||||||
|
|
||||||
|
/* BEGIN CSTYLED */
|
||||||
|
DECLARE_EVENT_CLASS(zfs_free_long_range_class,
|
||||||
|
TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \
|
||||||
|
uint64_t txg),
|
||||||
|
TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg),
|
||||||
|
TP_STRUCT__entry(
|
||||||
|
__field(uint64_t, long_free_dirty_all_txgs)
|
||||||
|
__field(uint64_t, chunk_len)
|
||||||
|
__field(uint64_t, txg)
|
||||||
|
),
|
||||||
|
TP_fast_assign(
|
||||||
|
__entry->long_free_dirty_all_txgs = long_free_dirty_all_txgs;
|
||||||
|
__entry->chunk_len = chunk_len;
|
||||||
|
__entry->txg = txg;
|
||||||
|
),
|
||||||
|
TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu",
|
||||||
|
__entry->long_free_dirty_all_txgs,
|
||||||
|
__entry->chunk_len, __entry->txg)
|
||||||
|
);
|
||||||
|
/* END CSTYLED */
|
||||||
|
|
||||||
|
/* BEGIN CSTYLED */
|
||||||
|
#define DEFINE_FREE_LONG_RANGE_EVENT(name) \
|
||||||
|
DEFINE_EVENT(zfs_free_long_range_class, name, \
|
||||||
|
TP_PROTO(uint64_t long_free_dirty_all_txgs, \
|
||||||
|
uint64_t chunk_len, uint64_t txg), \
|
||||||
|
TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg))
|
||||||
|
/* END CSTYLED */
|
||||||
|
DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range);
|
||||||
|
|
||||||
#endif /* _TRACE_DMU_H */
|
#endif /* _TRACE_DMU_H */
|
||||||
|
|
||||||
#undef TRACE_INCLUDE_PATH
|
#undef TRACE_INCLUDE_PATH
|
||||||
|
@ -48,6 +48,7 @@
|
|||||||
#include <sys/sa.h>
|
#include <sys/sa.h>
|
||||||
#include <sys/zfeature.h>
|
#include <sys/zfeature.h>
|
||||||
#include <sys/abd.h>
|
#include <sys/abd.h>
|
||||||
|
#include <sys/trace_dmu.h>
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
#include <sys/vmsystm.h>
|
#include <sys/vmsystm.h>
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
@ -58,6 +59,14 @@
|
|||||||
*/
|
*/
|
||||||
int zfs_nopwrite_enabled = 1;
|
int zfs_nopwrite_enabled = 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Tunable to control percentage of dirtied blocks from frees in one TXG.
|
||||||
|
* After this threshold is crossed, additional dirty blocks from frees
|
||||||
|
* wait until the next TXG.
|
||||||
|
* A value of zero disables this throttle; a value above 100 falls back to 25%.
|
||||||
|
*/
|
||||||
|
uint32_t zfs_per_txg_dirty_frees_percent = 30;
|
||||||
|
|
||||||
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
|
||||||
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
|
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
|
||||||
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
|
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
|
||||||
@ -727,6 +736,9 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
|
|||||||
{
|
{
|
||||||
uint64_t object_size;
|
uint64_t object_size;
|
||||||
int err;
|
int err;
|
||||||
|
uint64_t dirty_frees_threshold;
|
||||||
|
dsl_pool_t *dp = dmu_objset_pool(os);
|
||||||
|
int t;
|
||||||
|
|
||||||
if (dn == NULL)
|
if (dn == NULL)
|
||||||
return (SET_ERROR(EINVAL));
|
return (SET_ERROR(EINVAL));
|
||||||
@ -735,11 +747,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
|
|||||||
if (offset >= object_size)
|
if (offset >= object_size)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
if (zfs_per_txg_dirty_frees_percent <= 100)
|
||||||
|
dirty_frees_threshold =
|
||||||
|
zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
|
||||||
|
else
|
||||||
|
dirty_frees_threshold = zfs_dirty_data_max / 4;
|
||||||
|
|
||||||
if (length == DMU_OBJECT_END || offset + length > object_size)
|
if (length == DMU_OBJECT_END || offset + length > object_size)
|
||||||
length = object_size - offset;
|
length = object_size - offset;
|
||||||
|
|
||||||
while (length != 0) {
|
while (length != 0) {
|
||||||
uint64_t chunk_end, chunk_begin;
|
uint64_t chunk_end, chunk_begin, chunk_len;
|
||||||
|
uint64_t long_free_dirty_all_txgs = 0;
|
||||||
dmu_tx_t *tx;
|
dmu_tx_t *tx;
|
||||||
|
|
||||||
if (dmu_objset_zfs_unmounting(dn->dn_objset))
|
if (dmu_objset_zfs_unmounting(dn->dn_objset))
|
||||||
@ -754,9 +773,28 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
|
|||||||
ASSERT3U(chunk_begin, >=, offset);
|
ASSERT3U(chunk_begin, >=, offset);
|
||||||
ASSERT3U(chunk_begin, <=, chunk_end);
|
ASSERT3U(chunk_begin, <=, chunk_end);
|
||||||
|
|
||||||
|
chunk_len = chunk_end - chunk_begin;
|
||||||
|
|
||||||
|
mutex_enter(&dp->dp_lock);
|
||||||
|
for (t = 0; t < TXG_SIZE; t++) {
|
||||||
|
long_free_dirty_all_txgs +=
|
||||||
|
dp->dp_long_free_dirty_pertxg[t];
|
||||||
|
}
|
||||||
|
mutex_exit(&dp->dp_lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To avoid filling up a TXG with just frees, wait for
|
||||||
|
* the next TXG to open before freeing more chunks if
|
||||||
|
* we have reached the threshold of frees.
|
||||||
|
*/
|
||||||
|
if (dirty_frees_threshold != 0 &&
|
||||||
|
long_free_dirty_all_txgs >= dirty_frees_threshold) {
|
||||||
|
txg_wait_open(dp, 0);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
tx = dmu_tx_create(os);
|
tx = dmu_tx_create(os);
|
||||||
dmu_tx_hold_free(tx, dn->dn_object,
|
dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
|
||||||
chunk_begin, chunk_end - chunk_begin);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Mark this transaction as typically resulting in a net
|
* Mark this transaction as typically resulting in a net
|
||||||
@ -768,10 +806,18 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
|
|||||||
dmu_tx_abort(tx);
|
dmu_tx_abort(tx);
|
||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
|
|
||||||
|
mutex_enter(&dp->dp_lock);
|
||||||
|
dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
|
||||||
|
chunk_len;
|
||||||
|
mutex_exit(&dp->dp_lock);
|
||||||
|
DTRACE_PROBE3(free__long__range,
|
||||||
|
uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
|
||||||
|
uint64_t, dmu_tx_get_txg(tx));
|
||||||
|
dnode_free_range(dn, chunk_begin, chunk_len, tx);
|
||||||
dmu_tx_commit(tx);
|
dmu_tx_commit(tx);
|
||||||
|
|
||||||
length -= chunk_end - chunk_begin;
|
length -= chunk_len;
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||||
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
* Copyright (c) 2013 Steven Hartland. All rights reserved.
|
||||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||||
|
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/dsl_pool.h>
|
#include <sys/dsl_pool.h>
|
||||||
@ -509,6 +510,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
|
|||||||
*/
|
*/
|
||||||
dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
|
dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset this txg's long range free counter after
|
||||||
|
* we're done syncing user data
|
||||||
|
*/
|
||||||
|
mutex_enter(&dp->dp_lock);
|
||||||
|
ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
|
||||||
|
dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
|
||||||
|
dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
|
||||||
|
mutex_exit(&dp->dp_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* After the data blocks have been written (ensured by the zio_wait()
|
* After the data blocks have been written (ensured by the zio_wait()
|
||||||
* above), update the user/group space accounting.
|
* above), update the user/group space accounting.
|
||||||
|
Loading…
Reference in New Issue
Block a user