mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-01-14 17:22:05 +03:00
If the ZIL runs into trouble, it calls txg_wait_synced(), which blocks on suspend. We want it to not block on suspend, instead returning an error. On the surface, this is simple: change all calls to txg_wait_synced_flags(TXG_WAIT_SUSPEND), and then thread the error return back to the zil_commit() caller. Handling suspension means returning an error to all commit waiters. This is relatively straightforward, as zil_commit_waiter_t already has zcw_zio_error to hold the write IO error, which signals a fallback to txg_wait_synced_flags(TXG_WAIT_SUSPEND), which will fail, and so the waiter can now return an error from zil_commit(). However, commit waiters are normally signalled when their associated write (LWB) completes. If the pool has suspended, those IOs may not return for some time, or maybe not at all. We still want to signal those waiters so they can return from zil_commit(). We have a list of those in-flight LWBs on zl_lwb_list, so we can run through those, detach them and signal them. The LWB itself is still in-flight, but no longer has attached waiters, so when it returns there will be nothing to do. (As an aside, ITXs can also supply completion callbacks, which are called when they are destroyed. These are directly connected to LWBs though, so are passed the error code and destroyed there too). At this point, all ZIL waiters have been ejected, so we only have to consider the internal state. We potentially still have ITXs that have not been committed, LWBs still open, and LWBs in-flight. The on-disk ZIL is in an unknown state; some writes may have been written but not returned to us. We really can't rely on any of it; the best thing to do is abandon it entirely and start over when the pool returns to service. But, since we may have IO out that won't return until the pool resumes, we need something for it to return to. 
The simplest solution I could find, implemented here, is to "crash" the ZIL: accept no new ITXs, make no further updates, and let it empty out on its normal schedule, that is, as txgs complete and zil_sync() and zil_clean() are called. We set a "restart txg" to three txgs in the future (syncing + TXG_CONCURRENT_STATES), at which point all the internal state will have been cleared out, and the ZIL can resume operation (handled at the top of zil_clean()). This commit adds zil_crash(), which handles all of the above:
- sets the restart txg
- captures and signals all waiters
- zeroes the header
zil_crash() is called when txg_wait_synced_flags(TXG_WAIT_SUSPEND) returns because the pool suspended (ESHUTDOWN). The rest of the commit is just threading the errors through, and related housekeeping. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #17398
262 lines
8.1 KiB
C
262 lines
8.1 KiB
C
// SPDX-License-Identifier: CDDL-1.0
|
|
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2018 by Delphix. All rights reserved.
|
|
* Copyright (c) 2018 Datto Inc.
|
|
*/
|
|
|
|
#include <sys/dataset_kstats.h>
|
|
#include <sys/dmu_objset.h>
|
|
#include <sys/dsl_dataset.h>
|
|
#include <sys/spa.h>
|
|
|
|
/*
 * Template for the per-dataset kstat data.  Each dataset's kstat data is
 * allocated as a copy of this structure (see dataset_kstats_create()), and
 * its size also determines the number of kstat_named_t entries passed to
 * kstat_create() (sizeof (empty_dataset_kstats) / sizeof (kstat_named_t)).
 * The first entry holds the dataset name as a KSTAT_DATA_STRING; the rest
 * are 64-bit counters, including the embedded ZIL statistics.
 */
static dataset_kstat_values_t empty_dataset_kstats = {
	{ "dataset_name", KSTAT_DATA_STRING },
	{ "writes", KSTAT_DATA_UINT64 },
	{ "nwritten", KSTAT_DATA_UINT64 },
	{ "reads", KSTAT_DATA_UINT64 },
	{ "nread", KSTAT_DATA_UINT64 },
	{ "nunlinks", KSTAT_DATA_UINT64 },
	{ "nunlinked", KSTAT_DATA_UINT64 },
	{
		{ "zil_commit_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_error_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_stall_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_suspend_count", KSTAT_DATA_UINT64 },
		{ "zil_commit_crash_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
		{ "zil_itx_copied_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_copied_bytes", KSTAT_DATA_UINT64 },
		{ "zil_itx_needcopy_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
		{ "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }
	}
};
|
|
|
|
static int
|
|
dataset_kstats_update(kstat_t *ksp, int rw)
|
|
{
|
|
dataset_kstats_t *dk = ksp->ks_private;
|
|
dataset_kstat_values_t *dkv = ksp->ks_data;
|
|
ASSERT3P(dk->dk_kstats->ks_data, ==, dkv);
|
|
|
|
if (rw == KSTAT_WRITE)
|
|
return (EACCES);
|
|
|
|
dkv->dkv_writes.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_writes);
|
|
dkv->dkv_nwritten.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_nwritten);
|
|
dkv->dkv_reads.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_reads);
|
|
dkv->dkv_nread.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_nread);
|
|
dkv->dkv_nunlinks.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_nunlinks);
|
|
dkv->dkv_nunlinked.value.ui64 =
|
|
wmsum_value(&dk->dk_sums.dss_nunlinked);
|
|
|
|
zil_kstat_values_update(&dkv->dkv_zil_stats, &dk->dk_zil_sums);
|
|
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
|
|
{
|
|
/*
|
|
* There should not be anything wrong with having kstats for
|
|
* snapshots. Since we are not sure how useful they would be
|
|
* though nor how much their memory overhead would matter in
|
|
* a filesystem with many snapshots, we skip them for now.
|
|
*/
|
|
if (dmu_objset_is_snapshot(objset))
|
|
return (0);
|
|
|
|
/*
|
|
* At the time of this writing, KSTAT_STRLEN is 255 in Linux,
|
|
* and the spa_name can theoretically be up to 256 characters.
|
|
* In reality though the spa_name can be 240 characters max
|
|
* [see origin directory name check in pool_namecheck()]. Thus,
|
|
* the naming scheme for the module name below should not cause
|
|
* any truncations. In the event that a truncation does happen
|
|
* though, due to some future change, we silently skip creating
|
|
* the kstat and log the event.
|
|
*/
|
|
char kstat_module_name[KSTAT_STRLEN];
|
|
int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
|
|
"zfs/%s", spa_name(dmu_objset_spa(objset)));
|
|
if (n < 0) {
|
|
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
|
|
" snprintf() for kstat module name returned %d",
|
|
(unsigned long long)dmu_objset_id(objset), n);
|
|
return (SET_ERROR(EINVAL));
|
|
} else if (n >= KSTAT_STRLEN) {
|
|
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
|
|
"kstat module name length (%d) exceeds limit (%d)",
|
|
(unsigned long long)dmu_objset_id(objset),
|
|
n, KSTAT_STRLEN);
|
|
return (SET_ERROR(ENAMETOOLONG));
|
|
}
|
|
|
|
char kstat_name[KSTAT_STRLEN];
|
|
n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
|
|
(unsigned long long)dmu_objset_id(objset));
|
|
if (n < 0) {
|
|
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
|
|
" snprintf() for kstat name returned %d",
|
|
(unsigned long long)dmu_objset_id(objset), n);
|
|
return (SET_ERROR(EINVAL));
|
|
} else if (n >= KSTAT_STRLEN) {
|
|
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
|
|
"kstat name length (%d) exceeds limit (%d)",
|
|
(unsigned long long)dmu_objset_id(objset),
|
|
n, KSTAT_STRLEN);
|
|
return (SET_ERROR(ENAMETOOLONG));
|
|
}
|
|
|
|
kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
|
|
"dataset", KSTAT_TYPE_NAMED,
|
|
sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
|
|
KSTAT_FLAG_VIRTUAL);
|
|
if (kstat == NULL)
|
|
return (SET_ERROR(ENOMEM));
|
|
|
|
dataset_kstat_values_t *dk_kstats =
|
|
kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
|
|
memcpy(dk_kstats, &empty_dataset_kstats,
|
|
sizeof (empty_dataset_kstats));
|
|
|
|
char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
|
|
dsl_dataset_name(objset->os_dsl_dataset, ds_name);
|
|
KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
|
|
KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
|
|
ZFS_MAX_DATASET_NAME_LEN;
|
|
|
|
kstat->ks_data = dk_kstats;
|
|
kstat->ks_update = dataset_kstats_update;
|
|
kstat->ks_private = dk;
|
|
kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN;
|
|
|
|
wmsum_init(&dk->dk_sums.dss_writes, 0);
|
|
wmsum_init(&dk->dk_sums.dss_nwritten, 0);
|
|
wmsum_init(&dk->dk_sums.dss_reads, 0);
|
|
wmsum_init(&dk->dk_sums.dss_nread, 0);
|
|
wmsum_init(&dk->dk_sums.dss_nunlinks, 0);
|
|
wmsum_init(&dk->dk_sums.dss_nunlinked, 0);
|
|
zil_sums_init(&dk->dk_zil_sums);
|
|
|
|
dk->dk_kstats = kstat;
|
|
kstat_install(kstat);
|
|
return (0);
|
|
}
|
|
|
|
void
|
|
dataset_kstats_destroy(dataset_kstats_t *dk)
|
|
{
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
|
|
kstat_delete(dk->dk_kstats);
|
|
dk->dk_kstats = NULL;
|
|
kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
|
|
KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
|
|
kmem_free(dkv, sizeof (empty_dataset_kstats));
|
|
|
|
wmsum_fini(&dk->dk_sums.dss_writes);
|
|
wmsum_fini(&dk->dk_sums.dss_nwritten);
|
|
wmsum_fini(&dk->dk_sums.dss_reads);
|
|
wmsum_fini(&dk->dk_sums.dss_nread);
|
|
wmsum_fini(&dk->dk_sums.dss_nunlinks);
|
|
wmsum_fini(&dk->dk_sums.dss_nunlinked);
|
|
zil_sums_fini(&dk->dk_zil_sums);
|
|
}
|
|
|
|
void
|
|
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
|
|
{
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
|
|
char *ds_name;
|
|
|
|
ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
|
|
ASSERT3S(ds_name, !=, NULL);
|
|
(void) strlcpy(ds_name, name,
|
|
KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
|
|
}
|
|
|
|
void
|
|
dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten)
|
|
{
|
|
ASSERT3S(nwritten, >=, 0);
|
|
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
wmsum_add(&dk->dk_sums.dss_writes, 1);
|
|
wmsum_add(&dk->dk_sums.dss_nwritten, nwritten);
|
|
}
|
|
|
|
void
|
|
dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread)
|
|
{
|
|
ASSERT3S(nread, >=, 0);
|
|
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
wmsum_add(&dk->dk_sums.dss_reads, 1);
|
|
wmsum_add(&dk->dk_sums.dss_nread, nread);
|
|
}
|
|
|
|
void
|
|
dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
|
|
{
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
wmsum_add(&dk->dk_sums.dss_nunlinks, delta);
|
|
}
|
|
|
|
void
|
|
dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
|
|
{
|
|
if (dk->dk_kstats == NULL)
|
|
return;
|
|
|
|
wmsum_add(&dk->dk_sums.dss_nunlinked, delta);
|
|
}
|