From 17e212652db34d2942d6aa42ba14b443bcd73bea Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 22 Aug 2022 12:36:22 -0700 Subject: [PATCH] Prevent zevent list from consuming all of kernel memory There are a couple changes included here. The first is to introduce a cap on the size the ZED will grow the zevent list to. One million entries is more than enough for most use cases, and if you are overflowing that value, the problem needs to be addressed another way. The value is also tunable, for those who want the limit to be higher or lower. The other change is to add a kernel module parameter that allows snapshot creation/deletion to be exempted from the history logging; for most workloads, having these things logged is valuable, but for some workloads it produces large quantities of log spam and isn't especially helpful. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Issue #13374 Closes #13753 --- cmd/zed/zed_conf.c | 16 +++++++++++++++- cmd/zed/zed_conf.h | 1 + cmd/zed/zed_event.c | 19 +++++++++++++++---- man/man8/zed.8.in | 12 ++++++++++++ module/zfs/dsl_dataset.c | 8 +++++++- module/zfs/dsl_destroy.c | 13 ++++++++++--- 6 files changed, 60 insertions(+), 9 deletions(-) diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c index 59935102f..9a39d1a80 100644 --- a/cmd/zed/zed_conf.c +++ b/cmd/zed/zed_conf.c @@ -48,6 +48,7 @@ zed_conf_init(struct zed_conf *zcp) zcp->zevent_fd = -1; /* opened in zed_event_init() */ zcp->max_jobs = 16; + zcp->max_zevent_buf_len = 1 << 20; if (!(zcp->pid_file = strdup(ZED_PID_FILE)) || !(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)) || @@ -141,6 +142,8 @@ _zed_conf_display_help(const char *prog, boolean_t got_err) .v = ZED_STATE_FILE }, { .o = "-j JOBS", .d = "Start at most JOBS at once.", .v = "16" }, + { .o = "-b LEN", .d = "Cap kernel event buffer at LEN entries.", + .v = "1048576" }, {}, }; @@ -230,7 +233,7 @@ _zed_conf_parse_path(char **resultp, const char *path) void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) { - const char * const opts = ":hLVd:p:P:s:vfFMZIj:"; + const char * const opts = ":hLVd:p:P:s:vfFMZIj:b:"; int opt; unsigned long raw; @@ -291,6 +294,17 @@ zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) zcp->max_jobs = raw; } break; + case 'b': + errno = 0; + raw = strtoul(optarg, NULL, 0); + if (errno == ERANGE || raw > INT32_MAX) { + zed_log_die("%lu is too large", raw); + } if (raw == 0) { + zcp->max_zevent_buf_len = INT32_MAX; + } else { + zcp->max_zevent_buf_len = raw; + } + break; case '?': default: if (optopt == '?') diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h index 0b30a1503..84825a05d 100644 --- a/cmd/zed/zed_conf.h +++ b/cmd/zed/zed_conf.h @@ -33,6 +33,7 @@ struct zed_conf { int zevent_fd; /* fd for access to zevents */ int16_t max_jobs; /* max zedlets to run at one time */ + int32_t max_zevent_buf_len; /* max size of kernel event list */ boolean_t do_force:1; /* true if force enabled */ boolean_t do_foreground:1; /* true if run in foreground */ diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index d5cafdaff..079c6a043 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -38,6 +38,8 @@ #define MAXBUF 4096 +static int max_zevent_buf_len = 1 << 20; + /* * Open the libzfs interface. */ @@ -70,6 +72,9 @@ zed_event_init(struct zed_conf *zcp) zed_log_die("Failed to initialize disk events"); } + if (zcp->max_zevent_buf_len != 0) + max_zevent_buf_len = zcp->max_zevent_buf_len; + return (0); } @@ -105,7 +110,7 @@ _bump_event_queue_length(void) { int zzlm = -1, wr; char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */ - long int qlen; + long int qlen, orig_qlen; zzlm = open("/sys/module/zfs/parameters/zfs_zevent_len_max", O_RDWR); if (zzlm < 0) @@ -116,7 +121,7 @@ _bump_event_queue_length(void) qlen_buf[sizeof (qlen_buf) - 1] = '\0'; errno = 0; - qlen = strtol(qlen_buf, NULL, 10); + orig_qlen = qlen = strtol(qlen_buf, NULL, 10); if (errno == ERANGE) goto done; @@ -125,8 +130,14 @@ _bump_event_queue_length(void) else qlen *= 2; - if (qlen > INT_MAX) - qlen = INT_MAX; + /* + * Don't consume all of kernel memory with event logs if something + * goes wrong. + */ + if (qlen > max_zevent_buf_len) + qlen = max_zevent_buf_len; + if (qlen == orig_qlen) + goto done; wr = snprintf(qlen_buf, sizeof (qlen_buf), "%ld", qlen); if (pwrite(zzlm, qlen_buf, wr, 0) < 0) diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in index 6c51f1069..00f23edec 100644 --- a/man/man8/zed.8.in +++ b/man/man8/zed.8.in @@ -27,6 +27,7 @@ .Op Fl P Ar path .Op Fl s Ar statefile .Op Fl j Ar jobs +.Op Fl b Ar buflen . .Sh DESCRIPTION The @@ -96,6 +97,17 @@ ZEDLETs to run concurrently, delaying execution of new ones until they finish. Defaults to .Sy 16 . +.It Fl b Ar buflen +Cap kernel event buffer growth to +.Ar buflen +entries. +This buffer is grown when the daemon misses an event, but results in +unreclaimable memory use in the kernel. +A value of +.Sy 0 +removes the cap. +Defaults to +.Sy 1048576 . .El .Sh ZEVENTS A zevent is comprised of a list of nvpairs (name/value pairs). diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 12fcecf0d..21fef4c62 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -88,6 +88,8 @@ int zfs_max_recordsize = 16 * 1024 * 1024; #endif static int zfs_allow_redacted_dataset_mount = 0; +int zfs_snapshot_history_enabled = 1; + #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ @@ -1867,7 +1869,8 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dir_snap_cmtime_update(ds->ds_dir, tx); - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); + if (zfs_snapshot_history_enabled) + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void @@ -4985,6 +4988,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, "Allow mounting of redacted datasets"); +ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW, + "Include snapshot events in pool history/events"); + EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); EXPORT_SYMBOL(dsl_dataset_hold_obj); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 778b6b99b..100a48cb4 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -49,6 +49,8 @@ #include #include +extern int zfs_snapshot_history_enabled; + int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { @@ -321,14 +323,19 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + spa_history_log_internal_ds(ds, "defer_destroy", tx, + " "); + } return; } ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, " "); + } dsl_scan_ds_destroyed(ds, tx);