mirror of
https://dev.lirent.ru/Vatrog/vm-automation-signaling.git
synced 2026-06-26 04:36:37 +03:00
319 lines
12 KiB
C
319 lines
12 KiB
C
|
|
/* socket.c — out-of-process control over a unix socket.
|
||
|
|
*
|
||
|
|
* The listener registers in the core as a SLOT_SOURCE (listen-fd). On accept the
|
||
|
|
* peer is authenticated via SO_PEERCRED, the policy issues a neutral grant; an empty
|
||
|
|
* grant => the connection is closed (not a valid poller). Otherwise a per-conn
|
||
|
|
* control is created: its fd is driven by the epoll core, DOWN frames are parsed and
|
||
|
|
* dispatched through emit_down (enforced by the grant), UP events are serialized into
|
||
|
|
* a frame. On EOF — deferred reap.
|
||
|
|
*
|
||
|
|
* DoS protection: per-uid limit of concurrent connections (against eviction of
|
||
|
|
* legitimate ones); a janitor timerfd detaches "stuck" partial frames (slowloris).
|
||
|
|
* The global ceiling and slot reuse live in the core. */
|
||
|
|
#define _GNU_SOURCE
|
||
|
|
#include "vmsig_socket.h"
|
||
|
|
#include "core_internal.h" /* core_add_source, core_request_drop, add_control */
|
||
|
|
#include <sys/socket.h>
|
||
|
|
#include <sys/uio.h>
|
||
|
|
#include <sys/un.h>
|
||
|
|
#include <sys/timerfd.h>
|
||
|
|
#include <sys/stat.h> /* umask */
|
||
|
|
#include <unistd.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <stddef.h>
|
||
|
|
#include <errno.h>
|
||
|
|
#include <stdint.h>
|
||
|
|
#include <time.h>
|
||
|
|
|
||
|
|
#define VMSIG_SOCK_PER_UID_MAX 8 /* concurrent connections per uid */
|
||
|
|
#define VMSIG_SOCK_IDLE_NS (10ull * 1000000000ull) /* timeout for a stuck partial frame */
|
||
|
|
#define VMSIG_SOCK_JANITOR_S 5 /* sweep period */
|
||
|
|
|
||
|
|
typedef struct sock_listener sock_listener;
|
||
|
|
|
||
|
|
static uint64_t now_ns(void) {
|
||
|
|
struct timespec ts;
|
||
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||
|
|
return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ===== wire codec (public — also for external clients) ===== */
|
||
|
|
void vmsig_wire_encode(vmsig_wire* w, const vmsig_event* ev) {
|
||
|
|
memset(w, 0, sizeof *w);
|
||
|
|
w->magic = VMSIG_WIRE_MAGIC; w->version = VMSIG_WIRE_VERSION;
|
||
|
|
w->kind = ev->kind; w->source = ev->source; w->dir = ev->dir; w->prio = ev->prio;
|
||
|
|
w->endpoint = ev->endpoint; w->corr = ev->corr;
|
||
|
|
memcpy(w->inln, ev->inln, sizeof w->inln);
|
||
|
|
}
|
||
|
|
int vmsig_wire_decode(const vmsig_wire* w, vmsig_event* ev) {
|
||
|
|
if (w->magic != VMSIG_WIRE_MAGIC || w->version != VMSIG_WIRE_VERSION) return -1;
|
||
|
|
memset(ev, 0, sizeof *ev);
|
||
|
|
ev->kind = w->kind; ev->source = w->source; ev->dir = w->dir; ev->prio = w->prio;
|
||
|
|
ev->endpoint = w->endpoint; ev->corr = w->corr;
|
||
|
|
ev->payload.flags = VMSIG_PL_INLINE;
|
||
|
|
memcpy(ev->inln, w->inln, sizeof ev->inln);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ===== per-conn control ===== */
|
||
|
|
typedef struct sock_conn {
|
||
|
|
int fd;
|
||
|
|
vmsig_core* core;
|
||
|
|
int id;
|
||
|
|
uint32_t uid;
|
||
|
|
uint64_t last_ns; /* activity for the janitor */
|
||
|
|
sock_listener* L;
|
||
|
|
struct sock_conn* lnext; /* listener's connection list */
|
||
|
|
int (*emit_down)(void* token, vmsig_event*);
|
||
|
|
void* token;
|
||
|
|
uint8_t buf[sizeof(vmsig_wire)];
|
||
|
|
size_t buflen;
|
||
|
|
} sock_conn;
|
||
|
|
|
||
|
|
static int conn_fd(void* ctl) { return ((sock_conn*)ctl)->fd; }
|
||
|
|
|
||
|
|
static int conn_subscribe(void* ctl, vmsig_sub* out) {
|
||
|
|
(void)ctl; memset(out, 0, sizeof *out); return 0; /* everything; the grant gates it */
|
||
|
|
}
|
||
|
|
|
||
|
|
static int conn_deliver(void* ctl, const vmsig_event* ev) {
|
||
|
|
sock_conn* c = ctl;
|
||
|
|
vmsig_wire w;
|
||
|
|
vmsig_wire_encode(&w, ev);
|
||
|
|
ssize_t r = write(c->fd, &w, sizeof w); /* best-effort; EAGAIN => frame dropped */
|
||
|
|
(void)r;
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void conn_set_emit_down(void* ctl, int (*emit)(void* token, vmsig_event*), void* token) {
|
||
|
|
sock_conn* c = ctl; c->emit_down = emit; c->token = token;
|
||
|
|
}
|
||
|
|
|
||
|
|
static int conn_on_readable(void* ctl) {
|
||
|
|
sock_conn* c = ctl;
|
||
|
|
for (;;) {
|
||
|
|
ssize_t n = read(c->fd, c->buf + c->buflen, sizeof c->buf - c->buflen);
|
||
|
|
if (n == 0) { core_request_drop(c->core, c->id); return 0; } /* EOF */
|
||
|
|
if (n < 0) {
|
||
|
|
if (errno == EAGAIN || errno == EWOULDBLOCK) break;
|
||
|
|
core_request_drop(c->core, c->id);
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
c->last_ns = now_ns();
|
||
|
|
c->buflen += (size_t)n;
|
||
|
|
if (c->buflen == sizeof c->buf) {
|
||
|
|
vmsig_event ev;
|
||
|
|
if (vmsig_wire_decode((const vmsig_wire*)c->buf, &ev) == 0) {
|
||
|
|
ev.dir = VMSIG_DIR_DOWN; /* from a poller — DOWN only */
|
||
|
|
if (c->emit_down) c->emit_down(c->token, &ev); /* enforced by the grant */
|
||
|
|
}
|
||
|
|
c->buflen = 0;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ===== listener ===== */
|
||
|
|
struct sock_listener {
|
||
|
|
int listen_fd;
|
||
|
|
int janitor_fd;
|
||
|
|
vmsig_core* core;
|
||
|
|
vmsig_socket_policy policy;
|
||
|
|
void* ud;
|
||
|
|
sock_conn* conns; /* singly-linked list of active connections */
|
||
|
|
};
|
||
|
|
|
||
|
|
static void listener_unlink(sock_listener* L, sock_conn* c) {
|
||
|
|
sock_conn** pp = &L->conns;
|
||
|
|
while (*pp) { if (*pp == c) { *pp = c->lnext; return; } pp = &(*pp)->lnext; }
|
||
|
|
}
|
||
|
|
|
||
|
|
static int listener_uid_count(sock_listener* L, uint32_t uid) {
|
||
|
|
int n = 0;
|
||
|
|
for (sock_conn* c = L->conns; c; c = c->lnext) if (c->uid == uid) n++;
|
||
|
|
return n;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void conn_close(void* ctl) {
|
||
|
|
sock_conn* c = ctl;
|
||
|
|
if (c->L) listener_unlink(c->L, c);
|
||
|
|
if (c->fd >= 0) close(c->fd);
|
||
|
|
free(c);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Send a SINGLE 80-byte vmsig_wire frame + ONE RO-fd in a cmsg (SCM_RIGHTS). This keeps
|
||
|
|
* the control-socket stream fixed-framed at sizeof(vmsig_wire): the client reads one
|
||
|
|
* frame via recvmsg and extracts the fd only on an fd-carrying frame. Partial cmsg
|
||
|
|
* transfer is not allowed (the fd is all-or-nothing): a short sendmsg -> -1. Shared
|
||
|
|
* primitive for the memctx handoff (one SCM_RIGHTS mechanism). */
|
||
|
|
static int conn_send_fd_frame(sock_conn* c, const vmsig_wire* w, int fd) {
|
||
|
|
struct iovec iov;
|
||
|
|
iov.iov_base = (void*)w;
|
||
|
|
iov.iov_len = sizeof *w;
|
||
|
|
|
||
|
|
union {
|
||
|
|
char buf[CMSG_SPACE(sizeof(int))];
|
||
|
|
struct cmsghdr align;
|
||
|
|
} cm;
|
||
|
|
memset(&cm, 0, sizeof cm);
|
||
|
|
|
||
|
|
struct msghdr mh;
|
||
|
|
memset(&mh, 0, sizeof mh);
|
||
|
|
mh.msg_iov = &iov;
|
||
|
|
mh.msg_iovlen = 1;
|
||
|
|
mh.msg_control = cm.buf;
|
||
|
|
mh.msg_controllen = sizeof cm.buf;
|
||
|
|
|
||
|
|
struct cmsghdr* cmsg = CMSG_FIRSTHDR(&mh);
|
||
|
|
cmsg->cmsg_level = SOL_SOCKET;
|
||
|
|
cmsg->cmsg_type = SCM_RIGHTS;
|
||
|
|
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
|
||
|
|
memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
|
||
|
|
|
||
|
|
for (;;) {
|
||
|
|
ssize_t n = sendmsg(c->fd, &mh, MSG_NOSIGNAL);
|
||
|
|
if (n < 0) {
|
||
|
|
if (errno == EINTR) continue;
|
||
|
|
return -1;
|
||
|
|
}
|
||
|
|
return ((size_t)n == sizeof *w) ? 0 : -1; /* partial frame -> failure */
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Core -> socket-control: handoff of an address-space context (kind=MEMCTX, inln=vmsig_memctx
|
||
|
|
* POD) + RO-fd of the RAM region in a cmsg. The segs payload does NOT go on the wire (the
|
||
|
|
* fixed-framed vmsig_wire carries only inln); the holder opens it at `low`. */
|
||
|
|
static int conn_attach_memctx(void* ctl, const vmsig_event* ev, int fd) {
|
||
|
|
sock_conn* c = ctl;
|
||
|
|
if (fd < 0 || !ev) return -1;
|
||
|
|
vmsig_wire w;
|
||
|
|
vmsig_wire_encode(&w, ev); /* kind=MEMCTX, inln=vmsig_memctx; payload is not serialized */
|
||
|
|
return conn_send_fd_frame(c, &w, fd);
|
||
|
|
}
|
||
|
|
|
||
|
|
static const vmsig_control_ops CONN_OPS = {
|
||
|
|
.name = "socket",
|
||
|
|
.fd = conn_fd, .subscribe = conn_subscribe, .deliver = conn_deliver,
|
||
|
|
.on_readable = conn_on_readable, .set_emit_down = conn_set_emit_down, .close = conn_close,
|
||
|
|
.attach_memctx = conn_attach_memctx
|
||
|
|
};
|
||
|
|
|
||
|
|
static void on_accept(void* user, uint32_t events) {
|
||
|
|
(void)events;
|
||
|
|
sock_listener* L = user;
|
||
|
|
for (;;) {
|
||
|
|
int fd = accept4(L->listen_fd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
|
||
|
|
if (fd < 0) break; /* EAGAIN / other — done */
|
||
|
|
|
||
|
|
uint32_t uid = (uint32_t)-1, pid = 0;
|
||
|
|
struct ucred uc; socklen_t ul = sizeof uc;
|
||
|
|
if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &ul) == 0) {
|
||
|
|
uid = (uint32_t)uc.uid; pid = (uint32_t)uc.pid;
|
||
|
|
}
|
||
|
|
vmsig_grant g;
|
||
|
|
if (L->policy) g = L->policy(uid, pid, L->ud);
|
||
|
|
else memset(&g, 0, sizeof g);
|
||
|
|
|
||
|
|
if (g.cap_mask == 0 || g.endpoint_mask == 0) { /* not a valid poller */
|
||
|
|
vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid };
|
||
|
|
core_audit(L->core, &a);
|
||
|
|
close(fd);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
if (listener_uid_count(L, uid) >= VMSIG_SOCK_PER_UID_MAX) { /* anti-eviction */
|
||
|
|
vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid };
|
||
|
|
core_audit(L->core, &a);
|
||
|
|
close(fd);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
sock_conn* conn = calloc(1, sizeof *conn);
|
||
|
|
if (!conn) { close(fd); continue; }
|
||
|
|
conn->fd = fd; conn->core = L->core; conn->id = -1;
|
||
|
|
conn->uid = uid; conn->last_ns = now_ns(); conn->L = L;
|
||
|
|
conn->lnext = L->conns; L->conns = conn;
|
||
|
|
int id = vmsig_core_add_control(L->core, &CONN_OPS, conn, &g);
|
||
|
|
if (id < 0) { /* no slot — reject */
|
||
|
|
vmsig_audit a = { VMSIG_AUDIT_REJECT, uid, 0, 0, pid };
|
||
|
|
core_audit(L->core, &a);
|
||
|
|
listener_unlink(L, conn); close(fd); free(conn); continue;
|
||
|
|
}
|
||
|
|
conn->id = id;
|
||
|
|
vmsig_audit a = { VMSIG_AUDIT_ADMIT, g.principal, 0, 0, pid };
|
||
|
|
core_audit(L->core, &a);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/* janitor: detach connections with a stuck partial frame (slowloris) */
|
||
|
|
static void on_janitor(void* user, uint32_t events) {
|
||
|
|
(void)events;
|
||
|
|
sock_listener* L = user;
|
||
|
|
uint64_t v;
|
||
|
|
while (read(L->janitor_fd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain */ }
|
||
|
|
uint64_t now = now_ns();
|
||
|
|
for (sock_conn* c = L->conns; c; c = c->lnext)
|
||
|
|
if (c->buflen > 0 && now - c->last_ns > VMSIG_SOCK_IDLE_NS)
|
||
|
|
core_request_drop(c->core, c->id);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* listener cleanup on core_free (owner = the core, via on_free of the first source) */
|
||
|
|
static void listener_free(void* user) {
|
||
|
|
sock_listener* L = user;
|
||
|
|
if (L->janitor_fd >= 0) close(L->janitor_fd);
|
||
|
|
if (L->listen_fd >= 0) close(L->listen_fd);
|
||
|
|
free(L);
|
||
|
|
}
|
||
|
|
|
||
|
|
int vmsig_socket_attach(vmsig_core* core, const char* path,
|
||
|
|
vmsig_socket_policy policy, void* ud) {
|
||
|
|
if (!core || !path || !*path) return -1;
|
||
|
|
int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
|
||
|
|
if (fd < 0) return -1;
|
||
|
|
|
||
|
|
struct sockaddr_un addr;
|
||
|
|
memset(&addr, 0, sizeof addr);
|
||
|
|
addr.sun_family = AF_UNIX;
|
||
|
|
socklen_t alen;
|
||
|
|
size_t n = strlen(path);
|
||
|
|
if (path[0] == '@') { /* abstract namespace */
|
||
|
|
if (n > sizeof addr.sun_path) { close(fd); return -1; }
|
||
|
|
addr.sun_path[0] = 0;
|
||
|
|
memcpy(addr.sun_path + 1, path + 1, n - 1);
|
||
|
|
alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n);
|
||
|
|
} else { /* filesystem path */
|
||
|
|
if (n >= sizeof addr.sun_path) { close(fd); return -1; }
|
||
|
|
unlink(path);
|
||
|
|
memcpy(addr.sun_path, path, n);
|
||
|
|
alen = (socklen_t)sizeof addr;
|
||
|
|
}
|
||
|
|
/* Create the filesystem socket with restrictive perms (0600): the path must not be
|
||
|
|
* the only gate — connect requires write, so we open it to the owner only.
|
||
|
|
* (An abstract socket has no FS perms; its access is bounded by the net namespace.) */
|
||
|
|
mode_t old_um = 0;
|
||
|
|
int restrict_perm = (path[0] != '@');
|
||
|
|
if (restrict_perm) old_um = umask(0177);
|
||
|
|
int br = bind(fd, (struct sockaddr*)&addr, alen);
|
||
|
|
if (restrict_perm) umask(old_um);
|
||
|
|
if (br < 0) { close(fd); return -1; }
|
||
|
|
if (listen(fd, 64) < 0) { close(fd); return -1; }
|
||
|
|
|
||
|
|
int jfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
|
||
|
|
if (jfd < 0) { close(fd); return -1; }
|
||
|
|
struct itimerspec its;
|
||
|
|
memset(&its, 0, sizeof its);
|
||
|
|
its.it_interval.tv_sec = VMSIG_SOCK_JANITOR_S;
|
||
|
|
its.it_value = its.it_interval;
|
||
|
|
if (timerfd_settime(jfd, 0, &its, NULL) < 0) { close(jfd); close(fd); return -1; }
|
||
|
|
|
||
|
|
sock_listener* L = calloc(1, sizeof *L);
|
||
|
|
if (!L) { close(jfd); close(fd); return -1; }
|
||
|
|
L->listen_fd = fd; L->janitor_fd = jfd; L->core = core; L->policy = policy; L->ud = ud;
|
||
|
|
/* the listen source owns the listener (on_free=listener_free closes both fds + free) */
|
||
|
|
if (core_add_source(core, fd, on_accept, L, listener_free) < 0) {
|
||
|
|
close(jfd); close(fd); free(L); return -1;
|
||
|
|
}
|
||
|
|
/* janitor without on_free (L already belongs to the core); on error core_free releases it */
|
||
|
|
if (core_add_source(core, jfd, on_janitor, L, NULL) < 0) return -1;
|
||
|
|
return 0;
|
||
|
|
}
|