Proof of concept for pid1 sentinel executor

This commit is contained in:
Radon Rosborough 2021-08-08 12:04:13 -07:00
parent 43d2a20971
commit 389d8d0ad6
6 changed files with 201 additions and 136 deletions

4
.gitignore vendored
View File

@ -4,8 +4,6 @@
.lsp-repl-history
.terraform
build
# Separate directory for things that are ignored by Git but not by
# Docker.
build-docker
node_modules
out
sentinel.h

View File

@ -160,7 +160,7 @@ system: # Compile setuid binary for production
./system/compile.bash
system-dev: # Compile and watch setuid binary for development
watchexec -w system/src -n -- ./system/compile.bash
watchexec -w system/res -w system/src -n -- ./system/compile.bash
supervisor: # Compile supervisor binary for production
./supervisor/compile.bash

View File

@ -8,16 +8,21 @@ if [[ ! -d system/src ]]; then
fi
# Print the given command line, then run it with the same arguments.
# NOTE(review): this listing looks like a rendered diff with the +/- markers
# stripped, so the stdout echo and the stderr echo are presumably the old and
# new versions of one line rather than two consecutive statements; confirm
# against the real script.
function verbosely {
echo "$@"
echo >&2 "$@"
"$@"
}
mkdir -p system/out
rm -f system/out/*
pushd system/res >/dev/null
verbosely xxd -i sentinel.bash > ../src/sentinel.h
popd >/dev/null
for src in system/src/*.c; do
out="${src/src/out}"
out="${out/.c}"
verbosely clang -Wall -Wextra -Werror -std=c11 "${src}" -o "${out}"
verbosely clang -Isystem/res -Wall -Wextra -Werror -std=c11 "${src}" -o "${out}"
if [[ "${out}" == *-privileged && -z "${UNPRIVILEGED:-}" ]]; then
verbosely sudo chown root:riju "${out}"
verbosely sudo chmod a=,g=rx,u=rwxs "${out}"

View File

@ -1,93 +0,0 @@
#!/usr/bin/env python3
import argparse
import signal
import subprocess
import sys
import uuid
# NOTE(review): the indentation of this script has been lost in this listing;
# the comments below describe the statements in the order they appear.

# Argument parser whose help output is replaced by a fixed usage text.
class Parser(argparse.ArgumentParser):
def format_help(self):
return """
Usage: docker-exec.bash [OPTIONS] CONTAINER COMMAND [ARG...]
Run a command in a running container
Options:
-i, --interactive Keep STDIN open even if not attached
-t, --tty Allocate a pseudo-TTY
-u, --user string Username or UID (format: <name|uid>:[<group|gid>])
"""
# Command line: optional -i/-t/-u flags, a container name, and the command
# (zero or more words) to run in it.
parser = Parser()
parser.add_argument("-i", "--interactive", action="store_true")
parser.add_argument("-t", "--tty", action="store_true")
parser.add_argument("-u", "--user", type=str)
parser.add_argument("container", type=str)
parser.add_argument("arg", type=str, nargs="*")
args = parser.parse_args()
# Each invocation gets its own pidfile, named with a random UUID (dashes
# removed), under the pidfiles directory.
pidfiles = "/var/run/riju/pidfiles"
pidfile = pidfiles + "/" + str(uuid.uuid4()).replace("-", "")
# We have to use 'kill -9' here, otherwise runuser intercepts the
# signal and takes its sweet time cleaning up.
# Signal handler: runs a second 'docker exec' that, if the pidfile exists,
# sends SIGKILL to the process group whose ID is stored in it and then removes
# the pidfile.
def cleanup(*ignored_args):
subprocess.run(
[
"docker",
"exec",
args.container,
"bash",
"-c",
f"""
set -euo pipefail
if [[ -f '{pidfile}' ]]; then
kill -9 -$(< '{pidfile}') 2>/dev/null || true
rm -f '{pidfile}'
fi
""",
]
)
# Run the cleanup on SIGINT and SIGTERM.
signal.signal(signal.SIGINT, cleanup)
signal.signal(signal.SIGTERM, cleanup)
# Forward -i / -t to 'docker exec' only when they were given.
exec_args = []
if args.interactive:
exec_args.append("-i")
if args.tty:
exec_args.append("-t")
# When a user was requested, the command is wrapped in 'runuser -u USER --'.
runuser_args = []
if args.user:
runuser_args = ["runuser", "-u", args.user, "--"]
# Run the command in the container through a small bash wrapper that records
# its own PID in the pidfile and then replaces itself with the command; this
# script exits with the return code of that 'docker exec'.
sys.exit(
subprocess.run(
[
"docker",
"exec",
*exec_args,
args.container,
"bash",
"-c",
f"""
set -euo pipefail
umask 077
mkdir -p '{pidfiles}'
echo "$$" > '{pidfile}'
exec "$@"
""",
"--",
*runuser_args,
*args.arg,
]
).returncode
)

27
system/res/sentinel.bash Executable file
View File

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Control loop: reads commands, one per line and split into words, from the
# FIFO /var/run/riju/share/control.
#
# Commands:
#   ping                     accepted and ignored
#   (exec|pty) UUID ARG...   run ARG... as user riju, with stdin read from and
#                            stdout/stderr written to FIFOs named after UUID
#                            (both command names take the same branch here)
#
# The loop, and with it the script, ends as soon as a read fails: either no
# line arrived within 2 seconds or the control FIFO reached end-of-file.
set -euo pipefail
# No -r: read treats a backslash as an escape for the following character, so
# backslash-escaped characters do not split words.
while read -t2 -a cmd; do
# Blank lines are skipped.
if (( "${#cmd[@]}" > 0 )); then
case "${cmd[0]}" in
ping) ;;
exec|pty)
# Needs the command name, a UUID and at least one argument.
if (( "${#cmd[@]}" < 3 )); then
echo >&2 "usage: (exec|pty) UUID ARG..."
else
uuid="${cmd[1]}"
args=("${cmd[@]:2}")
echo >&2 "${cmd[0]} ${args[0]} with UUID ${uuid}"
# Per-command FIFOs carrying the command's input and output.
input="/var/run/riju/share/cmd-${uuid}-input"
output="/var/run/riju/share/cmd-${uuid}-output"
mkfifo "${input}" "${output}"
# Started in the background so the loop keeps reading control commands.
runuser -u riju -- bash -c 'exec "$@"' sentinel "${args[@]}" < "${input}" &> "${output}" &
fi
;;
*)
echo >&2 "unrecognized command: ${cmd[0]}"
;;
esac
fi
done < /var/run/riju/share/control

View File

@ -7,18 +7,23 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/random.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include "sentinel.h"
/*
 * Report a fatal error and terminate the process.
 *
 * Writes msg followed by a newline to stderr, then exits with status 1.
 * Never returns.
 */
void __attribute__((noreturn)) die(char *msg)
{
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
}
/*
 * Make the embedded sentinel script usable as a C string.
 *
 * Overwrites the final byte of the sentinel_bash array (declared in
 * sentinel.h) with a NUL terminator.
 */
void init()
{
  sentinel_bash[sentinel_bash_len - 1] = '\0';
}
void die_with_usage()
{
die("usage:\n"
@ -27,6 +32,70 @@ void die_with_usage()
" riju-system-privileged pty UUID CMDLINE...");
}
/*
 * Shell-quote a command line so it can be sent as a single line of text.
 *
 * Runs the external command `printf '%q ' ARG...` (looked up on PATH) with
 * the given arguments and captures everything it writes to standard output.
 *
 * argc:    number of entries in cmdline.
 * cmdline: the argument strings to quote; they are not modified.
 *
 * Returns a newly allocated, NUL-terminated string owned by the caller.
 * Any failure (allocation, pipe, fork, read, or a non-zero exit status from
 * printf) terminates the process via die().
 */
char *quoteArgs(int argc, char **cmdline)
{
  if (argc < 0)
    die("invalid argument count");
  size_t nargs = (size_t)argc;
  // Room for "printf", the format string, the arguments and the final NULL.
  char **printfArgs = malloc(sizeof(char *) * (nargs + 3));
  if (printfArgs == NULL)
    die("malloc failed");
  printfArgs[0] = "printf";
  printfArgs[1] = "%q ";
  if (nargs > 0)
    memcpy(printfArgs + 2, cmdline, sizeof(char *) * nargs);
  printfArgs[nargs + 2] = NULL;
  int fd[2];
  if (pipe(fd) < 0)
    die("pipe failed");
  pid_t pid = fork();
  if (pid < 0)
    die("fork failed");
  else if (pid == 0) {
    // Child: send stdout into the pipe, then become printf.
    if (dup2(fd[1], STDOUT_FILENO) < 0)
      die("dup2 failed");
    if (close(fd[0]) < 0 || close(fd[1]) < 0)
      die("close failed");
    execvp(printfArgs[0], printfArgs);
    die("execvp failed");
  }
  // Parent: close the write end so read() sees EOF once the child exits.
  if (close(fd[1]) < 0)
    die("close failed");
  // The argument vector was only needed by the child.
  free(printfArgs);
  size_t capacity = 1024;
  size_t length = 0;
  char *buf = malloc(capacity);
  if (buf == NULL)
    die("malloc failed");
  while (1) {
    // Keep at least one byte to read into plus one for the terminating NUL.
    if (capacity - length < 2) {
      capacity *= 2;
      char *grown = realloc(buf, capacity);
      if (grown == NULL)
        die("realloc failed");
      buf = grown;
    }
    ssize_t len_read = read(fd[0], buf + length, capacity - length - 1);
    if (len_read < 0) {
      if (errno == EINTR)
        continue;
      die("read failed");
    }
    if (len_read == 0)
      break;
    length += (size_t)len_read;
  }
  if (close(fd[0]) < 0)
    die("close failed");
  // Reap the child so it does not linger as a zombie, and refuse to return
  // output from a printf that did not finish successfully.
  int status = 0;
  pid_t waited;
  do {
    waited = waitpid(pid, &status, 0);
  } while (waited < 0 && errno == EINTR);
  if (waited < 0)
    die("waitpid failed");
  if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
    die("printf failed");
  buf[length] = '\0';
  return buf;
}
/*
 * Generate a random identifier.
 *
 * Obtains 16 bytes from getrandom() and encodes them as 32 lowercase
 * hexadecimal characters.
 *
 * Returns a newly allocated, NUL-terminated string owned by the caller.
 * Terminates the process via die() if random bytes or memory are unavailable.
 */
char *getUUID()
{
  enum { UUID_BYTES = 16 };
  static const char hex_digits[] = "0123456789abcdef";
  // Lives on the stack so there is nothing to free; unsigned so that each
  // byte can be used directly as a table index.
  unsigned char bytes[UUID_BYTES];
  if (getrandom(bytes, sizeof(bytes), 0) != (ssize_t)sizeof(bytes))
    die("getrandom failed");
  char *uuid = malloc(2 * UUID_BYTES + 1);
  if (uuid == NULL)
    die("malloc failed");
  for (int i = 0; i < UUID_BYTES; i++) {
    uuid[2 * i] = hex_digits[bytes[i] >> 4];
    uuid[2 * i + 1] = hex_digits[bytes[i] & 0x0f];
  }
  uuid[2 * UUID_BYTES] = '\0';
  return uuid;
}
char *parseUUID(char *uuid)
{
if (strnlen(uuid, 33) != 32)
@ -55,15 +124,17 @@ char *parseImageHash(char *imageHash)
return imageHash;
}
char *timeout_msg;
/*
 * SIGALRM handler used as a timeout: terminates the process via die().
 *
 * NOTE(review): this listing looks like a rendered diff with the +/- markers
 * stripped, so the call with the fixed 10-second message and the call with
 * timeout_msg are presumably the old and new versions of one line. die() is
 * declared noreturn, so as written only the first call can run; confirm
 * against the real file.
 */
void wait_alarm(int signum)
{
(void)signum;
die("container did not come up within 10 seconds");
die(timeout_msg);
}
void session(char *uuid, char *lang, char *imageHash)
{
char *image, *container, *hostname, *volume, *fifo;
char *image, *container, *hostname, *share, *volume, *fifo;
if ((imageHash != NULL ? asprintf(&image, "riju:lang-%s-%s", lang, imageHash)
: asprintf(&image, "riju:lang-%s", lang)) < 0)
die("asprintf failed");
@ -71,20 +142,20 @@ void session(char *uuid, char *lang, char *imageHash)
die("asprintf failed");
if (asprintf(&hostname, "HOSTNAME=%s", lang) < 0)
die("asprintf failed");
int rv = mkdir("/var/run/riju/sentinels", 0700);
if (asprintf(&share, "/var/run/riju/shares/%s", uuid) < 0)
die("asprintf failed");
int rv = mkdir("/var/run/riju/shares", 0700);
if (rv < 0 && errno != EEXIST)
die("mkdir failed");
char tmpdir[] = "/var/run/riju/sentinels/XXXXXX";
if (mkdtemp(tmpdir) == NULL)
die("mkdtemp failed");
if (asprintf(&volume, "%s:/var/run/riju/sentinel", tmpdir) < 0)
rv = mkdir(share, 0700);
if (rv < 0 && errno != EEXIST)
die("mkdir failed");
if (asprintf(&volume, "%s:/var/run/riju/share", share) < 0)
die("asprintf failed");
if (asprintf(&fifo, "%s/fifo", tmpdir) < 0)
if (asprintf(&fifo, "%s/control", share) < 0)
die("asprintf failed");
if (mknod(fifo, 0700 | S_IFIFO, 0) < 0)
die("mknod failed");
char sentinel[] = "cat /var/run/riju/sentinel/fifo | ( sleep 10; while "
"read -t2; do :; done; pkill -g0 )";
pid_t pid = fork();
if (pid < 0)
die("fork failed");
@ -138,15 +209,16 @@ void session(char *uuid, char *lang, char *imageHash)
image,
"bash",
"-c",
sentinel,
(char *)sentinel_bash,
NULL,
};
execvp(argv[0], argv);
die("execvp failed");
}
struct timespec ts_10ms; // 10ms
struct timespec ts_10ms;
ts_10ms.tv_sec = 0;
ts_10ms.tv_nsec = 1000 * 1000 * 10;
timeout_msg = "container did not come up within 10 seconds";
signal(SIGALRM, wait_alarm);
alarm(10);
int fd;
@ -161,20 +233,16 @@ void session(char *uuid, char *lang, char *imageHash)
die("nanosleep failed");
}
signal(SIGALRM, SIG_IGN);
if (unlink(fifo) < 0)
die("unlink failed");
if (rmdir(tmpdir) < 0)
die("rmdir failed");
pid = fork();
if (pid < 0)
die("fork failed");
else if (pid == 0) {
struct timespec ts_1s; // 10ms
struct timespec ts_1s;
ts_1s.tv_sec = 1;
ts_1s.tv_nsec = 0;
while (1) {
static const char ok[] = "ok\n";
if (write(fd, ok, sizeof(ok) / sizeof(char)) < 0)
static const char ok[] = "ping\n";
if (write(fd, ok, sizeof(ok) / sizeof(char)) != sizeof(ok) / sizeof(char))
die("write failed");
int rv = nanosleep(&ts_1s, NULL);
if (rv != 0 && errno != EINTR)
@ -190,29 +258,89 @@ void session(char *uuid, char *lang, char *imageHash)
// Run a command in a session by asking the sentinel to start it.
//
// Writes a "pty" or "exec" line (chosen by the pty flag) carrying a fresh
// random ID and the shell-quoted cmdline to the session's control FIFO, then
// forks: the child copies stdin into the command's input FIFO while the
// parent copies the command's output FIFO to stdout.
//
// uuid:    session ID; selects the directory /var/run/riju/shares/<uuid>.
// argc:    number of entries in cmdline.
// cmdline: the command and its arguments.
// pty:     selects the "pty" command word instead of "exec".
//
// NOTE(review): this listing looks like a rendered diff with the +/- markers
// stripped. The lines that build `container` and the docker-exec.py argv
// appear to be the removed implementation shown next to its replacement;
// confirm against the real file before relying on them.
void exec(char *uuid, int argc, char **cmdline, bool pty)
{
char *container;
if (asprintf(&container, "riju-session-%s", uuid) < 0)
char *share, *ctlFIFO, *inputFIFO, *outputFIFO, *ctlCmd, *dataFIFO;
if (asprintf(&share, "/var/run/riju/shares/%s", uuid) < 0)
die("asprintf failed");
char *argvPrefix[] = {
"./system/res/docker-exec.py",
"--user",
"riju",
pty ? "-it" : "-i",
container,
"--",
};
char **argv = malloc(sizeof(argvPrefix) + (argc + 1) * sizeof(char *));
if (argv == NULL)
die("malloc failed");
memcpy(argv, argvPrefix, sizeof(argvPrefix));
memcpy((void *)argv + sizeof(argvPrefix), cmdline, argc * sizeof(char *));
// NOTE(review): this index is computed in bytes, not in array elements.
argv[sizeof(argvPrefix) + argc * sizeof(char *)] = NULL;
execvp(argv[0], argv);
die("execvp failed");
if (asprintf(&ctlFIFO, "%s/control", share) < 0)
die("asprintf failed");
// Each command gets its own random ID, used to name its input/output FIFOs.
char *procUUID = getUUID();
if (asprintf(&inputFIFO, "%s/cmd-%s-input", share, procUUID) < 0)
die("asprintf failed");
if (asprintf(&outputFIFO, "%s/cmd-%s-output", share, procUUID) < 0)
die("asprintf failed");
int fd = open(ctlFIFO, O_WRONLY);
if (fd < 0)
die("open failed");
// Control line format: "<pty|exec> <ID> <quoted args>\n".
char *quotedArgs = quoteArgs(argc, cmdline);
int len = asprintf(&ctlCmd, "%s %s %s\n", pty ? "pty" : "exec", procUUID,
quotedArgs);
if (len < 0)
die("asprintf failed");
// Keep writing after partial writes; the loop ends when write() returns 0 or
// an error. ctlCmd is advanced in place, so the start of the allocation is
// not kept.
int len_written;
while ((len_written = write(fd, ctlCmd, len)) > 0) {
ctlCmd += len_written;
len -= len_written;
}
if (len_written < 0)
die("write failed");
close(fd);
struct timespec ts_10ms;
ts_10ms.tv_sec = 0;
ts_10ms.tv_nsec = 1000 * 1000 * 10;
int mode;
// Split into two processes: the child handles the input FIFO (write side),
// the parent handles the output FIFO (read side).
pid_t pid = fork();
if (pid < 0)
die("fork failed");
else if (pid == 0) {
dataFIFO = inputFIFO;
timeout_msg = "sentinel did not set up input FIFO within 1 second";
mode = O_WRONLY;
} else {
dataFIFO = outputFIFO;
timeout_msg = "sentinel did not set up output FIFO within 1 second";
mode = O_RDONLY;
}
// Retry the open every 10ms while the FIFO does not exist yet; the alarm
// fires wait_alarm after 1 second, which ends the process with timeout_msg.
signal(SIGALRM, wait_alarm);
alarm(1);
while (1) {
fd = open(dataFIFO, mode);
if (fd >= 0)
break;
if (errno != ENOENT)
die("open failed");
int rv = nanosleep(&ts_10ms, NULL);
if (rv != 0 && errno != EINTR)
die("nanosleep failed");
}
signal(SIGALRM, SIG_IGN);
char buf[1024];
if (pid == 0) {
// Child: forward stdin to the input FIFO, completing partial writes.
while ((len = read(STDIN_FILENO, buf, 1024)) > 0) {
char *ptr = buf;
while (len > 0) {
len_written = write(fd, ptr, len);
if (len_written < 0)
die("write failed");
len -= len_written;
ptr += len_written;
}
}
} else {
// Parent: forward the output FIFO to stdout.
// NOTE(review): feof() is checked on stdout, an output stream; presumably
// this never triggers -- confirm the intent.
while ((len = read(fd, buf, 1024)) > 0) {
fwrite(buf, 1, len, stdout);
if (ferror(stdout))
die("fwrite failed");
if (feof(stdout))
break;
}
}
// Either loop leaves len negative when its read() failed.
// NOTE(review): within this function neither process closes its data FIFO
// descriptor or waits for the other.
if (len < 0)
die("read failed");
}
int main(int argc, char **argv)
{
init();
if (seteuid(0) != 0)
die("seteuid failed");
if (argc < 2)