From f5b7536235cd863c2f53682ef45b518504709ca7 Mon Sep 17 00:00:00 2001 From: Radon Rosborough Date: Sun, 1 Aug 2021 12:42:01 -0700 Subject: [PATCH] Many things fixed --- Makefile | 6 +-- packer/cloudwatch.json | 2 +- packer/docker.json | 3 ++ packer/promtail.service | 13 ++++++ packer/promtail.yaml | 63 +++++++++++++++++++++++++++++ packer/provision-web.bash | 35 ++++++++++++++-- packer/riju.slice | 14 +++++++ packer/web.pkr.hcl | 36 ++++++++++++++--- supervisor/src/main.go | 3 +- system/src/riju-system-privileged.c | 11 +++-- tf/asg.tf | 13 +++++- tf/iam.tf | 49 ++++++++++++++++++++++ tf/outputs.tf | 9 +++++ tools/depgraph.js | 2 +- 14 files changed, 237 insertions(+), 22 deletions(-) create mode 100644 packer/docker.json create mode 100644 packer/promtail.service create mode 100644 packer/promtail.yaml create mode 100644 packer/riju.slice diff --git a/Makefile b/Makefile index d5b396f..280c8ca 100644 --- a/Makefile +++ b/Makefile @@ -224,8 +224,8 @@ download: # L= T= : Download last published .deb from S3 aws s3 cp $(S3_DEB) $(BUILD)/$(DEB) undeploy: # Pull latest deployment config from S3 - mkdir -p $(BUILD) - aws s3 cp $(S3_CONFIG) $(BUILD)/config.json + mkdir -p build + aws s3 cp $(S3_CONFIG) build/config.json ### Publish artifacts to registries @@ -251,7 +251,7 @@ deploy-config: # Generate deployment config file node tools/generate-deploy-config.js deploy-latest: # Upload deployment config to S3 and update ASG instances - aws s3 cp $(BUILD)/config.json $(S3_CONFIG) + aws s3 cp build/config.json $(S3_CONFIG) deploy: deploy-config deploy-latest # Shorthand for deploy-config followed by deploy-latest diff --git a/packer/cloudwatch.json b/packer/cloudwatch.json index 5735764..f8b081c 100644 --- a/packer/cloudwatch.json +++ b/packer/cloudwatch.json @@ -22,7 +22,7 @@ "disk": { "measurement": ["used_percent"], "metrics_collection_interval": 60, - "resources": ["*"] + "resources": ["/", "/mnt/riju"] }, "mem": { "measurement": ["mem_used_percent"], diff --git a/packer/docker.json b/packer/docker.json new file mode 100644 index 0000000..f90c1b1 --- /dev/null +++ b/packer/docker.json @@ -0,0 +1,3 @@ +{ + "exec-opts": ["native.cgroupdriver=systemd"] +} diff --git a/packer/promtail.service b/packer/promtail.service new file mode 100644 index 0000000..115223a --- /dev/null +++ b/packer/promtail.service @@ -0,0 +1,13 @@ +[Unit] +Description=Promtail +StartLimitBurst=5 +StartLimitIntervalSec=300 + +[Service] +Type=exec +ExecStart=promtail -config.file /etc/promtail/config.yaml +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/packer/promtail.yaml b/packer/promtail.yaml new file mode 100644 index 0000000..0309713 --- /dev/null +++ b/packer/promtail.yaml @@ -0,0 +1,63 @@ +server: + http_listen_address: 0.0.0.0 + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +client: + url: https://72217:$GRAFANA_API_KEY@logs-prod-us-central1.grafana.net/api/prom/push + +scrape_configs: + - job_name: kernel + static_configs: + - labels: + source: kernel + __path__: /var/log/kern.log + - job_name: systemd + journal: + labels: + source: systemd + relabel_configs: + - source_labels: + - __journal__systemd_unit + regex: "(docker|riju)\\.service" + action: keep + - source_labels: + - __journal__systemd_unit + regex: "docker\\.service" + target_label: source + replacement: "dockerd" + - source_labels: + - __journal__systemd_unit + regex: "riju\\.service" + target_label: source + replacement: "supervisor" + - source_labels: + - source + regex: "systemd" + action: drop + - job_name: server + static_configs: + - labels: + source: server + __path__: /mnt/riju/docker/containers/*/*.log + pipeline_stages: + - json: + expressions: + log: log + stream: stream + tag: attrs.tag + time: time + - output: + source: log + - timestamp: + source: time + format: RFC3339Nano + - labels: + container: tag + stream: stream + - match: + selector: '{container!~"riju-app-(blue|green)"}' + action: drop diff --git a/packer/provision-web.bash b/packer/provision-web.bash index bf3ecd4..2dc9efe 100644 --- a/packer/provision-web.bash +++ b/packer/provision-web.bash @@ -7,6 +7,10 @@ set -euo pipefail : ${S3_BUCKET} : ${SUPERVISOR_ACCESS_TOKEN} +latest_release() { + curl -sSL "https://api.github.com/repos/$1/releases/latest" | jq -r .tag_name +} + # I think there is a race condition related to Ubuntu wanting to do an # automated system upgrade at boot, which causes 'apt-get update' to # sometimes fail with an obscure error message. @@ -31,7 +35,7 @@ deb [arch=amd64] https://download.docker.com/linux/ubuntu ${ubuntu_name} stable EOF sudo -E apt-get update -sudo -E apt-get install -y docker-ce docker-ce-cli containerd.io unzip whois +sudo -E apt-get install -y docker-ce docker-ce-cli containerd.io jq unzip whois wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscli.zip unzip -q awscli.zip @@ -42,10 +46,14 @@ wget -nv https://s3.us-west-1.amazonaws.com/amazon-ssm-us-west-1/latest/debian_a wget -nv https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/amd64/latest/amazon-cloudwatch-agent.deb sudo apt-get install -y ./amazon-cloudwatch-agent.deb -sudo chown root:root /tmp/cloudwatch.json /tmp/riju-init-volume /tmp/riju-supervisor /tmp/riju.service -sudo mv /tmp/riju-init-volume /tmp/riju-supervisor /usr/local/bin/ -sudo mv /tmp/riju.service /etc/systemd/system/ +sudo chown root:root \ + /tmp/cloudwatch.json /tmp/docker.json /tmp/riju.service \ + /tmp/riju.slice /tmp/riju-init-volume /tmp/riju-supervisor + +sudo mv /tmp/docker.json /etc/docker/daemon.json +sudo mv /tmp/riju.service /tmp/riju.slice /etc/systemd/system/ sudo mv /tmp/cloudwatch.json /opt/aws/amazon-cloudwatch-agent/bin/config.json +sudo mv /tmp/riju-init-volume /tmp/riju-supervisor /usr/local/bin/ sudo sed -Ei 's/^#?PermitRootLogin .*/PermitRootLogin no/' /etc/ssh/sshd_config sudo sed -Ei 's/^#?PasswordAuthentication .*/PasswordAuthentication no/' /etc/ssh/sshd_config @@ -61,6 +69,25 @@ sudo useradd admin -g admin -G sudo -s /usr/bin/bash -p "$(echo "${ADMIN_PASSWOR sudo amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json sudo systemctl enable riju +if [[ -n "${GRAFANA_API_KEY:-}" ]]; then + ver="$(latest_release grafana/loki)" + + wget -nv "https://github.com/grafana/loki/releases/download/${ver}/promtail-linux-amd64.zip" + unzip promtail-linux-amd64.zip + sudo cp promtail-linux-amd64 /usr/local/bin/promtail + + sudo chown root:root /tmp/promtail.service /tmp/promtail.yaml + + sudo mkdir /etc/promtail + sudo mv /tmp/promtail.yaml /etc/promtail/config.yaml + sudo mv /tmp/promtail.service /etc/systemd/system/ + sudo sed -Ei "s/\\\$GRAFANA_API_KEY/${GRAFANA_API_KEY}/" /etc/promtail/config.yaml + + sudo systemctl enable promtail +else + sudo rm /tmp/promtail.yaml /tmp/promtail.service +fi + sudo userdel ubuntu -f popd diff --git a/packer/riju.slice b/packer/riju.slice new file mode 100644 index 0000000..b7adec0 --- /dev/null +++ b/packer/riju.slice @@ -0,0 +1,14 @@ +[Unit] +Description=Resource limits for Riju user containers +Before=slices.target + +[Slice] +CPUAccounting=true +CPUQuota=100% +MemoryAccounting=true +MemoryMax=1G +MemorySwapMax=8G +TasksAccounting=true +TasksMax=2048 +IPAccounting=true +IPAddressDeny=169.254.169.254 diff --git a/packer/web.pkr.hcl b/packer/web.pkr.hcl index d176395..0c12855 100644 --- a/packer/web.pkr.hcl +++ b/packer/web.pkr.hcl @@ -13,6 +13,11 @@ variable "fathom_site_id" { default = "${env("FATHOM_SITE_ID")}" } +variable "grafana_api_key" { + type = string + default = "${env("GRAFANA_API_KEY")}" +} + variable "s3_bucket" { type = string default = "${env("S3_BUCKET")}" @@ -67,6 +72,31 @@ build { source = "cloudwatch.json" } + provisioner "file" { + destination = "/tmp/docker.json" + source = "docker.json" + } + + provisioner "file" { + destination = "/tmp/promtail.service" + source = "promtail.service" + } + + provisioner "file" { + destination = "/tmp/promtail.yaml" + source = "promtail.yaml" + } + + provisioner "file" { + destination = "/tmp/riju.service" + source = "riju.service" + } + + provisioner "file" { + destination = "/tmp/riju.slice" + source = "riju.slice" + } + provisioner "file" { destination = "/tmp/riju-init-volume" source = "riju-init-volume" @@ -77,16 +107,12 @@ build { source = "../supervisor/out/riju-supervisor" } - provisioner "file" { - destination = "/tmp/riju.service" - source = "riju.service" - } - provisioner "shell" { environment_vars = [ "ADMIN_PASSWORD=${var.admin_password}", "AWS_REGION=${var.aws_region}", "FATHOM_SITE_ID=${var.fathom_site_id}", + "GRAFANA_API_KEY=${var.grafana_api_key}", "S3_BUCKET=${var.s3_bucket}", "SUPERVISOR_ACCESS_TOKEN=${var.supervisor_access_token}", ] diff --git a/supervisor/src/main.go b/supervisor/src/main.go index 337d4c8..336c0be 100644 --- a/supervisor/src/main.go +++ b/supervisor/src/main.go @@ -352,8 +352,7 @@ func (sv *supervisor) reload() error { "--label", fmt.Sprintf("riju.deploy-config-hash=%s", deployCfgHash), "--name", name, "--restart", "unless-stopped", - "--oom-kill-disable", - "--cpu-shares", "2048", + "--log-opt", "tag={{.Name}}", fmt.Sprintf("riju:%s", deployCfg.AppImageTag), ) dockerRun.Stdout = os.Stdout diff --git a/system/src/riju-system-privileged.c b/system/src/riju-system-privileged.c index 2a1d1a0..d742370 100644 --- a/system/src/riju-system-privileged.c +++ b/system/src/riju-system-privileged.c @@ -83,6 +83,8 @@ void session(char *uuid, char *lang, char *imageHash) die("asprintf failed"); if (mknod(fifo, 0700 | S_IFIFO, 0) < 0) die("mknod failed"); + char sentinel[] = "cat /var/run/riju/sentinel/fifo | ( sleep 10; while " + "read -t2; do :; done; pkill -g0 )"; pid_t pid = fork(); if (pid < 0) die("fork failed"); @@ -128,14 +130,15 @@ void session(char *uuid, char *lang, char *imageHash) "--memory", "1g", "--memory-swap", - "3g", + "8g", "--pids-limit", - "512", + "2048", + "--cgroup-parent", + "riju.slice", image, "bash", "-c", - "cat /var/run/riju/sentinel/fifo | ( sleep 10; while read -t2; do :; " - "done; pkill -g0 )", + sentinel, NULL, }; execvp(argv[0], argv); diff --git a/tf/asg.tf b/tf/asg.tf index 4372705..660a481 100644 --- a/tf/asg.tf +++ b/tf/asg.tf @@ -80,13 +80,18 @@ resource "aws_autoscaling_group" "server" { availability_zones = [local.primary_az] desired_capacity = 1 - min_size = 1 + min_size = 0 max_size = 3 launch_template { id = aws_launch_template.server.id } + termination_policies = [ + "OldestLaunchTemplate", + "OldestInstance", + ] + tags = concat( [ { @@ -98,6 +103,10 @@ resource "aws_autoscaling_group" "server" { ) lifecycle { - ignore_changes = [target_group_arns] + ignore_changes = [ + desired_capacity, + target_group_arns, + ] } + } diff --git a/tf/iam.tf b/tf/iam.tf index 4b5295b..8b8e4d9 100644 --- a/tf/iam.tf +++ b/tf/iam.tf @@ -247,3 +247,52 @@ resource "aws_iam_role_policy_attachment" "backup_restores" { role = aws_iam_role.backup.name policy_arn = data.aws_iam_policy.backup_restores.arn } + +data "aws_iam_policy_document" "grafana_cloudwatch" { + statement { + actions = [ + "cloudwatch:DescribeAlarmsForMetric", + "cloudwatch:DescribeAlarmHistory", + "cloudwatch:DescribeAlarms", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricStatistics", + "cloudwatch:GetMetricData", + + "logs:DescribeLogGroups", + "logs:GetLogGroupFields", + "logs:StartQuery", + "logs:StopQuery", + "logs:GetQueryResults", + "logs:GetLogEvents", + + "ec2:DescribeTags", + "ec2:DescribeInstances", + "ec2:DescribeRegions", + + "tag:GetResources", + ] + + resources = [ + "*", + ] + } +} + +resource "aws_iam_user" "grafana" { + name = "riju-grafana" +} + +resource "aws_iam_policy" "grafana_cloudwatch" { + name = "riju-grafana-cloudwatch" + description = "Policy granting Grafana access to CloudWatch metrics and logs" + policy = data.aws_iam_policy_document.grafana_cloudwatch.json +} + +resource "aws_iam_user_policy_attachment" "grafana_cloudwatch" { + user = aws_iam_user.grafana.name + policy_arn = aws_iam_policy.grafana_cloudwatch.arn +} + +resource "aws_iam_access_key" "grafana" { + user = aws_iam_user.grafana.name +} diff --git a/tf/outputs.tf b/tf/outputs.tf index 630adbc..8418a66 100644 --- a/tf/outputs.tf +++ b/tf/outputs.tf @@ -10,3 +10,12 @@ output "deploy_aws_secret_access_key" { value = aws_iam_access_key.deploy.secret sensitive = true } + +output "grafana_aws_access_key_id" { + value = aws_iam_access_key.grafana.id +} + +output "grafana_aws_secret_access_key" { + value = aws_iam_access_key.grafana.secret + sensitive = true +} diff --git a/tools/depgraph.js b/tools/depgraph.js index f07ff1a..dab5e45 100644 --- a/tools/depgraph.js +++ b/tools/depgraph.js @@ -545,7 +545,7 @@ async function executeDepGraph({ continue; } if (artifacts[target].publishTarget) { - if (statuses[dep] === "publishToRegistry") { + if (statuses[dep] === "publishToRegistry" && publish) { plan.push({ artifact: dep, action: "publishToRegistry",