diff --git a/backend/api.js b/backend/api.js
index 82c1c72..c339fd4 100644
--- a/backend/api.js
+++ b/backend/api.js
@@ -64,14 +64,15 @@ export class Session {
       this.container = {
         pty: containerPty,
       };
-      containerPty.on("close", (code, signal) =>
+      containerPty.on("close", async (code, signal) => {
         this.send({
           event: "serviceFailed",
           service: "container",
           error: `Exited with status ${signal || code}`,
           code: signal || code,
-        })
-      );
+        });
+        await this.teardown();
+      });
       containerPty.on("error", (err) =>
         this.send({
           event: "serviceFailed",
diff --git a/langs/hack.yaml b/langs/hack.yaml
index 134e187..ade2fae 100644
--- a/langs/hack.yaml
+++ b/langs/hack.yaml
@@ -1,5 +1,3 @@
-# Disabled due to https://github.com/facebook/hhvm/issues/8796
-
 id: "hack"
 aliases:
   - "hhvm"
diff --git a/packer/cloudwatch.json b/packer/cloudwatch.json
new file mode 100644
index 0000000..16ea7f9
--- /dev/null
+++ b/packer/cloudwatch.json
@@ -0,0 +1,35 @@
+{
+  "agent": {
+    "metrics_collection_interval": 60,
+    "run_as_user": "root"
+  },
+  "metrics": {
+    "append_dimensions": {
+      "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
+      "ImageId": "${aws:ImageId}",
+      "InstanceId": "${aws:InstanceId}",
+      "InstanceType": "${aws:InstanceType}"
+    },
+    "aggregation_dimensions": [
+      ["AutoScalingGroupName"],
+      ["AutoScalingGroupName", "path"]
+    ],
+    "metrics_collected": {
+      "disk": {
+        "measurement": [
+          "used_percent"
+        ],
+        "metrics_collection_interval": 60,
+        "resources": [
+          "*"
+        ]
+      },
+      "mem": {
+        "measurement": [
+          "mem_used_percent"
+        ],
+        "metrics_collection_interval": 60
+      }
+    }
+  }
+}
diff --git a/packer/config.pkr.hcl b/packer/config.pkr.hcl
index 49d213e..4ce3e72 100644
--- a/packer/config.pkr.hcl
+++ b/packer/config.pkr.hcl
@@ -42,6 +42,11 @@ source "amazon-ebs" "ubuntu" {
 build {
   sources = ["source.amazon-ebs.ubuntu"]
 
+  provisioner "file" {
+    destination = "/tmp/cloudwatch.json"
+    source      = "cloudwatch.json"
+  }
+
   provisioner "file" {
     destination = "/tmp/riju-init-volume"
     source      = "riju-init-volume"
diff --git a/packer/provision.bash b/packer/provision.bash
index 187dd4b..f1d153e 100644
--- a/packer/provision.bash
+++ b/packer/provision.bash
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-set -euo pipefail
+set -euxo pipefail
 
 : ${ADMIN_PASSWORD}
 : ${AWS_REGION}
@@ -37,9 +37,15 @@ wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscli.zip
 unzip -q awscli.zip
 sudo ./aws/install
 
-sudo chown root:root /tmp/riju-init-volume /tmp/riju-supervisor /tmp/riju.service
+wget -nv https://s3.us-west-1.amazonaws.com/amazon-ssm-us-west-1/latest/debian_amd64/amazon-ssm-agent.deb
+
+wget -nv https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/amd64/latest/amazon-cloudwatch-agent.deb
+sudo apt-get install -y ./amazon-cloudwatch-agent.deb
+
+sudo chown root:root /tmp/cloudwatch.json /tmp/riju-init-volume /tmp/riju-supervisor /tmp/riju.service
 sudo mv /tmp/riju-init-volume /tmp/riju-supervisor /usr/local/bin/
 sudo mv /tmp/riju.service /etc/systemd/system/
+sudo mv /tmp/cloudwatch.json /opt/aws/amazon-cloudwatch-agent/bin/config.json
 
 sudo sed -Ei 's/^#?PermitRootLogin .*/PermitRootLogin no/' /etc/ssh/sshd_config
 sudo sed -Ei 's/^#?PasswordAuthentication .*/PasswordAuthentication no/' /etc/ssh/sshd_config
@@ -51,8 +57,7 @@ sudo sed -Ei "s/\\\$SUPERVISOR_ACCESS_TOKEN/${SUPERVISOR_ACCESS_TOKEN}/" /etc/sy
 sudo passwd -l root
 sudo useradd admin -g admin -G sudo -s /usr/bin/bash -p "$(echo "${ADMIN_PASSWORD}" | mkpasswd -s)" -m
 
-sudo hostnamectl set-hostname riju
-
+sudo amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json
 sudo systemctl enable riju
 
 sudo userdel ubuntu -f
diff --git a/supervisor/src/main.go b/supervisor/src/main.go
index 3fac4ed..1d8181e 100644
--- a/supervisor/src/main.go
+++ b/supervisor/src/main.go
@@ -208,6 +208,7 @@ func (sv *supervisor) reloadWithScheduling() {
 }
 
 var rijuImageRegexp = regexp.MustCompile(`(?:^|/)riju:([^<>]+)$`)
+var rijuImageTagRegexp = regexp.MustCompile(`^([^|]+)\|([^|]+)$`)
 
 func (sv *supervisor) reload() error {
 	sv.status("getting access token from ECR")
@@ -377,13 +378,66 @@ func (sv *supervisor) reload() error {
 	sv.isGreen = !sv.isGreen
 	sv.status("stopping old container")
 	dockerRm := exec.Command("docker", "rm", "-f", oldName)
-	dockerRm.Stdout = dockerRm.Stdout
-	dockerRm.Stderr = dockerRm.Stderr
+	dockerRm.Stdout = os.Stdout
+	dockerRm.Stderr = os.Stderr
 	if err := dockerRm.Run(); err != nil {
 		return err
 	}
 	sv.status("saving updated config hash")
 	sv.deployConfigHash = deployCfgHash
+	sv.status("pruning unneeded Docker images")
+	dockerImageLs = exec.Command(
+		"docker", "image", "ls", "--format",
+		"{{ .ID }}|{{ .Tag }}",
+	)
+	dockerImageLs.Stderr = os.Stderr
+	out, err = dockerImageLs.Output()
+	if err != nil {
+		return err
+	}
+	neededTagsSet := map[string]bool{}
+	for _, tag := range neededTags {
+		neededTagsSet[tag] = true
+	}
+	// An image ID can carry several tags, so an ID is only removable when
+	// none of its tags are still needed.
+	neededIDsSet := map[string]bool{}
+	candidateIDsSet := map[string]bool{}
+	for _, line := range strings.Split(string(out), "\n") {
+		if match := rijuImageTagRegexp.FindStringSubmatch(line); match != nil {
+			id := match[1]
+			tag := match[2]
+			if neededTagsSet[tag] {
+				neededIDsSet[id] = true
+			} else {
+				candidateIDsSet[id] = true
+			}
+		}
+	}
+	unneededIDs := []string{}
+	for id := range candidateIDsSet {
+		if !neededIDsSet[id] {
+			unneededIDs = append(unneededIDs, id)
+		}
+	}
+	// "docker image rm" exits non-zero when given no images, so skip it
+	// when there is nothing to remove.
+	if len(unneededIDs) > 0 {
+		dockerImageRmArgs := append([]string{"image", "rm", "-f"}, unneededIDs...)
+		dockerImageRm := exec.Command("docker", dockerImageRmArgs...)
+		dockerImageRm.Stdout = os.Stdout
+		dockerImageRm.Stderr = os.Stderr
+		if err := dockerImageRm.Run(); err != nil {
+			return err
+		}
+	}
+	// -f skips the confirmation prompt, which cannot be answered here.
+	dockerPrune := exec.Command("docker", "system", "prune", "-f")
+	dockerPrune.Stdout = os.Stdout
+	dockerPrune.Stderr = os.Stderr
+	if err := dockerPrune.Run(); err != nil {
+		return err
+	}
 	sv.status("reload complete")
 	return nil
 }
diff --git a/tf/cloudwatch.tf b/tf/cloudwatch.tf
new file mode 100644
index 0000000..6523b6a
--- /dev/null
+++ b/tf/cloudwatch.tf
@@ -0,0 +1,61 @@
+resource "aws_cloudwatch_metric_alarm" "server_memory" {
+  count = local.ami_available ? 1 : 0
+
+  alarm_name                = "riju-server-memory-high"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = "5"
+  metric_name               = "mem_used_percent"
+  namespace                 = "CWAgent"
+  period                    = "60"
+  statistic                 = "Maximum"
+  threshold                 = "80"
+  alarm_description         = "Memory usage on Riju server is above 80%"
+  ok_actions                = [aws_sns_topic.riju.arn]
+  alarm_actions             = [aws_sns_topic.riju.arn]
+  insufficient_data_actions = [aws_sns_topic.riju.arn]
+  dimensions = {
+    AutoScalingGroupName = aws_autoscaling_group.server[count.index].name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "server_data_volume_disk_space" {
+  count = local.ami_available ? 1 : 0
+
+  alarm_name                = "riju-server-data-volume-disk-usage-high"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = "5"
+  metric_name               = "disk_used_percent"
+  namespace                 = "CWAgent"
+  period                    = "60"
+  statistic                 = "Minimum"
+  threshold                 = "80"
+  alarm_description         = "Disk space usage for data volume on Riju server is above 80%"
+  ok_actions                = [aws_sns_topic.riju.arn]
+  alarm_actions             = [aws_sns_topic.riju.arn]
+  insufficient_data_actions = [aws_sns_topic.riju.arn]
+  dimensions = {
+    AutoScalingGroupName = aws_autoscaling_group.server[count.index].name
+    path                 = "/mnt/riju/data"
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "server_root_volume_disk_space" {
+  count = local.ami_available ? 1 : 0
+
+  alarm_name                = "riju-server-root-volume-disk-usage-high"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = "5"
+  metric_name               = "disk_used_percent"
+  namespace                 = "CWAgent"
+  period                    = "60"
+  statistic                 = "Minimum"
+  threshold                 = "80"
+  alarm_description         = "Disk space usage for root volume on Riju server is above 80%"
+  ok_actions                = [aws_sns_topic.riju.arn]
+  alarm_actions             = [aws_sns_topic.riju.arn]
+  insufficient_data_actions = [aws_sns_topic.riju.arn]
+  dimensions = {
+    AutoScalingGroupName = aws_autoscaling_group.server[count.index].name
+    path                 = "/"
+  }
+}
diff --git a/tf/iam.tf b/tf/iam.tf
index 8f1dfff..81db4f5 100644
--- a/tf/iam.tf
+++ b/tf/iam.tf
@@ -1,3 +1,11 @@
+data "aws_iam_policy" "cloudwatch" {
+  name = "CloudWatchAgentServerPolicy"
+}
+
+data "aws_iam_policy" "ssm" {
+  name = "AmazonSSMManagedInstanceCore"
+}
+
 resource "aws_iam_user" "deploy" {
   name = "riju-deploy"
 }
@@ -171,6 +179,16 @@ resource "aws_iam_role_policy_attachment" "server" {
   policy_arn = aws_iam_policy.server.arn
 }
 
+resource "aws_iam_role_policy_attachment" "server_cloudwatch" {
+  role       = aws_iam_role.server.name
+  policy_arn = data.aws_iam_policy.cloudwatch.arn
+}
+
+resource "aws_iam_role_policy_attachment" "server_ssm" {
+  role       = aws_iam_role.server.name
+  policy_arn = data.aws_iam_policy.ssm.arn
+}
+
 resource "aws_iam_instance_profile" "server" {
   name = "riju-server"
   role = aws_iam_role.server.name
diff --git a/tf/sns.tf b/tf/sns.tf
new file mode 100644
index 0000000..951ba31
--- /dev/null
+++ b/tf/sns.tf
@@ -0,0 +1,3 @@
+resource "aws_sns_topic" "riju" {
+  name = "Riju"
+}
diff --git a/tools/generate-deploy-config.js b/tools/generate-deploy-config.js
index bb66f51..e9fc943 100644
--- a/tools/generate-deploy-config.js
+++ b/tools/generate-deploy-config.js
@@ -32,6 +32,7 @@ async function main() {
   program.parse(process.argv);
   await fs.mkdir("build", { recursive: true });
   await fs.writeFile("build/config.json", JSON.stringify(await getDeployConfig(), null, 2) + "\n");
+  console.log("wrote build/config.json");
   process.exit(0);
 }
 