From 301faefdcb6a95d263d91c5a4fb58322aeba05fd Mon Sep 17 00:00:00 2001 From: Radon Rosborough Date: Tue, 27 Jul 2021 18:44:15 -0700 Subject: [PATCH] [#80] Various updates to run CI on EC2 dynamic runner Not quite working yet but merging it now so I can focus on some other higher-priority things. --- .github/workflows/main.yml | 5 +- Makefile | 7 +- packer/ci.pkr.hcl | 48 +++++++++ packer/provision-ci.bash | 35 +++++++ packer/{provision.bash => provision-web.bash} | 0 packer/riju-init-volume | 4 +- packer/{config.pkr.hcl => web.pkr.hcl} | 6 +- tf/alb.tf | 4 +- tf/ami.tf | 11 ++- tf/asg.tf | 8 +- tf/cloudwatch.tf | 58 +++++------ tf/ec2.tf | 19 ++++ tf/iam.tf | 64 +++--------- tf/main.tf | 2 - tf/ssm.tf | 24 +++++ tools/ci-ec2.bash | 97 +++++++++++++++++++ tools/ci-user-data.bash | 15 +++ tools/packer-build-ci.bash | 5 + ...acker-build.bash => packer-build-web.bash} | 7 +- 19 files changed, 312 insertions(+), 107 deletions(-) create mode 100644 packer/ci.pkr.hcl create mode 100755 packer/provision-ci.bash rename packer/{provision.bash => provision-web.bash} (100%) rename packer/{config.pkr.hcl => web.pkr.hcl} (92%) create mode 100644 tf/ec2.tf create mode 100644 tf/ssm.tf create mode 100755 tools/ci-ec2.bash create mode 100755 tools/ci-user-data.bash create mode 100755 tools/packer-build-ci.bash rename tools/{packer-build.bash => packer-build-web.bash} (71%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fe22d8c..838f17b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,4 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_REGION: us-west-1 AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DOCKER_REPO: 084011155226.dkr.ecr.us-west-1.amazonaws.com/riju - PUBLIC_DOCKER_REPO: public.ecr.aws/raxod502/riju - S3_BUCKET: riju - run: tools/ci-bootstrap.bash + run: tools/ci-ec2.bash diff --git a/Makefile b/Makefile index d227ce1..d5b396f 100644 --- a/Makefile +++ b/Makefile @@ -276,8 
+276,11 @@ fmt: fmt-c fmt-go fmt-python fmt-terraform fmt-web # Format all code ### Infrastructure -packer: supervisor # Build and publish a new AMI - tools/packer-build.bash +packer-web: supervisor # Build and publish a new webserver AMI + tools/packer-build-web.bash + +packer-ci: # Build and publish a new CI AMI + tools/packer-build-ci.bash ### Miscellaneous diff --git a/packer/ci.pkr.hcl b/packer/ci.pkr.hcl new file mode 100644 index 0000000..c172a8e --- /dev/null +++ b/packer/ci.pkr.hcl @@ -0,0 +1,48 @@ +data "amazon-ami" "ubuntu" { + filters = { + name = "ubuntu/images/hvm-ssd/ubuntu-*-21.04-amd64-server-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["099720109477"] +} + +locals { + timestamp = regex_replace(timestamp(), "[- TZ:]", "") +} + +source "amazon-ebs" "ubuntu" { + ami_name = "riju-ci-${local.timestamp}" + instance_type = "t3.micro" + source_ami = "${data.amazon-ami.ubuntu.id}" + ssh_username = "ubuntu" + + tag { + key = "BillingCategory" + value = "Riju" + } + + tag { + key = "BillingSubcategory" + value = "Riju:AMI" + } + + tag { + key = "Name" + value = "riju-ci-${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.ubuntu"] + + provisioner "file" { + destination = "/tmp/riju-init-volume" + source = "riju-init-volume" + } + + provisioner "shell" { + script = "provision-ci.bash" + } +} diff --git a/packer/provision-ci.bash b/packer/provision-ci.bash new file mode 100755 index 0000000..d1258b1 --- /dev/null +++ b/packer/provision-ci.bash @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# I think there is a race condition related to Ubuntu wanting to do an +# automated system upgrade at boot, which causes 'apt-get update' to +# sometimes fail with an obscure error message. 
+sleep 5 + +mkdir /tmp/riju-work +pushd /tmp/riju-work + +export DEBIAN_FRONTEND=noninteractive + +sudo -E apt-get update +sudo -E apt-get dist-upgrade -y + +sudo -E apt-get install -y curl gnupg lsb-release + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo -E apt-key add - + +ubuntu_name="$(lsb_release -cs)" + +sudo tee -a /etc/apt/sources.list.d/custom.list >/dev/null <&2 "riju-init-volume: $@" } -mount_point=/mnt/riju/data +mount_point=/mnt/riju mkdir -p "${mount_point}" @@ -55,7 +55,7 @@ mount -a print "filesystem mounted at ${mount_point}" -docker_args="-g ${mount_point}" +docker_args="-g ${mount_point}/docker" if ! cat /lib/systemd/system/docker.service | grep -q -- "${docker_args}"; then print "adding '${docker_args}' to docker.service" diff --git a/packer/config.pkr.hcl b/packer/web.pkr.hcl similarity index 92% rename from packer/config.pkr.hcl rename to packer/web.pkr.hcl index fd4962f..f83e151 100644 --- a/packer/config.pkr.hcl +++ b/packer/web.pkr.hcl @@ -33,7 +33,7 @@ locals { } source "amazon-ebs" "ubuntu" { - ami_name = "riju-${local.timestamp}" + ami_name = "riju-web-${local.timestamp}" instance_type = "t3.micro" source_ami = "${data.amazon-ami.ubuntu.id}" ssh_username = "ubuntu" @@ -50,7 +50,7 @@ source "amazon-ebs" "ubuntu" { tag { key = "Name" - value = "riju-${local.timestamp}" + value = "riju-web-${local.timestamp}" } } @@ -84,6 +84,6 @@ build { "S3_BUCKET=${var.s3_bucket}", "SUPERVISOR_ACCESS_TOKEN=${var.supervisor_access_token}", ] - script = "provision.bash" + script = "provision-web.bash" } } diff --git a/tf/alb.tf b/tf/alb.tf index c03d69c..4fd8d0c 100644 --- a/tf/alb.tf +++ b/tf/alb.tf @@ -74,8 +74,6 @@ resource "aws_lb_listener" "server_https" { } resource "aws_autoscaling_attachment" "server" { - count = local.ami_available ? 
1 : 0 - - autoscaling_group_name = aws_autoscaling_group.server[count.index].name + autoscaling_group_name = aws_autoscaling_group.server.name alb_target_group_arn = aws_lb_target_group.server.arn } diff --git a/tf/ami.tf b/tf/ami.tf index 89dd194..a604c69 100644 --- a/tf/ami.tf +++ b/tf/ami.tf @@ -1,6 +1,4 @@ data "aws_ami" "server" { - count = local.ami_available ? 1 : 0 - owners = ["self"] filter { @@ -8,3 +6,12 @@ data "aws_ami" "server" { values = [data.external.env.result.AMI_NAME] } } + +data "aws_ami" "ci" { + owners = ["self"] + + filter { + name = "name" + values = [data.external.env.result.CI_AMI_NAME] + } +} diff --git a/tf/asg.tf b/tf/asg.tf index 91a1177..9b2fa7e 100644 --- a/tf/asg.tf +++ b/tf/asg.tf @@ -35,10 +35,8 @@ resource "aws_security_group" "server" { } resource "aws_launch_template" "server" { - count = local.ami_available ? 1 : 0 - name = "riju-server" - image_id = data.aws_ami.server[count.index].id + image_id = data.aws_ami.server.id instance_type = "t3.small" security_group_names = [aws_security_group.server.name] @@ -78,8 +76,6 @@ resource "aws_launch_template" "server" { } resource "aws_autoscaling_group" "server" { - count = local.ami_available ? 1 : 0 - name = "riju-server" availability_zones = [ @@ -90,7 +86,7 @@ resource "aws_autoscaling_group" "server" { max_size = 3 launch_template { - id = aws_launch_template.server[count.index].id + id = aws_launch_template.server.id } tags = concat( diff --git a/tf/cloudwatch.tf b/tf/cloudwatch.tf index 875d648..d37a845 100644 --- a/tf/cloudwatch.tf +++ b/tf/cloudwatch.tf @@ -1,20 +1,19 @@ resource "aws_cloudwatch_metric_alarm" "server_cpu" { - count = local.ami_available ? 
1 : 0 - alarm_name = "riju-server-cpu-high" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "30" + datapoints_to_alarm = "15" metric_name = "cpu_usage_active" namespace = "CWAgent" period = "60" - statistic = "Maximum" - threshold = "90" - alarm_description = "CPU usage on Riju server is above 90% for 30 minutes" + statistic = "Average" + threshold = "70" + alarm_description = "Average CPU usage on Riju server is above 70% for 30 minutes" ok_actions = [aws_sns_topic.riju.arn] alarm_actions = [aws_sns_topic.riju.arn] insufficient_data_actions = [aws_sns_topic.riju.arn] dimensions = { - AutoScalingGroupName = aws_autoscaling_group.server[count.index].name + AutoScalingGroupName = aws_autoscaling_group.server.name } tags = { @@ -23,22 +22,21 @@ resource "aws_cloudwatch_metric_alarm" "server_cpu" { } resource "aws_cloudwatch_metric_alarm" "server_memory" { - count = local.ami_available ? 1 : 0 - alarm_name = "riju-server-memory-high" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "30" + datapoints_to_alarm = "15" metric_name = "mem_used_percent" namespace = "CWAgent" period = "60" - statistic = "Maximum" - threshold = "80" - alarm_description = "Memory usage on Riju server is above 80% for 30 minutes" + statistic = "Average" + threshold = "70" + alarm_description = "Average memory usage on Riju server is above 70% for 30 minutes" ok_actions = [aws_sns_topic.riju.arn] alarm_actions = [aws_sns_topic.riju.arn] insufficient_data_actions = [aws_sns_topic.riju.arn] dimensions = { - AutoScalingGroupName = aws_autoscaling_group.server[count.index].name + AutoScalingGroupName = aws_autoscaling_group.server.name } tags = { @@ -47,22 +45,21 @@ resource "aws_cloudwatch_metric_alarm" "server_memory" { } resource "aws_cloudwatch_metric_alarm" "server_data_volume_disk_space" { - count = local.ami_available ? 
1 : 0 - alarm_name = "riju-server-data-volume-disk-usage-high" comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "30" + evaluation_periods = "5" + datapoints_to_alarm = "5" metric_name = "disk_used_percent" namespace = "CWAgent" period = "60" - statistic = "Maximum" - threshold = "90" - alarm_description = "Disk space usage for data volume on Riju server is above 90% for 30 minutes" + statistic = "Average" + threshold = "70" + alarm_description = "Disk space usage for data volume on Riju server is above 70%" ok_actions = [aws_sns_topic.riju.arn] alarm_actions = [aws_sns_topic.riju.arn] insufficient_data_actions = [aws_sns_topic.riju.arn] dimensions = { - AutoScalingGroupName = aws_autoscaling_group.server[count.index].name + AutoScalingGroupName = aws_autoscaling_group.server.name path = "/mnt/riju/data" } @@ -72,22 +69,21 @@ resource "aws_cloudwatch_metric_alarm" "server_data_volume_disk_space" { } resource "aws_cloudwatch_metric_alarm" "server_root_volume_disk_space" { - count = local.ami_available ? 
1 : 0 - alarm_name = "riju-server-root-volume-disk-usage-high" comparison_operator = "GreaterThanOrEqualToThreshold" - evaluation_periods = "30" + evaluation_periods = "5" + datapoints_to_alarm = "5" metric_name = "disk_used_percent" namespace = "CWAgent" period = "60" - statistic = "Maximum" - threshold = "90" - alarm_description = "Disk space usage for root volume on Riju server is above 90% for 30 minutes" + statistic = "Average" + threshold = "70" + alarm_description = "Disk space usage for root volume on Riju server is above 70%" ok_actions = [aws_sns_topic.riju.arn] alarm_actions = [aws_sns_topic.riju.arn] insufficient_data_actions = [aws_sns_topic.riju.arn] dimensions = { - AutoScalingGroupName = aws_autoscaling_group.server[count.index].name + AutoScalingGroupName = aws_autoscaling_group.server.name path = "/" } @@ -97,8 +93,6 @@ resource "aws_cloudwatch_metric_alarm" "server_root_volume_disk_space" { } resource "aws_cloudwatch_dashboard" "riju" { - count = local.ami_available ? 1 : 0 - dashboard_name = "Riju" dashboard_body = <&2 "[ci-run.bash] Fetching build parameters from SSM." + +parameters="riju-ci-ami-id riju-docker-repo-host riju-public-docker-repo-host riju-s3-bucket-name" +resp="$(aws ssm get-parameters --names ${parameters})" + +read -r ami < <(jq '.Parameters[] | select(.Name == "riju-ci-ami-id").Value' -r <<< "${resp}") +read -r docker_repo < <(jq '.Parameters[] | select(.Name == "riju-docker-repo-host").Value' -r <<< "${resp}") +read -r public_docker_repo < <(jq '.Parameters[] | select(.Name == "riju-public-docker-repo-host").Value' -r <<< "${resp}") +read -r s3_bucket < <(jq '.Parameters[] | select(.Name == "riju-s3-bucket-name").Value' -r <<< "${resp}") + +echo >&2 "[ci-run.bash] Launching EC2 instance for CI job." 
+
+# Launch parameters for the CI instance: one additional 128 GiB gp3 volume
+# at /dev/sdh that is deleted on termination, plus Name/billing tags for the
+# instance and for its volumes.
+ebs_config="DeviceName=/dev/sdh,Ebs={DeleteOnTermination=true,VolumeSize=128,VolumeType=gp3}"
+instance_tags="ResourceType=instance,Tags=[{Key=Name,Value=Riju CI},{Key=BillingCategory,Value=Riju},{Key=BillingSubcategory,Value=Riju:EC2:CI}]"
+ebs_tags="ResourceType=volume,Tags=[{Key=Name,Value=Riju CI},{Key=BillingCategory,Value=Riju},{Key=BillingSubcategory,Value=Riju:EBS:CI}]"
+
+# The instance terminates itself on shutdown; tools/ci-user-data.bash is
+# passed as user data.
+resp="$(aws ec2 run-instances \
+    --image-id "${ami}" \
+    --instance-type t3.2xlarge \
+    --security-groups riju-deploy \
+    --iam-instance-profile Name=riju-deploy \
+    --instance-initiated-shutdown-behavior terminate \
+    --user-data file://tools/ci-user-data.bash \
+    --tag-specifications "${instance_tags}" "${ebs_tags}" \
+    --block-device-mappings "${ebs_config}")"
+
+instance_id="$(jq '.Instances[].InstanceId' -r <<< "${resp}")"
+
+echo >&2 "[ci-run.bash] Waiting for instance ${instance_id} to become ready."
+
+# Poll the instance state every 2 seconds, at most 15 times. A missing
+# status is treated like "pending"; any state other than pending/running
+# aborts with status 1.
+success=
+for i in $(seq 1 15); do
+    sleep 2
+    resp="$(aws ec2 describe-instance-status --instance-id "${instance_id}")"
+    status="$(jq '.InstanceStatuses[].InstanceState.Name' -r <<< "${resp}")"
+    status="${status:-unknown}"
+    case "${status}" in
+        pending|unknown) ;;
+        running) success=yes; break ;;
+        *) exit 1 ;;
+    esac
+done
+
+# Never reached "running" within the polling budget: report a timeout.
+if [[ -z "${success}" ]]; then
+    exit 124
+fi
+
+echo >&2 "[ci-run.bash] Waiting for SSH to come online."
+
+# Try a no-op command over mssh up to 15 times, 2 seconds apart. 'yes'
+# feeds "y" to mssh's stdin (presumably to confirm interactive prompts);
+# '|| true' hides the failure of 'yes' once its reader goes away.
+success=
+for i in $(seq 1 15); do
+    if (yes || true) | timeout 5 mssh "ubuntu@${instance_id}" true 2>/dev/null; then
+        success=yes
+        break
+    elif (( $? == 124 )); then
+        # $? is the status of the failed 'if' condition; timeout(1) exits
+        # with 124 when the 5-second limit was hit, which is treated as a
+        # hard failure. Other failures fall through to the retry below.
+        exit 1
+    fi
+    sleep 2
+done
+
+# SSH never answered within the retry budget: report a timeout.
+if [[ -z "${success}" ]]; then
+    exit 124
+fi
+
+echo >&2 "[ci-run.bash] Running CI remotely using EC2 Instance Connect."
+
+# NOTE(review): the next line looks truncated in this copy of the patch (a
+# heredoc feeding the remote shell appears to be missing); it is preserved
+# as found. Confirm against the original commit.
+mssh "ubuntu@${instance_id}" bash <&2 "[ci-run.bash] CI completed."
diff --git a/tools/ci-user-data.bash b/tools/ci-user-data.bash
new file mode 100755
index 0000000..a0e8dbe
--- /dev/null
+++ b/tools/ci-user-data.bash
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Idle watchdog for the CI instance (tools/ci-ec2.bash passes this file as
+# EC2 user data): powers the machine off when no SSH session is connected.
+
+set -euo pipefail
+
+# First invocation: start a detached copy of this script (marked with
+# NOHUP=1) and return immediately, so the caller is not blocked forever and
+# only the detached copy runs the loop below.
+if [[ -z "${NOHUP:-}" ]]; then
+    NOHUP=1 nohup "$0" "$@" &
+    exit 0
+fi
+
+# Every 60 seconds, halt unless netstat shows an established sshd connection.
+while true; do
+    sleep 60
+    # https://unix.stackexchange.com/a/92579
+    if ! sudo netstat -tnpa | grep 'ESTABLISHED.*sshd'; then
+        sudo shutdown -h now
+    fi
+done
diff --git a/tools/packer-build-ci.bash b/tools/packer-build-ci.bash
new file mode 100755
index 0000000..415a3e4
--- /dev/null
+++ b/tools/packer-build-ci.bash
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+packer build ci.pkr.hcl
diff --git a/tools/packer-build.bash b/tools/packer-build-web.bash
similarity index 71%
rename from tools/packer-build.bash
rename to tools/packer-build-web.bash
index f52e12b..c0c7891 100755
--- a/tools/packer-build.bash
+++ b/tools/packer-build-web.bash
@@ -2,6 +2,10 @@
 
 set -euo pipefail
 
+: ${ADMIN_PASSWORD}
+: ${S3_BUCKET}
+: ${SUPERVISOR_ACCESS_TOKEN}
+
 export AWS_REGION="${AWS_REGION:-$(aws configure get region)}"
 
 if [[ -z "${AWS_REGION}" ]]; then
@@ -9,5 +13,4 @@ if [[ -z "${AWS_REGION}" ]]; then
     exit 1
 fi
 
-cd packer
-packer build config.pkr.hcl
+packer build web.pkr.hcl