From 422a344d1ba898cd8a7af7a11f9b18ac37f57b5c Mon Sep 17 00:00:00 2001 From: Pavel Raiskup Date: Mon, 6 Mar 2023 08:17:20 +0100 Subject: [PATCH] copr-backend: delete orphaned AWS instances via resalloc-aws-list --- roles/copr/backend/files/cleanup-vms-aws | 134 ------------------ .../backend/files/cleanup-vms-aws-resalloc | 92 ------------ roles/copr/backend/tasks/main.yml | 16 +-- .../backend/templates/resalloc/pools.yaml | 2 + 4 files changed, 4 insertions(+), 240 deletions(-) delete mode 100755 roles/copr/backend/files/cleanup-vms-aws delete mode 100644 roles/copr/backend/files/cleanup-vms-aws-resalloc diff --git a/roles/copr/backend/files/cleanup-vms-aws b/roles/copr/backend/files/cleanup-vms-aws deleted file mode 100755 index 119c03fa48..0000000000 --- a/roles/copr/backend/files/cleanup-vms-aws +++ /dev/null @@ -1,134 +0,0 @@ -#! /bin/bash - -cat <&2 <....) -- and are not in our redis database. -Such leftover VM usually occurs by manual spawning of bulider, during image -creation. But bug in VM spawner can cause this, etc. That's why we run this -mode periodically in cron job. - -The --kill-also-unused mode is useful when we need to re-distribute new -mock/rpmbuild configuration to all builders. All unused builders are -terminated (so we don't kill running builds) and respawned. -EOF -} - -die () -{ - echo >&2 "$*" - echo >&2 - help_output - exit 1 -} - -test "$UID" != "0" || die "execute as copr user" - -kill_unused=false - -for arg; do - case $arg in - -h|--help) - help_output - exit 1 - ;; - --kill-also-unused) - kill_unused=true - ;; - *) - die "unknown arg $arg" - ;; - esac -done - -something_found=false - -dump_command () -{ - echo >&2 " -> $*" - "$@" -} - -tracked() -{ - name=$(redis-cli --scan --pattern "copr:backend:vm_instance:hset::$1") - test -n "$name" -} - -used() -{ - task=$(redis-cli hget "copr:backend:vm_instance:hset::$1" task_id) - test -n "$task" -} - -old_enough() -{ - # give them 1 hour - started=$(date --date="$1" +%s) - now=$(date +%s) - old_enough=$(( now - 3600 )) - test "$started" -le "$old_enough" -} - -aws_command=( - aws ec2 describe-instances - --query "Reservations[].Instances[].{Id:InstanceId,Name:Tags[?Key=='Name']|[0].Value,Time:LaunchTime}" - --filters "Name=tag:FedoraGroup,Values=copr" - "Name=instance-state-name,Values=running" - "Name=tag:CoprPurpose,Values=builder" - --output text -) - -something_found=false - -prefix=dev -case $(hostname) in - copr-be.*) - prefix=prod - ;; -esac - -while read -r aws_id vm_name launch_time; do - case $vm_name in - aws_*_normal_${prefix}_*) - something_found=true - - if tracked "$vm_name"; then - # skip known VMs - ! $kill_unused && continue - used "$vm_name" && continue - fi - - # skip recently started VMs - if ! $kill_unused && ! old_enough "$launch_time"; then - echo >&2 "$vm_name is not yet old enough: $launch_time" - continue - fi - - # delete the rest - dump_command aws ec2 terminate-instances --instance-ids "$aws_id" - ;; - *) - continue ;; - esac -done < <( "${aws_command[@]}" ) - -# fail if no VM was found (weird situation) -$something_found diff --git a/roles/copr/backend/files/cleanup-vms-aws-resalloc b/roles/copr/backend/files/cleanup-vms-aws-resalloc deleted file mode 100644 index 71931a6261..0000000000 --- a/roles/copr/backend/files/cleanup-vms-aws-resalloc +++ /dev/null @@ -1,92 +0,0 @@ -#! /usr/bin/python3 - -""" -Cleanup all AWS VM instances which are using 'copr-builder' key, and are not -valid. Either they have no Name tag set (some spawning problems...) or is not -tracked by resalloc server. -""" - -import json -import logging -import subprocess -import time - -import dateutil.parser - -logging.basicConfig(level=logging.INFO) -LOG = logging.getLogger() - - -def run_cmd(cmd): - """ check_output() and decode from utf8 """ - return subprocess.check_output(cmd).decode("utf-8") - - -def _get_instances(): - query = ( - "Reservations[].Instances[].{" - "ID:InstanceId," - "Name:Tags[?Key=='Name']|[0].Value," - "KeyName:KeyName," - "CoprInstance:Tags[?Key=='CoprInstance']|[0].Value," - "Start:LaunchTime" - "}" - ) - aws_command = [ - "aws", "ec2", "describe-instances", - "--query", query, - "--filters", - "Name=key-name,Values=copr-builder", - "Name=instance-state-name,Values=running", - #"Name=tag-key,Values=FedoraCopr,Name=tag-value,Values=copr", - "--output", "json", - "--region", "us-east-1", - ] - return json.loads(run_cmd(aws_command)) - - -def _terminate_instnace(instance_id): - cmd = ["aws", "ec2", "terminate-instances", "--instance-ids", instance_id] - subprocess.call(cmd) - - -def _get_tracked_instances(): - raw = run_cmd(["resalloc-maint", "resource-list"]) - return_tracked = [] - for resource in raw.strip().split("\n"): - return_tracked.append(resource.split(' ')[2]) - return return_tracked - - -def _detect_instance(): - hostname = run_cmd("hostname").strip() - return "devel" if "copr-be-dev" in hostname else "production" - - -def _main(): - tracked = _get_tracked_instances() - copr_instance = _detect_instance() - - for instance in _get_instances(): - started = dateutil.parser.parse(instance["Start"]).timestamp() - if time.time() - started < 1800: - continue - - if not instance["Name"]: - LOG.info("shutting down unnamed instance %s", instance["ID"]) - _terminate_instnace(instance["ID"]) - continue - - if instance["CoprInstance"] != copr_instance: - LOG.debug("not our instance: %s (%s)", instance["Name"], instance["CoprInstance"]) - continue - - if instance["Name"] in tracked: - LOG.debug("tracked %s, skipped", instance["Name"]) - continue - - _terminate_instnace(instance["ID"]) - - -if __name__ == "__main__": - _main() diff --git a/roles/copr/backend/tasks/main.yml b/roles/copr/backend/tasks/main.yml index 832fe77193..07f21bfff4 100644 --- a/roles/copr/backend/tasks/main.yml +++ b/roles/copr/backend/tasks/main.yml @@ -304,12 +304,6 @@ - redis # TODO: .service in copr-backend should depend on redis - "{{ copr_backend_target }}" -- name: install aws cleaning script - copy: src="cleanup-vms-aws" dest=/usr/local/bin/ mode=755 - -- name: install aws cleaning script for resalloc - copy: src="cleanup-vms-aws-resalloc" dest=/usr/local/bin/ mode=755 - - name: access.redhat.com offline token file set_fact: "rhn_offline_token_file=/var/lib/resallocserver/.access.redhat.com-copr-team" tags: @@ -343,13 +337,6 @@ user=copr state=absent -- name: setup crontab for VMs - cron: name="cleanup AWS VMs periodically" - job="/usr/local/bin/cleanup-vms-aws" - minute="0" - user=copr - state=absent - - name: setup crontab for cleaning up redis cron: name="prune redis VM db periodically" job="/usr/local/bin/cleanup-unused-vms-from-redis &>> /var/log/copr-backend/cleanup-redis-vms.log" @@ -359,9 +346,10 @@ - name: crontab for cleaning resalloc VMs cron: name="cleanup nova VMs periodically" - job="/usr/local/bin/cleanup-vms-aws-resalloc &>> /var/log/resallocserver/cron-cleanup-vms-aws.log" + job="true /usr/local/bin/cleanup-vms-aws-resalloc &>> /var/log/resallocserver/cron-cleanup-vms-aws.log" minute="*/10" user=resalloc + state=absent - name: crontab for cleaning-up unused subscriptions cron: name="cleanup unused Red Hat subscribed systems" diff --git a/roles/copr/backend/templates/resalloc/pools.yaml b/roles/copr/backend/templates/resalloc/pools.yaml index d5e2172543..b64e8311f1 100644 --- a/roles/copr/backend/templates/resalloc/pools.yaml +++ b/roles/copr/backend/templates/resalloc/pools.yaml @@ -30,6 +30,7 @@ aws_x86_64_{% if spot %}spot{% else %}normal{% endif %}_{% if devel %}dev{% else cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-delete" cmd_livecheck: "resalloc-check-vm-ip" cmd_release: "/var/lib/resallocserver/resalloc_provision/vm-release" + cmd_list: resalloc-aws-list livecheck_period: 180 reuse_opportunity_time: 180 reuse_max_count: 8 @@ -54,6 +55,7 @@ aws_aarch64_{% if spot %}spot{% else %}normal{% endif %}_{% if devel %}dev{% els cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-delete" cmd_livecheck: "resalloc-check-vm-ip" cmd_release: "/var/lib/resallocserver/resalloc_provision/vm-release" + cmd_list: resalloc-aws-list livecheck_period: 180 reuse_opportunity_time: 180 reuse_max_count: 8