copr-backend: delete orphaned AWS instances via resalloc-aws-list

This commit is contained in:
Pavel Raiskup 2023-03-06 08:17:20 +01:00
parent 9e2172db47
commit 422a344d1b
4 changed files with 4 additions and 240 deletions

View file

@ -1,134 +0,0 @@
#! /bin/bash

# Obsolescence guard: tell the caller that this script has been superseded and
# stop right away with a failure status.  Nothing below the 'exit' is executed
# while this guard is in place.
cat <<'NOTICE'
This script doesn't work currently, it is obsoleted by
the script 'cleanup-vms-aws-resalloc'. It is kept for documentation purposes or
future reference.
NOTICE
exit 1

# Abort on the first failing command (only relevant once the guard above is
# removed).
set -e
# Print the usage text of this script.
# Arguments: none ($0 is interpolated into the first line)
# Outputs:   the help text on stderr; nothing on stdout
help_output() {
  cat 1>&2 <<EOF
Usage: $0 [--kill-also-unused]
This script terminates all VMs started in AWS which are "probably" started by
this instance of Copr, but are no more relevant for us.
Normal mode (no --kill-also-unused) only terminates VMs which
- are in running state,
- are named properly (e.g. copr-builder-<instance>....)
- and are not in our redis database.
Such leftover VM usually occurs by manual spawning of bulider, during image
creation. But bug in VM spawner can cause this, etc. That's why we run this
mode periodically in cron job.
The --kill-also-unused mode is useful when we need to re-distribute new
mock/rpmbuild configuration to all builders. All unused builders are
terminated (so we don't kill running builds) and respawned.
EOF
}
# Abort the script because of a usage error.
# Arguments: $* - words of the error message (joined with spaces)
# Outputs:   the message, an empty line and the help text, all on stderr
# Returns:   never; the shell exits with status 1
die() {
  {
    echo "$*"
    echo
  } >&2
  help_output
  exit 1
}
# Refuse to run with UID 0.
if [[ $UID == 0 ]]; then
  die "execute as copr user"
fi

# Command-line handling: the only real option is --kill-also-unused; -h/--help
# prints the usage text (and exits with status 1), anything else is fatal.
kill_unused=false
for arg in "$@"; do
  if [[ $arg == -h || $arg == --help ]]; then
    help_output
    exit 1
  elif [[ $arg == --kill-also-unused ]]; then
    kill_unused=true
  else
    die "unknown arg $arg"
  fi
done

# Flipped to 'true' as soon as one matching VM is seen.
something_found=false
# Log a command line and then execute it.
# Arguments: $@ - the command and its arguments
# Outputs:   " -> <command line>" on stderr, plus whatever the command prints
# Returns:   the exit status of the executed command
dump_command() {
  printf ' -> %s\n' "$*" >&2
  "$@"
}
# Tell whether a VM is present in the redis database.
# Arguments: $1 - VM name
# Returns:   0 when scanning redis for the key
#            "copr:backend:vm_instance:hset::<name>" prints something,
#            non-zero otherwise (this includes the case where redis-cli
#            prints nothing because it failed).
tracked() {
  # Keep the scratch variable local so that callers' globals are not clobbered.
  local key
  key=$(redis-cli --scan --pattern "copr:backend:vm_instance:hset::$1")
  test -n "$key"
}
# Tell whether a VM currently has a task assigned in the redis database.
# Arguments: $1 - VM name
# Returns:   0 when the 'task_id' field of the hash
#            "copr:backend:vm_instance:hset::<name>" is non-empty,
#            non-zero otherwise.
used() {
  # Keep the scratch variable local so that callers' globals are not clobbered.
  local task_id
  task_id=$(redis-cli hget "copr:backend:vm_instance:hset::$1" task_id)
  test -n "$task_id"
}
old_enough()
{
# give them 1 hour
started=$(date --date="$1" +%s)
now=$(date +%s)
old_enough=$(( now - 3600 ))
test "$started" -le "$old_enough"
}
# Command listing the running instances tagged FedoraGroup=copr and
# CoprPurpose=builder; with text output it prints one "Id Name Time" row per
# instance.
aws_command=(aws ec2 describe-instances)
aws_command+=(
  --query
  "Reservations[].Instances[].{Id:InstanceId,Name:Tags[?Key=='Name']|[0].Value,Time:LaunchTime}"
)
aws_command+=(
  --filters
  "Name=tag:FedoraGroup,Values=copr"
  "Name=instance-state-name,Values=running"
  "Name=tag:CoprPurpose,Values=builder"
)
aws_command+=(--output text)

something_found=false

# Hosts named copr-be.* use the "prod" VM-name infix, everything else "dev".
if [[ $(hostname) == copr-be.* ]]; then
  prefix=prod
else
  prefix=dev
fi
# Go through the instances printed by the AWS query, one "Id Name Time" row
# at a time, and terminate those which are no longer relevant.
while read -r aws_id vm_name launch_time; do
  # Only VMs named aws_*_normal_<prefix>_* are considered at all.
  if [[ $vm_name != aws_*_normal_${prefix}_* ]]; then
    continue
  fi
  something_found=true

  if tracked "$vm_name"; then
    # skip known VMs
    if ! $kill_unused; then
      continue
    fi
    # with --kill-also-unused, tracked VMs survive only if they have a task
    if used "$vm_name"; then
      continue
    fi
  fi

  # skip recently started VMs
  if ! $kill_unused && ! old_enough "$launch_time"; then
    echo >&2 "$vm_name is not yet old enough: $launch_time"
    continue
  fi

  # delete the rest
  dump_command aws ec2 terminate-instances --instance-ids "$aws_id"
done < <( "${aws_command[@]}" )

# fail if no VM was found (weird situation)
$something_found

View file

@ -1,92 +0,0 @@
#! /usr/bin/python3
"""
Clean up all AWS VM instances which are using the 'copr-builder' key and are
not valid.  An instance is not valid when it either has no Name tag set (some
spawning problems...) or is not tracked by the resalloc server.
"""
import json
import logging
import subprocess
import time
import dateutil.parser
# Configure logging at INFO level; the LOG.debug() messages used in this
# script are therefore not shown with this setting.
logging.basicConfig(level=logging.INFO)
# Logger shared by the whole script (getLogger() without a name returns the
# root logger).
LOG = logging.getLogger()
def run_cmd(cmd):
    """
    Execute CMD and hand back its standard output as text.

    The output is captured with subprocess.check_output() (which raises when
    the command exits with a non-zero status) and decoded as UTF-8.
    """
    raw_output = subprocess.check_output(cmd)
    return raw_output.decode("utf-8")
def _get_instances():
    """
    Ask AWS (region us-east-1) for all running instances which use the
    'copr-builder' key.

    Returns the parsed JSON output of 'aws ec2 describe-instances'; each item
    carries the keys ID, Name, KeyName, CoprInstance and Start, as requested
    by the --query expression.
    """
    fields = [
        "ID:InstanceId",
        "Name:Tags[?Key=='Name']|[0].Value",
        "KeyName:KeyName",
        "CoprInstance:Tags[?Key=='CoprInstance']|[0].Value",
        "Start:LaunchTime",
    ]
    query = "Reservations[].Instances[].{" + ",".join(fields) + "}"

    filters = [
        "Name=key-name,Values=copr-builder",
        "Name=instance-state-name,Values=running",
        #"Name=tag-key,Values=FedoraCopr,Name=tag-value,Values=copr",
    ]

    aws_command = ["aws", "ec2", "describe-instances", "--query", query]
    aws_command += ["--filters"] + filters
    aws_command += ["--output", "json"]
    aws_command += ["--region", "us-east-1"]

    output = run_cmd(aws_command)
    return json.loads(output)
def _terminate_instnace(instance_id):
    """
    Run 'aws ec2 terminate-instances' for the given INSTANCE_ID.

    The exit status of the command is not inspected; nothing is returned.
    """
    subprocess.call(
        ["aws", "ec2", "terminate-instances", "--instance-ids", instance_id]
    )
def _get_tracked_instances():
    """
    Return the list of resources reported by 'resalloc-maint resource-list'.

    Every non-empty output line contributes its third space-separated field
    (compared against the AWS instance Name by the caller).  When the command
    prints nothing, an empty list is returned.
    """
    raw = run_cmd(["resalloc-maint", "resource-list"])
    return_tracked = []
    for resource in raw.strip().split("\n"):
        if not resource:
            # "".split("\n") gives [""]; without this guard an empty listing
            # would crash below with IndexError instead of meaning "nothing
            # is tracked".
            continue
        return_tracked.append(resource.split(' ')[2])
    return return_tracked
def _detect_instance():
    """
    Decide from the machine's hostname which deployment we run on.

    Returns "devel" when the hostname contains "copr-be-dev", otherwise
    "production".
    """
    hostname = run_cmd("hostname").strip()
    if "copr-be-dev" in hostname:
        return "devel"
    return "production"
def _main():
    """
    Terminate running AWS instances which are older than 30 minutes and are
    either unnamed, or belong to this deployment but are not in the list of
    tracked resources.
    """
    tracked = _get_tracked_instances()
    copr_instance = _detect_instance()

    for instance in _get_instances():
        # Leave freshly started machines (younger than 1800 seconds) alone.
        launched = dateutil.parser.parse(instance["Start"])
        age = time.time() - launched.timestamp()
        if age < 1800:
            continue

        name = instance["Name"]
        if not name:
            # No Name tag; terminated regardless of the CoprInstance tag.
            LOG.info("shutting down unnamed instance %s", instance["ID"])
            _terminate_instnace(instance["ID"])
            continue

        owner = instance["CoprInstance"]
        if owner != copr_instance:
            LOG.debug("not our instance: %s (%s)", name, owner)
            continue

        if name in tracked:
            LOG.debug("tracked %s, skipped", name)
            continue

        _terminate_instnace(instance["ID"])


if __name__ == "__main__":
    _main()

View file

@ -304,12 +304,6 @@
- redis # TODO: .service in copr-backend should depend on redis
- "{{ copr_backend_target }}"
- name: install aws cleaning script
copy: src="cleanup-vms-aws" dest=/usr/local/bin/ mode=755
- name: install aws cleaning script for resalloc
copy: src="cleanup-vms-aws-resalloc" dest=/usr/local/bin/ mode=755
- name: access.redhat.com offline token file
set_fact: "rhn_offline_token_file=/var/lib/resallocserver/.access.redhat.com-copr-team"
tags:
@ -343,13 +337,6 @@
user=copr
state=absent
- name: setup crontab for VMs
cron: name="cleanup AWS VMs periodically"
job="/usr/local/bin/cleanup-vms-aws"
minute="0"
user=copr
state=absent
- name: setup crontab for cleaning up redis
cron: name="prune redis VM db periodically"
job="/usr/local/bin/cleanup-unused-vms-from-redis &>> /var/log/copr-backend/cleanup-redis-vms.log"
@ -359,9 +346,10 @@
- name: crontab for cleaning resalloc VMs
cron: name="cleanup nova VMs periodically"
job="/usr/local/bin/cleanup-vms-aws-resalloc &>> /var/log/resallocserver/cron-cleanup-vms-aws.log"
job="true /usr/local/bin/cleanup-vms-aws-resalloc &>> /var/log/resallocserver/cron-cleanup-vms-aws.log"
minute="*/10"
user=resalloc
state=absent
- name: crontab for cleaning-up unused subscriptions
cron: name="cleanup unused Red Hat subscribed systems"

View file

@ -30,6 +30,7 @@ aws_x86_64_{% if spot %}spot{% else %}normal{% endif %}_{% if devel %}dev{% else
cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-delete"
cmd_livecheck: "resalloc-check-vm-ip"
cmd_release: "/var/lib/resallocserver/resalloc_provision/vm-release"
cmd_list: resalloc-aws-list
livecheck_period: 180
reuse_opportunity_time: 180
reuse_max_count: 8
@ -54,6 +55,7 @@ aws_aarch64_{% if spot %}spot{% else %}normal{% endif %}_{% if devel %}dev{% els
cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-delete"
cmd_livecheck: "resalloc-check-vm-ip"
cmd_release: "/var/lib/resallocserver/resalloc_provision/vm-release"
cmd_list: resalloc-aws-list
livecheck_period: 180
reuse_opportunity_time: 180
reuse_max_count: 8