copr: be: cleanup AWS builders periodically

Periodically terminate copr-*-builder* VMs which are not tracked in the redis database. While we are at it, remove the `cleanup_vm*` scripts which are no longer needed; the up-to-date script is provided by the `copr-backend` package.
2019-11-27 09:10:58 +01:00 · 2019-11-27 09:10:58 +01:00 · fa604650ba
commit fa604650ba
parent c206da6415
4 changed files with 66 additions and 124 deletions
--- a/roles/copr/backend/files/cleanup-vms-aws
+++ b/roles/copr/backend/files/cleanup-vms-aws
@ -0,0 +1,53 @@
+#! /bin/bash
+# Terminate running copr-<prefix>-builder* AWS instances that are not tracked in redis.
+set -e
+
+test "$UID" != "0" || { echo "execute as copr user" && exit 1 ; }
+
+# Print the command line to stderr, then execute it.
+dump_command ()
+{
+        echo >&2 " -> $*"
+        "$@"
+}
+
+# Succeed if redis holds a vm_instance key for VM name $1.  A failed redis query
+# also counts as "tracked": a redis outage must never get builders terminated.
+tracked()
+{
+        name=$(redis-cli --scan --pattern "copr:backend:vm_instance:hset::$1") || { echo >&2 "redis query failed, keeping $1"; return 0; }
+        test -n "$name"
+}
+
+# Prints "<instance-id> <Name tag>" per running instance tagged FedoraCopr=copr.
+aws_command=(
+    aws ec2 describe-instances
+        --query "Reservations[].Instances[].{Id:InstanceId,Name:Tags[?Key=='Name']|[0].Value}"
+        --filters "Name=tag:FedoraCopr,Values=copr"
+                  "Name=instance-state-name,Values=running"
+        --output text
+)
+
+something_found=false
+
+prefix=dev
+case $(hostname) in
+    copr-be.*)
+        prefix=prod
+        ;;
+esac
+
+while read -r aws_id vm_name; do
+    case $vm_name in
+    copr-$prefix-builder*)
+        something_found=true
+        if ! tracked "$vm_name"; then
+                echo "removing $vm_name"
+                dump_command aws ec2 terminate-instances --instance-ids "$aws_id"
+        fi
+        ;;
+    esac
+done < <( "${aws_command[@]}" )
+
+# fail if no VM was found (weird situation)
+$something_found
--- a/roles/copr/backend/files/cleanup_vm_nova.py
+++ b/roles/copr/backend/files/cleanup_vm_nova.py
@ -1,118 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-import os
-import sys
-import time
-import logging
-
-from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dateutil.parser import parse as dt_parse
-
-import psutil
-import yaml
-from novaclient.client import Client
-
-sys.path.append("/usr/share/copr/")
-
-from backend.helpers import BackendConfigReader
-from backend.helpers import utc_now
-
-try:
-    from backend.vm_manage.manager import VmManager
-except ImportError:
-    VmManager = None
-
-logging.getLogger("requests").setLevel(logging.ERROR)
-
-
-nova_cloud_vars_path = os.environ.get("NOVA_CLOUD_VARS", "/home/copr/provision/nova_cloud_vars.yml")
-
-
-def read_config():
-    with open(nova_cloud_vars_path) as handle:
-        conf = yaml.load(handle.read())
-    return conf
-
-
-def get_client(conf):
-    username = conf["OS_USERNAME"]
-    password = conf["OS_PASSWORD"]
-    tenant_name = conf["OS_TENANT_NAME"]
-    auth_url = conf["OS_AUTH_URL"]
-    return Client('2', username, password, tenant_name, auth_url)
-
-
-def get_managed_vms_names():
-    result = []
-    if VmManager:
-        opts = BackendConfigReader().read()
-        vmm = VmManager(opts, log)
-        result.extend(vmd.vm_name.lower() for vmd in vmm.get_all_vm())
-    return result
-
-
-class Cleaner(object):
-    def __init__(self, conf):
-        self.conf = conf
-        self.nt = None
-
-    @staticmethod
-    def terminate(srv):
-        try:
-            srv.delete()
-            log.info("delete invoked for: {}".format(srv))
-        except Exception as err:
-            log.exception("failed to request VM termination: {}".format(err))
-
-    @staticmethod
-    def old_enough(srv):
-        dt_created = dt_parse(srv.created)
-        delta = (utc_now() - dt_created).total_seconds()
-        # log.info("Server {} created {} now {}; delta: {}".format(srv, dt_created, utc_now(), delta))
-        return delta > 60 * 5  # 5 minutes
-
-    def check_one(self, srv_id, vms_names):
-        srv = self.nt.servers.get(srv_id)
-        log.info("checking vm: {}".format(srv))
-        srv.get()
-        if srv.status.lower().strip() == "error":
-            log.info("server {} got into the error state, terminating".format(srv))
-            self.terminate(srv)
-        elif self.old_enough(srv) and srv.human_id.lower() not in vms_names:
-            log.info("server {} not placed in our db, terminating".format(srv))
-            self.terminate(srv)
-
-    def main(self):
-        """
-        Terminate erred VM's and VM's with uptime > 10 minutes and which doesn't have associated process
-        """
-        start = time.time()
-        log.info("Cleanup start")
-
-        self.nt = get_client(self.conf)
-        srv_list = self.nt.servers.list(detailed=False)
-        vms_names = get_managed_vms_names()
-        with ThreadPoolExecutor(max_workers=20) as executor:
-            future_check = {executor.submit(self.check_one, srv.id, vms_names): srv.id for srv in srv_list}
-            for future in as_completed(future_check):
-                try:
-                    future.result()
-                except Exception as exc:
-                    log.exception(exc)
-
-        log.info("cleanup consumed: {} seconds".format(time.time() - start))
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        filename="/var/log/copr-backend/cleanup_vms.log",
-        # filename="/tmp/cleanup_vms.log",
-        # stream=sys.stdout,
-        format='[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s',
-        level=logging.INFO)
-
-    log = logging.getLogger(__name__)
-
-    cleaner = Cleaner(read_config())
-    cleaner.main()
--- a/roles/copr/backend/files/cleanup_vms.sh
+++ b/roles/copr/backend/files/cleanup_vms.sh
@ -1,3 +0,0 @@
-#!/usr/bin/sh
-
-runuser -c "/home/copr/cleanup_vm_nova.py 2> /dev/null" - copr
--- a/roles/copr/backend/tasks/main.yml
+++ b/roles/copr/backend/tasks/main.yml
@ -255,10 +255,20 @@
  - redis       # TODO: .service in copr-backend should depend on redis
  - copr-backend

- copy: src="cleanup_vm_nova.py" dest=/home/copr/ mode=755
+- name: install aws cleaning script
+  copy: src="cleanup-vms-aws" dest=/usr/local/bin/ mode=755

- copy: src="cleanup_vms.sh" dest=/etc/cron.hourly/copr_cleanup_vms.sh mode=755
-  when: not devel
+- name: setup crontab for nova VMs cleanup
+  cron: name="cleanup nova VMs periodically"
+        job="/usr/bin/cleanup_vm_nova.py"
+        minute="*/20"
+        user=copr
+
+- name: setup crontab for AWS VMs cleanup
+  cron: name="cleanup AWS VMs periodically"
+        job="/usr/local/bin/cleanup-vms-aws"
+        minute="0"
+        user=copr

 - name: setup monitoring
  import_tasks: "monitoring.yml"