Add a nova VM cleanup script for the copr backend and run it hourly from cron.

cleanup_vm_nova.py deletes VMs that are in the ERROR state, or that are older
than 10 minutes and whose name is not found in the command line of any local
process.  cleanup_vms.sh is the cron.hourly wrapper that runs it as the copr
user.  The previous perl/shell based cleanup tasks are commented out.

Review notes: the "index" blob ids below are carried over from the original
patch and were not recomputed.  Leading whitespace of the context lines in the
main.yml hunk was reconstructed and must be checked against the target file.

diff --git a/roles/copr/backend/files/cleanup_vm_nova.py b/roles/copr/backend/files/cleanup_vm_nova.py
new file mode 100644
index 0000000000..d2101dabfc
--- /dev/null
+++ b/roles/copr/backend/files/cleanup_vm_nova.py
@@ -0,0 +1,181 @@
+#!/usr/bin/python
+# coding: utf-8
+
+# TODO: remove from ansible when a new release of copr-backend becomes available
+
+"""
+Delete nova VMs that are in the ERROR state, or that are older than
+10 minutes and are not mentioned in the command line of any local process.
+"""
+
+import os
+import sys
+import time
+import logging
+
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dateutil.parser import parse as dt_parse
+
+import psutil
+import yaml
+from novaclient.v1_1.client import Client
+
+sys.path.append("/usr/share/copr/")
+
+try:
+    from backend.helpers import utc_now
+except ImportError:
+    # TODO: remove when updated version of copr-backend will be released
+    import pytz
+
+    def utc_now():
+        """
+        :return datetime.datetime: Current utc datetime with specified timezone
+        """
+        u = datetime.utcnow()
+        u = u.replace(tzinfo=pytz.utc)
+        return u
+
+
+logging.getLogger("requests").setLevel(logging.ERROR)
+
+# Defined at module level so the Cleaner methods can log even when this file
+# is imported instead of being run as a script.
+log = logging.getLogger(__name__)
+
+
+nova_cloud_vars_path = os.environ.get("NOVA_CLOUD_VARS", "/home/copr/provision/nova_cloud_vars.yml")
+
+
+def read_config():
+    """
+    :return: parsed content of the yaml file at ``nova_cloud_vars_path``
+    """
+    with open(nova_cloud_vars_path) as handle:
+        # safe_load: the file holds plain key/value data only, so there is no
+        # reason to allow yaml.load() to construct arbitrary python objects
+        conf = yaml.safe_load(handle)
+    return conf
+
+
+def get_client(conf):
+    """
+    :param dict conf: must provide OS_USERNAME, OS_PASSWORD, OS_TENANT_NAME
+        and OS_AUTH_URL
+    :return: novaclient ``Client`` instance
+    """
+    # NOTE(review): insecure=True is kept from the original script; it
+    # presumably disables TLS certificate verification - confirm this is wanted
+    return Client(username=conf["OS_USERNAME"],
+                  api_key=conf["OS_PASSWORD"],
+                  project_id=conf["OS_TENANT_NAME"],
+                  auth_url=conf["OS_AUTH_URL"],
+                  insecure=True)
+
+
+def _proc_attr(proc, attr):
+    """
+    psutil < 2.0 exposes ``name``/``cmdline`` as properties, psutil >= 2.0 as
+    methods; return the value in both cases.
+    """
+    value = getattr(proc, attr)
+    return value() if callable(value) else value
+
+
+class Cleaner(object):
+    """
+    Finds and deletes VMs which are broken or not used by any local process.
+    """
+
+    def __init__(self, conf):
+        self.conf = conf
+        self.nt = None  # nova client, created in post_init()
+        self.ps_set = None  # text dump of local processes, see post_init()
+
+    def post_init(self):
+        """
+        Connect to nova and take a snapshot of the local process list.
+        """
+        self.nt = get_client(self.conf)
+        # TODO: use VM management after release
+        ps_lines = []
+        for proc in psutil.process_iter():
+            try:
+                ps_lines.append(_proc_attr(proc, "name") + " ".join(_proc_attr(proc, "cmdline")))
+            except psutil.NoSuchProcess:
+                # the process exited while we were iterating: nothing to record
+                continue
+        self.ps_set = "\n".join(ps_lines)
+        # log.debug("ps_set: \n{}".format(self.ps_set))
+
+    @staticmethod
+    def terminate(srv):
+        """
+        Request deletion of ``srv``; failures are logged, never raised.
+        """
+        try:
+            srv.delete()
+            log.info("delete invoked for: {}".format(srv))
+        except Exception as err:
+            log.exception("failed to request VM termination: {}".format(err))
+
+    @staticmethod
+    def old_enough(srv):
+        """
+        :return bool: True when ``srv`` was created more than 10 minutes ago
+        """
+        dt_created = dt_parse(srv.created)
+        delta = (utc_now() - dt_created).total_seconds()
+        # log.debug("Server {} created {} now {}; delta: {}".format(srv, dt_created, utc_now(), delta))
+        return delta > 60 * 10  # 10 minutes
+
+    def check_one(self, srv_id):
+        """
+        Fetch the VM with id ``srv_id`` and terminate it when it is in the
+        ERROR state, or when it is old enough and its ``human_id`` does not
+        occur in the process snapshot.
+        """
+        srv = self.nt.servers.get(srv_id)
+        log.debug("checking vm: {}".format(srv))
+        srv.get()
+        if srv.status == u"ERROR":
+            log.info("server {} got into the error state, deleting".format(srv))
+            self.terminate(srv)
+        elif self.old_enough(srv) and srv.human_id not in self.ps_set:
+            log.info("server {} not used by any builder".format(srv))
+            self.terminate(srv)
+        # elif not self.old_enough(srv):
+        #     log.info("Server {} not old enough".format(srv))
+
+    def main(self):
+        """
+        Terminate VMs in the ERROR state and VMs older than 10 minutes which have no associated process
+        """
+        self.post_init()
+        start = time.time()
+
+        srv_list = self.nt.servers.list(detailed=False)
+        with ThreadPoolExecutor(max_workers=20) as executor:
+            future_check = {executor.submit(self.check_one, srv.id): srv.id for srv in srv_list}
+            for future in as_completed(future_check):
+                try:
+                    future.result()
+                except Exception as exc:
+                    log.exception(exc)
+
+        log.info("cleanup consumed: {} seconds".format(time.time() - start))
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        filename="/var/log/copr/cleanup_vms.log",
+        # filename="/tmp/cleanup_vms.log",
+        # stream=sys.stdout,
+        format='[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s',
+        level=logging.INFO)
+
+    log.info("Logger done")
+
+    cleaner = Cleaner(read_config())
+    cleaner.main()
diff --git a/roles/copr/backend/files/cleanup_vms.sh b/roles/copr/backend/files/cleanup_vms.sh
new file mode 100644
index 0000000000..3ff63c6d0c
--- /dev/null
+++ b/roles/copr/backend/files/cleanup_vms.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# Hourly cron wrapper: run the VM cleanup script as the copr user.
+# stderr is discarded; the script writes to /var/log/copr/cleanup_vms.log.
+runuser -c "/home/copr/cleanup_vm_nova.py 2> /dev/null" - copr
diff --git a/roles/copr/backend/tasks/main.yml b/roles/copr/backend/tasks/main.yml
index fe61800a46..90abecde94 100644
--- a/roles/copr/backend/tasks/main.yml
+++ b/roles/copr/backend/tasks/main.yml
@@ -145,18 +145,24 @@
   - logstash
   - copr-backend
 
-- name: copy delete-forgotten-instances.pl
-  copy: src="delete-forgotten-instances.pl" dest=/home/copr/delete-forgotten-instances.pl mode=755
+- name: install the nova VM cleanup script
+  copy: src="cleanup_vm_nova.py" dest=/home/copr/ mode=755
+
+- name: install the hourly cron wrapper for the VM cleanup script
+  copy: src="cleanup_vms.sh" dest=/etc/cron.hourly/copr_cleanup_vms.sh mode=755
 
-- name: copy delete-forgotten-instances.cron
-  copy: src="delete-forgotten-instances.cron" dest=/etc/cron.daily/delete-forgotten-instances owner=root group=root mode=755
-  when: not devel
+#- name: copy delete-forgotten-instances.pl
+#  copy: src="delete-forgotten-instances.pl" dest=/home/copr/delete-forgotten-instances.pl mode=755
 
-- name: install script to kill VMs in error state
-  copy: src="instant-instance-killer.sh" dest="/root/"
+#- name: copy delete-forgotten-instances.cron
+#  copy: src="delete-forgotten-instances.cron" dest=/etc/cron.daily/delete-forgotten-instances owner=root group=root mode=755
+#  when: not devel
 
-- cron: name="kill VMs in error state" minute="*/15" job="/root/instant-instance-killer.sh"
-  when: not devel
+#- name: install script to kill VMs in error state
+#  copy: src="instant-instance-killer.sh" dest="/root/"
+
+#- cron: name="kill VMs in error state" minute="*/15" job="/root/instant-instance-killer.sh"
+#  when: not devel
 
 - name: setup monitoring
   include: "monitoring.yml"