[copr] new script to clean up builder VMs
This commit is contained in:
parent
742e2f7951
commit
f153d2725a
3 changed files with 141 additions and 9 deletions
126
roles/copr/backend/files/cleanup_vm_nova.py
Normal file
126
roles/copr/backend/files/cleanup_vm_nova.py
Normal file
|
@ -0,0 +1,126 @@
|
|||
#!/usr/bin/python
|
||||
# coding: utf-8
|
||||
|
||||
# TODO: remove from ansible when new release on copr-backend become available
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dateutil.parser import parse as dt_parse
|
||||
|
||||
import psutil
|
||||
import yaml
|
||||
from novaclient.v1_1.client import Client
|
||||
|
||||
sys.path.append("/usr/share/copr/")
|
||||
|
||||
try:
|
||||
from backend.helpers import utc_now
|
||||
except ImportError:
|
||||
# TODO: remove when updated version of copr-backend will be released
|
||||
import pytz
|
||||
|
||||
def utc_now():
|
||||
"""
|
||||
:return datetime.datetime: Current utc datetime with specified timezone
|
||||
"""
|
||||
u = datetime.utcnow()
|
||||
u = u.replace(tzinfo=pytz.utc)
|
||||
return u
|
||||
|
||||
|
||||
logging.getLogger("requests").setLevel(logging.ERROR)
|
||||
|
||||
|
||||
nova_cloud_vars_path = os.environ.get("NOVA_CLOUD_VARS", "/home/copr/provision/nova_cloud_vars.yml")
|
||||
|
||||
|
||||
def read_config():
|
||||
with open(nova_cloud_vars_path) as handle:
|
||||
conf = yaml.load(handle.read())
|
||||
return conf
|
||||
|
||||
|
||||
def get_client(conf):
|
||||
return Client(username=conf["OS_USERNAME"],
|
||||
api_key=conf["OS_PASSWORD"],
|
||||
project_id=conf["OS_TENANT_NAME"],
|
||||
auth_url=conf["OS_AUTH_URL"],
|
||||
insecure=True)
|
||||
|
||||
|
||||
class Cleaner(object):
|
||||
def __init__(self, conf):
|
||||
self.conf = conf
|
||||
self.nt = None
|
||||
self.ps_set = None
|
||||
|
||||
def post_init(self):
|
||||
self.nt = get_client(self.conf)
|
||||
# TODO: use VM management after release
|
||||
self.ps_set = "\n".join(p.name + " ".join(p.cmdline) for p in psutil.process_iter())
|
||||
# log.debug("ps_set: \n{}".format(self.ps_set))
|
||||
|
||||
@staticmethod
|
||||
def terminate(srv):
|
||||
try:
|
||||
srv.delete()
|
||||
log.info("delete invoked for: {}".format(srv))
|
||||
except Exception as err:
|
||||
log.exception("failed to request VM termination: {}".format(err))
|
||||
|
||||
@staticmethod
|
||||
def old_enough(srv):
|
||||
dt_created = dt_parse(srv.created)
|
||||
delta = (utc_now() - dt_created).total_seconds()
|
||||
# log.debug("Server {} created {} now {}; delta: {}".format(srv, dt_created, utc_now(), delta))
|
||||
return delta > 60 * 10 # 10 minutes
|
||||
|
||||
def check_one(self, srv_id):
|
||||
srv = self.nt.servers.get(srv_id)
|
||||
log.debug("checking vm: {}".format(srv))
|
||||
srv.get()
|
||||
if srv.status == u"ERROR":
|
||||
log.info("server {} got into the error state, deleting".format(srv))
|
||||
self.terminate(srv)
|
||||
elif self.old_enough(srv) and srv.human_id not in self.ps_set:
|
||||
log.info("server {} not used by any builder".format(srv))
|
||||
self.terminate(srv)
|
||||
# elif not self.old_enough(srv):
|
||||
# log.info("Server {} not old enough".format(srv))
|
||||
|
||||
def main(self):
|
||||
"""
|
||||
Terminate erred VM's and VM's with uptime > 10 minutes and which doesn't have associated process
|
||||
"""
|
||||
self.post_init()
|
||||
start = time.time()
|
||||
|
||||
srv_list = self.nt.servers.list(detailed=False)
|
||||
with ThreadPoolExecutor(max_workers=20) as executor:
|
||||
future_check = {executor.submit(self.check_one, srv.id): srv.id for srv in srv_list}
|
||||
for future in as_completed(future_check):
|
||||
try:
|
||||
future.result()
|
||||
except Exception as exc:
|
||||
log.exception(exc)
|
||||
|
||||
log.info("cleanup consumed: {} seconds".format(time.time() - start))
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
filename="/var/log/copr/cleanup_vms.log",
|
||||
# filename="/tmp/cleanup_vms.log",
|
||||
# stream=sys.stdout,
|
||||
format='[%(asctime)s][%(thread)s][%(levelname)6s]: %(message)s',
|
||||
level=logging.INFO)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
log.info("Logger done")
|
||||
|
||||
cleaner = Cleaner(read_config())
|
||||
cleaner.main()
|
3
roles/copr/backend/files/cleanup_vms.sh
Normal file
3
roles/copr/backend/files/cleanup_vms.sh
Normal file
|
@ -0,0 +1,3 @@
|
|||
#!/usr/bin/sh
|
||||
|
||||
runuser -c "/home/copr/cleanup_vm_nova.py 2> /dev/null" - copr
|
|
@ -145,18 +145,21 @@
|
|||
- logstash
|
||||
- copr-backend
|
||||
|
||||
- name: copy delete-forgotten-instances.pl
|
||||
copy: src="delete-forgotten-instances.pl" dest=/home/copr/delete-forgotten-instances.pl mode=755
|
||||
- copy: src="cleanup_vm_nova.py" dest=/home/copr/ mode=755
|
||||
- copy: src="cleanup_vms.sh" dest=/etc/cron.hourly/copr_cleanup_vms.sh mode=755
|
||||
|
||||
- name: copy delete-forgotten-instances.cron
|
||||
copy: src="delete-forgotten-instances.cron" dest=/etc/cron.daily/delete-forgotten-instances owner=root group=root mode=755
|
||||
when: not devel
|
||||
#- name: copy delete-forgotten-instances.pl
|
||||
# copy: src="delete-forgotten-instances.pl" dest=/home/copr/delete-forgotten-instances.pl mode=755
|
||||
|
||||
- name: install script to kill VMs in error state
|
||||
copy: src="instant-instance-killer.sh" dest="/root/"
|
||||
#- name: copy delete-forgotten-instances.cron
|
||||
# copy: src="delete-forgotten-instances.cron" dest=/etc/cron.daily/delete-forgotten-instances owner=root group=root mode=755
|
||||
# when: not devel
|
||||
|
||||
- cron: name="kill VMs in error state" minute="*/15" job="/root/instant-instance-killer.sh"
|
||||
when: not devel
|
||||
#- name: install script to kill VMs in error state
|
||||
# copy: src="instant-instance-killer.sh" dest="/root/"
|
||||
|
||||
#- cron: name="kill VMs in error state" minute="*/15" job="/root/instant-instance-killer.sh"
|
||||
# when: not devel
|
||||
|
||||
- name: setup monitoring
|
||||
include: "monitoring.yml"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue