copr: fix the copr-ping checker

Go "green" as soon as possible.  Consider only the last build.
Also fail if no builds have been submitted for a long time.

Fixes: https://github.com/fedora-copr/copr/issues/2355
This commit is contained in:
Pavel Raiskup 2023-02-09 16:45:51 +01:00
parent 732e303232
commit b8f9517b4b
3 changed files with 49 additions and 76 deletions

View file

@ -86,6 +86,7 @@
- name: rebuild the copr-ping package periodically - name: rebuild the copr-ping package periodically
ansible.builtin.cron: ansible.builtin.cron:
# NOTE: sync with CRON_PERIOD in roles/copr/backend/templates/copr-ping-check.py.j2
name: build the ping package name: build the ping package
minute: "{% if devel %}0{% else %}0,30{% endif %}" minute: "{% if devel %}0{% else %}0,30{% endif %}"
hour: "{% if devel %}1{% else %}*{% endif %}" hour: "{% if devel %}1{% else %}*{% endif %}"

View file

@ -6,10 +6,13 @@ Analyze "{{ ping_log }}"
import sys import sys
import logging import logging
import queue import time
FILE = "{{ ping_log }}" FILE = "{{ ping_log }}"
TAKE_LAST_N_ATTEMPTS = 10 TAKE_LAST_N_ATTEMPTS = 10
# NOTE: sync with cron period in roles/copr/backend/tasks/copr-ping.yml
CRON_PERIOD = 30*60
WARN_TIME = 6*60 WARN_TIME = 6*60
ERR_TIME = 20*60 ERR_TIME = 20*60
@ -23,91 +26,64 @@ logging.basicConfig(
format='%(message)s', format='%(message)s',
handlers=[logging.StreamHandler(sys.stdout)], handlers=[logging.StreamHandler(sys.stdout)],
) )
LOG = logging.getLogger() LOG = logging.getLogger()
NOW = time.time()
class Context: # pylint: disable=too-few-public-methods def _main():
""" Just a context structure """
status = EXIT_OK
# Get the last log line
def set_status(context, status): line = ""
""" with open(FILE, "r", encoding="utf-8") as file:
Set a CTX.status to STATUS, if STATUS is worse than the actual
"""
if context.status < status:
context.status = status
def warning(context, msg, *args):
""" Throw a nagios warning """
LOG.warning(msg, *args)
set_status(context, EXIT_WARN)
def error(context, msg, *args):
""" Throw a nagios error """
LOG.error(msg, *args)
set_status(context, EXIT_CRITICAL)
def _main(context):
last_lines = queue.Queue()
with open(FILE) as file:
for line in file: for line in file:
last_lines.put(line) pass
if len(last_lines.queue) > TAKE_LAST_N_ATTEMPTS: values = {
last_lines.get() "start": None,
"stop": None,
"exit_status": -1,
"build_id": 0,
}
builds_checked = 0 for value in line.split():
while last_lines.queue: key, value = value.split('=')
# re-set the state, only the last matters values[key] = value
context.status = EXIT_OK
builds_checked += 1 if values["start"] is None or values["stop"] is None or values["exit_status"] == -1:
LOG.error("some values not set in %s", FILE)
return EXIT_CRITICAL
line = last_lines.get() if int(values["exit_status"]) != 0:
values = { LOG.error("Exit status is %s (non-zero) for build ID %s",
"start": None,
"stop": None,
"exit_status": -1,
"build_id": 0,
}
for value in line.split():
key, value = value.split('=')
values[key] = value
if values["start"] is None or values["stop"] is None or values["exit_status"] == -1:
LOG.error("some values not set in %s", FILE)
sys.exit(EXIT_CRITICAL)
build_id = values["build_id"]
start = int(values["start"])
stop = int(values["stop"])
took = stop - start
if took > ERR_TIME:
error(context, "Build %s took %ss (allowed %s)", build_id, took,
WARN_TIME)
elif took > WARN_TIME:
warning(context, "Build %s took %ss (allowed %s)", build_id, took,
WARN_TIME)
if int(values["exit_status"]) != 0:
error(context, "Exit status is %s (non-zero) for build ID %s",
values["exit_status"], values["build_id"]) values["exit_status"], values["build_id"])
return EXIT_CRITICAL
if context.status == EXIT_OK: build_id = values["build_id"]
LOG.info("The last build %s for copr-ping succeeded", build_id) stop = int(values["stop"])
start = int(values["start"])
since_last_stop = NOW - stop
measured_time = since_last_stop - CRON_PERIOD
took = stop - start
if measured_time > ERR_TIME:
LOG.error("%s seconds since the last successful build, allowed %s seconds",
since_last_stop, CRON_PERIOD + ERR_TIME)
return EXIT_CRITICAL
if measured_time > WARN_TIME:
LOG.warning("%s seconds since the last successful build, allowed %s seconds",
since_last_stop, CRON_PERIOD + WARN_TIME)
return EXIT_WARN
LOG.info("The last build %s for copr-ping succeeded, took %s seconds",
build_id, took)
return EXIT_OK
if __name__ == "__main__": if __name__ == "__main__":
ctx = Context()
try: try:
_main(ctx) sys.exit(_main())
sys.exit(ctx.status)
except Exception: # pylint: disable=broad-except except Exception: # pylint: disable=broad-except
LOG.exception("UNKNOWN EXCEPTION") LOG.exception("UNKNOWN EXCEPTION")
sys.exit(EXIT_CRITICAL) sys.exit(EXIT_CRITICAL)

View file

@ -26,7 +26,7 @@ build_id=$(echo "$output" | grep 'Created builds:' | cut -d' ' -f3)
exit_status=$? exit_status=$?
if ! expr "$build_id" : '\([0-9]*\)$'; then if ! expr "$build_id" : '\([0-9]*\)$'; then
# we don't even have the build_id # we don't even have the build_id
build_id=unknown build_id=failed_to_submit
fi fi
if test $exit_status -ne 0; then if test $exit_status -ne 0; then
@ -36,7 +36,3 @@ fi
copr watch-build "$build_id" copr watch-build "$build_id"
exit_status=$? exit_status=$?
log log