copr: fix the copr-ping checker

Go "green" as soon as possible.  Consider only the last build.
Also fail if no builds have been submitted for a long time.

Fixes: https://github.com/fedora-copr/copr/issues/2355
This commit is contained in:
Pavel Raiskup 2023-02-09 16:45:51 +01:00
parent 732e303232
commit b8f9517b4b
3 changed files with 49 additions and 76 deletions

View file

@ -86,6 +86,7 @@
- name: rebuild the copr-ping package periodically
ansible.builtin.cron:
# NOTE: keep in sync with CRON_PERIOD in roles/copr/backend/templates/copr-ping-check.py.j2
name: build the ping package
minute: "{% if devel %}0{% else %}0,30{% endif %}"
hour: "{% if devel %}1{% else %}*{% endif %}"

View file

@ -6,10 +6,13 @@ Analyze "{{ ping_log }}"
import sys
import logging
import queue
import time
FILE = "{{ ping_log }}"
TAKE_LAST_N_ATTEMPTS = 10
# NOTE: keep in sync with the cron period in roles/copr/backend/tasks/copr-ping.yml
CRON_PERIOD = 30*60
WARN_TIME = 6*60
ERR_TIME = 20*60
@ -23,91 +26,64 @@ logging.basicConfig(
format='%(message)s',
handlers=[logging.StreamHandler(sys.stdout)],
)
LOG = logging.getLogger()
NOW = time.time()
class Context: # pylint: disable=too-few-public-methods
""" Just a context structure """
status = EXIT_OK
def _main():
def set_status(context, status):
"""
Set a CTX.status to STATUS, if STATUS is worse than the actual
"""
if context.status < status:
context.status = status
def warning(context, msg, *args):
""" Throw a nagios warning """
LOG.warning(msg, *args)
set_status(context, EXIT_WARN)
def error(context, msg, *args):
""" Throw a nagios error """
LOG.error(msg, *args)
set_status(context, EXIT_CRITICAL)
def _main(context):
last_lines = queue.Queue()
with open(FILE) as file:
# Get the last log line
line = ""
with open(FILE, "r", encoding="utf-8") as file:
for line in file:
last_lines.put(line)
pass
if len(last_lines.queue) > TAKE_LAST_N_ATTEMPTS:
last_lines.get()
values = {
"start": None,
"stop": None,
"exit_status": -1,
"build_id": 0,
}
builds_checked = 0
while last_lines.queue:
# re-set the state, only the last matters
context.status = EXIT_OK
for value in line.split():
key, value = value.split('=')
values[key] = value
builds_checked += 1
if values["start"] is None or values["stop"] is None or values["exit_status"] == -1:
LOG.error("some values not set in %s", FILE)
return EXIT_CRITICAL
line = last_lines.get()
values = {
"start": None,
"stop": None,
"exit_status": -1,
"build_id": 0,
}
for value in line.split():
key, value = value.split('=')
values[key] = value
if values["start"] is None or values["stop"] is None or values["exit_status"] == -1:
LOG.error("some values not set in %s", FILE)
sys.exit(EXIT_CRITICAL)
build_id = values["build_id"]
start = int(values["start"])
stop = int(values["stop"])
took = stop - start
if took > ERR_TIME:
error(context, "Build %s took %ss (allowed %s)", build_id, took,
WARN_TIME)
elif took > WARN_TIME:
warning(context, "Build %s took %ss (allowed %s)", build_id, took,
WARN_TIME)
if int(values["exit_status"]) != 0:
error(context, "Exit status is %s (non-zero) for build ID %s",
if int(values["exit_status"]) != 0:
LOG.error("Exit status is %s (non-zero) for build ID %s",
values["exit_status"], values["build_id"])
return EXIT_CRITICAL
if context.status == EXIT_OK:
LOG.info("The last build %s for copr-ping succeeded", build_id)
build_id = values["build_id"]
stop = int(values["stop"])
start = int(values["start"])
since_last_stop = NOW - stop
measured_time = since_last_stop - CRON_PERIOD
took = stop - start
if measured_time > ERR_TIME:
LOG.error("%s seconds since the last successful build, allowed %s seconds",
since_last_stop, CRON_PERIOD + ERR_TIME)
return EXIT_CRITICAL
if measured_time > WARN_TIME:
LOG.warning("%s seconds since the last successful build, allowed %s seconds",
since_last_stop, CRON_PERIOD + WARN_TIME)
return EXIT_WARN
LOG.info("The last build %s for copr-ping succeeded, took %s seconds",
build_id, took)
return EXIT_OK
if __name__ == "__main__":
ctx = Context()
try:
_main(ctx)
sys.exit(ctx.status)
sys.exit(_main())
except Exception: # pylint: disable=broad-except
LOG.exception("UNKNOWN EXCEPTION")
sys.exit(EXIT_CRITICAL)

View file

@ -26,7 +26,7 @@ build_id=$(echo "$output" | grep 'Created builds:' | cut -d' ' -f3)
exit_status=$?
if ! expr "$build_id" : '\([0-9]*\)$'; then
# we don't even have the build_id
build_id=unknown
build_id=failed_to_submit
fi
if test $exit_status -ne 0; then
@ -36,7 +36,3 @@ fi
copr watch-build "$build_id"
exit_status=$?
log