diff --git a/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py b/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py index 70ffc826ce..8d99529bf7 100644 --- a/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py +++ b/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py @@ -1,5 +1,6 @@ #! /usr/bin/python3 +import time import requests from bs4 import BeautifulSoup from prometheus_client import CollectorRegistry, write_to_textfile, Gauge @@ -10,8 +11,21 @@ def collect_nagios_service_state(url, name, documentation, filename): gauge = Gauge(name, documentation, registry=registry) state = 0 + try: - response = requests.get(url) + # Give Nagios (or our network) some time to recover before we + # pollute our Grafana metrics + attempt = 0 + while True: + attempt += 1 + try: + response = requests.get(url) + break + except Exception: + time.sleep(10) + if attempt >= 4: + raise + soup = BeautifulSoup(response.content, features="lxml") if soup.select_one("div.serviceOK"): state = 1 diff --git a/roles/copr/frontend/tasks/main.yml b/roles/copr/frontend/tasks/main.yml index 5b130491e2..7df7c1f088 100644 --- a/roles/copr/frontend/tasks/main.yml +++ b/roles/copr/frontend/tasks/main.yml @@ -33,7 +33,7 @@ cron: name: "generating prometheus metrics" user: root - minute: 0 + minute: "*/5" job: "/usr/bin/python3 /usr/bin/copr-frontend-prometheus-monitoring.py" tags: - cron_tasks