copr-fe: prometheus: get the data from nagios more frequently

But do not fail immediately if we cannot get the results from Nagios.
2023-02-10 14:58:44 +01:00 · 2023-02-10 14:58:44 +01:00 · 001f60b8bd
commit 001f60b8bd
parent 6d82fba602
2 changed files with 16 additions and 2 deletions
--- a/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py
+++ b/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py
@ -1,5 +1,6 @@
 #! /usr/bin/python3

+import time
 import requests
 from bs4 import BeautifulSoup
 from prometheus_client import CollectorRegistry, write_to_textfile, Gauge
@ -10,8 +11,21 @@ def collect_nagios_service_state(url, name, documentation, filename):
    gauge = Gauge(name, documentation, registry=registry)
    state = 0

+
    try:
-        response = requests.get(url)
+        # Give Nagios (or our network) some time to recover before we
+        # pollute our Grafana metrics
+        attempt = 0
+        while True:
+            attempt += 1
+            try:
+                response = requests.get(url)
+                break
+            except Exception:
+                time.sleep(10)
+                if attempt >= 4:
+                    raise
+
        soup = BeautifulSoup(response.content, features="lxml")
        if soup.select_one("div.serviceOK"):
            state = 1
--- a/roles/copr/frontend/tasks/main.yml
+++ b/roles/copr/frontend/tasks/main.yml
@ -33,7 +33,7 @@
  cron:
    name: "generating prometheus metrics"
    user: root
-    minute: 0
+    minute: "*/5"
    job: "/usr/bin/python3 /usr/bin/copr-frontend-prometheus-monitoring.py"
  tags:
  - cron_tasks