From 001f60b8bd4c663013dbb84fc4837c3654263b9a Mon Sep 17 00:00:00 2001 From: Pavel Raiskup Date: Fri, 10 Feb 2023 14:58:44 +0100 Subject: [PATCH] copr-fe: prometheus: get the data from nagios more frequently But do not fail immediately if we cannot get the results from Nagios. --- .../files/copr-frontend-prometheus-monitoring.py | 16 +++++++++++++++- roles/copr/frontend/tasks/main.yml | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py b/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py index 70ffc826ce..8d99529bf7 100644 --- a/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py +++ b/roles/copr/frontend/files/copr-frontend-prometheus-monitoring.py @@ -1,5 +1,6 @@ #! /usr/bin/python3 +import time import requests from bs4 import BeautifulSoup from prometheus_client import CollectorRegistry, write_to_textfile, Gauge @@ -10,8 +11,21 @@ def collect_nagios_service_state(url, name, documentation, filename): gauge = Gauge(name, documentation, registry=registry) state = 0 + try: - response = requests.get(url) + # Give Nagios (or our network) some time to recover before we + # pollute our Grafana metrics + attempt = 0 + while True: + attempt += 1 + try: + response = requests.get(url) + break + except Exception: + time.sleep(10) + if attempt >= 4: + raise + soup = BeautifulSoup(response.content, features="lxml") if soup.select_one("div.serviceOK"): state = 1 diff --git a/roles/copr/frontend/tasks/main.yml b/roles/copr/frontend/tasks/main.yml index 5b130491e2..7df7c1f088 100644 --- a/roles/copr/frontend/tasks/main.yml +++ b/roles/copr/frontend/tasks/main.yml @@ -33,7 +33,7 @@ cron: name: "generating prometheus metrics" user: root - minute: 0 + minute: "*/5" job: "/usr/bin/python3 /usr/bin/copr-frontend-prometheus-monitoring.py" tags: - cron_tasks