Update the datanommer Nagios check to query datagrepper directly
Signed-off-by: Aurélien Bompard <aurelien@bompard.org>
This commit is contained in:
parent
81771937c2
commit
e979a1955e
1 changed file with 67 additions and 56 deletions
|
@@ -1,74 +1,85 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python3
|
||||||
""" NRPE check for datanommer/fedmsg health.
|
""" NRPE check for datanommer/fedora-messaging health.
|
||||||
Given a category like 'bodhi', 'buildsys', or 'git', return an error if
|
Given a category like 'bodhi', 'buildsys', or 'git', return an error if
|
||||||
datanommer hasn't seen a message of that type in such and such time.
|
datanommer hasn't seen a message of that type in such and such time.
|
||||||
You can alternatively provide a 'topic' which might look like
|
You can alternatively provide a 'topic' which might look like
|
||||||
org.fedoraproject.prod.bodhi.update.comment.
|
org.fedoraproject.prod.bodhi.update.comment.
|
||||||
|
|
||||||
Requires: python-dateutil
|
Requires: python-requests
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
$ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH
|
$ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH
|
||||||
|
|
||||||
:Author: Ralph Bean <rbean@redhat.com>
|
:Author: Aurelien Bompard <abompard@fedoraproject.org>
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
from builtins import str
|
|
||||||
import dateutil.relativedelta
|
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
import json
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
def query_timesince(identifier):
|
DATAGREPPER_URL = "https://apps.fedoraproject.org/datagrepper"
|
||||||
|
|
||||||
|
|
||||||
|
def query_messages(identifier, delta):
|
||||||
|
params = {"delta": delta, "rows_per_page": 1, "page": 1}
|
||||||
# If it has a '.', then assume it is a topic.
|
# If it has a '.', then assume it is a topic.
|
||||||
if '.' in identifier:
|
if "." in identifier:
|
||||||
cmd = 'datanommer-latest --topic %s --timesince' % identifier
|
params["topic"] = identifier
|
||||||
else:
|
else:
|
||||||
cmd = 'datanommer-latest --category %s --timesince' % identifier
|
params["category"] = identifier
|
||||||
sys.stderr.write("Running %r\n" % cmd)
|
response = requests.get(f"{DATAGREPPER_URL}/v2/search", params=params)
|
||||||
process = subprocess.Popen(cmd.split(), shell=False,
|
if not response.ok:
|
||||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
print(f"UNKNOWN: Could not query {DATAGREPPER_URL}: error {response.status_code}")
|
||||||
stdout, stderr = process.communicate()
|
sys.exit(3)
|
||||||
prefix, stdout = stdout.split("INFO] ", 1)
|
result = response.json()
|
||||||
data = json.loads(stdout)
|
return result
|
||||||
return float(data[0])
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse and validate the command-line arguments of the check.

    Exactly three arguments are expected in ``sys.argv`` after the program
    name: the identifier to look for, then the warning and critical
    thresholds as integers.

    Returns:
        A ``(identifier, warning_threshold, critical_threshold)`` tuple in
        which both thresholds have been converted to ``int``.

    Exits:
        Prints a usage line and exits with status 3 when the argument count
        is wrong, a threshold is not an integer, a threshold is negative, or
        the warning threshold is greater than the critical threshold.
    """

    def _usage():
        print(f"Usage: {sys.argv[0]} CATEGORY WARNING_THRESHOLD CRITICAL_THRESHOLD")
        sys.exit(3)

    if len(sys.argv) != 4:
        _usage()

    identifier, raw_warning, raw_critical = sys.argv[1:]

    # Convert each threshold exactly once and reuse the parsed values below.
    try:
        warning_threshold = int(raw_warning)
        critical_threshold = int(raw_critical)
    except ValueError:
        _usage()

    # A negative threshold is meaningless, and the warning level must not
    # exceed the critical level.
    if not 0 <= warning_threshold <= critical_threshold:
        _usage()

    return identifier, warning_threshold, critical_threshold
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
identifier, warning_threshold, critical_threshold = sys.argv[-3:]
|
identifier, warning_threshold, critical_threshold = parse_args()
|
||||||
timesince = query_timesince(identifier)
|
result = query_messages(identifier, critical_threshold)
|
||||||
warning_threshold = int(warning_threshold)
|
|
||||||
critical_threshold = int(critical_threshold)
|
|
||||||
|
|
||||||
time_strings = []
|
if result["total"] == 0:
|
||||||
rd = dateutil.relativedelta.relativedelta(seconds=timesince)
|
print(f"CRIT: no {identifier} messages in {critical_threshold} seconds")
|
||||||
for denomination in ['years', 'months', 'days', 'hours', 'minutes', 'seconds']:
|
|
||||||
value = getattr(rd, denomination, 0)
|
|
||||||
if value:
|
|
||||||
time_strings.append("%d %s" % (value, denomination))
|
|
||||||
|
|
||||||
string = ", ".join(time_strings)
|
|
||||||
reason = "datanommer has not seen a %r message in %s" % (identifier, string)
|
|
||||||
|
|
||||||
if timesince > critical_threshold:
|
|
||||||
print("CRIT: ", reason)
|
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
if timesince > warning_threshold:
|
last_timestamp = result["raw_messages"][0]["headers"]["sent-at"]
|
||||||
print("WARN: ", reason)
|
last_timestamp = datetime.fromisoformat(last_timestamp)
|
||||||
|
seconds_since = int((datetime.now(timezone.utc) - last_timestamp).total_seconds())
|
||||||
|
reason = f"last {identifier} message was {seconds_since} seconds ago"
|
||||||
|
|
||||||
|
if seconds_since > warning_threshold:
|
||||||
|
print(f"WARN: {reason}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print("OK: ", reason)
|
print(f"OK: {reason}")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
try:
|
try:
|
||||||
main()
|
main()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("UNKNOWN: ", str(e))
|
print(f"UNKNOWN: {e}")
|
||||||
sys.exit(3)
|
sys.exit(3)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue