Datanommer history nagios checks.
This commit is contained in:
parent
a411c40da9
commit
8999dc9717
3 changed files with 104 additions and 0 deletions
65
roles/nagios_client/files/scripts/check_datanommer_timesince.py
Executable file
65
roles/nagios_client/files/scripts/check_datanommer_timesince.py
Executable file
|
@ -0,0 +1,65 @@
|
|||
#!/usr/bin/env python
|
||||
""" NRPE check for datanommer/fedmsg health.
|
||||
Given a category like 'bodhi', 'buildsys', or 'git', return an error if
|
||||
datanommer hasn't seen a message of that category within the given warning or critical threshold (both in seconds).
|
||||
|
||||
Requires: python-dateutil
|
||||
|
||||
Usage:
|
||||
|
||||
$ check_datanommer_timesince.py CATEGORY WARNING_THRESH CRITICAL_THRESH
|
||||
|
||||
:Author: Ralph Bean <rbean@redhat.com>
|
||||
|
||||
"""
|
||||
|
||||
import dateutil.relativedelta
|
||||
import subprocess
|
||||
import sys
|
||||
import json
|
||||
|
||||
|
||||
def query_timesince(category):
    """Ask datanommer how long ago it last saw a message in ``category``.

    Runs ``datanommer-latest --category CATEGORY --timesince``, parses its
    standard output as JSON and returns the first element as a float (the
    caller treats this value as a number of seconds).

    :param category: message category to query, e.g. 'bodhi' or 'git'.
    :returns: the first element of the command's JSON output, as a float.
    :raises RuntimeError: if the command exits with a non-zero status.
    :raises ValueError: if the command's output cannot be interpreted.
    :raises OSError: if the command cannot be started at all.
    """
    # Build the argument vector directly instead of splitting a formatted
    # string, so a category containing whitespace stays a single argument.
    argv = ['datanommer-latest', '--category', '%s' % category, '--timesince']
    cmd = ' '.join(argv)
    sys.stderr.write("Running %r\n" % cmd)

    process = subprocess.Popen(argv, shell=False,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()

    # Report a failed command explicitly rather than letting it show up later
    # as an unexplained JSON decoding error on empty output.
    if process.returncode != 0:
        stderr_lines = stderr.strip().splitlines()
        # Only the last stderr line is quoted to keep the message on one line.
        detail = stderr_lines[-1] if stderr_lines else ''
        raise RuntimeError("%r exited with status %d: %r"
                           % (cmd, process.returncode, detail))

    try:
        data = json.loads(stdout)
        return float(data[0])
    except (ValueError, TypeError, IndexError, KeyError) as e:
        # Attach the raw output so the failure can be diagnosed from the
        # check result alone.
        raise ValueError("unexpected output %r from %r: %s" % (stdout, cmd, e))
|
||||
|
||||
|
||||
def main():
    """Run the check and exit with the matching Nagios status code.

    Takes CATEGORY, WARNING_THRESH and CRITICAL_THRESH from the last three
    command-line arguments (thresholds are integers, compared against the
    value returned by ``query_timesince``), prints a one-line status to
    standard output and exits with 0 (OK), 1 (WARN) or 2 (CRIT).

    :raises ValueError: if fewer than three arguments were supplied or a
        threshold is not an integer.
    """
    # sys.argv[0] is the script itself; without this guard a call with only
    # two arguments would silently use the script path as the category.
    if len(sys.argv) < 4:
        raise ValueError(
            "expected CATEGORY WARNING_THRESH CRITICAL_THRESH, got %r"
            % (sys.argv[1:],))
    category, warning_threshold, critical_threshold = sys.argv[-3:]

    # Validate the thresholds before spawning the query subprocess so that a
    # bad invocation fails fast.
    warning_threshold = int(warning_threshold)
    critical_threshold = int(critical_threshold)

    timesince = query_timesince(category)

    # Break the elapsed time down into human-readable components.
    time_strings = []
    rd = dateutil.relativedelta.relativedelta(seconds=timesince)
    for denomination in ['years', 'months', 'days',
                         'hours', 'minutes', 'seconds']:
        value = getattr(rd, denomination, 0)
        if value:
            time_strings.append("%d %s" % (value, denomination))

    # An elapsed time of exactly zero yields no components at all; say so
    # explicitly instead of ending the message with a dangling "in ".
    string = ", ".join(time_strings) or "0 seconds"
    reason = "datanommer has not seen a %r message in %s" % (category, string)

    if timesince > critical_threshold:
        label, status = "CRIT: ", 2
    elif timesince > warning_threshold:
        label, status = "WARN: ", 1
    else:
        label, status = "OK: ", 0

    # Emits the same bytes as the Python 2 statement ``print label, reason``
    # while remaining valid syntax on Python 3.
    sys.stdout.write("%s %s\n" % (label, reason))
    sys.exit(status)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point.  main() finishes through sys.exit(); SystemExit is
    # not an Exception subclass, so its status codes pass through untouched.
    # Any other failure is reported as UNKNOWN with exit status 3.
    try:
        main()
    except Exception as e:
        sys.stdout.write("%s %s\n" % ("UNKNOWN: ", str(e)))
        sys.exit(3)
|
|
@ -30,6 +30,7 @@
|
|||
- check_fedmsg_consumer_exceptions.py
|
||||
- check_fedmsg_producers_consumers.py
|
||||
- check_supybot_plugin
|
||||
- check_datanommer_timesince.py
|
||||
tags:
|
||||
- nagios_client
|
||||
|
||||
|
@ -75,6 +76,7 @@
|
|||
- check_fcomm_queue.cfg
|
||||
- check_fedmsg_consumers.cfg
|
||||
- check_supybot_fedmsg_plugin.cfg
|
||||
- check_datanommer_history.cfg
|
||||
notify:
|
||||
- restart nrpe
|
||||
tags:
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
# Checks on the datanommer history to make sure we're still receiving messages
|
||||
# of all types.
|
||||
#
|
||||
# The following are fedmsg/datanommer checks to be run on busgateway01.
|
||||
# They check for the time since the latest message in any particular category.
|
||||
# The first number is the seconds elapsed until we should raise a warning.
|
||||
# The second number is the seconds elapsed until we should raise a critical alert.
|
||||
# For your reference:
|
||||
# 4 hours -> 14400
|
||||
# 1 day -> 86400
|
||||
# 3 days -> 259200
|
||||
# 1 week -> 604800
|
||||
# 3 weeks -> 1814400
|
||||
# 1 month -> 2628000
|
||||
# 3 months -> 7884000
|
||||
command[check_datanommer_buildsys]=/usr/lib/nagios/plugins/check_datanommer_timesince.py buildsys 14400 86400
|
||||
command[check_datanommer_git]=/usr/lib/nagios/plugins/check_datanommer_timesince.py git 86400 604800
|
||||
command[check_datanommer_bodhi]=/usr/lib/nagios/plugins/check_datanommer_timesince.py bodhi 86400 604800
|
||||
command[check_datanommer_wiki]=/usr/lib/nagios/plugins/check_datanommer_timesince.py wiki 259200 1814400
|
||||
command[check_datanommer_compose]=/usr/lib/nagios/plugins/check_datanommer_timesince.py compose 259200 1814400
|
||||
command[check_datanommer_meetbot]=/usr/lib/nagios/plugins/check_datanommer_timesince.py meetbot 604800 2628000
|
||||
command[check_datanommer_fas]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fas 1814400 2628000
|
||||
command[check_datanommer_pkgdb]=/usr/lib/nagios/plugins/check_datanommer_timesince.py pkgdb 1814400 2628000
|
||||
command[check_datanommer_fedoratagger]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedoratagger 2628000 7884000
|
||||
command[check_datanommer_planet]=/usr/lib/nagios/plugins/check_datanommer_timesince.py planet 2628000 7884000
|
||||
command[check_datanommer_copr]=/usr/lib/nagios/plugins/check_datanommer_timesince.py copr 21600 86400
|
||||
command[check_datanommer_trac]=/usr/lib/nagios/plugins/check_datanommer_timesince.py trac 86400 259200
|
||||
command[check_datanommer_askbot]=/usr/lib/nagios/plugins/check_datanommer_timesince.py askbot 86400 259200
|
||||
command[check_datanommer_fedbadges]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedbadges 86400 259200
|
||||
command[check_datanommer_nuancier]=/usr/lib/nagios/plugins/check_datanommer_timesince.py nuancier 23652000 31536000
|
||||
command[check_datanommer_fedocal]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedocal 7884000 23652000
|
||||
command[check_datanommer_ansible]=/usr/lib/nagios/plugins/check_datanommer_timesince.py ansible 432000 604800
|
||||
|
||||
# These are not actually finished and deployed yet
|
||||
command[check_datanommer_mailman]=/usr/lib/nagios/plugins/check_datanommer_timesince.py mailman 14400 86400
|
||||
command[check_datanommer_cnucnuweb]=/usr/lib/nagios/plugins/check_datanommer_timesince.py cnucnuweb 604800 1814400
|
||||
command[check_datanommer_summershum]=/usr/lib/nagios/plugins/check_datanommer_timesince.py summershum 604800 1814400
|
Loading…
Add table
Add a link
Reference in a new issue