From 8999dc9717aed10280bae913775e1ed2d1246d8f Mon Sep 17 00:00:00 2001
From: Ralph Bean
Date: Thu, 17 Jul 2014 19:19:16 +0000
Subject: [PATCH] Datanommer history nagios checks.

---
 .../scripts/check_datanommer_timesince.py     | 92 +++++++++++++++++++
 roles/nagios_client/tasks/main.yml            |  2 +
 .../templates/check_datanommer_history.cfg.j2 | 37 ++++++++
 3 files changed, 131 insertions(+)
 create mode 100755 roles/nagios_client/files/scripts/check_datanommer_timesince.py
 create mode 100644 roles/nagios_client/templates/check_datanommer_history.cfg.j2

diff --git a/roles/nagios_client/files/scripts/check_datanommer_timesince.py b/roles/nagios_client/files/scripts/check_datanommer_timesince.py
new file mode 100755
index 0000000000..d4fcef1725
--- /dev/null
+++ b/roles/nagios_client/files/scripts/check_datanommer_timesince.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+""" NRPE check for datanommer/fedmsg health.
+Given a category like 'bodhi', 'buildsys', or 'git', return an error if
+datanommer hasn't seen a message of that type in such and such time.
+
+Requires: python-dateutil
+
+Usage:
+
+    $ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH
+
+:Author: Ralph Bean
+
+"""
+
+import dateutil.relativedelta
+import subprocess
+import sys
+import json
+
+
+def query_timesince(category):
+    """ Return the seconds since datanommer last saw a CATEGORY message.
+
+    Runs ``datanommer-latest --category CATEGORY --timesince`` and returns
+    the first element of its JSON output as a float.  Raises RuntimeError
+    if the command exits non-zero, so the caller reports the real failure
+    rather than a JSON decoding error on empty output.
+    """
+    # An argument list (not a split string) keeps the category in one piece.
+    cmd = ['datanommer-latest', '--category', category, '--timesince']
+    sys.stderr.write("Running %r\n" % ' '.join(cmd))
+    process = subprocess.Popen(cmd, shell=False,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        raise RuntimeError("datanommer-latest exited with status %d: %s" % (
+            process.returncode, stderr.strip()))
+    data = json.loads(stdout)
+    return float(data[0])
+
+
+def main():
+    """ Check one category and exit with the matching nagios status.
+
+    Takes CATEGORY, WARNING_THRESH and CRITICAL_THRESH (both in seconds)
+    from the command line, prints a one line status and exits with
+    0 (OK), 1 (WARN) or 2 (CRIT).
+    """
+    # Without this, a missing argument would silently shift the script
+    # name into the category slot of the sys.argv[-3:] unpacking below.
+    if len(sys.argv) < 4:
+        raise ValueError("usage: check_datanommer_timesince "
+                         "CATEGORY WARNING_THRESH CRITICAL_THRESH")
+
+    category, warning_threshold, critical_threshold = sys.argv[-3:]
+    # Reject bad thresholds before paying for the datanommer query.
+    warning_threshold = int(warning_threshold)
+    critical_threshold = int(critical_threshold)
+    timesince = query_timesince(category)
+
+    time_strings = []
+    rd = dateutil.relativedelta.relativedelta(seconds=timesince)
+    denominations = ['years', 'months', 'days', 'hours', 'minutes', 'seconds']
+    for denomination in denominations:
+        value = getattr(rd, denomination, 0)
+        if value:
+            time_strings.append("%d %s" % (value, denomination))
+
+    # time_strings is empty when the elapsed time is exactly zero.
+    string = ", ".join(time_strings) or "0 seconds"
+    reason = "datanommer has not seen a %r message in %s" % (category, string)
+
+    if timesince > critical_threshold:
+        status, code = "CRIT", 2
+    elif timesince > warning_threshold:
+        status, code = "WARN", 1
+    else:
+        status, code = "OK", 0
+
+    # One formatted argument keeps the output identical on python 2 and 3.
+    print("%s:  %s" % (status, reason))
+    sys.exit(code)
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception as e:
+        # Any unexpected failure is reported as UNKNOWN (exit status 3).
+        print("UNKNOWN:  %s" % str(e))
+        sys.exit(3)
diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml
index 1661a3be1b..0f47da9ada 100644
--- a/roles/nagios_client/tasks/main.yml
+++ b/roles/nagios_client/tasks/main.yml
@@ -30,6 +30,7 @@
     - check_fedmsg_consumer_exceptions.py
     - check_fedmsg_producers_consumers.py
     - check_supybot_plugin
+    - check_datanommer_timesince.py
   tags:
   - nagios_client
 
@@ -75,6 +76,7 @@
     - check_fcomm_queue.cfg
     - check_fedmsg_consumers.cfg
     - check_supybot_fedmsg_plugin.cfg
+    - check_datanommer_history.cfg
   notify:
   - restart nrpe
   tags:
diff --git a/roles/nagios_client/templates/check_datanommer_history.cfg.j2 b/roles/nagios_client/templates/check_datanommer_history.cfg.j2
new file mode 100644
index 0000000000..4896e76ee5
--- /dev/null
+++ b/roles/nagios_client/templates/check_datanommer_history.cfg.j2
@@ -0,0 +1,37 @@
+# Checks on the datanommer history to make sure we're still receiving messages
+# of all types.
+#
+# The following are fedmsg/datanommer checks to be run on busgateway01.
+# They check for the time since the latest message in any particular category.
+# The first number is the seconds elapsed until we should raise a warning.
+# The second number is the seconds elapsed until we should raise an error.
+# For your reference:
+#   4 hours  -> 14400
+#   1 day    -> 86400
+#   3 days   -> 259200
+#   1 week   -> 604800
+#   3 weeks  -> 1814400
+#   1 month  -> 2628000
+#   3 months -> 7884000
+command[check_datanommer_buildsys]=/usr/lib/nagios/plugins/check_datanommer_timesince.py buildsys 14400 86400
+command[check_datanommer_git]=/usr/lib/nagios/plugins/check_datanommer_timesince.py git 86400 604800
+command[check_datanommer_bodhi]=/usr/lib/nagios/plugins/check_datanommer_timesince.py bodhi 86400 604800
+command[check_datanommer_wiki]=/usr/lib/nagios/plugins/check_datanommer_timesince.py wiki 259200 1814400
+command[check_datanommer_compose]=/usr/lib/nagios/plugins/check_datanommer_timesince.py compose 259200 1814400
+command[check_datanommer_meetbot]=/usr/lib/nagios/plugins/check_datanommer_timesince.py meetbot 604800 2628000
+command[check_datanommer_fas]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fas 1814400 2628000
+command[check_datanommer_pkgdb]=/usr/lib/nagios/plugins/check_datanommer_timesince.py pkgdb 1814400 2628000
+command[check_datanommer_fedoratagger]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedoratagger 2628000 7884000
+command[check_datanommer_planet]=/usr/lib/nagios/plugins/check_datanommer_timesince.py planet 2628000 7884000
+command[check_datanommer_copr]=/usr/lib/nagios/plugins/check_datanommer_timesince.py copr 21600 86400
+command[check_datanommer_trac]=/usr/lib/nagios/plugins/check_datanommer_timesince.py trac 86400 259200
+command[check_datanommer_askbot]=/usr/lib/nagios/plugins/check_datanommer_timesince.py askbot 86400 259200
+command[check_datanommer_fedbadges]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedbadges 86400 259200
+command[check_datanommer_nuancier]=/usr/lib/nagios/plugins/check_datanommer_timesince.py nuancier 23652000 31536000
+command[check_datanommer_fedocal]=/usr/lib/nagios/plugins/check_datanommer_timesince.py fedocal 7884000 23652000
+command[check_datanommer_ansible]=/usr/lib/nagios/plugins/check_datanommer_timesince.py ansible 432000 604800
+
+# These are not actually finished and deployed yet
+command[check_datanommer_mailman]=/usr/lib/nagios/plugins/check_datanommer_timesince.py mailman 14400 86400
+command[check_datanommer_cnucnuweb]=/usr/lib/nagios/plugins/check_datanommer_timesince.py cnucnuweb 604800 1814400
+command[check_datanommer_summershum]=/usr/lib/nagios/plugins/check_datanommer_timesince.py summershum 604800 1814400