From b3a97a1c91fb8226b8251fa249842b4f1a110bf0 Mon Sep 17 00:00:00 2001
From: Ralph Bean
Date: Thu, 2 Oct 2014 13:42:27 +0000
Subject: [PATCH] Add two new nagios checks for the FMN "Producers"

---
Review notes (this area is ignored when the patch is applied):

* check_fedmsg_producer_last_ran.py takes four positional arguments:
  <service> <producer> <warning-seconds> <critical-seconds>.  It reads one
  JSON message from /var/run/fedmsg/monitoring-<service>.socket (20000 ms
  poll timeout), looks up <producer> in msg['producers'] and compares
  now - last_ran against the two thresholds.  Exit status: 0 OK,
  1 WARNING, 2 CRITICAL, 3 UNKNOWN (socket missing or not writable,
  timeout, producer not found, or any other exception).

* roles/nagios_server/files/nrpe.cfg: the two new commands now use the
  literal /usr/lib64/nagios/plugins path, like every neighbouring command
  in that file.  They previously used the Jinja2 placeholder {{libdir}},
  which is only expanded in templates; this file lives under files/ and
  is presumably copied verbatim -- confirm against the task that deploys
  it.  The post-image blob id on that file's "index" line was not
  recomputed.

* Leading whitespace and column alignment inside the hunks were
  reconstructed from a whitespace-collapsed copy of this patch.  Verify
  the context lines against the target files before applying.

 .../scripts/check_fedmsg_producer_last_ran.py | 69 +++++++++++++++++++
 roles/nagios_client/tasks/main.yml            |  1 +
 .../templates/check_fedmsg_consumers.cfg.j2   |  3 +
 .../files/nagios/services/fedmsg.cfg          | 15 ++++
 roles/nagios_server/files/nrpe.cfg            |  3 +
 5 files changed, 91 insertions(+)
 create mode 100644 roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py

diff --git a/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py b/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py
new file mode 100644
index 0000000000..263d3c3351
--- /dev/null
+++ b/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import arrow
+import json
+import os
+import socket
+import sys
+import time
+import zmq
+
+try:
+    service = sys.argv[1]
+    check_producer = sys.argv[2]
+    elapsed_warning = int(sys.argv[3])
+    elapsed_critical = int(sys.argv[4])
+    fname = '/var/run/fedmsg/monitoring-%s.socket' % service
+
+    if not os.path.exists(fname):
+        print "UNKNOWN - %s does not exist" % fname
+        sys.exit(3)
+
+    if not os.access(fname, os.W_OK):
+        print "UNKNOWN - cannot write to %s" % fname
+        sys.exit(3)
+
+    connect_to = "ipc:///%s" % fname
+    ctx = zmq.Context()
+    s = ctx.socket(zmq.SUB)
+    s.connect(connect_to)
+    s.setsockopt(zmq.SUBSCRIBE, '')
+
+    poller = zmq.Poller()
+    poller.register(s, zmq.POLLIN)
+
+    timeout = 20000
+
+    events = dict(poller.poll(timeout))
+    if s in events and events[s] == zmq.POLLIN:
+        msg = s.recv()
+        msg = json.loads(msg)
+    else:
+        print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout
+        sys.exit(3)
+
+    now = time.time()
+
+    for prod in msg['producers']:
+        if prod['name'] != check_producer:
+            continue
+        diff = now - prod['last_ran']
+        then = arrow.get(prod['last_ran']).humanize()
+        if diff > elapsed_critical:
+            print "CRITICAL: %s last ran %s (%i seconds ago)" % (
+                check_producer, then, diff)
+            sys.exit(2)
+        elif diff > elapsed_warning:
+            print "WARNING: %s last ran %s (%i seconds ago)" % (
+                check_producer, then, diff)
+            sys.exit(1)
+        else:
+            print "OK: %s last ran %s (%i seconds ago)" % (
+                check_producer, then, diff)
+            sys.exit(0)
+
+    print "UNKNOWN: fedmsg producer %s not found" % check_producer
+    sys.exit(3)
+except Exception as err:
+    print "UNKNOWN:", str(err)
+    sys.exit(3)
diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml
index 231e03c7bd..713c207622 100644
--- a/roles/nagios_client/tasks/main.yml
+++ b/roles/nagios_client/tasks/main.yml
@@ -27,6 +27,7 @@
   - check_fcomm_queue
   - check_fedmsg_consumer_backlog.py
   - check_fedmsg_consumer_exceptions.py
+  - check_fedmsg_producer_last_ran.py
   - check_fedmsg_producers_consumers.py
   - check_supybot_plugin
   - check_datanommer_timesince.py
diff --git a/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2
index 9378a11844..997fca6470 100644
--- a/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2
+++ b/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2
@@ -31,3 +31,6 @@ command[check_fedmsg_cbacklog_summershum]={{libdir}}/nagios/plugins/check_fedmsg
 command[check_fedmsg_cbacklog_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedoraBadgesConsumer 5000 10000
 command[check_fedmsg_cbacklog_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 2000 5000
 command[check_fedmsg_cbacklog_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100
+
+command[check_fedmsg_fmn_digest_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 30 300
+command[check_fedmsg_fmn_confirm_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300
diff --git a/roles/nagios_server/files/nagios/services/fedmsg.cfg b/roles/nagios_server/files/nagios/services/fedmsg.cfg
index 9da766c11b..187206dc8a 100644
--- a/roles/nagios_server/files/nagios/services/fedmsg.cfg
+++ b/roles/nagios_server/files/nagios/services/fedmsg.cfg
@@ -410,3 +410,18 @@ define service {
   check_command check_by_nrpe!check_fedmsg_cbacklog_bugzilla2fedmsg
   use defaulttemplate
 }
+
+
+define service {
+  host_name notifs-backend01
+  service_description Did the FMN digest producer run?
+  check_command check_by_nrpe!check_fedmsg_fmn_digest_last_ran
+  use defaulttemplate
+}
+
+define service {
+  host_name notifs-backend01
+  service_description Did the FMN confirmation producer run?
+  check_command check_by_nrpe!check_fedmsg_fmn_confirm_last_ran
+  use defaulttemplate
+}
diff --git a/roles/nagios_server/files/nrpe.cfg b/roles/nagios_server/files/nrpe.cfg
index 70444ada29..ba68b0156e 100644
--- a/roles/nagios_server/files/nrpe.cfg
+++ b/roles/nagios_server/files/nrpe.cfg
@@ -308,6 +308,9 @@ command[check_fedmsg_cbacklog_badges_backend]=/usr/lib64/nagios/plugins/check_fe
 command[check_fedmsg_cbacklog_notifs_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 10 50
 command[check_fedmsg_cbacklog_bugzilla2fedmsg]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100
 
+command[check_fedmsg_fmn_digest_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 30 300
+command[check_fedmsg_fmn_confirm_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300
+
 # The following are 'action commands' where by an actual action is performed
 # like restarting httpd
 