Add two new nagios checks for the FMN "Producers"

This commit is contained in:
Ralph Bean 2014-10-02 13:42:27 +00:00
parent ad9673cbae
commit b3a97a1c91
5 changed files with 91 additions and 0 deletions

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python
import arrow
import json
import os
import socket
import sys
import time
import zmq
# Nagios plugin exit codes.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

# How long to wait for the hub to publish on its monitoring socket (ms).
POLL_TIMEOUT_MS = 20000


def _read_monitoring_message(fname, timeout):
    """Return the next message published on the monitoring socket.

    Subscribes to the ZeroMQ IPC socket at ``fname`` and waits up to
    ``timeout`` milliseconds for one message.

    Returns the JSON-decoded payload, or ``None`` if nothing arrived in time.
    The socket and context are released on every path.
    """
    ctx = zmq.Context()
    try:
        sock = ctx.socket(zmq.SUB)
        try:
            # Drop anything pending on close so teardown can never block.
            sock.setsockopt(zmq.LINGER, 0)
            # fname is absolute, so this produces ipc:///var/run/...
            sock.connect("ipc://%s" % fname)
            # Empty prefix == subscribe to everything.  Must be bytes so the
            # call is accepted on both Python 2 and Python 3.
            sock.setsockopt(zmq.SUBSCRIBE, b'')

            poller = zmq.Poller()
            poller.register(sock, zmq.POLLIN)
            events = dict(poller.poll(timeout))
            if sock in events and events[sock] == zmq.POLLIN:
                return json.loads(sock.recv().decode('utf-8'))
            return None
        finally:
            sock.close()
    finally:
        ctx.term()


def _evaluate(msg, check_producer, elapsed_warning, elapsed_critical, now):
    """Turn a monitoring message into a Nagios ``(exit_code, text)`` pair.

    Looks up ``check_producer`` by name in ``msg['producers']`` and compares
    the seconds elapsed between ``now`` and its ``last_ran`` timestamp with
    the two thresholds.  A producer that is not listed yields UNKNOWN.
    """
    for prod in msg['producers']:
        if prod['name'] != check_producer:
            continue

        last_ran = prod['last_ran']
        # NOTE(review): presumably the hub reports null until the producer's
        # first run -- confirm.  Report that plainly instead of letting the
        # subtraction below fail with an opaque TypeError.
        if last_ran is None:
            return UNKNOWN, (
                "UNKNOWN: fedmsg producer %s has not run yet" % check_producer)

        diff = now - last_ran
        then = arrow.get(last_ran).humanize()
        details = (check_producer, then, diff)
        if diff > elapsed_critical:
            return CRITICAL, (
                "CRITICAL: %s last ran %s (%i seconds ago)" % details)
        if diff > elapsed_warning:
            return WARNING, (
                "WARNING: %s last ran %s (%i seconds ago)" % details)
        return OK, "OK: %s last ran %s (%i seconds ago)" % details

    return UNKNOWN, "UNKNOWN: fedmsg producer %s not found" % check_producer


def main(argv=None):
    """Check how long ago a fedmsg producer last ran.

    ``argv`` (default ``sys.argv``) must hold, after the program name:
    the hub service name (selects the monitoring socket), the producer name,
    the warning threshold in seconds and the critical threshold in seconds.

    Prints exactly one Nagios status line and returns the matching exit code
    (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).  Any unexpected error is
    reported as UNKNOWN rather than as a traceback.
    """
    if argv is None:
        argv = sys.argv
    try:
        # Extra trailing arguments are tolerated, as before; too few now
        # produce a usage hint instead of "list index out of range".
        if len(argv) < 5:
            print("UNKNOWN - usage: check_fedmsg_producer_last_ran.py "
                  "SERVICE PRODUCER WARNING_SECONDS CRITICAL_SECONDS")
            return UNKNOWN

        service = argv[1]
        check_producer = argv[2]
        elapsed_warning = int(argv[3])
        elapsed_critical = int(argv[4])

        fname = '/var/run/fedmsg/monitoring-%s.socket' % service
        if not os.path.exists(fname):
            print("UNKNOWN - %s does not exist" % fname)
            return UNKNOWN
        if not os.access(fname, os.W_OK):
            print("UNKNOWN - cannot write to %s" % fname)
            return UNKNOWN

        msg = _read_monitoring_message(fname, POLL_TIMEOUT_MS)
        if msg is None:
            print('UNKNOWN - ZMQ timeout. No message received in %i ms'
                  % POLL_TIMEOUT_MS)
            return UNKNOWN

        code, text = _evaluate(
            msg, check_producer, elapsed_warning, elapsed_critical,
            time.time())
        print(text)
        return code
    except Exception as err:
        # Top-level boundary: Nagios needs a status line, not a traceback.
        print("UNKNOWN: %s" % str(err))
        return UNKNOWN


if __name__ == '__main__':
    sys.exit(main())

View file

@ -27,6 +27,7 @@
- check_fcomm_queue
- check_fedmsg_consumer_backlog.py
- check_fedmsg_consumer_exceptions.py
- check_fedmsg_producer_last_ran.py
- check_fedmsg_producers_consumers.py
- check_supybot_plugin
- check_datanommer_timesince.py

View file

@ -31,3 +31,6 @@ command[check_fedmsg_cbacklog_summershum]={{libdir}}/nagios/plugins/check_fedmsg
command[check_fedmsg_cbacklog_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedoraBadgesConsumer 5000 10000
command[check_fedmsg_cbacklog_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 2000 5000
command[check_fedmsg_cbacklog_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100
# check_fedmsg_producer_last_ran.py arguments:
#   <hub service> <producer name> <warning seconds> <critical seconds>
command[check_fedmsg_fmn_digest_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 30 300
command[check_fedmsg_fmn_confirm_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300

View file

@ -410,3 +410,18 @@ define service {
check_command check_by_nrpe!check_fedmsg_cbacklog_bugzilla2fedmsg
use defaulttemplate
}
# Runs the NRPE command check_fedmsg_fmn_digest_last_ran on notifs-backend01.
define service {
host_name notifs-backend01
service_description Did the FMN digest producer run?
check_command check_by_nrpe!check_fedmsg_fmn_digest_last_ran
use defaulttemplate
}
# Runs the NRPE command check_fedmsg_fmn_confirm_last_ran on notifs-backend01.
define service {
host_name notifs-backend01
service_description Did the FMN confirmation producer run?
check_command check_by_nrpe!check_fedmsg_fmn_confirm_last_ran
use defaulttemplate
}

View file

@ -308,6 +308,9 @@ command[check_fedmsg_cbacklog_badges_backend]=/usr/lib64/nagios/plugins/check_fe
command[check_fedmsg_cbacklog_notifs_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 10 50
command[check_fedmsg_cbacklog_bugzilla2fedmsg]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100
# check_fedmsg_producer_last_ran.py arguments:
#   <hub service> <producer name> <warning seconds> <critical seconds>
# The plugin path is spelled out like every other command in this file; a
# "{{libdir}}" placeholder is only expanded in files rendered as templates.
command[check_fedmsg_fmn_digest_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 30 300
command[check_fedmsg_fmn_confirm_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300
# The following are 'action commands' where by an actual action is performed
# like restarting httpd