diff --git a/inventory/inventory b/inventory/inventory index c87b70b45b..78d5941adf 100644 --- a/inventory/inventory +++ b/inventory/inventory @@ -1264,6 +1264,12 @@ docker-candidate-registry01.phx2.fedoraproject.org docker-registry01.stg.phx2.fedoraproject.org docker-candidate-registry01.stg.phx2.fedoraproject.org +[webservers:children] +proxies +ipsilon +ipa +fas + # # Hosts in this group have zombie processes for various reasons # and we want to not alert on those, so to the client nrpe.conf uses @@ -1276,3 +1282,4 @@ pkgs02.phx2.fedoraproject.org fed-cloud09.cloud.fedoraproject.org # Ansible from time to time in large runs has zombie threads batcave01.phx2.fedoraproject.org + diff --git a/roles/nagios/server/files/nagios-external/contacts/jstanley.cfg b/roles/nagios/server/files/nagios-external/contacts/jstanley.cfg index f0ee3dfffc..b32dd9902d 100644 --- a/roles/nagios/server/files/nagios-external/contacts/jstanley.cfg +++ b/roles/nagios/server/files/nagios-external/contacts/jstanley.cfg @@ -35,4 +35,3 @@ define contact{ email 9178159801@vtext.com pager 9178159801@vtext.com } - diff --git a/roles/nagios/server/files/nagios-external/contacts/nb.cfg b/roles/nagios/server/files/nagios-external/contacts/nb.cfg index 02c3c2a0a9..ccf2dcd937 100644 --- a/roles/nagios/server/files/nagios-external/contacts/nb.cfg +++ b/roles/nagios/server/files/nagios-external/contacts/nb.cfg @@ -10,29 +10,29 @@ define contact{ email nick@bebout.net } -define contact{ - contact_name nb-emergency - alias Nick Bebout - service_notification_period never - host_notification_period never - service_notification_options w,u,c,r - host_notification_options d,u,r - service_notification_commands notify-by-epager - host_notification_commands host-notify-by-epager - email nb5@txt.att.net - pager nb5@txt.att.net -} +#define contact{ +# contact_name nb-emergency +# alias Nick Bebout +# service_notification_period never +# host_notification_period never +# service_notification_options w,u,c,r +# 
host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email nb5@txt.att.net +# pager nb5@txt.att.net +#} -define contact{ - contact_name nbp - alias Nick Bebout - service_notification_period never - host_notification_period never - service_notification_options w,u,c,r - host_notification_options d,u,r - service_notification_commands notify-by-epager - host_notification_commands host-notify-by-epager - email nb5@txt.att.net - pager nb5@txt.att.net -} +#define contact{ +# contact_name nbp +# alias Nick Bebout +# service_notification_period never +# host_notification_period never +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email nb5@txt.att.net +# pager nb5@txt.att.net +#} diff --git a/roles/nagios/server/files/nagios-external/contacts/skvidal.cfg b/roles/nagios/server/files/nagios-external/contacts/skvidal.cfg index 2a7d65ab31..27465a1484 100644 --- a/roles/nagios/server/files/nagios-external/contacts/skvidal.cfg +++ b/roles/nagios/server/files/nagios-external/contacts/skvidal.cfg @@ -11,7 +11,19 @@ #} # #define contact{ -# contact_name skvidalp +# contact_name skvidal_xmpp +# alias Seth Vidal +# service_notification_period 24x7 +# host_notification_period 24x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-xmpp +# host_notification_commands host-notify-by-xmpp +# email skvidal@jabber.org +#} +# +#define contact{ +# contact_name skvidal-emergency # alias Seth Vidal # service_notification_period 24x7 # host_notification_period 24x7 @@ -20,5 +32,17 @@ # service_notification_commands notify-by-epager # host_notification_commands host-notify-by-epager # email page-seth-vidal@sethdot.org +#} +# +#define contact{ +# contact_name skvidalp +# alias Seth Vidal +# service_notification_period 
16x7 +# host_notification_period 16x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email page-seth-vidal@sethdot.org # pager page-seth-vidal@sethdot.org #} diff --git a/roles/nagios_client/README.rst b/roles/nagios_client/README.rst new file mode 100644 index 0000000000..cc2f3038c7 --- /dev/null +++ b/roles/nagios_client/README.rst @@ -0,0 +1,36 @@ +=================================== + Nagios 4 Configuration for Fedora +=================================== + +The Fedora Infrastructure Nagios is built on a set of configurations +originally written for Nagios 2 and then upgraded over time to Nagios +3 and then 4.08. With additional changes made in the 4.2 series of +Nagios this needed a better rewrite as various parts came from +pre-puppet and then various puppet modules added on top. + +In order to get this rewrite done, we will use as much of the original +layout of the Fedora ansible nagios module but with rewrites to better +match current Nagios configurations so that it can be maintained. + +Role directory layout +===================== +The original layout branched out from + + roles/nagios/client/ + roles/nagios/server/ + +With the usual trees below this. This breaks ansible best practices +and how most new modules are set up so the rewrite uses: + + roles/nagios_client/ + roles/nagios_server/ + +===================== + Nagios Client Files +===================== + +For the most part the Nagios Client files seem to work from the +original layout to the new site. Changes will only need to be made to +playbooks for the initial changes. 
+ + diff --git a/roles/nagios_client/files/scripts/check_datanommer_timesince.py b/roles/nagios_client/files/scripts/check_datanommer_timesince.py new file mode 100755 index 0000000000..66b4b2866d --- /dev/null +++ b/roles/nagios_client/files/scripts/check_datanommer_timesince.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +""" NRPE check for datanommer/fedmsg health. +Given a category like 'bodhi', 'buildsys', or 'git', return an error if +datanommer hasn't seen a message of that type in such and such time. +You can alternatively provide a 'topic' which might look like +org.fedoraproject.prod.bodhi.update.comment. + +Requires: python-dateutil + +Usage: + + $ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH + +:Author: Ralph Bean + +""" + +import dateutil.relativedelta +import subprocess +import sys +import json + + +def query_timesince(identifier): + # If it has a '.', then assume it is a topic. + if '.' in identifier: + cmd = 'datanommer-latest --topic %s --timesince' % identifier + else: + cmd = 'datanommer-latest --category %s --timesince' % identifier + sys.stderr.write("Running %r\n" % cmd) + process = subprocess.Popen(cmd.split(), shell=False, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + prefix, stdout = stdout.split("INFO] ", 1) + data = json.loads(stdout) + return float(data[0]) + + +def main(): + identifier, warning_threshold, critical_threshold = sys.argv[-3:] + timesince = query_timesince(identifier) + warning_threshold = int(warning_threshold) + critical_threshold = int(critical_threshold) + + time_strings = [] + rd = dateutil.relativedelta.relativedelta(seconds=timesince) + for denomination in ['years', 'months', 'days', 'hours', 'minutes', 'seconds']: + value = getattr(rd, denomination, 0) + if value: + time_strings.append("%d %s" % (value, denomination)) + + string = ", ".join(time_strings) + reason = "datanommer has not seen a %r message in %s" % (identifier, string) + + if timesince > 
critical_threshold: + print "CRIT: ", reason + sys.exit(2) + + if timesince > warning_threshold: + print "WARN: ", reason + sys.exit(1) + + print "OK: ", reason + sys.exit(0) + + +if __name__ == '__main__': + try: + main() + except Exception as e: + print "UNKNOWN: ", str(e) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_fcomm_queue b/roles/nagios_client/files/scripts/check_fcomm_queue new file mode 100644 index 0000000000..b38d7f8664 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_fcomm_queue @@ -0,0 +1,23 @@ +#!/usr/bin/env python +import sys + +try: + import retask.queue + + queue = retask.queue.Queue('fedora-packages') + queue.connect() + + items = queue.length + if items > 500: + print "CRITICAL: %i tasks in fcomm queue" % items + sys.exit(2) + elif items > 250: + print "WARNING: %i tasks in fcomm queue" % items + sys.exit(1) + else: + print "OK: %i tasks in fcomm queue" % items + sys.exit(0) + +except Exception as e: + print "UNKNOWN:", str(e) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_fedmsg_consumer_backlog.py b/roles/nagios_client/files/scripts/check_fedmsg_consumer_backlog.py new file mode 100644 index 0000000000..a70d49f41b --- /dev/null +++ b/roles/nagios_client/files/scripts/check_fedmsg_consumer_backlog.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +import json +import os +import socket +import sys +import zmq + +try: + service = sys.argv[1] + check_consumer = sys.argv[2] + backlog_warning = int(sys.argv[3]) + backlog_critical = int(sys.argv[4]) + fname = '/var/run/fedmsg/monitoring-%s.socket' % service + + if not os.path.exists(fname): + print "UNKNOWN - %s does not exist" % fname + sys.exit(3) + + if not os.access(fname, os.W_OK): + print "UNKNOWN - cannot write to %s" % fname + sys.exit(3) + + connect_to = "ipc:///%s" % fname + ctx = zmq.Context() + s = ctx.socket(zmq.SUB) + s.connect(connect_to) + s.setsockopt(zmq.SUBSCRIBE, '') + + poller = zmq.Poller() + poller.register(s, zmq.POLLIN) + + 
timeout = 20000 + + events = dict(poller.poll(timeout)) + if s in events and events[s] == zmq.POLLIN: + msg = s.recv() + msg = json.loads(msg) + else: + print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout + sys.exit(3) + + for consumer in msg['consumers']: + if consumer['name'] == check_consumer: + if consumer['backlog'] is None: + print 'ERROR: fedmsg consumer %s is not initialized' % consumer['name'] + sys.exit(3) + elif consumer['backlog'] > backlog_critical: + print 'CRITICAL: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog']) + sys.exit(2) + elif consumer['backlog'] > backlog_warning: + print 'WARNING: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog']) + sys.exit(1) + else: + print 'OK: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog']) + sys.exit(0) + + print "UNKNOWN: fedmsg consumer %s not found" % check_consumer + sys.exit(3) +except Exception as err: + print "UNKNOWN:", str(err) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_fedmsg_consumer_exceptions.py b/roles/nagios_client/files/scripts/check_fedmsg_consumer_exceptions.py new file mode 100644 index 0000000000..7fa5f6f4e4 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_fedmsg_consumer_exceptions.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +import json +import os +import socket +import sys +import zmq + +try: + service = sys.argv[1] + check_consumer = sys.argv[2] + exceptions_warning = int(sys.argv[3]) + exceptions_critical = int(sys.argv[4]) + fname = '/var/run/fedmsg/monitoring-%s.socket' % service + + if not os.path.exists(fname): + print "UNKNOWN - %s does not exist" % fname + sys.exit(3) + + if not os.access(fname, os.W_OK): + print "UNKNOWN - cannot write to %s" % fname + sys.exit(3) + + connect_to = "ipc:///%s" % fname + ctx = zmq.Context() + s = ctx.socket(zmq.SUB) + s.connect(connect_to) + s.setsockopt(zmq.SUBSCRIBE, '') + poller = zmq.Poller() + 
poller.register(s, zmq.POLLIN) + + timeout = 20000 + + events = dict(poller.poll(timeout)) + if s in events and events[s] == zmq.POLLIN: + msg = s.recv() + msg = json.loads(msg) + else: + print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout + sys.exit(3) + + for consumer in msg['consumers']: + if consumer['name'] == check_consumer: + if consumer['exceptions'] > exceptions_critical: + print 'CRITICAL: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions']) + sys.exit(2) + elif consumer['exceptions'] > exceptions_warning: + print 'WARNING: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions']) + sys.exit(1) + else: + print 'OK: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions']) + sys.exit(0) + + print "UNKNOWN: fedmsg consumers %s not found" % check_consumer + sys.exit(3) +except Exception as err: + print "UNKNOWN:", str(err) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py b/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py new file mode 100644 index 0000000000..263d3c3351 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_fedmsg_producer_last_ran.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +import arrow +import json +import os +import socket +import sys +import time +import zmq + +try: + service = sys.argv[1] + check_producer = sys.argv[2] + elapsed_warning = int(sys.argv[3]) + elapsed_critical = int(sys.argv[4]) + fname = '/var/run/fedmsg/monitoring-%s.socket' % service + + if not os.path.exists(fname): + print "UNKNOWN - %s does not exist" % fname + sys.exit(3) + + if not os.access(fname, os.W_OK): + print "UNKNOWN - cannot write to %s" % fname + sys.exit(3) + + connect_to = "ipc:///%s" % fname + ctx = zmq.Context() + s = ctx.socket(zmq.SUB) + s.connect(connect_to) + s.setsockopt(zmq.SUBSCRIBE, '') + + poller = zmq.Poller() + poller.register(s, zmq.POLLIN) + + timeout = 20000 
+ + events = dict(poller.poll(timeout)) + if s in events and events[s] == zmq.POLLIN: + msg = s.recv() + msg = json.loads(msg) + else: + print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout + sys.exit(3) + + now = time.time() + + for prod in msg['producers']: + if prod['name'] != check_producer: + continue + diff = now - prod['last_ran'] + then = arrow.get(prod['last_ran']).humanize() + if diff > elapsed_critical: + print "CRITICAL: %s last ran %s (%i seconds ago)" % ( + check_producer, then, diff) + sys.exit(2) + elif diff > elapsed_warning: + print "WARNING: %s last ran %s (%i seconds ago)" % ( + check_producer, then, diff) + sys.exit(1) + else: + print "OK: %s last ran %s (%i seconds ago)" % ( + check_producer, then, diff) + sys.exit(0) + + print "UNKNOWN: fedmsg producer %s not found" % check_producer + sys.exit(3) +except Exception as err: + print "UNKNOWN:", str(err) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_fedmsg_producers_consumers.py b/roles/nagios_client/files/scripts/check_fedmsg_producers_consumers.py new file mode 100644 index 0000000000..f5c00ccd12 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_fedmsg_producers_consumers.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import json +import os +import socket +import sys +import zmq + +try: + service = sys.argv[1] + check_list = frozenset(sys.argv[2:]) + fname = '/var/run/fedmsg/monitoring-%s.socket' % service + + if not check_list: + print "UNKNOWN - empty list of fedmsg consumers and producers to check" + sys.exit(3) + + if not os.path.exists(fname): + print "UNKNOWN - %s does not exist" % fname + sys.exit(3) + + if not os.access(fname, os.W_OK): + print "UNKNOWN - cannot write to %s" % fname + sys.exit(3) + + connect_to = "ipc:///%s" % fname + ctx = zmq.Context() + s = ctx.socket(zmq.SUB) + s.connect(connect_to) + s.setsockopt(zmq.SUBSCRIBE, '') + poller = zmq.Poller() + poller.register(s, zmq.POLLIN) + + timeout = 20000 + + events = 
dict(poller.poll(timeout)) + if s in events and events[s] == zmq.POLLIN: + msg = s.recv() + msg = json.loads(msg) + else: + print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout + sys.exit(3) + + for consumer in msg['consumers']: + if consumer['name'] in check_list and not consumer['initialized']: + print 'ERROR: fedmsg consumer %s is not initialized' % consumer['name'] + sys.exit(2) + + for producer in msg['producers']: + if producer['name'] in check_list and not producer['initialized']: + print 'ERROR: fedmsg producer %s is not initialized' % producer['name'] + sys.exit(2) + + for item in check_list: + if item not in [p['name'] for p in msg['producers'] + msg['consumers']]: + print 'ERROR: %s not found among installed plugins' % item + sys.exit(2) + + print "OK: fedmsg consumer(s) and producer(s) initialized" + sys.exit(0) + +except Exception as err: + print "UNKNOWN:", str(err) + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_haproxy_conns.py b/roles/nagios_client/files/scripts/check_haproxy_conns.py new file mode 100755 index 0000000000..e9e8c9f968 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_haproxy_conns.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +""" Nagios check for haproxy over-subscription. + +fedmsg-gateway is the primary concern as it can eat up a ton of simultaneous +connections. 
def _numeric(value):
    """ Type casting utility.

    Return ``value`` as an int if it parses as one, otherwise as a float,
    otherwise unchanged.
    """
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            continue
    return value


def query(sockname="/var/run/haproxy-stat"):
    """ Read stats from the haproxy socket and return a dict.

    Sends ``show info`` to the UNIX stats socket at ``sockname`` and parses
    the ``Key: value`` lines of the reply.  Numeric values are converted by
    ``_numeric``; lines without a colon are ignored.

    Socket errors are allowed to propagate so that the caller can report
    UNKNOWN instead of receiving ``None``.
    """
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        # Do not hang forever if haproxy accepts but never answers.
        s.settimeout(10)
        # Honour the argument; the path used to be hard-coded here.
        s.connect(sockname)
        s.sendall(b'show info\n')

        # Read until haproxy closes the connection; a single recv() may
        # return only part of the reply.
        chunks = []
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        s.close()

    response = b''.join(chunks).decode('utf-8', 'replace')

    data = {}
    for line in response.splitlines():
        # Split on the first colon only: values may contain colons.
        key, sep, value = line.partition(':')
        if not sep:
            continue
        data[key.strip()] = _numeric(value.strip())
    return data


def nagios_check(data):
    """ Print warnings and return nagios exit codes.

    ``data`` is the dict produced by ``query``.  Returns 0 (OK) below 50%
    subscription, 1 (WARN) below 75%, 2 (CRIT) up to 100% and 3 (UNKNOWN)
    above 100% or when the stats cannot be interpreted.
    """
    try:
        current = data['CurrConns']
        maxconn = data['Maxconn']
        percent = 100 * float(current) / float(maxconn)
    except (TypeError, KeyError, ValueError, ZeroDivisionError) as err:
        # Missing/garbled stats or Maxconn == 0: report UNKNOWN rather than
        # crashing with a traceback (which Nagios would read as WARNING).
        print("HAPROXY SUBS UNKNOWN: unusable haproxy stats (%r)" % (err,))
        return 3

    details = "%.2f%% subscribed. %i current of %i maxconn." % (
        percent, current, maxconn,
    )

    if percent < 50:
        print("HAPROXY SUBS OK: " + details)
        return 0

    if percent < 75:
        print("HAPROXY SUBS WARN: " + details)
        return 1

    if percent <= 100:
        print("HAPROXY SUBS CRIT: " + details)
        return 2

    print("HAPROXY SUBS UNKNOWN: " + details)
    return 3


if __name__ == '__main__':
    try:
        data = query(sockname="/var/run/haproxy-stat")
    except Exception as e:
        print("HAPROXY SUBS UNKNOWN: " + str(e))
        sys.exit(3)
    sys.exit(nagios_check(data))
stat\n') + + try: + + output = s.recv(16384).strip().split('\n') + fields = output.pop(0).split(',') + fields[0]=fields[0].replace('# ','') + proxies = list() + for line in output: + proxies.append(dict(zip(fields,line.split(',')))) + + except Exception, e: + print str(e) + finally: + s.close() + +except Exception as e: + print "MIRRORLIST STATE UNKNOWN: " + str(e) + sys.exit(3) + +total=0 +downcount=0 +downlist="" +for proxy in proxies: + if proxy['svname'] == "FRONTEND" or proxy['svname'] == "BACKEND": + continue + if proxy['pxname'] == "mirror-lists": + total+=1 + if proxy['status'] == "DOWN": + downlist+=proxy["svname"]+" " + downcount+=1 + +unavailability = 100 * float(downcount) / float(total) + +if unavailability == 0: + print "MIRRORLIST STATE OK: " + downlist + sys.exit(0) + +if unavailability < 50: + print "MIRRORLIST STATE WARN: " + downlist + sys.exit(1) + +if unavailability >= 50: + print "MIRRORLIST STATE CRIT: " + downlist + sys.exit(2) + +print "MIRRORLIST STATE UNKNOWN: " + downlist +sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_ipa_replication b/roles/nagios_client/files/scripts/check_ipa_replication new file mode 100644 index 0000000000..96ff469cf2 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_ipa_replication @@ -0,0 +1,74 @@ +#!/usr/bin/python +# Source: https://github.com/opinkerfi/nagios-plugins/blob/master/check_ipa/check_ipa_replication +# Copyright 2013, Tomas Edwardsson +# Copyright 2016, Patrick Uiterwijk +# +# This script is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
def replication_code(status):
    """Extract the result code from an nsds5replicaLastUpdateStatus value.

    Two formats are handled, as in the original script:

    * IPA >= 4.4.0: ``Error (0) Replica acquired successfully: ...``
    * IPA <  4.4.0: ``0 Replica acquired successfully: ...``

    Returns the code as a string (e.g. ``'0'``, ``'1'``, ``'-1'``).
    """
    if isinstance(status, bytes):
        status = status.decode('utf-8', 'replace')
    status = status.strip()

    prefix = 'Error ('
    if status.startswith(prefix):
        # IPA >=4.4.0
        return status[len(prefix):].partition(')')[0]

    # IPA <4.4.0.  partition() keeps the whole string when there is no
    # space, whereas slicing with find() == -1 dropped the last character.
    return status.partition(' ')[0]


def main():
    """Query cn=config for replication agreements and report their state."""
    plugin = PluginHelper()

    plugin.parser.add_option('-u', help="ldap uri", dest="uri")
    plugin.parser.add_option('-D', help="bind DN", dest="binddn")
    plugin.parser.add_option('-w', help="bind password", dest="bindpw")
    plugin.parse_arguments()

    if not plugin.options.uri:
        plugin.parser.error('-u (uri) argument is required')

    try:
        l = ldap.initialize(plugin.options.uri)

        if plugin.options.binddn:
            l.bind_s(plugin.options.binddn, plugin.options.bindpw)

        replication = l.search_s('cn=config',
                                 ldap.SCOPE_SUBTREE,
                                 '(objectclass=nsds5replicationagreement)',
                                 ['nsDS5ReplicaHost', 'nsds5replicaLastUpdateStatus'])
    except Exception as e:
        plugin.status(critical)
        plugin.add_summary("Unable to initialize ldap connection: %s" % (e))
        plugin.exit()
        return

    # Loop through replication agreements
    for rhost in replication:
        attrs = rhost[1]
        host = attrs.get('nsDS5ReplicaHost', ['unknown'])[0]
        statuses = attrs.get('nsds5replicaLastUpdateStatus')

        if not statuses:
            # NOTE(review): an agreement without a status attribute used to
            # crash with KeyError; treating it as critical is an assumption.
            plugin.add_summary("Replica %s Status: no status reported" % host)
            plugin.status(critical)
            continue

        status = statuses[0]
        plugin.add_summary("Replica %s Status: %s" % (host, status))

        code = replication_code(status)
        if code == '0':
            plugin.status(ok)
        elif code == '1':
            # Busy Replica is not an error, its "unknown" (but its "ok" for now)
            plugin.status(ok)
        else:
            plugin.status(critical)

    if not len(replication):
        plugin.add_summary("Warning: No replicas found")
        plugin.status(warning)

    plugin.exit()


if __name__ == '__main__':
    main()
use strict;
use English;
use Getopt::Long;
use File::stat;
use vars qw($PROGNAME);
use lib "/usr/lib64/nagios/plugins";
use utils qw (%ERRORS &print_revision &support);

sub print_help ();
sub print_usage ();

my ($opt_c, $opt_f, $opt_w, $opt_h, $opt_V);
my ($result, $age, $st);

$PROGNAME="check_lock_file_age";

# Default thresholds, in minutes.
$opt_w = 1;
$opt_c = 5;
$opt_f = "";

Getopt::Long::Configure('bundling');
# "file=s" (not "file"): without the "=s" the long option is a boolean flag,
# so "--file /path" would set $opt_f to 1 and the path would be ignored.
GetOptions(
	"V"   => \$opt_V, "version"	=> \$opt_V,
	"h"   => \$opt_h, "help"	=> \$opt_h,
	"f=s" => \$opt_f, "file=s"	=> \$opt_f,
	"w=f" => \$opt_w, "warning-age=f" => \$opt_w,
	"c=f" => \$opt_c, "critical-age=f" => \$opt_c)
	or do {
		# Unknown or malformed option: say so instead of carrying on.
		print_usage();
		exit $ERRORS{'UNKNOWN'};
	};

if ($opt_V) {
	print_revision($PROGNAME, '1.4.14');
	exit $ERRORS{'OK'};
}

if ($opt_h) {
	print_help();
	exit $ERRORS{'OK'};
}

if (($opt_c and $opt_w) and ($opt_c < $opt_w)) {
	print "Warning time must be less than Critical time.\n";
	exit $ERRORS{'UNKNOWN'};
}

# Allow the file to be given as a bare argument when -f was not used.
$opt_f = shift unless ($opt_f);

if (! $opt_f) {
	print "LOCK_FILE_AGE UNKNOWN: No file specified\n";
	exit $ERRORS{'UNKNOWN'};
}

# Check that file exists (can be directory or link).
# A missing lock file is the healthy state for this check.
unless (-e $opt_f) {
	print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n";
	exit $ERRORS{'OK'};
}

$st = File::stat::stat($opt_f);
unless ($st) {
	# The lock can disappear between the -e test and stat().
	if ($!{ENOENT}) {
		print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n";
		exit $ERRORS{'OK'};
	}
	print "LOCK_FILE_AGE UNKNOWN: cannot stat $opt_f: $!\n";
	exit $ERRORS{'UNKNOWN'};
}
$age = time - $st->mtime;

$result = 'OK';

# Convert minutes to seconds
if($opt_c) { $opt_c *= 60; }
if($opt_w) { $opt_w *= 60; }

if ($opt_c and $age > $opt_c) {
	$result = 'CRITICAL';
}
elsif ($opt_w and $age > $opt_w) {
	$result = 'WARNING';
}

# If the age is higher than 2 minutes, convert seconds -> minutes
# If it's higher than a day, use days.
# Just a nicety, to make people not have to do math ;)
if($age > 86400) { $age = int(($age/86400))." days"; }
elsif($age > 120) { $age = int(($age/60))." minutes"; }
else { $age = "$age seconds"; }

print "LOCK_FILE_AGE $result: $opt_f is $age old.\n";
exit $ERRORS{$result};

# Print the command-line synopsis.
sub print_usage () {
	print "Usage:\n";
	print "  $PROGNAME [-w <minutes>] [-c <minutes>] -f <file>\n";
	print "  $PROGNAME [-h | --help]\n";
	print "  $PROGNAME [-V | --version]\n";
}

# Print version, copyright, usage and the threshold explanation.
sub print_help () {
	print_revision($PROGNAME, '1.4.14');
	print "Copyright (c) 2010 Ricky Elrod\n\n";
	print_usage();
	print "\n";
	print "  <minutes>  File must be no more than this many minutes old (default: warn 1m, crit 5m)\n";
	print "\n";
	support();
}
+ +if [ $status -ne 0 ]; then + echo "CRIT: stats command got status code $status" + exit $crit +else + echo "OK: stats command got status code $status" + exit $ok +fi diff --git a/roles/nagios_client/files/scripts/check_osbs_api.py b/roles/nagios_client/files/scripts/check_osbs_api.py new file mode 100755 index 0000000000..b836f007dd --- /dev/null +++ b/roles/nagios_client/files/scripts/check_osbs_api.py @@ -0,0 +1,14 @@ +#!/usr/bin/python + +import requests +import sys + +r = requests.get("https://localhost:8443/", verify=False) + +if 'paths' in r.json().keys(): + print "OK: OSBS API endpoint is responding with path data" + sys.exit(0) +else: + print "CRITICAL: OSBS API not responding properly" + sys.exit(2) + diff --git a/roles/nagios_client/files/scripts/check_osbs_builds.py b/roles/nagios_client/files/scripts/check_osbs_builds.py new file mode 100755 index 0000000000..ef0a4700a2 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_osbs_builds.py @@ -0,0 +1,23 @@ +#!/usr/bin/python + +import subprocess +import sys + +sp = subprocess.Popen( + ["osbs", "list-builds"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=subprocess.PIPE +) +sp_out, sp_err = sp.communicate() +sp_err = sp_err.split('\n') + +if 'not attached to terminal' in sp_err[0]: + sp_err = sp_err[1:] + +if sp_err[0].split()[0] == 'BUILD': + print "OK: OSBS is responsive to 'osbs list-builds'" + sys.exit(0) +else: + print "CRITICAL: OSBS UNRESPONSIVE" + sys.exit(2) diff --git a/roles/nagios_client/files/scripts/check_postfix_queue b/roles/nagios_client/files/scripts/check_postfix_queue new file mode 100644 index 0000000000..44ab4445f9 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_postfix_queue @@ -0,0 +1,49 @@ +#!/bin/bash +# +# 19-07-2010 +# Author: Cherwin Nooitmeer +# + +# exit codes +e_ok=0 +e_warning=1 +e_critical=2 +e_unknown=3 + +# regular expression that matches queue IDs (e.g. 
def evaluate(consumers, messages, critical, warning):
    """Map queue statistics to a Nagios result.

    :param consumers: number of consumers attached to the queue
    :param messages: number of messages currently in the queue
    :param critical: message count above which the result is CRITICAL
    :param warning: message count above which the result is WARNING
    :returns: ``(exit_code, text)`` where exit_code is 0, 1 or 2
    """
    msg = 'Messages in queue: %i (%i consumers)' % (messages, consumers)

    if consumers < 1:
        return 2, 'CRITICAL: No consumers: %s' % msg

    if messages > critical:
        return 2, 'CRITICAL: %s' % msg

    if messages > warning:
        return 1, 'WARNING: %s' % msg

    return 0, 'OK: %s' % msg


def main(argv):
    """Run the check: ``argv`` is ``[prog, QUEUE, CRITICAL, WARNING]``.

    Prints one status line and returns the Nagios exit code.  Bad
    arguments and HTTP/JSON failures are reported as UNKNOWN (3) instead
    of a traceback.
    """
    if len(argv) < 4:
        print('UNKNOWN: usage: check_rabbitmq_size QUEUE CRITICAL WARNING')
        return 3

    try:
        # Thresholds arrive as strings; comparing them unconverted against
        # the integer message count never triggered on Python 2.
        critical = int(argv[2])
        warning = int(argv[3])
    except ValueError as err:
        print('UNKNOWN: thresholds must be integers: %s' % err)
        return 3

    url = 'http://localhost:15672/api/queues/%%2f/%s' % (argv[1])

    try:
        r = requests.get(url, auth=('guest', 'guest'), timeout=10).json()
        consumers = r['consumers']
        messages = r['messages']
    except Exception as err:
        print('UNKNOWN: %s' % err)
        return 3

    code, text = evaluate(consumers, messages, critical, warning)
    print(text)
    return code


if __name__ == '__main__':
    sys.exit(main(sys.argv))
string.split(open('/proc/mdstat').read(), '\n') +except IOError: + # seems we have no software raid on this machines + sys.exit(0) + +error = "" +i = 0 +for line in mdstat: + if line[0:2] == 'md': + device = string.split(line)[0] + devices.append(device) + status = string.split(mdstat[i+1])[3] + if string.count(status, "_"): + # see if we can figure out what's going on + err = string.split(mdstat[i+2]) + msg = "device=%s status=%s" % (device, status) + if len(err) > 0: + msg = msg + " rebuild=%s" % err[0] + + if not error: + error = msg + else: + error = error + ", " + msg + i = i + 1 + +if not error: + print "DEVICES %s OK" % " ".join(devices) + sys.exit(0) + +else: + print error + sys.exit(2) + diff --git a/roles/nagios_client/files/scripts/check_readonly_fs b/roles/nagios_client/files/scripts/check_readonly_fs new file mode 100755 index 0000000000..cd2b1973a7 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_readonly_fs @@ -0,0 +1,84 @@ +#!/bin/bash + +# check_readonlyfs: Check for readonly filesystems +# Copyright (C) 2010 Davide Madrisan + +PROGNAME=`/bin/basename $0` +PROGPATH=`echo $0 | sed -e 's,[\\/][^\\/][^\\/]*$,,'` +REVISION=`echo '$Revision: 1 $' | sed -e 's/[^0-9.]//g'` + +. 
$PROGPATH/utils.sh + +print_usage() { + echo "Usage: $PROGNAME --no-network-fs" + echo "Usage: $PROGNAME --help" + echo "Usage: $PROGNAME --version" +} + +print_help() { + print_revision $PROGNAME $REVISION + echo "" + print_usage + echo "" + echo "readonly filesystem checker plugin for Nagios" + echo "" + support +} + +NETFS=1 + +# Grab the command line arguments + +exitstatus=$STATE_WARNING #default + +while test -n "$1"; do + case "$1" in + --help|-h) + print_help + exit $STATE_OK + ;; + --version|-V) + print_revision $PROGNAME $REVISION + exit $STATE_OK + ;; + --no-network-fs|-n) + NETFS="0" + ;; + *) + echo "Unknown argument: $1" + print_usage + exit $STATE_UNKNOWN + ;; + esac + shift +done + +[ -r /proc/mounts ] || { echo "cannot read /proc/mounts!"; exit $STATE_UNKNOWN; } + +nerr=0 +IFS_SAVE="$IFS" + +rofs_list="" +while read dev mp fs mopt ignore; do + [ "$dev" = none ] && continue + case $fs in binfmt_misc|devpts|iso9660|proc|selinuxfs|rpc_pipefs|sysfs|tmpfs|usbfs) + continue ;; + esac + case $fs in autofs|nfs|nfs4|smbfs) + # skip the network filesystems + [ "$NETFS" = 0 ] && continue ;; + esac + + IFS=","; set -- $mopt; IFS="$IFS_SAVE" + while :; do + case "$1" in + ro) rofs_list="$rofs_list $mp"; nerr=$(( $nerr + 1 )) ;; + "") shift; break ;; + esac + shift + done +done < <(LC_ALL=C /bin/cat /proc/mounts 2>/dev/null) + +[ $nerr -eq 0 ] && { echo OK; exit $STATE_OK; } || echo "$rofs_list: read only fs" + +exit $exitstatus diff --git a/roles/nagios_client/files/scripts/check_supybot_plugin b/roles/nagios_client/files/scripts/check_supybot_plugin new file mode 100755 index 0000000000..a66ead2e7e --- /dev/null +++ b/roles/nagios_client/files/scripts/check_supybot_plugin @@ -0,0 +1,108 @@ +#!/usr/bin/env python +""" check_supybot_plugin -- ensure that a plugin is loaded by supybot. 
+ +Run like: + + check_supybot_plugin --target fedmsg + check_supybot_plugin --target koji --debug + +""" + +import argparse +import sys +import socket +import string +import uuid + + +def process_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '-t', '--target', default=None, dest='target', + help="Required. The plugin we're looking for." + ) + parser.add_argument( + '-n', '--nick', default=None, dest='nick', + help="NICK to use when connecting to freenode.", + ) + parser.add_argument( + '-d', '--debug', default=False, action='store_true', + help='Print out debug information.', dest='debug', + ) + parser.add_argument( + '-H', '--host', default='irc.freenode.net', + help='Host to connect to.', dest='host', + ) + parser.add_argument( + '-p', '--port', default=6667, type=int, + help='Host to connect to.', dest='port', + ) + return parser.parse_args() + +args = process_args() + +# Use a random nick so people can't mess with us +if not args.nick: + args.nick = 'nrpe-' + str(uuid.uuid4()).split('-')[0] + +name = "NRPE Bot" +readbuffer = "" + +if not args.target: + print "UNKNOWN: No 'target' specified." + sys.exit(3) + +args.target = args.target.lower() + +if args.debug: + print "connecting to %s/%i" % (args.host, args.port) + +try: + s = socket.socket() + s.connect((args.host, args.port)) + + if args.debug: + print "as %s/%s (%s)" % (args.nick, args.nick, name) + + s.send("nick %s\r\n" % args.nick) + s.send("USER %s %s bla :%s\r\n" % (args.nick, args.host, name)) + + while 1: + readbuffer = readbuffer+s.recv(1024) + temp = string.split(readbuffer, "\n") + readbuffer = temp.pop() + + for line in temp: + line = string.rstrip(line) + + if args.debug: + print " * ", line + + line = string.split(line) + + if line[1] == 'MODE': + msg = "privmsg zodbot :list\r\n" + if args.debug: + print "sending:" + print " ->", msg + s.send(msg) + + if line[1] == 'PRIVMSG': + if args.debug: + print "Got our response.." 
+ + plugins = map(str.lower, ' '.join(line[3:][1:]).split(', ')) + + if args.target in plugins: + print "OK" + s.send("QUIT") + sys.exit(0) + else: + print "CRITICAL: %r not loaded by supybot" % args.target + s.send("QUIT") + sys.exit(2) +except Exception as e: + print "UNKNOWN: ", str(e) + if args.debug: + raise + sys.exit(3) diff --git a/roles/nagios_client/files/scripts/check_testcloud b/roles/nagios_client/files/scripts/check_testcloud new file mode 100644 index 0000000000..eb8c7aab3b --- /dev/null +++ b/roles/nagios_client/files/scripts/check_testcloud @@ -0,0 +1,19 @@ +#!/bin/bash + +RUNNING_VMS=`testcloud instance list | grep -i 'running' | wc -l` +CRITICAL=20 +WARNING=15 + + +if [ $RUNNING_VMS -gt $CRITICAL ] +then + echo "Testcloud: CRITICAL Number of VMs running: $RUNNING_VMS" + exit 2 +elif [ $RUNNING_VMS -gt $WARNING ] +then + echo "Testcloud: WARNING Number of VMs running: $RUNNING_VMS" + exit 1 +else + echo "Testcloud: OK Number of VMs running: $RUNNING_VMS" + exit 0 +fi diff --git a/roles/nagios_client/files/selinux/fi-nrpe.mod b/roles/nagios_client/files/selinux/fi-nrpe.mod new file mode 100644 index 0000000000..f0552460cd Binary files /dev/null and b/roles/nagios_client/files/selinux/fi-nrpe.mod differ diff --git a/roles/nagios_client/files/selinux/fi-nrpe.pp b/roles/nagios_client/files/selinux/fi-nrpe.pp new file mode 100644 index 0000000000..1243b0e73e Binary files /dev/null and b/roles/nagios_client/files/selinux/fi-nrpe.pp differ diff --git a/roles/nagios_client/files/selinux/fi-nrpe.te b/roles/nagios_client/files/selinux/fi-nrpe.te new file mode 100644 index 0000000000..91bcdcc972 --- /dev/null +++ b/roles/nagios_client/files/selinux/fi-nrpe.te @@ -0,0 +1,11 @@ +module fi-nrpe 1.0; + +require { + type nagios_system_plugin_t; + type nrpe_exec_t; + class file getattr; +} + +#============= nagios_system_plugin_t ============== +allow nagios_system_plugin_t nrpe_exec_t:file getattr; + diff --git a/roles/nagios_client/handlers/main.yml 
b/roles/nagios_client/handlers/main.yml new file mode 100644 index 0000000000..11c84acd9b --- /dev/null +++ b/roles/nagios_client/handlers/main.yml @@ -0,0 +1,3 @@ +--- +- name: restart nrpe + service: name=nrpe state=restarted diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml new file mode 100644 index 0000000000..b5011aa41e --- /dev/null +++ b/roles/nagios_client/tasks/main.yml @@ -0,0 +1,228 @@ +# nagios-client/nrpe + +--- +# install pkgs: +- name: install nagios client pkgs + yum: name={{ item }} state=present + with_items: + - nrpe + - nagios-plugins + - nagios-plugins-disk + - nagios-plugins-file_age + - nagios-plugins-users + - nagios-plugins-procs + - nagios-plugins-swap + - nagios-plugins-load + - nagios-plugins-ping + tags: + - packages + - nagios_client + when: ansible_distribution_major_version|int < 22 + +# install pkgs: +- name: install nagios client pkgs + dnf: name={{ item }} state=present + with_items: + - nrpe + - nagios-plugins + - nagios-plugins-disk + - nagios-plugins-file_age + - nagios-plugins-users + - nagios-plugins-procs + - nagios-plugins-swap + - nagios-plugins-load + - nagios-plugins-ping + tags: + - packages + - nagios_client + when: ansible_distribution_major_version|int > 21 + +- name: install local nrpe check scripts that are not packaged + copy: src="scripts/{{ item }}" dest="{{ libdir }}/nagios/plugins/{{ item }}" mode=0755 owner=nagios group=nagios + with_items: + - check_haproxy_conns.py + - check_haproxy_mirrorlist.py + - check_postfix_queue + - check_raid.py + - check_lock + - check_fcomm_queue + - check_fedmsg_consumer_backlog.py + - check_fedmsg_consumer_exceptions.py + - check_fedmsg_producer_last_ran.py + - check_fedmsg_producers_consumers.py + - check_supybot_plugin + - check_rabbitmq_size + - check_datanommer_timesince.py + - check_memcache_connect + - check_readonly_fs + - check_lock_file_age + - check_testcloud + - check_osbs_builds.py + - check_osbs_api.py + - check_ipa_replication + 
when: not inventory_hostname.startswith('noc') + tags: + - nagios_client + +# create dirs +# puppet used to make /var/spool/nagios (owned by nagios.nagios) mode 750 +# and /usr/lib/nagios/plugins (owned by root) mode 755 - but we don't know WHY +# then stuff it with plugins from the plugins dir in the nagios module +# then we symlinked that to /usr/lib64/nagios/plugins +# it was a nightmare - don't do that - my ghost will haunt you if you do +# skvidal 2013-05-21 + + +# Three tasks for handling our custom selinux module +- name: ensure a directory exists for our custom selinux module + file: dest=/usr/share/nrpe state=directory + +- name: copy over our custom selinux module + copy: src=selinux/fi-nrpe.pp dest=/usr/share/nrpe/fi-nrpe.pp + register: selinux_module + +- name: install our custom selinux module + command: semodule -i /usr/share/nrpe/fi-nrpe.pp + when: ansible_distribution_major_version|int == 7 and selinux_module|changed + + +# Set up our base config. +- name: /etc/nagios/nrpe.cfg + template: src=nrpe.cfg.j2 dest=/etc/nagios/nrpe.cfg + when: not inventory_hostname.startswith('noc') + notify: + - restart nrpe + tags: + - config + - nagios_client + +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. 
+# +- name: install nrpe client configs + template: src={{ item }}.j2 dest=/etc/nrpe.d/{{ item }} + with_items: + - check_mirrorlist_cache.cfg + - check_raid.cfg + - check_ipa.cfg + - check_readonly_fs.cfg + - check_cron.cfg + - check_disk.cfg + - check_swap.cfg + - check_postfix_queue.cfg + - check_lock.cfg + - check_fedmsg_hub_proc.cfg + - check_fedmsg_irc_proc.cfg + - check_fedmsg_relay_proc.cfg + - check_fedmsg_gateway_proc.cfg + - check_fedmsg_masher_proc.cfg + - check_redis_proc.cfg + - check_autocloud_proc.cfg + - check_fedmsg_consumers.cfg + - check_supybot_fedmsg_plugin.cfg + - check_datanommer_history.cfg + - check_memcache.cfg + - check_lock_file_age.cfg + - check_basset.cfg + - check_fmn.cfg + - check_osbs.cfg + - check_koschei_polling_proc.cfg + - check_koschei_resolver_proc.cfg + - check_koschei_scheduler_proc.cfg + - check_koschei_watcher_proc.cfg + - check_testcloud.cfg + notify: + - restart nrpe + tags: + - config + - nagios_client + +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. +# +- name: install nrpe bugyou fedmsg hubs check config + template: src=check_fedmsg_hub_procs_bugyou.cfg.j2 dest=/etc/nrpe.d/check_fedmsg_hub_procs_bugyou.cfg + when: inventory_hostname.startswith('bugyou01') + notify: + - restart nrpe + tags: + - nagios_client + +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. +# +- name: install nrpe openvpn check config + template: src=check_openvpn_link.cfg.j2 dest=/etc/nrpe.d/check_openvpn_link.cfg + when: datacenter != 'phx2' + notify: + - restart nrpe + tags: + - nagios_client +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. 
+# +- name: install nrpe unbound check config + template: src=check_unbound_proc.cfg.j2 dest=/etc/nrpe.d/check_unbound_proc.cfg + when: inventory_hostname.startswith('unbound') + notify: + - restart nrpe + tags: + - nagios_client +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. +# +- name: install nrpe merged log check script on log01 + template: src=check_merged_file_age.cfg.j2 dest=/etc/nrpe.d/check_merged_file_age.cfg + when: inventory_hostname.startswith('log0') + notify: + - restart nrpe + tags: + - nagios_client +# +# The actual items files here end in .j2 (they are templates) +# So when adding or modifying them change the .j2 version in git. +# +- name: install nrpe check_mysql config for mariadb servers + template: src=check_mysql.cfg.j2 dest=/etc/nrpe.d/check_mysql.cfg + when: inventory_hostname.startswith('db03') + notify: + - restart nrpe + tags: + - nagios_client + +- name: install nrpe checks for proxies + template: src={{ item }}.j2 dest=/etc/nrpe.d/{{ item }} + with_items: + - check_happroxy_conns.cfg + - check_happroxy_mirrorlist.cfg + - check_varnish_proc.cfg + when: inventory_hostname.startswith('proxy') + notify: + - restart nrpe + tags: + - nagios_client + +- name: nrpe service start + service: name=nrpe state=running enabled=true + tags: + - service + - nagios_client + +- name: Check if the fedmsg group exists + shell: /usr/bin/getent group fedmsg | /usr/bin/wc -l | tr -d ' ' + register: fedmsg_exists + check_mode: no + changed_when: "1 != 1" + tags: + - nagios_client + +- name: Add nrpe user to the fedmsg group if it exists + user: name=nrpe groups=fedmsg append=yes + when: fedmsg_exists.stdout == "1" + tags: + - nagios_client diff --git a/roles/nagios_client/templates/check_autocloud_proc.cfg.j2 b/roles/nagios_client/templates/check_autocloud_proc.cfg.j2 new file mode 100644 index 0000000000..0aefef2e62 --- /dev/null +++ 
b/roles/nagios_client/templates/check_autocloud_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_autocloud_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'python' -a 'autocloud_job.py' -u root diff --git a/roles/nagios_client/templates/check_basset.cfg.j2 b/roles/nagios_client/templates/check_basset.cfg.j2 new file mode 100644 index 0000000000..c543d1c6b0 --- /dev/null +++ b/roles/nagios_client/templates/check_basset.cfg.j2 @@ -0,0 +1,4 @@ +command[check_mongo_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u mongodb -C mongod -c 1:1 +command[check_rabbitmq_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u rabbitmq -C beam.smp -c 1:1 +command[check_worker_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u basset-worker -C basset-worker -c 1:6 +command[check_basset_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size check_submission 10 20 diff --git a/roles/nagios_client/templates/check_cron.cfg.j2 b/roles/nagios_client/templates/check_cron.cfg.j2 new file mode 100644 index 0000000000..b2030c579b --- /dev/null +++ b/roles/nagios_client/templates/check_cron.cfg.j2 @@ -0,0 +1 @@ +command[check_cron]={{ libdir }}/nagios/plugins/check_procs -c 1:15 -C 'crond' -u root diff --git a/roles/nagios_client/templates/check_datanommer_history.cfg.j2 b/roles/nagios_client/templates/check_datanommer_history.cfg.j2 new file mode 100644 index 0000000000..2b1c6cbbec --- /dev/null +++ b/roles/nagios_client/templates/check_datanommer_history.cfg.j2 @@ -0,0 +1,50 @@ +# Checks on the datanommer history to make sure we're still receiving messages +# of all types. +# +# The following are fedmsg/datanommer checks to be run on busgateway01. +# They check for the time since the latest message in any particular category. +# The first number is the seconds elapsed until we should raise a warning. +# The second number is the seconds elapsed until we should raise an error. 
+# For your reference: +# 4 hours -> 14400 +# 1 day -> 86400 +# 3 days -> 259200 +# 1 week -> 604800 +# 3 weeks -> 1814400 +# 1 month -> 2628000 +# 3 months -> 7884000 +command[check_datanommer_buildsys]={{libdir}}/nagios/plugins/check_datanommer_timesince.py buildsys 14400 86400 +command[check_datanommer_git]={{libdir}}/nagios/plugins/check_datanommer_timesince.py git 86400 604800 +command[check_datanommer_bodhi]={{libdir}}/nagios/plugins/check_datanommer_timesince.py bodhi 86400 604800 +command[check_datanommer_wiki]={{libdir}}/nagios/plugins/check_datanommer_timesince.py wiki 259200 1814400 +command[check_datanommer_compose]={{libdir}}/nagios/plugins/check_datanommer_timesince.py compose 259200 1814400 +command[check_datanommer_meetbot]={{libdir}}/nagios/plugins/check_datanommer_timesince.py meetbot 604800 2628000 +command[check_datanommer_fas]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fas 1814400 2628000 +command[check_datanommer_pkgdb]={{libdir}}/nagios/plugins/check_datanommer_timesince.py pkgdb 1814400 2628000 +command[check_datanommer_fedoratagger]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedoratagger 2628000 7884000 +command[check_datanommer_planet]={{libdir}}/nagios/plugins/check_datanommer_timesince.py planet 2628000 7884000 +command[check_datanommer_copr]={{libdir}}/nagios/plugins/check_datanommer_timesince.py copr 21600 86400 +command[check_datanommer_trac]={{libdir}}/nagios/plugins/check_datanommer_timesince.py trac 86400 259200 +command[check_datanommer_askbot]={{libdir}}/nagios/plugins/check_datanommer_timesince.py askbot 86400 259200 +command[check_datanommer_fedbadges]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedbadges 86400 259200 +command[check_datanommer_fedocal]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedocal 7884000 23652000 +command[check_datanommer_ansible]={{libdir}}/nagios/plugins/check_datanommer_timesince.py ansible 432000 604800 
+command[check_datanommer_summershum]={{libdir}}/nagios/plugins/check_datanommer_timesince.py summershum 604800 1814400 +command[check_datanommer_jenkins]={{libdir}}/nagios/plugins/check_datanommer_timesince.py jenkins 432000 604800 +command[check_datanommer_github]={{libdir}}/nagios/plugins/check_datanommer_timesince.py github 432000 604800 +command[check_datanommer_kerneltest]={{libdir}}/nagios/plugins/check_datanommer_timesince.py kerneltest 604800 1814400 +command[check_datanommer_fmn]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fmn 604800 1814400 +command[check_datanommer_anitya]={{libdir}}/nagios/plugins/check_datanommer_timesince.py anitya 604800 1814400 +command[check_datanommer_fedimg]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedimg 259200 604800 +command[check_datanommer_hotness]={{libdir}}/nagios/plugins/check_datanommer_timesince.py hotness 604800 1814400 +command[check_datanommer_faf]={{libdir}}/nagios/plugins/check_datanommer_timesince.py faf 86400 259200 +command[check_datanommer_koschei]={{libdir}}/nagios/plugins/check_datanommer_timesince.py koschei 86400 604800 +command[check_datanommer_autocloud]={{libdir}}/nagios/plugins/check_datanommer_timesince.py autocloud 259200 1814400 +command[check_datanommer_twoweekatomic]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py org.fedoraproject.prod.releng.atomic.twoweek.complete 1296000 1382400 + +# This one is retired since it times out all the time. Too few messages. 
+#command[check_datanommer_nuancier]={{libdir}}/nagios/plugins/check_datanommer_timesince.py nuancier 23652000 31536000 + +# These are not actually finished and deployed yet +command[check_datanommer_mailman]={{libdir}}/nagios/plugins/check_datanommer_timesince.py mailman 14400 86400 +command[check_datanommer_bugzilla]={{libdir}}/nagios/plugins/check_datanommer_timesince.py bugzilla 86400 259200 diff --git a/roles/nagios_client/templates/check_disk.cfg.j2 b/roles/nagios_client/templates/check_disk.cfg.j2 new file mode 100644 index 0000000000..d2b64c5c88 --- /dev/null +++ b/roles/nagios_client/templates/check_disk.cfg.j2 @@ -0,0 +1,7 @@ +command[check_disk_/]={{ libdir }}/nagios/plugins/check_disk -w 14% -c 10% -p / +command[check_disk_/boot]={{ libdir }}/nagios/plugins/check_disk -w 15% -c 10% -p /boot +command[check_disk_/srv/cache/lookaside]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /srv/cache/lookaside +command[check_disk_/srv]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv +command[check_disk_/srv/buildmaster]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv/buildmaster +command[check_disk_/srv/taskotron]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv/taskotron +command[check_disk_/var/log]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 15% -p /var/log diff --git a/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2 new file mode 100644 index 0000000000..0b4b973489 --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_consumers.cfg.j2 @@ -0,0 +1,63 @@ +# Fedmsg checks for consumers and producers +command[check_fedmsg_cp_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Nommer MonitoringProducer +command[check_fedmsg_cp_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer 
+command[check_fedmsg_cp_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-gateway GatewayConsumer MonitoringProducer +command[check_fedmsg_cp_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer +command[check_fedmsg_cp_app]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer +command[check_fedmsg_cp_value]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-irc IRCBotConsumer MonitoringProducer +command[check_fedmsg_cp_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub GenACLsConsumer MonitoringProducer +command[check_fedmsg_cp_summershum]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub SummerShumConsumer MonitoringProducer +command[check_fedmsg_cp_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedoraBadgesConsumer MonitoringProducer +command[check_fedmsg_cp_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FMNConsumer DigestProducer ConfirmationProducer MonitoringProducer +command[check_fedmsg_cp_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py moksha-hub BugzillaConsumer MonitoringProducer +command[check_fedmsg_cp_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedimgConsumer MonitoringProducer +command[check_fedmsg_cp_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugzillaTicketFiler MonitoringProducer +command[check_fedmsg_cp_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Masher MonitoringProducer +command[check_fedmsg_cp_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub UpdatesHandler MonitoringProducer 
+command[check_fedmsg_cp_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub AutoCloudConsumer MonitoringProducer +command[check_fedmsg_cp_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub CacheInvalidator MonitoringProducer +command[check_fedmsg_cp_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugyouConsumer MonitoringProducer +command[check_fedmsg_cp_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub PDCUpdater MonitoringProducer + +command[check_fedmsg_cexceptions_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Nommer 1 10 +command[check_fedmsg_cexceptions_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10 +command[check_fedmsg_cexceptions_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-gateway GatewayConsumer 1 10 +command[check_fedmsg_cexceptions_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10 +command[check_fedmsg_cexceptions_app]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10 +command[check_fedmsg_cexceptions_value]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-irc IRCBotConsumer 1 10 +command[check_fedmsg_cexceptions_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub GenACLsConsumer 1 10 +command[check_fedmsg_cexceptions_summershum]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub SummerShumConsumer 1 10 +command[check_fedmsg_cexceptions_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedoraBadgesConsumer 1 10 +command[check_fedmsg_cexceptions_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FMNConsumer 
1 10 +command[check_fedmsg_cexceptions_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py moksha-hub BugzillaConsumer 1 10 +command[check_fedmsg_cexceptions_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedimgConsumer 1 10 +command[check_fedmsg_cexceptions_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugzillaTicketFiler 1 10 +command[check_fedmsg_cexceptions_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Masher 1 10 +command[check_fedmsg_cexceptions_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub UpdatesHandler 1 10 +command[check_fedmsg_cexceptions_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub AutoCloudConsumer 1 10 +command[check_fedmsg_cexceptions_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub CacheInvalidator 1 10 +command[check_fedmsg_cexceptions_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugyouConsumer 1 10 +command[check_fedmsg_cexceptions_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub PDCUpdater 1 10 + +command[check_fedmsg_cbacklog_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Nommer 500 1000 +command[check_fedmsg_cbacklog_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50 +command[check_fedmsg_cbacklog_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-gateway GatewayConsumer 10 50 +command[check_fedmsg_cbacklog_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50 +command[check_fedmsg_cbacklog_app]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50 
+command[check_fedmsg_cbacklog_value]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-irc IRCBotConsumer 10 50 +command[check_fedmsg_cbacklog_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub GenACLsConsumer 10 50 +command[check_fedmsg_cbacklog_summershum]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub SummerShumConsumer 100 500 +command[check_fedmsg_cbacklog_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedoraBadgesConsumer 7000 10000 +command[check_fedmsg_cbacklog_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 15000 20000 +command[check_fedmsg_cbacklog_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100 +command[check_fedmsg_cbacklog_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedimgConsumer 2000 5000 +command[check_fedmsg_cbacklog_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugzillaTicketFiler 1000 5000 +command[check_fedmsg_cbacklog_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Masher 500 1000 +command[check_fedmsg_cbacklog_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub UpdatesHandler 500 1000 +command[check_fedmsg_cbacklog_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub AutoCloudConsumer 100 500 +command[check_fedmsg_cbacklog_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub CacheInvalidator 20000 30000 +command[check_fedmsg_cbacklog_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugyouConsumer 5000 10000 +command[check_fedmsg_cbacklog_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub PDCUpdater 10000 20000 + 
+command[check_fedmsg_fmn_digest_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 90 600 +command[check_fedmsg_fmn_confirm_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 90 600 diff --git a/roles/nagios_client/templates/check_fedmsg_gateway_proc.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_gateway_proc.cfg.j2 new file mode 100644 index 0000000000..d6e9774a85 --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_gateway_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_gateway_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-gateway' -u fedmsg diff --git a/roles/nagios_client/templates/check_fedmsg_hub_proc.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_hub_proc.cfg.j2 new file mode 100644 index 0000000000..17ec341c4a --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_hub_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_hub_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u fedmsg diff --git a/roles/nagios_client/templates/check_fedmsg_hub_procs_bugyou.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_hub_procs_bugyou.cfg.j2 new file mode 100644 index 0000000000..94678ebb3d --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_hub_procs_bugyou.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_hub_procs_bugyou]={{ libdir }}/nagios/plugins/check_procs -c 3:3 -C 'fedmsg-hub' -u fedmsg diff --git a/roles/nagios_client/templates/check_fedmsg_irc_proc.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_irc_proc.cfg.j2 new file mode 100644 index 0000000000..92090dc62d --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_irc_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_irc_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-irc' -u fedmsg diff --git a/roles/nagios_client/templates/check_fedmsg_masher_proc.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_masher_proc.cfg.j2 new 
file mode 100644 index 0000000000..b6ad466b59 --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_masher_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_masher_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u apache diff --git a/roles/nagios_client/templates/check_fedmsg_relay_proc.cfg.j2 b/roles/nagios_client/templates/check_fedmsg_relay_proc.cfg.j2 new file mode 100644 index 0000000000..c471575715 --- /dev/null +++ b/roles/nagios_client/templates/check_fedmsg_relay_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_fedmsg_relay_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-relay' -u fedmsg diff --git a/roles/nagios_client/templates/check_fmn.cfg.j2 b/roles/nagios_client/templates/check_fmn.cfg.j2 new file mode 100644 index 0000000000..05111bdb2f --- /dev/null +++ b/roles/nagios_client/templates/check_fmn.cfg.j2 @@ -0,0 +1,2 @@ +command[check_fmn_worker_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size workers 200 1000 +command[check_fmn_backend_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size backends 100 200 diff --git a/roles/nagios_client/templates/check_happroxy_conns.cfg.j2 b/roles/nagios_client/templates/check_happroxy_conns.cfg.j2 new file mode 100644 index 0000000000..381d2b25a5 --- /dev/null +++ b/roles/nagios_client/templates/check_happroxy_conns.cfg.j2 @@ -0,0 +1 @@ +command[check_haproxy_conns]=/usr/lib64/nagios/plugins/check_haproxy_conns.py diff --git a/roles/nagios_client/templates/check_happroxy_mirrorlist.cfg.j2 b/roles/nagios_client/templates/check_happroxy_mirrorlist.cfg.j2 new file mode 100644 index 0000000000..241d2ce25d --- /dev/null +++ b/roles/nagios_client/templates/check_happroxy_mirrorlist.cfg.j2 @@ -0,0 +1 @@ +command[check_haproxy_mirrorlist]=/usr/lib64/nagios/plugins/check_haproxy_mirrorlist.py diff --git a/roles/nagios_client/templates/check_ipa.cfg.j2 b/roles/nagios_client/templates/check_ipa.cfg.j2 new file mode 100644 index 0000000000..031473879f --- /dev/null +++ 
b/roles/nagios_client/templates/check_ipa.cfg.j2 @@ -0,0 +1 @@ +command[check_ipa_replication]={{ libdir }}/nagios/plugins/check_ipa_replication -u ldaps://localhost/ diff --git a/roles/nagios_client/templates/check_koschei_polling_proc.cfg.j2 b/roles/nagios_client/templates/check_koschei_polling_proc.cfg.j2 new file mode 100644 index 0000000000..d71f83a92c --- /dev/null +++ b/roles/nagios_client/templates/check_koschei_polling_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_koschei_polling_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-polling -c 1:1 diff --git a/roles/nagios_client/templates/check_koschei_resolver_proc.cfg.j2 b/roles/nagios_client/templates/check_koschei_resolver_proc.cfg.j2 new file mode 100644 index 0000000000..ff4814291c --- /dev/null +++ b/roles/nagios_client/templates/check_koschei_resolver_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_koschei_resolver_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-resolve -c 1:1 diff --git a/roles/nagios_client/templates/check_koschei_scheduler_proc.cfg.j2 b/roles/nagios_client/templates/check_koschei_scheduler_proc.cfg.j2 new file mode 100644 index 0000000000..c094915e85 --- /dev/null +++ b/roles/nagios_client/templates/check_koschei_scheduler_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_koschei_scheduler_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-schedul -c 1:1 diff --git a/roles/nagios_client/templates/check_koschei_watcher_proc.cfg.j2 b/roles/nagios_client/templates/check_koschei_watcher_proc.cfg.j2 new file mode 100644 index 0000000000..620ab4b574 --- /dev/null +++ b/roles/nagios_client/templates/check_koschei_watcher_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_koschei_watcher_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-watcher -c 1:1 diff --git a/roles/nagios_client/templates/check_lock.cfg.j2 b/roles/nagios_client/templates/check_lock.cfg.j2 new file mode 100644 index 0000000000..70015b7652 --- 
/dev/null +++ b/roles/nagios_client/templates/check_lock.cfg.j2 @@ -0,0 +1 @@ +command[check_lock]={{ libdir }}/nagios/plugins/check_lock diff --git a/roles/nagios_client/templates/check_lock_file_age.cfg.j2 b/roles/nagios_client/templates/check_lock_file_age.cfg.j2 new file mode 100644 index 0000000000..c36459a449 --- /dev/null +++ b/roles/nagios_client/templates/check_lock_file_age.cfg.j2 @@ -0,0 +1 @@ +command[check_lock_file_age]={{ libdir }}/nagios/plugins/check_lock_file_age -w 1 -c 5 -f /var/lock/fedora-ca/lock diff --git a/roles/nagios_client/templates/check_memcache.cfg.j2 b/roles/nagios_client/templates/check_memcache.cfg.j2 new file mode 100644 index 0000000000..b0ec100a5d --- /dev/null +++ b/roles/nagios_client/templates/check_memcache.cfg.j2 @@ -0,0 +1,2 @@ +command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect diff --git a/roles/nagios_client/templates/check_merged_file_age.cfg.j2 b/roles/nagios_client/templates/check_merged_file_age.cfg.j2 new file mode 100644 index 0000000000..90df1c7b42 --- /dev/null +++ b/roles/nagios_client/templates/check_merged_file_age.cfg.j2 @@ -0,0 +1 @@ +command[check_merged_file_age]=/usr/lib64/nagios/plugins/check_file_age -w 120 -c 300 /var/log/merged/messages.log diff --git a/roles/nagios_client/templates/check_mirrorlist_cache.cfg.j2 b/roles/nagios_client/templates/check_mirrorlist_cache.cfg.j2 new file mode 100644 index 0000000000..94c58be10f --- /dev/null +++ b/roles/nagios_client/templates/check_mirrorlist_cache.cfg.j2 @@ -0,0 +1 @@ +command[check_mirrorlist_cache]={{ libdir }}/nagios/plugins/check_file_age -w 14400 -c 129600 -f /var/lib/mirrormanager/mirrorlist_cache.pkl diff --git a/roles/nagios_client/templates/check_mysql.cfg.j2 b/roles/nagios_client/templates/check_mysql.cfg.j2 new file mode 100644 index 0000000000..2b825d2a54 --- /dev/null +++ 
b/roles/nagios_client/templates/check_mysql.cfg.j2 @@ -0,0 +1 @@ +command[check_mysql_backup]={{ libdir }}/nagios/plugins/check_file_age -w 86400 -c 129600 -f /backups/fpo-mediawiki-latest.xz diff --git a/roles/nagios_client/templates/check_openvpn_link.cfg.j2 b/roles/nagios_client/templates/check_openvpn_link.cfg.j2 new file mode 100644 index 0000000000..77d3e660c9 --- /dev/null +++ b/roles/nagios_client/templates/check_openvpn_link.cfg.j2 @@ -0,0 +1 @@ +command[check_openvpn_link]={{ libdir }}/nagios/plugins/check_ping -H 192.168.1.41 -w 375.0,20% -c 500,60% diff --git a/roles/nagios_client/templates/check_osbs.cfg.j2 b/roles/nagios_client/templates/check_osbs.cfg.j2 new file mode 100644 index 0000000000..1b427f3816 --- /dev/null +++ b/roles/nagios_client/templates/check_osbs.cfg.j2 @@ -0,0 +1,2 @@ +command[check_osbs_builds]={{ libdir }}/nagios/plugins/check_osbs_builds.py +command[check_osbs_api]={{ libdir }}/nagios/plugins/check_osbs_api.py diff --git a/roles/nagios_client/templates/check_postfix_queue.cfg.j2 b/roles/nagios_client/templates/check_postfix_queue.cfg.j2 new file mode 100644 index 0000000000..40ab59225e --- /dev/null +++ b/roles/nagios_client/templates/check_postfix_queue.cfg.j2 @@ -0,0 +1 @@ +command[check_postfix_queue]={{ libdir }}/nagios/plugins/check_postfix_queue -w {{ nrpe_check_postfix_queue_warn }} -c {{ nrpe_check_postfix_queue_crit }} diff --git a/roles/nagios_client/templates/check_raid.cfg.j2 b/roles/nagios_client/templates/check_raid.cfg.j2 new file mode 100644 index 0000000000..ef47d12297 --- /dev/null +++ b/roles/nagios_client/templates/check_raid.cfg.j2 @@ -0,0 +1 @@ +command[check_raid]={{ libdir }}/nagios/plugins/check_raid.py diff --git a/roles/nagios_client/templates/check_readonly_fs.cfg.j2 b/roles/nagios_client/templates/check_readonly_fs.cfg.j2 new file mode 100644 index 0000000000..df896b7dc3 --- /dev/null +++ b/roles/nagios_client/templates/check_readonly_fs.cfg.j2 @@ -0,0 +1 @@ 
+command[check_readonly_fs]=/usr/lib64/nagios/plugins/check_readonly_fs diff --git a/roles/nagios_client/templates/check_redis_proc.cfg.j2 b/roles/nagios_client/templates/check_redis_proc.cfg.j2 new file mode 100644 index 0000000000..7f05bc5d66 --- /dev/null +++ b/roles/nagios_client/templates/check_redis_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_redis_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'redis-server' -u redis diff --git a/roles/nagios_client/templates/check_supybot_fedmsg_plugin.cfg.j2 b/roles/nagios_client/templates/check_supybot_fedmsg_plugin.cfg.j2 new file mode 100644 index 0000000000..514cf751cc --- /dev/null +++ b/roles/nagios_client/templates/check_supybot_fedmsg_plugin.cfg.j2 @@ -0,0 +1 @@ +command[check_supybot_fedmsg_plugin]={{libdir}}/nagios/plugins/check_supybot_plugin -t fedmsg diff --git a/roles/nagios_client/templates/check_swap.cfg.j2 b/roles/nagios_client/templates/check_swap.cfg.j2 new file mode 100644 index 0000000000..68695c9c57 --- /dev/null +++ b/roles/nagios_client/templates/check_swap.cfg.j2 @@ -0,0 +1 @@ +command[check_swap]={{ libdir }}/nagios/plugins/check_swap -w 15% -c 10% diff --git a/roles/nagios_client/templates/check_testcloud.cfg.j2 b/roles/nagios_client/templates/check_testcloud.cfg.j2 new file mode 100644 index 0000000000..25a314f2b1 --- /dev/null +++ b/roles/nagios_client/templates/check_testcloud.cfg.j2 @@ -0,0 +1 @@ +command[check_testcloud]={{ libdir }}/nagios/plugins/check_testcloud diff --git a/roles/nagios_client/templates/check_unbound_proc.cfg.j2 b/roles/nagios_client/templates/check_unbound_proc.cfg.j2 new file mode 100644 index 0000000000..cbae839cfd --- /dev/null +++ b/roles/nagios_client/templates/check_unbound_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_unbound_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'unbound' -u unbound diff --git a/roles/nagios_client/templates/check_varnish_proc.cfg.j2 b/roles/nagios_client/templates/check_varnish_proc.cfg.j2 new file mode 100644 index 
0000000000..3935c16e5a --- /dev/null +++ b/roles/nagios_client/templates/check_varnish_proc.cfg.j2 @@ -0,0 +1 @@ +command[check_varnish_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:2 -C 'varnishd' -u varnish diff --git a/roles/nagios_client/templates/nrpe.cfg.j2 b/roles/nagios_client/templates/nrpe.cfg.j2 new file mode 100644 index 0000000000..64e8a2aa6d --- /dev/null +++ b/roles/nagios_client/templates/nrpe.cfg.j2 @@ -0,0 +1,228 @@ +############################################################################# +# Sample NRPE Config File +# Written by: Ethan Galstad (nagios@nagios.org) +# +# Last Modified: 11-23-2007 +# +# NOTES: +# This is a sample configuration file for the NRPE daemon. It needs to be +# located on the remote host that is running the NRPE daemon, not the host +# from which the check_nrpe client is being executed. +############################################################################# + + +# LOG FACILITY +# The syslog facility that should be used for logging purposes. + +log_facility=daemon + + + +# PID FILE +# The name of the file in which the NRPE daemon should write it's process ID +# number. The file is only written if the NRPE daemon is started by the root +# user and is running in standalone mode. + +pid_file=/var/run/nrpe/nrpe.pid + + + +# PORT NUMBER +# Port number we should wait for connections on. +# NOTE: This must be a non-priviledged port (i.e. > 1024). +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +server_port=5666 + + + +# SERVER ADDRESS +# Address that nrpe should bind to in case there are more than one interface +# and you do not want nrpe to bind on all interfaces. +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +#server_address=127.0.0.1 + + + +# NRPE USER +# This determines the effective user that the NRPE daemon should run as. +# You can either supply a username or a UID. 
+# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +nrpe_user=nrpe + + + +# NRPE GROUP +# This determines the effective group that the NRPE daemon should run as. +# You can either supply a group name or a GID. +# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +nrpe_group=nrpe + + + +# ALLOWED HOST ADDRESSES +# This is an optional comma-delimited list of IP addresses or hostnames +# that are allowed to talk to the NRPE daemon. Network addresses with a bit mask +# (i.e. 192.168.1.0/24) are also supported. Hostname wildcards are not currently +# supported. +# +# Note: The daemon only does rudimentary checking of the client's IP +# address. I would highly recommend adding entries in your /etc/hosts.allow +# file to allow only the specified host to connect to the port +# you are running this daemon on. +# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + + +allowed_hosts=10.5.126.41,192.168.1.10,192.168.1.20,209.132.181.35 + + + +# COMMAND ARGUMENT PROCESSING +# This option determines whether or not the NRPE daemon will allow clients +# to specify arguments to commands that are executed. This option only works +# if the daemon was configured with the --enable-command-args configure script +# option. +# +# *** ENABLING THIS OPTION IS A SECURITY RISK! *** +# Read the SECURITY file for information on some of the security implications +# of enabling this variable. +# +# Values: 0=do not allow arguments, 1=allow command arguments + +dont_blame_nrpe=0 + + + +# COMMAND PREFIX +# This option allows you to prefix all commands with a user-defined string. +# A space is automatically added between the specified prefix string and the +# command line from the command definition. +# +# *** THIS EXAMPLE MAY POSE A POTENTIAL SECURITY RISK, SO USE WITH CAUTION! *** +# Usage scenario: +# Execute restricted commands using sudo. 
For this to work, you need to add +# the nagios user to your /etc/sudoers. An example entry for allowing +# execution of the plugins from might be: +# +# nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/ +# +# This lets the nagios user run all commands in that directory (and only them) +# without asking for a password. If you do this, make sure you don't give +# random users write access to that directory or its contents! + +# command_prefix=/usr/bin/sudo + + + +# DEBUGGING OPTION +# This option determines whether or not debugging messages are logged to the +# syslog facility. +# Values: 0=debugging off, 1=debugging on + +debug=0 + + + +# COMMAND TIMEOUT +# This specifies the maximum number of seconds that the NRPE daemon will +# allow plugins to finish executing before killing them off. + +command_timeout=100 + + + +# CONNECTION TIMEOUT +# This specifies the maximum number of seconds that the NRPE daemon will +# wait for a connection to be established before exiting. This is sometimes +# seen where a network problem stops the SSL being established even though +# all network sessions are connected. This causes the nrpe daemons to +# accumulate, eating system resources. Do not set this too low. + +connection_timeout=300 + + + +# WEAK RANDOM SEED OPTION +# This directive allows you to use SSL even if your system does not have +# a /dev/random or /dev/urandom (on purpose or because the necessary patches +# were not applied). The random number generator will be seeded from a file +# which is either a file pointed to by the environment variable $RANDFILE +# or $HOME/.rnd. If neither exists, the pseudo random number generator will +# be initialized and a warning will be issued. +# Values: 0=only seed from /dev/[u]random, 1=also seed from weak randomness + +#allow_weak_random_seed=1 + + + +# INCLUDE CONFIG FILE +# This directive allows you to include definitions from an external config file. 
+ +#include= + + + +# INCLUDE CONFIG DIRECTORY +# This directive allows you to include definitions from config files (with a +# .cfg extension) in one or more directories (with recursion). + +include_dir=/etc/nrpe.d/ + + + +# COMMAND DEFINITIONS +# Command definitions that this daemon will run. Definitions +# are in the following format: +# +# command[]= +# +# When the daemon receives a request to return the results of +# it will execute the command specified by the argument. +# +# Unlike Nagios, the command line cannot contain macros - it must be +# typed exactly as it should be executed. +# +# Note: Any plugins that are used in the command lines must reside +# on the machine that this daemon is running on! The examples below +# assume that you have plugins installed in a /usr/local/nagios/libexec +# directory. Also note that you will have to modify the definitions below +# to match the argument format the plugins expect. Remember, these are +# examples only! + + +# The following examples use hardcoded command arguments... 
+ +command[check_users]={{ libdir }}/nagios/plugins/check_users -w 5 -c 10 +command[check_load]={{ libdir }}/nagios/plugins/check_load -w 15,10,5 -c 30,25,20 +command[check_hda1]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /dev/hda1 +{% if inventory_hostname not in groups['zombie-infested'] %} +command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 5 -c 10 -s Z +{% else %} +# This host is prone to Zombies and we do not care or want to alert on it so we make the limits very high +command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 50000 -c 100000 -s Z +{% endif %} +command[check_total_procs]={{ libdir }}/nagios/plugins/check_procs -w {{ nrpe_procs_warn }} -c {{ nrpe_procs_crit }} + + +# The following examples allow user-supplied arguments and can +# only be used if the NRPE daemon was compiled with support for +# command arguments *AND* the dont_blame_nrpe directive in this +# config file is set to '1'. This poses a potential security risk, so +# make sure you read the SECURITY file before doing this. + +#command[check_users]=/usr/lib64/nagios/plugins/check_users -w $ARG1$ -c $ARG2$ +#command[check_load]=/usr/lib64/nagios/plugins/check_load -w $ARG1$ -c $ARG2$ +#command[check_disk]=/usr/lib64/nagios/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$ +#command[check_procs]=/usr/lib64/nagios/plugins/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$ + + +# NEVER ADD ANYTHING HERE - ANY ENTRIES TO NRPE SHOULD BE in .cfg files in /etc/nrpe.d/ + +# NEVER NEVER NEVER +# diff --git a/roles/nagios_server/README.rst b/roles/nagios_server/README.rst new file mode 100644 index 0000000000..3152303bde --- /dev/null +++ b/roles/nagios_server/README.rst @@ -0,0 +1,78 @@ +=================================== + Nagios 4 Configuration for Fedora +=================================== + +The Fedora Infrastructure Nagios is built on a set of configurations +originally written for Nagios 2 and then upgraded over time to Nagios +3 and then 4.08. 
With additional changes made in the 4.2 series of +Nagios this needed a better rewrite as various parts came from +pre-puppet and then various puppet modules added on top. + +In order to get this rewrite done, we will use as much of the original +layout of the Fedora ansible nagios module but with rewrites to better +match current Nagios configurations so that it can be maintained. + +Role directory layout +===================== +The original layout branched out from + + roles/nagios/client/ + roles/nagios/server/ + +With the usual trees below this. This breaks ansible best practices +and how most new modules are set up so the rewrite uses: + + roles/nagios_client/ + roles/nagios_server/ + +===================== + Nagios Server Files +===================== + +The Nagios Server Files require a large layout change. The original +Nagios system used multiple independent modes and files which caused +problems when hosts were removed. The new system will use hosts set up +from the Fedora Ansible Inventory with hostgroups set up to match +groups. + + roles/nagios_server/{files,handlers,tasks,templates} + + r.../n.../files/httpd ==> /etc/httpd/conf.d files + r.../n.../files/nagios ==> /etc/nagios/ files + r.../n.../files/nagios/commands command files + r.../n.../files/nagios/hosts host files + r.../n.../files/nagios/hostgroups groups made from hosts + r.../n.../files/nagios/services services + r.../n.../files/nagios/servicegroups groups made from services + r.../n.../files/nagios/contacts files for people + r.../n.../files/nagios/contactgroups groups made from contacts + + similar layout for templates + handlers has the ways to restart and check configuration + tasks has the main rules for building stuff. + +=================== +Nagios Module Steps +=================== + +1. Check to see if the nagios user is configured. Someone years ago + chose that our monitoring uses UID/GID 420. Har Har. + Setup any other groups and permissions +2. 
Install the needed packages for the server. +3. Setup the directories on the server + /etc/nagios/{child} +4. Synchronise over the static files + /etc/nagios/commands/ + /etc/nagios/services/ + /etc/nagios/servicegroups/ + /etc/nagios/contacts/ + /etc/nagios/contactgroups/ + /usr/lib64/nagios/plugins/ + /usr/local/bin + /usr/share/nagios/html/ +5. Build template files + /etc/nagios/commands/ + /etc/nagios/hosts/{ansible-inventory, ansible-vars, other} + /etc/nagios/hostgroups/ +6. Fix selinux policy +7. Restart services diff --git a/roles/nagios_server/files/httpd/nagios.conf b/roles/nagios_server/files/httpd/nagios.conf new file mode 100644 index 0000000000..79dd734ff6 --- /dev/null +++ b/roles/nagios_server/files/httpd/nagios.conf @@ -0,0 +1,36 @@ +# noc1 +ScriptAlias /nagios/cgi-bin/ /usr/lib64/nagios/cgi-bin/ + +# noc2 +ScriptAlias /nagios-external/cgi-bin/ /usr/lib64/nagios/cgi-bin/ + +# test +ScriptAlias /nagios-just-a-test/cgi-bin/ /usr/lib64/nagios/cgi-bin/ + +ScriptAlias /tac.cgi /usr/lib64/nagios/cgi-bin/tac.cgi + + + AuthName "Nagios GSSAPI Login" + GssapiCredStore keytab:/etc/krb5.HTTP_admin.fedoraproject.org.keytab + AuthType GSSAPI + # This is off because Apache (and thus mod_auth_gssapi) doesn't know this is proxied over TLS + GssapiSSLonly Off + GssapiLocalName on + Require valid-user + + + + Options ExecCGI + + + + Options None + + +Alias /nagios /usr/share/nagios/html/ + +# This will only affect noc2 because the proxies only forward -external to it. 
+Alias /nagios-external /usr/share/nagios/html/ + +# Test +Alias /nagios-test /usr/share/nagios/html/ diff --git a/roles/nagios_server/files/nagios/commands/bzr.cfg b/roles/nagios_server/files/nagios/commands/bzr.cfg new file mode 100644 index 0000000000..f6dcaffa80 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/bzr.cfg @@ -0,0 +1,8 @@ +# 'check_bzr' command definition +# I'd like this to actually interact with BZR, but I can't find any +# proper documentation on the protocol to craft send/expect/quit +# strings. +define command{ + command_name check_bzr + command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 4155 +} diff --git a/roles/nagios_server/files/nagios/commands/disk.cfg b/roles/nagios_server/files/nagios/commands/disk.cfg new file mode 100644 index 0000000000..2c0bb52221 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/disk.cfg @@ -0,0 +1,15 @@ +define command { + command_name check_by_ssh_check_raid + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_raid.py" +} + +define command { + command_name check_by_ssh_check_disk + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_disk -w $ARG1$% -c $ARG2$% -p $ARG3$" +} + +# 'check_postgres_conns' command definition +define command{ + command_name check_postgres_conns + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_procs -u postgres -w $ARG1$ -c $ARG2$ -a $ARG3$" +} diff --git a/roles/nagios_server/files/nagios/commands/dns.cfg b/roles/nagios_server/files/nagios/commands/dns.cfg new file mode 100644 index 0000000000..88b52e52b5 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/dns.cfg @@ -0,0 +1,11 @@ +# 'check_dns' command definition +define command{ + command_name check_dns + command_line $USER1$/check_dns -H www.yahoo.com -s $HOSTADDRESS$ + } + +# 'check_dns_fpo' command definition +define command{ + command_name check_dns_fpo + command_line $USER1$/check_dns -t 30 -H fedoraproject.org -A -s $HOSTADDRESS$ 
+ } diff --git a/roles/nagios_server/files/nagios/commands/git.cfg b/roles/nagios_server/files/nagios/commands/git.cfg new file mode 100644 index 0000000000..070fcb2b2e --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/git.cfg @@ -0,0 +1,8 @@ +# 'check_git' command definition +# I'd like this to actually interact with GIT, but I can't find any +# proper documentation on the protocol to craft send/expect/quit +# strings. +define command{ + command_name check_git + command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 9418 +} diff --git a/roles/nagios_server/files/nagios/commands/httpd.cfg b/roles/nagios_server/files/nagios/commands/httpd.cfg new file mode 100644 index 0000000000..74e65e67b4 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/httpd.cfg @@ -0,0 +1,79 @@ +## +## This file has the commands to check and restart general httpd services +## and websites. +## + +################################################################################ +# COMMAND DEFINITIONS +# +# SYNTAX: +# +# define command{ +# template +# name +# command_name +# command_line +# } +# +# WHERE: +# +# = object name of another command definition that should be +# used as a template for this definition (optional) +# = object name of command definition, referenced by other +# command definitions that use it as a template (optional) +# = name of the command, as recognized/used by Nagios +# = command line +# +################################################################################ + +# 'reload httpd' +define command { + command_name restart_httpd + command_line $USER1$/restart_httpd $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ "$HOSTALIAS$" "$SERVICEDESC$" "$SERVICESTATE$" +} + + +# +# 'check_website_publiclist' command definition +define command{ + command_name check_website_publiclist + command_line $USER1$/check_http -w 60 -c 80 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$" +} + +# 'check_website' command definition +define command{ + 
command_name check_website + command_line $USER1$/check_http -w 30 -c 40 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$" +} + +define command{ + command_name check_website_ppc + command_line $USER1$/check_http -w 300 -c 400 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$" +} + +define command{ + command_name check_website_ssl + command_line $USER1$/check_http -w 30 -c 40 --ssl -I $HOSTADDRESS$ -H $ARG1$ -u $ARG2$ -s "$ARG3$" +} + +define command{ + command_name check_ssl_cert + command_line $USER1$/check_http -I $HOSTADDRESS$ -H $ARG1$ -C $ARG2$ +} + +define command{ + command_name check_website_publiclist_ssl + command_line $USER1$/check_http -w 40 -c 60 --ssl -I $HOSTADDRESS$ -H $ARG1$ -u $ARG2$ -s "$ARG3$" +} + +# 'check_http' command definition +define command{ + command_name check_http + command_line $USER1$/check_http -H $HOSTADDRESS$ +} + +# 'check_https' command definition +define command{ + command_name check_https + command_line $USER1$/check_http -H $HOSTADDRESS$ --ssl +} diff --git a/roles/nagios_server/files/nagios/commands/koji.cfg b/roles/nagios_server/files/nagios/commands/koji.cfg new file mode 100644 index 0000000000..03ab1d33f7 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/koji.cfg @@ -0,0 +1,29 @@ +################################################################################ +# COMMAND DEFINITIONS +# +# SYNTAX: +# +# define command{ +# template +# name +# command_name +# command_line +# } +# +# WHERE: +# +# = object name of another command definition that should be +# used as a template for this definition (optional) +# = object name of command definition, referenced by other +# command definitions that use it as a template (optional) +# = name of the command, as recognized/used by Nagios +# = command line +# +################################################################################ + +# 'check_koji' +define command{ + command_name check_koji + command_line $USER1$/check_koji +} + diff --git 
a/roles/nagios_server/files/nagios/commands/local.cfg b/roles/nagios_server/files/nagios/commands/local.cfg new file mode 100644 index 0000000000..37e7ba95d0 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/local.cfg @@ -0,0 +1,36 @@ +# 'check_local_disk' command definition +define command{ + command_name check_local_disk + command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$ + } + +# 'check_local_load' command definition +define command{ + command_name check_local_load + command_line $USER1$/check_load -w $ARG1$ -c $ARG2$ + } + +# 'check_local_procs' command definition +define command{ + command_name check_local_procs + command_line $USER1$/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$ + } + +# 'check_local_users' command definition +define command{ + command_name check_local_users + command_line $USER1$/check_users -w $ARG1$ -c $ARG2$ + } + +# 'check_local_swap' command definition +define command{ + command_name check_local_swap + command_line $USER1$/check_swap -w $ARG1$ -c $ARG2$ + } + +# 'check_local_mrtgtraf' command definition +define command{ + command_name check_local_mrtgtraf + command_line $USER1$/check_mrtgtraf -F $ARG1$ -a $ARG2$ -w $ARG3$ -c $ARG4$ -e $ARG5$ + } + diff --git a/roles/nagios_server/files/nagios/commands/misc.cfg b/roles/nagios_server/files/nagios/commands/misc.cfg new file mode 100644 index 0000000000..4c0e4da4c3 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/misc.cfg @@ -0,0 +1,96 @@ +################################################################################ +# COMMAND DEFINITIONS +# +# SYNTAX: +# +# define command{ +# template +# name +# command_name +# command_line +# } +# +# WHERE: +# +# = object name of another command definition that should be +# used as a template for this definition (optional) +# = object name of command definition, referenced by other +# command definitions that use it as a template (optional) +# = name of the command, as recognized/used by Nagios +# = command line +# 
+################################################################################ + +define command{ + command_name true + command_line /bin/true +} + +define command{ + command_name check_dummy + command_line $USER1$/check_dummy $ARG1$ $ARG2$ +} + +# 'check_tape' +define command{ + command_name check_tape + command_line $USER1$/check_tape +} + +# 'check_ftp' command definition +define command{ + command_name check_ftp + command_line $USER1$/check_ftp -H $HOSTADDRESS$ + } + + +# 'check_hpjd' command definition +define command{ + command_name check_hpjd + command_line $USER1$/check_hpjd -H $HOSTADDRESS$ -C public + } + +# 'check_snmp' command definition +define command{ + command_name check_snmp + command_line $USER1$/check_snmp -H $HOSTADDRESS$ $ARG1$ + } + + +# 'check_nntp' command definition +define command{ + command_name check_nntp + command_line $USER1$/check_nntp -H $HOSTADDRESS$ + } + + +# 'check_telnet' command definition +define command{ + command_name check_telnet + command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 23 + } + +# 'check_dhcp' command definition +define command{ + command_name check_dhcp + command_line $USER1$/check_dhcp $ARG1$ + } + +# 'check_pop' command definition +define command{ + command_name check_pop + command_line $USER1$/check_pop -H $HOSTADDRESS$ + } + +# 'check_imap' command definition +define command{ + command_name check_imap + command_line $USER1$/check_imap -H $HOSTADDRESS$ $ARG1$ + } + +# 'check_nt' command definition +define command{ + command_name check_nt + command_line $USER1$/check_nt -H $HOSTADDRESS$ -p 12489 -v $ARG1$ $ARG2$ + } + diff --git a/roles/nagios_server/files/nagios/commands/notify.cfg b/roles/nagios_server/files/nagios/commands/notify.cfg new file mode 100644 index 0000000000..867b387ef0 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/notify.cfg @@ -0,0 +1,87 @@ +################################################################################ +# +# SAMPLE NOTIFICATION COMMANDS +# +# These are 
some example notification commands. They may or may not work on +# your system without modification. As an example, some systems will require +# you to use "/usr/bin/mailx" instead of "/usr/bin/mail" in the commands below. +# +################################################################################ + +# 'host-notify-by-email' command definition +define command{ + command_name host-notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\nSource: $$(hostname)\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" $CONTACTEMAIL$ + } + +# 'notify-service-by-email' command definition +define command{ + command_name notify-service-by-email + command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$ + } + +# 'notify-by-epager' command definition +define command{ + command_name notify-by-epager + command_line /usr/bin/printf "%b" "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nInfo: $SERVICEOUTPUT$\nSource: $$(hostname -s)\nDate: $LONGDATETIME$" | /bin/mail -s "$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$" $CONTACTPAGER$ + } + + +# 'host-notify-by-epager' command definition +define command{ + command_name host-notify-by-epager + command_line /usr/bin/printf "%b" "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nSource: $$(hostname -s)\nTime: $LONGDATETIME$" | /bin/mail -s "$NOTIFICATIONTYPE$ alert - Host $HOSTNAME$ is $HOSTSTATE$" $CONTACTPAGER$ + } + +# 'host-notify-by-ircbot' command definition +define command{ + command_name host-notify-by-ircbot + 
command_line /usr/bin/printf "%b" "#fedora-noc $NOTIFICATIONTYPE$ - $HOSTALIAS$ is $HOSTSTATE$: $HOSTOUTPUT$ ($$(hostname -s)) $HOSTACKAUTHOR$ $SERVICEACKAUTHOR$" | /usr/local/bin/irc-colorize.py | nc -w 1 value01 5050 + } + +# 'notify-by-email' command definition +define command{ + command_name notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\nSource: $$(hostname)\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$ + } + +# 'notify-by-ircbot' command definition +define command{ + command_name notify-by-ircbot + command_line /usr/bin/printf "%b" "#fedora-noc $NOTIFICATIONTYPE$ - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$: $SERVICEOUTPUT$ ($$(hostname -s)) $HOSTACKAUTHOR$ $SERVICEACKAUTHOR$" | /usr/local/bin/irc-colorize.py | nc -w 1 value01 5050 + } + +# 'host-notify-by-fedmsg' command definition +define command{ + command_name host-notify-by-fedmsg + command_line /usr/bin/echo '{"type": "$NOTIFICATIONTYPE$", "host": "$HOSTALIAS$", "state": "$HOSTSTATE$", "output": "$HOSTOUTPUT$", "host_ack_author": "$HOSTACKAUTHOR$", "service_ack_author": "$SERVICEACKAUTHOR$"}' | fedmsg-logger --cert-prefix nagios --modname nagios --topic host.state.change --json-input + } + +# 'notify-by-epager' command definition +define command{ + command_name notify-by-epager + command_line /usr/bin/printf "%b" "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nInfo: $SERVICEOUTPUT$\nSource: $$(hostname -s)\nDate: $LONGDATETIME$" | /bin/mail -s "$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$" $CONTACTPAGER$ + } + + +# 'notify-by-fedmsg' command definition +define command{ + command_name notify-by-fedmsg + command_line /usr/bin/echo '{"type": "$NOTIFICATIONTYPE$", "host": "$HOSTALIAS$", "state": 
"$SERVICESTATE$", "service": "$SERVICEDESC$", "output": "$SERVICEOUTPUT$", "host_ack_author": "$HOSTACKAUTHOR$", "service_ack_author": "$SERVICEACKAUTHOR$"}' | fedmsg-logger --cert-prefix nagios --modname nagios --topic service.state.change --json-input + } + +# 'notify-by-xmpp' command definition +define command{ + command_name notify-by-xmpp + command_line /usr/local/bin/xmppsend -a /etc/nagios/private/xmppnagios.ini "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nInfo: $SERVICEOUTPUT$\nDate: $LONGDATETIME$" $CONTACTEMAIL$ + } + + +# 'host-notify-by-xmpp' command definition +define command{ + command_name host-notify-by-xmpp + command_line /usr/local/bin/xmppsend -a /etc/nagios/private/xmppnagios.ini "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nDate: $LONGDATETIME$" $CONTACTEMAIL$ + } + + + diff --git a/roles/nagios_server/files/nagios/commands/nrpe.cfg b/roles/nagios_server/files/nagios/commands/nrpe.cfg new file mode 100644 index 0000000000..01bf14099a --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/nrpe.cfg @@ -0,0 +1,17 @@ +# 'test nrpe' +define command{ + command_name test_nrpe + command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$ + +} +# 'check by nrpe' +define command{ + command_name check_by_nrpe + command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$ -c $ARG1$ +} + +# 'check-host-alive-nrpe' is better for hosts that are on vpn. 
+define command{ + command_name check-host-alive-nrpe + command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$ + } diff --git a/roles/nagios_server/files/nagios/commands/perfdata.cfg b/roles/nagios_server/files/nagios/commands/perfdata.cfg new file mode 100644 index 0000000000..a3dd4f7878 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/perfdata.cfg @@ -0,0 +1,26 @@ +################################################################################ +# +# SAMPLE PERFORMANCE DATA COMMANDS +# +# These are sample performance data commands that can be used to send performance +# data output to two text files (one for hosts, another for services). If you +# plan on simply writing performance data out to a file, consider using the +# host_perfdata_file and service_perfdata_file options in the main config file. +# +################################################################################ + + +# 'process-host-perfdata' command definition +define command{ + command_name process-host-perfdata + command_line /usr/bin/printf "%b" "$LASTHOSTCHECK$\t$HOSTNAME$\t$HOSTSTATE$\t$HOSTATTEMPT$\t$HOSTSTATETYPE$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$\n" >> /var/log/nagios/host-perfdata.out + } + + +# 'process-service-perfdata' command definition +define command{ + command_name process-service-perfdata + command_line /usr/bin/printf "%b" "$LASTSERVICECHECK$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICESTATE$\t$SERVICEATTEMPT$\t$SERVICESTATETYPE$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$\n" >> /var/log/nagios/service-perfdata.out + } + + diff --git a/roles/nagios_server/files/nagios/commands/ping.cfg b/roles/nagios_server/files/nagios/commands/ping.cfg new file mode 100644 index 0000000000..86a7010eb7 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/ping.cfg @@ -0,0 +1,31 @@ +# This command checks to see if a host is "alive" by pinging it +# The check must result in a 100% packet loss or 5 second (5000ms) round trip +#
average time to produce a critical error. +# Note: The number of ICMP echo packets sent is set by the '-p' argument (5 or 2 below) + +# 'check-host-alive' command definition +define command{ + command_name check-host-alive + command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5 + } + +define command{ + command_name check-host-alive4 + command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 2 + } + +define command{ + command_name check-host-alive6 + command_line $USER1$/check_ping -6 -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 2 + } + +# 'check_ping' command definition +define command{ + command_name check_ping4 + command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5 + } + +define command{ + command_name check_ping6 + command_line $USER1$/check_ping -6 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5 + } diff --git a/roles/nagios_server/files/nagios/commands/postgres.cfg b/roles/nagios_server/files/nagios/commands/postgres.cfg new file mode 100644 index 0000000000..6a4a6217f7 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/postgres.cfg @@ -0,0 +1,5 @@ +# 'pgsql' +define command{ + command_name check_pgsql + command_line $USER1$/check_pgsql -H $HOSTADDRESS$ -d $ARG1$ -p '{{nagios_db_user_password}}' --logname 'nagiosuser' +} diff --git a/roles/nagios_server/files/nagios/commands/rsyslog.cfg b/roles/nagios_server/files/nagios/commands/rsyslog.cfg new file mode 100644 index 0000000000..df6e0b7784 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/rsyslog.cfg @@ -0,0 +1,28 @@ +################################################################################ +# COMMAND DEFINITIONS +# +# SYNTAX: +# +# define command{ +# template <templatename> +# name <objectname> +# command_name <commandname> +# command_line <commandline> +# } +# +# WHERE: +# +# <templatename> = object name of another command definition that should be +# used as a template for this definition (optional) +# <objectname> = object name of command definition, referenced by other +# command
definitions that use it as a template (optional) +# <commandname> = name of the command, as recognized/used by Nagios +# <commandline> = command line +# +################################################################################ + + +define command { + command_name restart_rsyslog + command_line $USER1$/restart_rsyslog $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ "$HOSTALIAS$" "$SERVICEDESC$" "$SERVICESTATE$" +} diff --git a/roles/nagios_server/files/nagios/commands/smtp.cfg b/roles/nagios_server/files/nagios/commands/smtp.cfg new file mode 100644 index 0000000000..c3d53adcb4 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/smtp.cfg @@ -0,0 +1,12 @@ +# 'check_smtp' command definition +define command{ + command_name check_smtp + command_line $USER1$/check_smtp -H $HOSTADDRESS$ + } + + +# 'check_email_delivery' command definition +define command{ + command_name check_email_delivery + command_line $USER1$/check_email_delivery_epn -H $ARG1$ --mailto $ARG2$ --mailfrom $ARG3$ --username $ARG4$ --password $ARG5$ -w $ARG6$ -c $ARG7$ +} diff --git a/roles/nagios_server/files/nagios/commands/ssh.cfg b/roles/nagios_server/files/nagios/commands/ssh.cfg new file mode 100644 index 0000000000..6553347890 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/ssh.cfg @@ -0,0 +1,22 @@ +# 'check_ssh' command definition +define command{ + command_name check_ssh + command_line $USER1$/check_ssh -H $HOSTADDRESS$ +} + + +define command { + command_name check_by_ssh_check_raid + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_raid.py" +} + +define command { + command_name check_by_ssh_check_disk + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_disk -w $ARG1$% -c $ARG2$% -p $ARG3$" +} + +# 'check_postgres_conns' command definition +define command{ + command_name check_postgres_conns + command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_procs -u postgres -w $ARG1$ -c $ARG2$ -a $ARG3$" +} diff --git
a/roles/nagios_server/files/nagios/commands/tcp.cfg b/roles/nagios_server/files/nagios/commands/tcp.cfg new file mode 100644 index 0000000000..d2268ab4de --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/tcp.cfg @@ -0,0 +1,6 @@ + +# 'check_tcp' command definition +define command{ + command_name check_tcp + command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ +} diff --git a/roles/nagios_server/files/nagios/commands/testcloud.cfg b/roles/nagios_server/files/nagios/commands/testcloud.cfg new file mode 100644 index 0000000000..7e2c235343 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/testcloud.cfg @@ -0,0 +1,5 @@ +# 'check_testcloud' +define command{ + command_name check_testcloud + command_line $USER1$/check_testcloud +} diff --git a/roles/nagios_server/files/nagios/commands/udp.cfg b/roles/nagios_server/files/nagios/commands/udp.cfg new file mode 100644 index 0000000000..6c2cfbe6d6 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/udp.cfg @@ -0,0 +1,5 @@ +# 'check_udp' command definition +define command{ + command_name check_udp + command_line $USER1$/check_udp -H $HOSTADDRESS$ -p $ARG1$ + } diff --git a/roles/nagios_server/files/nagios/commands/unbound.cfg b/roles/nagios_server/files/nagios/commands/unbound.cfg new file mode 100644 index 0000000000..aa57db5a04 --- /dev/null +++ b/roles/nagios_server/files/nagios/commands/unbound.cfg @@ -0,0 +1,12 @@ +# 'check_unbound_80' command definition +define command{ + command_name check_unbound_80 + command_line $USER1$/check_dig -H $HOSTADDRESS$ -w 5 -c 9 -p 80 -l $ARG1$ -A "+tcp" + } + + +# 'check_unbound_443' command definition +define command{ + command_name check_unbound_443 + command_line $USER1$/check_dig_ssl -H $HOSTADDRESS$ -w 5 -c 9 -p 443 -L $ARG1$ -l $ARG2$ -A "+tcp" + } diff --git a/roles/nagios_server/files/nagios/configs/escalations.cfg b/roles/nagios_server/files/nagios/configs/escalations.cfg new file mode 100644 index 0000000000..0232815da0 --- /dev/null +++ 
b/roles/nagios_server/files/nagios/configs/escalations.cfg @@ -0,0 +1,22 @@ +define hostescalation{ + host_name * + hostgroup_name * + contact_groups fedora-sysadmin-email,fedora-sysadmin-emergency,fedora-sysadmin-ircbot + first_notification 2 + last_notification 0 + notification_interval 60 + escalation_period 24x7 + escalation_options d,u,r +} + + +define serviceescalation{ + host_name * + service_description * + contact_groups fedora-sysadmin-email,fedora-sysadmin-emergency,fedora-sysadmin-ircbot + first_notification 2 + last_notification 0 + notification_interval 60 + escalation_period 24x7 + escalation_options w,u,c,r +} diff --git a/roles/nagios_server/files/nagios/configs/minimal.cfg b/roles/nagios_server/files/nagios/configs/minimal.cfg new file mode 100644 index 0000000000..c41bec4d91 --- /dev/null +++ b/roles/nagios_server/files/nagios/configs/minimal.cfg @@ -0,0 +1,362 @@ +############################################################################### +# MINIMAL.CFG +# +# MINIMALISTIC OBJECT CONFIG FILE (Template-Based Object File Format) +# +# Last Modified: 08-10-2005 +# +# +# NOTE: This config file is intended to be used to test a Nagios installation +# that has been compiled with support for the template-based object +# configuration files. +# +# This config file is intended to serve as an *extremely* simple +# example of how you can create your object configuration file(s). +# If you're interested in more complex object configuration files for +# Nagios, look in the sample-config/template-object/ subdirectory of +# the distribution.
+# +############################################################################### + + + +############################################################################### +############################################################################### +# +# TIME PERIODS +# +############################################################################### +############################################################################### + +# This defines a timeperiod where all times are valid for checks, +# notifications, etc. The classic "24x7" support nightmare. :-) + +define timeperiod{ + timeperiod_name 24x7 + alias 24 Hours A Day, 7 Days A Week + sunday 00:00-24:00 + monday 00:00-24:00 + tuesday 00:00-24:00 + wednesday 00:00-24:00 + thursday 00:00-24:00 + friday 00:00-24:00 + saturday 00:00-24:00 + } + + + + +############################################################################### +############################################################################### +# +# COMMANDS +# +############################################################################### +############################################################################### + +# This is a sample service notification command that can be used to send email +# notifications (about service alerts) to contacts. +# 'notify-by-email' command definition +define command{ + command_name notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$OUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$ + } + + +# This is a sample host notification command that can be used to send email +# notifications (about host alerts) to contacts.
+ +define command{ + command_name host-notify-by-email + command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $OUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" $CONTACTEMAIL$ + } + + +# Command to check to see if a host is "alive" (up) by pinging it + +define command{ + command_name check-host-alive + command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w 300,99% -c 500,100% -p 2 + } + + +# Generic command to check a device by pinging it + +define command{ + command_name check_ping + command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5 + } + + +# Command used to check disk space usage on local partitions + +define command{ + command_name check_local_disk + command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$ + } + + +# Command used to check the number of currently logged in users on the +# local machine + +define command{ + command_name check_local_users + command_line $USER1$/check_users -w $ARG1$ -c $ARG2$ + } + + +# Command to check the number of running processes on the local machine + +define command{ + command_name check_local_procs + command_line $USER1$/check_procs -w $ARG1$ -c $ARG2$ + } + + +# Command to check the load on the local machine + +define command{ + command_name check_local_load + command_line $USER1$/check_load -w $ARG1$ -c $ARG2$ + } + + + +############################################################################### +############################################################################### +# +# CONTACTS +# +############################################################################### +############################################################################### + +# In this simple config file, a single contact will receive all alerts. +# This assumes that you have an account (or email alias) called +# "nagios-admin" on the local host.
+ +define contact{ + contact_name nagios-admin + alias Nagios Admin + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email admin@fedoraproject.org + } + + + +############################################################################### +############################################################################### +# +# CONTACT GROUPS +# +############################################################################### +############################################################################### + +# We only have one contact in this simple configuration file, so there is +# no need to create more than one contact group. + +define contactgroup{ + contactgroup_name admins + alias Nagios Administrators + members nagios-admin + } + + + +############################################################################### +############################################################################### +# +# HOSTS +# +############################################################################### +############################################################################### + +# Generic host definition template - This is NOT a real host, just a template! + +define host{ + name generic-host ; The name of this host template + notifications_enabled 1 ; Host notifications are enabled + event_handler_enabled 1 ; Host event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + failure_prediction_enabled 1 ; Failure prediction is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information across program restarts + retain_nonstatus_information 1 ; Retain non-status information across program restarts + register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE! 
+ } + + +# Since this is a simple configuration file, we only monitor one host - the +# local host (this machine). + +define host{ + use generic-host ; Name of host template to use + host_name localhost + alias localhost + address 127.0.0.1 + check_command check-host-alive + max_check_attempts 10 + notification_interval 120 + notification_period 24x7 + notification_options d,r + contact_groups admins + } + + + +############################################################################### +############################################################################### +# +# HOST GROUPS +# +############################################################################### +############################################################################### + +# We only have one host in our simple config file, so there is no need to +# create more than one hostgroup. + +define hostgroup{ + hostgroup_name test + alias Test Servers + members localhost + } + + + +############################################################################### +############################################################################### +# +# SERVICES +# +############################################################################### +############################################################################### + +# Generic service definition template - This is NOT a real service, just a template! 
+ +define service{ + name generic-service ; The 'name' of this service template + active_checks_enabled 1 ; Active service checks are enabled + passive_checks_enabled 1 ; Passive service checks are enabled/accepted + parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems) + obsess_over_service 1 ; We should obsess over this service (if necessary) + check_freshness 0 ; Default is to NOT check service 'freshness' + notifications_enabled 1 ; Service notifications are enabled + event_handler_enabled 1 ; Service event handler is enabled + flap_detection_enabled 1 ; Flap detection is enabled + failure_prediction_enabled 1 ; Failure prediction is enabled + process_perf_data 1 ; Process performance data + retain_status_information 1 ; Retain status information across program restarts + retain_nonstatus_information 1 ; Retain non-status information across program restarts + register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE! + } + + +# Define a service to "ping" the local machine + +define service{ + use generic-service ; Name of service template to use + host_name localhost + service_description PING + is_volatile 0 + check_period 24x7 + max_check_attempts 4 + normal_check_interval 5 + retry_check_interval 1 + contact_groups admins + notification_options w,u,c,r + notification_interval 960 + notification_period 24x7 + check_command check_ping!100.0,20%!500.0,60% + } + + +# Define a service to check the disk space of the root partition +# on the local machine. Warning if < 20% free, critical if +# < 10% free space on partition. 
+ +define service{ + use generic-service ; Name of service template to use + host_name localhost + service_description Root Partition + is_volatile 0 + check_period 24x7 + max_check_attempts 4 + normal_check_interval 5 + retry_check_interval 1 + contact_groups admins + notification_options w,u,c,r + notification_interval 960 + notification_period 24x7 + check_command check_local_disk!20%!10%!/ + } + + + +# Define a service to check the number of currently logged in +# users on the local machine. Warning if > 20 users, critical +# if > 50 users. + +define service{ + use generic-service ; Name of service template to use + host_name localhost + service_description Current Users + is_volatile 0 + check_period 24x7 + max_check_attempts 4 + normal_check_interval 5 + retry_check_interval 1 + contact_groups admins + notification_options w,u,c,r + notification_interval 960 + notification_period 24x7 + check_command check_local_users!20!50 + } + + +# Define a service to check the number of currently running procs +# on the local machine. Warning if > 250 processes, critical if +# > 400 processes. + +define service{ + use generic-service ; Name of service template to use + host_name localhost + service_description Total Processes + is_volatile 0 + check_period 24x7 + max_check_attempts 4 + normal_check_interval 5 + retry_check_interval 1 + contact_groups admins + notification_options w,u,c,r + notification_interval 960 + notification_period 24x7 + check_command check_local_procs!250!400 + } + + + +# Define a service to check the load on the local machine.
+ +define service{ + use generic-service ; Name of service template to use + host_name localhost + service_description Current Load + is_volatile 0 + check_period 24x7 + max_check_attempts 4 + normal_check_interval 5 + retry_check_interval 1 + contact_groups admins + notification_options w,u,c,r + notification_interval 960 + notification_period 24x7 + check_command check_local_load!5.0,4.0,3.0!10.0,6.0,4.0 + } + + + +# EOF diff --git a/roles/nagios_server/files/nagios/configs/nagios.cfg b/roles/nagios_server/files/nagios/configs/nagios.cfg new file mode 100644 index 0000000000..b6110acdae --- /dev/null +++ b/roles/nagios_server/files/nagios/configs/nagios.cfg @@ -0,0 +1,1349 @@ +############################################################################## +# +# NAGIOS.CFG - Sample Main Config File for Nagios 4.0.8 +# +# Read the documentation for more information on this configuration +# file. I've provided some comments here, but things may not be so +# clear without further explanation. +# +# +############################################################################## + + +# LOG FILE +# This is the main log file where service and host events are logged +# for historical purposes. This should be the first option specified +# in the config file!!! + +log_file=/var/log/nagios/nagios.log + + + +# OBJECT CONFIGURATION FILE(S) +# These are the object configuration files in which you define hosts, +# host groups, contacts, contact groups, services, etc. +# You can split your object definitions across several config files +# if you wish (as shown below), or keep them all in a single config file. 
+ +# You can specify individual object config files as shown below: +#cfg_file=/etc/nagios/objects/commands.cfg +#cfg_file=/etc/nagios/objects/contacts.cfg +cfg_file=/etc/nagios/timeperiods.cfg +#cfg_file=/etc/nagios/objects/templates.cfg +cfg_file=/etc/nagios/escalations.cfg +cfg_file=/etc/nagios/checkcommands.cfg +cfg_file=/etc/nagios/misccommands.cfg + +# Definitions for monitoring the local (Linux) host +#cfg_file=/etc/nagios/objects/localhost.cfg + +# Definitions for monitoring a Windows machine +#cfg_file=/etc/nagios/objects/windows.cfg + +# Definitions for monitoring a router/switch +#cfg_file=/etc/nagios/objects/switch.cfg + +# Definitions for monitoring a network printer +#cfg_file=/etc/nagios/objects/printer.cfg + + +# You can also tell Nagios to process all config files (with a .cfg +# extension) in a particular directory by using the cfg_dir +# directive as shown below: + +#cfg_dir=/etc/nagios/servers +#cfg_dir=/etc/nagios/printers +#cfg_dir=/etc/nagios/switches +#cfg_dir=/etc/nagios/routers + +cfg_dir=/etc/nagios/hosts +cfg_dir=/etc/nagios/hostgroups +cfg_dir=/etc/nagios/services +cfg_dir=/etc/nagios/contacts +cfg_dir=/etc/nagios/contactgroups +cfg_dir=/etc/nagios/servicegroups +cfg_dir=/etc/nagios/servicedeps + +cfg_dir=/etc/nagios/conf.d + + + + +# OBJECT CACHE FILE +# This option determines where object definitions are cached when +# Nagios starts/restarts. The CGIs read object definitions from +# this cache file (rather than looking at the object config files +# directly) in order to prevent inconsistencies that can occur +# when the config files are modified after Nagios starts. + +object_cache_file=/var/log/nagios/objects.cache + + + +# PRE-CACHED OBJECT FILE +# This option determines the location of the precached object file. +# If you run Nagios with the -p command line option, it will preprocess +# your object configuration file(s) and write the cached config to this +# file.
You can then start Nagios with the -u option to have it read +# object definitions from this precached file, rather than the standard +# object configuration files (see the cfg_file and cfg_dir options above). +# Using a precached object file can speed up the time needed to (re)start +# the Nagios process if you've got a large and/or complex configuration. +# Read the documentation section on optimizing Nagios to find out more +# about how this feature works. + +precached_object_file=/var/log/nagios/objects.precache + + + +# RESOURCE FILE +# This is an optional resource file that contains $USERx$ macro +# definitions. Multiple resource files can be specified by using +# multiple resource_file definitions. The CGIs will not attempt to +# read the contents of resource files, so information that is +# considered to be sensitive (usernames, passwords, etc) can be +# defined as macros in this file and restrictive permissions (600) +# can be placed on this file. + +resource_file=/etc/nagios/private/resource.cfg + + + +# STATUS FILE +# This is where the current status of all monitored services and +# hosts is stored. Its contents are read and processed by the CGIs. +# The contents of the status file are deleted every time Nagios +# restarts. + +status_file=/var/log/nagios/status.dat + + + +# STATUS FILE UPDATE INTERVAL +# This option determines the frequency (in seconds) that +# Nagios will periodically dump program, host, and +# service status data. + +status_update_interval=10 + + + +# NAGIOS USER +# This determines the effective user that Nagios should run as. +# You can either supply a username or a UID. + +nagios_user=nagios + + + +# NAGIOS GROUP +# This determines the effective group that Nagios should run as. +# You can either supply a group name or a GID. + +nagios_group=nagios + + + +# EXTERNAL COMMAND OPTION +# This option allows you to specify whether or not Nagios should check +# for external commands (in the command file defined below).
By default +# Nagios will *not* check for external commands, just to be on the +# cautious side. If you want to be able to use the CGI command interface +# you will have to enable this. +# Values: 0 = disable commands, 1 = enable commands + +check_external_commands=1 + + + +# EXTERNAL COMMAND FILE +# This is the file that Nagios checks for external command requests. +# It is also where the command CGI will write commands that are submitted +# by users, so it must be writeable by the user that the web server +# is running as (usually 'nobody'). Permissions should be set at the +# directory level instead of on the file, as the file is deleted every +# time its contents are processed. + +command_file=/var/spool/nagios/cmd/nagios.cmd + + + +# QUERY HANDLER INTERFACE +# This is the socket that is created for the Query Handler interface + +query_socket=/var/spool/nagios/cmd/nagios.qh + + + +# LOCK FILE +# This is the lockfile that Nagios will use to store its PID number +# in when it is running in daemon mode. + +lock_file=/var/run/nagios/nagios.pid + + + +# TEMP FILE +# This is a temporary file that is used as scratch space when Nagios +# updates the status log, cleans the comment file, etc. This file +# is created, used, and deleted throughout the time that Nagios is +# running. + +temp_file=/var/log/nagios/nagios.tmp + + + +# TEMP PATH +# This is the path where Nagios can create temp files for service and +# host check results, etc. + +temp_path=/tmp + + + +# EVENT BROKER OPTIONS +# Controls what (if any) data gets sent to the event broker. +# Values: 0 = Broker nothing +# -1 = Broker everything +# <other> = See documentation + +event_broker_options=-1 + + + +# EVENT BROKER MODULE(S) +# This directive is used to specify an event broker module that should +# be loaded by Nagios at startup. Use multiple directives if you want +# to load more than one module. Arguments that should be passed to +# the module at startup are separated from the module path by a space.
+# +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# WARNING !!! WARNING !!! WARNING !!! WARNING !!! WARNING !!! WARNING +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# +# Do NOT overwrite modules while they are being used by Nagios or Nagios +# will crash in a fiery display of SEGFAULT glory. This is a bug/limitation +# either in dlopen(), the kernel, and/or the filesystem. And maybe Nagios... +# +# The correct/safe way of updating a module is by using one of these methods: +# 1. Shutdown Nagios, replace the module file, restart Nagios +# 2. Delete the original module file, move the new module file into place, restart Nagios +# +# Example: +# +# broker_module=<modulepath> [moduleargs] + +#broker_module=/somewhere/module1.o +#broker_module=/somewhere/module2.o arg1 arg2=3 debug=0 + + + +# LOG ROTATION METHOD +# This is the log rotation method that Nagios should use to rotate +# the main log file. Values are as follows.. +# n = None - don't rotate the log +# h = Hourly rotation (top of the hour) +# d = Daily rotation (midnight every day) +# w = Weekly rotation (midnight on Saturday evening) +# m = Monthly rotation (midnight last day of month) + +log_rotation_method=d + + + +# LOG ARCHIVE PATH +# This is the directory where archived (rotated) log files should be +# placed (assuming you've chosen to do log rotation). + +log_archive_path=/var/log/nagios/archives + + + +# LOGGING OPTIONS +# If you want messages logged to the syslog facility, as well as the +# Nagios log file set this option to 1. If not, set it to 0. + +use_syslog=1 + + + +# NOTIFICATION LOGGING OPTION +# If you don't want notifications to be logged, set this value to 0. +# If notifications should be logged, set the value to 1. + +log_notifications=1 + + + +# SERVICE RETRY LOGGING OPTION +# If you don't want service check retries to be logged, set this value +# to 0. If retries should be logged, set the value to 1.
+ +log_service_retries=1 + + + +# HOST RETRY LOGGING OPTION +# If you don't want host check retries to be logged, set this value to +# 0. If retries should be logged, set the value to 1. + +log_host_retries=1 + + + +# EVENT HANDLER LOGGING OPTION +# If you don't want host and service event handlers to be logged, set +# this value to 0. If event handlers should be logged, set the value +# to 1. + +log_event_handlers=1 + + + +# INITIAL STATES LOGGING OPTION +# If you want Nagios to log all initial host and service states to +# the main log file (the first time the service or host is checked) +# you can enable this option by setting this value to 1. If you +# are not using an external application that does long term state +# statistics reporting, you do not need to enable this option. In +# this case, set the value to 0. + +log_initial_states=0 + + + +# CURRENT STATES LOGGING OPTION +# If you don't want Nagios to log all current host and service states +# after log has been rotated to the main log file, you can disable this +# option by setting this value to 0. Default value is 1. + +log_current_states=1 + + + +# EXTERNAL COMMANDS LOGGING OPTION +# If you don't want Nagios to log external commands, set this value +# to 0. If external commands should be logged, set this value to 1. +# Note: This option does not include logging of passive service +# checks - see the option below for controlling whether or not +# passive checks are logged. + +log_external_commands=1 + + + +# PASSIVE CHECKS LOGGING OPTION +# If you don't want Nagios to log passive host and service checks, set +# this value to 0. If passive checks should be logged, set +# this value to 1. + +log_passive_checks=1 + + + +# GLOBAL HOST AND SERVICE EVENT HANDLERS +# These options allow you to specify a host and service event handler +# command that is to be run for every host or service state change. 
+# The global event handler is executed immediately prior to the event +# handler that you have optionally specified in each host or +# service definition. The command argument is the short name of a +# command definition that you define in your host configuration file. +# Read the HTML docs for more information. + +#global_host_event_handler=somecommand +#global_service_event_handler=somecommand + + + +# SERVICE INTER-CHECK DELAY METHOD +# This is the method that Nagios should use when initially +# "spreading out" service checks when it starts monitoring. The +# default is to use smart delay calculation, which will try to +# space all service checks out evenly to minimize CPU load. +# Using the dumb setting will cause all checks to be scheduled +# at the same time (with no delay between them)! This is not a +# good thing for production, but is useful when testing the +# parallelization functionality. +# n = None - don't use any delay between checks +# d = Use a "dumb" delay of 1 second between checks +# s = Use "smart" inter-check delay calculation +# x.xx = Use an inter-check delay of x.xx seconds + +service_inter_check_delay_method=s + + + +# MAXIMUM SERVICE CHECK SPREAD +# This variable determines the timeframe (in minutes) from the +# program start time that an initial check of all services should +# be completed. Default is 30 minutes. + +max_service_check_spread=30 + + + +# SERVICE CHECK INTERLEAVE FACTOR +# This variable determines how service checks are interleaved. +# Interleaving the service checks allows for a more even +# distribution of service checks and reduced load on remote +# hosts. Setting this value to 1 is equivalent to how versions +# of Nagios previous to 0.0.5 did service checks. Set this +# value to s (smart) for automatic calculation of the interleave +# factor unless you have a specific reason to change it. 
+# s = Use "smart" interleave factor calculation +# x = Use an interleave factor of x, where x is a +# number greater than or equal to 1. + +service_interleave_factor=s + + + +# HOST INTER-CHECK DELAY METHOD +# This is the method that Nagios should use when initially +# "spreading out" host checks when it starts monitoring. The +# default is to use smart delay calculation, which will try to +# space all host checks out evenly to minimize CPU load. +# Using the dumb setting will cause all checks to be scheduled +# at the same time (with no delay between them)! +# n = None - don't use any delay between checks +# d = Use a "dumb" delay of 1 second between checks +# s = Use "smart" inter-check delay calculation +# x.xx = Use an inter-check delay of x.xx seconds + +host_inter_check_delay_method=s + + + +# MAXIMUM HOST CHECK SPREAD +# This variable determines the timeframe (in minutes) from the +# program start time that an initial check of all hosts should +# be completed. Default is 30 minutes. + +max_host_check_spread=30 + + + +# MAXIMUM CONCURRENT SERVICE CHECKS +# This option allows you to specify the maximum number of +# service checks that can be run in parallel at any given time. +# Specifying a value of 1 for this variable essentially prevents +# any service checks from being parallelized. A value of 0 +# will not restrict the number of concurrent checks that are +# being executed. + +max_concurrent_checks=0 + + + +# HOST AND SERVICE CHECK REAPER FREQUENCY +# This is the frequency (in seconds!) that Nagios will process +# the results of host and service checks. + +check_result_reaper_frequency=10 + + + + +# MAX CHECK RESULT REAPER TIME +# This is the max amount of time (in seconds) that a single +# check result reaper event will be allowed to run before +# returning control back to Nagios so it can perform other +# duties. 
+ +max_check_result_reaper_time=30 + + + + +# CHECK RESULT PATH +# This is the directory where Nagios stores the results of host and +# service checks that have not yet been processed. +# +# Note: Make sure that only one instance of Nagios has access +# to this directory! + +check_result_path=/var/log/nagios/spool/checkresults + + + + +# MAX CHECK RESULT FILE AGE +# This option determines the maximum age (in seconds) which check +# result files are considered to be valid. Files older than this +# threshold will be mercilessly deleted without further processing. + +max_check_result_file_age=3600 + + + + +# CACHED HOST CHECK HORIZON +# This option determines the maximum amount of time (in seconds) +# that the state of a previous host check is considered current. +# Cached host states (from host checks that were performed more +# recently than the timeframe specified by this value) can immensely +# improve performance in regards to the host check logic. +# Too high of a value for this option may result in inaccurate host +# states being used by Nagios, while a lower value may result in a +# performance hit for host checks. Use a value of 0 to disable host +# check caching. + +cached_host_check_horizon=15 + + + +# CACHED SERVICE CHECK HORIZON +# This option determines the maximum amount of time (in seconds) +# that the state of a previous service check is considered current. +# Cached service states (from service checks that were performed more +# recently than the timeframe specified by this value) can immensely +# improve performance in regards to predictive dependency checks. +# Use a value of 0 to disable service check caching. + +cached_service_check_horizon=15 + + + +# ENABLE PREDICTIVE HOST DEPENDENCY CHECKS +# This option determines whether or not Nagios will attempt to execute +# checks of hosts when it predicts that future dependency logic tests +# may be needed. These predictive checks can help ensure that your +# host dependency logic works well.
+# Values: +# 0 = Disable predictive checks +# 1 = Enable predictive checks (default) + +enable_predictive_host_dependency_checks=1 + + + +# ENABLE PREDICTIVE SERVICE DEPENDENCY CHECKS +# This option determines whether or not Nagios will attempt to execute +# checks of service when it predicts that future dependency logic test +# may be needed. These predictive checks can help ensure that your +# service dependency logic works well. +# Values: +# 0 = Disable predictive checks +# 1 = Enable predictive checks (default) + +enable_predictive_service_dependency_checks=1 + + + +# SOFT STATE DEPENDENCIES +# This option determines whether or not Nagios will use soft state +# information when checking host and service dependencies. Normally +# Nagios will only use the latest hard host or service state when +# checking dependencies. If you want it to use the latest state (regardless +# of whether its a soft or hard state type), enable this option. +# Values: +# 0 = Don't use soft state dependencies (default) +# 1 = Use soft state dependencies + +soft_state_dependencies=0 + + + +# TIME CHANGE ADJUSTMENT THRESHOLDS +# These options determine when Nagios will react to detected changes +# in system time (either forward or backwards). + +#time_change_threshold=900 + + + +# AUTO-RESCHEDULING OPTION +# This option determines whether or not Nagios will attempt to +# automatically reschedule active host and service checks to +# "smooth" them out over time. This can help balance the load on +# the monitoring server. +# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_reschedule_checks=0 + + + +# AUTO-RESCHEDULING INTERVAL +# This option determines how often (in seconds) Nagios will +# attempt to automatically reschedule checks. This option only +# has an effect if the auto_reschedule_checks option is enabled. +# Default is 30 seconds. 
+# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_rescheduling_interval=30 + + + +# AUTO-RESCHEDULING WINDOW +# This option determines the "window" of time (in seconds) that +# Nagios will look at when automatically rescheduling checks. +# Only host and service checks that occur in the next X seconds +# (determined by this variable) will be rescheduled. This option +# only has an effect if the auto_reschedule_checks option is +# enabled. Default is 180 seconds (3 minutes). +# WARNING: THIS IS AN EXPERIMENTAL FEATURE - IT CAN DEGRADE +# PERFORMANCE, RATHER THAN INCREASE IT, IF USED IMPROPERLY + +auto_rescheduling_window=180 + + + +# TIMEOUT VALUES +# These options control how much time Nagios will allow various +# types of commands to execute before killing them off. Options +# are available for controlling maximum time allotted for +# service checks, host checks, event handlers, notifications, the +# ocsp command, and performance data commands. All values are in +# seconds. + +service_check_timeout=60 +host_check_timeout=30 +event_handler_timeout=30 +notification_timeout=30 +ocsp_timeout=5 +perfdata_timeout=5 + + + +# RETAIN STATE INFORMATION +# This setting determines whether or not Nagios will save state +# information for services and hosts before it shuts down. Upon +# startup Nagios will reload all saved service and host state +# information before starting to monitor. This is useful for +# maintaining long-term data on state statistics, etc, but will +# slow Nagios down a bit when it (re)starts. Since its only +# a one-time penalty, I think its well worth the additional +# startup delay. + +retain_state_information=1 + + + +# STATE RETENTION FILE +# This is the file that Nagios should use to store host and +# service state information before it shuts down. 
The state +# information in this file is also read immediately prior to +# starting to monitor the network when Nagios is restarted. +# This file is used only if the retain_state_information +# variable is set to 1. + +state_retention_file=/var/log/nagios/retention.dat + + + +# RETENTION DATA UPDATE INTERVAL +# This setting determines how often (in minutes) Nagios +# will automatically save retention data during normal operation. +# If you set this value to 0, Nagios will not save retention +# data at regular intervals, but it will still save retention +# data before shutting down or restarting. If you have disabled +# state retention, this option has no effect. + +retention_update_interval=60 + + + +# USE RETAINED PROGRAM STATE +# This setting determines whether or not Nagios will set +# program status variables based on the values saved in the +# retention file. If you want to use retained program status +# information, set this value to 1. If not, set this value +# to 0. + +use_retained_program_state=1 + + + +# USE RETAINED SCHEDULING INFO +# This setting determines whether or not Nagios will retain +# the scheduling info (next check time) for hosts and services +# based on the values saved in the retention file. +# If you want to use retained scheduling info, set this +# value to 1. If not, set this value to 0. + +use_retained_scheduling_info=1 + + + +# RETAINED ATTRIBUTE MASKS (ADVANCED FEATURE) +# The following variables are used to specify specific host and +# service attributes that should *not* be retained by Nagios during +# program restarts. +# +# The values of the masks are bitwise ORs of values specified +# by the "MODATTR_" definitions found in include/common.h. +# For example, if you do not want the current enabled/disabled state +# of flap detection and event handlers for hosts to be retained, you +# would use a value of 24 for the host attribute mask...
+# MODATTR_EVENT_HANDLER_ENABLED (8) + MODATTR_FLAP_DETECTION_ENABLED (16) = 24 + +# This mask determines what host attributes are not retained +retained_host_attribute_mask=0 + +# This mask determines what service attributes are not retained +retained_service_attribute_mask=0 + +# These two masks determine what process attributes are not retained. +# There are two masks, because some process attributes have host and service +# options. For example, you can disable active host checks, but leave active +# service checks enabled. +retained_process_host_attribute_mask=0 +retained_process_service_attribute_mask=0 + +# These two masks determine what contact attributes are not retained. +# There are two masks, because some contact attributes have host and +# service options. For example, you can disable host notifications for +# a contact, but leave service notifications enabled for them. +retained_contact_host_attribute_mask=0 +retained_contact_service_attribute_mask=0 + + + +# INTERVAL LENGTH +# This is the seconds per unit interval as used in the +# host/contact/service configuration files. Setting this to 60 means +# that each interval is one minute long (60 seconds). Other settings +# have not been tested much, so your mileage is likely to vary... + +interval_length=60 + + + +# CHECK FOR UPDATES +# This option determines whether Nagios will automatically check to +# see if new updates (releases) are available. It is recommended that you +# enable this option to ensure that you stay on top of the latest critical +# patches to Nagios. Nagios is critical to you - make sure you keep it in +# good shape. Nagios will check once a day for new updates. Data collected +# by Nagios Enterprises from the update check is processed in accordance +# with our privacy policy - see http://api.nagios.org for details. + +check_for_updates=1 + + + +# BARE UPDATE CHECK +# This option determines what data Nagios will send to api.nagios.org when +# it checks for updates.
By default, Nagios will send information on the +# current version of Nagios you have installed, as well as an indicator as +# to whether this was a new installation or not. Nagios Enterprises uses +# this data to determine the number of users running specific version of +# Nagios. Enable this option if you do not want this information to be sent. + +bare_update_check=0 + + + +# AGGRESSIVE HOST CHECKING OPTION +# If you don't want to turn on aggressive host checking features, set +# this value to 0 (the default). Otherwise set this value to 1 to +# enable the aggressive check option. Read the docs for more info +# on what aggressive host check is or check out the source code in +# base/checks.c + +use_aggressive_host_checking=0 + + + +# SERVICE CHECK EXECUTION OPTION +# This determines whether or not Nagios will actively execute +# service checks when it initially starts. If this option is +# disabled, checks are not actively made, but Nagios can still +# receive and process passive check results that come in. Unless +# you're implementing redundant hosts or have a special need for +# disabling the execution of service checks, leave this enabled! +# Values: 1 = enable checks, 0 = disable checks + +execute_service_checks=1 + + + +# PASSIVE SERVICE CHECK ACCEPTANCE OPTION +# This determines whether or not Nagios will accept passive +# service checks results when it initially (re)starts. +# Values: 1 = accept passive checks, 0 = reject passive checks + +accept_passive_service_checks=1 + + + +# HOST CHECK EXECUTION OPTION +# This determines whether or not Nagios will actively execute +# host checks when it initially starts. If this option is +# disabled, checks are not actively made, but Nagios can still +# receive and process passive check results that come in. Unless +# you're implementing redundant hosts or have a special need for +# disabling the execution of host checks, leave this enabled! 
+# Values: 1 = enable checks, 0 = disable checks + +execute_host_checks=1 + + + +# PASSIVE HOST CHECK ACCEPTANCE OPTION +# This determines whether or not Nagios will accept passive +# host check results when it initially (re)starts. +# Values: 1 = accept passive checks, 0 = reject passive checks + +accept_passive_host_checks=1 + + + +# NOTIFICATIONS OPTION +# This determines whether or not Nagios will send out any host or +# service notifications when it is initially (re)started. +# Values: 1 = enable notifications, 0 = disable notifications + +enable_notifications=1 + + + +# EVENT HANDLER USE OPTION +# This determines whether or not Nagios will run any host or +# service event handlers when it is initially (re)started. Unless +# you're implementing redundant hosts, leave this option enabled. +# Values: 1 = enable event handlers, 0 = disable event handlers + +enable_event_handlers=1 + + + +# PROCESS PERFORMANCE DATA OPTION +# This determines whether or not Nagios will process performance +# data returned from service and host checks. If this option is +# enabled, host performance data will be processed using the +# host_perfdata_command (defined below) and service performance +# data will be processed using the service_perfdata_command (also +# defined below). Read the HTML docs for more information on +# performance data. +# Values: 1 = process performance data, 0 = do not process performance data + +process_performance_data=0 + + + +# HOST AND SERVICE PERFORMANCE DATA PROCESSING COMMANDS +# These commands are run after every host and service check is +# performed. These commands are executed only if the +# process_performance_data option (above) is set to 1. The command +# argument is the short name of a command definition that you +# define in your host configuration file. Read the HTML docs for +# more information on performance data.
+ +#host_perfdata_command=process-host-perfdata +#service_perfdata_command=process-service-perfdata + + + +# HOST AND SERVICE PERFORMANCE DATA FILES +# These files are used to store host and service performance data. +# Performance data is only written to these files if the +# process_performance_data option (above) is set to 1. + +#host_perfdata_file=/var/log/nagios/host-perfdata +#service_perfdata_file=/var/log/nagios/service-perfdata + + + +# HOST AND SERVICE PERFORMANCE DATA FILE TEMPLATES +# These options determine what data is written (and how) to the +# performance data files. The templates may contain macros, special +# characters (\t for tab, \r for carriage return, \n for newline) +# and plain text. A newline is automatically added after each write +# to the performance data file. Some examples of what you can do are +# shown below. + +#host_perfdata_file_template=[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$ +#service_perfdata_file_template=[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$ + + + +# HOST AND SERVICE PERFORMANCE DATA FILE MODES +# This option determines whether or not the host and service +# performance data files are opened in write ("w") or append ("a") +# mode. If you want to use named pipes, you should use the special +# pipe ("p") mode which avoids blocking at startup, otherwise you will +# likely want the default append ("a") mode. + +#host_perfdata_file_mode=a +#service_perfdata_file_mode=a + + + +# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING INTERVAL +# These options determine how often (in seconds) the host and service +# performance data files are processed using the commands defined +# below. A value of 0 indicates the files should not be periodically +# processed.
+ +#host_perfdata_file_processing_interval=0 +#service_perfdata_file_processing_interval=0 + + + +# HOST AND SERVICE PERFORMANCE DATA FILE PROCESSING COMMANDS +# These commands are used to periodically process the host and +# service performance data files. The interval at which the +# processing occurs is determined by the options above. + +#host_perfdata_file_processing_command=process-host-perfdata-file +#service_perfdata_file_processing_command=process-service-perfdata-file + + + +# HOST AND SERVICE PERFORMANCE DATA PROCESS EMPTY RESULTS +# These options determine whether the core will process empty perfdata +# results or not. This is needed for distributed monitoring, and is intentionally +# turned on by default. +# If you don't require empty perfdata - saving some cpu cycles +# on unwanted macro calculation - you can turn that off. Be careful! +# Values: 1 = enable, 0 = disable + +#host_perfdata_process_empty_results=1 +#service_perfdata_process_empty_results=1 + + +# OBSESS OVER SERVICE CHECKS OPTION +# This determines whether or not Nagios will obsess over service +# checks and run the ocsp_command defined below. Unless you're +# planning on implementing distributed monitoring, do not enable +# this option. Read the HTML docs for more information on +# implementing distributed monitoring. +# Values: 1 = obsess over services, 0 = do not obsess (default) + +obsess_over_services=0 + + + +# OBSESSIVE COMPULSIVE SERVICE PROCESSOR COMMAND +# This is the command that is run for every service check that is +# processed by Nagios. This command is executed only if the +# obsess_over_services option (above) is set to 1. The command +# argument is the short name of a command definition that you +# define in your host configuration file. Read the HTML docs for +# more information on implementing distributed monitoring.
+ +#ocsp_command=somecommand + + + +# OBSESS OVER HOST CHECKS OPTION +# This determines whether or not Nagios will obsess over host +# checks and run the ochp_command defined below. Unless you're +# planning on implementing distributed monitoring, do not enable +# this option. Read the HTML docs for more information on +# implementing distributed monitoring. +# Values: 1 = obsess over hosts, 0 = do not obsess (default) + +obsess_over_hosts=0 + + + +# OBSESSIVE COMPULSIVE HOST PROCESSOR COMMAND +# This is the command that is run for every host check that is +# processed by Nagios. This command is executed only if the +# obsess_over_hosts option (above) is set to 1. The command +# argument is the short name of a command definition that you +# define in your host configuration file. Read the HTML docs for +# more information on implementing distributed monitoring. + +#ochp_command=somecommand + + + +# TRANSLATE PASSIVE HOST CHECKS OPTION +# This determines whether or not Nagios will translate +# DOWN/UNREACHABLE passive host check results into their proper +# state for this instance of Nagios. This option is useful +# if you have distributed or failover monitoring setup. In +# these cases your other Nagios servers probably have a different +# "view" of the network, with regards to the parent/child relationship +# of hosts. If a distributed monitoring server thinks a host +# is DOWN, it may actually be UNREACHABLE from the point of +# this Nagios instance. Enabling this option will tell Nagios +# to translate any DOWN or UNREACHABLE host states it receives +# passively into the correct state from the view of this server. +# Values: 1 = perform translation, 0 = do not translate (default) + +translate_passive_host_checks=0 + + + +# PASSIVE HOST CHECKS ARE SOFT OPTION +# This determines whether or not Nagios will treat passive host +# checks as being HARD or SOFT. By default, a passive host check +# result will put a host into a HARD state type. 
This can be changed +# by enabling this option. +# Values: 0 = passive checks are HARD, 1 = passive checks are SOFT + +passive_host_checks_are_soft=0 + + + +# ORPHANED HOST/SERVICE CHECK OPTIONS +# These options determine whether or not Nagios will periodically +# check for orphaned host service checks. Since service checks are +# not rescheduled until the results of their previous execution +# instance are processed, there exists a possibility that some +# checks may never get rescheduled. A similar situation exists for +# host checks, although the exact scheduling details differ a bit +# from service checks. Orphaned checks seem to be a rare +# problem and should not happen under normal circumstances. +# If you have problems with service checks never getting +# rescheduled, make sure you have orphaned service checks enabled. +# Values: 1 = enable checks, 0 = disable checks + +check_for_orphaned_services=1 +check_for_orphaned_hosts=1 + + + +# SERVICE FRESHNESS CHECK OPTION +# This option determines whether or not Nagios will periodically +# check the "freshness" of service results. Enabling this option +# is useful for ensuring passive checks are received in a timely +# manner. +# Values: 1 = enabled freshness checking, 0 = disable freshness checking + +check_service_freshness=1 + + + +# SERVICE FRESHNESS CHECK INTERVAL +# This setting determines how often (in seconds) Nagios will +# check the "freshness" of service check results. If you have +# disabled service freshness checking, this option has no effect. + +service_freshness_check_interval=60 + + + +# SERVICE CHECK TIMEOUT STATE +# This setting determines the state Nagios will report when a +# service check times out - that is does not respond within +# service_check_timeout seconds. This can be useful if a +# machine is running at too high a load and you do not want +# to consider a failed service check to be critical (the default). 
+# Valid settings are: +# c - Critical (default) +# u - Unknown +# w - Warning +# o - OK + +service_check_timeout_state=c + + + +# HOST FRESHNESS CHECK OPTION +# This option determines whether or not Nagios will periodically +# check the "freshness" of host results. Enabling this option +# is useful for ensuring passive checks are received in a timely +# manner. +# Values: 1 = enabled freshness checking, 0 = disable freshness checking + +check_host_freshness=0 + + + +# HOST FRESHNESS CHECK INTERVAL +# This setting determines how often (in seconds) Nagios will +# check the "freshness" of host check results. If you have +# disabled host freshness checking, this option has no effect. + +host_freshness_check_interval=60 + + + + +# ADDITIONAL FRESHNESS THRESHOLD LATENCY +# This setting determines the number of seconds that Nagios +# will add to any host and service freshness thresholds that +# it calculates (those not explicitly specified by the user). + +additional_freshness_latency=15 + + + + +# FLAP DETECTION OPTION +# This option determines whether or not Nagios will try +# and detect hosts and services that are "flapping". +# Flapping occurs when a host or service changes between +# states too frequently. When Nagios detects that a +# host or service is flapping, it will temporarily suppress +# notifications for that host/service until it stops +# flapping. Flap detection is very experimental, so read +# the HTML documentation before enabling this feature! +# Values: 1 = enable flap detection +# 0 = disable flap detection (default) + +enable_flap_detection=1 + + + +# FLAP DETECTION THRESHOLDS FOR HOSTS AND SERVICES +# Read the HTML documentation on flap detection for +# an explanation of what this option does. This option +# has no effect if flap detection is disabled. 
+ +low_service_flap_threshold=5.0 +high_service_flap_threshold=20.0 +low_host_flap_threshold=5.0 +high_host_flap_threshold=20.0 + + + +# DATE FORMAT OPTION +# This option determines how short dates are displayed. Valid options +# include: +# us (MM-DD-YYYY HH:MM:SS) +# euro (DD-MM-YYYY HH:MM:SS) +# iso8601 (YYYY-MM-DD HH:MM:SS) +# strict-iso8601 (YYYY-MM-DDTHH:MM:SS) +# + +date_format=us + + + + +# TIMEZONE OFFSET +# This option is used to override the default timezone that this +# instance of Nagios runs in. If not specified, Nagios will use +# the system configured timezone. +# +# NOTE: In order to display the correct timezone in the CGIs, you +# will also need to alter the Apache directives for the CGI path +# to include your timezone. Example: +# +# +# SetEnv TZ "Australia/Brisbane" +# ... +# + +#use_timezone=US/Mountain +#use_timezone=Australia/Brisbane + + + +# ILLEGAL OBJECT NAME CHARACTERS +# This option allows you to specify illegal characters that cannot +# be used in host names, service descriptions, or names of other +# object types. + +illegal_object_name_chars=`~!$%^&*|'"<>?,()= + + + +# ILLEGAL MACRO OUTPUT CHARACTERS +# This option allows you to specify illegal characters that are +# stripped from macros before being used in notifications, event +# handlers, etc. This DOES NOT affect macros used in service or +# host check commands. +# The following macros are stripped of the characters you specify: +# $HOSTOUTPUT$ +# $HOSTPERFDATA$ +# $HOSTACKAUTHOR$ +# $HOSTACKCOMMENT$ +# $SERVICEOUTPUT$ +# $SERVICEPERFDATA$ +# $SERVICEACKAUTHOR$ +# $SERVICEACKCOMMENT$ + +illegal_macro_output_chars=`~$&|'"<> + + + +# REGULAR EXPRESSION MATCHING +# This option controls whether or not regular expression matching +# takes place in the object config files. Regular expression +# matching is used to match host, hostgroup, service, and service +# group names/descriptions in some fields of various object types. 
+# Values: 1 = enable regexp matching, 0 = disable regexp matching + +use_regexp_matching=0 + + + +# "TRUE" REGULAR EXPRESSION MATCHING +# This option controls whether or not "true" regular expression +# matching takes place in the object config files. This option +# only has an effect if regular expression matching is enabled +# (see above). If this option is DISABLED, regular expression +# matching only occurs if a string contains wildcard characters +# (* and ?). If the option is ENABLED, regexp matching occurs +# all the time (which can be annoying). +# Values: 1 = enable true matching, 0 = disable true matching + +use_true_regexp_matching=0 + + + +# ADMINISTRATOR EMAIL/PAGER ADDRESSES +# The email and pager address of a global administrator (likely you). +# Nagios never uses these values itself, but you can access them by +# using the $ADMINEMAIL$ and $ADMINPAGER$ macros in your notification +# commands. + +admin_email=nagios@localhost +admin_pager=pagenagios@localhost + + + +# DAEMON CORE DUMP OPTION +# This option determines whether or not Nagios is allowed to create +# a core dump when it runs as a daemon. Note that it is generally +# considered bad form to allow this, but it may be useful for +# debugging purposes. Enabling this option doesn't guarantee that +# a core file will be produced, but that's just life... +# Values: 1 - Allow core dumps +# 0 - Do not allow core dumps (default) + +daemon_dumps_core=0 + + + +# LARGE INSTALLATION TWEAKS OPTION +# This option determines whether or not Nagios will take some shortcuts +# which can save on memory and CPU usage in large Nagios installations. +# Read the documentation for more information on the benefits/tradeoffs +# of enabling this option. 
+# Values: 1 - Enable tweaks +# 0 - Disable tweaks (default) + +use_large_installation_tweaks=0 + + + +# ENABLE ENVIRONMENT MACROS +# This option determines whether or not Nagios will make all standard +# macros available as environment variables when host/service checks +# and system commands (event handlers, notifications, etc.) are +# executed. +# Enabling this is a very bad idea for anything but very small setups, +# as it means plugins, notification scripts and eventhandlers may run +# out of environment space. It will also cause a significant increase +# in CPU- and memory usage and drastically reduce the number of checks +# you can run. +# Values: 1 - Enable environment variable macros +# 0 - Disable environment variable macros (default) + +enable_environment_macros=0 + + + +# CHILD PROCESS MEMORY OPTION +# This option determines whether or not Nagios will free memory in +# child processes (processes used to execute system commands and host/ +# service checks). If you specify a value here, it will override +# program defaults. +# Value: 1 - Free memory in child processes +# 0 - Do not free memory in child processes + +#free_child_process_memory=1 + + + +# CHILD PROCESS FORKING BEHAVIOR +# This option determines how Nagios will fork child processes +# (used to execute system commands and host/service checks). Normally +# child processes are fork()ed twice, which provides a very high level +# of isolation from problems. Fork()ing once is probably enough and will +# save a great deal on CPU usage (in large installs), so you might +# want to consider using this. If you specify a value here, it will override +# program defaults. +# Value: 1 - Child processes fork() twice +# 0 - Child processes fork() just once + +#child_processes_fork_twice=1 + + + +# DEBUG LEVEL +# This option determines how much (if any) debugging information will +# be written to the debug file. OR values together to log multiple +# types of information. 
+# Values: +# -1 = Everything +# 0 = Nothing +# 1 = Functions +# 2 = Configuration +# 4 = Process information +# 8 = Scheduled events +# 16 = Host/service checks +# 32 = Notifications +# 64 = Event broker +# 128 = External commands +# 256 = Commands +# 512 = Scheduled downtime +# 1024 = Comments +# 2048 = Macros + +debug_level=0 + + + +# DEBUG VERBOSITY +# This option determines how verbose the debug log output will be. +# Values: 0 = Brief output +# 1 = More detailed +# 2 = Very detailed + +debug_verbosity=1 + + + +# DEBUG FILE +# This option determines where Nagios should write debugging information. + +debug_file=/var/log/nagios/nagios.debug + + + +# MAX DEBUG FILE SIZE +# This option determines the maximum size (in bytes) of the debug file. If +# the file grows larger than this size, it will be renamed with a .old +# extension. If a file already exists with a .old extension it will +# automatically be deleted. This helps ensure your disk space usage doesn't +# get out of control when debugging Nagios. + +max_debug_file_size=1000000 + + + +# Should we allow hostgroups to have no hosts, we default this to off since +# that was the old behavior + +allow_empty_hostgroup_assignment=0 + + + +# Normally worker count is dynamically allocated based on 1.5 * number of CPUs +# with a minimum of 4 workers. This value will override the defaults + +#check_workers=3 + + + +# EXPERIMENTAL load controlling options +# To get current defaults based on your system issue a command to +# the query handler. Please note that this is an experimental feature +# and not meant for production use. Used incorrectly it can induce +# enormous latency. 
+# #core loadctl +# jobs_max - The maximum amount of jobs to run at one time +# jobs_min - The minimum amount of jobs to run at one time +# jobs_limit - The maximum amount of jobs the current load lets us run +# backoff_limit - The minimum backoff_change +# backoff_change - # of jobs to remove from jobs_limit when backing off +# rampup_limit - Minimum rampup_change +# rampup_change - # of jobs to add to jobs_limit when ramping up +# NOTE: The backoff_limit and rampup_limit are NOT used by anything currently, +# so if your system is under load nothing will actively modify the jobs +# even if you have these options enabled, they are for external +# connector information only. However, if you change the jobs_max or +# jobs_min manually here or through the query handler interface that +# WILL affect your system +#loadctl_options=jobs_max=100;backoff_limit=10;rampup_change=5 diff --git a/roles/nagios_server/files/nagios/configs/timeperiods.cfg b/roles/nagios_server/files/nagios/configs/timeperiods.cfg new file mode 100644 index 0000000000..3ea8eb2461 --- /dev/null +++ b/roles/nagios_server/files/nagios/configs/timeperiods.cfg @@ -0,0 +1,135 @@ +############################################################################### +# TIMEPERIODS.CFG - SAMPLE TIMEPERIOD DEFINITIONS +# +# +# NOTES: This config file provides you with some example timeperiod definitions +# that you can reference in host, service, contact, and dependency +# definitions. +# +# You don't need to keep timeperiods in a separate file from your other +# object definitions. This has been done just to make things easier to +# understand. 
+# +############################################################################### + + + +############################################################################### +############################################################################### +# +# TIME PERIODS +# +############################################################################### +############################################################################### + + +define timeperiod{ + timeperiod_name 24x7 + alias 24 Hours A Day, 7 Days A Week + sunday 00:00-24:00 + monday 00:00-24:00 + tuesday 00:00-24:00 + wednesday 00:00-24:00 + thursday 00:00-24:00 + friday 00:00-24:00 + saturday 00:00-24:00 +} + +define timeperiod{ + timeperiod_name 16x7 + alias 15 Hours a day, 7 days a week + sunday 00:00-04:00,13:00-24:00 + monday 00:00-04:00,13:00-24:00 + tuesday 00:00-04:00,13:00-24:00 + wednesday 00:00-04:00,13:00-24:00 + thursday 00:00-04:00,13:00-24:00 + friday 00:00-04:00,13:00-24:00 + saturday 00:00-04:00,13:00-24:00 + } + +define timeperiod{ + timeperiod_name 16x7-AU + alias 15 Hours a day, 7 days a week + sunday 00:00-14:00,22:00-24:00 + monday 00:00-14:00,22:00-24:00 + tuesday 00:00-14:00,22:00-24:00 + wednesday 00:00-14:00,22:00-24:00 + thursday 00:00-14:00,22:00-24:00 + friday 00:00-14:00,22:00-24:00 + saturday 00:00-14:00,22:00-24:00 + } + + +# Members of sysadmin-main already get nagios messages +define timeperiod{ + timeperiod_name never + alias Never + } + +# This defines a timeperiod where all times are valid for checks, +# notifications, etc. The classic "24x7" support nightmare. 
:-) +define timeperiod{ + timeperiod_name 24x7 + alias 24 Hours A Day, 7 Days A Week + sunday 00:00-24:00 + monday 00:00-24:00 + tuesday 00:00-24:00 + wednesday 00:00-24:00 + thursday 00:00-24:00 + friday 00:00-24:00 + saturday 00:00-24:00 + } + + +# 'workhours' timeperiod definition +define timeperiod{ + timeperiod_name workhours + alias Normal Work Hours + monday 09:00-17:00 + tuesday 09:00-17:00 + wednesday 09:00-17:00 + thursday 09:00-17:00 + friday 09:00-17:00 + } + + +# 'none' timeperiod definition +define timeperiod{ + timeperiod_name none + alias No Time Is A Good Time + } + +# Some U.S. holidays +# Note: The timeranges for each holiday are meant to *exclude* the holidays from being +# treated as a valid time for notifications, etc. You probably don't want your pager +# going off on New Year's. Although your employer might... :-) +define timeperiod{ + name us-holidays + timeperiod_name us-holidays + alias U.S. Holidays + + january 1 00:00-00:00 ; New Years + monday -1 may 00:00-00:00 ; Memorial Day (last Monday in May) + july 4 00:00-00:00 ; Independence Day + monday 1 september 00:00-00:00 ; Labor Day (first Monday in September) + thursday 4 november 00:00-00:00 ; Thanksgiving (4th Thursday in November) + december 25 00:00-00:00 ; Christmas + } + + +# This defines a modified "24x7" timeperiod that covers every day of the +# year, except for U.S. holidays (defined in the timeperiod above). 
+define timeperiod{ + timeperiod_name 24x7_sans_holidays + alias 24x7 Sans Holidays + + use us-holidays ; Get holiday exceptions from other timeperiod + + sunday 00:00-24:00 + monday 00:00-24:00 + tuesday 00:00-24:00 + wednesday 00:00-24:00 + thursday 00:00-24:00 + friday 00:00-24:00 + saturday 00:00-24:00 + } diff --git a/roles/nagios_server/files/nagios/contactgroups/bodhi.cfg b/roles/nagios_server/files/nagios/contactgroups/bodhi.cfg new file mode 100644 index 0000000000..43ab777175 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/bodhi.cfg @@ -0,0 +1,5 @@ +define contactgroup { + contactgroup_name bodhi + alias Bodhi Notifications + members bowlofeggs +} diff --git a/roles/nagios_server/files/nagios/contactgroups/build-sysadmin-email.cfg b/roles/nagios_server/files/nagios/contactgroups/build-sysadmin-email.cfg new file mode 100644 index 0000000000..92f56c5750 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/build-sysadmin-email.cfg @@ -0,0 +1,5 @@ +#define contactgroup{ +# contactgroup_name build-sysadmin-email +# alias Build Sysadmin Email Contacts +# members kevin,aditya +# } diff --git a/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-email.cfg b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-email.cfg new file mode 100644 index 0000000000..33940427e5 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-email.cfg @@ -0,0 +1,5 @@ +define contactgroup{ + contactgroup_name fedora-sysadmin-email + alias Fedora Sysadmin Email Contacts + members admin,kevin,puiterwijkp,smooge,ausil,jcollie,nb,rigeld2,codeblock,hvivani + } diff --git a/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-ircbot.cfg b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-ircbot.cfg new file mode 100644 index 0000000000..5d8fbeaa92 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-ircbot.cfg @@ -0,0 +1,5 @@ +define contactgroup{ + 
contactgroup_name fedora-sysadmin-ircbot + alias Fedora Sysadmin irc Contacts + members ircbot,fedmsg + } diff --git a/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-pager.cfg b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-pager.cfg new file mode 100644 index 0000000000..e290656c35 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/fedora-sysadmin-pager.cfg @@ -0,0 +1,10 @@ +define contactgroup{ + contactgroup_name fedora-sysadmin-pager + alias Fedora Sysadmin Pager Contacts + members smoogep,kevinp,puiterwijkp +} +define contactgroup{ + contactgroup_name fedora-sysadmin-emergency + alias Fedora Sysadmin Pager Contacts + members smooge-emergency,kevin-emergency,puiterwijk-emergency +} diff --git a/roles/nagios_server/files/nagios/contactgroups/null.cfg b/roles/nagios_server/files/nagios/contactgroups/null.cfg new file mode 100644 index 0000000000..e9c2067b77 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/null.cfg @@ -0,0 +1,5 @@ +define contactgroup{ + contactgroup_name null + alias null + members null +} diff --git a/roles/nagios_server/files/nagios/contactgroups/ppc-secondary-email.cfg b/roles/nagios_server/files/nagios/contactgroups/ppc-secondary-email.cfg new file mode 100644 index 0000000000..f0e11858b1 --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/ppc-secondary-email.cfg @@ -0,0 +1,5 @@ +define contactgroup { + contactgroup_name ppc-secondary-email + alias Fedora PPC secondary arch Email Contacts + members kevin,parasense,karsten +} diff --git a/roles/nagios_server/files/nagios/contactgroups/retrace.cfg b/roles/nagios_server/files/nagios/contactgroups/retrace.cfg new file mode 100644 index 0000000000..8f3310b5dc --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/retrace.cfg @@ -0,0 +1,5 @@ +define contactgroup { + contactgroup_name retrace-email + alias Fedora Retrace server Email Contacts + members kevin,mtoman +} diff --git 
a/roles/nagios_server/files/nagios/contactgroups/sysadmin-qa-email.cfg b/roles/nagios_server/files/nagios/contactgroups/sysadmin-qa-email.cfg new file mode 100644 index 0000000000..e25dfe84fa --- /dev/null +++ b/roles/nagios_server/files/nagios/contactgroups/sysadmin-qa-email.cfg @@ -0,0 +1,5 @@ +define contactgroup { + contactgroup_name sysadmin-qa-email + alias Fedora SysAdmin QA Email Contacts + members sysadmin-qa +} diff --git a/roles/nagios_server/files/nagios/contacts/aditya.cfg b/roles/nagios_server/files/nagios/contacts/aditya.cfg new file mode 100644 index 0000000000..d35dbb7b86 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/aditya.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name aditya + alias Aditya Patawari + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email adimania@fedoraproject.org +} diff --git a/roles/nagios_server/files/nagios/contacts/admin.cfg b/roles/nagios_server/files/nagios/contacts/admin.cfg new file mode 100644 index 0000000000..fabbb5e3f0 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/admin.cfg @@ -0,0 +1,13 @@ +define contact{ + contact_name admin + alias Fedora Sysadmins + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email sysadmin-members@fedoraproject.org + } + + diff --git a/roles/nagios_server/files/nagios/contacts/ausil.cfg b/roles/nagios_server/files/nagios/contacts/ausil.cfg new file mode 100644 index 0000000000..f8f5e6c8a9 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/ausil.cfg @@ -0,0 +1,16 @@ +define contact{ + contact_name ausil + alias Dennis Gilmore + service_notification_period 24x7 + 
host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + #service_notification_commands notify-by-epager + #host_notification_commands host-notify-by-epager + email ausil@fedoraproject.org + #pager 3098682442@tmomail.net + #email 3098682442@tmomail.net +} + diff --git a/roles/nagios_server/files/nagios/contacts/bowlofeggs.cfg b/roles/nagios_server/files/nagios/contacts/bowlofeggs.cfg new file mode 100644 index 0000000000..43ec89761e --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/bowlofeggs.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name bowlofeggs + alias Randy Barlow + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email rbarlow@redhat.com +} diff --git a/roles/nagios_server/files/nagios/contacts/codeblock.cfg b/roles/nagios_server/files/nagios/contacts/codeblock.cfg new file mode 100644 index 0000000000..3e6b003f7c --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/codeblock.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name codeblock + alias Ricky Elrod + service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email codeblock@elrod.me +} diff --git a/roles/nagios_server/files/nagios/contacts/fedmsg.cfg b/roles/nagios_server/files/nagios/contacts/fedmsg.cfg new file mode 100644 index 0000000000..cd577aa696 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/fedmsg.cfg @@ -0,0 +1,10 @@ +define contact{ + contact_name fedmsg + alias BUS + service_notification_period 24x7 + host_notification_period 24x7 + 
service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-fedmsg + host_notification_commands host-notify-by-fedmsg +} diff --git a/roles/nagios_server/files/nagios/contacts/hvivani.cfg b/roles/nagios_server/files/nagios/contacts/hvivani.cfg new file mode 100644 index 0000000000..2686f59109 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/hvivani.cfg @@ -0,0 +1,12 @@ +define contact{ + contact_name hvivani + alias Hernan Vivani + service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email hernan@vivani.com.ar +} + diff --git a/roles/nagios_server/files/nagios/contacts/ircbot.cfg b/roles/nagios_server/files/nagios/contacts/ircbot.cfg new file mode 100644 index 0000000000..9f3d4fee2e --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/ircbot.cfg @@ -0,0 +1,10 @@ +define contact{ + contact_name ircbot + alias ZOD + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-ircbot + host_notification_commands host-notify-by-ircbot +} diff --git a/roles/nagios_server/files/nagios/contacts/jcollie.cfg b/roles/nagios_server/files/nagios/contacts/jcollie.cfg new file mode 100644 index 0000000000..2fce9d252f --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/jcollie.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name jcollie + alias Jeffrey Ollie + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email jeff@ocjtech.us +} diff --git a/roles/nagios_server/files/nagios/contacts/jmtaylor.cfg 
b/roles/nagios_server/files/nagios/contacts/jmtaylor.cfg new file mode 100644 index 0000000000..7915a565dc --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/jmtaylor.cfg @@ -0,0 +1,11 @@ +#define contact{ +# contact_name jmtaylor +# alias Jason Taylor +# service_notification_period 24x7 +# host_notification_period 24x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-email +# host_notification_commands host-notify-by-email +# email jmtaylor90@gmail.com +#} diff --git a/roles/nagios_server/files/nagios/contacts/jstanley.cfg b/roles/nagios_server/files/nagios/contacts/jstanley.cfg new file mode 100644 index 0000000000..b32dd9902d --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/jstanley.cfg @@ -0,0 +1,37 @@ +define contact{ + contact_name jstanley + alias Jon Stanley + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email jonstanley@gmail.com +} + +define contact{ + contact_name jstanley-emergency + alias Jon Stanley + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email 9178159801@vtext.com + pager 9178159801@vtext.com +} + +define contact{ + contact_name jstanleyp + alias Jon Stanley + service_notification_period 16x7 + host_notification_period 16x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email 9178159801@vtext.com + pager 9178159801@vtext.com +} diff --git a/roles/nagios_server/files/nagios/contacts/karsten.cfg 
b/roles/nagios_server/files/nagios/contacts/karsten.cfg new file mode 100644 index 0000000000..30fb005f49 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/karsten.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name karsten + alias Karsten Hopp + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email karsten@redhat.com +} diff --git a/roles/nagios_server/files/nagios/contacts/kevin.cfg b/roles/nagios_server/files/nagios/contacts/kevin.cfg new file mode 100644 index 0000000000..9509abc77c --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/kevin.cfg @@ -0,0 +1,35 @@ +define contact{ + contact_name kevin + alias Kevin Fenzi + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email kevin-pager@scrye.com +} + +define contact{ + contact_name kevin-emergency + alias Kevin Fenzi + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + pager kevin-urgent@scrye.com +} + +define contact{ + contact_name kevinp + alias Kevin Fenzi + service_notification_period 16x7 + host_notification_period 16x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + pager kevin-urgent@scrye.com +} diff --git a/roles/nagios_server/files/nagios/contacts/lmacken.cfg b/roles/nagios_server/files/nagios/contacts/lmacken.cfg new file mode 100644 index 0000000000..92d35ae981 --- /dev/null +++ 
b/roles/nagios_server/files/nagios/contacts/lmacken.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name lmacken + alias Luke Macken + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email lewk@vtext.com +} diff --git a/roles/nagios_server/files/nagios/contacts/mmcgrath.cfg b/roles/nagios_server/files/nagios/contacts/mmcgrath.cfg new file mode 100644 index 0000000000..3c130d4568 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/mmcgrath.cfg @@ -0,0 +1,38 @@ +define contact{ + contact_name mmcgrath + alias Mike McGrath + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email mmcgrath@redhat.com +} + +define contact{ + contact_name mmcgrath-emergency + alias Mike McGrath + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email imlinux+mobile@gmail.com + pager imlinux+mobile@gmail.com +} + +define contact{ + contact_name mmcgrathp + alias Mike McGrath + service_notification_period 16x7 + host_notification_period 16x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email imlinux+mobile@gmail.com + pager imlinux+mobile@gmail.com +} + diff --git a/roles/nagios_server/files/nagios/contacts/mtoman.cfg b/roles/nagios_server/files/nagios/contacts/mtoman.cfg new file mode 100644 index 0000000000..6e68b8f6dc --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/mtoman.cfg 
@@ -0,0 +1,11 @@ +define contact{ + contact_name mtoman + alias Michal Toman + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email mtoman@redhat.com +} diff --git a/roles/nagios_server/files/nagios/contacts/nb.cfg b/roles/nagios_server/files/nagios/contacts/nb.cfg new file mode 100644 index 0000000000..ccf2dcd937 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/nb.cfg @@ -0,0 +1,38 @@ +define contact{ + contact_name nb + alias Nick Bebout + service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email nick@bebout.net +} + +#define contact{ +# contact_name nb-emergency +# alias Nick Bebout +# service_notification_period never +# host_notification_period never +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email nb5@txt.att.net +# pager nb5@txt.att.net +#} + +#define contact{ +# contact_name nbp +# alias Nick Bebout +# service_notification_period never +# host_notification_period never +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email nb5@txt.att.net +# pager nb5@txt.att.net +#} + diff --git a/roles/nagios_server/files/nagios/contacts/nigelj.cfg b/roles/nagios_server/files/nagios/contacts/nigelj.cfg new file mode 100644 index 0000000000..59a4e228b6 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/nigelj.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name nigelj + alias Nigel Jones + 
service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email dev@nigelj.com +} diff --git a/roles/nagios_server/files/nagios/contacts/null.cfg b/roles/nagios_server/files/nagios/contacts/null.cfg new file mode 100644 index 0000000000..792c5e9b5a --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/null.cfg @@ -0,0 +1,11 @@ +define contact{ + contact_name null + alias null + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email nobody@fedoraproject.org +} diff --git a/roles/nagios_server/files/nagios/contacts/parasense.cfg b/roles/nagios_server/files/nagios/contacts/parasense.cfg new file mode 100644 index 0000000000..27d6792f42 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/parasense.cfg @@ -0,0 +1,23 @@ +define contact{ + contact_name parasense + alias Jon Disnard + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email jdisnard@gmail.com +} + +define contact{ + contact_name parasense-pager + alias Jon Disnard + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email parasense@fedoraproject.org +} diff --git a/roles/nagios_server/files/nagios/contacts/ppc-secondary.cfg b/roles/nagios_server/files/nagios/contacts/ppc-secondary.cfg new file mode 100644 index 0000000000..2e0d056127 --- /dev/null +++ 
b/roles/nagios_server/files/nagios/contacts/ppc-secondary.cfg @@ -0,0 +1,10 @@ +define contact { + contact_name ppc-secondary + alias Fedora PPC secondary arch SysAdmins + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email +} diff --git a/roles/nagios_server/files/nagios/contacts/puiterwijk.cfg b/roles/nagios_server/files/nagios/contacts/puiterwijk.cfg new file mode 100644 index 0000000000..10da127438 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/puiterwijk.cfg @@ -0,0 +1,35 @@ +define contact{ + contact_name puiterwijk + alias Patrick Uiterwijk + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + pager pager@puiterwijk.org +} + +define contact{ + contact_name puiterwijkp + alias Patrick Uiterwijk + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + pager pager@puiterwijk.org +} + +define contact{ + contact_name puiterwijk-emergency + alias Patrick Uiterwijk + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + pager emergency@puiterwijk.org +} diff --git a/roles/nagios_server/files/nagios/contacts/ricky.cfg b/roles/nagios_server/files/nagios/contacts/ricky.cfg new file mode 100644 index 0000000000..615f839b42 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/ricky.cfg @@ -0,0 +1,38 @@ +define 
contact{ + contact_name ricky + alias Ricky Zhou + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email ricky@rzhou.org +} + +define contact{ + contact_name ricky-emergency + alias Ricky Zhou + service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email 2014030692@vtext.com + pager 2014030692@vtext.com +} + +define contact{ + contact_name rickyp + alias Ricky Zhou + service_notification_period never + host_notification_period never + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email 2014030692@vtext.com + pager 2014030692@vtext.com +} + diff --git a/roles/nagios_server/files/nagios/contacts/rigeld2.cfg b/roles/nagios_server/files/nagios/contacts/rigeld2.cfg new file mode 100644 index 0000000000..7a29771974 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/rigeld2.cfg @@ -0,0 +1,11 @@ +define contact{ +contact_name rigeld2 +alias Rob Marti +service_notification_period 24x7 +host_notification_period 24x7 +service_notification_options w,u,c,r +host_notification_options d,u,r +service_notification_commands notify-by-email +host_notification_commands host-notify-by-email +email robmartiwork@gmail.com +} diff --git a/roles/nagios_server/files/nagios/contacts/skvidal.cfg b/roles/nagios_server/files/nagios/contacts/skvidal.cfg new file mode 100644 index 0000000000..27465a1484 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/skvidal.cfg @@ -0,0 +1,48 @@ +#define contact{ +# contact_name skvidal +# alias Seth Vidal +# service_notification_period 24x7 
+# host_notification_period 24x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-email +# host_notification_commands host-notify-by-email +# email seth-alert@sethdot.org +#} +# +#define contact{ +# contact_name skvidal_xmpp +# alias Seth Vidal +# service_notification_period 24x7 +# host_notification_period 24x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-xmpp +# host_notification_commands host-notify-by-xmpp +# email skvidal@jabber.org +#} +# +#define contact{ +# contact_name skvidal-emergency +# alias Seth Vidal +# service_notification_period 24x7 +# host_notification_period 24x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email page-seth-vidal@sethdot.org +#} +# +#define contact{ +# contact_name skvidalp +# alias Seth Vidal +# service_notification_period 16x7 +# host_notification_period 16x7 +# service_notification_options w,u,c,r +# host_notification_options d,u,r +# service_notification_commands notify-by-epager +# host_notification_commands host-notify-by-epager +# email page-seth-vidal@sethdot.org +# pager page-seth-vidal@sethdot.org +#} diff --git a/roles/nagios_server/files/nagios/contacts/smooge.cfg b/roles/nagios_server/files/nagios/contacts/smooge.cfg new file mode 100644 index 0000000000..079502c799 --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/smooge.cfg @@ -0,0 +1,38 @@ +define contact{ + contact_name smooge + alias Stephen Smoogen + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email smooge+notify@gmail.com +} + +define contact{ + contact_name smooge-emergency + alias 
Stephen Smoogen + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email smooge+mobile@gmail.com + pager smooge+mobile@gmail.com +} + +define contact{ + contact_name smoogep + alias Stephen Smoogen + service_notification_period 16x7 + host_notification_period 16x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-epager + host_notification_commands host-notify-by-epager + email smooge+mobile@gmail.com + pager smooge+mobile@gmail.com +} + diff --git a/roles/nagios_server/files/nagios/contacts/sysadmin-qa.cfg b/roles/nagios_server/files/nagios/contacts/sysadmin-qa.cfg new file mode 100644 index 0000000000..d4ed945f0e --- /dev/null +++ b/roles/nagios_server/files/nagios/contacts/sysadmin-qa.cfg @@ -0,0 +1,13 @@ +define contact { + contact_name sysadmin-qa + alias Fedora QA SysAdmins + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,u,r + service_notification_commands notify-by-email + host_notification_commands host-notify-by-email + email sysadmin-qa-members@fedoraproject.org +} + + diff --git a/roles/nagios_server/files/nagios/hosts/templates.cfg b/roles/nagios_server/files/nagios/hosts/templates.cfg new file mode 100644 index 0000000000..a68ffe1b12 --- /dev/null +++ b/roles/nagios_server/files/nagios/hosts/templates.cfg @@ -0,0 +1,91 @@ +define host { + name defaulttemplate + #check_command check-host-alive + check_command check-host-alive-nrpe + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 10 + notifications_enabled 1 + notification_options d,r + contact_groups fedora-sysadmin-ircbot + + register 0 +} + +define host { 
+ name mincheck + check_command check-host-alive + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 10 + notifications_enabled 1 + notification_options d,r + contact_groups fedora-sysadmin-ircbot + register 0 +} + +define host { + name autoqatemplate + check_command check-host-alive + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 480 + notifications_enabled 1 + notification_options d,r + contact_groups sysadmin-qa-email + register 0 +} + +define host { + name ppc-secondarytemplate + check_command check-host-alive + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 10 + notifications_enabled 1 + notification_options d,r + contact_groups ppc-secondary-email + register 0 +} + +define host { + name retracetemplate + check_command check-host-alive + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 10 + notifications_enabled 1 + notification_options d,r + contact_groups retrace-email + register 0 +} + +define host { + name defaultbuilders + max_check_attempts 8 + checks_enabled 1 + failure_prediction_enabled 0 + retain_status_information 1 + retain_nonstatus_information 1 + notification_interval 10 + notifications_enabled 1 + notification_options d,r + contact_groups fedora-sysadmin-ircbot + register 0 +} + diff --git a/roles/nagios_server/files/nagios/nrpe/nrpe.cfg b/roles/nagios_server/files/nagios/nrpe/nrpe.cfg new file mode 100644 index 0000000000..c6eed0d07e --- /dev/null +++ b/roles/nagios_server/files/nagios/nrpe/nrpe.cfg @@ -0,0 +1,367 @@ +############################################################################# +# Sample NRPE 
Config File +# Written by: Ethan Galstad (nagios@nagios.org) +# +# Last Modified: 12-11-2006 +# +# NOTES: +# This is a sample configuration file for the NRPE daemon. It needs to be +# located on the remote host that is running the NRPE daemon, not the host +# from which the check_nrpe client is being executed. +############################################################################# + + +# PID FILE +# The name of the file in which the NRPE daemon should write its process ID +# number. The file is only written if the NRPE daemon is started by the root +# user and is running in standalone mode. + +pid_file=/var/run/nrpe.pid + + + +# PORT NUMBER +# Port number we should wait for connections on. +# NOTE: This must be a non-privileged port (i.e. > 1024). +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +server_port=5666 + + + +# SERVER ADDRESS +# Address that nrpe should bind to in case there is more than one interface +# and you do not want nrpe to bind on all interfaces. +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +#server_address=192.168.1.1 + + + +# NRPE USER +# This determines the effective user that the NRPE daemon should run as. +# You can either supply a username or a UID. +# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +nrpe_user=nrpe + + + +# NRPE GROUP +# This determines the effective group that the NRPE daemon should run as. +# You can either supply a group name or a GID. +# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +nrpe_group=nrpe + + + +# ALLOWED HOST ADDRESSES +# This is an optional comma-delimited list of IP addresses or hostnames +# that are allowed to talk to the NRPE daemon. +# +# Note: The daemon only does rudimentary checking of the client's IP +# address.
I would highly recommend adding entries in your /etc/hosts.allow +# file to allow only the specified host to connect to the port +# you are running this daemon on. +# +# NOTE: This option is ignored if NRPE is running under either inetd or xinetd + +#allowed_hosts=127.0.0.1,192.168.0.2 +allowed_hosts=10.5.126.41,127.0.0.1,192.168.1.10,192.168.1.20,209.132.181.35 + + + +# COMMAND ARGUMENT PROCESSING +# This option determines whether or not the NRPE daemon will allow clients +# to specify arguments to commands that are executed. This option only works +# if the daemon was configured with the --enable-command-args configure script +# option. +# +# *** ENABLING THIS OPTION IS A SECURITY RISK! *** +# Read the SECURITY file for information on some of the security implications +# of enabling this variable. +# +# Values: 0=do not allow arguments, 1=allow command arguments + +dont_blame_nrpe=0 + + + +# COMMAND PREFIX +# This option allows you to prefix all commands with a user-defined string. +# A space is automatically added between the specified prefix string and the +# command line from the command definition. +# +# *** THIS EXAMPLE MAY POSE A POTENTIAL SECURITY RISK, SO USE WITH CAUTION! *** +# Usage scenario: +# Execute restricted commands using sudo. For this to work, you need to add +# the nagios user to your /etc/sudoers. An example entry for allowing +# execution of the plugins might be: +# +# nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/ +# +# This lets the nagios user run all commands in that directory (and only them) +# without asking for a password. If you do this, make sure you don't give +# random users write access to that directory or its contents! + +# command_prefix=/usr/bin/sudo + + + +# DEBUGGING OPTION +# This option determines whether or not debugging messages are logged to the +# syslog facility.
+# Values: 0=debugging off, 1=debugging on + +debug=0 + + + +# COMMAND TIMEOUT +# This specifies the maximum number of seconds that the NRPE daemon will +# allow plugins to finish executing before killing them off. + +command_timeout=60 + + + +# CONNECTION TIMEOUT +# This specifies the maximum number of seconds that the NRPE daemon will +# wait for a connection to be established before exiting. This is sometimes +# seen where a network problem stops the SSL being established even though +# all network sessions are connected. This causes the nrpe daemons to +# accumulate, eating system resources. Do not set this too low. + +connection_timeout=300 + + + +# WEAK RANDOM SEED OPTION +# This directive allows you to use SSL even if your system does not have +# a /dev/random or /dev/urandom (on purpose or because the necessary patches +# were not applied). The random number generator will be seeded from a file +# which is either a file pointed to by the environment variable $RANDFILE +# or $HOME/.rnd. If neither exists, the pseudo random number generator will +# be initialized and a warning will be issued. +# Values: 0=only seed from /dev/[u]random, 1=also seed from weak randomness + +#allow_weak_random_seed=1 + + + +# INCLUDE CONFIG FILE +# This directive allows you to include definitions from an external config file. + +#include=<somefile.cfg> + + + +# INCLUDE CONFIG DIRECTORY +# This directive allows you to include definitions from config files (with a +# .cfg extension) in one or more directories (with recursion). + +#include_dir=<somedirectory> +#include_dir=<someotherdirectory> + + + +# COMMAND DEFINITIONS +# Command definitions that this daemon will run. Definitions +# are in the following format: +# +# command[<command_name>]=<command_line> +# +# When the daemon receives a request to return the results of <command_name> +# it will execute the command specified by the <command_line> argument. +# +# Unlike Nagios, the command line cannot contain macros - it must be +# typed exactly as it should be executed.
+# +# Note: Any plugins that are used in the command lines must reside +# on the machine that this daemon is running on! The examples below +# assume that you have plugins installed in a /usr/local/nagios/libexec +# directory. Also note that you will have to modify the definitions below +# to match the argument format the plugins expect. Remember, these are +# examples only! + +# The following examples use hardcoded command arguments... + +command[check_nrpe]=/bin/date +command[check_users]=/usr/lib64/nagios/plugins/check_users -w 5 -c 10 +command[check_load]=/usr/lib64/nagios/plugins/check_load -w 15,10,5 -c 30,25,20 +command[check_hosted_load]=/usr/lib64/nagios/plugins/check_load -w 35,30,25 -c 70,60,50 +command[check_raid]=/usr/lib64/nagios/plugins/check_raid.py +command[check_disk_/]=/usr/lib64/nagios/plugins/check_disk -w 15% -c 10% -p / +command[check_disk_/u01]=/usr/lib64/nagios/plugins/check_disk -w 15% -c 10% -p /u01 +command[check_disk_/mnt/koji]=/usr/lib64/nagios/plugins/check_disk -w 10% -c 5% -p /mnt/koji +command[check_disk_/boot]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /boot +command[check_disk_/git]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /git +command[check_disk_/postgreslogs]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /postgreslogs +command[check_disk_/srv]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv +command[check_disk_/srv/diskimages]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv/diskimages +command[check_disk_/srv/buildmaster]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv/buildmaster +command[check_disk_/srv/taskotron]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /srv/taskotron +command[check_disk_/var/lib64/mock]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p /var/lib/mock +command[check_disk_/var/log]=/usr/lib64/nagios/plugins/check_disk -w 15% -c 10% -p /var/log +command[check_disk_/srv/cache/lookaside]=/usr/lib64/nagios/plugins/check_disk -w 20% 
-c 10% -p /srv/cache/lookaside +command[check_zombie_procs]=/usr/lib64/nagios/plugins/check_procs -w 15 -c 25 -s Z +command[check_total_procs]=/usr/lib64/nagios/plugins/check_procs -w 900 -c 1000 +command[check_swap]=/usr/lib64/nagios/plugins/check_swap -w 15% -c 10% +command[check_temp]=/usr/lib64/nagios/plugins/check_ipmi -t +command[check_fans]=/usr/lib64/nagios/plugins/check_ipmi -f +command[check_mirrorlist_cache]=/usr/lib64/nagios/plugins/check_file_age -w 86400 -c 129600 -f /var/lib/mirrormanager/mirrorlist_cache.pkl +command[check_mysql_backup]=/usr/lib64/nagios/plugins/check_file_age -w 86400 -c 129600 -f /backups/fpo-mediawiki.xz +command[check_pgsql_koji_backup]=/usr/lib64/nagios/plugins/check_file_age -w 86400 -c 129600 -f /backups/db04.phx2.fedoraproject.org/koji.db +command[check_pgsql_backup]=/usr/lib64/nagios/plugins/check_file_age -w 86400 -c 129600 -f /backups/db-fas01.phx2.fedoraproject.org/fas2.db +command[check_puppetmaster]=/usr/lib64/nagios/plugins/check_procs -c 8:8 -a '/usr/bin/ruby /usr/sbin/puppetmasterd' -u puppet +command[check_supervisor]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/python /usr/bin/supervisord' -u root +command[check_lock]=/usr/lib64/nagios/plugins/check_lock +command[check_lock_file_age]=/usr/lib64/nagios/plugins/check_lock_file_age -w 1 -c 5 -f /var/lock/fedora-ca/lock +command[check_nagios]=/usr/lib64/nagios/plugins/check_nagios -e 5 -F /var/log/nagios/status.dat -C /usr/sbin/nagios +command[check_auditd]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'auditd' -u root +command[check_cron]=/usr/lib64/nagios/plugins/check_procs -c 1:10 -C 'crond' -u root +command[check_varnish_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:2 -C 'varnishd' -u varnish +command[check_saslauthd]=/usr/lib64/nagios/plugins/check_procs -c 1:10 -C 'saslauthd' -u root +command[check_readonly_fs]=/usr/lib64/nagios/plugins/check_readonly_fs +command[check_postfix_queue]=/usr/lib64/nagios/plugins/check_postfix_queue -w 2 -c 5 
+command[check_merged_file_age]=/usr/lib64/nagios/plugins/check_file_age -w 120 -c 300 /var/log/merged/messages.log +command[check_unbound_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'unbound' -u unbound +command[check_fedmsg_relay_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-relay' -u fedmsg +command[check_fedmsg_hub_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u fedmsg +command[check_fedmsg_gateway_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-gateway' -u fedmsg +command[check_fedmsg_irc_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-irc' -u fedmsg +command[check_fedmsg_tweet_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-tweet' -u fedmsg +command[check_fedmsg_masher_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u apache +command[check_supybot_fedmsg_plugin]=/usr/lib64/nagios/plugins/check_supybot_plugin -t fedmsg +command[check_haproxy_conns]=/usr/lib64/nagios/plugins/check_haproxy_conns.py +command[check_haproxy_mirrorlist]=/usr/lib64/nagios/plugins/check_haproxy_mirrorlist.py +command[check_redis_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'redis-server' -u redis +command[check_autocloud_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'python' -a 'autocloud_job.py' -u root +command[check_openvpn_link]=/usr/lib64/nagios/plugins/check_ping -H 192.168.1.41 -w 375.0,20% -c 500,60% +command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect +command[check_koschei_polling_proc]=/usr/lib64/nagios/plugins/check_procs -s RSD -u koschei -C koschei-polling -c 1:1 +command[check_koschei_resolver_proc]=/usr/lib64/nagios/plugins/check_procs -s RSD -u koschei -C koschei-resolve -c 1:1 +command[check_koschei_scheduler_proc]=/usr/lib64/nagios/plugins/check_procs -s RSD -u koschei -C koschei-schedul -c 1:1 
+command[check_koschei_watcher_proc]=/usr/lib64/nagios/plugins/check_procs -s RSD -u koschei -C koschei-watcher -c 1:1 + +# The following are fedmsg/datanommer checks to be run on busgateway01. +# They check for the time since the latest message in any particular category. +# The first number is the seconds elapsed until we should raise a warning. +# The second number is the seconds elapsed until we should raise an error. +# For your reference: +# 4 hours -> 14400 +# 1 day -> 86400 +# 3 days -> 259200 +# 1 week -> 604800 +# 3 weeks -> 1814400 +# 1 month -> 2628000 +# 3 months -> 7884000 +command[check_datanommer_buildsys]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py buildsys 14400 86400 +command[check_datanommer_git]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py git 86400 604800 +command[check_datanommer_bodhi]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py bodhi 86400 604800 +command[check_datanommer_wiki]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py wiki 259200 1814400 +command[check_datanommer_compose]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py compose 259200 1814400 +command[check_datanommer_meetbot]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py meetbot 604800 2628000 +command[check_datanommer_fas]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py fas 1814400 2628000 +command[check_datanommer_pkgdb]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py pkgdb 1814400 2628000 +command[check_datanommer_fedoratagger]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py fedoratagger 2628000 7884000 +command[check_datanommer_planet]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py planet 2628000 7884000 +command[check_datanommer_copr]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py copr 21600 86400 +command[check_datanommer_trac]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py trac 86400 259200 
+command[check_datanommer_askbot]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py askbot 86400 259200 +command[check_datanommer_fedbadges]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py fedbadges 86400 259200 +command[check_datanommer_nuancier]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py nuancier 23652000 31536000 +command[check_datanommer_fedocal]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py fedocal 7884000 23652000 +command[check_datanommer_ansible]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py ansible 432000 604800 +command[check_datanommer_anitya]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py anitya 604800 1814400 +command[check_datanommer_autocloud]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py autocloud 604800 1814400 + +# These are not actually finished and deployed yet +command[check_datanommer_mailman]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py mailman 14400 86400 +command[check_datanommer_summershum]=/usr/lib64/nagios/plugins/check_datanommer_timesince.py summershum 604800 1814400 + +# Fedmsg checks for consumers and producers +command[check_fedmsg_cp_busgateway_hub]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Nommer MonitoringProducer +command[check_fedmsg_cp_busgateway_relay]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer +command[check_fedmsg_cp_busgateway_gateway]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-gateway GatewayConsumer MonitoringProducer +command[check_fedmsg_cp_app]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer +command[check_fedmsg_cp_value]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-irc IRCBotConsumer MonitoringProducer +command[check_fedmsg_cp_pkgs]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub GenACLsConsumer MonitoringProducer 
+command[check_fedmsg_cp_summershum]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub SummerShumConsumer MonitoringProducer +command[check_fedmsg_cp_badges_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedoraBadgesConsumer MonitoringProducer +command[check_fedmsg_cp_notifs_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FMNConsumer DigestProducer ConfirmationProducer MonitoringProducer +command[check_fedmsg_cp_bugzilla2fedmsg]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py moksha-hub BugzillaConsumer MonitoringProducer +command[check_fedmsg_cp_fedimg_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedimgConsumer MonitoringProducer +command[check_fedmsg_cp_hotness_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugzillaTicketFiler MonitoringProducer +command[check_fedmsg_cp_bodhi_backend01_hub]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Masher MonitoringProducer +command[check_fedmsg_cp_bodhi_backend02_hub]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub UpdatesHandler MonitoringProducer +command[check_fedmsg_cp_autocloud_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub AutoCloudConsumer MonitoringProducer +command[check_fedmsg_cp_packages_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub CacheInvalidator MonitoringProducer +command[check_fedmsg_cp_bugyou_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugyouConsumer MonitoringProducer +command[check_fedmsg_cp_pdc_backend]=/usr/lib64/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub PDCUpdater MonitoringProducer + +command[check_fedmsg_cexceptions_busgateway_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Nommer 1 10 
+command[check_fedmsg_cexceptions_busgateway_relay]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10 +command[check_fedmsg_cexceptions_busgateway_gateway]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-gateway GatewayConsumer 1 10 +command[check_fedmsg_cexceptions_app]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10 +command[check_fedmsg_cexceptions_value]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-irc IRCBotConsumer 1 10 +command[check_fedmsg_cexceptions_pkgs]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub GenACLsConsumer 1 10 +command[check_fedmsg_cexceptions_summershum]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub SummerShumConsumer 1 10 +command[check_fedmsg_cexceptions_badges_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedoraBadgesConsumer 1 10 +command[check_fedmsg_cexceptions_notifs_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FMNConsumer 1 10 +command[check_fedmsg_cexceptions_bugzilla2fedmsg]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py moksha-hub BugzillaConsumer 1 10 +command[check_fedmsg_cexceptions_fedimg_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedimgConsumer 1 10 +command[check_fedmsg_cexceptions_hotness_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugzillaTicketFiler 1 10 +command[check_fedmsg_cexceptions_bodhi_backend01_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Masher 1 10 +command[check_fedmsg_cexceptions_bodhi_backend02_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub UpdatesHandler 1 10 +command[check_fedmsg_cexceptions_autocloud_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub 
AutoCloudConsumer 1 10 +command[check_fedmsg_cexceptions_packages_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub CacheInvalidator 1 10 +command[check_fedmsg_cexceptions_bugyou_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugyouConsumer 1 10 +command[check_fedmsg_cexceptions_pdc_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub PDCUpdater 1 10 + +command[check_fedmsg_cbacklog_busgateway_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Nommer 500 1000 +command[check_fedmsg_cbacklog_busgateway_relay]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50 +command[check_fedmsg_cbacklog_busgateway_gateway]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-gateway GatewayConsumer 10 50 +command[check_fedmsg_cbacklog_app]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50 +command[check_fedmsg_cbacklog_value]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-irc IRCBotConsumer 10 50 +command[check_fedmsg_cbacklog_pkgs]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub GenACLsConsumer 10 50 +command[check_fedmsg_cbacklog_summershum]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub SummerShumConsumer 10 50 +command[check_fedmsg_cbacklog_badges_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedoraBadgesConsumer 10 50 +command[check_fedmsg_cbacklog_notifs_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 10 50 +command[check_fedmsg_cbacklog_bugzilla2fedmsg]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100 +command[check_fedmsg_cbacklog_fedimg_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedimgConsumer 2000 5000 
+command[check_fedmsg_cbacklog_hotness_backend]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugzillaTicketFiler 100 500 +command[check_fedmsg_cbacklog_bodhi_backend01_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Masher 500 1000 +command[check_fedmsg_cbacklog_bodhi_backend02_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub UpdatesHandler 500 1000 +command[check_fedmsg_cbacklog_autocloud_backend_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub AutoCloudConsumer 500 1000 +command[check_fedmsg_cbacklog_packages_backend_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub CacheInvalidator 5000 10000 +command[check_fedmsg_cbacklog_bugyou_backend_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugyouConsumer 5000 10000 +command[check_fedmsg_cbacklog_pdc_backend_hub]=/usr/lib64/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub PDCUpdater 10000 20000 + +command[check_fedmsg_fmn_digest_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 90 600 +command[check_fedmsg_fmn_confirm_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300 + +# The following are 'action commands' where by an actual action is performed +# like restarting httpd + +command[service_httpd_restart]=/usr/bin/sudo /sbin/service httpd restart +command[service_httpd_reload]=/usr/bin/sudo /sbin/service httpd reload +command[service_httpd_graceful]=/usr/bin/sudo /sbin/service httpd graceful + +# Used to restart rsyslog on log02 when it goes wonky +command[service_rsyslog_restart]=/usr/bin/sudo /sbin/service rsyslog restart + +# The following examples allow user-supplied arguments and can +# only be used if the NRPE daemon was compiled with support for +# command arguments *AND* the dont_blame_nrpe directive in this +# config file is set to '1'... 
+ +#command[check_users]=/usr/lib64/nagios/plugins/check_users -w $ARG1$ -c $ARG2$ +#command[check_load]=/usr/lib64/nagios/plugins/check_load -w $ARG1$ -c $ARG2$ +#command[check_disk]=/usr/lib64/nagios/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$ +#command[check_procs]=/usr/lib64/nagios/plugins/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$ + diff --git a/roles/nagios_server/files/nagios/plugins/check_datanommer_timesince.py b/roles/nagios_server/files/nagios/plugins/check_datanommer_timesince.py new file mode 100755 index 0000000000..d4fcef1725 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_datanommer_timesince.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +""" NRPE check for datanommer/fedmsg health. +Given a category like 'bodhi', 'buildsys', or 'git', return an error if +datanommer hasn't seen a message of that type in such and such time. + +Requires: python-dateutil + +Usage: + + $ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH + +:Author: Ralph Bean + +""" + +import dateutil.relativedelta +import subprocess +import sys +import json + + +def query_timesince(category): # Run datanommer-latest for `category` and return the first element of its JSON output as a float. + cmd = 'datanommer-latest --category %s --timesince' % category # category is interpolated as-is; cmd.split() below turns the string into an argv list + sys.stderr.write("Running %r\n" % cmd) + process = subprocess.Popen(cmd.split(), shell=False, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() # stderr is captured but not used afterwards + data = json.loads(stdout) + return float(data[0]) # main() treats this value as a number of seconds + + +def main(): # Compare the time since the last message with both thresholds, print one status line and exit 0, 1 or 2. + category, warning_threshold, critical_threshold = sys.argv[-3:] # the last three argv entries, in this order + timesince = query_timesince(category) + warning_threshold = int(warning_threshold) + critical_threshold = int(critical_threshold) + + time_strings = [] + rd = dateutil.relativedelta.relativedelta(seconds=timesince) + for denomination in ['years', 'months', 'days', 'hours', 'minutes', 'seconds']: + value = getattr(rd, denomination, 0) + if value: # zero-valued units are left out of the message + time_strings.append("%d %s" % (value, denomination)) + + string = ", ".join(time_strings) + reason = "datanommer has not seen a %r
message in %s" % (category, string) + + if timesince > critical_threshold: # critical is tested first, so it wins when both thresholds are exceeded + print "CRIT: ", reason + sys.exit(2) + + if timesince > warning_threshold: + print "WARN: ", reason + sys.exit(1) + + print "OK: ", reason + sys.exit(0) + + +if __name__ == '__main__': + try: + main() + except Exception as e: # any exception raised by main() is reported as UNKNOWN with exit status 3 + print "UNKNOWN: ", str(e) + sys.exit(3) diff --git a/roles/nagios_server/files/nagios/plugins/check_dig_ssl b/roles/nagios_server/files/nagios/plugins/check_dig_ssl new file mode 100644 index 0000000000..0769733a8b --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_dig_ssl @@ -0,0 +1,44 @@ +#!/bin/bash +# +# 29-02-2012 +# Author: Christos Triantafyllidis +# + +# Default values +STUNNEL_EXEC=${STUNNEL_EXEC:-/usr/bin/stunnel} +CHECK_DIG_EXEC=${CHECK_DIG_EXEC:-/usr/lib64/nagios/plugins/check_dig} +lport=8443 # local port used for stunnel's accept= and for check_dig's -p; the -L option overrides it + +ARGS="" +while getopts "L:H:p:l:T:a:A:w:c:t:v" options +do + case $options in + L ) lport=$OPTARG ;; + H ) host=$OPTARG ;; + p ) port=$OPTARG ;; + * ) ARGS="$ARGS -$options $OPTARG";; # every other option is appended to ARGS, which is passed to check_dig below + esac +done + +# Create a ssl tunnel to the request socket +TMPFILE=`mktemp /tmp/$(basename $0)_${host}_${port}_XXXXX` +echo " +client = yes +verify = 0 +syslog = no +pid=$TMPFILE.pid +[${host}_${port}] +accept=${lport} +connect=${host}:${port} +" > $TMPFILE + +$STUNNEL_EXEC $TMPFILE # start stunnel with the config file written above + +# Use check_dig via the stunnel +$CHECK_DIG_EXEC -H localhost -p ${lport} $ARGS +e_status=$?
+ +# cleanup +kill -9 `cat $TMPFILE.pid` +rm -f $TMPFILE $TMPFILE.pid +exit $e_status diff --git a/roles/nagios_server/files/nagios/plugins/check_email_delivery_epn b/roles/nagios_server/files/nagios/plugins/check_email_delivery_epn new file mode 100644 index 0000000000..a097f20988 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_email_delivery_epn @@ -0,0 +1,472 @@ +#!/usr/bin/perl +use strict; +my $VERSION = '0.6.2'; +my $COPYRIGHT = 'Copyright (C) 2005-2008 Jonathan Buhacoff '; +my $LICENSE = 'http://www.gnu.org/licenses/gpl.txt'; +my %status = ( 'OK' => 0, 'WARNING' => 1, 'CRITICAL' => 2, 'UNKNOWN' => 3 ); + +# look for required modules +exit $status{UNKNOWN} unless load_modules(qw/Getopt::Long/); + +# get options from command line +Getopt::Long::Configure("bundling"); +my $verbose = 0; +my $help = ""; +my $help_usage = ""; +my $show_version = ""; +my $host = ""; +my $smtp_server = ""; +my $smtp_port = ""; +my $imap_server = ""; +my $smtp_username = ""; +my $smtp_password = ""; +my $smtp_tls = ""; +my $imap_port = ""; +my $imap_username = ""; +my $imap_password = ""; +my $username = ""; +my $password = ""; +my $ssl = ""; +my $imap_ssl = ""; +my $mailto = ""; +my $mailfrom = ""; +my @header = (); +my $body = ""; +my $warnstr = ""; +my $critstr = ""; +my $waitstr = ""; +my $delay_warn = 95; +my $delay_crit = 300; +my $smtp_warn = 15; +my $smtp_crit = 30; +my $imap_warn = 15; +my $imap_crit = 30; +my $timeout = ""; +my @alert_plugins = (); +my $imap_interval = 5; +my $imap_retries = 5; +my @plugins = (); +my @token_formats = (); +my $tokenfile = ""; +my $default_crit = 30; +my $default_warn = 15; +my $default_wait = 5; +my $default_timeout = 60; +#my $libexec = "/usr/local/nagios/libexec"; +my $libexec = "/usr/lib64/nagios/plugins"; +my $ok; +$ok = Getopt::Long::GetOptions( + "V|version"=>\$show_version, + "v|verbose+"=>\$verbose,"h|help"=>\$help,"usage"=>\$help_usage, + "w|warning=s"=>\$warnstr,"c|critical=s"=>\$critstr, 
"t|timeout=s"=>\$timeout, + "libexec=s"=>\$libexec, + # plugin settings + "p|plugin=s"=>\@plugins, "T|token=s"=>\@token_formats, + "A|alert=i"=>\@alert_plugins, + "F|file=s"=>\$tokenfile, + # common settings + "H|hostname=s"=>\$host, + "U|username=s"=>\$username,"P|password=s"=>\$password, + # smtp settings + "smtp-server=s"=>\$smtp_server,"smtp-port=i"=>\$smtp_port, + "mailto=s"=>\$mailto, "mailfrom=s",\$mailfrom, + "header=s"=>\@header, "body=s"=>\$body, + # smtp-tls settings + "smtptls!"=>\$smtp_tls, + "smtp-username=s"=>\$smtp_username,"smtp-password=s"=>\$smtp_password, + # delay settings + "wait=s"=>\$waitstr, + # imap settings + "imap-server=s"=>\$imap_server,"imap-port=i"=>\$imap_port, + "imap-username=s"=>\$imap_username,"imap-password=s"=>\$imap_password, + "imap-check-interval=i"=>\$imap_interval,"imap-retries=i"=>\$imap_retries, + "imapssl!"=>\$imap_ssl, + ); + +if( $show_version ) { + print "$VERSION\n"; + if( $verbose ) { + print "Warning threshold: $delay_warn seconds\n"; + print "Critical threshold: $delay_crit seconds\n"; + print "Default wait: $default_wait seconds\n"; + print "Default timeout: $default_timeout seconds\n"; + } + exit $status{UNKNOWN}; +} + +if( $help ) { + exec "perldoc", $0 or print "Try `perldoc $0`\n"; + exit $status{UNKNOWN}; +} + +if( $host ) { + $smtp_server = $host if $smtp_server eq ""; + $imap_server = $host if $imap_server eq ""; +} + +if( $username ) { + $smtp_username = $username if $smtp_username eq ""; + $imap_username = $username if $imap_username eq ""; +} + +if( $password ) { + $smtp_password = $password if $smtp_password eq ""; + $imap_password = $password if $imap_password eq ""; +} + +if( $ssl ) { + $imap_ssl = $ssl if $imap_ssl eq ""; + $smtp_tls = $ssl if $smtp_tls eq ""; +} + +if( $help_usage + || + ( + scalar(@plugins) == 0 + && + ( + $smtp_server eq "" || $mailto eq "" || $mailfrom eq "" + || $imap_server eq "" || $username eq "" || $password eq "" + ) + ) + ) { + print "Usage 1: $0 -H host \n\t". 
# determine which other options to include
#
# Builds the option strings that are interpolated into the backtick command
# lines for the SMTP sender and IMAP receiver plugins further down.
#
# The SMTP and IMAP credentials come from the service-specific variables
# ($smtp_username, $imap_password, ...). Those are already defaulted from
# -U/-P earlier in the script, so passing them here honours
# --smtp-username/--smtp-password/--imap-username/--imap-password instead
# of always sending the common -U/-P values.
#
# Every value is single-quoted for the shell, because the resulting string
# is run through backticks; a password containing spaces, '$' or quotes
# would otherwise be mangled or interpreted by the shell.
my $shell_quote = sub {
	my ($value) = @_;
	$value =~ s/'/'\\''/g;	# close quote, escaped quote, reopen quote
	return "'$value'";
};
my $smtp_options = "";
$smtp_options .= "-H ".$shell_quote->($smtp_server)." " if defined $smtp_server and $smtp_server ne "";
$smtp_options .= "-p ".$shell_quote->($smtp_port)." " if defined $smtp_port and $smtp_port ne "";
$smtp_options .= "--tls " if defined $smtp_tls and $smtp_tls;
$smtp_options .= "-U ".$shell_quote->($smtp_username)." " if defined $smtp_username and $smtp_username ne "";
$smtp_options .= "-P ".$shell_quote->($smtp_password)." " if defined $smtp_password and $smtp_password ne "";
$smtp_options .= "--mailto ".$shell_quote->($mailto)." " if defined $mailto and $mailto ne "";
$smtp_options .= "--mailfrom ".$shell_quote->($mailfrom)." " if defined $mailfrom and $mailfrom ne "";
my $imap_options = "";
$imap_options .= "-H ".$shell_quote->($imap_server)." " if defined $imap_server and $imap_server ne "";
$imap_options .= "-p ".$shell_quote->($imap_port)." " if defined $imap_port and $imap_port ne "";
$imap_options .= "-U ".$shell_quote->($imap_username)." " if defined $imap_username and $imap_username ne "";
$imap_options .= "-P ".$shell_quote->($imap_password)." " if defined $imap_password and $imap_password ne "";
$imap_options .= "--ssl " if defined $imap_ssl and $imap_ssl;
fracture framework freedom frighten gazelle Geiger glitter glucose goggles goldfish gremlin guidance hamlet highchair hockey indoors indulge inverse involve island jawbone keyboard kickoff kiwi klaxon locale lockup merit minnow miser Mohawk mural music necklace Neptune newborn nightbird Oakland obtuse offload optic orca payday peachy pheasant physique playhouse Pluto preclude prefer preshrunk printer prowler pupil puppy python quadrant quiver quota ragtime ratchet rebirth reform regain reindeer rematch repay retouch revenge reward rhythm ribcage ringbolt robust rocker ruffled sailboat sawdust scallion scenic scorecard Scotland seabird select sentence shadow shamrock showgirl skullcap skydive slingshot slowdown snapline snapshot snowcap snowslide solo southward soybean spaniel spearhead spellbind spheroid spigot spindle spyglass stagehand stagnate stairway standard stapler steamship sterling stockman stopwatch stormy sugar surmount suspense sweatband swelter tactics talon tapeworm tempest tiger tissue tonic topmost tracker transit trauma treadmill Trojan trouble tumor tunnel tycoon uncut unearth unwind uproot upset upshot vapor village virus Vulcan waffle wallet watchword wayside willow woodlark Zulu/; +my @pgp_odd = qw/adroitness adviser aftermath aggregate alkali almighty amulet amusement antenna applicant Apollo armistice article asteroid Atlantic atmosphere autopsy Babylon backwater barbecue belowground bifocals bodyguard bookseller borderline bottomless Bradbury bravado Brazilian breakaway Burlington businessman butterfat Camelot candidate cannonball Capricorn caravan caretaker celebrate cellulose certify chambermaid Cherokee Chicago clergyman coherence combustion commando company component concurrent confidence conformist congregate consensus consulting corporate corrosion councilman crossover crucifix cumbersome customer Dakota decadence December decimal designing detector detergent determine dictator dinosaur direction disable disbelief disruptive distortion 
document embezzle enchanting enrollment enterprise equation equipment escapade Eskimo everyday examine existence exodus fascinate filament finicky forever fortitude frequency gadgetry Galveston getaway glossary gossamer graduate gravity guitarist hamburger Hamilton handiwork hazardous headwaters hemisphere hesitate hideaway holiness hurricane hydraulic impartial impetus inception indigo inertia infancy inferno informant insincere insurgent integrate intention inventive Istanbul Jamaica Jupiter leprosy letterhead liberty maritime matchmaker maverick Medusa megaton microscope microwave midsummer millionaire miracle misnomer molasses molecule Montana monument mosquito narrative nebula newsletter Norwegian October Ohio onlooker opulent Orlando outfielder Pacific pandemic Pandora paperweight paragon paragraph paramount passenger pedigree Pegasus penetrate perceptive performance pharmacy phonetic photograph pioneer pocketful politeness positive potato processor provincial proximate puberty publisher pyramid quantity racketeer rebellion recipe recover repellent replica reproduce resistor responsive retraction retrieval retrospect revenue revival revolver sandalwood sardonic Saturday savagery scavenger sensation sociable souvenir specialist speculate stethoscope stupendous supportive surrender suspicious sympathy tambourine telephone therapist tobacco tolerance tomorrow torpedo tradition travesty trombonist truncated typewriter ultimate undaunted underfoot unicorn unify universe unravel upcoming vacancy vagabond vertigo Virginia visitor vocalist voyager warranty Waterloo whimsical Wichita Wilmington Wyoming yesteryear Yucatan/; +my %formats = ( + 'a' => sub { pick_random(@alpha) }, + 'n' => sub { pick_random(@numeric) }, + 'c' => sub { pick_random(@alpha,@numeric) }, + 'h' => sub { pick_random(@hex) }, + 'U' => sub { time }, + 'X' => sub { pick_random(@pgp_even) }, + 'Y' => sub { pick_random(@pgp_odd) }, +); +if( scalar(@plugins) ) { + # scan the plugin commands for use of 
tokens to count how many we need + my $token_count = 0; + foreach my $p (@plugins) { + my @matches = sort ($p =~ m/%TOKEN(\d+)%/g); + my $max = pop @matches; + $token_count = $max if defined($max) && $max > $token_count; + } + # create the tokens + my @tokens = (); + foreach my $t (1..$token_count) { + my $format = shift @token_formats; + $format = "U-X-Y" unless $format; + my @format_characters = split(//, $format); + my $token = ""; + foreach my $c (@format_characters) { + if( defined $formats{$c} ) { + $token .= &{$formats{$c}}; + } + else { + $token .= $c; + } + } + push @tokens, $token; + } + # substitute the tokens into each plugin command + foreach my $p (@plugins) { + foreach my $t (1..$token_count) { + my $token = $tokens[$t-1]; + $p =~ s/%TOKEN$t%/$token/g; + } + } + # mark plugins that are allowed to generate alerts. default behavior is to alert for all plugins. + my %alert_plugins = (); + if( scalar(@alert_plugins) > 0 ) { + %alert_plugins = map { $_ => 1 } @alert_plugins; + } + else { + %alert_plugins = map { $_ => 1 } (1..scalar(@plugins)); + } + # run each plugin and store its output in a report + $time_start = time; + my $i = 0; + foreach my $p( @plugins ) { + $i++; + my $plugin_timeout = shift(@alarm_times) || $default_timeout; + # run the plugin + eval { + local $SIG{ALRM} = sub { die "exceeded timeout $plugin_timeout seconds\n" }; # NB: \n required, see `perldoc -f alarm` + alarm $plugin_timeout; + my $output = `$p`; + chomp $output; + if( $output !~ m/OK|WARNING|CRITICAL/ ) { + print "EMAIL DELIVERY UNKNOWN - plugin $i error: $output\n"; + print "Plugin $i: $p\n" if $verbose; + # record tokens in a file if option is enabled + record_tokens($tokenfile,\@tokens,$time_start,undef,'UNKNOWN',$i,$output) if $tokenfile; + exit $status{UNKNOWN}; + } + if( $output =~ m/CRITICAL/ && $alert_plugins{$i} ) { + print "EMAIL DELIVERY CRITICAL - plugin $i failed: $output\n"; + print "Plugin $i: $p" if $verbose; + # record tokens in a file if option is enabled + 
record_tokens($tokenfile,\@tokens,$time_start,undef,'CRITICAL',$i,$output) if $tokenfile; + exit $status{CRITICAL}; + } + if( $output =~ m/WARNING/ && $alert_plugins{$i} ) { + print "EMAIL DELIVERY WARNING - plugin $i warning: $output\n"; + print "Plugin $i: $p\n" if $verbose; + # record tokens in a file if option is enabled + record_tokens($tokenfile,\@tokens,$time_start,undef,'WARNING',$i,$output) if $tokenfile; + exit $status{WARNING}; + } + $report->{"plugin".$i} = $output; + alarm 0; + }; + if( $@ && $alert_plugins{$i} ) { + print "EMAIL DELIVERY CRITICAL - Could not run plugin $i: $@\n"; + print "Plugin $i: $p\n" if $verbose; + exit $status{CRITICAL}; + } + # if this wasn't the last plugin, wait before continuing + if( $i < scalar(@plugins) ) { + my $wait = shift(@wait_times) || $default_wait; + sleep $wait; + } + # compatibility with the "not using plugins" method... pretend to calculate the total round trip time (the delay) using data from the plugins ... + $report->{max} = 0; + $report->{delay} = 0; + } + # register the list of reports + foreach my $r ( 1..scalar(@plugins)) { + push @report_plugins, "plugin".$r; + } + # record tokens in a file if option is enabled + my $tmp_long_report = join(", ", map { "$_: $report->{$_}" } @report_plugins ) if $tokenfile; + record_tokens($tokenfile,\@tokens,$time_start,time,'OK',scalar(@plugins),$tmp_long_report) if $tokenfile; +} +else { + # not using plugins. 
+ $time_start = time; + + # send email via SMTP + my $id = $time_start; # XXX should include localhost name maybe or some random number in case the same mailbox is used for multiple delivery tests + + my $smtp_plugin = "$libexec/check_smtp_send"; + $smtp_plugin = "$libexec/check_smtp_send.pl" unless -e $smtp_plugin; + my $smtp_timeout = shift(@alarm_times) || $default_timeout; + eval { + local $SIG{ALRM} = sub { die "exceeded timeout $smtp_timeout seconds\n" }; # NB: \n required, see `perldoc -f alarm` + alarm $smtp_timeout; + my $smtp = `$smtp_plugin $smtp_options --header 'Subject: Nagios Message SMTP $smtp_server ID $id.' --body 'Nagios Email Delivery Plugin\n$body' $smtp_thresholds`; + if( $smtp !~ m/OK|WARNING|CRITICAL/ ) { + print "EMAIL DELIVERY UNKNOWN - smtp unknown: $smtp\n"; + exit $status{UNKNOWN}; + } + if( $smtp =~ m/CRITICAL/ ) { + print "EMAIL DELIVERY CRITICAL - smtp failed: $smtp\n"; + exit $status{CRITICAL}; + } + chomp $smtp; + $report->{smtp} = $smtp; + alarm 0; + }; + if( $@ ) { + print "EMAIL DELIVERY CRITICAL - Could not connect to SMTP server $smtp_server: $@\n"; + exit $status{CRITICAL}; + } + + # wait before checking the delivery + $wait = shift(@wait_times) || $default_wait; + sleep $wait; + + # check email via IMAP + my $imap_plugin = "$libexec/check_imap_receive"; + $imap_plugin = "$libexec/check_imap_receive.pl" unless -e $imap_plugin; + my $imap_timeout = shift(@alarm_times) || $default_timeout; + eval { + local $SIG{ALRM} = sub { die "exceeded timeout $imap_timeout seconds\n" }; # NB: \n required, see `perldoc -f alarm` + alarm $imap_timeout; + my $imap = `$imap_plugin $imap_options -s SUBJECT -s 'Nagios Message SMTP $smtp_server ID' --capture-max 'Nagios Message SMTP $smtp_server ID (\\d+)' --nodelete-captured $imap_thresholds`; + if( $imap !~ m/OK|WARNING|CRITICAL/ ) { + print "EMAIL DELIVERY UNKNOWN - imap unknown: $imap\n"; + exit $status{UNKNOWN}; + } + if( $imap =~ m/CRITICAL/ ) { + print "EMAIL DELIVERY CRITICAL - imap 
failed: $imap\n"; + exit $status{CRITICAL}; + } + if( $imap =~ m/ (\d+) max/ ) { + my $last_received = $1; + $report->{max} = $1; + my $delay = time - $last_received; + $report->{delay} = $delay; + } + chomp $imap; + $report->{imap} = $imap; + alarm 0; + }; + if( $@ ) { + print "EMAIL DELIVERY CRITICAL - Could not connect to IMAP server $imap_server: $@\n"; + exit $status{CRITICAL}; + } + # register the list of reports + push @report_plugins, ("smtp","imap"); +} + + +# calculate elapsed time and issue warnings +my $time_end = time; +my $elapsedtime = $time_end - $time_start; +$report->{seconds} = $elapsedtime; + +my @warning = (); +my @critical = (); + +push @warning, "most recent received $report->{delay} seconds ago" if( defined($report->{delay}) && $report->{delay} > $delay_warn ); +push @critical, "most recent received $report->{delay} seconds ago" if( defined($report->{delay}) && $report->{delay} > $delay_crit ); +push @warning, "no emails found" if( !defined($report->{delay}) ); + +# print report and exit with known status +my $short_report = $report->text(qw/seconds delay/); +my $long_report = join("", map { "$_: $report->{$_}\n" } @report_plugins ); +if( scalar @critical ) { + my $alerts = join(", ", @critical); + print "EMAIL DELIVERY CRITICAL - $alerts; $short_report\n"; + print $long_report if $verbose; + exit $status{CRITICAL}; +} +if( scalar @warning ) { + my $alerts = join(", ", @warning); + print "EMAIL DELIVERY WARNING - $alerts; $short_report\n"; + print $long_report if $verbose; + exit $status{WARNING}; +} +print "EMAIL DELIVERY OK - $short_report\n"; +print $long_report if $verbose; +exit $status{OK}; + +# utility to load required modules. exits if unable to load one or more of the modules. 
# appends tokens and times to a tab-separated value file
#
# Arguments, in order:
#   $tokenfile   path of the file to append to; nothing happens if it is empty
#   $tokens      array reference holding the tokens to record
#   $time_start  start time written into every record
#   $time_end    end time, or undef (written as an empty field)
#   $status      status label, or undef (written as an empty field)
#   $plugin_num  plugin number, or undef (written as an empty field)
#   $output      plugin output, or undef (written as an empty field)
#
# One line is written per token. Recording is best-effort: if the file
# cannot be opened or closed, a message goes to STDERR and the plugin
# carries on, so the check result printed on STDOUT is not disturbed.
sub record_tokens {
	my ($tokenfile,$tokens,$time_start,$time_end,$status,$plugin_num,$output) = @_;
	return unless $tokenfile;
	my @tokens = @$tokens;
	$time_end = "" unless defined $time_end;
	$status = "" unless defined $status;
	$plugin_num = "" unless defined $plugin_num;
	$output = "" unless defined $output;
	# plugin output may span several lines or contain tabs; flatten it so
	# each record stays on one line with a fixed number of fields
	$output =~ s/[\t\r\n]+/ /g;
	print "saving ".scalar(@tokens)." tokens into $tokenfile\n" if $verbose;
	# three-argument open with a lexical handle: the file name is never
	# parsed for mode characters, and failure is reported instead of ignored
	my $fh;
	unless( open($fh, '>>', $tokenfile) ) {
		print STDERR "cannot append tokens to $tokenfile: $!\n";
		return;
	}
	foreach my $token (@tokens) {
		print {$fh} "$token\t$time_start\t$time_end\t$status\t$plugin_num\t$output\n";
	}
	close($fh) or print STDERR "error closing $tokenfile: $!\n";
	return;
}
# Nagios check: read one JSON status message from the fedmsg monitoring
# socket of SERVICE and compare the 'backlog' value of CONSUMER against the
# warning and critical thresholds.
#
# Usage: check_fedmsg_consumer_backlog.py SERVICE CONSUMER WARNING CRITICAL
#
# Exit codes: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN (bad arguments, missing
# socket, no message within the timeout, consumer missing or uninitialized,
# or any unexpected error).
try:
    # Say what is expected instead of "list index out of range" when
    # arguments are missing.
    if len(sys.argv) < 5:
        print("UNKNOWN - usage: %s SERVICE CONSUMER WARNING CRITICAL" % sys.argv[0])
        sys.exit(3)

    service = sys.argv[1]
    check_consumer = sys.argv[2]
    backlog_warning = int(sys.argv[3])
    backlog_critical = int(sys.argv[4])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    connect_to = "ipc:///%s" % fname
    ctx = zmq.Context()
    s = ctx.socket(zmq.SUB)
    s.connect(connect_to)
    s.setsockopt(zmq.SUBSCRIBE, b'')

    poller = zmq.Poller()
    poller.register(s, zmq.POLLIN)

    # Wait at most `timeout` milliseconds for a single status message.
    timeout = 10000

    events = dict(poller.poll(timeout))
    if s in events and events[s] == zmq.POLLIN:
        msg = json.loads(s.recv())
    else:
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)

    for consumer in msg['consumers']:
        if consumer['name'] != check_consumer:
            continue
        backlog = consumer['backlog']
        if backlog is None:
            # The exit status is UNKNOWN, so label the message accordingly.
            print('UNKNOWN: fedmsg consumer %s is not initialized' % consumer['name'])
            sys.exit(3)
        elif backlog > backlog_critical:
            print('CRITICAL: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
            sys.exit(2)
        elif backlog > backlog_warning:
            print('WARNING: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
            sys.exit(1)
        else:
            print('OK: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
            sys.exit(0)

    print("UNKNOWN: fedmsg consumer %s not found" % check_consumer)
    sys.exit(3)
except Exception as err:
    # sys.exit() raises SystemExit, which is not an Exception subclass, so
    # the exits above pass straight through this handler.
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
# Nagios check: read one JSON status message from the fedmsg monitoring
# socket of the service named in argv[1] and verify that every plugin named
# in the remaining arguments is present and reports itself as initialized.
#
# Exit codes: 0 when all named plugins are initialized, 2 when one is not
# initialized or not present, 3 for an empty list, a missing socket, a
# timeout or any unexpected error.
try:
    service = sys.argv[1]
    check_list = frozenset(sys.argv[2:])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if len(check_list) == 0:
        print("UNKNOWN - empty list of fedmsg consumers and producers to check")
        sys.exit(3)

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    ctx = zmq.Context()
    s = ctx.socket(zmq.SUB)
    s.connect("ipc:///%s" % fname)
    s.setsockopt(zmq.SUBSCRIBE, '')
    poller = zmq.Poller()
    poller.register(s, zmq.POLLIN)

    # Wait at most `timeout` milliseconds for a single status message.
    timeout = 10000
    events = dict(poller.poll(timeout))
    if not (s in events and events[s] == zmq.POLLIN):
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)
    msg = json.loads(s.recv())

    # Consumers are examined first, then producers; the first requested
    # plugin found uninitialized ends the check.
    for kind in ('consumer', 'producer'):
        for plugin in msg[kind + 's']:
            if plugin['name'] in check_list and not plugin['initialized']:
                print('ERROR: fedmsg %s %s is not initialized' % (kind, plugin['name']))
                sys.exit(2)

    # Every requested name must appear among the reported plugins.
    installed = [p['name'] for p in msg['producers'] + msg['consumers']]
    for item in check_list:
        if item not in installed:
            print('ERROR: %s not found among installed plugins' % item)
            sys.exit(2)

    print("OK: fedmsg consumer(s) and producer(s) initialized")
    sys.exit(0)

except Exception as err:
    # sys.exit() raises SystemExit, which is not an Exception subclass, so
    # the exits above pass straight through this handler.
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
+ +:Author: Ralph Bean +""" + +import socket +import sys + + +def _numeric(value): + """ Type casting utility """ + try: + return int(value) + except ValueError: + try: + return float(value) + except ValueError: + return value + + +def query(sockname="/var/run/haproxy-stat"): + """ Read stats from the haproxy socket and return a dict """ + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect("/var/run/haproxy-stat") + s.send('show info\n') + try: + response = s.recv(1024).strip() + lines = response.split('\n') + data = dict([map(str.strip, line.split(':')) for line in lines]) + data = dict([(k, _numeric(v)) for k, v in data.items()]) + return data + except Exception, e: + print str(e) + finally: + s.close() + + return None + + +def nagios_check(data): + """ Print warnings and return nagios exit codes. """ + + current = data['CurrConns'] + maxconn = data['Maxconn'] + percent = 100 * float(current) / float(maxconn) + details = "%.2f%% subscribed. %i current of %i maxconn." % ( + percent, current, maxconn, + ) + + if percent < 50: + print "HAPROXY SUBS OK: " + details + return 0 + + if percent < 75: + print "HAPROXY SUBS WARN: " + details + return 1 + + if percent <= 100: + print "HAPROXY SUBS CRIT: " + details + return 2 + + print "HAPROXY SUBS UNKNOWN: " + details + return 3 + + +if __name__ == '__main__': + try: + data = query(sockname="/var/run/haproxy-stat") + except Exception as e: + print "HAPROXY SUBS UNKNOWN: " + str(e) + sys.exit(3) + sys.exit(nagios_check(data)) diff --git a/roles/nagios_server/files/nagios/plugins/check_haproxy_mirrorlist.py b/roles/nagios_server/files/nagios/plugins/check_haproxy_mirrorlist.py new file mode 100755 index 0000000000..6ea3dec610 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_haproxy_mirrorlist.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +import socket +import sys + + +try: + + unixsocket="/var/run/haproxy-stat" + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(unixsocket) + 
s.send('show stat\n') + + try: + + output = s.recv(16384).strip().split('\n') + fields = output.pop(0).split(',') + fields[0]=fields[0].replace('# ','') + proxies = list() + for line in output: + proxies.append(dict(zip(fields,line.split(',')))) + + except Exception, e: + print str(e) + finally: + s.close() + +except Exception as e: + print "MIRRORLIST STATE UNKNOWN: " + str(e) + sys.exit(3) + +total=0 +downcount=0 +downlist="" +for proxy in proxies: + if proxy['svname'] == "FRONTEND" or proxy['svname'] == "BACKEND": + continue + if proxy['pxname'] == "mirror-lists": + total+=1 + if proxy['status'] == "DOWN": + downlist+=proxy["svname"]+" " + downcount+=1 + +unavailability = 100 * float(downcount) / float(total) + +if unavailability == 0: + print "MIRRORLIST STATE OK: " + downlist + sys.exit(0) + +if unavailability < 50: + print "MIRRORLIST STATE WARN: " + downlist + sys.exit(1) + +if unavailability >= 50: + print "MIRRORLIST STATE CRIT: " + downlist + sys.exit(2) + +print "MIRRORLIST STATE UNKNOWN: " + downlist +sys.exit(3) diff --git a/roles/nagios_server/files/nagios/plugins/check_imap_receive_epn b/roles/nagios_server/files/nagios/plugins/check_imap_receive_epn new file mode 100644 index 0000000000..d490cd006f --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_imap_receive_epn @@ -0,0 +1,387 @@ +#!/usr/bin/perl +use strict; +my $VERSION = '0.6.0'; +my $COPYRIGHT = 'Copyright (C) 2005-2008 Jonathan Buhacoff '; +my $LICENSE = 'http://www.gnu.org/licenses/gpl.txt'; +my %status = ( 'OK' => 0, 'WARNING' => 1, 'CRITICAL' => 2, 'UNKNOWN' => 3 ); + +# look for required modules +exit $status{UNKNOWN} unless load_modules(qw/Getopt::Long Mail::IMAPClient/); + +# get options from command line +Getopt::Long::Configure("bundling"); +my $verbose = 0; +my $help = ""; +my $help_usage = ""; +my $show_version = ""; +my $imap_server = ""; +my $default_imap_port = "143"; +my $default_imap_ssl_port = "993"; +my $imap_port = ""; +my $username = ""; +my $password = ""; 
+my $mailbox = "INBOX"; +my @search = (); +my $search_critical_min = 1; +my $capture_max = ""; +my $capture_min = ""; +my $delete = 1; +my $no_delete_captured = ""; +my $warntime = 15; +my $criticaltime = 30; +my $timeout = 60; +my $interval = 5; +my $max_retries = 10; +my $download = ""; +my $download_max = ""; +my $ssl = 0; +my $tls = 0; +my $ok; +$ok = Getopt::Long::GetOptions( + "V|version"=>\$show_version, + "v|verbose+"=>\$verbose,"h|help"=>\$help,"usage"=>\$help_usage, + "w|warning=i"=>\$warntime,"c|critical=i"=>\$criticaltime,"t|timeout=i"=>\$timeout, + # imap settings + "H|hostname=s"=>\$imap_server,"p|port=i"=>\$imap_port, + "U|username=s"=>\$username,"P|password=s"=>\$password, "m|mailbox=s"=>\$mailbox, + "imap-check-interval=i"=>\$interval,"imap-retries=i"=>\$max_retries, + "ssl!"=>\$ssl, "tls!"=>\$tls, + # search settings + "s|search=s"=>\@search, + "search-critical-min=i"=>\$search_critical_min, + "capture-max=s"=>\$capture_max, "capture-min=s"=>\$capture_min, + "delete!"=>\$delete, "nodelete-captured"=>\$no_delete_captured, + "download!"=>\$download, "download_max=i"=>\$download_max, + ); + +if( $show_version ) { + print "$VERSION\n"; + if( $verbose ) { + print "Default warning threshold: $warntime seconds\n"; + print "Default critical threshold: $criticaltime seconds\n"; + print "Default timeout: $timeout seconds\n"; + } + exit $status{UNKNOWN}; +} + +if( $help ) { + exec "perldoc", $0 or print "Try `perldoc $0`\n"; + exit $status{UNKNOWN}; +} + +my @required_module = (); +push @required_module, 'IO::Socket::SSL' if ($ssl or $tls); +push @required_module, 'Email::Simple' if ($download); +exit $status{UNKNOWN} unless load_modules(@required_module); + +if( $help_usage + || + ( $imap_server eq "" || $username eq "" || $password eq "" || scalar(@search)==0 ) + ) { + print "Usage: $0 -H host [-p port] -U username -P password -s HEADER -s X-Nagios -s 'ID: 1234.' 
[-w ] [-c ] [--imap-check-interval ] [--imap-retries ]\n"; + exit $status{UNKNOWN}; +} + +# initialize +my $report = new PluginReport; +my $time_start = time; + +# connect to IMAP server +my $imap; +eval { + local $SIG{ALRM} = sub { die "exceeded timeout $timeout seconds\n" }; # NB: \n required, see `perldoc -f alarm` + alarm $timeout; + + if( $ssl ) { + $imap_port = $default_imap_ssl_port unless $imap_port; + my $socket = IO::Socket::SSL->new("$imap_server:$imap_port"); + die IO::Socket::SSL::errstr() unless $socket; + $socket->autoflush(1); + $imap = Mail::IMAPClient->new(Socket=>$socket, Debug => 0 ); + $imap->State(Mail::IMAPClient->Connected); + $imap->_read_line() if "$Mail::IMAPClient::VERSION" le "2.2.9"; # necessary to remove the server's "ready" line from the input buffer for old versions of Mail::IMAPClient. Using string comparison for the version check because the numeric didn't work on Darwin and for Mail::IMAPClient the next version is 2.3.0 and then 3.00 so string comparison works + $imap->User($username); + $imap->Password($password); + $imap->login() or die "$@"; + } + elsif( $tls ) { + # XXX THIS PART IS NOT DONE YET ... NEED TO OPEN A REGULAR IMAP CONNECTION, THEN ISSUE THE "STARTTLS" COMMAND MANUALLY, SWITCHING THE SOCKET TO IO::SOCKET::SSL, AND THEN GIVING IT BACK TO MAIL::IMAPCLIENT ... 
+ $imap_port = $default_imap_port unless $imap_port; + $imap = Mail::IMAPClient->new(Debug => 0 ); + $imap->Server("$imap_server:$imap_port"); + $imap->User($username); + $imap->Password($password); + $imap->connect() or die "$@"; + } + else { + $imap_port = $default_imap_port unless $imap_port; + $imap = Mail::IMAPClient->new(Debug => 0 ); + $imap->Server("$imap_server:$imap_port"); + $imap->User($username); + $imap->Password($password); + $imap->connect() or die "$@"; + } + + alarm 0; +}; +if( $@ ) { + chomp $@; + print "IMAP RECEIVE CRITICAL - Could not connect to $imap_server port $imap_port: $@\n"; + exit $status{CRITICAL}; +} +unless( $imap ) { + print "IMAP RECEIVE CRITICAL - Could not connect to $imap_server port $imap_port: $@\n"; + exit $status{CRITICAL}; +} +my $time_connected = time; + +# select a mailbox +unless( $imap->select($mailbox) ) { + print "IMAP RECEIVE CRITICAL - Could not select $mailbox: $@ $!\n"; + $imap->logout(); + exit $status{CRITICAL}; +} + + +# search for messages +my $tries = 0; +my @msgs; +until( scalar(@msgs) != 0 || $tries >= $max_retries ) { + eval { + $imap->select( $mailbox ); + # if download flag is on, we download recent messages and search ourselves + if( $download ) { + @msgs = download_and_search($imap,@search); + } + else { + @msgs = $imap->search(@search); + die "Invalid search parameters: $@" if $@; + } + }; + if( $@ ) { + chomp $@; + print "Cannot search messages: $@\n"; + $imap->close(); + $imap->logout(); + exit $status{UNKNOWN}; + } + $report->{found} = scalar(@msgs); + $tries++; + sleep $interval unless (scalar(@msgs) != 0 || $tries >= $max_retries); +} + +sub download_and_search { + my ($imap,@search) = @_; + my $ims = new ImapMessageSearch; + $ims->querytokens(@search); + my @found = (); + @msgs = $imap->messages or die "Cannot list messages: $@\n"; + @msgs = @msgs[0..$download_max-1] if $download_max; + foreach my $m (@msgs) { + my $message = $imap->message_string($m); + push @found, $m if 
$ims->match($message); + } + return @found; +} + + + +# capture data in messages +my $captured_max_id = ""; +my $captured_min_id = ""; +if( $capture_max || $capture_min ) { + my $max = undef; + my $min = undef; + my %captured = (); + for (my $i=0;$i < scalar(@msgs); $i++) { + my $message = $imap->message_string($msgs[$i]); + if( $message =~ m/$capture_max/ ) { + if( !defined($max) || $1 > $max ) { + $captured{ $i } = 1; + $max = $1; + $captured_max_id = $msgs[$i]; + } + } + if( $message =~ m/$capture_min/ ) { + if( !defined($min) || $1 < $min ) { + $captured{ $i } = 1; + $min = $1; + $captured_min_id = $msgs[$i]; + } + } + print $message if $verbose > 1; + } + $report->{captured} = scalar keys %captured; + $report->{max} = $max if defined $max; + $report->{min} = $min if defined $min; +} + +# delete messages +if( $delete ) { + my $deleted = 0; + for (my $i=0;$i < scalar(@msgs); $i++) { + next if ($no_delete_captured && ($captured_max_id eq $msgs[$i])); + next if ($no_delete_captured && ($captured_min_id eq $msgs[$i])); + $imap->delete_message($msgs[$i]); + $deleted++; + } + $report->{deleted} = $deleted; + $imap->expunge() if $deleted; +} + +# deselect the mailbox +$imap->close(); + +# disconnect from IMAP server +$imap->logout(); + +# calculate elapsed time and issue warnings +my $time_end = time; +my $elapsedtime = $time_end - $time_start; +$report->{seconds} = $elapsedtime; + +my @warning = (); +my @critical = (); + +push @warning, "no messages" if( scalar(@msgs) == 0 ); +push @critical, "found less than $search_critical_min" if ( scalar(@msgs) < $search_critical_min ); +push @warning, "connection time more than $warntime" if( $time_connected - $time_start > $warntime ); +push @critical, "connection time more than $criticaltime" if( $time_connected - $time_start > $criticaltime ); + +# print report and exit with known status +my $short_report = $report->text(qw/seconds found captured max min deleted/); +if( scalar @critical ) { + my $crit_alerts = join(", ", 
@critical); + print "IMAP RECEIVE CRITICAL - $crit_alerts; $short_report\n"; + exit $status{CRITICAL}; +} +if( scalar @warning ) { + my $warn_alerts = join(", ", @warning); + print "IMAP RECEIVE WARNING - $warn_alerts; $short_report\n"; + exit $status{WARNING}; +} +print "IMAP RECEIVE OK - $short_report\n"; +exit $status{OK}; + + +# utility to load required modules. exits if unable to load one or more of the modules. +sub load_modules { + my @missing_modules = (); + foreach( @_ ) { + eval "require $_"; + push @missing_modules, $_ if $@; + } + if( @missing_modules ) { + print "Missing perl modules: @missing_modules\n"; + return 0; + } + return 1; +} + + +# NAME +# PluginReport +# SYNOPSIS +# $report = new PluginReport; +# $report->{label1} = "value1"; +# $report->{label2} = "value2"; +# print $report->text(qw/label1 label2/); +package PluginReport; + +sub new { + my ($proto,%p) = @_; + my $class = ref($proto) || $proto; + my $self = bless {}, $class; + $self->{$_} = $p{$_} foreach keys %p; + return $self; +} + +sub text { + my ($self,@labels) = @_; + my @report = map { "$self->{$_} $_" } grep { defined $self->{$_} } @labels; + my $text = join(", ", @report); + return $text; +} + +package ImapMessageSearch; + +require Email::Simple; + +sub new { + my ($proto,%p) = @_; + my $class = ref($proto) || $proto; + my $self = bless {}, $class; + $self->{querystring} = []; + $self->{querytokens} = []; + $self->{queryfnlist} = []; + $self->{mimemessage} = undef; + $self->{$_} = $p{$_} foreach keys %p; + return $self; +} + +sub querystring { + my ($self,$string) = @_; + $self->{querystring} = $string; + return $self->querytokens( parseimapsearch($string) ); +} + +sub querytokens { + my ($self,@tokens) = @_; + $self->{querytokens} = [@tokens]; + $self->{queryfnlist} = [create_search_expressions(@tokens)]; # build the matchers from the tokens passed in, not from the file-level @search + return $self; +} + +sub match { + my ($self,$message_string) = @_; + my $message_mime = Email::Simple->new($message_string); + return $self->matchmime($message_mime); +} + 
+sub matchmime { + my ($self,$message_mime) = @_; + my $match = 1; + foreach my $x (@{$self->{queryfnlist}}) { + $match = ($match && $x->($message_mime)); # '&&' rather than 'and': 'and' binds looser than '=', which left $match stuck at 1 + } + return $match; +} + +# this should probably become its own Perl module... see also Net::IMAP::Server::Command::Search +sub create_search_expressions { + my (@search) = @_; + return () unless scalar(@search); + my $token = shift @search; + if( $token eq 'TEXT' ) { + my $value = shift @search; + return (sub {shift->as_string =~ /\Q$value\E/i},create_search_expressions(@search)); + } + if( $token eq 'BODY' ) { + my $value = shift @search; + return (sub {shift->body =~ /\Q$value\E/i},create_search_expressions(@search)); + } + if( $token eq 'SUBJECT' ) { + my $value = shift @search; + return (sub {shift->header('Subject') =~ /\Q$value\E/i},create_search_expressions(@search)); + } + if( $token eq 'HEADER' ) { + my $name = shift @search; + my $value = shift @search; + return (sub {shift->header($name) =~ /\Q$value\E/i},create_search_expressions(@search)); + } + if( $token eq 'NOT' ) { + my @exp = create_search_expressions(@search); + my $next = shift @exp; + return (sub { ! 
$next->(@_) }, @exp); + } + if( $token eq 'OR' ) { + my @exp = create_search_expressions(@search); + my $next1 = shift @exp; + my $next2 = shift @exp; + return (sub { $next1->(@_) or $next2->(@_) }, @exp); + } +} + +package main; +1; + diff --git a/roles/nagios_server/files/nagios/plugins/check_ipmi b/roles/nagios_server/files/nagios/plugins/check_ipmi new file mode 100755 index 0000000000..f85397e1f8 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_ipmi @@ -0,0 +1,99 @@ +#!/usr/bin/python +# mmcgrath#redhat.com +# Aug 08 2007 +# License: GPL +from optparse import OptionParser +import commands +import sys + +parser = OptionParser(version='0.1') +parser.add_option('-t', '--temperature', + dest = 'temp', + default = False, + action = 'store_true', + help = 'Check Temperatures') +parser.add_option('-f', '--fans', + dest = 'fans', + default = False, + action = 'store_true', + help = 'Check Fans') + + + +(opts, args) = parser.parse_args() + +class ipmiValue: + def __init__(self, param='', value='', status=''): + self.param = param + try: + self.value = (int(value.split(' ')[0], 10) * 9) / 5 + 32 + except ValueError: + self.value = value + self.status = status + +class ipmi: + def __init__(self): + self.rawOutput = commands.getstatusoutput('/usr/bin/ipmitool sdr')[1].split('\n') + self.sdr = [] + for i in self.rawOutput: + try: + param = i.split('|')[0].strip() + value = i.split('|')[1].strip() + status = i.split('|')[2].strip() + self.sdr.append(ipmiValue(param, value, status)) + except IndexError: + print "ERROR - Invalid output from ipmi tool (is it installed? 
/usr/bin/ipmitool)" + sys.exit(3) + + def temps(self): + ''' Return Known Temperatures ''' + temps = [] + for i in self.sdr: + if i.param.find('Temp') != -1 and i.status.find('ns') == -1: + temps.append(i) + return temps + + def fans(self): + ''' Return Known Fan Speeds ''' + temps = [] + for i in self.sdr: + if i.param.find('FAN') != -1 and i.status.find('ns') == -1: + temps.append(i) + return temps + +str = False +exitCode = 0 +if opts.temp: + ok=True + str='Temps (F)' + i = ipmi() + for temp in i.temps(): + str = '%s:%s' % (str, temp.value) + if temp.status != 'ok': + ok=temp.status + if ok: + str = str + ' OK!' + else: + str = str + ' %s' % ok + exitCode = 2 + +if opts.fans: + ok=True + str='Fans (RPM)' + i = ipmi() + for fan in i.fans(): + str = '%s:%s' % (str, fan.value) + if fan.status != 'ok': + ok=fan.status + if ok: + str = str + ' OK!' + else: + str = str + ' %s' % ok + exitCode = 2 + +if str: + print str + sys.exit(0) +else: + print 'Please see -h for help' + sys.exit(2) diff --git a/roles/nagios_server/files/nagios/plugins/check_koji b/roles/nagios_server/files/nagios/plugins/check_koji new file mode 100755 index 0000000000..9e1b68611f --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_koji @@ -0,0 +1,19 @@ +#!/bin/bash + +FAILURES=$(/usr/bin/wget -q --no-check-certificate -O- http://koji.phx2.fedoraproject.org/koji/builds | /bin/grep -c failed.png) +WARNING=20 +CRITICAL=25 + +if [ $FAILURES -gt $CRITICAL ] +then + echo "Koji: CRITICAL failed builds: $FAILURES" + exit 2 +elif [ $FAILURES -gt $WARNING ] +then + echo "Koji: WARNING failed builds: $FAILURES" + exit 1 +else + echo "Koji: OK failed builds: $FAILURES" + exit 0 +fi + diff --git a/roles/nagios_server/files/nagios/plugins/check_lock b/roles/nagios_server/files/nagios/plugins/check_lock new file mode 100755 index 0000000000..1a58e95ef8 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_lock @@ -0,0 +1,17 @@ +#!/usr/bin/python + +import fcntl +import sys + +try: + 
f = open('/mnt/koji/.nagios_test', 'r') + f.close() + f = open('/mnt/koji/.nagios_test', 'w') +except IOError: + print "Could not create file" + sys.exit(2) + +fcntl.flock(f, fcntl.LOCK_EX) +f.close() +print "File Locked Successfully" +sys.exit(0) diff --git a/roles/nagios_server/files/nagios/plugins/check_lock_file_age b/roles/nagios_server/files/nagios/plugins/check_lock_file_age new file mode 100755 index 0000000000..f5abaa9e11 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_lock_file_age @@ -0,0 +1,123 @@ +#! /usr/bin/perl -w + +# check_lock_file_age.pl Copyright (C) 2010 Ricky Elrod +# +# Fork of check_file_age.pl +# +# Checks a lock file's size and modification time to make sure it's not empty +# and that it's sufficiently recent. +# +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty +# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# you should have received a copy of the GNU General Public License +# along with this program (or with Nagios); if not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA + +use strict; +use English; +use Getopt::Long; +use File::stat; +use vars qw($PROGNAME); +use lib "/usr/lib64/nagios/plugins"; +use utils qw (%ERRORS &print_revision &support); + +sub print_help (); +sub print_usage (); + +my ($opt_c, $opt_f, $opt_w, $opt_h, $opt_V); +my ($result, $message, $age, $size, $st); + +$PROGNAME="check_lock_file_age"; + +$opt_w = 1; +$opt_c = 5; +$opt_f = ""; + +Getopt::Long::Configure('bundling'); +GetOptions( + "V" => \$opt_V, "version" => \$opt_V, + "h" => \$opt_h, "help" => \$opt_h, + "f=s" => \$opt_f, "file=s" => \$opt_f, # --file takes the path as its value, same as -f + "w=f" => \$opt_w, "warning-age=f" => \$opt_w, + "c=f" => \$opt_c, "critical-age=f" => \$opt_c); + +if ($opt_V) { + print_revision($PROGNAME, '1.4.14'); + exit $ERRORS{'OK'}; +} + +if ($opt_h) { + print_help(); + exit $ERRORS{'OK'}; +} + +if (($opt_c and $opt_w) and ($opt_c < $opt_w)) { + print "Warning time must be less than Critical time.\n"; + exit $ERRORS{'UNKNOWN'}; +} + +$opt_f = shift unless ($opt_f); + +if (! $opt_f) { + print "LOCK_FILE_AGE UNKNOWN: No file specified\n"; + exit $ERRORS{'UNKNOWN'}; +} + +# Check that file exists (can be directory or link) +unless (-e $opt_f) { + print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n"; + exit $ERRORS{'OK'}; +} + +$st = File::stat::stat($opt_f); +$age = time - $st->mtime; + +$result = 'OK'; + +# Convert minutes to seconds +if($opt_c) { $opt_c *= 60; } +if($opt_w) { $opt_w *= 60; } + +if ($opt_c and $age > $opt_c) { + $result = 'CRITICAL'; +} +elsif ($opt_w and $age > $opt_w) { + $result = 'WARNING'; +} + +# If the age is higher than 2 minutes, convert seconds -> minutes +# If it's higher than a day, use days. +# Just a nicety, to make people not have to do math ;) +if($age > 86400) { $age = int(($age/86400))." 
days"; } +elsif($age > 120) { $age = int(($age/60))." minutes"; } +else { $age = "$age seconds"; } + +print "LOCK_FILE_AGE $result: $opt_f is $age old.\n"; +exit $ERRORS{$result}; + +sub print_usage () { + print "Usage:\n"; + print " $PROGNAME [-w ] [-c ] -f \n"; + print " $PROGNAME [-h | --help]\n"; + print " $PROGNAME [-V | --version]\n"; +} + +sub print_help () { + print_revision($PROGNAME, '1.4.14'); + print "Copyright (c) 2010 Ricky Elrod\n\n"; + print_usage(); + print "\n"; + print " File must be no more than this many minutes old (default: warn 1m, crit 5m)\n"; + print "\n"; + support(); +} diff --git a/roles/nagios_server/files/nagios/plugins/check_postfix_queue b/roles/nagios_server/files/nagios/plugins/check_postfix_queue new file mode 100755 index 0000000000..44ab4445f9 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_postfix_queue @@ -0,0 +1,49 @@ +#!/bin/bash +# +# 19-07-2010 +# Author: Cherwin Nooitmeer +# + +# exit codes +e_ok=0 +e_warning=1 +e_critical=2 +e_unknown=3 + +# regular expression that matches queue IDs (e.g. 
D71EF7AC80F8) +queue_id='^[A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9]' + +usage="Invalid command line usage" + +if [ -z $1 ]; then + echo $usage + exit $e_unknown +fi + +while getopts ":w:c:" options +do + case $options in + w ) warning=$OPTARG ;; + c ) critical=$OPTARG ;; + * ) echo $usage + exit $e_unknown ;; + esac +done + +# determine queue size +qsize=$(mailq | egrep -c $queue_id) +if [ -z $qsize ] +then + exit $e_unknown +fi + +if [ $qsize -ge $critical ]; then + retval=$e_critical +elif [ $qsize -ge $warning ]; then + retval=$e_warning +elif [ $qsize -lt $warning ]; then + retval=$e_ok +fi + +echo "$qsize mail(s) in queue | mail_queue=$qsize" +exit $retval diff --git a/roles/nagios_server/files/nagios/plugins/check_raid.py b/roles/nagios_server/files/nagios/plugins/check_raid.py new file mode 100755 index 0000000000..48cddd93d4 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_raid.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# +# very simple python script to parse out /proc/mdstat +# and give results for nagios to monitor +# + +import sys +import string + +devices = [] + +try: + mdstat = string.split(open('/proc/mdstat').read(), '\n') +except IOError: + # seems we have no software raid on this machines + sys.exit(0) + +error = "" +i = 0 +for line in mdstat: + if line[0:2] == 'md': + device = string.split(line)[0] + devices.append(device) + status = string.split(mdstat[i+1])[3] + if string.count(status, "_"): + # see if we can figure out what's going on + err = string.split(mdstat[i+2]) + msg = "device=%s status=%s" % (device, status) + if len(err) > 0: + msg = msg + " rebuild=%s" % err[0] + + if not error: + error = msg + else: + error = error + ", " + msg + i = i + 1 + +if not error: + print "DEVICES %s OK" % " ".join(devices) + sys.exit(0) + +else: + print error + sys.exit(2) + diff --git a/roles/nagios_server/files/nagios/plugins/check_readonly_fs 
b/roles/nagios_server/files/nagios/plugins/check_readonly_fs new file mode 100755 index 0000000000..cd2b1973a7 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_readonly_fs @@ -0,0 +1,84 @@ +#!/bin/bash + +# check_readonlyfs: Check for readonly filesystems +# Copyright (C) 2010 Davide Madrisan + +PROGNAME=`/bin/basename $0` +PROGPATH=`echo $0 | sed -e 's,[\\/][^\\/][^\\/]*$,,'` +REVISION=`echo '$Revision: 1 $' | sed -e 's/[^0-9.]//g'` + +. $PROGPATH/utils.sh + +print_usage() { + echo "Usage: $PROGNAME --no-network-fs" + echo "Usage: $PROGNAME --help" + echo "Usage: $PROGNAME --version" +} + +print_help() { + print_revision $PROGNAME $REVISION + echo "" + print_usage + echo "" + echo "readonly filesystem checker plugin for Nagios" + echo "" + support +} + +NETFS=1 + +# Grab the command line arguments + +exitstatus=$STATE_WARNING #default + +while test -n "$1"; do + case "$1" in + --help|-h) + print_help + exit $STATE_OK + ;; + --version|-V) + print_revision $PROGNAME $REVISION + exit $STATE_OK + ;; + --no-network-fs|-n) + NETFS="0" + ;; + *) + echo "Unknown argument: $1" + print_usage + exit $STATE_UNKNOWN + ;; + esac + shift +done + +[ -r /proc/mounts ] || { echo "cannot read /proc/mounts!"; exit $STATE_UNKNOWN; } + +nerr=0 +IFS_SAVE="$IFS" + +rofs_list="" +while read dev mp fs mopt ignore; do + [ "$dev" = none ] && continue + case $fs in binfmt_misc|devpts|iso9660|proc|selinuxfs|rpc_pipefs|sysfs|tmpfs|usbfs) + continue ;; + esac + case $fs in autofs|nfs|nfs4|smbfs) + # skip the network filesystems + [ "$NETFS" = 0 ] && continue ;; + esac + + IFS=","; set -- $mopt; IFS="$IFS_SAVE" + while :; do + case "$1" in + ro) rofs_list="$rofs_list $mp"; nerr=$(( $nerr + 1 )) ;; + "") shift; break ;; + esac + shift + done +done < <(LC_ALL=C /bin/cat /proc/mounts 2>/dev/null) + +[ $nerr -eq 0 ] && { echo OK; exit $STATE_OK; } || echo "$rofs_list: read only fs" + +exit $exitstatus diff --git a/roles/nagios_server/files/nagios/plugins/check_smtp_send_epn 
b/roles/nagios_server/files/nagios/plugins/check_smtp_send_epn new file mode 100644 index 0000000000..db0354ce6e --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_smtp_send_epn @@ -0,0 +1,290 @@ +#!/usr/bin/perl +use strict; +my $VERSION = '0.4.5'; +my $COPYRIGHT = 'Copyright (C) 2005-2008 Jonathan Buhacoff '; +my $LICENSE = 'http://www.gnu.org/licenses/gpl.txt'; +my %status = ( 'OK' => 0, 'WARNING' => 1, 'CRITICAL' => 2, 'UNKNOWN' => 3 ); + +# look for required modules +exit $status{UNKNOWN} unless load_modules(qw/Getopt::Long Net::SMTP/); + +Getopt::Long::Configure("bundling"); +my $verbose = 0; +my $help = ""; +my $help_usage = ""; +my $show_version = ""; +my $smtp_server = ""; +my $default_smtp_port = "25"; +my $default_smtp_ssl_port = "465"; +my $default_smtp_tls_port = "587"; +my $smtp_port = ""; +my @mailto = (); +my $mailfrom = ""; +my @header = (); +my $body = ""; +my $stdin = ""; +my $expect_response = "250"; +my $warntime = 15; +my $criticaltime = 30; +my $timeout = 60; +my $tls = 0; +my $ssl = 0; +my $auth_method = undef; +my $username = ""; +my $password = ""; +my $ok; +$ok = Getopt::Long::GetOptions( + "V|version"=>\$show_version, + "v|verbose+"=>\$verbose,"h|help"=>\$help,"usage"=>\$help_usage, + "w|warning=i"=>\$warntime,"c|critical=i"=>\$criticaltime,"t|timeout=i"=>\$timeout, + # smtp settings + "H|hostname=s"=>\$smtp_server,"p|port=i"=>\$smtp_port, + "mailto=s"=>\@mailto, "mailfrom=s",\$mailfrom, + "header=s"=>\@header, "body=s"=>\$body, "stdin"=>\$stdin, + # SSL/TLS/auth options + "tls!"=>\$tls, "ssl!"=>\$ssl, "auth=s"=>\$auth_method, + "U|username=s"=>\$username,"P|password=s"=>\$password, + # Server response + "E|expect-response=s"=>\$expect_response, + ); + +if( $show_version ) { + print "$VERSION\n"; + if( $verbose ) { + print "Default warning threshold: $warntime seconds\n"; + print "Default critical threshold: $criticaltime seconds\n"; + print "Default timeout: $timeout seconds\n"; + } + exit $status{UNKNOWN}; +} + +if( 
$help ) { + exec "perldoc", $0 or print "Try `perldoc $0`\n"; + exit $status{UNKNOWN}; +} + +my @required_module = (); +push @required_module, 'Net::SMTP::SSL' if $ssl; +push @required_module, ('MIME::Base64','Authen::SASL') if $ssl && $username; +push @required_module, 'Net::SMTP::TLS' if $tls; +push @required_module, 'Net::SMTP_auth' if $auth_method; +exit $status{UNKNOWN} unless load_modules(@required_module); + + +# split up @mailto if commas were used instead of multiple options +@mailto = split(/,/,join(',',@mailto)); + +if( $help_usage || + ( + $smtp_server eq "" || scalar(@mailto)==0 || $mailfrom eq "" + ) + ) { + print "Usage: $0 -H host [-p port] --mailto recipient\@your.net [--mailto recipient2\@your.net ...] --mailfrom sender\@your.net --body 'some text' [-w ] [-c ]\n"; + exit $status{UNKNOWN}; +} + +# initialize +my $report = new PluginReport; +my $time_start = time; +my $actual_response = undef; +my @warning = (); +my @critical = (); + + +# connect to SMTP server +# create the smtp handle using Net::SMTP, Net::SMTP::SSL, or Net::SMTP::TLS +my $smtp; +eval { + if( $tls ) { + $smtp_port = $default_smtp_tls_port unless $smtp_port; + $smtp = Net::SMTP::TLS->new($smtp_server, Timeout=>$timeout, Port=>$smtp_port, User=>$username, Password=>$password); + } + elsif( $ssl ) { + $smtp_port = $default_smtp_ssl_port unless $smtp_port; + $smtp = Net::SMTP::SSL->new($smtp_server, Port => $smtp_port, Timeout=>$timeout,Debug=>0); + if( $smtp && $username ) { + $smtp->auth($username, $password); + } + } + elsif( $auth_method ) { + $smtp_port = $default_smtp_port unless $smtp_port; + $smtp = Net::SMTP_auth->new($smtp_server, Port=>$smtp_port, Timeout=>$timeout,Debug=>0); + if( $smtp ) { + $smtp->auth($auth_method, $username, $password); + } + } + else { + $smtp_port = $default_smtp_port unless $smtp_port; + $smtp = Net::SMTP->new($smtp_server, Port=>$smtp_port, Timeout=>$timeout,Debug=>0); + if( $smtp && $username ) { + $smtp->auth($username, $password); + } + } +}; 
+if( $@ ) { + $@ =~ s/\n/ /g; # the error message can be multiline but we want our output to be just one line + print "SMTP SEND CRITICAL - $@\n"; + exit $status{CRITICAL}; +} +unless( $smtp ) { + print "SMTP SEND CRITICAL - Could not connect to $smtp_server port $smtp_port\n"; + exit $status{CRITICAL}; +} +my $time_connected = time; + +# add the monitored server's banner to the report +if( $tls ) { + $report->{banner} = ""; +} +elsif( $ssl ) { + $report->{banner} = $smtp->banner || ""; + chomp $report->{banner}; +} +else { + $report->{banner} = $smtp->banner || ""; + chomp $report->{banner}; +} + + +# send email +if( $stdin ) { + $body = ""; + while(<STDIN>) { # read the message body from standard input; a bare while() never terminates + $body .= $_; + } +} +$smtp->mail($mailfrom); +foreach( @mailto ) { + # the two SMTP modules have different error reporting mechanisms: + if( $tls ) { + # Net::SMTP::TLS croaks when the recipient is rejected + eval { + $smtp->to($_); + }; + if( $@ ) { + print "SMTP SEND CRITICAL - Could not send to $_\n"; + exit $status{CRITICAL}; + } + } + else { + # Net::SMTP returns false when the recipient is rejected + my $to_returned = $smtp->to($_); + if( !$to_returned ) { + print "SMTP SEND CRITICAL - Could not send to $_\n"; + exit $status{CRITICAL}; + } + } +} + +# Net::SMTP::TLS doesn't implement code() so we need to wrap calls in eval to get our error messages + + # start data transfer (expect response 354) + $smtp->data(); + + # send data + $smtp->datasend("To: ".join(", ",@mailto)."\n"); + $smtp->datasend("From: $mailfrom\n"); + foreach( @header ) { + $smtp->datasend("$_\n"); + } + $smtp->datasend("\n"); + $smtp->datasend($body); + $smtp->datasend("\n"); + +eval { + # end data transfer (expect response 250) + $smtp->dataend(); +}; +if( $@ ) { + $actual_response = $tls ? get_tls_error($@) : $smtp->code(); +} +else { + $actual_response = $tls ? 
"250" : $smtp->code(); # no error means we got 250 +} + +eval { + # disconnect from SMTP server (expect response 221) + $smtp->quit(); +}; +if( $@ ) { + push @warning, "Error while disconnecting from $smtp_server"; +} + +# calculate elapsed time and issue warnings +my $time_end = time; +my $elapsedtime = $time_end - $time_start; +$report->{seconds} = $elapsedtime; + +push @warning, "connection time more than $warntime" if( $time_connected - $time_start > $warntime ); +push @critical, "connection time more than $criticaltime" if( $time_connected - $time_start > $criticaltime ); +push @critical, "response was $actual_response but expected $expect_response" if ( $actual_response ne $expect_response ); + +# print report and exit with known status +my $short_report = $report->text(qw/seconds/); +my $long_report = join("", map { "$_: $report->{$_}\n" } qw/banner/ ); +if( scalar @critical ) { + my $crit_alerts = join(", ", @critical); + print "SMTP SEND CRITICAL - $crit_alerts; $short_report\n"; + print $long_report if $verbose; + exit $status{CRITICAL}; +} +if( scalar @warning ) { + my $warn_alerts = join(", ", @warning); + print "SMTP SEND WARNING - $warn_alerts; $short_report\n"; + print $long_report if $verbose; + exit $status{WARNING}; +} +print "SMTP SEND OK - $short_report\n"; +print $long_report if $verbose; +exit $status{OK}; + + +# utility to load required modules. exits if unable to load one or more of the modules. 
+sub load_modules { + my @missing_modules = (); + foreach( @_ ) { + eval "require $_"; + push @missing_modules, $_ if $@; + } + if( @missing_modules ) { + print "Missing perl modules: @missing_modules\n"; + return 0; + } + return 1; +} + +# utility to extract error codes out of Net::SMTP::TLS croak messages +sub get_tls_error { + my ($errormsg) = @_; + $errormsg =~ m/: (\d+) (.+)/; + my $code = $1; + return $code; +} + +# NAME +# PluginReport +# SYNOPSIS +# $report = new PluginReport; +# $report->{label1} = "value1"; +# $report->{label2} = "value2"; +# print $report->text(qw/label1 label2/); +package PluginReport; + +sub new { + my ($proto,%p) = @_; + my $class = ref($proto) || $proto; + my $self = bless {}, $class; + $self->{$_} = $p{$_} foreach keys %p; + return $self; +} + +sub text { + my ($self,@labels) = @_; + my @report = map { "$self->{$_} $_" } grep { defined $self->{$_} } @labels; + my $text = join(", ", @report); + return $text; +} + +package main; +1; + diff --git a/roles/nagios_server/files/nagios/plugins/check_supybot_plugin b/roles/nagios_server/files/nagios/plugins/check_supybot_plugin new file mode 100755 index 0000000000..a66ead2e7e --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_supybot_plugin @@ -0,0 +1,108 @@ +#!/usr/bin/env python +""" check_supybot_plugin -- ensure that a plugin is loaded by supybot. + +Run like: + + check_supybot_plugin --target fedmsg + check_supybot_plugin --target koji --debug + +""" + +import argparse +import sys +import socket +import string +import uuid + + +def process_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '-t', '--target', default=None, dest='target', + help="Required. The plugin we're looking for." 
+ ) + parser.add_argument( + '-n', '--nick', default=None, dest='nick', + help="NICK to use when connecting to freenode.", + ) + parser.add_argument( + '-d', '--debug', default=False, action='store_true', + help='Print out debug information.', dest='debug', + ) + parser.add_argument( + '-H', '--host', default='irc.freenode.net', + help='Host to connect to.', dest='host', + ) + parser.add_argument( + '-p', '--port', default=6667, type=int, + help='Host to connect to.', dest='port', + ) + return parser.parse_args() + +args = process_args() + +# Use a random nick so people can't mess with us +if not args.nick: + args.nick = 'nrpe-' + str(uuid.uuid4()).split('-')[0] + +name = "NRPE Bot" +readbuffer = "" + +if not args.target: + print "UNKNOWN: No 'target' specified." + sys.exit(3) + +args.target = args.target.lower() + +if args.debug: + print "connecting to %s/%i" % (args.host, args.port) + +try: + s = socket.socket() + s.connect((args.host, args.port)) + + if args.debug: + print "as %s/%s (%s)" % (args.nick, args.nick, name) + + s.send("nick %s\r\n" % args.nick) + s.send("USER %s %s bla :%s\r\n" % (args.nick, args.host, name)) + + while 1: + readbuffer = readbuffer+s.recv(1024) + temp = string.split(readbuffer, "\n") + readbuffer = temp.pop() + + for line in temp: + line = string.rstrip(line) + + if args.debug: + print " * ", line + + line = string.split(line) + + if line[1] == 'MODE': + msg = "privmsg zodbot :list\r\n" + if args.debug: + print "sending:" + print " ->", msg + s.send(msg) + + if line[1] == 'PRIVMSG': + if args.debug: + print "Got our response.." 
+ + plugins = map(str.lower, ' '.join(line[3:][1:]).split(', ')) + + if args.target in plugins: + print "OK" + s.send("QUIT") + sys.exit(0) + else: + print "CRITICAL: %r not loaded by supybot" % args.target + s.send("QUIT") + sys.exit(2) +except Exception as e: + print "UNKNOWN: ", str(e) + if args.debug: + raise + sys.exit(3) diff --git a/roles/nagios_server/files/nagios/plugins/check_tape b/roles/nagios_server/files/nagios/plugins/check_tape new file mode 100644 index 0000000000..0173b0006d --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_tape @@ -0,0 +1,17 @@ +#!/bin/bash + +CODE=$(snmpwalk -v 1 -c public tape01.phx2.fedoraproject.org 1.3.6.1.4.1.674.10893.2.102.2.1 | awk '{print $4}') +WARNING=4 + +if [ $CODE -gt $WARNING ] +then + echo "Tape: CRITICAL global status: $CODE" + exit 2 +elif [ $CODE -eq $WARNING ] +then + echo "Tape: WARNING global status: $CODE" + exit 1 +else + echo "Tape: OK global status: $CODE" + exit 0 +fi diff --git a/roles/nagios_server/files/nagios/plugins/check_testcloud b/roles/nagios_server/files/nagios/plugins/check_testcloud new file mode 100644 index 0000000000..eb8c7aab3b --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/check_testcloud @@ -0,0 +1,19 @@ +#!/bin/bash + +RUNNING_VMS=`testcloud instance list | grep -i 'running' | wc -l` +CRITICAL=20 +WARNING=15 + + +if [ $RUNNING_VMS -gt $CRITICAL ] +then + echo "Testcloud: CRITICAL Number of VMs running: $RUNNING_VMS" + exit 2 +elif [ $RUNNING_VMS -gt $WARNING ] +then + echo "Testcloud: WARNING Number of VMs running: $RUNNING_VMS" + exit 1 +else + echo "Testcloud: OK Number of VMs running: $RUNNING_VMS" + exit 0 +fi diff --git a/roles/nagios_server/files/nagios/plugins/restart_httpd b/roles/nagios_server/files/nagios/plugins/restart_httpd new file mode 100755 index 0000000000..f461f45553 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/restart_httpd @@ -0,0 +1,73 @@ +#!/bin/sh +# +# Event handler script for restarting the web server on the 
local machine +# +# Note: This script will only restart the web server if the service is +# retried 3 times (in a "soft" state) or if the web service somehow +# manages to fall into a "hard" error state. +# + +servicestate=$1 +servicestatetype=$2 +serviceattempt=$3 +remotehost=$4 +hostalias=$5 +servicedesc=$6 +servicestate=$7 + +# What state is the HTTP service in? +case "$servicestate" in +OK) + # The service just came back up, so don't do anything... + ;; +WARNING) + # We don't really care about warning states, since the service is probably still running... + ;; +UNKNOWN) + # We don't know what might be causing an unknown error, so don't do anything... + ;; +CRITICAL) + # Aha! The HTTP service appears to have a problem - perhaps we should restart the server... + + # Is this a "soft" or a "hard" state? + case "$servicestatetype" in + + # We're in a "soft" state, meaning that Nagios is in the middle of retrying the + # check before it turns into a "hard" state and contacts get notified... + SOFT) + + # What check attempt are we on? We don't want to restart the web server on the first + # check, because it may just be a fluke! + case "$serviceattempt" in + + # Wait until the check has been tried 2 times before reloading the web server. + # If the check fails on the 4th time (after we restart the web server), the state + # type will turn to "hard" and contacts will be notified of the problem. + # Hopefully this will restart the web server successfully, so the 4th check will + # result in a "soft" recovery. If that happens no one gets notified because we + # fixed the problem! + 2) + echo -n "Restarting HTTP service (3rd soft critical state)..." + # Call the init script to restart the HTTPD server + echo "#fedora-noc $hostalias - Attempting to reload httpd. 
$servicedesc is $servicestate (2nd check)" | /usr/bin/nc -w 1 value01 5050 + /usr/lib64/nagios/plugins/check_nrpe -H $remotehost -c service_httpd_reload + ;; + esac + ;; + + # The HTTP service somehow managed to turn into a hard error without getting fixed. + # It should have been restarted by the code above, but for some reason it didn't. + # Let's give it one last try, shall we? + # Note: Contacts have already been notified of a problem with the service at this + # point (unless you disabled notifications for this service) + HARD) + echo -n "Restarting HTTP service..." + echo "#fedora-noc $hostalias - Attempting to reload httpd. $servicedesc is $servicestate" | /usr/bin/nc -w 1 value01 5050 + # Call the init script to restart the HTTPD server + /usr/lib64/nagios/plugins/check_nrpe -H $remotehost -c service_httpd_restart + ;; + esac + ;; +esac +exit 0 + diff --git a/roles/nagios_server/files/nagios/plugins/restart_rsyslog b/roles/nagios_server/files/nagios/plugins/restart_rsyslog new file mode 100755 index 0000000000..2ece8a3e42 --- /dev/null +++ b/roles/nagios_server/files/nagios/plugins/restart_rsyslog @@ -0,0 +1,73 @@ +#!/bin/sh +# +# Event handler script for restarting the rsyslog server on the local machine +# +# Note: This script will only restart rsyslog if the service is +# retried 3 times (in a "soft" state) or if the rsyslog service somehow +# manages to fall into a "hard" error state. +# + +servicestate=$1 +servicestatetype=$2 +serviceattempt=$3 +remotehost=$4 +hostalias=$5 +servicedesc=$6 +servicestate=$7 + +# What state is the rsyslog service in? +case "$servicestate" in +OK) + # The service just came back up, so don't do anything... + ;; +WARNING) + # We don't really care about warning states, since the service is probably still running... + ;; +UNKNOWN) + # We don't know what might be causing an unknown error, so don't do anything... + ;; +CRITICAL) + # Aha! The rsyslog service appears to have a problem - perhaps we should restart the server...
+ + # Is this a "soft" or a "hard" state? + case "$servicestatetype" in + + # We're in a "soft" state, meaning that Nagios is in the middle of retrying the + # check before it turns into a "hard" state and contacts get notified... + SOFT) + + # What check attempt are we on? We don't want to restart rsyslog on the first + # check, because it may just be a fluke! + case "$serviceattempt" in + + # Wait until the check has been tried 2 times before reloading rsyslog. + # If the check fails on the 4th time (after we restart rsyslog), the state + # type will turn to "hard" and contacts will be notified of the problem. + # Hopefully this will restart rsyslog successfully, so the 4th check will + # result in a "soft" recovery. If that happens no one gets notified because we + # fixed the problem! + 2) + echo -n "Restarting rsyslog service (3rd soft critical state)..." + # Call the init script to restart the rsyslog server + echo "#fedora-noc $hostalias - Attempting to reload rsyslog. $servicedesc is $servicestate (2nd check)" | /usr/bin/nc -w 1 value01 5050 + /usr/lib64/nagios/plugins/check_nrpe -H $remotehost -c service_rsyslog_reload + ;; + esac + ;; + + # The rsyslog service somehow managed to turn into a hard error without getting fixed. + # It should have been restarted by the code above, but for some reason it didn't. + # Let's give it one last try, shall we? + # Note: Contacts have already been notified of a problem with the service at this + # point (unless you disabled notifications for this service) + HARD) + echo -n "Restarting rsyslog service..." + echo "#fedora-noc $hostalias - Attempting to restart rsyslog.
$servicedesc is $servicestate" | /usr/bin/nc -w 1 value01 5050 + # Call the init script to restart the rsyslog server + /usr/lib64/nagios/plugins/check_nrpe -H $remotehost -c service_rsyslog_restart + ;; + esac + ;; +esac +exit 0 + diff --git a/roles/nagios_server/files/nagios/scripts/check_nagios_notifications.py b/roles/nagios_server/files/nagios/scripts/check_nagios_notifications.py new file mode 100755 index 0000000000..7e66202018 --- /dev/null +++ b/roles/nagios_server/files/nagios/scripts/check_nagios_notifications.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# +# A script to read the Nagios status file and send email about hosts and +# services that have notifications off but have recovered. +# +# Written by Athmane Madjoudj , 2011-11-15 +# based on tummy.com's work , 2010-11-16 +# Released under the GPLv2. + +import re +from smtplib import SMTP +from email.mime.text import MIMEText +from socket import gethostname + +# Settings +debug = 0 +EMAIL_FROM="nagios@fedoraproject.org" +EMAIL_TO="sysadmin-noc-members@fedoraproject.org" +#EMAIL_TO="athmane@fedoraproject.org" +nagios_status_file = '/var/log/nagios/status.dat' + +class NagiosStatus: + def __init__(self, filename): + self.filename = filename + self.hosts = {} + self.load_status_file() + + def load_status_file(self): + fp = open(self.filename, 'r') + while True: + line = fp.readline() + if not line: break + + m = re.match(r'^hoststatus\s+{\s*$', line) + if m: + if debug >= 2: print 'START OF HOST' + data = { 'services' : [] } + while True: + line = fp.readline() + if not line: break + if debug >= 2: print 'host: %s' % line.rstrip() + m2 = re.match(r'^\s+([^=]+)=(\S.*)*$', line.rstrip()) + if not m2: break + data[m2.group(1)] = m2.group(2) + self.hosts[data['host_name']] = data + if debug >= 2: print 'END OF HOST' + + m = re.match(r'^servicestatus\s+{\s*$', line) + if m: + if debug >= 2: print 'START OF SERVICE' + data = {} + while True: + line = fp.readline() + if not line: break + if debug >= 2: print 'service: %s' % line.rstrip() + m2
= re.match(r'^\s+([^=]+)=(.*)$', line.rstrip()) + if not m2: break + data[m2.group(1)] = m2.group(2) + self.hosts[data['host_name']]['services'].append(data) + if debug >= 2: print 'END OF SERVICE' + +def main(): + status = NagiosStatus(nagios_status_file) + output = "" + for host in sorted(status.hosts.keys()): + host = status.hosts[host] + if host.get('notifications_enabled', None) == None: + output+= 'Host %s has no notifications_enabled line \n' % host['host_name'] + continue + + # are there any hard states that aren't 0 or 1? + hard_states = [ x for x in + [ int(x['last_hard_state']) for x in host['services'] ] + if not x in [0,1] ] + need_newline = False + if host['notifications_enabled'] == '0' and not hard_states: + output += ('Host %s has notifications disabled and all services ok \n' + % host['host_name']) + need_newline = True + + for service in host['services']: + if debug: print '%s@%s' % ( service['check_command'], host['host_name'] ) + if debug: print ' notifications_enabled: %(notifications_enabled)s last_hard_state: %(last_hard_state)s' % service + if (int(service['notifications_enabled']) == 0 + and int(service['last_hard_state']) in [0,1]): + output+= (('Service %(check_command)s@%(host_name)s\n' + ' has notifications disabled, but is ok\n') % service) + need_newline = True + + if need_newline: output+="\n\n" + + if output.strip() != '': + msg_body = "List of notifications off for recovered hosts/services: \n\n"+output + msg = MIMEText(msg_body) + msg['Subject']="Notifications status on %s" % gethostname() + msg['From']=EMAIL_FROM + msg['To']=EMAIL_TO + smtp_conn = SMTP() + smtp_conn.connect('localhost') + smtp_conn.sendmail(EMAIL_FROM, EMAIL_TO, msg.as_string()) + smtp_conn.quit() + +if __name__ == '__main__': + main() diff --git a/roles/nagios_server/files/nagios/scripts/irc-colorize.py b/roles/nagios_server/files/nagios/scripts/irc-colorize.py new file mode 100755 index 0000000000..d62d3658a6 --- /dev/null +++ 
b/roles/nagios_server/files/nagios/scripts/irc-colorize.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +""" Reads a string from stdin and prints it to stdout with irc colors + +:license: LGPLv2+ +:author: Ralph Bean +""" + +import sys + +mirc_colors = { + "white": 0, + "black": 1, + "blue": 2, + "green": 3, + "red": 4, + "brown": 5, + "purple": 6, + "orange": 7, + "yellow": 8, + "light green": 9, + "teal": 10, + "light cyan": 11, + "light blue": 12, + "pink": 13, + "grey": 14, + "light grey": 15, +} + +mapping = { + 'RECOVERY': 'green', + 'OK': 'green', + 'ACKNOWLEDGEMENT': 'yellow', + 'UNKNOWN': 'purple', + 'WARNING': 'teal', + # 'red' probably makes the most sense here, but it behaved oddly + 'PROBLEM': 'brown', + 'CRITICAL': 'brown', +} + + +def markup(string, color): + return "\x02\x03%i%s\x03\x02" % (mirc_colors[color], string) + + +def colorize(word): + suffix = '' + if word.endswith(':'): + word, suffix = word[:-1], word[-1] + + if word in mapping: + word = markup(word, mapping[word]) + + return word + suffix + + +if __name__ == '__main__': + lines = sys.stdin.readlines() + for line in lines: + print " ".join([colorize(word) for word in line.strip().split()]) diff --git a/roles/nagios_server/files/nagios/servicedeps/nrpe.cfg b/roles/nagios_server/files/nagios/servicedeps/nrpe.cfg new file mode 100644 index 0000000000..304224f39e --- /dev/null +++ b/roles/nagios_server/files/nagios/servicedeps/nrpe.cfg @@ -0,0 +1,152 @@ +define servicedependency { + host_name hosted03 + service_description nrpe + dependent_host_name hosted03 + dependent_service_description Disk Space /srv, Disk Space /, Total Processes, Zombie Processes, Cron Daemon, Check Raid, Swap + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +#define servicedependency { +# host_name hosted04 +# service_description nrpe +# dependent_host_name hosted04 +# dependent_service_description Disk Space /srv, Disk Space /, Total Processes, Zombie Processes, Cron Daemon, Check Raid, Swap +# 
notification_failure_criteria w,c +# execution_failure_criteria w,c +#} + +define servicedependency { + host_name pkgdb01 + service_description nrpe + dependent_host_name pkgdb01 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name pkgdb02 + service_description nrpe + dependent_host_name pkgdb02 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name pkgdb01.stg + service_description nrpe + dependent_host_name pkgdb01.stg + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +#define servicedependency { +# host_name bapp02 +# service_description nrpe +# dependent_host_name bapp02 +# dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes +# notification_failure_criteria w,c +# execution_failure_criteria w,c +#} + +define servicedependency { + host_name bastion02 + service_description nrpe + dependent_host_name bastion02 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name bastion-vpn + service_description nrpe + dependent_host_name bastion-vpn + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +#define servicedependency { +# host_name bodhost01 +# service_description nrpe +# dependent_host_name bodhost01 +# dependent_service_description Check Raid, 
Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes +# notification_failure_criteria w,c +# execution_failure_criteria w,c +#} + +define servicedependency { + host_name sundries01 + service_description nrpe + dependent_host_name sundries01 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name sundries01.stg + service_description nrpe + dependent_host_name sundries01.stg + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name sundries02 + service_description nrpe + dependent_host_name sundries02 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name wiki01 + service_description nrpe + dependent_host_name wiki01 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name wiki01.stg + service_description nrpe + dependent_host_name wiki01.stg + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name wiki02 + service_description nrpe + dependent_host_name wiki02 + dependent_service_description Check Raid, Cron Daemon, Disk Space /, Swap, Total Processes, Zombie Processes + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name packages03 + service_description 
packages-internal + dependent_host_name packages03 + dependent_service_description packages-internal-bugstab + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name packages04 + service_description packages-internal + dependent_host_name packages04 + dependent_service_description packages-internal-bugstab + notification_failure_criteria w,c + execution_failure_criteria w,c +} diff --git a/roles/nagios_server/files/nagios/servicedeps/websitedeps.cfg b/roles/nagios_server/files/nagios/servicedeps/websitedeps.cfg new file mode 100644 index 0000000000..a526b17710 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicedeps/websitedeps.cfg @@ -0,0 +1,90 @@ + +define servicedependency { + host_name proxy03.fedoraproject.org + service_description https + dependent_host_name proxy03.fedoraproject.org + dependent_service_description bodhi, pkgdb, elections, accounts, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, docs.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 209.132.181.16-phx2 + service_description https + dependent_host_name 209.132.181.16-phx2 + dependent_service_description bodhi, pkgdb, elections, docs.fedoraproject.org, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, start.fedoraproject.org, accounts, fedoraproject.org, fedoraproject.org - wiki + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 152.19.134.142-ibiblio + service_description https + dependent_host_name 152.19.134.142-ibiblio + dependent_service_description bodhi, pkgdb, elections, accounts, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, docs.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 152.19.134.198-ibiblio + 
service_description https + dependent_host_name 152.19.134.198-ibiblio + dependent_service_description bodhi, pkgdb, elections, accounts, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, docs.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 85.236.55.6-internetx + service_description https + dependent_host_name 85.236.55.6-internetx + dependent_service_description bodhi, pkgdb, elections, accounts, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, docs.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 67.203.2.67-coloamerica + service_description https + dependent_host_name 67.203.2.67-coloamerica + dependent_service_description bodhi, pkgdb, elections, docs.fedoraproject.org, mirrors.fedoraproject.org - mirrorlist, mirrors.fedoraproject.org - publiclist, start.fedoraproject.org, accounts, fedoraproject.org, fedoraproject.org - wiki + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 66.35.62.162-tummy + service_description http + dependent_host_name 66.35.62.162-tummy + dependent_service_description fedoraproject.org, fedoraproject.org - wiki, start.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 152.19.134.142-ibiblio + service_description http + dependent_host_name 152.19.134.142-ibiblio + dependent_service_description fedoraproject.org, fedoraproject.org - wiki, start.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 152.19.134.198-ibiblio + service_description http + dependent_host_name 152.19.134.198-ibiblio + dependent_service_description fedoraproject.org, fedoraproject.org - wiki, start.fedoraproject.org + 
notification_failure_criteria w,c + execution_failure_criteria w,c +} + +define servicedependency { + host_name 85.236.55.6-internetx + service_description http + dependent_host_name 85.236.55.6-internetx + dependent_service_description fedoraproject.org, fedoraproject.org - wiki, start.fedoraproject.org + notification_failure_criteria w,c + execution_failure_criteria w,c +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/autoqa.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/autoqa.cfg new file mode 100644 index 0000000000..5bf6bde22b --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/autoqa.cfg @@ -0,0 +1,5 @@ +#define servicegroup { +# servicegroup_name autoqa +# alias AutoQA Hosts +# members autoqa01,autoqa01-autotest-frontend +#} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/bodhi.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/bodhi.cfg new file mode 100644 index 0000000000..46dbcba899 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/bodhi.cfg @@ -0,0 +1,7 @@ +define servicegroup { + servicegroup_name bodhi + alias Bodhi + members proxy01,bodhi,proxy02,bodhi,proxy03,bodhi,proxy04,bodhi,proxy06,bodhi,proxy08,bodhi,proxy09,bodhi,proxy05,bodhi,proxy10,bodhi,proxy11,bodhi,proxy12,bodhi,bodhi03,bodhi-internal,bodhi04,bodhi-internal + +} + diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/fas.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fas.cfg new file mode 100644 index 0000000000..d7cec01444 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fas.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name fas + alias Fedora Account System + members proxy01,accounts,proxy02,accounts,proxy03,accounts,proxy04,accounts,proxy08,accounts,proxy12,accounts,fas01,accounts,fas02,accounts,fas03,accounts,db-fas01,Check FAS 
DB,proxy05,accounts,proxy10,accounts,proxy11,accounts,proxy12,accounts,proxy06,accounts +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/fedorahosted.org.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fedorahosted.org.cfg new file mode 100644 index 0000000000..aa36349824 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fedorahosted.org.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name fedorahosted + alias Fedora Hosted + members hosted03,BZR,hosted03,GIT,hosted03,bzr.fedorahosted.org,hosted03,fedorahosted.org,hosted03,git.fedorahosted.org,hosted03,hg.fedorahosted.org,hosted03,svn.fedorahosted.org +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/fp-wiki.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fp-wiki.cfg new file mode 100644 index 0000000000..8a57d1d73c --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/fp-wiki.cfg @@ -0,0 +1,6 @@ +define servicegroup { + servicegroup_name fp-wiki + alias Fedora Project Wiki + members proxy01,fedoraproject.org - wiki - non-cached,proxy02,fedoraproject.org - wiki - non-cached,proxy03,fedoraproject.org - wiki - non-cached,proxy04,fedoraproject.org - wiki - non-cached,proxy06,fedoraproject.org - wiki - non-cached,proxy08,fedoraproject.org - wiki - non-cached,proxy09,fedoraproject.org - wiki - non-cached,proxy01,fedoraproject.org - wiki,proxy02,fedoraproject.org - wiki,proxy03,fedoraproject.org - wiki,proxy04,fedoraproject.org - wiki,proxy06,fedoraproject.org - wiki,proxy08,fedoraproject.org - wiki,proxy09,fedoraproject.org - wiki,proxy05,fedoraproject.org - wiki,proxy10,fedoraproject.org - wiki,proxy11,fedoraproject.org - wiki,proxy12,fedoraproject.org - wiki +} + diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/freemedia.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/freemedia.cfg new file mode 100644 index 
0000000000..e88f0b3e1e --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/freemedia.cfg @@ -0,0 +1,6 @@ +define servicegroup { + servicegroup_name freemedia + alias FreeMedia + members sundries01,freemedia-internal,sundries02,freemedia-internal +} + diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/ipa.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/ipa.cfg new file mode 100644 index 0000000000..6075220e28 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/ipa.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name ipa + alias IPA Servers + members ipa01,IPA Replication Status,ipa02,IPA Replication Status +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-http.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-http.cfg new file mode 100644 index 0000000000..3a7858bc2c --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-http.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name mgmt-http + alias mgmt interfaces on http + members 
backup01.mgmt.fedoraproject.org,backup01.mgmt.fedoraproject.org-http,bvirthost01.mgmt.fedoraproject.org,bvirthost01.mgmt.fedoraproject.org-http,download01.mgmt.fedoraproject.org,download01.mgmt.fedoraproject.org-http,download02.mgmt.fedoraproject.org,download02.mgmt.fedoraproject.org-http,download03.mgmt.fedoraproject.org,download03.mgmt.fedoraproject.org-http,download04.mgmt.fedoraproject.org,download04.mgmt.fedoraproject.org-http,download05.mgmt.fedoraproject.org,download05.mgmt.fedoraproject.org-http,qa01.mgmt.fedoraproject.org,qa01.mgmt.fedoraproject.org-http,qa02.mgmt.fedoraproject.org,qa02.mgmt.fedoraproject.org-http,qa03.mgmt.fedoraproject.org,qa03.mgmt.fedoraproject.org-http,qa04.mgmt.fedoraproject.org,qa04.mgmt.fedoraproject.org-http,qa05.mgmt.fedoraproject.org,qa05.mgmt.fedoraproject.org-http,qa06.mgmt.fedoraproject.org,qa06.mgmt.fedoraproject.org-http,qa07.mgmt.fedoraproject.org,qa07.mgmt.fedoraproject.org-http,qa08.mgmt.fedoraproject.org,qa08.mgmt.fedoraproject.org-http,qa09.mgmt.fedoraproject.org,qa09.mgmt.fedoraproject.org-http,qa10.mgmt.fedoraproject.org,qa10.mgmt.fedoraproject.org-http,qa11.mgmt.fedoraproject.org,qa11.mgmt.fedoraproject.org-http,qa12.mgmt.fedoraproject.org,qa12.mgmt.fedoraproject.org-http,qa13.mgmt.fedoraproject.org,qa13.mgmt.fedoraproject.org-http,qa14.mgmt.fedoraproject.org,qa14.mgmt.fedoraproject.org-http,virthost01.mgmt.fedoraproject.org,virthost01.mgmt.fedoraproject.org-http,virthost03.mgmt.fedoraproject.org,virthost03.mgmt.fedoraproject.org-http,virthost12.mgmt.fedoraproject.org,virthost12.mgmt.fedoraproject.org-http,virthost14.mgmt.fedoraproject.org,virthost14.mgmt.fedoraproject.org-http,virthost15.mgmt.fedoraproject.org,virthost15.mgmt.fedoraproject.org-http,virthost16.mgmt.fedoraproject.org,virthost16.mgmt.fedoraproject.org-http,virthost17.mgmt.fedoraproject.org,virthost17.mgmt.fedoraproject.org-http,virthost18.mgmt.fedoraproject.org,virthost18.mgmt.fedoraproject.org-http,virthost-comm02.mgmt.fedoraproject.org,virthost-comm0
2.mgmt.fedoraproject.org-http, virthost-comm03.mgmt.fedoraproject.org,virthost-comm03.mgmt.fedoraproject.org-http, virthost-comm04.mgmt.fedoraproject.org,virthost-comm04.mgmt.fedoraproject.org-http,sign-vault03.mgmt.fedoraproject.org,sign-vault03.mgmt.fedoraproject.org-http,sign-vault04.mgmt.fedoraproject.org,sign-vault04.mgmt.fedoraproject.org-http,virthost02.mgmt.fedoraproject.org,virthost02.mgmt.fedoraproject.org-http,virthost11.mgmt.fedoraproject.org,virthost11.mgmt.fedoraproject.org-http +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-https.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-https.cfg new file mode 100644 index 0000000000..619f17742d --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mgmt-https.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name mgmt-https + alias mgmt interfaces on https + members backup01.mgmt.fedoraproject.org,backup01.mgmt.fedoraproject.org-https,bvirthost01.mgmt.fedoraproject.org,bvirthost01.mgmt.fedoraproject.org-https,download01.mgmt.fedoraproject.org,download01.mgmt.fedoraproject.org-https,download02.mgmt.fedoraproject.org,download02.mgmt.fedoraproject.org-https,download03.mgmt.fedoraproject.org,download03.mgmt.fedoraproject.org-https,download04.mgmt.fedoraproject.org,download04.mgmt.fedoraproject.org-https,download05.mgmt.fedoraproject.org,download05.mgmt.fedoraproject.org-https,qa01.mgmt.fedoraproject.org,qa01.mgmt.fedoraproject.org-https,qa02.mgmt.fedoraproject.org,qa02.mgmt.fedoraproject.org-https,qa03.mgmt.fedoraproject.org,qa03.mgmt.fedoraproject.org-https,qa04.mgmt.fedoraproject.org,qa04.mgmt.fedoraproject.org-https,qa05.mgmt.fedoraproject.org,qa05.mgmt.fedoraproject.org-https,qa06.mgmt.fedoraproject.org,qa06.mgmt.fedoraproject.org-https,qa07.mgmt.fedoraproject.org,qa07.mgmt.fedoraproject.org-https,qa08.mgmt.fedoraproject.org,qa08.mgmt.fedoraproject.org-https,qa09.mgmt.fedoraproject.org,qa09.mgmt.fedoraproject.org-https
,qa10.mgmt.fedoraproject.org,qa10.mgmt.fedoraproject.org-https,qa11.mgmt.fedoraproject.org,qa11.mgmt.fedoraproject.org-https,qa12.mgmt.fedoraproject.org,qa12.mgmt.fedoraproject.org-https,qa13.mgmt.fedoraproject.org,qa13.mgmt.fedoraproject.org-https,qa14.mgmt.fedoraproject.org,qa14.mgmt.fedoraproject.org-https,virthost02.mgmt.fedoraproject.org,virthost02.mgmt.fedoraproject.org-https,virthost11.mgmt.fedoraproject.org,virthost11.mgmt.fedoraproject.org-https,virthost03.mgmt.fedoraproject.org,virthost03.mgmt.fedoraproject.org-https,virthost12.mgmt.fedoraproject.org,virthost12.mgmt.fedoraproject.org-https,virthost14.mgmt.fedoraproject.org,virthost14.mgmt.fedoraproject.org-https,virthost15.mgmt.fedoraproject.org,virthost15.mgmt.fedoraproject.org-https,virthost16.mgmt.fedoraproject.org,virthost16.mgmt.fedoraproject.org-https,virthost17.mgmt.fedoraproject.org,virthost17.mgmt.fedoraproject.org-https,virthost18.mgmt.fedoraproject.org,virthost18.mgmt.fedoraproject.org-https,virthost-comm02.mgmt.fedoraproject.org,virthost-comm02.mgmt.fedoraproject.org-https, virthost-comm03.mgmt.fedoraproject.org,virthost-comm03.mgmt.fedoraproject.org-https, virthost-comm04.mgmt.fedoraproject.org,virthost-comm04.mgmt.fedoraproject.org-https, sign-vault03.mgmt.fedoraproject.org,sign-vault03.mgmt.fedoraproject.org-https,sign-vault04.mgmt.fedoraproject.org,sign-vault04.mgmt.fedoraproject.org-https,fed-cloud09.mgmt.fedoraproject.org,fed-cloud09.mgmt.fedoraproject.org-https,fed-cloud08.mgmt.fedoraproject.org,fed-cloud08.mgmt.fedoraproject.org-https,fed-cloud10.mgmt.fedoraproject.org,fed-cloud10.mgmt.fedoraproject.org-https,fed-cloud11.mgmt.fedoraproject.org,fed-cloud11.mgmt.fedoraproject.org-https,fed-cloud12.mgmt.fedoraproject.org,fed-cloud12.mgmt.fedoraproject.org-https,fed-cloud13.mgmt.fedoraproject.org,fed-cloud13.mgmt.fedoraproject.org-https,fed-cloud14.mgmt.fedoraproject.org,fed-cloud14.mgmt.fedoraproject.org-https,fed-cloud15.mgmt.fedoraproject.org,fed-cloud15.mgmt.fedoraproject.org-https +} 
diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/mirrorlist.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mirrorlist.cfg new file mode 100644 index 0000000000..3bfb386761 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/mirrorlist.cfg @@ -0,0 +1,6 @@ +define servicegroup { + servicegroup_name mirrorlist + alias Mirrorlist + members proxy01,mirrors.fedoraproject.org - mirrorlist,proxy02,mirrors.fedoraproject.org - mirrorlist,proxy03,mirrors.fedoraproject.org - mirrorlist,proxy04,mirrors.fedoraproject.org - mirrorlist,proxy06,mirrors.fedoraproject.org - mirrorlist,proxy08,mirrors.fedoraproject.org - mirrorlist,proxy09,mirrors.fedoraproject.org - mirrorlist,proxy05,mirrors.fedoraproject.org - mirrorlist,proxy10,mirrors.fedoraproject.org - mirrorlist,proxy11,mirrors.fedoraproject.org - mirrorlist,proxy12,mirrors.fedoraproject.org - mirrorlist +} + diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/pkgdb.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/pkgdb.cfg new file mode 100644 index 0000000000..566078c695 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/pkgdb.cfg @@ -0,0 +1,6 @@ +define servicegroup { + servicegroup_name pkgdb + alias Package Database + members proxy01,pkgdb-external,proxy02,pkgdb-external,proxy03,pkgdb-external,proxy04,pkgdb-external,proxy08,pkgdb-external,proxy09,pkgdb-external,pkgdb01,pkgdb-internal,pkgdb02,pkgdb-internal,pkgdb01.stg,pkgdb-internal,proxy11,pkgdb-external,proxy12,pkgdb-external +} + diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/ppc-secondary.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/ppc-secondary.cfg new file mode 100644 index 0000000000..ab33bf3f3c --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/ppc-secondary.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name ppc-secondary + alias 
PPC Secondary Hosts + members ppc-hub,ppc-koji-frontend +} diff --git a/roles/nagios_server/files/nagios/servicegroups/servicegroups/retrace.cfg b/roles/nagios_server/files/nagios/servicegroups/servicegroups/retrace.cfg new file mode 100644 index 0000000000..dc30917ca4 --- /dev/null +++ b/roles/nagios_server/files/nagios/servicegroups/servicegroups/retrace.cfg @@ -0,0 +1,5 @@ +define servicegroup { + servicegroup_name retrace + alias Retrace Hosts + members retrace01.qa,Disk space /,retrace01.qa,Total Processes,retrace01.qa,Check Raid,retrace01.qa,Swap,retrace01.qa,SSH +} diff --git a/roles/nagios_server/files/nagios/services/autocloud.cfg b/roles/nagios_server/files/nagios/services/autocloud.cfg new file mode 100644 index 0000000000..28dcb8f7fd --- /dev/null +++ b/roles/nagios_server/files/nagios/services/autocloud.cfg @@ -0,0 +1,13 @@ +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check for autocloud proc + check_command check_by_nrpe!check_autocloud_proc + use defaulttemplate +} + +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check for redis proc + check_command check_by_nrpe!check_redis_proc + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/basset.cfg b/roles/nagios_server/files/nagios/services/basset.cfg new file mode 100644 index 0000000000..4ea295aaa6 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/basset.cfg @@ -0,0 +1,27 @@ +define service { + host_name basset01 + service_description mongo process + check_command check_by_nrpe!check_mongo_proc + use defaulttemplate +} + +define service { + host_name basset01 + service_description rabbitmq process + check_command check_by_nrpe!check_rabbitmq_proc + use defaulttemplate +} + +define service { + host_name basset01 + service_description basset worker processes + check_command check_by_nrpe!check_worker_proc + use defaulttemplate +} + +define service { + 
host_name basset01 + service_description basset processing queue + check_command check_by_nrpe!check_basset_queue + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/copr.cfg b/roles/nagios_server/files/nagios/services/copr.cfg new file mode 100644 index 0000000000..f7ab32985c --- /dev/null +++ b/roles/nagios_server/files/nagios/services/copr.cfg @@ -0,0 +1,6 @@ +define service { + host_name copr-be + service_description Check Copr backend consecutive build failures + check_command check_by_nrpe!check_copr_backend_failed + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/db_backups.cfg b/roles/nagios_server/files/nagios/services/db_backups.cfg new file mode 100644 index 0000000000..c285f7d8a0 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/db_backups.cfg @@ -0,0 +1,21 @@ +define service { + host_name db03 + service_description Check MySQL Backup + check_command check_by_nrpe!check_mysql_backup + use defaulttemplate +} + +#define service { +# host_name db05 +# service_description Check Koji PGSQL Backup +# check_command check_by_nrpe!check_pgsql_koji_backup +# use defaulttemplate +#} + +#define service { +# host_name db05, db01 +# service_description Check PGSQL Backup +# check_command check_by_nrpe!check_pgsql_backup +# use defaulttemplate +#} + diff --git a/roles/nagios_server/files/nagios/services/disk.cfg b/roles/nagios_server/files/nagios/services/disk.cfg new file mode 100644 index 0000000000..6e084000dc --- /dev/null +++ b/roles/nagios_server/files/nagios/services/disk.cfg @@ -0,0 +1,99 @@ +define service { + hostgroup servers + service_description Disk Space / + check_command check_by_nrpe!check_disk_/ + use disktemplate +} + +#define service { +# hostgroup buildservers +# service_description Disk Space / +# check_command check_by_nrpe!check_disk_/ +# use builderdisktemplate +# retry_check_interval 5 +#} + +define service { + host_name noc01, proxy01, proxy02, rawhide-composer, db01 + 
service_description Disk Space /boot + check_command check_by_nrpe!check_disk_/boot + use disktemplate +} + +define service { + hostgroup hosted + service_description Disk Space /srv + check_command check_by_nrpe!check_disk_/srv + use disktemplate +} + +define service { + host_name qa10.qa, qa11.qa, qa12.qa, qa13.qa + service_description Disk Space /srv + check_command check_by_nrpe!check_disk_/srv + use disktemplate +} + +define service { + host_name taskotron01.qa, taskotron-stg01.qa, taskotron-dev01.qa + service_description Disk Space /srv/buildmaster + check_command check_by_nrpe!check_disk_/srv/buildmaster + use disktemplate +} + +define service { + host_name taskotron01.qa, taskotron-stg01.qa, taskotron-dev01.qa + service_description Disk Space /srv/taskotron + check_command check_by_nrpe!check_disk_/srv/taskotron + use disktemplate +} + + +#define service { +# host_name hosted04 +# service_description Disk Space /srv +# check_command check_by_nrpe!check_disk_/srv +# use disktemplate +#} + +define service { + host_name log01 + service_description Disk space /var/log + check_command check_by_nrpe!check_disk_/var/log + use disktemplate +} + +#define service { +# host_name nfs01 +# service_description Disk space /mnt/koji +# check_command check_by_nrpe!check_disk_/mnt/koji +# use disktemplate +#} + +define service { + host_name pkgs02 + service_description Check read-only filesystem + check_command check_by_nrpe!check_readonly_fs + use disktemplate +} + +define service { + host_name pkgs02 + service_description Disk space /srv/cache/lookaside + check_command check_by_nrpe!check_disk_/srv/cache/lookaside + use disktemplate +} + +define service { + host_name ppc-hub + service_description Disk space / + check_command check_by_nrpe!check_disk_/ + use ppc-secondarytemplate +} + +define service { + host_name retrace01.qa + service_description Disk space / + check_command check_by_nrpe!check_disk_/ + use retracetemplate +} diff --git 
a/roles/nagios_server/files/nagios/services/dns.cfg b/roles/nagios_server/files/nagios/services/dns.cfg new file mode 100644 index 0000000000..0f1eb14b8d --- /dev/null +++ b/roles/nagios_server/files/nagios/services/dns.cfg @@ -0,0 +1,6 @@ +define service { + hostgroup_name dnsservers + service_description DNS: fp.o + check_command check_dns_fpo + use criticaltemplate +} diff --git a/roles/nagios_server/files/nagios/services/fedmsg.cfg b/roles/nagios_server/files/nagios/services/fedmsg.cfg new file mode 100644 index 0000000000..7012d87dc7 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/fedmsg.cfg @@ -0,0 +1,712 @@ +## There are lots of different sections in this now-enormous file +## Each one starts with a 'BEGIN' comment. + + +# BEGIN, check for the existence of processes +define service { + host_name value01 + service_description Check for fedmsg-irc proc + check_command check_by_nrpe!check_fedmsg_irc_proc + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check for fedmsg-gateway proc + check_command check_by_nrpe!check_fedmsg_gateway_proc + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check for fedmsg-relay proc + check_command check_by_nrpe!check_fedmsg_relay_proc + use defaulttemplate +} + +define service { + host_name proxy01,proxy02,proxy03,proxy04,proxy05,proxy06,proxy08,proxy09,proxy10,proxy11,proxy12 + service_description Check for existence fedmsg-gateway proc + check_command check_by_nrpe!check_fedmsg_gateway_proc + use defaulttemplate +} + +define service { + host_name anitya-frontend01 + service_description Check for fedmsg-relay proc + check_command check_by_nrpe!check_fedmsg_relay_proc + use defaulttemplate +} + +define service { + host_name badges-backend01 + service_description
Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name summershum01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name pkgs02 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name fedimg01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name hotness01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name bodhi-backend03 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_masher_proc + use defaulttemplate +} + +define service { + host_name bodhi-backend02 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_masher_proc + use defaulttemplate +} +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} +define service { + host_name packages03 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} +define service { + host_name bugyou01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_procs_bugyou + use defaulttemplate +} +define service { + host_name pdc-backend01 + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + + +# Odd one, check for the supybot fedmsg plugin +define service { + host_name value01 + service_description Check supybot fedmsg plugin + check_command 
check_by_nrpe!check_supybot_fedmsg_plugin + use defaulttemplate +} + + +# BEGIN, check datanommer history +define service { + host_name busgateway01 + service_description Check datanommer for recent buildsys/koji messages + check_command check_by_nrpe!check_datanommer_buildsys + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent git messages + check_command check_by_nrpe!check_datanommer_git + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent bodhi messages + check_command check_by_nrpe!check_datanommer_bodhi + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent wiki messages + check_command check_by_nrpe!check_datanommer_wiki + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent compose messages + check_command check_by_nrpe!check_datanommer_compose + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent meetbot messages + check_command check_by_nrpe!check_datanommer_meetbot + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fas messages + check_command check_by_nrpe!check_datanommer_fas + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent pkgdb messages + check_command check_by_nrpe!check_datanommer_pkgdb + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fedoratagger messages + check_command check_by_nrpe!check_datanommer_fedoratagger + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fedoraplanet messages + check_command check_by_nrpe!check_datanommer_planet + use defaulttemplate +} 
+define service { + host_name busgateway01 + service_description Check datanommer for recent copr finished build messages + check_command check_by_nrpe!check_datanommer_copr + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent trac messages + check_command check_by_nrpe!check_datanommer_trac + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent askbot messages + check_command check_by_nrpe!check_datanommer_askbot + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fedbadges messages + check_command check_by_nrpe!check_datanommer_fedbadges + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fedocal messages + check_command check_by_nrpe!check_datanommer_fedocal + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent ansible messages + check_command check_by_nrpe!check_datanommer_ansible + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent anitya messages + check_command check_by_nrpe!check_datanommer_anitya + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fedimg messages + check_command check_by_nrpe!check_datanommer_fedimg + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent hotness messages + check_command check_by_nrpe!check_datanommer_hotness + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent faf messages + check_command check_by_nrpe!check_datanommer_faf + use defaulttemplate +} + +# This one is retired since it times out all the time. Too few messages. 
+#define service { +# host_name busgateway01 +# service_description Check datanommer for recent nuancier messages +# check_command check_by_nrpe!check_datanommer_nuancier +# use defaulttemplate +#} + +define service { + host_name busgateway01 + service_description Check datanommer for recent mailman messages + check_command check_by_nrpe!check_datanommer_mailman + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check datanommer for recent bugzilla messages + check_command check_by_nrpe!check_datanommer_bugzilla + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check datanommer for recent summershum messages + check_command check_by_nrpe!check_datanommer_summershum + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent jenkins messages + check_command check_by_nrpe!check_datanommer_jenkins + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent github messages + check_command check_by_nrpe!check_datanommer_github + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent kerneltest messages + check_command check_by_nrpe!check_datanommer_kerneltest + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent fmn messages + check_command check_by_nrpe!check_datanommer_fmn + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent autocloud messages + check_command check_by_nrpe!check_datanommer_autocloud + use defaulttemplate +} +define service { + host_name busgateway01 + service_description Check datanommer for recent atomic compose + check_command check_by_nrpe!check_datanommer_twoweekatomic + use defaulttemplate +} + + +# BEGIN, check consumers and producers +define service { + host_name 
busgateway01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check fedmsg consumers and producers relay + check_command check_by_nrpe!check_fedmsg_cp_busgateway_relay + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check fedmsg consumers and producers gateway + check_command check_by_nrpe!check_fedmsg_cp_busgateway_gateway + use defaulttemplate +} + +define service { + host_name proxy01,proxy02,proxy03,proxy04,proxy06,proxy08,proxy05,proxy09,proxy10,proxy11,proxy12 + service_description Check fedmsg consumers and producers gateway + check_command check_by_nrpe!check_fedmsg_cp_busgateway_gateway + use defaulttemplate +} + +define service { + host_name anitya-frontend01 + service_description Check fedmsg consumers and producers relay + check_command check_by_nrpe!check_fedmsg_cp_anitya_relay + use defaulttemplate +} + +define service { + host_name value01 + service_description Check fedmsg consumers and producers irc + check_command check_by_nrpe!check_fedmsg_cp_value + use defaulttemplate +} + +define service { + host_name pkgs02 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_pkgs + use defaulttemplate +} + +define service { + host_name summershum01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_summershum + use defaulttemplate +} + +define service { + host_name badges-backend01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_badges_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_bugzilla2fedmsg + use defaulttemplate +} + +define 
service { + host_name fedimg01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_fedimg_backend + use defaulttemplate +} + +define service { + host_name hotness01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_hotness_backend + use defaulttemplate +} + +define service { + host_name bodhi-backend03 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_bodhi_backend01_hub + use defaulttemplate +} + +define service { + host_name bodhi-backend02 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_bodhi_backend02_hub + use defaulttemplate +} + +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_autocloud_backend + use defaulttemplate +} +define service { + host_name packages03 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_packages_backend + use defaulttemplate +} +define service { + host_name bugyou01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_bugyou_backend + use defaulttemplate +} +define service { + host_name pdc-backend01 + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_pdc_backend + use defaulttemplate +} + + +# BEGIN exceptions counter +define service { + host_name busgateway01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check fedmsg-relay consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_relay + use 
defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check fedmsg-gateway consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_gateway + use defaulttemplate +} + +define service { + host_name proxy01,proxy02,proxy03,proxy04,proxy06,proxy08,proxy05,proxy09,proxy10,proxy11,proxy12 + service_description Check fedmsg-gateway consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_gateway + use defaulttemplate +} + +define service { + host_name anitya-frontend01 + service_description Check fedmsg-relay consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_anitya_relay + use defaulttemplate +} + +define service { + host_name value01 + service_description Check fedmsg-irc consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_value + use defaulttemplate +} + +define service { + host_name pkgs02 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_pkgs + use defaulttemplate +} + +define service { + host_name summershum01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_summershum + use defaulttemplate +} + +define service { + host_name badges-backend01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_badges_backend + use defaulttemplate +} + +define service { + host_name notifs-backend01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_notifs_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_bugzilla2fedmsg + use defaulttemplate +} + +define service { + host_name fedimg01 + service_description Check fedmsg-hub consumers exceptions + 
check_command check_by_nrpe!check_fedmsg_cexceptions_fedimg_backend + use defaulttemplate +} + +define service { + host_name hotness01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_hotness_backend + use defaulttemplate +} + +define service { + host_name bodhi-backend03 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_bodhi_backend01_hub + use defaulttemplate +} + +define service { + host_name bodhi-backend02 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_bodhi_backend02_hub + use defaulttemplate +} + +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_autocloud_backend + use defaulttemplate +} + +define service { + host_name packages03 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_packages_backend + use defaulttemplate +} + +define service { + host_name bugyou01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_bugyou_backend + use defaulttemplate +} + +define service { + host_name pdc-backend01 + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_pdc_backend + use defaulttemplate +} + + + +# BEGIN backlog checking +define service { + host_name busgateway01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01 + service_description Check fedmsg-relay consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_relay + use defaulttemplate +} + +define service { + host_name 
busgateway01 + service_description Check fedmsg-gateway consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_gateway + use defaulttemplate +} + +define service { + host_name proxy01,proxy02,proxy03,proxy04,proxy06,proxy08,proxy05,proxy09,proxy10,proxy11,proxy12 + service_description Check fedmsg-gateway consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_gateway + use defaulttemplate +} + +define service { + host_name anitya-frontend01 + service_description Check fedmsg-relay consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_anitya_relay + use defaulttemplate +} + +define service { + host_name value01 + service_description Check fedmsg-irc consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_value + use defaulttemplate +} + +define service { + host_name pkgs02 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_pkgs + use defaulttemplate +} + +define service { + host_name summershum01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_summershum + use defaulttemplate +} + +define service { + host_name badges-backend01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_badges_backend + use defaulttemplate +} + +define service { + host_name notifs-backend01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_notifs_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_bugzilla2fedmsg + use defaulttemplate +} + +define service { + host_name fedimg01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_fedimg_backend + use defaulttemplate +} + +define service { 
+ host_name hotness01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_hotness_backend + use defaulttemplate +} + +define service { + host_name bodhi-backend03 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_bodhi_backend01_hub + use defaulttemplate +} + +define service { + host_name bodhi-backend02 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_bodhi_backend02_hub + use defaulttemplate +} + +define service { + host_name autocloud-backend-libvirt2,autocloud-backend-vbox2 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_autocloud_backend + use defaulttemplate +} + +define service { + host_name packages03 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_packages_backend + use defaulttemplate +} + +define service { + host_name bugyou01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_bugyou_backend + use defaulttemplate +} + +define service { + host_name pdc-backend01 + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_pdc_backend + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/file_age.cfg b/roles/nagios_server/files/nagios/services/file_age.cfg new file mode 100644 index 0000000000..658ae5634f --- /dev/null +++ b/roles/nagios_server/files/nagios/services/file_age.cfg @@ -0,0 +1,16 @@ +define service { + host_name mirrorlist-osuosl, mirrorlist-ibiblio, mirrorlist-ibiblio02, mirrorlist-phx2, mirrorlist-host1plus, mirrorlist-dedicatedsolutions + service_description Check MirrorList Cache + check_command check_by_nrpe!check_mirrorlist_cache + use defaulttemplate + normal_check_interval 120 +} + +define service { + host_name 
log01 + service_description Check Merged Log + check_command check_by_nrpe!check_merged_file_age + use defaulttemplate + normal_check_interval 120 + event_handler restart_rsyslog +} diff --git a/roles/nagios_server/files/nagios/services/fmn.cfg b/roles/nagios_server/files/nagios/services/fmn.cfg new file mode 100644 index 0000000000..83c131c5e8 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/fmn.cfg @@ -0,0 +1,13 @@ +define service { + host_name notifs-backend01 + service_description Check backend queue size + check_command check_by_nrpe!check_fmn_backend_queue + use defaulttemplate +} + +define service { + host_name notifs-backend01 + service_description Check worker queue size + check_command check_by_nrpe!check_fmn_worker_queue + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/haproxy.cfg b/roles/nagios_server/files/nagios/services/haproxy.cfg new file mode 100644 index 0000000000..d4e87d0314 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/haproxy.cfg @@ -0,0 +1,6 @@ +define service { + hostgroup_name proxies + service_description Check proxies for oversubscription + check_command check_by_nrpe!check_haproxy_conns + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/haproxy_mirrorlist.cfg b/roles/nagios_server/files/nagios/services/haproxy_mirrorlist.cfg new file mode 100644 index 0000000000..30bd49ac8b --- /dev/null +++ b/roles/nagios_server/files/nagios/services/haproxy_mirrorlist.cfg @@ -0,0 +1,13 @@ +define service { + host_name proxy01 + service_description Check proxy01 for DOWN mirrorlist servers + check_command check_by_nrpe!check_haproxy_mirrorlist + use defaulttemplate +} + +define service { + host_name proxy04 + service_description Check proxy04 for DOWN mirrorlist servers + check_command check_by_nrpe!check_haproxy_mirrorlist + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/hosted.cfg 
b/roles/nagios_server/files/nagios/services/hosted.cfg new file mode 100644 index 0000000000..0816af7a3a --- /dev/null +++ b/roles/nagios_server/files/nagios/services/hosted.cfg @@ -0,0 +1,57 @@ +define service { + hostgroup hosted + service_description BZR + check_command check_bzr + use defaulttemplate +} + +define service { + hostgroup hosted + service_description GIT + check_command check_git + use defaulttemplate +} + +define service { + hostgroup hosted + service_description bzr.fedorahosted.org + check_command check_website!bzr.fedorahosted.org!/bzr/!loggerheadCont + use websitetemplate +} + +define service { + hostgroup hosted + service_description fedorahosted.org + check_command check_website!fedorahosted.org!/! + use websitetemplate +} + +define service { + hostgroup hosted + service_description git.fedorahosted.org + check_command check_website!git.fedorahosted.org!/git/fedora-infrastructure.git/!