put in the first run at new nagios configs

This commit is contained in:
Stephen Smoogen 2017-01-05 00:55:16 +00:00
parent a1957d29d4
commit 8cf72ff116
310 changed files with 13255 additions and 26 deletions

View file

@ -1264,6 +1264,12 @@ docker-candidate-registry01.phx2.fedoraproject.org
docker-registry01.stg.phx2.fedoraproject.org
docker-candidate-registry01.stg.phx2.fedoraproject.org
[webservers:children]
proxies
ipsilon
ipa
fas
#
# Hosts in this group have zombie processes for various reasons
# and we want to not alert on those, so to the client nrpe.conf uses
@ -1276,3 +1282,4 @@ pkgs02.phx2.fedoraproject.org
fed-cloud09.cloud.fedoraproject.org
# Ansible from time to time in large runs has zombie threads
batcave01.phx2.fedoraproject.org

View file

@ -35,4 +35,3 @@ define contact{
email 9178159801@vtext.com
pager 9178159801@vtext.com
}

View file

@ -10,29 +10,29 @@ define contact{
email nick@bebout.net
}
define contact{
contact_name nb-emergency
alias Nick Bebout
service_notification_period never
host_notification_period never
service_notification_options w,u,c,r
host_notification_options d,u,r
service_notification_commands notify-by-epager
host_notification_commands host-notify-by-epager
email nb5@txt.att.net
pager nb5@txt.att.net
}
#define contact{
# contact_name nb-emergency
# alias Nick Bebout
# service_notification_period never
# host_notification_period never
# service_notification_options w,u,c,r
# host_notification_options d,u,r
# service_notification_commands notify-by-epager
# host_notification_commands host-notify-by-epager
# email nb5@txt.att.net
# pager nb5@txt.att.net
#}
define contact{
contact_name nbp
alias Nick Bebout
service_notification_period never
host_notification_period never
service_notification_options w,u,c,r
host_notification_options d,u,r
service_notification_commands notify-by-epager
host_notification_commands host-notify-by-epager
email nb5@txt.att.net
pager nb5@txt.att.net
}
#define contact{
# contact_name nbp
# alias Nick Bebout
# service_notification_period never
# host_notification_period never
# service_notification_options w,u,c,r
# host_notification_options d,u,r
# service_notification_commands notify-by-epager
# host_notification_commands host-notify-by-epager
# email nb5@txt.att.net
# pager nb5@txt.att.net
#}

View file

@ -11,7 +11,19 @@
#}
#
#define contact{
# contact_name skvidalp
# contact_name skvidal_xmpp
# alias Seth Vidal
# service_notification_period 24x7
# host_notification_period 24x7
# service_notification_options w,u,c,r
# host_notification_options d,u,r
# service_notification_commands notify-by-xmpp
# host_notification_commands host-notify-by-xmpp
# email skvidal@jabber.org
#}
#
#define contact{
# contact_name skvidal-emergency
# alias Seth Vidal
# service_notification_period 24x7
# host_notification_period 24x7
@ -20,5 +32,17 @@
# service_notification_commands notify-by-epager
# host_notification_commands host-notify-by-epager
# email page-seth-vidal@sethdot.org
#}
#
#define contact{
# contact_name skvidalp
# alias Seth Vidal
# service_notification_period 16x7
# host_notification_period 16x7
# service_notification_options w,u,c,r
# host_notification_options d,u,r
# service_notification_commands notify-by-epager
# host_notification_commands host-notify-by-epager
# email page-seth-vidal@sethdot.org
# pager page-seth-vidal@sethdot.org
#}

View file

@ -0,0 +1,36 @@
===================================
Nagios 4 Configuration for Fedora
===================================
The Fedora Infrastructure Nagios is built on a set of configurations
originally written for Nagios 2 and then upgraded over time to Nagios
3 and then 4.08. With the additional changes made in the 4.2 series of
Nagios, the configuration needed a proper rewrite, because various parts
date from before puppet and then had various puppet modules added on top.
In order to get this rewrite done, we will use as much of the original
layout of the Fedora ansible nagios module but with rewrites to better
match current Nagios configurations so that it can be maintained.
Role directory layout
=====================
The original layout branched out from
roles/nagios/client/
roles/nagios/server/
with the usual trees below these. This breaks ansible best practices
and differs from how most new modules are set up, so the rewrite uses:
roles/nagios_client/
roles/nagios_server/
=====================
Nagios Client Files
=====================
For the most part the Nagios Client files seem to work from the
original layout to the new site. Changes will only need to be made to
playbooks for the initial changes.

View file

@ -0,0 +1,72 @@
#!/usr/bin/env python
""" NRPE check for datanommer/fedmsg health.
Given a category like 'bodhi', 'buildsys', or 'git', return an error if
datanommer hasn't seen a message of that type in such and such time.
You can alternatively provide a 'topic' which might look like
org.fedoraproject.prod.bodhi.update.comment.
Requires: python-dateutil
Usage:
$ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH
:Author: Ralph Bean <rbean@redhat.com>
"""
import dateutil.relativedelta
import subprocess
import sys
import json
def query_timesince(identifier):
    """ Ask ``datanommer-latest`` when a matching message was last seen.

    ``identifier`` is treated as a fedmsg topic when it contains a '.',
    otherwise as a category.  The tool's stdout is expected to contain an
    "INFO] " log prefix followed by a JSON list; the first element of that
    list is returned as a float (the caller treats it as seconds).

    Raises RuntimeError if the "INFO] " marker is missing, carrying the
    exit status and stderr of the tool so the failure is diagnosable.
    """
    # If it has a '.', then assume it is a topic.
    selector = '--topic' if '.' in identifier else '--category'

    # Build argv directly rather than splitting a formatted string, so an
    # identifier containing whitespace stays one argument.
    argv = ['datanommer-latest', selector, identifier, '--timesince']
    sys.stderr.write("Running %r\n" % ' '.join(argv))

    process = subprocess.Popen(argv, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()

    if "INFO] " not in stdout:
        raise RuntimeError(
            "unexpected output from datanommer-latest (exit status %r): %s"
            % (process.returncode, stderr.strip() or stdout.strip()))

    prefix, payload = stdout.split("INFO] ", 1)
    data = json.loads(payload)
    return float(data[0])
def main():
identifier, warning_threshold, critical_threshold = sys.argv[-3:]
timesince = query_timesince(identifier)
warning_threshold = int(warning_threshold)
critical_threshold = int(critical_threshold)
time_strings = []
rd = dateutil.relativedelta.relativedelta(seconds=timesince)
for denomination in ['years', 'months', 'days', 'hours', 'minutes', 'seconds']:
value = getattr(rd, denomination, 0)
if value:
time_strings.append("%d %s" % (value, denomination))
string = ", ".join(time_strings)
reason = "datanommer has not seen a %r message in %s" % (identifier, string)
if timesince > critical_threshold:
print "CRIT: ", reason
sys.exit(2)
if timesince > warning_threshold:
print "WARN: ", reason
sys.exit(1)
print "OK: ", reason
sys.exit(0)
if __name__ == '__main__':
try:
main()
except Exception as e:
print "UNKNOWN: ", str(e)
sys.exit(3)

View file

@ -0,0 +1,23 @@
#!/usr/bin/env python
import sys
try:
import retask.queue
queue = retask.queue.Queue('fedora-packages')
queue.connect()
items = queue.length
if items > 500:
print "CRITICAL: %i tasks in fcomm queue" % items
sys.exit(2)
elif items > 250:
print "WARNING: %i tasks in fcomm queue" % items
sys.exit(1)
else:
print "OK: %i tasks in fcomm queue" % items
sys.exit(0)
except Exception as e:
print "UNKNOWN:", str(e)
sys.exit(3)

View file

@ -0,0 +1,62 @@
#!/usr/bin/env python
import json
import os
import socket
import sys
import zmq
try:
service = sys.argv[1]
check_consumer = sys.argv[2]
backlog_warning = int(sys.argv[3])
backlog_critical = int(sys.argv[4])
fname = '/var/run/fedmsg/monitoring-%s.socket' % service
if not os.path.exists(fname):
print "UNKNOWN - %s does not exist" % fname
sys.exit(3)
if not os.access(fname, os.W_OK):
print "UNKNOWN - cannot write to %s" % fname
sys.exit(3)
connect_to = "ipc:///%s" % fname
ctx = zmq.Context()
s = ctx.socket(zmq.SUB)
s.connect(connect_to)
s.setsockopt(zmq.SUBSCRIBE, '')
poller = zmq.Poller()
poller.register(s, zmq.POLLIN)
timeout = 20000
events = dict(poller.poll(timeout))
if s in events and events[s] == zmq.POLLIN:
msg = s.recv()
msg = json.loads(msg)
else:
print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout
sys.exit(3)
for consumer in msg['consumers']:
if consumer['name'] == check_consumer:
if consumer['backlog'] is None:
print 'ERROR: fedmsg consumer %s is not initialized' % consumer['name']
sys.exit(3)
elif consumer['backlog'] > backlog_critical:
print 'CRITICAL: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog'])
sys.exit(2)
elif consumer['backlog'] > backlog_warning:
print 'WARNING: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog'])
sys.exit(1)
else:
print 'OK: fedmsg consumer %s backlog value is %i' % (consumer['name'],consumer['backlog'])
sys.exit(0)
print "UNKNOWN: fedmsg consumer %s not found" % check_consumer
sys.exit(3)
except Exception as err:
print "UNKNOWN:", str(err)
sys.exit(3)

View file

@ -0,0 +1,58 @@
#!/usr/bin/env python
import json
import os
import socket
import sys
import zmq
try:
service = sys.argv[1]
check_consumer = sys.argv[2]
exceptions_warning = int(sys.argv[3])
exceptions_critical = int(sys.argv[4])
fname = '/var/run/fedmsg/monitoring-%s.socket' % service
if not os.path.exists(fname):
print "UNKNOWN - %s does not exist" % fname
sys.exit(3)
if not os.access(fname, os.W_OK):
print "UNKNOWN - cannot write to %s" % fname
sys.exit(3)
connect_to = "ipc:///%s" % fname
ctx = zmq.Context()
s = ctx.socket(zmq.SUB)
s.connect(connect_to)
s.setsockopt(zmq.SUBSCRIBE, '')
poller = zmq.Poller()
poller.register(s, zmq.POLLIN)
timeout = 20000
events = dict(poller.poll(timeout))
if s in events and events[s] == zmq.POLLIN:
msg = s.recv()
msg = json.loads(msg)
else:
print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout
sys.exit(3)
for consumer in msg['consumers']:
if consumer['name'] == check_consumer:
if consumer['exceptions'] > exceptions_critical:
print 'CRITICAL: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions'])
sys.exit(2)
elif consumer['exceptions'] > exceptions_warning:
print 'WARNING: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions'])
sys.exit(1)
else:
print 'OK: fedmsg consumer %s exceptions value is %i' % (consumer['name'],consumer['exceptions'])
sys.exit(0)
print "UNKNOWN: fedmsg consumers %s not found" % check_consumer
sys.exit(3)
except Exception as err:
print "UNKNOWN:", str(err)
sys.exit(3)

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python
import arrow
import json
import os
import socket
import sys
import time
import zmq
try:
service = sys.argv[1]
check_producer = sys.argv[2]
elapsed_warning = int(sys.argv[3])
elapsed_critical = int(sys.argv[4])
fname = '/var/run/fedmsg/monitoring-%s.socket' % service
if not os.path.exists(fname):
print "UNKNOWN - %s does not exist" % fname
sys.exit(3)
if not os.access(fname, os.W_OK):
print "UNKNOWN - cannot write to %s" % fname
sys.exit(3)
connect_to = "ipc:///%s" % fname
ctx = zmq.Context()
s = ctx.socket(zmq.SUB)
s.connect(connect_to)
s.setsockopt(zmq.SUBSCRIBE, '')
poller = zmq.Poller()
poller.register(s, zmq.POLLIN)
timeout = 20000
events = dict(poller.poll(timeout))
if s in events and events[s] == zmq.POLLIN:
msg = s.recv()
msg = json.loads(msg)
else:
print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout
sys.exit(3)
now = time.time()
for prod in msg['producers']:
if prod['name'] != check_producer:
continue
diff = now - prod['last_ran']
then = arrow.get(prod['last_ran']).humanize()
if diff > elapsed_critical:
print "CRITICAL: %s last ran %s (%i seconds ago)" % (
check_producer, then, diff)
sys.exit(2)
elif diff > elapsed_warning:
print "WARNING: %s last ran %s (%i seconds ago)" % (
check_producer, then, diff)
sys.exit(1)
else:
print "OK: %s last ran %s (%i seconds ago)" % (
check_producer, then, diff)
sys.exit(0)
print "UNKNOWN: fedmsg producer %s not found" % check_producer
sys.exit(3)
except Exception as err:
print "UNKNOWN:", str(err)
sys.exit(3)

View file

@ -0,0 +1,64 @@
#!/usr/bin/env python
import json
import os
import socket
import sys
import zmq
try:
service = sys.argv[1]
check_list = frozenset(sys.argv[2:])
fname = '/var/run/fedmsg/monitoring-%s.socket' % service
if not check_list:
print "UNKNOWN - empty list of fedmsg consumers and producers to check"
sys.exit(3)
if not os.path.exists(fname):
print "UNKNOWN - %s does not exist" % fname
sys.exit(3)
if not os.access(fname, os.W_OK):
print "UNKNOWN - cannot write to %s" % fname
sys.exit(3)
connect_to = "ipc:///%s" % fname
ctx = zmq.Context()
s = ctx.socket(zmq.SUB)
s.connect(connect_to)
s.setsockopt(zmq.SUBSCRIBE, '')
poller = zmq.Poller()
poller.register(s, zmq.POLLIN)
timeout = 20000
events = dict(poller.poll(timeout))
if s in events and events[s] == zmq.POLLIN:
msg = s.recv()
msg = json.loads(msg)
else:
print 'UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout
sys.exit(3)
for consumer in msg['consumers']:
if consumer['name'] in check_list and not consumer['initialized']:
print 'ERROR: fedmsg consumer %s is not initialized' % consumer['name']
sys.exit(2)
for producer in msg['producers']:
if producer['name'] in check_list and not producer['initialized']:
print 'ERROR: fedmsg producer %s is not initialized' % producer['name']
sys.exit(2)
for item in check_list:
if item not in [p['name'] for p in msg['producers'] + msg['consumers']]:
print 'ERROR: %s not found among installed plugins' % item
sys.exit(2)
print "OK: fedmsg consumer(s) and producer(s) initialized"
sys.exit(0)
except Exception as err:
print "UNKNOWN:", str(err)
sys.exit(3)

View file

@ -0,0 +1,76 @@
#!/usr/bin/env python
""" Nagios check for haproxy over-subscription.
fedmsg-gateway is the primary concern as it can eat up a ton of simultaneous
connections.
:Author: Ralph Bean <rbean@redhat.com>
"""
import socket
import sys
def _numeric(value):
""" Type casting utility """
try:
return int(value)
except ValueError:
try:
return float(value)
except ValueError:
return value
def query(sockname="/var/run/haproxy-stat"):
    """ Read stats from the haproxy socket and return a dict.

    Sends ``show info`` to the unix socket at ``sockname`` and parses the
    "Key: value" lines of the reply; values are passed through ``_numeric``.

    Errors while connecting or sending propagate to the caller.  Errors
    while reading or parsing are printed and ``None`` is returned.
    """
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        # Honour the sockname argument (it used to be ignored).
        s.connect(sockname)
        s.send('show info\n')
        try:
            # Guard against a peer that never closes the connection.
            s.settimeout(10)
            # Read until EOF; a single recv() may return only part of the
            # reply.  NOTE(review): relies on haproxy closing the socket
            # after answering one command -- confirm for this deployment.
            chunks = []
            while True:
                chunk = s.recv(4096)
                if not chunk:
                    break
                chunks.append(chunk)

            data = {}
            for line in ''.join(chunks).strip().split('\n'):
                if ':' not in line:
                    continue
                # Split on the first colon only so values may contain ':'.
                key, value = line.split(':', 1)
                data[key.strip()] = _numeric(value.strip())
            return data
        except Exception as e:
            print(str(e))
    finally:
        s.close()

    return None
def nagios_check(data):
    """ Print warnings and return nagios exit codes.

    ``data`` is a mapping with numeric 'CurrConns' and 'Maxconn' entries.
    Returns 0 (OK) below 50% subscription, 1 (WARN) below 75%, 2 (CRIT) up
    to and including 100%, and 3 (UNKNOWN) above 100% or when the
    percentage cannot be computed (no data, missing key, zero Maxconn).
    """
    try:
        current = data['CurrConns']
        maxconn = data['Maxconn']
        percent = 100 * float(current) / float(maxconn)
    except (TypeError, KeyError, ValueError, ZeroDivisionError) as e:
        # Report UNKNOWN instead of dying with a traceback, which nagios
        # would see as exit status 1 (WARNING).
        print("HAPROXY SUBS UNKNOWN: cannot compute subscription (%s: %s)"
              % (type(e).__name__, e))
        return 3

    details = "%.2f%% subscribed. %i current of %i maxconn." % (
        percent, current, maxconn,
    )

    if percent < 50:
        print("HAPROXY SUBS OK: " + details)
        return 0

    if percent < 75:
        print("HAPROXY SUBS WARN: " + details)
        return 1

    if percent <= 100:
        print("HAPROXY SUBS CRIT: " + details)
        return 2

    print("HAPROXY SUBS UNKNOWN: " + details)
    return 3
if __name__ == '__main__':
    # Keep the whole check inside the try so that any failure, including
    # query() returning None, is reported as UNKNOWN (3) rather than as a
    # traceback with exit status 1.
    try:
        data = query(sockname="/var/run/haproxy-stat")
        if data is None:
            raise RuntimeError("no usable response from the haproxy socket")
        status = nagios_check(data)
    except Exception as e:
        print("HAPROXY SUBS UNKNOWN: " + str(e))
        sys.exit(3)

    sys.exit(status)

View file

@ -0,0 +1,59 @@
#!/usr/bin/env python
import socket
import sys
# Ask haproxy for "show stat" and report how many servers of the
# "mirror-lists" proxy are DOWN: none is OK, fewer than half is WARN,
# half or more is CRIT.
try:
    unixsocket = "/var/run/haproxy-stat"
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        s.connect(unixsocket)
        s.send('show stat\n')
        # Guard against a peer that never closes the connection.
        s.settimeout(10)
        # Read until EOF; a single recv() may return only part of the CSV.
        # NOTE(review): relies on haproxy closing the socket after
        # answering one command -- confirm for this deployment.
        chunks = []
        while True:
            chunk = s.recv(16384)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        s.close()

    output = ''.join(chunks).strip().split('\n')
    # First line is the CSV header, prefixed with "# ".
    fields = output.pop(0).split(',')
    fields[0] = fields[0].replace('# ', '')
    proxies = [dict(zip(fields, line.split(','))) for line in output]
except Exception as e:
    # Parse errors now end up here too instead of being printed and
    # leaving `proxies` undefined.
    print("MIRRORLIST STATE UNKNOWN: " + str(e))
    sys.exit(3)

total = 0
downcount = 0
downlist = ""
for proxy in proxies:
    # Skip the aggregate rows; only individual servers are counted.
    if proxy.get('svname') in ("FRONTEND", "BACKEND"):
        continue
    if proxy.get('pxname') == "mirror-lists":
        total += 1
        if proxy.get('status') == "DOWN":
            downlist += proxy["svname"] + " "
            downcount += 1

if total == 0:
    # Avoid a ZeroDivisionError when no mirror-lists server is listed.
    print("MIRRORLIST STATE UNKNOWN: no mirror-lists servers found")
    sys.exit(3)

unavailability = 100 * float(downcount) / float(total)

if unavailability == 0:
    print("MIRRORLIST STATE OK: " + downlist)
    sys.exit(0)
elif unavailability < 50:
    print("MIRRORLIST STATE WARN: " + downlist)
    sys.exit(1)
else:
    print("MIRRORLIST STATE CRIT: " + downlist)
    sys.exit(2)

View file

@ -0,0 +1,74 @@
#!/usr/bin/python
# Source: https://github.com/opinkerfi/nagios-plugins/blob/master/check_ipa/check_ipa_replication
# Copyright 2013, Tomas Edwardsson
# Copyright 2016, Patrick Uiterwijk
#
# This script is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import ldap
from pynag.Plugins import PluginHelper, critical, warning, ok
plugin = PluginHelper()

plugin.parser.add_option('-u', help="ldap uri", dest="uri")
plugin.parser.add_option('-D', help="bind DN", dest="binddn")
plugin.parser.add_option('-w', help="bind password", dest="bindpw")

plugin.parse_arguments()

if not plugin.options.uri:
    plugin.parser.error('-u (uri) argument is required')

# Fetch every replication agreement below cn=config, binding first when a
# bind DN was supplied.
try:
    l = ldap.initialize(plugin.options.uri)
    if plugin.options.binddn:
        l.bind_s(plugin.options.binddn, plugin.options.bindpw)
    replication = l.search_s('cn=config',
                             ldap.SCOPE_SUBTREE,
                             '(objectclass=nsds5replicationagreement)',
                             ['nsDS5ReplicaHost', 'nsds5replicaLastUpdateStatus'])
except Exception as e:
    plugin.status(critical)
    plugin.add_summary("Unable to initialize ldap connection: %s" % (e,))
    plugin.exit()

# Loop through replication agreements
for rhost in replication:
    attrs = rhost[1]
    status = attrs['nsds5replicaLastUpdateStatus'][0]
    plugin.add_summary("Replica %s Status: %s" % (attrs['nsDS5ReplicaHost'][0], status))

    if status.startswith('Error ('):
        # IPA >=4.4.0: the code sits between the parentheses
        code = status[status.find('(') + 1:status.find(')')]
    else:
        # IPA <4.4.0: the code is the first word.  split() also copes with
        # a status that has no space at all, where slicing up to find(' ')
        # (-1) would drop the last character.
        code = status.split(' ', 1)[0]

    # Code 1 (busy replica) is not an error, its "unknown" (but its "ok" for now)
    if code in ('0', '1'):
        plugin.status(ok)
    else:
        plugin.status(critical)

if not len(replication):
    plugin.add_summary("Warning: No replicas found")
    plugin.status(warning)

plugin.exit()

View file

@ -0,0 +1,17 @@
#!/usr/bin/python
import fcntl
import sys
try:
f = open('/mnt/koji/.nagios_test', 'r')
f.close()
f = open('/mnt/koji/.nagios_test', 'w')
except IOError:
print "Could not create file"
sys.exit(2)
fcntl.flock(f, fcntl.LOCK_EX)
f.close()
print "File Locked Successfully"
sys.exit(0)

View file

@ -0,0 +1,123 @@
#! /usr/bin/perl -w
# check_lock_file_age.pl Copyright (C) 2010 Ricky Elrod <codeblock@fedoraproject.org>
#
# Fork of check_file_age.pl
#
# Checks a lock file's size and modification time to make sure it's not empty
# and that it's sufficiently recent.
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# you should have received a copy of the GNU General Public License
# along with this program (or with Nagios); if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA
use strict;
use English;
use Getopt::Long;
use File::stat;
use vars qw($PROGNAME);
use lib "/usr/lib64/nagios/plugins";
use utils qw (%ERRORS &print_revision &support);
sub print_help ();
sub print_usage ();
# Main program: parse options, then report how old the lock file is.
# A missing lock file is OK; otherwise the age of its mtime is compared
# against the warning/critical thresholds, which are given in minutes.
my ($opt_c, $opt_f, $opt_w, $opt_h, $opt_V);
my ($result, $age, $st);

$PROGNAME="check_lock_file_age";

# Default thresholds in minutes; converted to seconds further down.
$opt_w = 1;
$opt_c = 5;
$opt_f = "";

Getopt::Long::Configure('bundling');
GetOptions(
	"V"   => \$opt_V, "version"	=> \$opt_V,
	"h"   => \$opt_h, "help"	=> \$opt_h,
	# The long form needs "=s" as well; without it "--file PATH" would
	# set $opt_f to 1 instead of PATH.
	"f=s" => \$opt_f, "file=s"	=> \$opt_f,
	"w=f" => \$opt_w, "warning-age=f" => \$opt_w,
	"c=f" => \$opt_c, "critical-age=f" => \$opt_c);

if ($opt_V) {
	print_revision($PROGNAME, '1.4.14');
	exit $ERRORS{'OK'};
}

if ($opt_h) {
	print_help();
	exit $ERRORS{'OK'};
}

if (($opt_c and $opt_w) and ($opt_c < $opt_w)) {
	print "Warning time must be less than Critical time.\n";
	exit $ERRORS{'UNKNOWN'};
}

# Accept the file as a bare argument when -f/--file was not used.
$opt_f = shift unless ($opt_f);

if (! $opt_f) {
	print "LOCK_FILE_AGE UNKNOWN: No file specified\n";
	exit $ERRORS{'UNKNOWN'};
}

# Check that file exists (can be directory or link)
unless (-e $opt_f) {
	print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n";
	exit $ERRORS{'OK'};
}

$st = File::stat::stat($opt_f);
# stat() returns undef on failure; report that instead of dying on ->mtime.
unless ($st) {
	print "LOCK_FILE_AGE UNKNOWN: Cannot stat $opt_f: $!\n";
	exit $ERRORS{'UNKNOWN'};
}
$age = time - $st->mtime;

$result = 'OK';

# Convert minutes to seconds
if($opt_c) { $opt_c *= 60; }
if($opt_w) { $opt_w *= 60; }

if ($opt_c and $age > $opt_c) {
	$result = 'CRITICAL';
}
elsif ($opt_w and $age > $opt_w) {
	$result = 'WARNING';
}

# If the age is higher than 2 minutes, convert seconds -> minutes
# If it's higher than a day, use days.
# Just a nicety, to make people not have to do math ;)
if($age > 86400) { $age = int(($age/86400))." days"; }
elsif($age > 120) { $age = int(($age/60))." minutes"; }
else { $age = "$age seconds"; }

print "LOCK_FILE_AGE $result: $opt_f is $age old.\n";
exit $ERRORS{$result};
# Print the command line synopsis.  The thresholds are documented as
# minutes because the main program multiplies -w/-c by 60 before comparing
# them with the file age in seconds.
sub print_usage () {
	print "Usage:\n";
	print " $PROGNAME [-w <mins>] [-c <mins>] -f <file>\n";
	print " $PROGNAME [-h | --help]\n";
	print " $PROGNAME [-V | --version]\n";
}
# Print the full help text: plugin revision, copyright, usage synopsis,
# a note on the threshold unit, and the support text imported from utils.
sub print_help () {
	print_revision($PROGNAME, '1.4.14');
	print "Copyright (c) 2010 Ricky Elrod\n\n";
	print_usage();
	print "\n"
	    . " <mins> File must be no more than this many minutes old (default: warn 1m, crit 5m)\n"
	    . "\n";
	support();
}

View file

@ -0,0 +1,24 @@
#!/bin/bash
#
# 2014-11-19
# Author: Ralph Bean <rbean@redhat.com>
# exit codes
# Nagios exit codes.
ok=0
warn=1
crit=2
unkn=3

# Right now we just check to see if we can even run this command without
# hanging and timing out.  In the future, we could parse stdout for more
# fine-grained information.
echo stats | nc 127.0.0.1 11211 > /dev/null
status=$?

# A zero exit status from the pipeline is the only success case.
if [ $status -eq 0 ]; then
    echo "OK: stats command got status code $status"
    exit $ok
fi

echo "CRIT: stats command got status code $status"
exit $crit

View file

@ -0,0 +1,14 @@
#!/usr/bin/python
import requests
import sys
# Query the local OSBS API endpoint and check that the JSON reply has a
# 'paths' key.  Connection errors, timeouts and non-JSON replies are
# reported as CRITICAL instead of a traceback, which nagios would see as
# exit status 1.
try:
    # Certificate verification is disabled for the localhost endpoint; the
    # timeout keeps the check from hanging indefinitely.
    r = requests.get("https://localhost:8443/", verify=False, timeout=10)
    responding = 'paths' in r.json()
except Exception as e:
    print("CRITICAL: OSBS API not responding properly: %s" % e)
    sys.exit(2)

if responding:
    print("OK: OSBS API endpoint is responding with path data")
    sys.exit(0)
else:
    print("CRITICAL: OSBS API not responding properly")
    sys.exit(2)

View file

@ -0,0 +1,23 @@
#!/usr/bin/python
import subprocess
import sys
# Run "osbs list-builds" and treat a table header starting with BUILD as
# proof that OSBS is responsive.
try:
    sp = subprocess.Popen(
        ["osbs", "list-builds"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE
    )
    sp_out, sp_err = sp.communicate()
except OSError as e:
    # e.g. the osbs binary is not installed
    print("CRITICAL: could not run 'osbs list-builds': %s" % e)
    sys.exit(2)

# NOTE(review): the header is looked for on stderr, as before -- presumably
# the osbs client writes its table there; confirm.
lines = sp_err.split('\n')

# Drop the leading terminal warning when present.
if 'not attached to terminal' in lines[0]:
    lines = lines[1:]

# An empty first line used to raise IndexError; treat it as unresponsive.
header = lines[0].split() if lines else []

if header and header[0] == 'BUILD':
    print("OK: OSBS is responsive to 'osbs list-builds'")
    sys.exit(0)
else:
    print("CRITICAL: OSBS UNRESPONSIVE")
    sys.exit(2)

View file

@ -0,0 +1,49 @@
#!/bin/bash
#
# 19-07-2010
# Author: Cherwin Nooitmeer <cherwin@gmail.com>
#
# exit codes
e_ok=0
e_warning=1
e_critical=2
e_unknown=3

# regular expression that matches queue IDs (e.g. D71EF7AC80F8)
queue_id='^[A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9]'

usage="Invalid command line usage"

if [ -z "$1" ]; then
    echo "$usage"
    exit $e_unknown
fi

while getopts ":w:c:" options
do
    case $options in
        w ) warning=$OPTARG ;;
        c ) critical=$OPTARG ;;
        * ) echo "$usage"
            exit $e_unknown ;;
    esac
done

# Both thresholds are required and must be plain non-negative integers;
# otherwise the numeric tests below would fail and leave retval unset.
for threshold in "$warning" "$critical"; do
    case "$threshold" in
        ''|*[!0-9]*)
            echo "$usage"
            exit $e_unknown ;;
    esac
done

# determine queue size: count the mailq lines that start with a queue ID
qsize=$(mailq | egrep -c "$queue_id")
if [ -z "$qsize" ]
then
    exit $e_unknown
fi

if [ "$qsize" -ge "$critical" ]; then
    retval=$e_critical
elif [ "$qsize" -ge "$warning" ]; then
    retval=$e_warning
else
    retval=$e_ok
fi

# Status text followed by Nagios performance data.
echo "$qsize mail(s) in queue | mail_queue=$qsize"
exit $retval

View file

@ -0,0 +1,26 @@
#!/bin/python
import sys
import requests
# Usage: QUEUE CRITICAL_THRESHOLD WARNING_THRESHOLD
#
# Fetches QUEUE's statistics from the local RabbitMQ management API and
# reports CRITICAL when it has no consumers or more messages than the
# critical threshold, WARNING above the warning threshold, otherwise OK.
try:
    queue_name = sys.argv[1]
    # Convert the thresholds to int: comparing the integer message count
    # with the raw argv strings is always False on Python 2, so the
    # thresholds never triggered.
    critical_threshold = int(sys.argv[2])
    warning_threshold = int(sys.argv[3])

    url = 'http://localhost:15672/api/queues/%%2f/%s' % queue_name
    r = requests.get(url, auth=('guest', 'guest')).json()

    consumers = r['consumers']
    messages = r['messages']
except Exception as e:
    # Bad arguments, connection errors and unexpected replies become
    # UNKNOWN instead of a traceback with exit status 1.
    print('UNKNOWN: %s' % e)
    sys.exit(3)

msg = 'Messages in queue: %i (%i consumers)' % (messages, consumers)

if consumers < 1:
    print('CRITICAL: No consumers: %s' % msg)
    sys.exit(2)

if messages > critical_threshold:
    print('CRITICAL: %s' % msg)
    sys.exit(2)

if messages > warning_threshold:
    print('WARNING: %s' % msg)
    sys.exit(1)

print('OK: %s' % msg)
sys.exit(0)

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python
#
# very simple python script to parse out /proc/mdstat
# and give results for nagios to monitor
#
import sys
import string
# Parse /proc/mdstat and report every md device whose member map (e.g.
# "[U_]") contains an underscore.
import re

devices = []

try:
    with open('/proc/mdstat') as mdstat_file:
        mdstat = mdstat_file.read().split('\n')
except IOError:
    # seems we have no software raid on this machines
    sys.exit(0)

# Member map such as "[UU]" or "[U_]".  It is searched for anywhere on the
# line after the "mdX :" line, because its position shifts when that line
# carries extra words (e.g. "super 1.2"); a fixed index missed those.
member_map = re.compile(r'\[[U_]+\]')

problems = []
for i, line in enumerate(mdstat):
    if line[0:2] != 'md':
        continue

    device = line.split()[0]
    devices.append(device)

    detail = mdstat[i + 1] if i + 1 < len(mdstat) else ''
    found = member_map.search(detail)
    if not found:
        # No member map on the following line; nothing to judge.
        continue

    status = found.group(0)
    if '_' in status:
        # see if we can figure out what's going on
        err = mdstat[i + 2].split() if i + 2 < len(mdstat) else []
        msg = "device=%s status=%s" % (device, status)
        if err:
            msg = msg + " rebuild=%s" % err[0]
        problems.append(msg)

if not problems:
    print("DEVICES %s OK" % " ".join(devices))
    sys.exit(0)
else:
    print(", ".join(problems))
    sys.exit(2)

View file

@ -0,0 +1,84 @@
#!/bin/bash
# check_readonlyfs: Check for readonly filesystems
# Copyright (C) 2010 Davide Madrisan <davide.madrisan@gmail.com>
PROGNAME=`/bin/basename $0`
PROGPATH=`echo $0 | sed -e 's,[\\/][^\\/][^\\/]*$,,'`
REVISION=`echo '$Revision: 1 $' | sed -e 's/[^0-9.]//g'`
. $PROGPATH/utils.sh
# Print the three supported invocations, one per line.
print_usage() {
    printf '%s\n' \
        "Usage: $PROGNAME --no-network-fs" \
        "Usage: $PROGNAME --help" \
        "Usage: $PROGNAME --version"
}
# Print revision, usage, a one-line description and the support text.
print_help() {
    print_revision $PROGNAME $REVISION
    printf '\n'
    print_usage
    printf '\n%s\n\n' "readonly filesystem checker plugin for Nagios"
    support
}
# Main program: scan /proc/mounts and report every filesystem mounted with
# the "ro" option.
# NETFS=1 means network filesystems are checked too; --no-network-fs sets 0.
NETFS=1
# Grab the command line arguments
# Exit status used when at least one read-only filesystem is found.
# NOTE(review): the STATE_* values presumably come from the utils.sh that
# is sourced earlier in this script -- confirm.
exitstatus=$STATE_WARNING #default
while test -n "$1"; do
case "$1" in
--help|-h)
print_help
exit $STATE_OK
;;
--version|-V)
print_revision $PROGNAME $REVISION
exit $STATE_OK
;;
--no-network-fs|-n)
NETFS="0"
;;
*)
echo "Unknown argument: $1"
print_usage
exit $STATE_UNKNOWN
;;
esac
shift
done
[ -r /proc/mounts ] || { echo "cannot read /proc/mounts!"; exit $STATE_UNKNOWN; }
# nerr counts the "ro" options seen; rofs_list collects their mount points.
nerr=0
IFS_SAVE="$IFS"
rofs_list=""
# Each /proc/mounts line is split into device, mount point, filesystem type
# and mount options; the remaining fields land in "ignore".
while read dev mp fs mopt ignore; do
[ "$dev" = none ] && continue
# Filesystem types that are never checked.
case $fs in binfmt_misc|devpts|iso9660|proc|selinuxfs|rpc_pipefs|sysfs|tmpfs|usbfs)
continue ;;
esac
case $fs in autofs|nfs|nfs4|smbfs)
# skip the network filesystems
[ "$NETFS" = 0 ] && continue ;;
esac
# Split the comma separated options into the positional parameters
# (this overwrites $1..$n), then restore IFS.
IFS=","; set -- $mopt; IFS="$IFS_SAVE"
# Walk the options until the first empty parameter, i.e. the end of the list.
while :; do
case "$1" in
ro) rofs_list="$rofs_list $mp"; nerr=$(( $nerr + 1 )) ;;
"") shift; break ;;
esac
shift
done
# Process substitution keeps the loop in the current shell, so nerr and
# rofs_list are still set after it ends.
done < <(LC_ALL=C /bin/cat /proc/mounts 2>/dev/null)
# No read-only filesystem: print OK and exit; otherwise list the mount
# points and fall through to exit with $exitstatus.
[ $nerr -eq 0 ] && { echo OK; exit $STATE_OK; } || echo "$rofs_list: read only fs"
exit $exitstatus

View file

@ -0,0 +1,108 @@
#!/usr/bin/env python
""" check_supybot_plugin -- ensure that a plugin is loaded by supybot.
Run like:
check_supybot_plugin --target fedmsg
check_supybot_plugin --target koji --debug
"""
import argparse
import sys
import socket
import string
import uuid
def process_args():
    """ Parse the command line and return the argparse namespace.

    The namespace has ``target``, ``nick``, ``debug``, ``host`` and ``port``
    attributes.  ``target`` defaults to None here; the caller reports a
    missing target itself.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--target', default=None, dest='target',
        help="Required.  The plugin we're looking for."
    )
    parser.add_argument(
        '-n', '--nick', default=None, dest='nick',
        help="NICK to use when connecting to freenode.",
    )
    parser.add_argument(
        '-d', '--debug', default=False, action='store_true',
        help='Print out debug information.', dest='debug',
    )
    parser.add_argument(
        '-H', '--host', default='irc.freenode.net',
        help='Host to connect to.', dest='host',
    )
    parser.add_argument(
        '-p', '--port', default=6667, type=int,
        # This help text used to repeat the --host description.
        help='Port to connect to.', dest='port',
    )
    return parser.parse_args()
args = process_args()
# Use a random nick so people can't mess with us
if not args.nick:
args.nick = 'nrpe-' + str(uuid.uuid4()).split('-')[0]
name = "NRPE Bot"
readbuffer = ""
if not args.target:
print "UNKNOWN: No 'target' specified."
sys.exit(3)
args.target = args.target.lower()
if args.debug:
print "connecting to %s/%i" % (args.host, args.port)
try:
s = socket.socket()
s.connect((args.host, args.port))
if args.debug:
print "as %s/%s (%s)" % (args.nick, args.nick, name)
s.send("nick %s\r\n" % args.nick)
s.send("USER %s %s bla :%s\r\n" % (args.nick, args.host, name))
while 1:
readbuffer = readbuffer+s.recv(1024)
temp = string.split(readbuffer, "\n")
readbuffer = temp.pop()
for line in temp:
line = string.rstrip(line)
if args.debug:
print " * ", line
line = string.split(line)
if line[1] == 'MODE':
msg = "privmsg zodbot :list\r\n"
if args.debug:
print "sending:"
print " ->", msg
s.send(msg)
if line[1] == 'PRIVMSG':
if args.debug:
print "Got our response.."
plugins = map(str.lower, ' '.join(line[3:][1:]).split(', '))
if args.target in plugins:
print "OK"
s.send("QUIT")
sys.exit(0)
else:
print "CRITICAL: %r not loaded by supybot" % args.target
s.send("QUIT")
sys.exit(2)
except Exception as e:
print "UNKNOWN: ", str(e)
if args.debug:
raise
sys.exit(3)

View file

@ -0,0 +1,19 @@
#!/bin/bash
# Count the lines of "testcloud instance list" that mention "running"
# (case-insensitive) and compare the count with the thresholds below.
RUNNING_VMS=$(testcloud instance list | grep -i 'running' | wc -l)

# More than CRITICAL running VMs is critical, more than WARNING a warning.
CRITICAL=20
WARNING=15

if [ $RUNNING_VMS -gt $CRITICAL ]; then
    echo "Testcloud: CRITICAL Number of VMs running: $RUNNING_VMS"
    exit 2
fi

if [ $RUNNING_VMS -gt $WARNING ]; then
    echo "Testcloud: WARNING Number of VMs running: $RUNNING_VMS"
    exit 1
fi

echo "Testcloud: OK Number of VMs running: $RUNNING_VMS"
exit 0

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,11 @@
# Local SELinux policy module "fi-nrpe", version 1.0.
module fi-nrpe 1.0;
# Types and permissions this module refers to but does not declare itself.
require {
type nagios_system_plugin_t;
type nrpe_exec_t;
class file getattr;
}
#============= nagios_system_plugin_t ==============
# Allow the nagios_system_plugin_t domain to getattr files labelled nrpe_exec_t.
allow nagios_system_plugin_t nrpe_exec_t:file getattr;

View file

@ -0,0 +1,3 @@
---
- name: restart nrpe
service: name=nrpe state=restarted

View file

@ -0,0 +1,228 @@
# nagios-client/nrpe
---
# install pkgs:
- name: install nagios client pkgs
yum: name={{ item }} state=present
with_items:
- nrpe
- nagios-plugins
- nagios-plugins-disk
- nagios-plugins-file_age
- nagios-plugins-users
- nagios-plugins-procs
- nagios-plugins-swap
- nagios-plugins-load
- nagios-plugins-ping
tags:
- packages
- nagios_client
when: ansible_distribution_major_version|int < 22
# install pkgs:
- name: install nagios client pkgs
dnf: name={{ item }} state=present
with_items:
- nrpe
- nagios-plugins
- nagios-plugins-disk
- nagios-plugins-file_age
- nagios-plugins-users
- nagios-plugins-procs
- nagios-plugins-swap
- nagios-plugins-load
- nagios-plugins-ping
tags:
- packages
- nagios_client
when: ansible_distribution_major_version|int > 21
- name: install local nrpe check scripts that are not packaged
copy: src="scripts/{{ item }}" dest="{{ libdir }}/nagios/plugins/{{ item }}" mode=0755 owner=nagios group=nagios
with_items:
- check_haproxy_conns.py
- check_haproxy_mirrorlist.py
- check_postfix_queue
- check_raid.py
- check_lock
- check_fcomm_queue
- check_fedmsg_consumer_backlog.py
- check_fedmsg_consumer_exceptions.py
- check_fedmsg_producer_last_ran.py
- check_fedmsg_producers_consumers.py
- check_supybot_plugin
- check_rabbitmq_size
- check_datanommer_timesince.py
- check_memcache_connect
- check_readonly_fs
- check_lock_file_age
- check_testcloud
- check_osbs_builds.py
- check_osbs_api.py
- check_ipa_replication
when: not inventory_hostname.startswith('noc')
tags:
- nagios_client
# create dirs
# puppet used to make /var/spool/nagios (owned by nagios.nagios) mode 750
# and /usr/lib/nagios/plugins (owned by root) mode 755 - but we don't know WHY
# then stuff it with plugins from the plugins dir in the nagios module
# then we symlinked that to /usr/lib64/nagios/plugins
# it was a nightmare - don't do that - my ghost will haunt you if you do
# skvidal 2013-05-21
# Three tasks for handling our custom selinux module
- name: ensure a directory exists for our custom selinux module
file: dest=/usr/share/nrpe state=directory
- name: copy over our custom selinux module
copy: src=selinux/fi-nrpe.pp dest=/usr/share/nrpe/fi-nrpe.pp
register: selinux_module
- name: install our custom selinux module
command: semodule -i /usr/share/nrpe/fi-nrpe.pp
when: ansible_distribution_major_version|int == 7 and selinux_module|changed
# Set up our base config.
# Render the main NRPE daemon configuration from nrpe.cfg.j2 and restart nrpe
# when it changes. Skipped on hosts whose inventory name starts with 'noc'.
- name: /etc/nagios/nrpe.cfg
  template:
    src: nrpe.cfg.j2
    dest: /etc/nagios/nrpe.cfg
  notify:
    - restart nrpe
  tags:
    - config
    - nagios_client
  when: not inventory_hostname.startswith('noc')
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Render every listed <item>.j2 template into /etc/nrpe.d/<item> and restart
# nrpe when any of them changes. This task has no host condition.
- name: install nrpe client configs
  template:
    src: "{{ item }}.j2"
    dest: "/etc/nrpe.d/{{ item }}"
  with_items:
    - check_mirrorlist_cache.cfg
    - check_raid.cfg
    - check_ipa.cfg
    - check_readonly_fs.cfg
    - check_cron.cfg
    - check_disk.cfg
    - check_swap.cfg
    - check_postfix_queue.cfg
    - check_lock.cfg
    - check_fedmsg_hub_proc.cfg
    - check_fedmsg_irc_proc.cfg
    - check_fedmsg_relay_proc.cfg
    - check_fedmsg_gateway_proc.cfg
    - check_fedmsg_masher_proc.cfg
    - check_redis_proc.cfg
    - check_autocloud_proc.cfg
    - check_fedmsg_consumers.cfg
    - check_supybot_fedmsg_plugin.cfg
    - check_datanommer_history.cfg
    - check_memcache.cfg
    - check_lock_file_age.cfg
    - check_basset.cfg
    - check_fmn.cfg
    - check_osbs.cfg
    - check_koschei_polling_proc.cfg
    - check_koschei_resolver_proc.cfg
    - check_koschei_scheduler_proc.cfg
    - check_koschei_watcher_proc.cfg
    - check_testcloud.cfg
  notify:
    - restart nrpe
  tags:
    - config
    - nagios_client
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Only hosts whose inventory name starts with 'bugyou01' get this check.
- name: install nrpe bugyou fedmsg hubs check config
  template:
    src: check_fedmsg_hub_procs_bugyou.cfg.j2
    dest: /etc/nrpe.d/check_fedmsg_hub_procs_bugyou.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: inventory_hostname.startswith('bugyou01')
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Installed on every host whose `datacenter` variable is not 'phx2'.
- name: install nrpe openvpn check config
  template:
    src: check_openvpn_link.cfg.j2
    dest: /etc/nrpe.d/check_openvpn_link.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: datacenter != 'phx2'
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Only hosts whose inventory name starts with 'unbound' get this check.
- name: install nrpe unbound check config
  template:
    src: check_unbound_proc.cfg.j2
    dest: /etc/nrpe.d/check_unbound_proc.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: inventory_hostname.startswith('unbound')
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Applies to every host whose inventory name starts with 'log0'.
- name: install nrpe merged log check script on log01
  template:
    src: check_merged_file_age.cfg.j2
    dest: /etc/nrpe.d/check_merged_file_age.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: inventory_hostname.startswith('log0')
#
# The actual items files here end in .j2 (they are templates)
# So when adding or modifying them change the .j2 version in git.
#
# Only hosts whose inventory name starts with 'db03' get this check.
- name: install nrpe check_mysql config for mariadb servers
  template:
    src: check_mysql.cfg.j2
    dest: /etc/nrpe.d/check_mysql.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: inventory_hostname.startswith('db03')
# Render the proxy-specific checks on hosts whose inventory name starts with
# 'proxy'.
# NOTE(review): the 'happroxy' spelling below must match the template file
# names in this role -- confirm before renaming either side.
- name: install nrpe checks for proxies
  template:
    src: "{{ item }}.j2"
    dest: "/etc/nrpe.d/{{ item }}"
  with_items:
    - check_happroxy_conns.cfg
    - check_happroxy_mirrorlist.cfg
    - check_varnish_proc.cfg
  notify:
    - restart nrpe
  tags:
    - nagios_client
  when: inventory_hostname.startswith('proxy')
# Make sure the NRPE daemon is running now and enabled at boot.
# `started` is the documented state value; `running` was only a deprecated
# alias for it.
- name: nrpe service start
  service: name=nrpe state=started enabled=true
  tags:
    - service
    - nagios_client
# Count the getent entries for the fedmsg group; stdout is "1" when the group
# exists and "0" otherwise. This is a read-only probe, so it also runs in
# check mode and never reports a change.
- name: Check if the fedmsg group exists
  shell: /usr/bin/getent group fedmsg | /usr/bin/wc -l | tr -d ' '
  register: fedmsg_exists
  check_mode: no
  changed_when: false
  tags:
    - nagios_client
# Append fedmsg to the nrpe user's supplementary groups, but only when the
# registered fedmsg_exists probe printed "1".
- name: Add nrpe user to the fedmsg group if it exists
  user:
    name: nrpe
    groups: fedmsg
    append: yes
  tags:
    - nagios_client
  when: fedmsg_exists.stdout == "1"

View file

@ -0,0 +1 @@
# Use the templated libdir rather than a hard-coded /usr/lib64 plugin path.
command[check_autocloud_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'python' -a 'autocloud_job.py' -u root

View file

@ -0,0 +1,4 @@
command[check_mongo_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u mongodb -C mongod -c 1:1
command[check_rabbitmq_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u rabbitmq -C beam.smp -c 1:1
command[check_worker_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u basset-worker -C basset-worker -c 1:6
command[check_basset_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size check_submission 10 20

View file

@ -0,0 +1 @@
command[check_cron]={{ libdir }}/nagios/plugins/check_procs -c 1:15 -C 'crond' -u root

View file

@ -0,0 +1,50 @@
# Checks on the datanommer history to make sure we're still receiving messages
# of all types.
#
# The following are fedmsg/datanommer checks to be run on busgateway01.
# They check for the time since the latest message in any particular category.
# The first number is the seconds elapsed until we should raise a warning.
# The second number is the seconds elapsed until we should raise an error.
# For your reference:
# 4 hours -> 14400
# 1 day -> 86400
# 3 days -> 259200
# 1 week -> 604800
# 3 weeks -> 1814400
# 1 month -> 2628000
# 3 months -> 7884000
command[check_datanommer_buildsys]={{libdir}}/nagios/plugins/check_datanommer_timesince.py buildsys 14400 86400
command[check_datanommer_git]={{libdir}}/nagios/plugins/check_datanommer_timesince.py git 86400 604800
command[check_datanommer_bodhi]={{libdir}}/nagios/plugins/check_datanommer_timesince.py bodhi 86400 604800
command[check_datanommer_wiki]={{libdir}}/nagios/plugins/check_datanommer_timesince.py wiki 259200 1814400
command[check_datanommer_compose]={{libdir}}/nagios/plugins/check_datanommer_timesince.py compose 259200 1814400
command[check_datanommer_meetbot]={{libdir}}/nagios/plugins/check_datanommer_timesince.py meetbot 604800 2628000
command[check_datanommer_fas]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fas 1814400 2628000
command[check_datanommer_pkgdb]={{libdir}}/nagios/plugins/check_datanommer_timesince.py pkgdb 1814400 2628000
command[check_datanommer_fedoratagger]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedoratagger 2628000 7884000
command[check_datanommer_planet]={{libdir}}/nagios/plugins/check_datanommer_timesince.py planet 2628000 7884000
command[check_datanommer_copr]={{libdir}}/nagios/plugins/check_datanommer_timesince.py copr 21600 86400
command[check_datanommer_trac]={{libdir}}/nagios/plugins/check_datanommer_timesince.py trac 86400 259200
command[check_datanommer_askbot]={{libdir}}/nagios/plugins/check_datanommer_timesince.py askbot 86400 259200
command[check_datanommer_fedbadges]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedbadges 86400 259200
command[check_datanommer_fedocal]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedocal 7884000 23652000
command[check_datanommer_ansible]={{libdir}}/nagios/plugins/check_datanommer_timesince.py ansible 432000 604800
command[check_datanommer_summershum]={{libdir}}/nagios/plugins/check_datanommer_timesince.py summershum 604800 1814400
command[check_datanommer_jenkins]={{libdir}}/nagios/plugins/check_datanommer_timesince.py jenkins 432000 604800
command[check_datanommer_github]={{libdir}}/nagios/plugins/check_datanommer_timesince.py github 432000 604800
command[check_datanommer_kerneltest]={{libdir}}/nagios/plugins/check_datanommer_timesince.py kerneltest 604800 1814400
command[check_datanommer_fmn]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fmn 604800 1814400
command[check_datanommer_anitya]={{libdir}}/nagios/plugins/check_datanommer_timesince.py anitya 604800 1814400
command[check_datanommer_fedimg]={{libdir}}/nagios/plugins/check_datanommer_timesince.py fedimg 259200 604800
command[check_datanommer_hotness]={{libdir}}/nagios/plugins/check_datanommer_timesince.py hotness 604800 1814400
command[check_datanommer_faf]={{libdir}}/nagios/plugins/check_datanommer_timesince.py faf 86400 259200
command[check_datanommer_koschei]={{libdir}}/nagios/plugins/check_datanommer_timesince.py koschei 86400 604800
command[check_datanommer_autocloud]={{libdir}}/nagios/plugins/check_datanommer_timesince.py autocloud 259200 1814400
command[check_datanommer_twoweekatomic]={{libdir}}/nagios/plugins/check_datanommer_timesince.py org.fedoraproject.prod.releng.atomic.twoweek.complete 1296000 1382400
# This one is retired since it times out all the time. Too few messages.
#command[check_datanommer_nuancier]={{libdir}}/nagios/plugins/check_datanommer_timesince.py nuancier 23652000 31536000
# These are not actually finished and deployed yet
command[check_datanommer_mailman]={{libdir}}/nagios/plugins/check_datanommer_timesince.py mailman 14400 86400
command[check_datanommer_bugzilla]={{libdir}}/nagios/plugins/check_datanommer_timesince.py bugzilla 86400 259200

View file

@ -0,0 +1,7 @@
command[check_disk_/]={{ libdir }}/nagios/plugins/check_disk -w 14% -c 10% -p /
command[check_disk_/boot]={{ libdir }}/nagios/plugins/check_disk -w 15% -c 10% -p /boot
command[check_disk_/srv/cache/lookaside]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /srv/cache/lookaside
command[check_disk_/srv]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /srv
command[check_disk_/srv/buildmaster]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /srv/buildmaster
command[check_disk_/srv/taskotron]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /srv/taskotron
command[check_disk_/var/log]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 15% -p /var/log

View file

@ -0,0 +1,63 @@
# Fedmsg checks for consumers and producers
command[check_fedmsg_cp_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Nommer MonitoringProducer
command[check_fedmsg_cp_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer
command[check_fedmsg_cp_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-gateway GatewayConsumer MonitoringProducer
command[check_fedmsg_cp_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer
command[check_fedmsg_cp_app]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-relay RelayConsumer MonitoringProducer
command[check_fedmsg_cp_value]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-irc IRCBotConsumer MonitoringProducer
command[check_fedmsg_cp_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub GenACLsConsumer MonitoringProducer
command[check_fedmsg_cp_summershum]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub SummerShumConsumer MonitoringProducer
command[check_fedmsg_cp_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedoraBadgesConsumer MonitoringProducer
command[check_fedmsg_cp_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FMNConsumer DigestProducer ConfirmationProducer MonitoringProducer
command[check_fedmsg_cp_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py moksha-hub BugzillaConsumer MonitoringProducer
command[check_fedmsg_cp_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub FedimgConsumer MonitoringProducer
command[check_fedmsg_cp_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugzillaTicketFiler MonitoringProducer
command[check_fedmsg_cp_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub Masher MonitoringProducer
command[check_fedmsg_cp_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub UpdatesHandler MonitoringProducer
command[check_fedmsg_cp_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub AutoCloudConsumer MonitoringProducer
command[check_fedmsg_cp_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub CacheInvalidator MonitoringProducer
command[check_fedmsg_cp_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub BugyouConsumer MonitoringProducer
command[check_fedmsg_cp_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_producers_consumers.py fedmsg-hub PDCUpdater MonitoringProducer
command[check_fedmsg_cexceptions_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Nommer 1 10
command[check_fedmsg_cexceptions_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10
command[check_fedmsg_cexceptions_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-gateway GatewayConsumer 1 10
command[check_fedmsg_cexceptions_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10
command[check_fedmsg_cexceptions_app]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-relay RelayConsumer 1 10
command[check_fedmsg_cexceptions_value]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-irc IRCBotConsumer 1 10
command[check_fedmsg_cexceptions_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub GenACLsConsumer 1 10
command[check_fedmsg_cexceptions_summershum]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub SummerShumConsumer 1 10
command[check_fedmsg_cexceptions_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedoraBadgesConsumer 1 10
command[check_fedmsg_cexceptions_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FMNConsumer 1 10
command[check_fedmsg_cexceptions_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py moksha-hub BugzillaConsumer 1 10
command[check_fedmsg_cexceptions_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub FedimgConsumer 1 10
command[check_fedmsg_cexceptions_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugzillaTicketFiler 1 10
command[check_fedmsg_cexceptions_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub Masher 1 10
command[check_fedmsg_cexceptions_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub UpdatesHandler 1 10
command[check_fedmsg_cexceptions_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub AutoCloudConsumer 1 10
command[check_fedmsg_cexceptions_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub CacheInvalidator 1 10
command[check_fedmsg_cexceptions_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub BugyouConsumer 1 10
command[check_fedmsg_cexceptions_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_exceptions.py fedmsg-hub PDCUpdater 1 10
command[check_fedmsg_cbacklog_busgateway_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Nommer 500 1000
command[check_fedmsg_cbacklog_busgateway_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50
command[check_fedmsg_cbacklog_busgateway_gateway]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-gateway GatewayConsumer 10 50
command[check_fedmsg_cbacklog_anitya_relay]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50
command[check_fedmsg_cbacklog_app]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-relay RelayConsumer 10 50
command[check_fedmsg_cbacklog_value]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-irc IRCBotConsumer 10 50
command[check_fedmsg_cbacklog_pkgs]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub GenACLsConsumer 10 50
command[check_fedmsg_cbacklog_summershum]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub SummerShumConsumer 100 500
command[check_fedmsg_cbacklog_badges_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedoraBadgesConsumer 7000 10000
command[check_fedmsg_cbacklog_notifs_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FMNConsumer 15000 20000
command[check_fedmsg_cbacklog_bugzilla2fedmsg]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py moksha-hub BugzillaConsumer 10 100
command[check_fedmsg_cbacklog_fedimg_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub FedimgConsumer 2000 5000
command[check_fedmsg_cbacklog_hotness_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugzillaTicketFiler 1000 5000
command[check_fedmsg_cbacklog_bodhi_backend01_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub Masher 500 1000
command[check_fedmsg_cbacklog_bodhi_backend02_hub]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub UpdatesHandler 500 1000
command[check_fedmsg_cbacklog_autocloud_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub AutoCloudConsumer 100 500
command[check_fedmsg_cbacklog_packages_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub CacheInvalidator 20000 30000
command[check_fedmsg_cbacklog_bugyou_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub BugyouConsumer 5000 10000
command[check_fedmsg_cbacklog_pdc_backend]={{libdir}}/nagios/plugins/check_fedmsg_consumer_backlog.py fedmsg-hub PDCUpdater 10000 20000
command[check_fedmsg_fmn_digest_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 90 600
command[check_fedmsg_fmn_confirm_last_ran]={{libdir}}/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 90 600

View file

@ -0,0 +1 @@
command[check_fedmsg_gateway_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-gateway' -u fedmsg

View file

@ -0,0 +1 @@
command[check_fedmsg_hub_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u fedmsg

View file

@ -0,0 +1 @@
command[check_fedmsg_hub_procs_bugyou]={{ libdir }}/nagios/plugins/check_procs -c 3:3 -C 'fedmsg-hub' -u fedmsg

View file

@ -0,0 +1 @@
command[check_fedmsg_irc_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-irc' -u fedmsg

View file

@ -0,0 +1 @@
command[check_fedmsg_masher_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-hub' -u apache

View file

@ -0,0 +1 @@
command[check_fedmsg_relay_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'fedmsg-relay' -u fedmsg

View file

@ -0,0 +1,2 @@
command[check_fmn_worker_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size workers 200 1000
command[check_fmn_backend_queue]={{ libdir }}/nagios/plugins/check_rabbitmq_size backends 100 200

View file

@ -0,0 +1 @@
# The role installs this script under {{ libdir }}/nagios/plugins, so look for it there.
command[check_haproxy_conns]={{ libdir }}/nagios/plugins/check_haproxy_conns.py

View file

@ -0,0 +1 @@
# The role installs this script under {{ libdir }}/nagios/plugins, so look for it there.
command[check_haproxy_mirrorlist]={{ libdir }}/nagios/plugins/check_haproxy_mirrorlist.py

View file

@ -0,0 +1 @@
command[check_ipa_replication]={{ libdir }}/nagios/plugins/check_ipa_replication -u ldaps://localhost/

View file

@ -0,0 +1 @@
command[check_koschei_polling_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-polling -c 1:1

View file

@ -0,0 +1 @@
command[check_koschei_resolver_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-resolve -c 1:1

View file

@ -0,0 +1 @@
command[check_koschei_scheduler_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-schedul -c 1:1

View file

@ -0,0 +1 @@
command[check_koschei_watcher_proc]={{ libdir }}/nagios/plugins/check_procs -s RSD -u koschei -C koschei-watcher -c 1:1

View file

@ -0,0 +1 @@
command[check_lock]={{ libdir }}/nagios/plugins/check_lock

View file

@ -0,0 +1 @@
command[check_lock_file_age]={{ libdir }}/nagios/plugins/check_lock_file_age -w 1 -c 5 -f /var/lock/fedora-ca/lock

View file

@ -0,0 +1,2 @@
# Plugin paths follow the templated libdir; check_memcache_connect is installed
# there by this role.
command[check_memcache]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached
command[check_memcache_connect]={{ libdir }}/nagios/plugins/check_memcache_connect

View file

@ -0,0 +1 @@
# Use the templated libdir rather than a hard-coded /usr/lib64 plugin path.
command[check_merged_file_age]={{ libdir }}/nagios/plugins/check_file_age -w 120 -c 300 /var/log/merged/messages.log

View file

@ -0,0 +1 @@
command[check_mirrorlist_cache]={{ libdir }}/nagios/plugins/check_file_age -w 14400 -c 129600 -f /var/lib/mirrormanager/mirrorlist_cache.pkl

View file

@ -0,0 +1 @@
command[check_mysql_backup]={{ libdir }}/nagios/plugins/check_file_age -w 86400 -c 129600 -f /backups/fpo-mediawiki-latest.xz

View file

@ -0,0 +1 @@
command[check_openvpn_link]={{ libdir }}/nagios/plugins/check_ping -H 192.168.1.41 -w 375.0,20% -c 500,60%

View file

@ -0,0 +1,2 @@
command[check_osbs_builds]={{ libdir }}/nagios/plugins/check_osbs_builds.py
command[check_osbs_api]={{ libdir }}/nagios/plugins/check_osbs_api.py

View file

@ -0,0 +1 @@
command[check_postfix_queue]={{ libdir }}/nagios/plugins/check_postfix_queue -w {{ nrpe_check_postfix_queue_warn }} -c {{ nrpe_check_postfix_queue_crit }}

View file

@ -0,0 +1 @@
command[check_raid]={{ libdir }}/nagios/plugins/check_raid.py

View file

@ -0,0 +1 @@
# The role installs this script under {{ libdir }}/nagios/plugins, so look for it there.
command[check_readonly_fs]={{ libdir }}/nagios/plugins/check_readonly_fs

View file

@ -0,0 +1 @@
# Use the templated libdir rather than a hard-coded /usr/lib64 plugin path.
command[check_redis_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'redis-server' -u redis

View file

@ -0,0 +1 @@
command[check_supybot_fedmsg_plugin]={{libdir}}/nagios/plugins/check_supybot_plugin -t fedmsg

View file

@ -0,0 +1 @@
command[check_swap]={{ libdir }}/nagios/plugins/check_swap -w 15% -c 10%

View file

@ -0,0 +1 @@
command[check_testcloud]={{ libdir }}/nagios/plugins/check_testcloud

View file

@ -0,0 +1 @@
command[check_unbound_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:1 -C 'unbound' -u unbound

View file

@ -0,0 +1 @@
# Use the templated libdir rather than a hard-coded /usr/lib64 plugin path.
command[check_varnish_proc]={{ libdir }}/nagios/plugins/check_procs -c 1:2 -C 'varnishd' -u varnish

View file

@ -0,0 +1,228 @@
#############################################################################
# Sample NRPE Config File
# Written by: Ethan Galstad (nagios@nagios.org)
#
# Last Modified: 11-23-2007
#
# NOTES:
# This is a sample configuration file for the NRPE daemon. It needs to be
# located on the remote host that is running the NRPE daemon, not the host
# from which the check_nrpe client is being executed.
#############################################################################
# LOG FACILITY
# The syslog facility that should be used for logging purposes.
log_facility=daemon
# PID FILE
# The name of the file in which the NRPE daemon should write its process ID
# number. The file is only written if the NRPE daemon is started by the root
# user and is running in standalone mode.
pid_file=/var/run/nrpe/nrpe.pid
# PORT NUMBER
# Port number we should wait for connections on.
# NOTE: This must be a non-privileged port (i.e. > 1024).
# NOTE: This option is ignored if NRPE is running under either inetd or xinetd
server_port=5666
# SERVER ADDRESS
# Address that nrpe should bind to in case there are more than one interface
# and you do not want nrpe to bind on all interfaces.
# NOTE: This option is ignored if NRPE is running under either inetd or xinetd
#server_address=127.0.0.1
# NRPE USER
# This determines the effective user that the NRPE daemon should run as.
# You can either supply a username or a UID.
#
# NOTE: This option is ignored if NRPE is running under either inetd or xinetd
nrpe_user=nrpe
# NRPE GROUP
# This determines the effective group that the NRPE daemon should run as.
# You can either supply a group name or a GID.
#
# NOTE: This option is ignored if NRPE is running under either inetd or xinetd
nrpe_group=nrpe
# ALLOWED HOST ADDRESSES
# This is an optional comma-delimited list of IP address or hostnames
# that are allowed to talk to the NRPE daemon. Network addresses with a bit mask
# (i.e. 192.168.1.0/24) are also supported. Hostname wildcards are not currently
# supported.
#
# Note: The daemon only does rudimentary checking of the client's IP
# address. I would highly recommend adding entries in your /etc/hosts.allow
# file to allow only the specified host to connect to the port
# you are running this daemon on.
#
# NOTE: This option is ignored if NRPE is running under either inetd or xinetd
allowed_hosts=10.5.126.41,192.168.1.10,192.168.1.20,209.132.181.35
# COMMAND ARGUMENT PROCESSING
# This option determines whether or not the NRPE daemon will allow clients
# to specify arguments to commands that are executed. This option only works
# if the daemon was configured with the --enable-command-args configure script
# option.
#
# *** ENABLING THIS OPTION IS A SECURITY RISK! ***
# Read the SECURITY file for information on some of the security implications
# of enabling this variable.
#
# Values: 0=do not allow arguments, 1=allow command arguments
dont_blame_nrpe=0
# COMMAND PREFIX
# This option allows you to prefix all commands with a user-defined string.
# A space is automatically added between the specified prefix string and the
# command line from the command definition.
#
# *** THIS EXAMPLE MAY POSE A POTENTIAL SECURITY RISK, SO USE WITH CAUTION! ***
# Usage scenario:
# Execute restricted commands using sudo. For this to work, you need to add
# the nagios user to your /etc/sudoers. An example entry for allowing
# execution of the plugins might be:
#
# nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/
#
# This lets the nagios user run all commands in that directory (and only them)
# without asking for a password. If you do this, make sure you don't give
# random users write access to that directory or its contents!
# command_prefix=/usr/bin/sudo
# DEBUGGING OPTION
# This option determines whether or not debugging messages are logged to the
# syslog facility.
# Values: 0=debugging off, 1=debugging on
debug=0
# COMMAND TIMEOUT
# This specifies the maximum number of seconds that the NRPE daemon will
# allow plugins to finish executing before killing them off.
command_timeout=100
# CONNECTION TIMEOUT
# This specifies the maximum number of seconds that the NRPE daemon will
# wait for a connection to be established before exiting. This is sometimes
# seen where a network problem stops the SSL being established even though
# all network sessions are connected. This causes the nrpe daemons to
# accumulate, eating system resources. Do not set this too low.
connection_timeout=300
# WEAK RANDOM SEED OPTION
# This directive allows you to use SSL even if your system does not have
# a /dev/random or /dev/urandom (on purpose or because the necessary patches
# were not applied). The random number generator will be seeded from a file
# which is either a file pointed to by the environment variable $RANDFILE
# or $HOME/.rnd. If neither exists, the pseudo random number generator will
# be initialized and a warning will be issued.
# Values: 0=only seed from /dev/[u]random, 1=also seed from weak randomness
#allow_weak_random_seed=1
# INCLUDE CONFIG FILE
# This directive allows you to include definitions from an external config file.
#include=<somefile.cfg>
# INCLUDE CONFIG DIRECTORY
# This directive allows you to include definitions from config files (with a
# .cfg extension) in one or more directories (with recursion).
include_dir=/etc/nrpe.d/
# COMMAND DEFINITIONS
# Command definitions that this daemon will run. Definitions
# are in the following format:
#
# command[<command_name>]=<command_line>
#
# When the daemon receives a request to return the results of <command_name>
# it will execute the command specified by the <command_line> argument.
#
# Unlike Nagios, the command line cannot contain macros - it must be
# typed exactly as it should be executed.
#
# Note: Any plugins that are used in the command lines must reside
# on the machine that this daemon is running on! The examples below
# assume that you have plugins installed in a /usr/local/nagios/libexec
# directory. Also note that you will have to modify the definitions below
# to match the argument format the plugins expect. Remember, these are
# examples only!
# The following examples use hardcoded command arguments...
command[check_users]={{ libdir }}/nagios/plugins/check_users -w 5 -c 10
command[check_load]={{ libdir }}/nagios/plugins/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /dev/hda1
{% if inventory_hostname not in groups['zombie-infested'] %}
command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 5 -c 10 -s Z
{% else %}
# This host is prone to Zombies and we do not care or want to alert on it so we make the limits very high
command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 50000 -c 100000 -s Z
{% endif %}
command[check_total_procs]={{ libdir }}/nagios/plugins/check_procs -w {{ nrpe_procs_warn }} -c {{ nrpe_procs_crit }}
# The following examples allow user-supplied arguments and can
# only be used if the NRPE daemon was compiled with support for
# command arguments *AND* the dont_blame_nrpe directive in this
# config file is set to '1'. This poses a potential security risk, so
# make sure you read the SECURITY file before doing this.
#command[check_users]=/usr/lib64/nagios/plugins/check_users -w $ARG1$ -c $ARG2$
#command[check_load]=/usr/lib64/nagios/plugins/check_load -w $ARG1$ -c $ARG2$
#command[check_disk]=/usr/lib64/nagios/plugins/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
#command[check_procs]=/usr/lib64/nagios/plugins/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$
# NEVER ADD ANYTHING HERE - ANY ENTRIES TO NRPE SHOULD BE in .cfg files in /etc/nrpe.d/
# NEVER NEVER NEVER
#

View file

@ -0,0 +1,78 @@
===================================
Nagios 4 Configuration for Fedora
===================================
The Fedora Infrastructure Nagios is built on a set of configurations
originally written for Nagios 2 and then upgraded over time to Nagios
3 and then 4.08. With additional changes made in the 4.2 series of
Nagios this needed a better rewrite as various parts came from
pre-puppet and then various puppet modules added on top.
In order to get this rewrite done, we will use as much of the original
layout of the Fedora ansible nagios module but with rewrites to better
match current Nagios configurations so that it can be maintained.
Role directory layout
=====================
The original layout branched out from
roles/nagios/client/
roles/nagios/server/
With the usual trees below this. This breaks ansible best practices
and how most new modules are set up so the rewrite uses:
roles/nagios_client/
roles/nagios_server/
=====================
Nagios Server Files
=====================
The Nagios Server Files require a large layout change. The original
Nagios system used multiple independent modes and files which caused
problems when hosts were removed. The new system will use hosts set up
from the Fedora Ansible Inventory with hostgroups set up to match
groups.
roles/nagios_server/{files,handlers,tasks,templates}
r.../n.../files/httpd ==> /etc/httpd/conf.d files
r.../n.../files/nagios ==> /etc/nagios/ files
r.../n.../files/nagios/commands command files
r.../n.../files/nagios/hosts host files
r.../n.../files/nagios/hostgroups groups made from hosts
r.../n.../files/nagios/services services
r.../n.../files/nagios/servicegroups groups made from services
r.../n.../files/nagios/contacts files for people
r.../n.../files/nagios/contactgroups groups made from contacts
similar layout for templates
handlers has the ways to restart and check configuration
tasks has the main rules for building stuff.
===================
Nagios Module Steps
===================
1. Check to see if the nagios user is configured. Someone years ago
chose that our monitoring uses UID/GID 420. Har Har.
Setup any other groups and permissions
2. Install the needed packages for the server.
3. Setup the directories on the server
/etc/nagios/{child}
4. Synchronise over the static files
/etc/nagios/commands/
/etc/nagios/services/
/etc/nagios/servicegroups/
/etc/nagios/contacts/
/etc/nagios/contactgroups/
/usr/lib64/nagios/plugins/
/usr/local/bin
/usr/share/nagios/html/
5. Build template files
/etc/nagios/commands/
/etc/nagios/hosts/{ansible-inventory, ansible-vars, other}
/etc/nagios/hostgroups/
6. Fix selinux policy
7. Restart services

View file

@ -0,0 +1,36 @@
# noc1
ScriptAlias /nagios/cgi-bin/ /usr/lib64/nagios/cgi-bin/
# noc2
ScriptAlias /nagios-external/cgi-bin/ /usr/lib64/nagios/cgi-bin/
# test
ScriptAlias /nagios-just-a-test/cgi-bin/ /usr/lib64/nagios/cgi-bin/
ScriptAlias /tac.cgi /usr/lib64/nagios/cgi-bin/tac.cgi
<Location />
AuthName "Nagios GSSAPI Login"
GssapiCredStore keytab:/etc/krb5.HTTP_admin.fedoraproject.org.keytab
AuthType GSSAPI
# This is off because Apache (and thus mod_auth_gssapi) doesn't know this is proxied over TLS
GssapiSSLonly Off
GssapiLocalName on
Require valid-user
</Location>
<Location ~ "/(nagios|nagios-external|nagios-just-a-test)/cgi-bin/">
Options ExecCGI
</Location>
<Directory "/usr/share/nagios/html">
Options None
</Directory>
Alias /nagios /usr/share/nagios/html/
# This will only affect noc2 because the proxies only forward -external to it.
Alias /nagios-external /usr/share/nagios/html/
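# Test
# NOTE(review): the CGI ScriptAlias and <Location> match above use the prefix
# /nagios-just-a-test, while this HTML alias uses /nagios-test -- confirm the
# two prefixes are really meant to differ.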
Alias /nagios-test /usr/share/nagios/html/

View file

@ -0,0 +1,8 @@
# 'check_bzr' command definition
# I'd like this to actually interact with BZR, but I can't find any
# proper documentation on the protocol to craft send/expect/quit
# strings.
define command{
command_name check_bzr
command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 4155
}

View file

@ -0,0 +1,15 @@
define command {
command_name check_by_ssh_check_raid
command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_raid.py"
}
define command {
command_name check_by_ssh_check_disk
command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_disk -w $ARG1$% -c $ARG2$% -p $ARG3$"
}
# 'check_postgres_conns' command definition
define command{
command_name check_postgres_conns
command_line $USER1$/check_by_ssh -t 30 -H $HOSTADDRESS$ "$USER1$/check_procs -u postgres -w $ARG1$ -c $ARG2$ -a $ARG3$"
}

View file

@ -0,0 +1,11 @@
# 'check_dns' command definition
define command{
command_name check_dns
command_line $USER1$/check_dns -H www.yahoo.com -s $HOSTADDRESS$
}
# 'check_dns_fpo' command definition
define command{
command_name check_dns_fpo
command_line $USER1$/check_dns -t 30 -H fedoraproject.org -A -s $HOSTADDRESS$
}

View file

@ -0,0 +1,8 @@
# 'check_git' command definition
# I'd like this to actually interact with GIT, but I can't find any
# proper documentation on the protocol to craft send/expect/quit
# strings.
define command{
command_name check_git
command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 9418
}

View file

@ -0,0 +1,79 @@
##
## This file has the commands to check and restart general httpd services
## and websites.
##
################################################################################
# COMMAND DEFINITIONS
#
# SYNTAX:
#
# define command{
# template <templatename>
# name <objectname>
# command_name <commandname>
# command_line <commandline>
# }
#
# WHERE:
#
# <templatename> = object name of another command definition that should be
# used as a template for this definition (optional)
# <objectname> = object name of command definition, referenced by other
# command definitions that use it as a template (optional)
# <commandname> = name of the command, as recognized/used by Nagios
# <commandline> = command line
#
################################################################################
# 'restart_httpd' command definition
define command {
command_name restart_httpd
command_line $USER1$/restart_httpd $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ "$HOSTALIAS$" "$SERVICEDESC$" "$SERVICESTATE$"
}
#
# 'check_website_publiclist' command definition
define command{
command_name check_website_publiclist
command_line $USER1$/check_http -w 60 -c 80 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$"
}
# 'check_website' command definition
define command{
command_name check_website
command_line $USER1$/check_http -w 30 -c 40 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$"
}
define command{
command_name check_website_ppc
command_line $USER1$/check_http -w 300 -c 400 -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$"
}
# 'check_website_ssl' command definition
# Same check as check_website but over SSL. $ARG2$ (the -u URL) is quoted, as
# it already is in the non-SSL check_website* commands, so that a URL containing
# shell metacharacters such as '?' or '&' is passed to check_http unchanged.
define command{
command_name check_website_ssl
command_line $USER1$/check_http -w 30 -c 40 --ssl -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$"
}
define command{
command_name check_ssl_cert
command_line $USER1$/check_http -I $HOSTADDRESS$ -H $ARG1$ -C $ARG2$
}
# 'check_website_publiclist_ssl' command definition
# SSL variant of check_website_publiclist. $ARG2$ (the -u URL) is quoted, as it
# already is in the non-SSL check_website* commands, so that a URL containing
# shell metacharacters such as '?' or '&' is passed to check_http unchanged.
define command{
command_name check_website_publiclist_ssl
command_line $USER1$/check_http -w 40 -c 60 --ssl -I $HOSTADDRESS$ -H $ARG1$ -u "$ARG2$" -s "$ARG3$"
}
# 'check_http' command definition
define command{
command_name check_http
command_line $USER1$/check_http -H $HOSTADDRESS$
}
# 'check_https' command definition
define command{
command_name check_https
command_line $USER1$/check_http -H $HOSTADDRESS$ --ssl
}

View file

@ -0,0 +1,29 @@
################################################################################
# COMMAND DEFINITIONS
#
# SYNTAX:
#
# define command{
# template <templatename>
# name <objectname>
# command_name <commandname>
# command_line <commandline>
# }
#
# WHERE:
#
# <templatename> = object name of another command definition that should be
# used as a template for this definition (optional)
# <objectname> = object name of command definition, referenced by other
# command definitions that use it as a template (optional)
# <commandname> = name of the command, as recognized/used by Nagios
# <commandline> = command line
#
################################################################################
# 'check_koji'
define command{
command_name check_koji
command_line $USER1$/check_koji
}

View file

@ -0,0 +1,36 @@
# 'check_local_disk' command definition
define command{
command_name check_local_disk
command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}
# 'check_local_load' command definition
define command{
command_name check_local_load
command_line $USER1$/check_load -w $ARG1$ -c $ARG2$
}
# 'check_local_procs' command definition
define command{
command_name check_local_procs
command_line $USER1$/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$
}
# 'check_local_users' command definition
define command{
command_name check_local_users
command_line $USER1$/check_users -w $ARG1$ -c $ARG2$
}
# 'check_local_swap' command definition
define command{
command_name check_local_swap
command_line $USER1$/check_swap -w $ARG1$ -c $ARG2$
}
# 'check_local_mrtgtraf' command definition
define command{
command_name check_local_mrtgtraf
command_line $USER1$/check_mrtgtraf -F $ARG1$ -a $ARG2$ -w $ARG3$ -c $ARG4$ -e $ARG5$
}

View file

@ -0,0 +1,96 @@
################################################################################
# COMMAND DEFINITIONS
#
# SYNTAX:
#
# define command{
# template <templatename>
# name <objectname>
# command_name <commandname>
# command_line <commandline>
# }
#
# WHERE:
#
# <templatename> = object name of another command definition that should be
# used as a template for this definition (optional)
# <objectname> = object name of command definition, referenced by other
# command definitions that use it as a template (optional)
# <commandname> = name of the command, as recognized/used by Nagios
# <commandline> = command line
#
################################################################################
define command{
command_name true
command_line /bin/true
}
define command{
command_name check_dummy
command_line $USER1$/check_dummy $ARG1$ $ARG2$
}
# 'check_tape'
define command{
command_name check_tape
command_line $USER1$/check_tape
}
# 'check_ftp' command definition
define command{
command_name check_ftp
command_line $USER1$/check_ftp -H $HOSTADDRESS$
}
# 'check_hpjd' command definition
define command{
command_name check_hpjd
command_line $USER1$/check_hpjd -H $HOSTADDRESS$ -C public
}
# 'check_snmp' command definition
define command{
command_name check_snmp
command_line $USER1$/check_snmp -H $HOSTADDRESS$ $ARG1$
}
# 'check_nntp' command definition
define command{
command_name check_nntp
command_line $USER1$/check_nntp -H $HOSTADDRESS$
}
# 'check_telnet' command definition
define command{
command_name check_telnet
command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p 23
}
# 'check_dhcp' command definition
define command{
command_name check_dhcp
command_line $USER1$/check_dhcp $ARG1$
}
# 'check_pop' command definition
define command{
command_name check_pop
command_line $USER1$/check_pop -H $HOSTADDRESS$
}
# 'check_imap' command definition
define command{
command_name check_imap
command_line $USER1$/check_imap -H $HOSTADDRESS$ $ARG1$
}
# 'check_nt' command definition
define command{
command_name check_nt
command_line $USER1$/check_nt -H $HOSTADDRESS$ -p 12489 -v $ARG1$ $ARG2$
}

View file

@ -0,0 +1,87 @@
################################################################################
#
# SAMPLE NOTIFICATION COMMANDS
#
# These are some example notification commands. They may or may not work on
# your system without modification. As an example, some systems will require
# you to use "/usr/bin/mailx" instead of "/usr/bin/mail" in the commands below.
#
################################################################################
# 'host-notify-by-email' command definition
define command{
command_name host-notify-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\nSource: $$(hostname)\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" $CONTACTEMAIL$
}
# 'notify-service-by-email' command definition
define command{
command_name notify-service-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
# 'notify-by-epager' command definition
define command{
command_name notify-by-epager
command_line /usr/bin/printf "%b" "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nInfo: $SERVICEOUTPUT$\nSource: $$(hostname -s)\nDate: $LONGDATETIME$" | /bin/mail -s "$NOTIFICATIONTYPE$: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$" $CONTACTPAGER$
}
# 'host-notify-by-epager' command definition
define command{
command_name host-notify-by-epager
command_line /usr/bin/printf "%b" "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nSource: $$(hostname -s)\nTime: $LONGDATETIME$" | /bin/mail -s "$NOTIFICATIONTYPE$ alert - Host $HOSTNAME$ is $HOSTSTATE$" $CONTACTPAGER$
}
# 'host-notify-by-ircbot' command definition
define command{
command_name host-notify-by-ircbot
command_line /usr/bin/printf "%b" "#fedora-noc $NOTIFICATIONTYPE$ - $HOSTALIAS$ is $HOSTSTATE$: $HOSTOUTPUT$ ($$(hostname -s)) $HOSTACKAUTHOR$ $SERVICEACKAUTHOR$" | /usr/local/bin/irc-colorize.py | nc -w 1 value01 5050
}
# 'notify-by-email' command definition
define command{
command_name notify-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\nSource: $$(hostname)\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
# 'notify-by-ircbot' command definition
define command{
command_name notify-by-ircbot
command_line /usr/bin/printf "%b" "#fedora-noc $NOTIFICATIONTYPE$ - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$: $SERVICEOUTPUT$ ($$(hostname -s)) $HOSTACKAUTHOR$ $SERVICEACKAUTHOR$" | /usr/local/bin/irc-colorize.py | nc -w 1 value01 5050
}
# 'host-notify-by-fedmsg' command definition
define command{
command_name host-notify-by-fedmsg
command_line /usr/bin/echo '{"type": "$NOTIFICATIONTYPE$", "host": "$HOSTALIAS$", "state": "$HOSTSTATE$", "output": "$HOSTOUTPUT$", "host_ack_author": "$HOSTACKAUTHOR$", "service_ack_author": "$SERVICEACKAUTHOR$"}' | fedmsg-logger --cert-prefix nagios --modname nagios --topic host.state.change --json-input
}
# NOTE: 'notify-by-epager' is already defined earlier in this file with an
# identical command_line. Nagios reports a repeated command_name as a duplicate
# definition during config verification, so the second copy is not repeated here.
# 'notify-by-fedmsg' command definition
define command{
command_name notify-by-fedmsg
command_line /usr/bin/echo '{"type": "$NOTIFICATIONTYPE$", "host": "$HOSTALIAS$", "state": "$SERVICESTATE$", "service": "$SERVICEDESC$", "output": "$SERVICEOUTPUT$", "host_ack_author": "$HOSTACKAUTHOR$", "service_ack_author": "$SERVICEACKAUTHOR$"}' | fedmsg-logger --cert-prefix nagios --modname nagios --topic service.state.change --json-input
}
# 'notify-by-xmpp' command definition
define command{
command_name notify-by-xmpp
command_line /usr/local/bin/xmppsend -a /etc/nagios/private/xmppnagios.ini "Service: $SERVICEDESC$\nHost: $HOSTNAME$\nInfo: $SERVICEOUTPUT$\nDate: $LONGDATETIME$" $CONTACTEMAIL$
}
# 'host-notify-by-xmpp' command definition
define command{
command_name host-notify-by-xmpp
command_line /usr/local/bin/xmppsend -a /etc/nagios/private/xmppnagios.ini "Host '$HOSTALIAS$' is $HOSTSTATE$\nInfo: $HOSTOUTPUT$\nDate: $LONGDATETIME$" $CONTACTEMAIL$
}

View file

@ -0,0 +1,17 @@
# 'test nrpe'
define command{
command_name test_nrpe
command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$
}
# 'check by nrpe'
define command{
command_name check_by_nrpe
command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$ -c $ARG1$
}
# 'check-host-alive-nrpe' is better for hosts that are on vpn.
define command{
command_name check-host-alive-nrpe
command_line $USER1$/check_nrpe -t 30 -H $HOSTADDRESS$
}

View file

@ -0,0 +1,26 @@
################################################################################
#
# SAMPLE PERFORMANCE DATA COMMANDS
#
# These are sample performance data commands that can be used to send performance
# data output to two text files (one for hosts, another for services). If you
# plan on simply writing performance data out to a file, consider using the
# host_perfdata_file and service_perfdata_file options in the main config file.
#
################################################################################
# 'process-host-perfdata' command definition
define command{
command_name process-host-perfdata
command_line /usr/bin/printf "%b" "$LASTHOSTCHECK$\t$HOSTNAME$\t$HOSTSTATE$\t$HOSTATTEMPT$\t$HOSTSTATETYPE$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$\n" >> /var/log/nagios/host-perfdata.out
}
# 'process-service-perfdata' command definition
define command{
command_name process-service-perfdata
command_line /usr/bin/printf "%b" "$LASTSERVICECHECK$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICESTATE$\t$SERVICEATTEMPT$\t$SERVICESTATETYPE$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$\n" >> /var/log/nagios/service-perfdata.out
}

View file

@ -0,0 +1,31 @@
# This command checks to see if a host is "alive" by pinging it
# The check must result in a 100% packet loss or 5 second (5000ms) round trip
# average time to produce a critical error.
# Note: Five ICMP echo packets are sent (determined by the '-p 5' argument)
# 'check-host-alive' command definition
define command{
command_name check-host-alive
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
}
define command{
command_name check-host-alive4
command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 2
}
define command{
command_name check-host-alive6
command_line $USER1$/check_ping -6 -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 2
}
# 'check_ping4' and 'check_ping6' command definitions
define command{
command_name check_ping4
command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5
}
define command{
command_name check_ping6
command_line $USER1$/check_ping -6 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5
}

View file

@ -0,0 +1,5 @@
# 'pgsql'
define command{
command_name check_pgsql
command_line $USER1$/check_pgsql -H $HOSTADDRESS$ -d $ARG1$ -p '{{nagios_db_user_password}}' --logname 'nagiosuser'
}

View file

@ -0,0 +1,28 @@
################################################################################
# COMMAND DEFINITIONS
#
# SYNTAX:
#
# define command{
# template <templatename>
# name <objectname>
# command_name <commandname>
# command_line <commandline>
# }
#
# WHERE:
#
# <templatename> = object name of another command definition that should be
# used as a template for this definition (optional)
# <objectname> = object name of command definition, referenced by other
# command definitions that use it as a template (optional)
# <commandname> = name of the command, as recognized/used by Nagios
# <commandline> = command line
#
################################################################################
define command {
command_name restart_rsyslog
command_line $USER1$/restart_rsyslog $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$ $HOSTADDRESS$ "$HOSTALIAS$" "$SERVICEDESC$" "$SERVICESTATE$"
}

View file

@ -0,0 +1,12 @@
# 'check_smtp' command definition
define command{
command_name check_smtp
command_line $USER1$/check_smtp -H $HOSTADDRESS$
}
# 'check_email_delivery' command definition
define command{
command_name check_email_delivery
command_line $USER1$/check_email_delivery_epn -H $ARG1$ --mailto $ARG2$ --mailfrom $ARG3$ --username $ARG4$ --password $ARG5$ -w $ARG6$ -c $ARG7$
}

View file

@ -0,0 +1,22 @@
# 'check_ssh' command definition
define command{
command_name check_ssh
command_line $USER1$/check_ssh -H $HOSTADDRESS$
}
# NOTE: check_by_ssh_check_raid, check_by_ssh_check_disk and
# check_postgres_conns are defined, with identical command lines, in the
# separate check_by_ssh command file of this commands directory. Nagios reports
# a repeated command_name as a duplicate definition during config verification,
# so they are not defined a second time here.

View file

@ -0,0 +1,6 @@
# 'check_tcp' command definition
define command{
command_name check_tcp
command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$
}

View file

@ -0,0 +1,5 @@
# 'check_testcloud'
define command{
command_name check_testcloud
command_line $USER1$/check_testcloud
}

View file

@ -0,0 +1,5 @@
# 'check_udp' command definition
define command{
command_name check_udp
command_line $USER1$/check_udp -H $HOSTADDRESS$ -p $ARG1$
}

View file

@ -0,0 +1,12 @@
# 'check_unbound_80' command definition
define command{
command_name check_unbound_80
command_line $USER1$/check_dig -H $HOSTADDRESS$ -w 5 -c 9 -p 80 -l $ARG1$ -A "+tcp"
}
# 'check_unbound_443' command definition
define command{
command_name check_unbound_443
command_line $USER1$/check_dig_ssl -H $HOSTADDRESS$ -w 5 -c 9 -p 443 -L $ARG1$ -l $ARG2$ -A "+tcp"
}

View file

@ -0,0 +1,22 @@
define hostescalation{
host_name *
hostgroup_name *
contact_groups fedora-sysadmin-email,fedora-sysadmin-emergency,fedora-sysadmin-ircbot
first_notification 2
last_notification 0
notification_interval 60
escalation_period 24x7
escalation_options d,u,r
}
define serviceescalation{
host_name *
service_description *
contact_groups fedora-sysadmin-email,fedora-sysadmin-emergency,fedora-sysadmin-ircbot
first_notification 2
last_notification 0
notification_interval 60
escalation_period 24x7
escalation_options w,u,c,r
}

View file

@ -0,0 +1,362 @@
###############################################################################
# MINIMAL.CFG
#
# MINIMALISTIC OBJECT CONFIG FILE (Template-Based Object File Format)
#
# Last Modified: 08-10-2005
#
#
# NOTE: This config file is intended to be used to test a Nagios installation
# that has been compiled with support for the template-based object
# configuration files.
#
# This config file is intended to serve as an *extremely* simple
# example of how you can create your object configuration file(s).
# If you're interested in more complex object configuration files for
# Nagios, look in the sample-config/template-object/ subdirectory of
# the distribution.
#
###############################################################################
###############################################################################
###############################################################################
#
# TIME PERIODS
#
###############################################################################
###############################################################################
# This defines a timeperiod where all times are valid for checks,
# notifications, etc. The classic "24x7" support nightmare. :-)
define timeperiod{
timeperiod_name 24x7
alias 24 Hours A Day, 7 Days A Week
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}
###############################################################################
###############################################################################
#
# COMMANDS
#
###############################################################################
###############################################################################
# This is a sample service notification command that can be used to send email
# notifications (about service alerts) to contacts.
# 'notify-by-email' command definition
# The plugin output is taken from $SERVICEOUTPUT$; the legacy $OUTPUT$ macro
# used by older sample configs is no longer a standard Nagios macro.
define command{
command_name notify-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
# This is a sample host notification command that can be used to send email
# notifications (about host alerts) to contacts.
# 'host-notify-by-email' command definition
# The plugin output is taken from $HOSTOUTPUT$; the legacy $OUTPUT$ macro used
# by older sample configs is no longer a standard Nagios macro.
define command{
command_name host-notify-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "Host $HOSTSTATE$ alert for $HOSTNAME$!" $CONTACTEMAIL$
}
# Command to check to see if a host is "alive" (up) by pinging it
define command{
command_name check-host-alive
command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w 300,99% -c 500,100% -p 2
}
# Generic command to check a device by pinging it
define command{
command_name check_ping
command_line $USER1$/check_ping -4 -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5
}
# Command used to check disk space usage on local partitions
define command{
command_name check_local_disk
command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}
# Command used to check the number of currently logged in users on the
# local machine
define command{
command_name check_local_users
command_line $USER1$/check_users -w $ARG1$ -c $ARG2$
}
# Command to check the number of running processes on the local machine
define command{
command_name check_local_procs
command_line $USER1$/check_procs -w $ARG1$ -c $ARG2$
}
# Command to check the load on the local machine
define command{
command_name check_local_load
command_line $USER1$/check_load -w $ARG1$ -c $ARG2$
}
###############################################################################
###############################################################################
#
# CONTACTS
#
###############################################################################
###############################################################################
# In this simple config file, a single contact will receive all alerts.
# This assumes that you have an account (or email alias) called
# "nagios-admin" on the local host.
define contact{
contact_name nagios-admin
alias Nagios Admin
service_notification_period 24x7
host_notification_period 24x7
service_notification_options w,u,c,r
host_notification_options d,r
service_notification_commands notify-by-email
host_notification_commands host-notify-by-email
email admin@fedoraproject.org
}
###############################################################################
###############################################################################
#
# CONTACT GROUPS
#
###############################################################################
###############################################################################
# We only have one contact in this simple configuration file, so there is
# no need to create more than one contact group.
define contactgroup{
contactgroup_name admins
alias Nagios Administrators
members nagios-admin
}
###############################################################################
###############################################################################
#
# HOSTS
#
###############################################################################
###############################################################################
# Generic host definition template - This is NOT a real host, just a template!
define host{
name generic-host ; The name of this host template
notifications_enabled 1 ; Host notifications are enabled
event_handler_enabled 1 ; Host event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE!
}
# Since this is a simple configuration file, we only monitor one host - the
# local host (this machine).
define host{
use generic-host ; Name of host template to use
host_name localhost
alias localhost
address 127.0.0.1
check_command check-host-alive
max_check_attempts 10
notification_interval 120
notification_period 24x7
notification_options d,r
contact_groups admins
}
###############################################################################
###############################################################################
#
# HOST GROUPS
#
###############################################################################
###############################################################################
# We only have one host in our simple config file, so there is no need to
# create more than one hostgroup.
define hostgroup{
hostgroup_name test
alias Test Servers
members localhost
}
###############################################################################
###############################################################################
#
# SERVICES
#
###############################################################################
###############################################################################
# Generic service definition template - This is NOT a real service, just a template!
define service{
name generic-service ; The 'name' of this service template
active_checks_enabled 1 ; Active service checks are enabled
passive_checks_enabled 1 ; Passive service checks are enabled/accepted
parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems)
obsess_over_service 1 ; We should obsess over this service (if necessary)
check_freshness 0 ; Default is to NOT check service 'freshness'
notifications_enabled 1 ; Service notifications are enabled
event_handler_enabled 1 ; Service event handler is enabled
flap_detection_enabled 1 ; Flap detection is enabled
failure_prediction_enabled 1 ; Failure prediction is enabled
process_perf_data 1 ; Process performance data
retain_status_information 1 ; Retain status information across program restarts
retain_nonstatus_information 1 ; Retain non-status information across program restarts
register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}
# Define a service to "ping" the local machine
define service{
use generic-service ; Name of service template to use
host_name localhost
service_description PING
is_volatile 0
check_period 24x7
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 960
notification_period 24x7
check_command check_ping!100.0,20%!500.0,60%
}
# Define a service to check the disk space of the root partition
# on the local machine. Warning if < 20% free, critical if
# < 10% free space on partition.
define service{
use generic-service ; Name of service template to use
host_name localhost
service_description Root Partition
is_volatile 0
check_period 24x7
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 960
notification_period 24x7
check_command check_local_disk!20%!10%!/
}
# Define a service to check the number of currently logged in
# users on the local machine. Warning if > 20 users, critical
# if > 50 users.
define service{
use generic-service ; Name of service template to use
host_name localhost
service_description Current Users
is_volatile 0
check_period 24x7
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 960
notification_period 24x7
check_command check_local_users!20!50
}
# Define a service to check the number of currently running procs
# on the local machine. Warning if > 250 processes, critical if
# > 400 processes.
define service{
use generic-service ; Name of service template to use
host_name localhost
service_description Total Processes
is_volatile 0
check_period 24x7
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 960
notification_period 24x7
check_command check_local_procs!250!400
}
# Define a service to check the load on the local machine.
define service{
use generic-service ; Name of service template to use
host_name localhost
service_description Current Load
is_volatile 0
check_period 24x7
max_check_attempts 4
normal_check_interval 5
retry_check_interval 1
contact_groups admins
notification_options w,u,c,r
notification_interval 960
notification_period 24x7
check_command check_local_load!5.0,4.0,3.0!10.0,6.0,4.0
}
# EOF

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,135 @@
###############################################################################
# TIMEPERIODS.CFG - SAMPLE TIMEPERIOD DEFINITIONS
#
#
# NOTES: This config file provides you with some example timeperiod definitions
# that you can reference in host, service, contact, and dependency
# definitions.
#
# You don't need to keep timeperiods in a separate file from your other
# object definitions. This has been done just to make things easier to
# understand.
#
###############################################################################
###############################################################################
###############################################################################
#
# TIME PERIODS
#
###############################################################################
###############################################################################
# Always-on period: every weekday is listed with the full 00:00-24:00 range.
define timeperiod{
timeperiod_name 24x7
alias 24 Hours A Day, 7 Days A Week
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}
# Daily window of 00:00-04:00 plus 13:00-24:00, i.e. 15 hours per day with
# a gap from 04:00 to 13:00. Despite the "16x7" name the ranges add up to
# 15 hours, which is what the alias states; the name is kept as-is because
# other objects may refer to it.
# NOTE(review): times are in the Nagios server's timezone - confirm which one.
define timeperiod{
timeperiod_name 16x7
alias 15 Hours a day, 7 days a week
sunday 00:00-04:00,13:00-24:00
monday 00:00-04:00,13:00-24:00
tuesday 00:00-04:00,13:00-24:00
wednesday 00:00-04:00,13:00-24:00
thursday 00:00-04:00,13:00-24:00
friday 00:00-04:00,13:00-24:00
saturday 00:00-04:00,13:00-24:00
}
# Daily window of 00:00-14:00 plus 22:00-24:00, i.e. 16 hours per day with
# a gap from 14:00 to 22:00.
# NOTE(review): the "-AU" suffix suggests the gap is meant to cover night
# time for an Australia-based contact - confirm against the server timezone.
define timeperiod{
timeperiod_name 16x7-AU
alias 16 Hours a day, 7 days a week
sunday 00:00-14:00,22:00-24:00
monday 00:00-14:00,22:00-24:00
tuesday 00:00-14:00,22:00-24:00
wednesday 00:00-14:00,22:00-24:00
thursday 00:00-14:00,22:00-24:00
friday 00:00-14:00,22:00-24:00
saturday 00:00-14:00,22:00-24:00
}
# Members of sysadmin-main already get nagios messages
# This period lists no day or time ranges, so no time ever falls inside it;
# using it as a notification period effectively silences that contact.
define timeperiod{
timeperiod_name never
alias Never
}
# This defines a timeperiod where all times are valid for checks,
# notifications, etc. The classic "24x7" support nightmare. :-)
#
# NOTE: "24x7" is already defined at the top of this file with identical
# ranges. Nagios refuses to load a second object with the same
# timeperiod_name ("Duplicate definition found for timeperiod '24x7'"),
# so this copy is kept only as a comment.
#define timeperiod{
# timeperiod_name 24x7
# alias 24 Hours A Day, 7 Days A Week
# sunday 00:00-24:00
# monday 00:00-24:00
# tuesday 00:00-24:00
# wednesday 00:00-24:00
# thursday 00:00-24:00
# friday 00:00-24:00
# saturday 00:00-24:00
# }
# 'workhours' timeperiod definition
# Monday through Friday, 09:00-17:00. Saturday and Sunday are not listed,
# so no weekend time falls inside this period.
define timeperiod{
timeperiod_name workhours
alias Normal Work Hours
monday 09:00-17:00
tuesday 09:00-17:00
wednesday 09:00-17:00
thursday 09:00-17:00
friday 09:00-17:00
}
# 'none' timeperiod definition
# No day or time ranges are listed, so no time ever falls inside this period.
define timeperiod{
timeperiod_name none
alias No Time Is A Good Time
}
# Some U.S. holidays
# Note: The timeranges for each holiday are meant to *exclude* the holidays from being
# treated as a valid time for notifications, etc. You probably don't want your pager
# going off on New Year's. Although your employer might... :-)
# The "name" directive makes this definition usable as a template, so other
# timeperiods can pull in these date entries with "use us-holidays".
define timeperiod{
name us-holidays
timeperiod_name us-holidays
alias U.S. Holidays
january 1 00:00-00:00 ; New Years
monday -1 may 00:00-00:00 ; Memorial Day (last Monday in May)
july 4 00:00-00:00 ; Independence Day
monday 1 september 00:00-00:00 ; Labor Day (first Monday in September)
thursday 4 november 00:00-00:00 ; Thanksgiving (4th Thursday in November)
december 25 00:00-00:00 ; Christmas
}
# This defines a modified "24x7" timeperiod that covers every day of the
# year, except for U.S. holidays (defined in the timeperiod above).
# The weekday entries give full-day coverage; the holiday date entries are
# inherited from the us-holidays definition through "use".
define timeperiod{
timeperiod_name 24x7_sans_holidays
alias 24x7 Sans Holidays
use us-holidays ; Get holiday exceptions from other timeperiod
sunday 00:00-24:00
monday 00:00-24:00
tuesday 00:00-24:00
wednesday 00:00-24:00
thursday 00:00-24:00
friday 00:00-24:00
saturday 00:00-24:00
}

View file

@ -0,0 +1,5 @@
# Contact group for Bodhi notifications; bowlofeggs is its only member.
define contactgroup {
contactgroup_name bodhi
alias Bodhi Notifications
members bowlofeggs
}

View file

@ -0,0 +1,5 @@
#define contactgroup{
# contactgroup_name build-sysadmin-email
# alias Build Sysadmin Email Contacts
# members kevin,aditya
# }

View file

@ -0,0 +1,5 @@
# Contacts that receive sysadmin notifications by email.
# NOTE(review): "puiterwijkp" is also listed in the fedora-sysadmin-pager
# group, whose members all carry a trailing "p"; every other member here uses
# the plain name. Confirm whether "puiterwijk" was intended.
define contactgroup{
contactgroup_name fedora-sysadmin-email
alias Fedora Sysadmin Email Contacts
members admin,kevin,puiterwijkp,smooge,ausil,jcollie,nb,rigeld2,codeblock,hvivani
}

View file

@ -0,0 +1,5 @@
# Contact group for the automated notification targets: the ircbot and
# fedmsg contacts.
define contactgroup{
contactgroup_name fedora-sysadmin-ircbot
alias Fedora Sysadmin irc Contacts
members ircbot,fedmsg
}

View file

@ -0,0 +1,10 @@
# Pager contacts for the sysadmin team (the "p"-suffixed contact entries).
define contactgroup{
contactgroup_name fedora-sysadmin-pager
alias Fedora Sysadmin Pager Contacts
members smoogep,kevinp,puiterwijkp
}
# Emergency contacts for the sysadmin team (the "-emergency" contact entries).
# The alias names this group explicitly so it can be told apart from
# fedora-sysadmin-pager in the web interface and in notifications.
define contactgroup{
contactgroup_name fedora-sysadmin-emergency
alias Fedora Sysadmin Emergency Contacts
members smooge-emergency,kevin-emergency,puiterwijk-emergency
}

View file

@ -0,0 +1,5 @@
# Contact group named "null" with a single member, also named "null".
# NOTE(review): presumably a sink for objects that should notify nobody;
# this relies on a contact called "null" being defined elsewhere - confirm.
define contactgroup{
contactgroup_name null
alias null
members null
}

Some files were not shown because too many files have changed in this diff Show more