put in the first run at new nagios configs
This commit is contained in:
parent
a1957d29d4
commit
8cf72ff116
310 changed files with 13255 additions and 26 deletions
72
roles/nagios_client/files/scripts/check_datanommer_timesince.py
Executable file
72
roles/nagios_client/files/scripts/check_datanommer_timesince.py
Executable file
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python
|
||||
""" NRPE check for datanommer/fedmsg health.
|
||||
Given a category like 'bodhi', 'buildsys', or 'git', return an error if
|
||||
datanommer hasn't seen a message of that type in such and such time.
|
||||
You can alternatively provide a 'topic' which might look like
|
||||
org.fedoraproject.prod.bodhi.update.comment.
|
||||
|
||||
Requires: python-dateutil
|
||||
|
||||
Usage:
|
||||
|
||||
$ check_datanommer_timesince CATEGORY WARNING_THRESH CRITICAL_THRESH
|
||||
|
||||
:Author: Ralph Bean <rbean@redhat.com>
|
||||
|
||||
"""
|
||||
|
||||
import dateutil.relativedelta
|
||||
import subprocess
|
||||
import sys
|
||||
import json
|
||||
|
||||
|
||||
def query_timesince(identifier):
    """ Return the seconds since datanommer last saw a matching message.

    Runs ``datanommer-latest --timesince`` restricted to ``identifier``,
    which is treated as a topic when it contains a '.' and as a category
    otherwise.  The JSON document that follows the "INFO] " log prefix on
    stdout is parsed and its first element is returned as a float.

    Raises ValueError when the output carries no "INFO] " marker (or no
    valid JSON after it); the message then includes the exit status and
    whatever the tool printed, so the Nagios output explains the failure.
    """
    # If it has a '.', then assume it is a topic.
    selector = '--topic' if '.' in identifier else '--category'
    # Build argv directly so the identifier is always passed as one argument.
    argv = ['datanommer-latest', selector, identifier, '--timesince']
    sys.stderr.write("Running %r\n" % ' '.join(argv))
    process = subprocess.Popen(argv, shell=False, universal_newlines=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()

    # Everything up to and including the first "INFO] " is log noise.
    prefix, marker, payload = stdout.partition("INFO] ")
    if not marker:
        raise ValueError(
            "datanommer-latest gave no result (exit status %s): %s" % (
                process.returncode, (stderr or stdout).strip()))
    data = json.loads(payload)
    return float(data[0])
|
||||
|
||||
|
||||
def main():
    """ Compare the time since the last datanommer message to thresholds.

    The last three command line arguments are the category/topic to look
    for and the warning and critical thresholds in seconds.  Prints one
    status line and exits with 0 (OK), 1 (WARN) or 2 (CRIT).

    Raises ValueError when fewer than three arguments are given or a
    threshold is not an integer; the caller turns that into UNKNOWN.
    """
    # sys.argv[0] is the script itself, so three real arguments are needed;
    # without this check the script name would be taken as the identifier.
    if len(sys.argv) < 4:
        raise ValueError(
            "expected arguments: CATEGORY WARNING_THRESH CRITICAL_THRESH")

    identifier, warning_threshold, critical_threshold = sys.argv[-3:]
    timesince = query_timesince(identifier)
    warning_threshold = int(warning_threshold)
    critical_threshold = int(critical_threshold)

    # Render the elapsed time as e.g. "1 days, 2 hours, 3 seconds".
    time_strings = []
    rd = dateutil.relativedelta.relativedelta(seconds=timesince)
    for denomination in [
            'years', 'months', 'days', 'hours', 'minutes', 'seconds']:
        value = getattr(rd, denomination, 0)
        if value:
            time_strings.append("%d %s" % (value, denomination))

    # Every component is zero right after a message arrived; say so rather
    # than ending the sentence with nothing.
    string = ", ".join(time_strings) or "0 seconds"
    reason = "datanommer has not seen a %r message in %s" % (
        identifier, string)

    if timesince > critical_threshold:
        print("CRIT:  %s" % reason)
        sys.exit(2)

    if timesince > warning_threshold:
        print("WARN:  %s" % reason)
        sys.exit(1)

    print("OK:  %s" % reason)
    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # main() normally leaves through sys.exit(); any other failure is
    # reported to Nagios as UNKNOWN (exit status 3).
    try:
        main()
    except Exception as error:
        print("UNKNOWN:  %s" % str(error))
        sys.exit(3)
|
23
roles/nagios_client/files/scripts/check_fcomm_queue
Normal file
23
roles/nagios_client/files/scripts/check_fcomm_queue
Normal file
|
@ -0,0 +1,23 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
|
||||
try:
    # Imported inside the try so a missing library is reported as UNKNOWN.
    import retask.queue

    # (queue length that must be exceeded, exit status, message), worst first.
    levels = (
        (500, 2, "CRITICAL: %i tasks in fcomm queue"),
        (250, 1, "WARNING: %i tasks in fcomm queue"),
    )

    fcomm = retask.queue.Queue('fedora-packages')
    fcomm.connect()
    pending = fcomm.length

    status, template = 0, "OK: %i tasks in fcomm queue"
    for limit, code, message in levels:
        if pending > limit:
            status, template = code, message
            break

    print(template % pending)
    sys.exit(status)

except Exception as e:
    print("UNKNOWN: %s" % str(e))
    sys.exit(3)
|
|
@ -0,0 +1,62 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import zmq
|
||||
|
||||
try:
    # Arguments: service name, consumer name, warning and critical backlog.
    service = sys.argv[1]
    check_consumer = sys.argv[2]
    backlog_warning, backlog_critical = int(sys.argv[3]), int(sys.argv[4])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    if not os.access(fname, os.W_OK):
        print("UNKNOWN - cannot write to %s" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    context = zmq.Context()
    subscriber = context.socket(zmq.SUB)
    subscriber.connect("ipc:///%s" % fname)
    subscriber.setsockopt(zmq.SUBSCRIBE, '')

    poller = zmq.Poller()
    poller.register(subscriber, zmq.POLLIN)

    timeout = 20000

    # Wait for one status message, giving up after `timeout` milliseconds.
    ready = dict(poller.poll(timeout))
    if ready.get(subscriber) != zmq.POLLIN:
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)
    report = json.loads(subscriber.recv())

    for consumer in report['consumers']:
        if consumer['name'] != check_consumer:
            continue

        backlog = consumer['backlog']
        if backlog is None:
            print('ERROR: fedmsg consumer %s is not initialized' % consumer['name'])
            sys.exit(3)

        # The first matching consumer decides the result.
        if backlog > backlog_critical:
            print('CRITICAL: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
            sys.exit(2)
        if backlog > backlog_warning:
            print('WARNING: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
            sys.exit(1)
        print('OK: fedmsg consumer %s backlog value is %i' % (consumer['name'], backlog))
        sys.exit(0)

    print("UNKNOWN: fedmsg consumer %s not found" % check_consumer)
    sys.exit(3)
except Exception as err:
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
|
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import zmq
|
||||
|
||||
try:
    # Arguments: service name, consumer name, warning and critical counts.
    service = sys.argv[1]
    check_consumer = sys.argv[2]
    exceptions_warning, exceptions_critical = int(sys.argv[3]), int(sys.argv[4])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    if not os.access(fname, os.W_OK):
        print("UNKNOWN - cannot write to %s" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    context = zmq.Context()
    subscriber = context.socket(zmq.SUB)
    subscriber.connect("ipc:///%s" % fname)
    subscriber.setsockopt(zmq.SUBSCRIBE, '')
    poller = zmq.Poller()
    poller.register(subscriber, zmq.POLLIN)

    timeout = 20000

    # Wait for one status message, giving up after `timeout` milliseconds.
    ready = dict(poller.poll(timeout))
    if ready.get(subscriber) != zmq.POLLIN:
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)
    report = json.loads(subscriber.recv())

    for consumer in report['consumers']:
        if consumer['name'] != check_consumer:
            continue

        # Pick the message and exit status for the first matching consumer.
        if consumer['exceptions'] > exceptions_critical:
            status = 2
            template = 'CRITICAL: fedmsg consumer %s exceptions value is %i'
        elif consumer['exceptions'] > exceptions_warning:
            status = 1
            template = 'WARNING: fedmsg consumer %s exceptions value is %i'
        else:
            status = 0
            template = 'OK: fedmsg consumer %s exceptions value is %i'

        print(template % (consumer['name'], consumer['exceptions']))
        sys.exit(status)

    print("UNKNOWN: fedmsg consumers %s not found" % check_consumer)
    sys.exit(3)
except Exception as err:
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
|
|
@ -0,0 +1,69 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import arrow
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import zmq
|
||||
|
||||
try:
    # Arguments: service name, producer name, warning and critical age in
    # seconds since the producer last ran.
    service = sys.argv[1]
    check_producer = sys.argv[2]
    elapsed_warning = int(sys.argv[3])
    elapsed_critical = int(sys.argv[4])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    if not os.access(fname, os.W_OK):
        print("UNKNOWN - cannot write to %s" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    connect_to = "ipc:///%s" % fname
    ctx = zmq.Context()
    s = ctx.socket(zmq.SUB)
    s.connect(connect_to)
    s.setsockopt(zmq.SUBSCRIBE, '')

    poller = zmq.Poller()
    poller.register(s, zmq.POLLIN)

    timeout = 20000

    # Wait for one status message, giving up after `timeout` milliseconds.
    events = dict(poller.poll(timeout))
    if s in events and events[s] == zmq.POLLIN:
        msg = s.recv()
        msg = json.loads(msg)
    else:
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)

    now = time.time()

    for prod in msg['producers']:
        if prod['name'] != check_producer:
            continue

        # Without a timestamp the subtraction below would fail with an
        # unhelpful TypeError; report the situation in plain words instead.
        last_ran = prod.get('last_ran')
        if last_ran is None:
            print("UNKNOWN: fedmsg producer %s has no recorded last run" % (
                check_producer))
            sys.exit(3)

        diff = now - last_ran
        then = arrow.get(last_ran).humanize()
        if diff > elapsed_critical:
            print("CRITICAL: %s last ran %s (%i seconds ago)" % (
                check_producer, then, diff))
            sys.exit(2)
        elif diff > elapsed_warning:
            print("WARNING: %s last ran %s (%i seconds ago)" % (
                check_producer, then, diff))
            sys.exit(1)
        else:
            print("OK: %s last ran %s (%i seconds ago)" % (
                check_producer, then, diff))
            sys.exit(0)

    print("UNKNOWN: fedmsg producer %s not found" % check_producer)
    sys.exit(3)
except Exception as err:
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import zmq
|
||||
|
||||
try:
    # Arguments: service name followed by the consumer/producer names that
    # must be present and initialized.
    service = sys.argv[1]
    check_list = frozenset(sys.argv[2:])
    fname = '/var/run/fedmsg/monitoring-%s.socket' % service

    if not check_list:
        print("UNKNOWN - empty list of fedmsg consumers and producers to check")
        sys.exit(3)

    if not os.path.exists(fname):
        print("UNKNOWN - %s does not exist" % fname)
        sys.exit(3)

    if not os.access(fname, os.W_OK):
        print("UNKNOWN - cannot write to %s" % fname)
        sys.exit(3)

    # Subscribe to everything published on the monitoring socket.
    connect_to = "ipc:///%s" % fname
    ctx = zmq.Context()
    s = ctx.socket(zmq.SUB)
    s.connect(connect_to)
    s.setsockopt(zmq.SUBSCRIBE, '')
    poller = zmq.Poller()
    poller.register(s, zmq.POLLIN)

    timeout = 20000

    # Wait for one status message, giving up after `timeout` milliseconds.
    events = dict(poller.poll(timeout))
    if s in events and events[s] == zmq.POLLIN:
        msg = s.recv()
        msg = json.loads(msg)
    else:
        print('UNKNOWN - ZMQ timeout. No message received in %i ms' % timeout)
        sys.exit(3)

    for consumer in msg['consumers']:
        if consumer['name'] in check_list and not consumer['initialized']:
            print('ERROR: fedmsg consumer %s is not initialized' % consumer['name'])
            sys.exit(2)

    for producer in msg['producers']:
        if producer['name'] in check_list and not producer['initialized']:
            print('ERROR: fedmsg producer %s is not initialized' % producer['name'])
            sys.exit(2)

    # Collect the reported names once instead of once per requested item.
    known = set(p['name'] for p in msg['producers'] + msg['consumers'])
    # Sorted so that the same missing plugin is reported on every run.
    for item in sorted(check_list):
        if item not in known:
            print('ERROR: %s not found among installed plugins' % item)
            sys.exit(2)

    print("OK: fedmsg consumer(s) and producer(s) initialized")
    sys.exit(0)

except Exception as err:
    print("UNKNOWN: %s" % str(err))
    sys.exit(3)
|
76
roles/nagios_client/files/scripts/check_haproxy_conns.py
Executable file
76
roles/nagios_client/files/scripts/check_haproxy_conns.py
Executable file
|
@ -0,0 +1,76 @@
|
|||
#!/usr/bin/env python
|
||||
""" Nagios check for haproxy over-subscription.
|
||||
|
||||
fedmsg-gateway is the primary concern as it can eat up a ton of simultaneous
|
||||
connections.
|
||||
|
||||
:Author: Ralph Bean <rbean@redhat.com>
|
||||
"""
|
||||
|
||||
import socket
|
||||
import sys
|
||||
|
||||
|
||||
def _numeric(value):
    """ Cast to int if possible, else to float, else return it unchanged. """
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            # Not parseable by this type; try the next, looser one.
            continue
    return value
|
||||
|
||||
|
||||
def query(sockname="/var/run/haproxy-stat"):
    """ Read stats from the haproxy socket and return a dict.

    Sends ``show info`` to the UNIX stream socket at ``sockname`` and parses
    the "Name: value" lines of the reply, casting numeric values with
    _numeric().

    Socket errors propagate to the caller, and ValueError is raised when
    the reply holds no "Name: value" line, so that a failed query is never
    mistaken for data.
    """
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        # Do not wait forever on a socket that never answers.
        s.settimeout(10)
        s.connect(sockname)
        s.sendall(b'show info\n')

        # Read until the peer closes; a single recv() may return only part
        # of the reply.
        chunks = []
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        s.close()

    response = b''.join(chunks).decode('utf-8', 'replace')

    data = {}
    for line in response.splitlines():
        # Split on the first ':' only, values may contain colons themselves.
        key, separator, value = line.partition(':')
        if not separator:
            continue
        data[key.strip()] = _numeric(value.strip())

    if not data:
        raise ValueError("no statistics received from %s" % sockname)
    return data
|
||||
|
||||
|
||||
def nagios_check(data):
    """ Print warnings and return nagios exit codes.

    ``data`` is a mapping holding the numeric 'CurrConns' and 'Maxconn'
    entries.  Returns 0 below 50% subscription, 1 below 75%, 2 up to 100%
    and 3 (UNKNOWN) above 100% or when the percentage cannot be computed
    (no data, missing or non-numeric entries, a Maxconn of zero).
    """
    try:
        current = data['CurrConns']
        maxconn = data['Maxconn']
        percent = 100 * float(current) / float(maxconn)
        details = "%.2f%% subscribed. %i current of %i maxconn." % (
            percent, current, maxconn,
        )
    except (TypeError, KeyError, ValueError, ZeroDivisionError) as e:
        # Unusable input is an UNKNOWN result, not a crash with a traceback.
        print("HAPROXY SUBS UNKNOWN: unusable haproxy data (%r)" % (e,))
        return 3

    if percent < 50:
        print("HAPROXY SUBS OK: " + details)
        return 0

    if percent < 75:
        print("HAPROXY SUBS WARN: " + details)
        return 1

    if percent <= 100:
        print("HAPROXY SUBS CRIT: " + details)
        return 2

    # More connections than the configured maximum should not be possible.
    print("HAPROXY SUBS UNKNOWN: " + details)
    return 3
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Both the query and the evaluation run inside the try: any failure in
    # either must end as UNKNOWN (3), never as a traceback with exit status 1.
    try:
        data = query(sockname="/var/run/haproxy-stat")
        if data is None:
            # query() may signal a failed read by returning None.
            raise ValueError("no data read from haproxy")
        status = nagios_check(data)
    except Exception as e:
        print("HAPROXY SUBS UNKNOWN: " + str(e))
        sys.exit(3)
    sys.exit(status)
|
59
roles/nagios_client/files/scripts/check_haproxy_mirrorlist.py
Executable file
59
roles/nagios_client/files/scripts/check_haproxy_mirrorlist.py
Executable file
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import socket
|
||||
import sys
|
||||
|
||||
|
||||
# Ask haproxy for its per-server statistics and report how many servers of
# the "mirror-lists" proxy are DOWN.  Any failure while talking to haproxy
# or reading its reply ends as UNKNOWN (3).
try:
    unixsocket = "/var/run/haproxy-stat"
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        # Do not wait forever on a socket that never answers.
        s.settimeout(10)
        s.connect(unixsocket)
        s.sendall(b'show stat\n')

        # Read until the peer closes; a single recv() may return only part
        # of the CSV.
        chunks = []
        while True:
            chunk = s.recv(16384)
            if not chunk:
                break
            chunks.append(chunk)
    finally:
        s.close()

    output = b''.join(chunks).decode('utf-8', 'replace').strip().split('\n')
    # The first line names the CSV columns and starts with "# ".
    fields = output.pop(0).split(',')
    fields[0] = fields[0].replace('# ', '')
    proxies = [
        dict(zip(fields, line.split(','))) for line in output if line.strip()
    ]

    total = 0
    downcount = 0
    downlist = ""
    for proxy in proxies:
        # FRONTEND/BACKEND rows are aggregates, not individual servers.
        if proxy['svname'] == "FRONTEND" or proxy['svname'] == "BACKEND":
            continue
        if proxy['pxname'] == "mirror-lists":
            total += 1
            if proxy['status'] == "DOWN":
                downlist += proxy["svname"] + " "
                downcount += 1

    # Without any server the percentage below is undefined.
    if total == 0:
        raise ValueError("no servers found for proxy mirror-lists")

    unavailability = 100 * float(downcount) / float(total)

except Exception as e:
    print("MIRRORLIST STATE UNKNOWN: " + str(e))
    sys.exit(3)

if unavailability == 0:
    print("MIRRORLIST STATE OK: " + downlist)
    sys.exit(0)

if unavailability < 50:
    print("MIRRORLIST STATE WARN: " + downlist)
    sys.exit(1)

# Half or more of the servers are down.
print("MIRRORLIST STATE CRIT: " + downlist)
sys.exit(2)
|
74
roles/nagios_client/files/scripts/check_ipa_replication
Normal file
74
roles/nagios_client/files/scripts/check_ipa_replication
Normal file
|
@ -0,0 +1,74 @@
|
|||
#!/usr/bin/python
|
||||
# Source: https://github.com/opinkerfi/nagios-plugins/blob/master/check_ipa/check_ipa_replication
|
||||
# Copyright 2013, Tomas Edwardsson
|
||||
# Copyright 2016, Patrick Uiterwijk
|
||||
#
|
||||
# This script is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This script is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import ldap
|
||||
from pynag.Plugins import PluginHelper, critical, warning, ok
|
||||
|
||||
plugin = PluginHelper()

plugin.parser.add_option('-u', help="ldap uri", dest="uri")
plugin.parser.add_option('-D', help="bind DN", dest="binddn")
plugin.parser.add_option('-w', help="bind password", dest="bindpw")
plugin.parse_arguments()

if not plugin.options.uri:
    plugin.parser.error('-u (uri) argument is required')

try:
    l = ldap.initialize(plugin.options.uri)

    # Bind only when a DN was given.
    if plugin.options.binddn:
        l.bind_s(plugin.options.binddn, plugin.options.bindpw)

    replication = l.search_s(
        'cn=config',
        ldap.SCOPE_SUBTREE,
        '(objectclass=nsds5replicationagreement)',
        ['nsDS5ReplicaHost', 'nsds5replicaLastUpdateStatus'])
except Exception as e:
    plugin.status(critical)
    plugin.add_summary("Unable to initialize ldap connection: %s" % (e))
    plugin.exit()


# Loop through replication agreements
for rhost in replication:
    host = rhost[1]['nsDS5ReplicaHost'][0]
    status = rhost[1]['nsds5replicaLastUpdateStatus'][0]
    plugin.add_summary("Replica %s Status: %s" % (host, status))

    if status.startswith('Error ('):
        # IPA >=4.4.0: the code sits between the first pair of parentheses.
        code = status[status.find('(') + 1:status.find(')')]
    else:
        # IPA <4.4.0: the code is the first word.  split() also copes with a
        # status that is nothing but the code.
        code = status.split(' ', 1)[0]

    if code == '0':
        plugin.status(ok)
    elif code == '1':
        # Busy Replica is not an error, its "unknown" (but its "ok" for now)
        plugin.status(ok)
    else:
        plugin.status(critical)

if not len(replication):
    plugin.add_summary("Warning: No replicas found")
    plugin.status(warning)

plugin.exit()
|
||||
|
17
roles/nagios_client/files/scripts/check_lock
Normal file
17
roles/nagios_client/files/scripts/check_lock
Normal file
|
@ -0,0 +1,17 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import fcntl
|
||||
import sys
|
||||
|
||||
# Check that the test file on /mnt/koji can be opened and exclusively locked.
try:
    # Opening for reading first makes the check fail when the file does not
    # already exist; the write handle is the one that gets locked.
    with open('/mnt/koji/.nagios_test', 'r'):
        pass
    f = open('/mnt/koji/.nagios_test', 'w')
except IOError:
    print("Could not create file")
    sys.exit(2)

try:
    # Blocks until the exclusive lock is granted.
    fcntl.flock(f, fcntl.LOCK_EX)
except IOError as e:
    # A refused lock is a failed check, not a traceback with exit status 1.
    print("Could not lock file: %s" % e)
    sys.exit(2)
finally:
    # Closing the file also releases the lock.
    f.close()

print("File Locked Successfully")
sys.exit(0)
|
123
roles/nagios_client/files/scripts/check_lock_file_age
Executable file
123
roles/nagios_client/files/scripts/check_lock_file_age
Executable file
|
@ -0,0 +1,123 @@
|
|||
#! /usr/bin/perl -w
|
||||
|
||||
# check_lock_file_age.pl Copyright (C) 2010 Ricky Elrod <codeblock@fedoraproject.org>
|
||||
#
|
||||
# Fork of check_file_age.pl
|
||||
#
|
||||
# Checks a lock file's size and modification time to make sure it's not empty
|
||||
# and that it's sufficiently recent.
|
||||
#
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty
|
||||
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# you should have received a copy of the GNU General Public License
|
||||
# along with this program (or with Nagios); if not, write to the
|
||||
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
# Boston, MA 02111-1307, USA
|
||||
|
||||
use strict;
|
||||
use English;
|
||||
use Getopt::Long;
|
||||
use File::stat;
|
||||
use vars qw($PROGNAME);
|
||||
use lib "/usr/lib64/nagios/plugins";
|
||||
use utils qw (%ERRORS &print_revision &support);
|
||||
|
||||
sub print_help ();
|
||||
sub print_usage ();
|
||||
|
||||
my ($opt_c, $opt_f, $opt_w, $opt_h, $opt_V);
my ($result, $age, $st);

$PROGNAME="check_lock_file_age";

# Default thresholds, in minutes.
$opt_w = 1;
$opt_c = 5;
$opt_f = "";

Getopt::Long::Configure('bundling');
GetOptions(
	"V"   => \$opt_V, "version"	=> \$opt_V,
	"h"   => \$opt_h, "help"	=> \$opt_h,
	# The long form needs "=s" too, otherwise --file takes no value.
	"f=s" => \$opt_f, "file=s"	=> \$opt_f,
	"w=f" => \$opt_w, "warning-age=f" => \$opt_w,
	"c=f" => \$opt_c, "critical-age=f" => \$opt_c);

if ($opt_V) {
	print_revision($PROGNAME, '1.4.14');
	exit $ERRORS{'OK'};
}

if ($opt_h) {
	print_help();
	exit $ERRORS{'OK'};
}

if (($opt_c and $opt_w) and ($opt_c < $opt_w)) {
	print "Warning time must be less than Critical time.\n";
	exit $ERRORS{'UNKNOWN'};
}

# Without -f the first remaining argument is taken as the file name.
$opt_f = shift unless ($opt_f);

if (! $opt_f) {
	print "LOCK_FILE_AGE UNKNOWN: No file specified\n";
	exit $ERRORS{'UNKNOWN'};
}

# Check that file exists (can be directory or link)
unless (-e $opt_f) {
	print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n";
	exit $ERRORS{'OK'};
}

$st = File::stat::stat($opt_f);
# The lock file may vanish between the test above and the stat; treat that
# like a removed lock instead of dying on an undefined value.
unless ($st) {
	print "LOCK_FILE_AGE OK: File not found (Lock file removed) - $opt_f\n";
	exit $ERRORS{'OK'};
}
$age = time - $st->mtime;

$result = 'OK';

# Convert minutes to seconds
if($opt_c) { $opt_c *= 60; }
if($opt_w) { $opt_w *= 60; }

if ($opt_c and $age > $opt_c) {
	$result = 'CRITICAL';
}
elsif ($opt_w and $age > $opt_w) {
	$result = 'WARNING';
}

# If the age is higher than 2 minutes, convert seconds -> minutes
# If it's higher than a day, use days.
# Just a nicety, to make people not have to do math ;)
if($age > 86400) { $age = int(($age/86400))." days"; }
elsif($age > 120) { $age = int(($age/60))." minutes"; }
else { $age = "$age seconds"; }

print "LOCK_FILE_AGE $result: $opt_f is $age old.\n";
exit $ERRORS{$result};
|
||||
|
||||
# Print the command line synopsis.  The -w/-c thresholds are given in
# minutes (the script multiplies them by 60 before comparing), so the
# synopsis says <mins>.
sub print_usage () {
	print "Usage:\n";
	print " $PROGNAME [-w <mins>] [-c <mins>] -f <file>\n";
	print " $PROGNAME [-h | --help]\n";
	print " $PROGNAME [-V | --version]\n";
}
|
||||
|
||||
# Print revision, copyright, usage, the threshold explanation and the
# support text, in that order.
sub print_help () {
	print_revision($PROGNAME, '1.4.14');
	print "Copyright (c) 2010 Ricky Elrod\n\n";
	print_usage();
	print "\n",
	      " <mins> File must be no more than this many minutes old (default: warn 1m, crit 5m)\n",
	      "\n";
	support();
}
|
24
roles/nagios_client/files/scripts/check_memcache_connect
Normal file
24
roles/nagios_client/files/scripts/check_memcache_connect
Normal file
|
@ -0,0 +1,24 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# 2014-11-19
|
||||
# Author: Ralph Bean <rbean@redhat.com>
|
||||
|
||||
# exit codes
|
||||
ok=0
warn=1
crit=2
unkn=3

# For now the only question is whether the stats command can be run at all
# without hanging or timing out; its output is thrown away.  Parsing stdout
# for finer-grained information is left for later.
echo stats | nc 127.0.0.1 11211 > /dev/null
status=$?

if [ $status -eq 0 ]; then
    echo "OK: stats command got status code $status"
    exit $ok
fi

echo "CRIT: stats command got status code $status"
exit $crit
|
14
roles/nagios_client/files/scripts/check_osbs_api.py
Executable file
14
roles/nagios_client/files/scripts/check_osbs_api.py
Executable file
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import requests
|
||||
import sys
|
||||
|
||||
# Ask the local OSBS API for its root document; a healthy endpoint answers
# with JSON that contains a 'paths' entry.
try:
    # Certificate verification is off for the localhost endpoint; the
    # timeout keeps a hung API from hanging the check.
    r = requests.get("https://localhost:8443/", verify=False, timeout=30)
    responding = 'paths' in r.json()
except Exception as e:
    # Connection failures and non-JSON replies both mean the API is not
    # usable; report that instead of dying with a traceback.
    print("CRITICAL: OSBS API not responding properly (%s)" % e)
    sys.exit(2)

if responding:
    print("OK: OSBS API endpoint is responding with path data")
    sys.exit(0)
else:
    print("CRITICAL: OSBS API not responding properly")
    sys.exit(2)
|
||||
|
23
roles/nagios_client/files/scripts/check_osbs_builds.py
Executable file
23
roles/nagios_client/files/scripts/check_osbs_builds.py
Executable file
|
@ -0,0 +1,23 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Run "osbs list-builds" and treat a listing that starts with the BUILD
# column header as proof that OSBS is responsive.
try:
    sp = subprocess.Popen(
        ["osbs", "list-builds"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        stdin=subprocess.PIPE,
        universal_newlines=True
    )
    sp_out, sp_err = sp.communicate()
except OSError as e:
    # The osbs client could not be started at all.
    print("CRITICAL: OSBS UNRESPONSIVE (%s)" % e)
    sys.exit(2)

# NOTE(review): the header is looked for on stderr, as this check has always
# done; presumably that is where osbs writes the listing -- confirm.
sp_err = sp_err.split('\n')

# Skip the notice osbs prints when it is not run from a terminal.
if 'not attached to terminal' in sp_err[0]:
    sp_err = sp_err[1:]

# An empty or blank first line means there is no listing at all.
header = sp_err[0].split() if sp_err else []

if header and header[0] == 'BUILD':
    print("OK: OSBS is responsive to 'osbs list-builds'")
    sys.exit(0)
else:
    print("CRITICAL: OSBS UNRESPONSIVE")
    sys.exit(2)
|
49
roles/nagios_client/files/scripts/check_postfix_queue
Normal file
49
roles/nagios_client/files/scripts/check_postfix_queue
Normal file
|
@ -0,0 +1,49 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# 19-07-2010
|
||||
# Author: Cherwin Nooitmeer <cherwin@gmail.com>
|
||||
#
|
||||
|
||||
# exit codes
|
||||
e_ok=0
e_warning=1
e_critical=2
e_unknown=3

# regular expression that matches queue IDs (e.g. D71EF7AC80F8)
queue_id='^[A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9][A-F0-9]'

usage="Invalid command line usage"

if [ -z "$1" ]; then
    echo "$usage"
    exit $e_unknown
fi

while getopts ":w:c:" options
do
    case $options in
        w ) warning=$OPTARG ;;
        c ) critical=$OPTARG ;;
        * ) echo "$usage"
            exit $e_unknown ;;
    esac
done

# Both thresholds are required and must be whole numbers; otherwise the
# numeric comparisons below fail and no exit status would be chosen.
for threshold in "$warning" "$critical"; do
    case "$threshold" in
        ''|*[!0-9]*)
            echo "$usage"
            exit $e_unknown ;;
    esac
done

# determine queue size
qsize=$(mailq | grep -E -c "$queue_id")
if [ -z "$qsize" ]
then
    exit $e_unknown
fi

if [ "$qsize" -ge "$critical" ]; then
    retval=$e_critical
elif [ "$qsize" -ge "$warning" ]; then
    retval=$e_warning
else
    retval=$e_ok
fi

echo "$qsize mail(s) in queue | mail_queue=$qsize"
exit $retval
|
26
roles/nagios_client/files/scripts/check_rabbitmq_size
Normal file
26
roles/nagios_client/files/scripts/check_rabbitmq_size
Normal file
|
@ -0,0 +1,26 @@
|
|||
#!/bin/python
|
||||
import sys
|
||||
import requests
|
||||
|
||||
# Usage: check_rabbitmq_size QUEUE CRITICAL_THRESHOLD WARNING_THRESHOLD
#
# Reports CRITICAL when the queue has no consumers, otherwise compares the
# number of queued messages against the two thresholds.
try:
    queue_name = sys.argv[1]
    # Command line arguments are strings; they must be converted before
    # being compared with the integer message count, or the comparison is
    # meaningless.
    critical_threshold = int(sys.argv[2])
    warning_threshold = int(sys.argv[3])

    # %%2f is the URL-encoded default vhost "/".
    url = 'http://localhost:15672/api/queues/%%2f/%s' % (queue_name)

    r = requests.get(url, auth=('guest', 'guest')).json()
    consumers = r['consumers']
    messages = r['messages']

    msg = 'Messages in queue: %i (%i consumers)' % (messages, consumers)
except Exception as e:
    # Bad arguments, an unreachable API or an unexpected reply.
    print('UNKNOWN: %s' % e)
    sys.exit(3)

if consumers < 1:
    print('CRITICAL: No consumers: %s' % msg)
    sys.exit(2)

if messages > critical_threshold:
    print('CRITICAL: %s' % msg)
    sys.exit(2)

if messages > warning_threshold:
    print('WARNING: %s' % msg)
    sys.exit(1)

print('OK: %s' % msg)
sys.exit(0)
|
45
roles/nagios_client/files/scripts/check_raid.py
Normal file
45
roles/nagios_client/files/scripts/check_raid.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# very simple python script to parse out /proc/mdstat
|
||||
# and give results for nagios to monitor
|
||||
#
|
||||
|
||||
import sys
|
||||
import string
|
||||
|
||||
import re

# The per-member state map of an array, e.g. "[UU]" or "[U_]"; an
# underscore marks a missing member.
STATE_MAP = re.compile(r'\[[U_]+\]')


def find_degraded(mdstat):
    """ Return (devices, problems) for the given lines of /proc/mdstat.

    ``devices`` lists every md device found, ``problems`` holds one
    "device=... status=..." message per array whose state map contains an
    underscore.  The state map is searched for anywhere on the line that
    follows the device line, because its position depends on the metadata
    format ("blocks [2/2] [UU]" versus "blocks super 1.2 [2/2] [UU]").
    """
    devices = []
    problems = []
    for i, line in enumerate(mdstat):
        if line[0:2] != 'md':
            continue
        device = line.split()[0]
        devices.append(device)

        detail = mdstat[i + 1] if i + 1 < len(mdstat) else ''
        found = STATE_MAP.search(detail)
        status = found.group(0) if found else ''
        if '_' not in status:
            continue

        # see if we can figure out what's going on
        err = mdstat[i + 2].split() if i + 2 < len(mdstat) else []
        msg = "device=%s status=%s" % (device, status)
        if len(err) > 0:
            msg = msg + " rebuild=%s" % err[0]
        problems.append(msg)
    return devices, problems


def main():
    """ Print the RAID state and exit 0 (healthy or no md) or 2 (degraded). """
    try:
        with open('/proc/mdstat') as handle:
            mdstat = handle.read().split('\n')
    except IOError:
        # seems we have no software raid on this machines
        sys.exit(0)

    devices, problems = find_degraded(mdstat)
    if not problems:
        print("DEVICES %s OK" % " ".join(devices))
        sys.exit(0)

    print(", ".join(problems))
    sys.exit(2)


if __name__ == '__main__':
    main()
|
||||
|
84
roles/nagios_client/files/scripts/check_readonly_fs
Executable file
84
roles/nagios_client/files/scripts/check_readonly_fs
Executable file
|
@ -0,0 +1,84 @@
|
|||
#!/bin/bash
|
||||
|
||||
# check_readonlyfs: Check for readonly filesystems
|
||||
# Copyright (C) 2010 Davide Madrisan <davide.madrisan@gmail.com>
|
||||
|
||||
# Name of this script, the directory it lives in, and its revision number.
PROGNAME=$(/bin/basename $0)
# Kept in backticks on purpose: backslash handling inside backticks differs
# from $(...), and the sed expression relies on it.
PROGPATH=`echo $0 | sed -e 's,[\\/][^\\/][^\\/]*$,,'`
REVISION=$(echo '$Revision: 1 $' | sed -e 's/[^0-9.]//g')

# Load utils.sh from the directory this script lives in.
. $PROGPATH/utils.sh
|
||||
|
||||
# Print the three supported invocations, one per line.
print_usage() {
    printf '%s\n' \
        "Usage: $PROGNAME --no-network-fs" \
        "Usage: $PROGNAME --help" \
        "Usage: $PROGNAME --version"
}
|
||||
|
||||
# Print revision, usage, a one-line description and the support text,
# separated by blank lines.
print_help() {
    print_revision $PROGNAME $REVISION
    printf '\n'
    print_usage
    printf '\n%s\n\n' "readonly filesystem checker plugin for Nagios"
    support
}
|
||||
|
||||
# Network filesystems are checked unless --no-network-fs is given.
NETFS=1

# Exit status used when a read-only filesystem is found.
exitstatus=$STATE_WARNING #default

# Grab the command line arguments
while [ -n "$1" ]; do
    case "$1" in
        -h|--help)
            print_help
            exit $STATE_OK
            ;;
        -V|--version)
            print_revision $PROGNAME $REVISION
            exit $STATE_OK
            ;;
        -n|--no-network-fs)
            NETFS="0"
            ;;
        *)
            echo "Unknown argument: $1"
            print_usage
            exit $STATE_UNKNOWN
            ;;
    esac
    shift
done

if [ ! -r /proc/mounts ]; then
    echo "cannot read /proc/mounts!"
    exit $STATE_UNKNOWN
fi

nerr=0
IFS_SAVE="$IFS"

rofs_list=""
while read dev mp fs mopt ignore; do
    [ "$dev" = none ] && continue

    # Pseudo and removable-media filesystems are never reported.
    case $fs in binfmt_misc|devpts|iso9660|proc|selinuxfs|rpc_pipefs|sysfs|tmpfs|usbfs)
        continue ;;
    esac

    case $fs in autofs|nfs|nfs4|smbfs)
        # skip the network filesystems
        [ "$NETFS" = 0 ] && continue ;;
    esac

    # Split the comma separated mount options into the positional parameters.
    IFS=","; set -- $mopt; IFS="$IFS_SAVE"
    for opt in "$@"; do
        # An empty option ends the scan of this mount.
        [ -z "$opt" ] && break
        if [ "$opt" = ro ]; then
            rofs_list="$rofs_list $mp"
            nerr=$(( $nerr + 1 ))
        fi
    done
done < <(LC_ALL=C /bin/cat /proc/mounts 2>/dev/null)

if [ $nerr -eq 0 ]; then
    echo OK
    exit $STATE_OK
fi

echo "$rofs_list: read only fs"
exit $exitstatus
|
108
roles/nagios_client/files/scripts/check_supybot_plugin
Executable file
108
roles/nagios_client/files/scripts/check_supybot_plugin
Executable file
|
@ -0,0 +1,108 @@
|
|||
#!/usr/bin/env python
|
||||
""" check_supybot_plugin -- ensure that a plugin is loaded by supybot.
|
||||
|
||||
Run like:
|
||||
|
||||
check_supybot_plugin --target fedmsg
|
||||
check_supybot_plugin --target koji --debug
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import socket
|
||||
import string
|
||||
import uuid
|
||||
|
||||
|
||||
def process_args():
    """ Parse the command line and return the argparse namespace.

    The namespace carries ``target`` (plugin to look for, None when not
    given), ``nick`` (None when not given), ``debug`` (bool), ``host`` and
    ``port`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--target', default=None, dest='target',
        help="Required.  The plugin we're looking for."
    )
    parser.add_argument(
        '-n', '--nick', default=None, dest='nick',
        help="NICK to use when connecting to freenode.",
    )
    parser.add_argument(
        '-d', '--debug', default=False, action='store_true',
        help='Print out debug information.', dest='debug',
    )
    parser.add_argument(
        '-H', '--host', default='irc.freenode.net',
        help='Host to connect to.', dest='host',
    )
    parser.add_argument(
        '-p', '--port', default=6667, type=int,
        help='Port to connect to.', dest='port',
    )
    return parser.parse_args()
|
||||
|
||||
args = process_args()

# Use a random nick so people can't mess with us
if not args.nick:
    args.nick = 'nrpe-' + str(uuid.uuid4()).split('-')[0]

name = "NRPE Bot"
readbuffer = ""

if not args.target:
    print("UNKNOWN: No 'target' specified.")
    sys.exit(3)

args.target = args.target.lower()

if args.debug:
    print("connecting to %s/%i" % (args.host, args.port))

try:
    s = socket.socket()
    s.connect((args.host, args.port))

    if args.debug:
        print("as %s/%s (%s)" % (args.nick, args.nick, name))

    s.send("nick %s\r\n" % args.nick)
    s.send("USER %s %s bla :%s\r\n" % (args.nick, args.host, name))

    while 1:
        chunk = s.recv(1024)
        if not chunk:
            # recv() returns an empty string once the server has closed the
            # connection; without this check the loop would spin forever.
            raise IOError("connection closed by %s" % args.host)

        # Keep the trailing, possibly incomplete line for the next round.
        readbuffer = readbuffer + chunk
        temp = readbuffer.split("\n")
        readbuffer = temp.pop()

        for line in temp:
            line = line.rstrip()

            if args.debug:
                print(" *  %s" % line)

            line = line.split()

            # Blank or one-word lines carry nothing we look at.
            if len(line) < 2:
                continue

            # Answer server pings so the connection is not dropped.
            if line[0] == 'PING':
                s.send("PONG %s\r\n" % line[1])
                continue

            # The MODE line is taken as the sign that we are registered;
            # now ask the bot for its plugin list.
            if line[1] == 'MODE':
                msg = "privmsg zodbot :list\r\n"
                if args.debug:
                    print("sending:")
                    print(" -> %s" % msg)
                s.send(msg)

            if line[1] == 'PRIVMSG':
                if args.debug:
                    print("Got our response..")

                # NOTE(review): the first word of the reply text (line[3])
                # is skipped, as this check has always done -- confirm that
                # it never holds a plugin name.
                plugins = [
                    plugin.lower()
                    for plugin in ' '.join(line[4:]).split(', ')
                ]

                if args.target in plugins:
                    print("OK")
                    s.send("QUIT\r\n")
                    sys.exit(0)
                else:
                    print("CRITICAL: %r not loaded by supybot" % args.target)
                    s.send("QUIT\r\n")
                    sys.exit(2)
except Exception as e:
    print("UNKNOWN:  %s" % str(e))
    if args.debug:
        raise
    sys.exit(3)
|
19
roles/nagios_client/files/scripts/check_testcloud
Normal file
19
roles/nagios_client/files/scripts/check_testcloud
Normal file
|
@ -0,0 +1,19 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Count the lines of "testcloud instance list" that mention a running
# instance (case-insensitive).
RUNNING_VMS=$(testcloud instance list | grep -i 'running' | wc -l)
CRITICAL=20
WARNING=15

# Decide the state first, then report it once.
if [ $RUNNING_VMS -gt $CRITICAL ]; then
    state="CRITICAL"
    code=2
elif [ $RUNNING_VMS -gt $WARNING ]; then
    state="WARNING"
    code=1
else
    state="OK"
    code=0
fi

echo "Testcloud: $state Number of VMs running: $RUNNING_VMS"
exit $code
|
Loading…
Add table
Add a link
Reference in a new issue