From 85c486b34b257fb586e105962fbccec2a4803cd6 Mon Sep 17 00:00:00 2001 From: Ralph Bean Date: Wed, 19 Nov 2014 18:35:14 +0000 Subject: [PATCH] Check for connectivity to memcached. This will attempt to call the daemon's stats command which, if broken, might hang and cause nrpe to time out. We want that, as it will give us a clue to what might be causing some other app to fail. --- .../files/scripts/check_memcache_connect | 24 +++++++++++++++++++ roles/nagios_client/tasks/main.yml | 1 + .../templates/check_memcache.cfg.j2 | 2 +- .../files/nagios/services/memcached.cfg | 16 +++++++++++-- roles/nagios_server/files/nrpe.cfg | 1 + 5 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 roles/nagios_client/files/scripts/check_memcache_connect diff --git a/roles/nagios_client/files/scripts/check_memcache_connect b/roles/nagios_client/files/scripts/check_memcache_connect new file mode 100644 index 0000000000..7c472e3ec8 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_memcache_connect @@ -0,0 +1,24 @@ +#!/bin/bash +# +# 2014-12-19 +# Author: Ralph Bean + +# exit codes +ok=0 +warn=1 +crit=2 +unkn=3 + +# Right now we just check to see if we can even run this command without +# hanging and timing out. In the future, we could parse stdout for more +# fine-grained information. +echo stats | nc 127.0.0.1 11211 > /dev/null +status=$? 
+ +if [ $status -ne 0 ]; then + echo "CRIT: stats command got status code $status" + exit $crit +else + echo "OK: stats command got status code $status" + exit $ok +fi diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml index 6c91ddab69..aa9b6c2223 100644 --- a/roles/nagios_client/tasks/main.yml +++ b/roles/nagios_client/tasks/main.yml @@ -31,6 +31,7 @@ - check_fedmsg_producers_consumers.py - check_supybot_plugin - check_datanommer_timesince.py + - check_memcache_connect when: not inventory_hostname.startswith('noc') tags: - nagios_client diff --git a/roles/nagios_client/templates/check_memcache.cfg.j2 b/roles/nagios_client/templates/check_memcache.cfg.j2 index b350a654e3..b0ec100a5d 100644 --- a/roles/nagios_client/templates/check_memcache.cfg.j2 +++ b/roles/nagios_client/templates/check_memcache.cfg.j2 @@ -1,2 +1,2 @@ command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached - +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect diff --git a/roles/nagios_server/files/nagios/services/memcached.cfg b/roles/nagios_server/files/nagios/services/memcached.cfg index 9f497b50c6..814a5a8530 100644 --- a/roles/nagios_server/files/nagios/services/memcached.cfg +++ b/roles/nagios_server/files/nagios/services/memcached.cfg @@ -1,12 +1,24 @@ define service { host_name memcached01 - service_description Check memcached daemon + service_description Check for the presence of the memcached daemon check_command check_by_nrpe!check_memcache use defaulttemplate } define service { host_name memcached02 - service_description Check memcached daemon + service_description Check for the presence of the memcached daemon check_command check_by_nrpe!check_memcache use defaulttemplate } +define service { + host_name memcached01 + service_description Check for connectivity to the memcached daemon + check_command check_by_nrpe!check_memcache_connect + use defaulttemplate +} +define 
service { + host_name memcached02 + service_description Check for connectivity to the memcached daemon + check_command check_by_nrpe!check_memcache_connect + use defaulttemplate +} diff --git a/roles/nagios_server/files/nrpe.cfg b/roles/nagios_server/files/nrpe.cfg index 86af64b5da..4fb1cdb424 100644 --- a/roles/nagios_server/files/nrpe.cfg +++ b/roles/nagios_server/files/nrpe.cfg @@ -238,6 +238,7 @@ command[check_fcomm_queue]=/usr/lib64/nagios/plugins/check_fcomm_queue command[check_redis_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'redis-server' -u redis command[check_openvpn_link]=/usr/lib64/nagios/plugins/check_ping -H 192.168.1.58 -w 375.0,20% -c 500,60% command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect # The following are fedmsg/datanommer checks to be run on busgateway01. # They check for the time since the latest message in any particular category.