script to monitor systemd units on pagure

This commit is contained in:
seddikalaouiismaili 2020-12-20 01:10:52 +01:00 committed by pingou
parent aace9bb2cc
commit 890dd31cb0
5 changed files with 68 additions and 1 deletions

View file

@ -304,6 +304,17 @@
tags:
- nagios_client
# Deploy the NRPE command definition for the systemd units check.
# The file is rendered from check_systemd_units.cfg.j2 into /etc/nrpe.d/ and
# only on hosts whose inventory name starts with "pagure02"; a change to the
# rendered file notifies the "restart nrpe" handler.
- name: install nrpe check for systemd units
template:
src={{ item }}.j2 dest=/etc/nrpe.d/{{ item }} owner=root group=root mode=0644
with_items:
- check_systemd_units.cfg
when: inventory_hostname.startswith('pagure02')
notify:
- restart nrpe
tags:
- nagios_client
- name: Check if the fedmsg group exists
shell: /usr/bin/getent group fedmsg | /usr/bin/wc -l | tr -d ' '
register: fedmsg_exists

View file

@ -0,0 +1 @@
# NRPE command "check_systemd_units": runs the check_systemd_units plugin
# from the nagios plugins directory, with no arguments.
command[check_systemd_units]=/usr/lib64/nagios/plugins/check_systemd_units

View file

@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Description : Nagios/NRPE check for the pagure systemd units.
#               Every unit found in the "failed" state is restarted once;
#               units whose restart command fails are reported as WARNING.
# Output      : exactly one status line on stdout. Nagios shows the first
#               line of plugin output, so nothing else may be printed
#               before it.
# Exit codes  : 0 = OK, 1 = WARNING
# Author      : Seddik Alaoui Ismaili
# Version     : 1.0

# Exit codes
readonly warning_exit=1
readonly ok_exit=0

# Unit list
unit_list=(
  pagure_ci
  pagure_ev
  pagure_fast_worker
  pagure_loadjson
  pagure_logcom
  pagure_medium_worker
  pagure_milter
  pagure_mirror
  pagure_slow_worker
  pagure_webhook
  pagure_worker
  pagure_mirror_project_in.timer
)

# Units that were failed and could not be restarted
failed_array=()

# Check each unit's status and try one restart for the failed ones
for unit in "${unit_list[@]}"; do
  # "is-failed" exits 0 only when the unit is in the failed state, so there
  # is no need to parse the human-oriented "systemctl status" output (and no
  # empty-string comparison when a unit is unknown).
  if systemctl is-failed --quiet "${unit}"; then
    # Restart output is discarded on purpose: it would otherwise end up in
    # front of the Nagios status line.
    if ! systemctl restart "${unit}" >/dev/null 2>&1; then
      failed_array+=("${unit}")
    fi
  fi
done

# Print the result and exit with the matching code for Nagios
if (( ${#failed_array[@]} > 0 )); then
  printf 'WARNING - Failed systemd units after restart : %s\n' "${failed_array[*]}"
  exit "${warning_exit}"
fi

printf 'OK - Systemd units are active\n'
exit "${ok_exit}"

View file

@ -57,3 +57,10 @@ define service {
use defaulttemplate
}
# Service "Systemd Units" for the pagure hostgroup: runs the
# check_systemd_units command through check_by_nrpe, inheriting the
# remaining settings from defaulttemplate.
define service {
hostgroup pagure
service_description Systemd Units
check_command check_by_nrpe!check_systemd_units
use defaulttemplate
}

View file

@ -421,7 +421,7 @@ command[check_fedmsg_cbacklog_mbs_backend_hub]=/usr/lib64/nagios/plugins/check_f
command[check_fedmsg_fmn_digest_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 90 600
command[check_fedmsg_fmn_confirm_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300
command[check_systemd_units]=/usr/lib64/nagios/plugins/check_systemd_units
# The following are 'action commands' where by an actual action is performed
# like restarting httpd