From 890dd31cb0a58e1b2f692287b9b38fd3a549b968 Mon Sep 17 00:00:00 2001
From: seddikalaouiismaili
Date: Sun, 20 Dec 2020 01:10:52 +0100
Subject: [PATCH] script to monitor systemd units on pagure

---
 roles/nagios_client/tasks/main.yml            | 11 ++++
 .../templates/check_systemd_units.cfg.j2      |  1 +
 .../files/nagios/plugins/check_systemd_units  | 52 +++++++++++++++++++
 .../files/nagios/services/procs.cfg           |  7 +++
 .../nagios_server/templates/nrpe/nrpe.cfg.j2  |  2 +-
 5 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 roles/nagios_client/templates/check_systemd_units.cfg.j2
 create mode 100755 roles/nagios_server/files/nagios/plugins/check_systemd_units

diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml
index 490919d050..62a9be980e 100644
--- a/roles/nagios_client/tasks/main.yml
+++ b/roles/nagios_client/tasks/main.yml
@@ -304,6 +304,17 @@
   tags:
   - nagios_client
 
+- name: install nrpe check for systemd units
+  template:
+    src={{ item }}.j2 dest=/etc/nrpe.d/{{ item }} owner=root group=root mode=0644
+  with_items:
+    - check_systemd_units.cfg
+  when: inventory_hostname.startswith('pagure02')
+  notify:
+    - restart nrpe
+  tags:
+    - nagios_client
+
 - name: Check if the fedmsg group exists
   shell: /usr/bin/getent group fedmsg | /usr/bin/wc -l | tr -d ' '
   register: fedmsg_exists
diff --git a/roles/nagios_client/templates/check_systemd_units.cfg.j2 b/roles/nagios_client/templates/check_systemd_units.cfg.j2
new file mode 100644
index 0000000000..834de6c86f
--- /dev/null
+++ b/roles/nagios_client/templates/check_systemd_units.cfg.j2
@@ -0,0 +1 @@
+command[check_systemd_units]=/usr/lib64/nagios/plugins/check_systemd_units
diff --git a/roles/nagios_server/files/nagios/plugins/check_systemd_units b/roles/nagios_server/files/nagios/plugins/check_systemd_units
new file mode 100755
index 0000000000..accdc9ab20
--- /dev/null
+++ b/roles/nagios_server/files/nagios/plugins/check_systemd_units
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# Description : Nagios/NRPE check for the status of the pagure systemd units.
+# Every unit found in the "failed" state is restarted once; units whose
+# restart fails are reported with a WARNING.
+
+# Author : Seddik Alaoui Ismaili
+# Version : 1.0
+
+
+# Exit codes
+warning_exit="1"
+ok_exit="0"
+
+# Unit list
+unit_list=(pagure_ci
+pagure_ev
+pagure_fast_worker
+pagure_loadjson
+pagure_logcom
+pagure_medium_worker
+pagure_milter
+pagure_mirror
+pagure_slow_worker
+pagure_webhook
+pagure_worker
+pagure_mirror_project_in.timer)
+
+# Units that are still failed after the restart attempt
+failed_array=()
+
+# Check the status of each unit
+for element in "${unit_list[@]}"; do
+    # "is-failed" exits 0 only for a unit in the "failed" state, so a unit
+    # without a readable state is skipped instead of breaking the test.
+    if systemctl is-failed --quiet "${element}"; then
+        # Explicit if: with "a && b || c", c would also run when b fails.
+        if ! systemctl restart "${element}"; then
+            failed_array+=("${element}")
+        fi
+    fi
+done
+
+# Print the result and exit code for nagios. Nothing else may be written to
+# stdout: nagios takes the first line of output as the status text.
+if [ "${#failed_array[@]}" -ne 0 ]; then
+    printf '%s\n' "WARNING - Failed systemd units after restart : ${failed_array[*]}"
+    exit "${warning_exit}"
+fi
+
+printf '%s\n' "OK - Systemd units are active"
+exit "${ok_exit}"
diff --git a/roles/nagios_server/files/nagios/services/procs.cfg b/roles/nagios_server/files/nagios/services/procs.cfg
index 23f99ecfa6..d44caca3fc 100644
--- a/roles/nagios_server/files/nagios/services/procs.cfg
+++ b/roles/nagios_server/files/nagios/services/procs.cfg
@@ -57,3 +57,10 @@ define service {
   use                   defaulttemplate
 }
 
+define service {
+  hostgroup             pagure
+  service_description   Systemd Units
+  check_command         check_by_nrpe!check_systemd_units
+  use                   defaulttemplate
+}
+
diff --git a/roles/nagios_server/templates/nrpe/nrpe.cfg.j2 b/roles/nagios_server/templates/nrpe/nrpe.cfg.j2
index 3ac9b04bfa..e31505bf33 100644
--- a/roles/nagios_server/templates/nrpe/nrpe.cfg.j2
+++ b/roles/nagios_server/templates/nrpe/nrpe.cfg.j2
@@ -421,7 +421,7 @@ command[check_fedmsg_cbacklog_mbs_backend_hub]=/usr/lib64/nagios/plugins/check_f
 command[check_fedmsg_fmn_digest_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub DigestProducer 90 600
 command[check_fedmsg_fmn_confirm_last_ran]=/usr/lib64/nagios/plugins/check_fedmsg_producer_last_ran.py fedmsg-hub ConfirmationProducer 30 300
-
+command[check_systemd_units]=/usr/lib64/nagios/plugins/check_systemd_units
 # The following are 'action commands' where by an actual action is performed
 # like restarting httpd