From e26ead0f70fcfa5e083f93dbb2d1ba0667d14d59 Mon Sep 17 00:00:00 2001 From: Stephen Smoogen Date: Mon, 8 Jun 2020 11:17:14 -0400 Subject: [PATCH] try and get nagios working on noc01.iad2 --- playbooks/groups/noc.yml | 7 +- .../nagios/services/iad2_internal/basset.cfg | 27 + .../services/iad2_internal/certgetter.cfg | 6 + .../services/iad2_internal/db_backups.cfg | 6 + .../nagios/services/iad2_internal/disk.cfg | 76 +++ .../nagios/services/iad2_internal/fedmsg.cfg | 487 ++++++++++++++++++ .../services/iad2_internal/file_age.cfg | 45 ++ .../nagios/services/iad2_internal/fmn.cfg | 20 + .../nagios/services/iad2_internal/koji.cfg | 16 + .../nagios/services/iad2_internal/locking.cfg | 13 + .../nagios/services/iad2_internal/mailman.cfg | 7 + .../nagios/services/iad2_internal/nrpe.cfg | 8 + .../nagios/services/iad2_internal/osbs.cfg | 7 + .../nagios/services/iad2_internal/pgsql.cfg | 14 + .../services/iad2_internal/rabbitmq.cfg | 96 ++++ roles/nagios_server/tasks/main.yml | 8 + .../nagios/hostgroups/all-iad2.cfg.j2 | 46 ++ .../nagios/plugins/check_koji.j2} | 3 +- 18 files changed, 886 insertions(+), 6 deletions(-) create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/basset.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/certgetter.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/db_backups.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/disk.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/fedmsg.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/file_age.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/fmn.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/koji.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/locking.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/mailman.cfg create 
mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/nrpe.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/osbs.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/pgsql.cfg create mode 100644 roles/nagios_server/files/nagios/services/iad2_internal/rabbitmq.cfg create mode 100644 roles/nagios_server/templates/nagios/hostgroups/all-iad2.cfg.j2 rename roles/nagios_server/{files/nagios/plugins/check_koji => templates/nagios/plugins/check_koji.j2} (69%) diff --git a/playbooks/groups/noc.yml b/playbooks/groups/noc.yml index d01b072350..2a19f12378 100644 --- a/playbooks/groups/noc.yml +++ b/playbooks/groups/noc.yml @@ -23,8 +23,7 @@ - collectd/base - { role: rsyncd, when: datacenter == 'phx2' or datacenter == 'iad2' } - sudo - - { role: openvpn/client, - when: env != "staging" } + - { role: openvpn/client, when: env != "staging" } - mod_wsgi - role: keytab/service owner_user: apache @@ -63,9 +62,9 @@ roles: - { role: dhcp_server, when: datacenter == 'phx2' or datacenter == 'iad2' } - { role: tftp_server, when: datacenter == 'phx2' or datacenter == 'iad2' } - - { role: nagios_server, when: datacenter == 'phx2' or datacenter == 'ibiblio'} + - { role: nagios_server } - { role: fedmsg/base, when: deployment_type == "prod" and datacenter != 'iad2'} - - { role: rabbit/user, when: deployment_type == "stg" } + - { role: rabbit/user, when: deployment_type == "stg" or datacenter == 'iad2'} tasks: - name: install some packages which arent in playbooks diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/basset.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/basset.cfg new file mode 100644 index 0000000000..ba0bc9ff28 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/basset.cfg @@ -0,0 +1,27 @@ +define service { + hostgroup_name basset + service_description mongo process + check_command check_by_nrpe!check_mongo_proc + use defaulttemplate +} + +define 
service { + hostgroup_name basset + service_description rabbitmq process + check_command check_by_nrpe!check_rabbitmq_proc + use defaulttemplate +} + +define service { + hostgroup_name basset + service_description basset worker processes + check_command check_by_nrpe!check_worker_proc + use defaulttemplate +} + +define service { + hostgroup_name basset + service_description basset processing queue + check_command check_by_nrpe!check_basset_queue + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/certgetter.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/certgetter.cfg new file mode 100644 index 0000000000..d258b3ec76 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/certgetter.cfg @@ -0,0 +1,6 @@ +define service { + host_name certgetter01.iad2.fedoraproject.org + service_description certgetter-http + check_command check_http!certgetter01.iad2.fedoraproject.org + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/db_backups.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/db_backups.cfg new file mode 100644 index 0000000000..1685af994a --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/db_backups.cfg @@ -0,0 +1,6 @@ +define service { + host_name db03.iad2.fedoraproject.org + service_description Check MySQL Backup + check_command check_by_nrpe!check_mysql_backup + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/disk.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/disk.cfg new file mode 100644 index 0000000000..ca69054b8e --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/disk.cfg @@ -0,0 +1,76 @@ +define service { + hostgroup_name all, !mincheckgrp + service_description Disk_Space_/ + check_command check_by_nrpe!check_disk_/ + use disktemplate +} + +define service { + hostgroup_name all, !mincheckgrp + service_description Disk 
Space /boot + check_command check_by_nrpe!check_disk_/boot + use disktemplate +} + +define service { + hostgroup_name qahardware + service_description Disk Space /srv + check_command check_by_nrpe!check_disk_/srv + use disktemplate +} + +define service { + host_name log01.iad2.fedoraproject.org + service_description Disk space /var/log + check_command check_by_nrpe!check_disk_/var/log + use disktemplate +} + +define service { + hostgroup_name pkgs + service_description Check read-only filesystem + check_command check_by_nrpe!check_readonly_fs + use disktemplate +} + +define service { + hostgroup_name pkgs + service_description Disk space /srv/cache/lookaside + check_command check_by_nrpe!check_disk_/srv/cache/lookaside + use disktemplate +} + +define service { + hostgroup_name koji + service_description Disk space / + check_command check_by_nrpe!check_disk_/ + use ppc-secondarytemplate +} + +define service { + hostgroup_name retrace + service_description Disk space / + check_command check_by_nrpe!check_disk_/ + use retracetemplate +} + +define service { + hostgroup_name retrace + service_description Disk Space for huge /srv + check_command check_by_nrpe!check_disk_huge_/srv + use disktemplate +} + +define service { + hostgroup_name people + service_description Disk space /project + check_command check_by_nrpe!check_disk_/project/ + use disktemplate +} + +define service { + hostgroup_name oci_registry + service_description Disk space /srv/registry + check_command check_by_nrpe!check_disk_/srv/registry + use disktemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/fedmsg.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/fedmsg.cfg new file mode 100644 index 0000000000..5000f12e35 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/fedmsg.cfg @@ -0,0 +1,487 @@ +## There are lots of different sections in this now-enormous file +## Each one starts with a 'BEGIN' comment. 
+ + +# BEGIN, check for the existence of processes +define service { + host_name value01.iad2.fedoraproject.org + service_description Check for fedmsg-irc proc + check_command check_by_nrpe!check_fedmsg_irc_proc + use defaulttemplate +} + +define service { + hostgroup_name proxies + service_description Check fedmsg-gateway consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_gateway + use defaulttemplate +} + +define service { + hostgroup_name proxies + service_description Check for existence fedmsg-gateway proc + check_command check_by_nrpe!check_fedmsg_gateway_proc + use defaulttemplate +} + +define service { + hostgroup_name proxies + service_description Check fedmsg consumers and producers gateway + check_command check_by_nrpe!check_fedmsg_cp_busgateway_gateway + use defaulttemplate +} + +define service { + hostgroup_name proxies + service_description Check fedmsg-gateway consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_gateway + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check for fedmsg-gateway proc + check_command check_by_nrpe!check_fedmsg_gateway_proc + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check for fedmsg-relay proc + check_command check_by_nrpe!check_fedmsg_relay_proc + use defaulttemplate +} + +define service { + host_name badges-backend01.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name pkgs02.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use 
defaulttemplate +} + +define service { + host_name fedimg01.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + +define service { + host_name packages03.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} +define service { + host_name pdc-backend01.iad2.fedoraproject.org + service_description Check for fedmsg-hub proc + check_command check_by_nrpe!check_fedmsg_hub_proc + use defaulttemplate +} + + +# Odd one, check for the supybot fedmsg plugin +define service { + host_name value01.iad2.fedoraproject.org + service_description Check supybot fedmsg plugin + check_command check_by_nrpe!check_supybot_fedmsg_plugin + use defaulttemplate +} + + +# BEGIN, check datanommer history +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent buildsys/koji messages + check_command check_by_nrpe!check_datanommer_buildsys + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent git messages + check_command check_by_nrpe!check_datanommer_git + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent bodhi compose messages + check_command check_by_nrpe!check_datanommer_bodhi_composes + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent bodhi messages + check_command check_by_nrpe!check_datanommer_bodhi + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent wiki messages + check_command check_by_nrpe!check_datanommer_wiki + use defaulttemplate +} +define service { + host_name 
busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent compose messages + check_command check_by_nrpe!check_datanommer_compose + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent meetbot messages + check_command check_by_nrpe!check_datanommer_meetbot + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fas messages + check_command check_by_nrpe!check_datanommer_fas + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fedoraplanet messages + check_command check_by_nrpe!check_datanommer_planet + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent copr finished build messages + check_command check_by_nrpe!check_datanommer_copr + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fedbadges messages + check_command check_by_nrpe!check_datanommer_fedbadges + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fedocal messages + check_command check_by_nrpe!check_datanommer_fedocal + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent ansible messages + check_command check_by_nrpe!check_datanommer_ansible + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fedimg messages + check_command check_by_nrpe!check_datanommer_fedimg + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer 
for recent hotness messages + check_command check_by_nrpe!check_datanommer_hotness + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent faf messages + check_command check_by_nrpe!check_datanommer_faf + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent mailman messages + check_command check_by_nrpe!check_datanommer_mailman + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent bugzilla messages + check_command check_by_nrpe!check_datanommer_bugzilla + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent github messages + check_command check_by_nrpe!check_datanommer_github + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent kerneltest messages + check_command check_by_nrpe!check_datanommer_kerneltest + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent fmn messages + check_command check_by_nrpe!check_datanommer_fmn + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent rpm sign messages + check_command check_by_nrpe!check_datanommer_rpmsign + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent mdapi messages + check_command check_by_nrpe!check_datanommer_mdapi + use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent greenwave messages + check_command check_by_nrpe!check_datanommer_greenwave 
+ use defaulttemplate +} +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check datanommer for recent resultsdb messages + check_command check_by_nrpe!check_datanommer_resultsdb + use defaulttemplate +} + + +# BEGIN, check consumers and producers +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers relay + check_command check_by_nrpe!check_fedmsg_cp_busgateway_relay + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers gateway + check_command check_by_nrpe!check_fedmsg_cp_busgateway_gateway + use defaulttemplate +} + +define service { + host_name value01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers irc + check_command check_by_nrpe!check_fedmsg_cp_value + use defaulttemplate +} + +define service { + host_name badges-backend01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_badges_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_bugzilla2fedmsg + use defaulttemplate +} + +define service { + host_name fedimg01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_fedimg_backend + use defaulttemplate +} + +define service { + host_name packages03.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_packages_backend 
+ use defaulttemplate +} +define service { + host_name pdc-backend01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_pdc_backend + use defaulttemplate +} +define service { + host_name mbs-backend01.iad2.fedoraproject.org + service_description Check fedmsg consumers and producers hub + check_command check_by_nrpe!check_fedmsg_cp_mbs_backend + use defaulttemplate +} + + +# BEGIN exceptions counter +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-relay consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_relay + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-gateway consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_busgateway_gateway + use defaulttemplate +} + +define service { + host_name value01.iad2.fedoraproject.org + service_description Check fedmsg-irc consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_value + use defaulttemplate +} + +define service { + host_name badges-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_badges_backend + use defaulttemplate +} + +define service { + host_name notifs-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_notifs_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command 
check_by_nrpe!check_fedmsg_cexceptions_bugzilla2fedmsg + use defaulttemplate +} + +define service { + host_name fedimg01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_fedimg_backend + use defaulttemplate +} + +define service { + host_name packages03.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_packages_backend + use defaulttemplate +} +define service { + host_name pdc-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_pdc_backend + use defaulttemplate +} +define service { + host_name mbs-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers exceptions + check_command check_by_nrpe!check_fedmsg_cexceptions_mbs_backend + use defaulttemplate +} + + + +# BEGIN backlog checking +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_hub + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-relay consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_relay + use defaulttemplate +} + +define service { + host_name busgateway01.iad2.fedoraproject.org + service_description Check fedmsg-gateway consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_busgateway_gateway + use defaulttemplate +} + +define service { + host_name value01.iad2.fedoraproject.org + service_description Check fedmsg-irc consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_value + use defaulttemplate +} + +define service { + host_name badges-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + 
check_command check_by_nrpe!check_fedmsg_cbacklog_badges_backend + use defaulttemplate +} + +define service { + host_name notifs-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_notifs_backend + use defaulttemplate +} + +define service { + host_name bugzilla2fedmsg01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_bugzilla2fedmsg + use defaulttemplate +} + +define service { + host_name fedimg01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_fedimg_backend + use defaulttemplate +} + +define service { + host_name packages03.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_packages_backend + use defaulttemplate +} + +define service { + host_name pdc-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_pdc_backend + use defaulttemplate +} + +define service { + host_name mbs-backend01.iad2.fedoraproject.org + service_description Check fedmsg-hub consumers backlog + check_command check_by_nrpe!check_fedmsg_cbacklog_mbs_backend + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/file_age.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/file_age.cfg new file mode 100644 index 0000000000..fdf25e043a --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/file_age.cfg @@ -0,0 +1,45 @@ +define service { + hostgroup_name proxies + service_description Check MirrorList 1 Cache + check_command check_by_nrpe!check_mirrorlist1_cache + use defaulttemplate + check_interval 120 + notification_interval 130 +} + +define service { + hostgroup_name proxies + service_description Check 
MirrorList 2 Cache + check_command check_by_nrpe!check_mirrorlist2_cache + use defaulttemplate + check_interval 120 + notification_interval 130 +} + +define service { + hostgroup_name proxies + service_description Check TicketKey age + check_command check_by_nrpe!check_ticketkey_age + use defaulttemplate + check_interval 120 + notification_interval 130 +} + +define service { + hostgroup_name proxies + service_description Check ostree summary age + check_command check_by_nrpe!check_ostree_summary_file_age + use defaulttemplate + check_interval 120 + notification_interval 130 +} + +define service { + host_name log01.iad2.fedoraproject.org + service_description Check Merged Log + check_command check_by_nrpe!check_merged_file_age + use defaulttemplate + check_interval 120 + notification_interval 130 + event_handler restart_rsyslog +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/fmn.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/fmn.cfg new file mode 100644 index 0000000000..8130bf6fa9 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/fmn.cfg @@ -0,0 +1,20 @@ +define service { + host_name notifs-backend01.iad2.fedoraproject.org + service_description Check backend irc queue size + check_command check_by_nrpe!check_fmn_backend_irc_queue + use defaulttemplate +} + +define service { + host_name notifs-backend01.iad2.fedoraproject.org + service_description Check backend email queue size + check_command check_by_nrpe!check_fmn_backend_email_queue + use defaulttemplate +} + +define service { + host_name notifs-backend01.iad2.fedoraproject.org + service_description Check worker queue size + check_command check_by_nrpe!check_fmn_worker_queue + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/koji.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/koji.cfg new file mode 100644 index 0000000000..3ed70cfdff --- /dev/null +++ 
b/roles/nagios_server/files/nagios/services/iad2_internal/koji.cfg @@ -0,0 +1,16 @@ +define service { + host_name koji01.iad2.fedoraproject.org + service_description Check Koji + check_command check_koji + max_check_attempts 5 + use criticaltemplate +} + + +define service { + host_name koji01.iad2.fedoraproject.org + service_description Check Koji wellness + check_command check_koji_wellness.py!koji.fedoraproject.org!koji + max_check_attempts 5 + use criticaltemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/locking.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/locking.cfg new file mode 100644 index 0000000000..5be7264ade --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/locking.cfg @@ -0,0 +1,13 @@ +define service { + host_name rawhide-composer.iad2.fedoraproject.org, koji01.phx2.fedoraproject.org + service_description Check NFS File Locks + check_command check_by_nrpe!check_lock + use criticaltemplate +} + +define service { + host_name fas01.iad2.fedoraproject.org + service_description Check certificate lock + check_command check_by_nrpe!check_lock_file_age + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/mailman.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/mailman.cfg new file mode 100644 index 0000000000..97adc9f3b3 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/mailman.cfg @@ -0,0 +1,7 @@ +define service { + host_name mailman01.iad2.fedoraproject.org + service_description check mailman api + check_command check_by_nrpe!check_mailman_api + max_check_attempts 5 + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/nrpe.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/nrpe.cfg new file mode 100644 index 0000000000..735e31af5b --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/nrpe.cfg @@ -0,0 +1,8 @@ +define service { + 
host_name bastion02.iad2.fedoraproject.org, bastion01.phx2.fedoraproject.org, sundries01.phx2.fedoraproject.org, sundries02.phx2.fedoraproject.org, wiki01.phx2.fedoraproject.org, wiki02.phx2.fedoraproject.org + service_description nrpe + check_command test_nrpe + max_check_attempts 2 + check_interval 2 + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/osbs.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/osbs.cfg new file mode 100644 index 0000000000..15db42ce1c --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/osbs.cfg @@ -0,0 +1,7 @@ +define service { + host_name osbs-master01.iad2.fedoraproject.org + service_description Check OSBS API endpoint paths + check_command check_by_nrpe!check_osbs_api + max_check_attempts 5 + use defaulttemplate +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/pgsql.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/pgsql.cfg new file mode 100644 index 0000000000..5e8e40d545 --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/pgsql.cfg @@ -0,0 +1,14 @@ +define service { + host_name db-koji01.iad2.fedoraproject.org + service_description Check Koji DB + check_command check_pgsql!koji + use criticaltemplate +} + +define service { + host_name db-fas01.iad2.fedoraproject.org + service_description Check FAS DB + check_command check_pgsql!fas2 + use criticaltemplate + servicegroups fas +} diff --git a/roles/nagios_server/files/nagios/services/iad2_internal/rabbitmq.cfg b/roles/nagios_server/files/nagios/services/iad2_internal/rabbitmq.cfg new file mode 100644 index 0000000000..c458b3955d --- /dev/null +++ b/roles/nagios_server/files/nagios/services/iad2_internal/rabbitmq.cfg @@ -0,0 +1,96 @@ +# RabbitMQ processes (for each host) + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus server processes + check_command check_by_nrpe!check_rabbitmq_server + use 
defaulttemplate +} + +define service { + host_name rabbitmq02.iad2.fedoraproject.org + service_description Check bus server processes + check_command check_by_nrpe!check_rabbitmq_server + use defaulttemplate +} + +define service { + host_name rabbitmq03.iad2.fedoraproject.org + service_description Check bus server processes + check_command check_by_nrpe!check_rabbitmq_server + use defaulttemplate +} + +# RabbitMQ alarms (for each host) + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus server alarms + check_command check_by_nrpe!check_rabbitmq_watermark + use defaulttemplate +} + +define service { + host_name rabbitmq02.iad2.fedoraproject.org + service_description Check bus server alarms + check_command check_by_nrpe!check_rabbitmq_watermark + use defaulttemplate +} + +define service { + host_name rabbitmq03.iad2.fedoraproject.org + service_description Check bus server alarms + check_command check_by_nrpe!check_rabbitmq_watermark + use defaulttemplate +} + +# The following results are cluster-wide, no need to run them on each cluster member + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus cluster + check_command check_by_nrpe!check_rabbitmq_cluster + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus cluster connections + check_command check_by_nrpe!check_rabbitmq_connections + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus cluster overview + check_command check_by_nrpe!check_rabbitmq_overview + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus exchanges in /pubsub + check_command check_by_nrpe!check_rabbitmq_exchange_pubsub + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus exchanges in 
/public_pubsub + check_command check_by_nrpe!check_rabbitmq_exchange_public_pubsub + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus queues in /pubsub + check_command check_by_nrpe!check_rabbitmq_queue_pubsub + use defaulttemplate +} + +define service { + host_name rabbitmq01.iad2.fedoraproject.org + service_description Check bus queues in /public_pubsub + check_command check_by_nrpe!check_rabbitmq_queue_public_pubsub + use defaulttemplate +} \ No newline at end of file diff --git a/roles/nagios_server/tasks/main.yml b/roles/nagios_server/tasks/main.yml index 56b69f5fbe..5a7027bec3 100644 --- a/roles/nagios_server/tasks/main.yml +++ b/roles/nagios_server/tasks/main.yml @@ -330,6 +330,14 @@ - nagios_config notify: restart nagios +# dest names the file explicitly: a bare directory would install the plugin under the template basename (check_koji.j2) +- name: Template over plugins + template: src=nagios/plugins/{{item}}.j2 dest=/usr/lib64/nagios/plugins/{{item}} mode=0755 owner=root group=root + with_items: + - check_koji + tags: + - nagios_server + + - name: Template over commands template: src=nagios/commands/{{item}}.j2 dest=/etc/nagios/commands/{{item}} mode=0644 owner=root group=root with_items: diff --git a/roles/nagios_server/templates/nagios/hostgroups/all-iad2.cfg.j2 b/roles/nagios_server/templates/nagios/hostgroups/all-iad2.cfg.j2 new file mode 100644 index 0000000000..63816f2877 --- /dev/null +++ b/roles/nagios_server/templates/nagios/hostgroups/all-iad2.cfg.j2 @@ -0,0 +1,46 @@ +############### +# All Servers and associated devices +############### +## {{ env }} +{% for key, value in groups.items()|sort %} +{% if groups[key] != [] and key not in vars['exclude_hostgroups'] %} +define hostgroup{ + hostgroup_name {{ key }} + alias {{ key }} + members {% for host in groups[key]|sort %}{% if hostvars[host].nagios_Can_Connect == true %}{{host}}, {% endif %}{% endfor %} + +} +{% endif %} +{% endfor %} + +define hostgroup{ + hostgroup_name no_ping + alias no_ping + members {% for host in groups['all']|sort %}{% if 
hostvars[host].nagios_Check_Services['ping'] == true or hostvars[host].nagios_Can_Connect == true %}{{host}}, {% endif %}{% endfor %} +} + +## Services with minimal monitoring + +define hostgroup{ + hostgroup_name cloud_aws_group + alias cloud_aws_group + members {% for host in groups['cloud_aws']|sort %}{{host}}, {% endfor %} +} + + +define hostgroup{ + hostgroup_name mincheckgrp + alias mincheckgrp + members {% for host in groups['all']|sort %}{% if hostvars[host].nagios_Check_Services['nrpe'] != true and hostvars[host].nagios_Can_Connect == true %}{{host}}, {% endif %}{% endfor %} + +} + +define hostgroup{ + hostgroup_name routers + alias routers + members phx2-gw, ibiblio-gw, dedicated-gw, host1plus-gw, internetx-gw, osuosl-gw, rdu-gw, rdu-cc-gw, iad2-gw +} + + +## +## Management hardware diff --git a/roles/nagios_server/files/nagios/plugins/check_koji b/roles/nagios_server/templates/nagios/plugins/check_koji.j2 similarity index 69% rename from roles/nagios_server/files/nagios/plugins/check_koji rename to roles/nagios_server/templates/nagios/plugins/check_koji.j2 index 3113d609db..11fbfea8b4 100755 --- a/roles/nagios_server/files/nagios/plugins/check_koji +++ b/roles/nagios_server/templates/nagios/plugins/check_koji.j2 @@ -1,6 +1,6 @@ #!/bin/bash -FAILURES=$(/usr/bin/wget --timeout=60 -q -O- http://koji.phx2.fedoraproject.org/koji/builds | /bin/grep -c failed.png) +FAILURES=$(/usr/bin/wget --timeout=60 -q -O- http://koji.{{ datacenter }}.fedoraproject.org/koji/builds | /bin/grep -c failed.png) WARNING=20 CRITICAL=25 @@ -16,4 +16,3 @@ else echo "Koji: OK failed builds: $FAILURES" exit 0 fi -