diff --git a/inventory/inventory b/inventory/inventory
index 6318444cef..6e2e103475 100644
--- a/inventory/inventory
+++ b/inventory/inventory
@@ -1180,3 +1180,16 @@ osbs-master01.stg.phx2.fedoraproject.org
 
 [docker-registry-stg]
 docker-registry01.stg.phx2.fedoraproject.org
+
+#
+# Hosts in this group have zombie processes for various reasons
+# and we do not want to alert on those, so the client nrpe.cfg template
+# uses this group to pick much higher zombie-process thresholds for them.
+#
+[zombie-infested]
+# anon git via systemd socket seems to get zombies from time to time
+pkgs02.phx2.fedoraproject.org
+# the openstack 5.0 vnc console viewer causes bunches of Zombies
+fed-cloud09.cloud.fedoraproject.org
+# Ansible from time to time in large runs has zombie threads
+batcave01.phx2.fedoraproject.org
diff --git a/roles/nagios/client/templates/nrpe.cfg.j2 b/roles/nagios/client/templates/nrpe.cfg.j2
index ca61669f1f..6fb16e59ea 100644
--- a/roles/nagios/client/templates/nrpe.cfg.j2
+++ b/roles/nagios/client/templates/nrpe.cfg.j2
@@ -201,7 +201,12 @@ include_dir=/etc/nrpe.d/
 command[check_users]={{ libdir }}/nagios/plugins/check_users -w 5 -c 10
 command[check_load]={{ libdir }}/nagios/plugins/check_load -w 15,10,5 -c 30,25,20
 command[check_hda1]={{ libdir }}/nagios/plugins/check_disk -w 20% -c 10% -p /dev/hda1
+{% if inventory_hostname not in groups['zombie-infested'] %}
 command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 5 -c 10 -s Z
+{% else %}
+# This host is in the zombie-infested inventory group: it is prone to zombies and we do not want to alert on them, so the limits are set very high
+command[check_zombie_procs]={{ libdir }}/nagios/plugins/check_procs -w 50000 -c 100000 -s Z
+{% endif %}
 command[check_total_procs]={{ libdir }}/nagios/plugins/check_procs -w {{ nrpe_procs_warn }} -c {{ nrpe_procs_crit }}