Try to fix all our nagios downtime/uptime toggles.

This commit is contained in:
Ralph Bean 2015-05-04 14:38:03 +00:00
parent 81982de371
commit 5a1017b916
9 changed files with 26 additions and 40 deletions

View file

@@ -8,7 +8,7 @@
 tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=60 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=60 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -22,6 +22,6 @@
 command: ntpdate -u 66.187.233.4
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -23,7 +23,7 @@
 when: install_packages_indexer
 - name: tell nagios to shush for these hosts
-nagios: action=downtime minutes=300 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=300 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -91,6 +91,6 @@
 - fcomm-cache-worker
 - name: tell nagios to start bothering us again
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -59,7 +59,7 @@
 tasks:
 - name: schedule a 15 minute downtime. give notifs backend time to start up.
-nagios: action=downtime minutes=15 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=15 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -68,7 +68,7 @@
 pre_tasks:
 - name: tell nagios to shush w.r.t. the frontend
-nagios: action=downtime minutes=15 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=15 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -90,7 +90,7 @@
 pre_tasks:
 - name: tell nagios to shush w.r.t. the backend
-nagios: action=downtime minutes=15 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=15 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -111,7 +111,7 @@
 post_tasks:
 - name: tell nagios to unshush w.r.t. the backend
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -130,6 +130,6 @@
 post_tasks:
 - name: tell nagios to unshush w.r.t. the frontend
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -40,7 +40,7 @@
 - include: "{{ handlers }}/restart_services.yml"
 pre_tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=120 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=120 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
 roles:
@@ -59,7 +59,7 @@
 - include: "{{ handlers }}/restart_services.yml"
 pre_tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=120 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=120 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
 roles:
@@ -78,7 +78,7 @@
 - include: "{{ handlers }}/restart_services.yml"
 pre_tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=120 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=120 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
 roles:
@@ -97,7 +97,7 @@
 - include: "{{ handlers }}/restart_services.yml"
 pre_tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=120 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=120 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
 roles:
@@ -118,7 +118,7 @@
 post_tasks:
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -137,7 +137,7 @@
 - service: name="httpd" state=started
 post_tasks:
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -154,6 +154,6 @@
 - service: name="fedmsg-hub" state=started
 post_tasks:
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -33,7 +33,7 @@
 pre_tasks:
 - name: tell nagios to shush w.r.t. the frontend
-nagios: action=downtime minutes=15 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=15 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -55,7 +55,7 @@
 pre_tasks:
 - name: tell nagios to shush w.r.t. the backend
-nagios: action=downtime minutes=15 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=15 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -78,7 +78,7 @@
 # up anyways, so just let the downtime expire.
 #post_tasks:
 #- name: tell nagios to unshush w.r.t. the backend
-# nagios: action=unsilence service=host host={{ inventory_hostname }}
+# nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 # delegate_to: noc01.phx2.fedoraproject.org
 # ignore_errors: true
@@ -97,6 +97,6 @@
 post_tasks:
 - name: tell nagios to unshush w.r.t. the frontend
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -33,7 +33,7 @@
 pre_tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=60 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=60 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -43,6 +43,6 @@
 post_tasks:
 - service: name="fedmsg-hub" state=restarted
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -43,16 +43,9 @@
 tasks:
 - name: schedule regular host downtime
-nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}
+nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
-when: inventory_hostname.find('.stg.') == -1
-- name: schedule stg host downtime
-nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}.stg
-delegate_to: noc01.phx2.fedoraproject.org
-ignore_errors: true
-when: inventory_hostname.find('.stg.') != -1
 - name: halt the vm instances - to poweroff
 command: /sbin/shutdown -h 1
@@ -75,7 +68,7 @@
 tasks:
 - name: tell nagios to shush
-nagios: action=downtime minutes=60 service=host host={{ inventory_hostname }}
+nagios: action=downtime minutes=60 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
@@ -100,7 +93,7 @@
 when: inventory_hostname_short.startswith('serverbeach')
 - name: tell nagios to unshush
-nagios: action=unsilence service=host host={{ inventory_hostname }}
+nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true

View file

@@ -27,16 +27,9 @@
 tasks:
 - name: schedule regular host downtime
-nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}
+nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}{{ env_suffix }}
 delegate_to: noc01.phx2.fedoraproject.org
 ignore_errors: true
-when: inventory_hostname.find('.stg.') == -1
-- name: schedule stg host downtime
-nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}.stg
-delegate_to: noc01.phx2.fedoraproject.org
-ignore_errors: true
-when: inventory_hostname.find('.stg.') != -1
 - name: expire-caches
 command: yum clean expire-cache