ansible/playbooks/vhost_reboot.yml

#
# This playbook lets you safely reboot a virthost and all its guests.
#
# requires --extra-vars="target=somevhost fqdn"
# Might add nodns=true or nonagios=true to the extra vars

#General overview:
# talk to the vhost
# get back list of instances
# add each of their hostnames to an ad hoc group
# halt each of them in a second play
# wait for them to die
# third play, reboot the vhost
#     wait for vhost to come back

# TODO: Figure out how to compare virt info pre and post boot.

# Play 1: ask the virthost named by the "target" extra var which guests are
# currently running, and collect them into the in-memory group "myvms_new"
# that the later plays operate on.
- name: find instances
  hosts: "{{ target }}"
  remote_user: root
  gather_facts: false

  tasks:
  - name: get list of guests
    virt:
      command: list_vms
      state: running
    register: vmlist

#  - name: get info on guests (prereboot)
#    virt: command=info
#    register: vminfo_pre

  # One add_host call per guest name returned by the virt lookup above.
  - name: add them to myvms_new group
    local_action:
      module: add_host
      hostname: "{{ item }}"
      groupname: myvms_new
    with_items: "{{ vmlist.list_vms }}"

# Call out to another playbook.  Disable any proxies that may live here.
# The variables are handed over through "vars:" because inline key=value
# parameters on the import_playbook line are deprecated in newer Ansible.
# Skipped when the caller passes nodns=true; "| bool" accepts a real boolean
# as well as any string spelling given via --extra-vars, and keeps this
# condition in step with the matching "enable" import later in the file.
- import_playbook: update-proxy-dns.yml
  vars:
    status: disable
    proxies: "myvms_new:&proxies"
  when: not (nodns | default(false) | bool)

# Play 2: working through the guests one at a time (serial: 1), drain the
# guest from OpenShift if it is an os-node, schedule nagios downtime for it,
# and then tell it to power off.
- name: halt instances
  hosts: myvms_new
  remote_user: root
  gather_facts: false
  serial: 1

  tasks:
  # Runs on the OpenShift master, not on the guest itself.
  - name: drain OS node if necessary
    command: oc adm drain {{ inventory_hostname }} --ignore-daemonsets --delete-local-data
    delegate_to: "os-master01{{ env_suffix }}.phx2.fedoraproject.org"
    when: inventory_hostname.startswith('os-node')

  # Errors are ignored so a nagios problem does not abort the shutdown.
  # "| bool" makes nonagios=false on the command line mean "do notify
  # nagios"; a bare "not nonagios" treats every non-empty string as true.
  - name: schedule regular host downtime
    nagios:
      action: downtime
      minutes: 30
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.phx2.fedoraproject.org
    ignore_errors: true
    when: not (nonagios | default(false) | bool)

  # if one of them is down we don't care, hence ignore_errors
  - name: halt the vm instances - to poweroff
    command: /sbin/shutdown -h 1
    ignore_errors: true

# Play 3: from the control machine, poll every guest's ssh port until it
# stops answering.  There is no "serial" on this play, so the guests are
# not processed one at a time as they were in the halt play.
- name: wait for the whole set to die.
  hosts: myvms_new
  remote_user: root
  gather_facts: false

  tasks:
  # First check after 30 seconds; give up after 300 seconds.
  - name: wait for them to die
    local_action:
      module: wait_for
      host: "{{ inventory_hostname }}"
      port: 22
      state: stopped
      delay: 30
      timeout: 300

# Play 4: reboot the virthost itself, wait for ssh and libvirtd to return,
# record the guests it now reports in the "myvms_postreboot" group, resync
# the clock, and lift the nagios silence.
- name: reboot vhost
  hosts: "{{ target }}"
  remote_user: root
  gather_facts: false

  tasks:
  # "| bool" makes nonagios=false on the command line mean "do notify
  # nagios"; a bare "not nonagios" treats every non-empty string as true.
  - name: tell nagios to shush
    nagios:
      action: downtime
      minutes: 60
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.phx2.fedoraproject.org
    ignore_errors: true
    when: not (nonagios | default(false) | bool)

  - name: reboot the virthost
    command: /sbin/shutdown -r 1

  # Checked from the control machine.  The initial 120 second delay is
  # presumably there so the pre-reboot sshd is not mistaken for the host
  # having come back; search_regex requires an OpenSSH banner on the port.
  - name: wait for virthost to come back - up to 15 minutes
    local_action:
      module: wait_for
      host: "{{ target }}"
      port: 22
      delay: 120
      timeout: 900
      search_regex: OpenSSH

  - name: wait for libvirtd to come back on the virthost
    wait_for:
      path: /var/run/libvirtd.pid
      state: present
      delay: 10

  # No state filter here, unlike the pre-reboot lookup in the first play.
  - name: look up vmlist
    virt:
      command: list_vms
    register: newvmlist

  - name: add them to myvms_postreboot group
    local_action:
      module: add_host
      hostname: "{{ item }}"
      groupname: myvms_postreboot
    with_items: "{{ newvmlist.list_vms }}"

  - name: sync time
    command: ntpdate -u 1.rhel.pool.ntp.org

  # Same nonagios handling as the "shush" task at the top of this play.
  - name: tell nagios to unshush
    nagios:
      action: unsilence
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.phx2.fedoraproject.org
    ignore_errors: true
    when: not (nonagios | default(false) | bool)

# Play 5: one guest at a time, mark any OpenShift node schedulable again.
# The host list is the myvms_postreboot group, i.e. the guests the virthost
# reported after it came back up.
- name: post reboot tasks
  hosts: myvms_postreboot
  remote_user: root
  gather_facts: false
  serial: 1

  tasks:
  # Runs on the OpenShift master, and only for hosts named os-node*.
  - name: Add back to openshift
    command: oc adm uncordon {{ inventory_hostname }}
    delegate_to: "os-master01{{ env_suffix }}.phx2.fedoraproject.org"
    when: inventory_hostname.startswith('os-node')

# Call out to that dns playbook.  Put proxies back in now that they're back.
# The variables are handed over through "vars:" because inline key=value
# parameters on the import_playbook line are deprecated in newer Ansible.
# "| bool" is needed because a bare "not nodns" treats every non-empty
# string as true: with nodns=false on the command line the earlier "disable"
# import ran but this one was skipped, leaving the proxies disabled.
- import_playbook: update-proxy-dns.yml
  vars:
    status: enable
    proxies: "myvms_new:&proxies"
  when: not (nodns | default(false) | bool)

# Final play: run the restart_unbound tasks on guests that were on this
# virthost and are also members of the "unbound" inventory group.
- name: Fix unbound if necessary
  # intersection - hosts that are in our dynamic group and also in unbound-dns
  hosts: "myvms_new:&unbound"
  remote_user: root

  # Fact gathering is not switched off in this play; the second file name
  # below depends on the ansible_distribution fact.
  vars_files:
  - /srv/web/infra/ansible/vars/global.yml
  - "/srv/web/infra/ansible/vars/{{ ansible_distribution }}.yml"

  tasks:
  # tasks_path is presumably supplied by one of the vars files above - confirm.
  - import_tasks: "{{ tasks_path }}/restart_unbound.yml"

#  - name: get info on guests (postreboot)
#    virt: command=info
#    register: vminfo_post
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`#`
Death to all trailing whitespace. 2016-08-08 19:36:31 +00:00			`# This playbook lets you safely reboot a virthost and all it's guests.`
			`#`
migrate all the script-like playbooks such that the primary host-spec is the same variable: $target 2013-03-04 22:37:13 +00:00			`# requires --extra-vars="target=somevhost fqdn"`
Add nonagios to update and reboot playbooks 2015-07-22 21:52:30 +00:00			`# Might add nodns=true or nonagios=true to the extra vars`
update notes on what else needs to be done 2012-11-21 21:09:19 +00:00
			`#General overview:`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00			`# talk to the vhost`
			`# get back list of instances`
			`# add each of their hostnames to an addhoc group`
			`# halt each of them in a second play`
update notes on what else needs to be done 2012-11-21 21:09:19 +00:00			`# wait for them to die`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00			`# third play, reboot the vhost`
			`# wait for vhost to come back`
update notes on what else needs to be done 2012-11-21 21:09:19 +00:00
Death to all trailing whitespace. 2016-08-08 19:36:31 +00:00			`# TODO: Figure out how to compare virt info pre and post boot.`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00
			`- name: find instances`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`hosts: "{{ target }}"`
			`gather_facts: False`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00			`user: root`

			`tasks:`
			`- name: get list of guests`
Only reboot running vms Signed-off-by: Patrick Uiterwijk <puiterwijk@redhat.com> 2018-01-05 12:25:23 +00:00			`virt: command=list_vms state=running`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00			`register: vmlist`
typo in the yaml 2012-11-21 17:59:07 +00:00
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`# - name: get info on guests (prereboot)`
			`# virt: command=info`
			`# register: vminfo_pre`

update notes on what else needs to be done 2012-11-21 21:09:19 +00:00			`- name: add them to myvms_new group`
More fixes 2013-11-21 22:07:02 +00:00			`local_action: add_host hostname={{ item }} groupname=myvms_new`
fix vhost_reboot playbook syntax 2016-11-29 01:45:38 +00:00			`with_items: "{{ vmlist.list_vms }}"`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00
Include update-proxy-dns in vhost_reboot.yml. 2014-12-15 21:36:50 +00:00			`# Call out to another playbook. Disable any proxies that may live here`
fix up includes in vhost_reboot playbook 2018-04-04 20:07:03 +00:00			`- import_playbook: update-proxy-dns.yml status=disable proxies=myvms_new:&proxies`
You know the very powerful and the very stupid have one thing in common. They don't alter their views to fit the facts. They alter the facts to fit the views. Which can be uncomfortable if you happen to be one of the facts that needs altering. 2017-01-24 20:00:30 +00:00			`when: nodns is not defined or not "true" in nodns`
Include update-proxy-dns in vhost_reboot.yml. 2014-12-15 21:36:50 +00:00
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00			`- name: halt instances`
			`hosts: myvms_new`
			`user: root`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`gather_facts: False`
try out vhost-reboot with nagios cancelling of guests and vhost 2013-01-28 21:27:18 +00:00			`serial: 1`
trial vhost-reboot playbooks 2012-11-21 17:56:10 +00:00
			`tasks:`
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`- name: drain OS node if necessary`
			`command: oc adm drain {{inventory_hostname }} --ignore-daemonsets --delete-local-data`
another } - I am bad at this Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 03:22:55 +00:00			`delegate_to: os-master01{{env_suffix}}.phx2.fedoraproject.org`
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`when: inventory_hostname.startswith('os-node')`

Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`- name: schedule regular host downtime`
Try to fix all our nagios downtime/uptime toggles. 2015-05-04 14:38:03 +00:00			`nagios: action=downtime minutes=30 service=host host={{ inventory_hostname_short }}{{ env_suffix }}`
Readd nagios stuff here, it should work now. 2013-03-01 16:24:16 +00:00			`delegate_to: noc01.phx2.fedoraproject.org`
you have to ignore errors if you're rebooting nagios :) 2013-05-07 21:50:45 +00:00			`ignore_errors: true`
Rework vhost_reboot so that ansible 2.0 likes it. 2015-12-15 20:25:54 +00:00			`when: nonagios is not defined or not nonagios`
try out vhost-reboot with nagios cancelling of guests and vhost 2013-01-28 21:27:18 +00:00
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`- name: halt the vm instances - to poweroff`
Same trick for the guests. 2014-07-24 19:13:48 +00:00			`command: /sbin/shutdown -h 1`
more involved - temp check 2012-11-21 18:45:20 +00:00			`ignore_errors: true`
			`# if one of them is down we don't care`

Try to wait for downed vguests in parallel. 2013-11-21 10:24:17 +00:00			`- name: wait for the whole set to die.`
			`hosts: myvms_new`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`gather_facts: False`
Try to wait for downed vguests in parallel. 2013-11-21 10:24:17 +00:00			`user: root`

			`tasks:`
more involved - temp check 2012-11-21 18:45:20 +00:00			`- name: wait for them to die`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`local_action: wait_for port=22 delay=30 timeout=300 state=stopped host={{ inventory_hostname }}`
Try to wait for downed vguests in parallel. 2013-11-21 10:24:17 +00:00
echo'y 2012-11-21 18:22:03 +00:00			`- name: reboot vhost`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`hosts: "{{ target }}"`
			`gather_facts: False`
echo'y 2012-11-21 18:22:03 +00:00			`user: root`

			`tasks:`
Readd nagios stuff here, it should work now. 2013-03-01 16:24:16 +00:00			`- name: tell nagios to shush`
Try to fix all our nagios downtime/uptime toggles. 2015-05-04 14:38:03 +00:00			`nagios: action=downtime minutes=60 service=host host={{ inventory_hostname_short }}{{ env_suffix }}`
Readd nagios stuff here, it should work now. 2013-03-01 16:24:16 +00:00			`delegate_to: noc01.phx2.fedoraproject.org`
you have to ignore errors if you're rebooting nagios :) 2013-05-07 21:50:45 +00:00			`ignore_errors: true`
Rework vhost_reboot so that ansible 2.0 likes it. 2015-12-15 20:25:54 +00:00			`when: nonagios is not defined or not nonagios`
try out vhost-reboot with nagios cancelling of guests and vhost 2013-01-28 21:27:18 +00:00
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`- name: reboot the virthost`
Switch this back to copy for now 2014-07-24 17:26:26 +00:00			`command: /sbin/shutdown -r 1`
more involved - temp check 2012-11-21 18:45:20 +00:00
Fix unbound name in the playbook. 2014-03-31 22:47:37 +00:00			`- name: wait for virthost to come back - up to 15 minutes`
Switch this back to copy for now 2014-07-24 17:26:26 +00:00			`local_action: wait_for host={{ target }} port=22 delay=120 timeout=900 search_regex=OpenSSH`
more involved - temp check 2012-11-21 18:45:20 +00:00
Lets try this to fix issues 2013-12-19 20:25:13 +00:00			`- name: wait for libvirtd to come back on the virthost`
Add a small delay here in vhost_reboot 2014-08-12 16:54:28 +00:00			`wait_for: path=/var/run/libvirtd.pid state=present delay=10`
Lets try this to fix issues 2013-12-19 20:25:13 +00:00
not sure how to compare old to new, yet 2012-11-21 18:58:50 +00:00			`- name: look up vmlist`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`virt: command=list_vms`
more involved - temp check 2012-11-21 18:45:20 +00:00			`register: newvmlist`
update notes on what else needs to be done 2012-11-21 21:09:19 +00:00
do some magic to get it to see the post-reboot vms as a group Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 03:34:03 +00:00			`- name: add them to myvms_postreboot group`
			`local_action: add_host hostname={{ item }} groupname=myvms_postreboot`
			`with_items: "{{ newvmlist.list_vms }}"`

Add a ntpdate after the vhost comes back up. 2013-11-21 20:36:36 +00:00			`- name: sync time`
Fix these reboot playbooks for ntp too 2015-08-31 16:35:06 +00:00			`command: ntpdate -u 1.rhel.pool.ntp.org`
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00
Add a nagios unsush to reboot playbook 2014-10-01 01:53:33 +00:00			`- name: tell nagios to unshush`
Try to fix all our nagios downtime/uptime toggles. 2015-05-04 14:38:03 +00:00			`nagios: action=unsilence service=host host={{ inventory_hostname_short }}{{ env_suffix }}`
Add a nagios unsush to reboot playbook 2014-10-01 01:53:33 +00:00			`delegate_to: noc01.phx2.fedoraproject.org`
			`ignore_errors: true`
Rework vhost_reboot so that ansible 2.0 likes it. 2015-12-15 20:25:54 +00:00			`when: nonagios is not defined or not nonagios`
Add a nagios unsush to reboot playbook 2014-10-01 01:53:33 +00:00
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`- name: post reboot tasks`
do some magic to get it to see the post-reboot vms as a group Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 03:34:03 +00:00			`hosts: myvms_postreboot`
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`user: root`
			`gather_facts: False`
			`serial: 1`

add tasks: key Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 03:21:14 +00:00			`tasks:`
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`- name: Add back to openshift`
			`command: oc adm uncordon {{inventory_hostname}}`
} Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 03:21:59 +00:00			`delegate_to: os-master01{{env_suffix}}.phx2.fedoraproject.org`
attempt to auto drain/re-add openshift nodes on vhost_reboot Signed-off-by: Rick Elrod <relrod@redhat.com> 2018-10-03 02:25:38 +00:00			`when: inventory_hostname.startswith('os-node')`

Include update-proxy-dns in vhost_reboot.yml. 2014-12-15 21:36:50 +00:00			`# Call out to that dns playbook. Put proxies back in now that they're back`
fix up includes in vhost_reboot playbook 2018-04-04 20:07:03 +00:00			`- import_playbook: update-proxy-dns.yml status=enable proxies=myvms_new:&proxies`
Rework vhost_reboot so that ansible 2.0 likes it. 2015-12-15 20:25:54 +00:00			`when: nodns is not defined or not nodns`
Include update-proxy-dns in vhost_reboot.yml. 2014-12-15 21:36:50 +00:00
Attempt to run the restart_unbound sequence on reboot if necessary. 2013-12-20 00:46:24 +00:00			`- name: Fix unbound if necessary`
			`# intersection - hosts that are in our dynamic group and also in unbound-dns`
Fix unbound name in the playbook. 2014-03-31 22:47:37 +00:00			`hosts: "myvms_new:&unbound"`
Attempt to run the restart_unbound sequence on reboot if necessary. 2013-12-20 00:46:24 +00:00			`user: root`

Need to include vars here if we are using them. 2013-12-20 18:17:02 +00:00			`vars_files:`
			`- /srv/web/infra/ansible/vars/global.yml`
Hard code vars_path for now. 2014-01-06 18:22:18 +00:00			`- /srv/web/infra/ansible/vars/{{ ansible_distribution }}.yml`
Need to include vars here if we are using them. 2013-12-20 18:17:02 +00:00
Attempt to run the restart_unbound sequence on reboot if necessary. 2013-12-20 00:46:24 +00:00			`tasks:`
switch all the include tasks to import tasks 2017-10-17 17:37:03 +00:00			`- import_tasks: "{{ tasks_path }}/restart_unbound.yml"`
Attempt to run the restart_unbound sequence on reboot if necessary. 2013-12-20 00:46:24 +00:00
Update for current syntax, reorder, add serverbeach thing. 2013-11-21 21:52:05 +00:00			`# - name: get info on guests (postreboot)`
			`# virt: command=info`
			`# register: vminfo_post`