---
# ansible/playbooks/vhost_reboot.yml

#
# This playbook lets you safely reboot a virthost and all its guests.
#
# requires --extra-vars="target=somevhost fqdn"
# Might add nodns=true or nonagios=true to the extra vars

#General overview:
# talk to the vhost
# get back list of instances
# add each of their hostnames to an ad hoc group
# halt each of them in a second play
# wait for them to die
# third play, reboot the vhost
#     wait for vhost to come back

# TODO: Figure out how to compare virt info pre and post boot.

# Play: ask the virthost named by the `target` extra var for its running
# guests and collect them in the in-memory inventory group "myvms_new",
# which the later plays use as their host list.
- name: find instances
  hosts: "{{ target }}"
  remote_user: root
  gather_facts: false

  tasks:
  # The registered result exposes the guest names as `vmlist.list_vms`.
  - name: get list of guests
    virt:
      command: list_vms
      state: running
    register: vmlist

#  - name: get info on guests (prereboot)
#    virt: command=info
#    register: vminfo_pre

  # One add_host call per guest name returned above.
  - name: add them to myvms_new group
    local_action:
      module: add_host
      hostname: "{{ item }}"
      groupname: myvms_new
    loop: "{{ vmlist.list_vms }}"

# Call out to another playbook.  Disable any proxies that may live here
# before their guests are halted.  Pass nodns=true in the extra vars to
# skip this step.
- import_playbook: update-proxy-dns.yml
  vars:
    # Must be "disable" here; the matching "enable" call is made after the
    # virthost has been rebooted.
    status: disable
    # Inventory pattern: hosts in myvms_new that are also in the proxies group.
    proxies: "myvms_new:&proxies"
  # key=value extra vars arrive as strings, so cast to a real boolean
  # instead of substring-matching (which also breaks on a true boolean).
  when: nodns is not defined or not (nodns | bool)

# Play: shut down every guest collected in "myvms_new", one guest at a
# time (serial: 1).  Both tasks are best-effort (ignore_errors) so that a
# single failing guest does not abort the virthost reboot.
- name: halt instances
  hosts: myvms_new
  user: root
  gather_facts: false
  serial: 1

  tasks:
#  - name: delegate to on-control01 for ocp4 cluster
#    set_fact:
#      os_delegate_via: os-control01
#    when: inventory_hostname in groups['ocp']
#
#  - name: delegate to on-control01 for ocp4 cluster (stg)
#    set_fact:
#      os_delegate_via: os-control01.stg
#    when: inventory_hostname in groups['ocp_stg']
#
#  - name: drain OS node if necessary
#    command: oc adm drain {{inventory_hostname }} --ignore-daemonsets --delete-local-data
#    delegate_to: "{{os_delegate_via}}{{env_suffix}}.iad2.fedoraproject.org"
#    when: inventory_hostname.startswith(('ocp', 'worker')) and hostvars[inventory_hostname].datacenter == 'iad2'
#
  # Runs on the nagios host, not on the guest itself.
  - name: schedule regular host downtime
    nagios:
      action: downtime
      minutes: 30
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.iad2.fedoraproject.org
    ignore_errors: true
    # key=value extra vars arrive as strings; without the cast the string
    # "false" is truthy and nonagios=false would skip the downtime.
    when: nonagios is not defined or not (nonagios | bool)

  # The shutdown is issued on the virthost (target), naming this guest.
  - name: shutdown vms
    virt:
      command: shutdown
      name: "{{ inventory_hostname }}"
    ignore_errors: true
    delegate_to: "{{ target }}"

# Play: hold here until every guest in "myvms_new" has stopped listening
# on its ssh port.
- name: wait for the whole set to die.
  hosts: myvms_new
  remote_user: root
  gather_facts: false

  tasks:
  # Checked from the control host: after an initial 30s delay, wait up to
  # 300s for port 22 on the guest to be closed.
  - name: wait for them to die
    local_action:
      module: wait_for
      host: "{{ inventory_hostname }}"
      port: 22
      state: stopped
      delay: 30
      timeout: 300

# Play: reboot the virthost itself, wait for libvirt to return, and record
# the guests it reports afterwards in the in-memory group
# "myvms_postreboot" for the post-reboot play.
- name: reboot vhost
  hosts: "{{ target }}"
  gather_facts: false
  user: root

  tasks:
  # Nagios calls run on the nagios host and are best-effort.
  - name: tell nagios to shush
    nagios:
      action: downtime
      minutes: 60
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.iad2.fedoraproject.org
    ignore_errors: true
    # key=value extra vars arrive as strings; without the cast the string
    # "false" is truthy and nonagios=false would skip this task.
    when: nonagios is not defined or not (nonagios | bool)

  - name: reboot the virthost
    reboot:

  # The libvirt socket appearing is used as the "libvirtd is up" signal.
  - name: wait for libvirtd to come back on the virthost
    wait_for:
      path: /run/libvirt/libvirt-sock
      state: present

  # No state filter here, unlike the pre-reboot listing.
  - name: look up vmlist
    virt:
      command: list_vms
    register: newvmlist

  - name: add them to myvms_postreboot group
    local_action:
      module: add_host
      hostname: "{{ item }}"
      groupname: myvms_postreboot
    with_items: "{{ newvmlist.list_vms }}"

#  - name: sync time
#    command: ntpdate -u 1.rhel.pool.ntp.org

  - name: tell nagios to unshush
    nagios:
      action: unsilence
      service: host
      host: "{{ inventory_hostname_short }}{{ env_suffix }}"
    delegate_to: noc01.iad2.fedoraproject.org
    ignore_errors: true
    when: nonagios is not defined or not (nonagios | bool)

# Play: per-guest fix-ups after the virthost is back, run one guest at a
# time (serial: 1) over the "myvms_postreboot" group.
- name: post reboot tasks
  hosts: myvms_postreboot
  remote_user: root
  gather_facts: false
  serial: 1

  tasks:

  # The next two tasks choose the host prefix that the uncordon task
  # below is delegated to, depending on which ocp group the guest is in.
  - name: delegate to on-control01 for ocp4 cluster
    set_fact:
      os_delegate_via: os-control01
    when: inventory_hostname in groups['ocp']

  - name: delegate to on-control01 for ocp4 cluster (stg)
    set_fact:
      os_delegate_via: os-control01.stg
    when: inventory_hostname in groups['ocp_stg']

  # Only for guests named ocp*/worker* in the iad2 datacenter; a list under
  # `when` is an AND of its entries.
  - name: Add back to openshift
    command: "oc adm uncordon {{ inventory_hostname }}"
    delegate_to: "{{ os_delegate_via }}{{ env_suffix }}.iad2.fedoraproject.org"
    when:
      - inventory_hostname.startswith(('ocp', 'worker'))
      - hostvars[inventory_hostname].datacenter == 'iad2'

  # Service restarts keyed off the guest's hostname prefix.
  - name: restart gssproxy if we rebooted a ipa server
    service:
      name: gssproxy
      state: restarted
    when: inventory_hostname.startswith('ipa')

  - name: restart rabbitmq if we rebooted a rabbit server
    service:
      name: rabbitmq-server
      state: restarted
    when: inventory_hostname.startswith('rabbitmq')

# Call out to that dns playbook.  Put proxies back in now that they're back.
# Pass nodns=true in the extra vars to skip this step.
- import_playbook: update-proxy-dns.yml
  vars:
    status: enable
    # Inventory pattern: hosts in myvms_new that are also in the proxies group.
    proxies: "myvms_new:&proxies"
  # key=value extra vars arrive as strings, so cast to a real boolean;
  # otherwise the string "false" is truthy and nodns=false skips this step.
  when: nodns is not defined or not (nodns | bool)

# NOTE(review): the vars_files list below used to hang off the
# import_playbook entry above.  vars_files is a play-level keyword and is
# not accepted on import_playbook; it also templates ansible_distribution,
# which is unavailable there because no facts are gathered.  If these files
# are needed, load them from the plays that use them - confirm and relocate.
#  vars_files:
#   - /srv/web/infra/ansible/vars/global.yml
#   - /srv/web/infra/ansible/vars/{{ ansible_distribution }}.yml

#  - name: get info on guests (postreboot)
#    virt: command=info
#    register: vminfo_post