diff --git a/roles/openqa/worker/files/kill-stuck-qemu.sh b/roles/openqa/worker/files/kill-stuck-qemu.sh
new file mode 100755
index 0000000000..05e0a0c61b
--- /dev/null
+++ b/roles/openqa/worker/files/kill-stuck-qemu.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# this is a hideous hack to find and kill qemu processes stuck as a
+# result of https://github.com/os-autoinst/os-autoinst/issues/2549
+# which cause workers to be stuck in broken state. affected workers
+# should recover some minutes after this script runs
+#
+# For each worker instance 1..35, read the last 5 journal lines of its
+# openqa-worker-plain@N unit, pick out "PID: <n>" from lines containing
+# "is still running", and send each distinct PID the default kill
+# signal (SIGTERM).
+#
+# xargs -r (GNU) skips running kill when no PID was found. stderr of the
+# kill step is discarded on purpose: this is best-effort, and a PID that
+# has already exited is not worth reporting.
+for i in {1..35}; do
+  journalctl -u "openqa-worker-plain@${i}.service" -n 5 \
+    | grep "is still running" \
+    | grep -o "PID: [0-9]\+" \
+    | cut -d" " -f2 \
+    | sort -u \
+    | xargs -r kill 2> /dev/null
+done
diff --git a/roles/openqa/worker/tasks/main.yml b/roles/openqa/worker/tasks/main.yml
index ad5f4088d1..713d0b1af3 100644
--- a/roles/openqa/worker/tasks/main.yml
+++ b/roles/openqa/worker/tasks/main.yml
@@ -167,6 +167,9 @@
   service: name=rngd enabled=yes state=started
   when: "openqa_rngd is defined and openqa_rngd"
 
+- name: Install cron job to kill stuck qemu processes
+  copy: src=kill-stuck-qemu.sh dest=/etc/cron.daily/kill-stuck-qemu owner=root group=root mode=0755
+
 - include_tasks: nfs-client.yml
   when: openqa_nfs_worker|bool
 