From 28110d34beb24efe8bef1cac8fa675f42487a3c3 Mon Sep 17 00:00:00 2001 From: Adam Williamson Date: Fri, 25 Nov 2022 14:05:47 -0800 Subject: [PATCH] openqa/worker: prepare to handle multiple tap worker classes I'm going to try splitting the tap jobs across multiple worker hosts. We have quite a lot of tap jobs now, and I have sometimes seen a situation where all non-tap jobs have been run and the non-tap worker hosts are sitting idle, but the single tap worker host has a long queue of tap jobs to get through. We can't just put multiple hosts per instance into the tap class, because then we might get a case where job A from a tap group is run on one host and job B from the same tap group is run on a different host, and they can't communicate. It's actually possible to set this up so it works, but it needs yet more complex networking stuff I don't want to mess with. So instead I'm just gonna split the tap job groups across two classes, 'tap' and 'tap2'. That way we can have one 'tap' worker host and one 'tap2' worker host per instance and arch, and they will each get about half the tap jobs. Unfortunately, since we only have one aarch64 worker for lab, it will still have to run all the tap jobs, but for all other cases we do have at least two workers, so we can split the load. 
Signed-off-by: Adam Williamson --- inventory/group_vars/openqa_tap12_workers | 1 + inventory/group_vars/openqa_tap1_workers | 1 + inventory/group_vars/openqa_tap2_workers | 1 + inventory/group_vars/openqa_tap_workers | 1 - inventory/inventory | 23 +++++++++++++++++++- roles/openqa/worker/defaults/main.yml | 2 +- roles/openqa/worker/tasks/main.yml | 17 ++++++++++++--- roles/openqa/worker/templates/workers.ini.j2 | 6 ++--- 8 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 inventory/group_vars/openqa_tap12_workers create mode 100644 inventory/group_vars/openqa_tap1_workers create mode 100644 inventory/group_vars/openqa_tap2_workers diff --git a/inventory/group_vars/openqa_tap12_workers b/inventory/group_vars/openqa_tap12_workers new file mode 100644 index 0000000000..c5a19cb7e5 --- /dev/null +++ b/inventory/group_vars/openqa_tap12_workers @@ -0,0 +1 @@ +openqa_tap: tap,tap2 diff --git a/inventory/group_vars/openqa_tap1_workers b/inventory/group_vars/openqa_tap1_workers new file mode 100644 index 0000000000..5ae0f2366f --- /dev/null +++ b/inventory/group_vars/openqa_tap1_workers @@ -0,0 +1 @@ +openqa_tap: tap diff --git a/inventory/group_vars/openqa_tap2_workers b/inventory/group_vars/openqa_tap2_workers new file mode 100644 index 0000000000..ce151e66c4 --- /dev/null +++ b/inventory/group_vars/openqa_tap2_workers @@ -0,0 +1 @@ +openqa_tap: tap2 diff --git a/inventory/group_vars/openqa_tap_workers b/inventory/group_vars/openqa_tap_workers index 8976c6f9e5..9d0e814e0c 100644 --- a/inventory/group_vars/openqa_tap_workers +++ b/inventory/group_vars/openqa_tap_workers @@ -7,4 +7,3 @@ host_group: openqa-tap-workers nat_rules: [ # masquerade for openQA openvswitch workers to reach the outside '-A POSTROUTING -o {{ openqa_tap_iface }} -j MASQUERADE'] -openqa_tap: true diff --git a/inventory/inventory b/inventory/inventory index f41f2538b0..4145dd43b2 100644 --- a/inventory/inventory +++ b/inventory/inventory @@ -470,7 +470,10 @@ 
openqa-x86-worker05.iad2.fedoraproject.org openqa-x86-worker06.iad2.fedoraproject.org # the workers that can run networked jobs. each server should have *one* of these per arch -[openqa_tap_workers] +# per tap worker class defined in os-autoinst-distri-fedora (currently there are two +# classes; the purpose of multiple classes is to split the load of tap jobs across +# multiple workers) +[openqa_tap1_workers] # prod openqa-x86-worker01.iad2.fedoraproject.org openqa-a64-worker02.iad2.fedoraproject.org @@ -478,6 +481,18 @@ openqa-a64-worker02.iad2.fedoraproject.org openqa-x86-worker04.iad2.fedoraproject.org -openqa-a64-worker01.iad2.fedoraproject.org openqa-p09-worker01.iad2.fedoraproject.org +[openqa_tap2_workers] +# prod +openqa-x86-worker02.iad2.fedoraproject.org +openqa-a64-worker03.iad2.fedoraproject.org +# lab +openqa-x86-worker05.iad2.fedoraproject.org +openqa-p09-worker02.iad2.fedoraproject.org +# these do both tap and tap2 - because we only have one aarch64 worker +# for lab, it has to handle all tap jobs. do NOT also list these hosts in the +# tap1/tap2 groups, or their openqa_tap group_vars will conflict with this one +[openqa_tap12_workers] +# lab +openqa-a64-worker01.iad2.fedoraproject.org # the workers that run createhdds to create the base disk images. Again, # only one per arch per instance should be present. 
@@ -502,6 +517,12 @@ openqa_lab_workers openqa openqa_lab +# common group for all tap workers +[openqa_tap_workers:children] +openqa_tap1_workers +openqa_tap2_workers +openqa_tap12_workers + [packages] [packages_stg] diff --git a/roles/openqa/worker/defaults/main.yml b/roles/openqa/worker/defaults/main.yml index c44923f7e3..848e7df8ae 100644 --- a/roles/openqa/worker/defaults/main.yml +++ b/roles/openqa/worker/defaults/main.yml @@ -2,5 +2,5 @@ openqa_hostname: localhost openqa_repo: updates openqa_createhdds_branch: main openqa_nfs_worker: false -openqa_tap: false +openqa_tap: "" openqa_hdds_worker: false diff --git a/roles/openqa/worker/tasks/main.yml b/roles/openqa/worker/tasks/main.yml index 42923680db..f78f63b6e7 100644 --- a/roles/openqa/worker/tasks/main.yml +++ b/roles/openqa/worker/tasks/main.yml @@ -23,9 +23,20 @@ ## per deployment ## default - false # - openqa_tap -## bool - whether this is the tap- and swtpm-enabled host or not -## each deployment should have *one* tap-capable worker host -## default - false +## string - tap worker classes this worker should be part of, a comma- +## separated string, e.g. "tap" or "tap,tap2". If this is +## empty, the worker will not be configured for tap and +## swtpm at all; if it's not empty, the worker will be +## configured for tap and swtpm and the string substituted +## into the workers.ini config file. Only *one* worker +## should be in each tap worker class (so there should not +## be two workers in the "tap" class), but if you are short +## on workers, one worker can be in *multiple* tap classes, +## meaning it will pick up more tap jobs. 
The purpose of +## having multiple tap classes is to split the load of tap +## jobs across more than one host, when enough hosts are +## available +## default - empty string (disabled) # Optional vars # - openqa_rngd diff --git a/roles/openqa/worker/templates/workers.ini.j2 b/roles/openqa/worker/templates/workers.ini.j2 index 0e977c3a88..4a9bdeebcd 100644 --- a/roles/openqa/worker/templates/workers.ini.j2 +++ b/roles/openqa/worker/templates/workers.ini.j2 @@ -4,10 +4,11 @@ HOST = http://{{ openqa_hostname|default('localhost') }} WORKER_CLASS = {{ openqa_worker_class }} -{% elif openqa_tap|bool %} +{% elif openqa_tap %} +{# openqa_tap is a comma-separated string of tap classes: test it for non-emptiness, because the bool filter is False for values like "tap" #} {% if ansible_architecture == 'ppc64' or ansible_architecture == 'ppc64le' %} -WORKER_CLASS = tap,tpm,qemu_ppc64le,qemu_ppc64 +WORKER_CLASS = {{ openqa_tap }},tpm,qemu_ppc64le,qemu_ppc64 {% elif ansible_architecture == 'aarch64' %} -WORKER_CLASS = tap,tpm,qemu_aarch64,qemu_arm +WORKER_CLASS = {{ openqa_tap }},tpm,qemu_aarch64,qemu_arm {% else %} -WORKER_CLASS = tap,tpm,qemu_x86_64,qemu_i686,qemu_i586 +WORKER_CLASS = {{ openqa_tap }},tpm,qemu_x86_64,qemu_i686,qemu_i586 {% endif %} {% endif %}