openqa/worker: prepare to handle multiple tap worker classes

I'm going to try splitting the tap jobs across multiple worker
hosts. We have quite a lot of tap jobs now, and I have
sometimes seen a situation where all non-tap jobs have been
run and the non-tap worker hosts are sitting idle, but the
single tap worker host has a long queue of tap jobs to get through.

We can't just put multiple hosts per instance into the tap
class, because then we might get a case where job A from a tap
group is run on one host and job B from a tap group is run on
a different host, and they can't communicate. It's actually
possible to set this up so it works, but it needs yet more
complex networking stuff I don't want to mess with. So instead
I'm just going to split the tap job groups across two classes,
'tap' and 'tap2'. That way we can have one 'tap' worker host
and one 'tap2' worker host per instance and arch, and they will
each get about half the tap jobs.

Unfortunately, since we only have one aarch64 worker for lab,
it will still have to run all the tap jobs, but in all other
cases we do have at least two workers, so we can split the load.

Signed-off-by: Adam Williamson <awilliam@redhat.com>
This commit is contained in:
Adam Williamson 2022-11-25 14:05:47 -08:00
parent 31ff414ca8
commit 28110d34be
8 changed files with 43 additions and 9 deletions

View file

@ -0,0 +1 @@
openqa_tap: tap,tap2

View file

@ -0,0 +1 @@
openqa_tap: tap

View file

@ -0,0 +1 @@
openqa_tap: tap2

View file

@ -7,4 +7,3 @@ host_group: openqa-tap-workers
nat_rules: [
# masquerade for openQA openvswitch workers to reach the outside
'-A POSTROUTING -o {{ openqa_tap_iface }} -j MASQUERADE']
openqa_tap: true

View file

@ -470,7 +470,10 @@ openqa-x86-worker05.iad2.fedoraproject.org
openqa-x86-worker06.iad2.fedoraproject.org
# the workers that can run networked jobs. each server should have *one* of these per arch
[openqa_tap_workers]
# per tap worker class defined in os-autoinst-distri-fedora (currently there are two
# classes; the purpose of multiple classes is to split the load of tap jobs across
# multiple workers)
[openqa_tap1_workers]
# prod
openqa-x86-worker01.iad2.fedoraproject.org
openqa-a64-worker02.iad2.fedoraproject.org
@ -478,6 +481,18 @@ openqa-a64-worker02.iad2.fedoraproject.org
openqa-x86-worker04.iad2.fedoraproject.org
openqa-a64-worker01.iad2.fedoraproject.org
openqa-p09-worker01.iad2.fedoraproject.org
[openqa_tap2_workers]
# prod
openqa-x86-worker02.iad2.fedoraproject.org
openqa-a64-worker03.iad2.fedoraproject.org
# lab
openqa-x86-worker05.iad2.fedoraproject.org
openqa-p09-worker02.iad2.fedoraproject.org
# these do both tap and tap2 - because we only have one aarch64 worker
# for lab, it has to handle all tap jobs
[openqa_tap12_workers]
# lab
openqa-a64-worker01.iad2.fedoraproject.org
# the workers that run createhdds to create the base disk images. Again,
# only one per arch per instance should be present.
@ -502,6 +517,12 @@ openqa_lab_workers
openqa
openqa_lab
# common group for all tap workers
[openqa_tap_workers:children]
openqa_tap1_workers
openqa_tap2_workers
openqa_tap12_workers
[packages]
[packages_stg]

View file

@ -2,5 +2,5 @@ openqa_hostname: localhost
openqa_repo: updates
openqa_createhdds_branch: main
openqa_nfs_worker: false
openqa_tap: false
openqa_tap: ""
openqa_hdds_worker: false

View file

@ -23,9 +23,20 @@
## per deployment
## default - false
# - openqa_tap
## bool - whether this is the tap- and swtpm-enabled host or not
## each deployment should have *one* tap-capable worker host
## default - false
## string - tap worker classes this worker should be part of, a comma-
## separated string, e.g. "tap" or "tap,tap2". If this is
## empty, the worker will not be configured for tap and
## swtpm at all; if it's not empty, the worker will be
## configured for tap and swtpm and the string substituted
## into the workers.ini config file. Only *one* worker
## should be in each tap worker class (so there should not
## be two workers in the "tap" class), but if you are short
## on workers, one worker can be in *multiple* tap classes,
## meaning it will pick up more tap jobs. The purpose of
## having multiple tap classes is to split the load of tap
## jobs across more than one host, when enough hosts are
## available
## default - empty string (disabled)
# Optional vars
# - openqa_rngd

View file

@ -4,10 +4,10 @@ HOST = http://{{ openqa_hostname|default('localhost') }}
WORKER_CLASS = {{ openqa_worker_class }}
{% elif openqa_tap|bool %}
{% if ansible_architecture == 'ppc64' or ansible_architecture == 'ppc64le' %}
WORKER_CLASS = tap,tpm,qemu_ppc64le,qemu_ppc64
WORKER_CLASS = {{ openqa_tap }},tpm,qemu_ppc64le,qemu_ppc64
{% elif ansible_architecture == 'aarch64' %}
WORKER_CLASS = tap,tpm,qemu_aarch64,qemu_arm
WORKER_CLASS = {{ openqa_tap }},tpm,qemu_aarch64,qemu_arm
{% else %}
WORKER_CLASS = tap,tpm,qemu_x86_64,qemu_i686,qemu_i586
WORKER_CLASS = {{ openqa_tap }},tpm,qemu_x86_64,qemu_i686,qemu_i586
{% endif %}
{% endif %}