copr: aarch64 spawning via resalloc-server

For now enable this only on stg.  Imported from:
https://github.com/praiskup/fedora-copr-spinup-aarch64
This commit is contained in:
Pavel Raiskup 2019-05-13 11:01:39 +02:00
parent aaa95ac79b
commit 4f58067295
14 changed files with 673 additions and 0 deletions

View file

@ -0,0 +1,16 @@
# This playbook is executed from resalloc's home directory, by vm-aarch64-new
# script, it's just a thin wrapper around provision_builder_tasks.yml.
# IOW: never execute this as 'copr' user..
- name: provision builder
  hosts: all
  gather_facts: False
  remote_user: root
  vars_files:
    - nova_cloud_vars.yml
  vars:
    # building on a throw-away VM, not preparing a reusable base image
    prepare_base_image: False
  tasks:
    - include: "provision_builder_tasks.yml"

View file

@ -0,0 +1,8 @@
# Thin spawn playbook: runs locally on the backend and delegates the real
# work to the vm-aarch64-alloc script (which talks to resalloc server).
- name: create instance
  hosts: 127.0.0.1
  gather_facts: False
  connection: local
  tasks:
    - name: spin machine with script
      local_action: shell /bin/bash ./vm-aarch64-alloc

View file

@ -0,0 +1,8 @@
# Thin terminate playbook: passes the VM name (resalloc ticket id) to the
# vm-aarch64-terminate script, which closes the ticket.
- name: terminate instance
  hosts: 127.0.0.1
  gather_facts: False
  connection: local
  tasks:
    - name: terminate VM
      local_action: shell /bin/bash ./vm-aarch64-terminate "{{ copr_task.vm_name }}"

View file

@ -0,0 +1,19 @@
#! /bin/bash

# Allocate an aarch64 builder VM through the resalloc server.  On success,
# print "vm_name=<ticket>" and "IP=<ip>" on stdout for the calling playbook.

ticket=

# Close the ticket (i.e. release the VM) if we exit before a successful
# hand-over; guarded so a failed 'resalloc ticket' doesn't close ticket "".
cleanup()
{
test -n "$ticket" && resalloc ticket-close "$ticket"
}
trap cleanup EXIT

# Request a machine tagged 'aarch64' and block until it is ready.
ticket=$(resalloc ticket --tag aarch64)
ip=$(resalloc ticket-wait "$ticket")
test -z "$ip" && exit 1

# Success: disarm the cleanup trap so the ticket stays open for the builder.
trap '' EXIT
echo "vm_name=$ticket"
echo "IP=$ip"

View file

@ -0,0 +1,7 @@
#! /bin/sh

# Release a resalloc ticket; the server then terminates the builder VM.
# Usage: vm-aarch64-terminate <ticket-id>

fatal()
{
    echo >&2 "$*"
    exit 1
}

[ -n "$1" ] || fatal "one argument required"

resalloc ticket-close "$1"

View file

@ -0,0 +1,31 @@
#! /bin/sh -x

# Forcefully remove the builder VM named $RESALLOC_NAME from the aarch64
# hypervisor that its pool ($RESALLOC_POOL_ID) lives on.  Called by the
# resalloc server as the pool's cmd_delete handler.

die(){ echo >&2 "!! $*"; exit 1; }

test -z "$RESALLOC_NAME" && die "no vm specified, empty \$RESALLOC_NAME variable"

# Map the pool ID onto the libvirt connection URI of its hypervisor.
case "$RESALLOC_POOL_ID" in
*aarch64_01*)
    conn=qemu+ssh://copr@virthost-aarch64-os01.fedorainfracloud.org/system
    ;;
*aarch64_02*)
    conn=qemu+ssh://copr@virthost-aarch64-os02.fedorainfracloud.org/system
    ;;
*) die "unknown RESALLOC_POOL_ID=$RESALLOC_POOL_ID" ;;
esac

# Run "$@" until it succeeds, at most 3 attempts, 15s apart.
repeat()
{
    counter=0
    while :; do
        # POSIX shell arithmetic; the original piped through 'bc', adding a
        # needless external dependency for a simple increment
        counter=$((counter + 1))
        "$@" && break
        # give up?
        test "$counter" -ge 3 && break
        sleep 15
    done
}

repeat virsh -c "$conn" destroy "$RESALLOC_NAME"
repeat virsh -c "$conn" undefine "$RESALLOC_NAME" --remove-all-storage --nvram

View file

@ -0,0 +1,278 @@
#! /usr/bin/python2
import os
import sys
import logging
import subprocess
import tempfile
import shutil
import json
import time
import uuid
import pipes
import argparse
# configuration
disk_pool = 'default'        # libvirt storage pool used for scratch (swap) volumes
img_volume = 'copr-builder'  # base image volume cloned as each VM's root disk
def get_parser():
    """Build the CLI option parser for the VM spin-up script.

    All sizes are integers: --swap-vol-size and --root-vol-size in GB,
    --ram-size in MB; --cpu-count is the number of VCPUs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--swap-vol-size', metavar='GB', type=int, default=20)
    parser.add_argument('--root-vol-size', metavar='GB', type=int)
    # type=int so a CLI-provided value matches the int default
    # (previously '--cpu-count 10' produced the string "10")
    parser.add_argument('--cpu-count', type=int, default=2)
    parser.add_argument('--ram-size', metavar='MB', type=int, default=4096)
    return parser
# global variables
workdir = None     # temporary working directory, created in main()
connection = None  # libvirt connection URI, set by setup_connection()
def setup_connection(pool_id):
    """Set the global libvirt *connection* URI for the hypervisor that
    hosts the given resalloc pool.

    :param pool_id: value of $RESALLOC_POOL_ID, must contain 'aarch64_01'
                    or 'aarch64_02'
    :raises Exception: for an unrecognized pool_id
    """
    global connection
    if 'aarch64_01' in pool_id:
        connection = 'qemu+ssh://copr@virthost-aarch64-os01.fedorainfracloud.org/system'
    elif 'aarch64_02' in pool_id:
        connection = 'qemu+ssh://copr@virthost-aarch64-os02.fedorainfracloud.org/system'
    else:
        # include the offending value, consistent with mac_and_ip_address()
        raise Exception("wrong pool_id {0}".format(pool_id))
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()  # root logger; DEBUG so all actions are traceable
# directory containing this script; used to locate the bundled wait-for-ssh
script_path = os.path.dirname(os.path.realpath(__file__))
def call(cmd, *args, **kwargs):
    """Run *cmd* through subprocess.call(), logging the quoted command line
    and how long it took; return the exit status."""
    quoted = ' '.join(pipes.quote(str(part)) for part in cmd)
    log.debug("cmd: " + quoted)
    started = time.time()
    exit_status = subprocess.call(cmd, *args, **kwargs)
    elapsed = round(time.time() - started, 3)
    log.debug(" -> exit_status={0}, time={1}s".format(exit_status, elapsed))
    return exit_status
def wait_for_ssh(ip):
    """Block until sshd on *ip* responds; the bundled helper script polls
    for up to 180 seconds and exits non-zero on timeout."""
    script = os.path.join(script_path, 'wait-for-ssh', 'wait-for-ssh')
    if call([script, '--timeout', '180', ip, '--log', 'debug']):
        raise Exception("waiting not successful")
def execute_spinup_playbook(ip):
    """Provision the fresh VM at *ip* with ansible (hard 10 minute cap).

    Playbook stdout is redirected to stderr -- this script's stdout is
    reserved for the resalloc resource data (the IP, printed by main()).
    """
    playbook = '/home/resalloc/provision/_provision_aarch64.yml'
    cmd = ['timeout', '600', 'ansible-playbook', playbook, '-i', ip + ","]
    if call(cmd, stdout=sys.stderr):
        raise Exception("can't spinup")
def generate_config_iso(name, ip):
    """Build an ISO image carrying a first-boot script that grows the root
    filesystem and configures the VM's static IP; return the ISO path.

    The ISO is later uploaded into a libvirt volume and attached as cdrom.
    NOTE(review): the *name* parameter is currently unused -- TODO confirm.
    """
    script_content = """#! /bin/bash
growpart /dev/vda 2
resize2fs /dev/vda2
mount -o remount /
nmcli con add con-name "{con_name}" ifname {device} type ethernet ip4 {ip}/23 gw4 38.145.49.254
nmcli con mod "{con_name}" ipv4.dns "8.8.8.8,1.1.1.1"
nmcli con up "{con_name}" iface {device}
""".format(
        ip=ip,
        con_name="copr-static",
        device='eth0',
    )
    config_dir = os.path.join(workdir, "config")
    os.makedirs(config_dir)
    with open(os.path.join(config_dir, 'pre-network-script.sh'), 'w') as f:
        f.write(script_content)
    image = os.path.join(workdir, 'config.iso')
    rc = call(['mkisofs', '-o', image, '-V', 'copr_config', '-r', '-J', '--quiet', config_dir])
    if rc != 0:
        raise Exception("mkisofs failed")
    return image
def virsh_silent(args):
    """Run a virsh subcommand against the configured hypervisor connection,
    with its stdout diverted to our stderr; return the exit status."""
    full_cmd = ['virsh', '-c', connection]
    full_cmd.extend(args)
    return call(full_cmd, stdout=sys.stderr)
def mac_and_ip_address(pool_id, vm_id):
    """Deterministically derive the (mac, ip) pair for builder *vm_id*
    within *pool_id*.

    Each pool owns a fixed slice of the 38.145.48.101+ range; the MAC's
    last octet equals the IP's last octet (hex) so the mapping is stable.

    :raises Exception: unknown pool_id, or vm_id outside the pool's slice
    """
    shifts = {
        'aarch64_01_prod': 0,   # 1, 2, 3, 4
        'aarch64_01_dev': 4,    # 5, 6
        'aarch64_02_prod': 6,   # 7, 8, 9, 10
        'aarch64_02_dev': 10,   # 11, 12
    }
    try:
        ip_shift = shifts[pool_id]
    except KeyError:
        raise Exception("unknown pool_id {0}".format(pool_id))

    max_id = 4 if pool_id.endswith('prod') else 1

    vm_id = int(vm_id)
    if vm_id < 0:
        raise Exception("wrong RESALLOC_ID_IN_POOL={0}".format(vm_id))
    if vm_id > max_id:
        raise Exception("too high RESALLOC_ID_IN_POOL={0}".format(vm_id))

    last_part = 101 + ip_shift + vm_id
    mac = '52:54:00:14:07:{0:02x}'.format(last_part)
    ip = "38.145.48.{0}".format(last_part)
    return mac, ip
# Registered teardown steps; the numeric key prefix gives execution order.
cleanup_actions = {} # id: [function, args ...]

def cleanup():
    """Run all registered cleanup actions in sorted key order, retrying
    each failing action up to 3 times with a 15s pause in between."""
    log.info("doing cleanup")
    for action in sorted(cleanup_actions):
        log.debug("cleanup {0}".format(action))
        command = cleanup_actions[action]
        counter = 0
        while True:
            counter += 1
            # command[0] is the callable (e.g. virsh_silent), rest are args
            rc = command[0](command[1:])
            if rc == 0:
                break
            if counter >= 3:
                # give up :-(
                break
            log.debug("retry cleanup action ...")
            time.sleep(15)
def alloc_disk(name, size, pool=disk_pool):
    """Create volume *name* of *size* (an int means gigabytes) in *pool*,
    and register its deletion as a cleanup action."""
    if type(size) == int:
        size = "{0}G".format(size)
    if virsh_silent(['vol-create-as', pool, name, str(size)]) != 0:
        raise Exception("can't create '{0}' disk".format(name))
    cleanup_actions['80_delete_disk_{0}'.format(name)] = [virsh_silent,
        'vol-delete', name, '--pool', pool]
def create_volume_from_iso(name, prealloc_size, iso, pool=disk_pool):
    """Create volume *name* (cleanup is registered by alloc_disk) and
    upload the contents of the *iso* file into it."""
    alloc_disk(name, prealloc_size, pool)
    if virsh_silent(['vol-upload', name, iso, '--pool', pool]):
        raise Exception("can not vol-upload the config disk")
def create_volume_from_volume(name, volume, pool=disk_pool, size=None):
    """Clone *volume* into a new volume *name* in *pool*, optionally
    resizing it to *size*; register its deletion as a cleanup action.

    :raises Exception: when the clone or the resize fails
    """
    if virsh_silent(['vol-clone', volume, name, '--pool', pool]):
        raise Exception("vol-clone failed")
    cleanup_actions['80_delete_disk_{0}'.format(name)] = [virsh_silent,
        'vol-delete', name, '--pool', pool]
    if size:
        if virsh_silent(['vol-resize', '--vol', name, '--capacity',
                         size, '--pool', pool]):
            # was: raise Exception(['cant resize ' + name]) -- passing a
            # list renders as "['cant resize ...']" in the traceback
            raise Exception("can't resize " + name)
def boot_machine(name, volumes, vcpus, ram=4096):
    """virt-install the VM *name* and register destroy/undefine cleanups.

    :param volumes: list of ('pool/volname', device-type, bus) triples
    :param vcpus: number of virtual CPUs
    :param ram: RAM in MB (was hard-coded to 4096 despite the --ram-size
                option existing; default keeps the old behavior)
    """
    cmd = [
        'virt-install',
        '--connect', connection,
        '--ram', str(ram),
        '--os-type', 'generic',
        '--vcpus', str(vcpus),
        '--vnc',
        '--features', 'acpi=off',
        '--noautoconsole',
        '--import',
        '-n', name,
        '--network', 'bridge=br0,model=virtio',
        '--channel', "unix,target_type=virtio,name='org.qemu.guest_agent.0'",
        '--rng', '/dev/random',
        # '--boot', 'kernel_args="ds=nocloud-net"'
    ]
    for vol in volumes:
        # honor the bus from the volume triple (was hard-coded to virtio
        # even though every triple carries a bus as its third element)
        cmd += ['--disk', 'vol={0},device={1},bus={2}'.format(*vol)]
    if call(cmd, stdout=sys.stderr):
        raise Exception("can not boot the machine")
    # keys were '50_shut_down'.format(name) -- a no-op format() that made
    # every VM share one key; include the name so entries can't collide
    cleanup_actions['50_shut_down_{0}'.format(name)] = [
        virsh_silent, 'destroy', name]
    cleanup_actions['51_shut_down_{0}'.format(name)] = [
        virsh_silent, 'undefine', name, '--nvram']
def main():
    """Entry point: validate the resalloc environment, create the config,
    root and swap volumes, boot the VM, wait for ssh, provision it, and
    finally print its IP on stdout (the resalloc resource data).

    Required environment (set by the resalloc server): RESALLOC_NAME,
    RESALLOC_POOL_ID, RESALLOC_ID_IN_POOL.
    """
    fail = False
    for required_var in ['RESALLOC_NAME', 'RESALLOC_POOL_ID', 'RESALLOC_ID_IN_POOL']:
        if required_var not in os.environ:
            fail = True
            log.error("missing variable {0}".format(required_var))
    if fail:
        sys.exit(1)
    parser = get_parser()
    # NOTE(review): args.ram_size is parsed but never used here --
    # boot_machine() hard-codes 4096 MB; confirm and wire it through.
    args = parser.parse_args()
    vm_name = os.environ['RESALLOC_NAME']
    _, ip = mac_and_ip_address(os.environ['RESALLOC_POOL_ID'],
                               os.environ['RESALLOC_ID_IN_POOL'])
    setup_connection(os.environ['RESALLOC_POOL_ID'])
    try:
        global workdir
        workdir = tempfile.mkdtemp()
        log.debug("workdir is '{0}'".format(workdir))
        # cloud-init config volume
        image_file = generate_config_iso(vm_name, ip)
        log.debug("config image {0}".format(image_file))
        create_volume_from_iso(vm_name + '_config', '1M', image_file, pool='images')
        # the / volume
        create_volume_from_volume(vm_name + '_root', img_volume, pool='images',
                                  size=args.root_vol_size)
        # swap volume
        alloc_disk(vm_name + '_swap', args.swap_vol_size)
        # start the VM
        volumes = [
            ('images/' + vm_name + '_root', 'disk', 'virtio'),
            ('images/' + vm_name + '_config', 'cdrom', 'virtio'),
            ('default/' + vm_name + '_swap', 'disk', 'virtio'),
        ]
        boot_machine(vm_name, volumes, args.cpu_count)
        wait_for_ssh(ip)
        execute_spinup_playbook(ip)
        # only the IP may go to stdout; resalloc stores it as resource data
        sys.stdout.write("{0}\n".format(ip))
        sys.stdout.flush()
    except (Exception, KeyboardInterrupt) as e:
        # any failure: tear down everything we registered, then fail
        log.exception(e)
        cleanup()
        sys.exit(1)
    finally:
        log.debug("cleaning up workdir")
        shutil.rmtree(workdir)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,2 @@
Download the up-to-date version from:
https://github.com/praiskup/wait-for-ssh

View file

@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Pavel Raiskup
#
# This program accepts one argument IP or HOSTNAME. First try to connect to the
# HOSTNAME as 'root' user. If cloud-init scripts instruct us to use different
# user than 'root', switch to that user and check again. In the end, print the
# successful username on stdout.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from re import compile as re_compile
from time import sleep
from sys import exit
from os import devnull
from threading import Thread, Event
from argparse import ArgumentParser
from subprocess import Popen, CalledProcessError, PIPE
from pipes import quote
import logging
handler = logging.StreamHandler()
log = logging.getLogger()  # root logger, messages go to stderr
log.setLevel(logging.INFO)
log.addHandler(handler)
# create console handler and set level to debug

# Base ssh command; host-key and password prompts must never block us.
ssh = [
    'ssh',
    '-o', 'StrictHostKeyChecking=no',
    '-o', 'UserKnownHostsFile=/dev/null',
    '-o', 'PasswordAuthentication=no',
]

# Connectivity probe: a successful attempt must echo exactly this marker.
expected_output = 'foobar'
inner_cmd = 'echo ' + expected_output
class Checker(Thread):
    """Background thread that repeatedly tries to ssh into args.host until
    the probe command succeeds; can switch to a cloud user when root login
    is refused with cloud-init's "Please login as the user" hint.

    The caller assigns ``checker.args`` (parsed CLI options) before start().
    """

    # current ssh username; may be replaced by cloud-user detection below
    user = 'root'
    daemon = True
    # a valid POSIX username, as embedded in cloud-init's login hint
    user_re = '[a-zA-Z0-9_.][a-zA-Z0-9_.-]*[$]?'
    re_clouduser = re_compile('Please login as the user "({0})"'.format(user_re))
    # set by kill() to make run() stop between attempts
    event = Event()

    def loop(self):
        """One ssh attempt; return True on success, falsy otherwise."""
        cmd = ssh + [
            '{0}@{1}'.format(self.user, self.args.host),
            inner_cmd,
        ]
        with open(devnull, 'w') as drop:
            log.debug('executing: ' + ' '.join(cmd))
            self.child = Popen(cmd, stdout=PIPE, stderr=drop)
            (stdout, _) = self.child.communicate()
            exp = (expected_output + '\n').encode('ascii')
            if self.child.returncode == 0 and stdout == exp:
                if self.args.print_user:
                    print(self.user)
                return True
            if self.args.cloud_user:
                # root login was refused; cloud-init may tell us which
                # user to try instead on the next attempt
                match = self.re_clouduser.search(str(stdout))
                if match:
                    self.user = match.group(1)
                    log.info('cloud user switched to ' + self.user)
            return False

    def run(self):
        # retry once per second until success or kill()
        while True:
            if self.loop():
                # Success!
                break
            if self.event.wait(1):
                log.debug("stopping per kill event")
                break

    def kill(self):
        """Stop the thread: signal the event, kill any running ssh child,
        and wait for run() to finish."""
        self.event.set()
        # Best effort kill.
        try:
            self.child.kill()
        except:
            pass
        self.join()
# Command-line interface (module level; main() calls parser.parse_args()).
parser = ArgumentParser(
    description="Wait till the host's ssh becomes responsive.")
parser.add_argument('host', help='hostname or IP')
parser.add_argument('--timeout',
    help='seconds to wait before failure, default=indefinitely',
    default=None, type=float)
parser.add_argument('--check-cloud-user', action='store_true', default=False,
    dest='cloud_user',
    help='if cloud-init disallows "root" login, try to detect the cloud ' \
        +'user and use that')
parser.add_argument('--print-user', action='store_true', default=False,
    dest='print_user',
    help='print the username which succeeded to connect on stdout')
parser.add_argument('--log', default=False,
    dest='log_verbosity',
    help='set the threshold for logging, e.g. debug, info, error, ...')
def main():
    """Poll the host until ssh responds (exit 0), the timeout elapses or
    the user interrupts (exit 1)."""
    sleep_period = 1.0
    args = parser.parse_args()
    if args.log_verbosity:
        log.setLevel(logging.getLevelName(args.log_verbosity.upper()))

    def timeouted():
        # Count down args.timeout in sleep_period steps; None waits forever.
        if args.timeout is None:
            return False
        log.debug("wait {0}s, remains {1}s".format(sleep_period, args.timeout))
        args.timeout -= sleep_period
        return args.timeout <= 0

    checker = Checker()
    checker.args = args
    checker.start()
    try:
        # threading.join() is not Ctrl-C interruptable :( in python2, so we need
        # this ugly infinite loop.
        # https://stackoverflow.com/questions/25676835/signal-handling-in-multi-threaded-python
        while True:
            checker.join(sleep_period)
            if not checker.is_alive():
                # Success!
                return 0
            if timeouted():
                log.error("timeout!")
                checker.kill()
                return 1
    except KeyboardInterrupt:
        log.error("interrupt by user")
        checker.kill()
        return 1

if __name__ == "__main__":
    exit(main())

View file

@ -136,6 +136,12 @@
# tags:
# - provision_config
# Resalloc-based aarch64 spawning; only enabled on staging for now.
- name: resalloc
  import_tasks: resalloc.yml
  when: devel
  tags:
  - resalloc
- name: put ansible.cfg for all this into /etc/ansible/ on the system
copy: src="provision/ansible.cfg" dest=/etc/ansible/ansible.cfg
tags:

View file

@ -0,0 +1,55 @@
- name: install packages needed by resalloc server
  dnf:
    state: present
    name: ['resalloc-server', 'resalloc', 'sqlite']

# Task names containing a colon must be quoted to stay valid YAML.
- name: "resalloc: sync copr provisioning files"
  synchronize: src="provision/" dest="/var/lib/resallocserver/provision/"
  tags:
  - provision_config

- name: "resalloc: sync resalloc provisioning files"
  synchronize: src="resalloc_provision/" dest="/var/lib/resallocserver/resalloc_provision/"
  tags:
  - provision_config

- name: "resalloc: ssh directory"
  file:
    path: /var/lib/resallocserver/.ssh
    state: directory
    mode: 0700
    owner: resalloc
    group: resalloc

- name: "resalloc: copy backend ssh identity"
  copy:
    src: "{{ private }}/files/copr/buildsys.priv"
    dest: /var/lib/resallocserver/.ssh/id_rsa
    owner: resalloc
    group: resalloc
    mode: 0600

- name: "resalloc: ssh config file"
  copy:
    src: "ssh_config"
    dest: /var/lib/resallocserver/.ssh/config
    owner: resalloc
    group: resalloc
    mode: 0600

- name: "resalloc: server config"
  template:
    src: "resalloc/{{ item }}"
    dest: "/etc/resallocserver/{{ item }}"
    mode: 0600
    owner: resalloc
    group: resalloc
  with_items:
  - server.yaml
  - pools.yaml

# FIX: 'service:' was emitted as a separate list item ('- service:'),
# detached from its '- name:' -- that task would have no module at all.
- name: start/enable resalloc server
  service:
    name: resalloc
    state: started
    enabled: yes

View file

@ -26,7 +26,12 @@ frontend_auth={{ copr_backend_password }}
# vm_max_check_fails=2 - when machine is consequently X times marked as failed then it is terminated
# vm_terminating_timeout=600 - when machine was terminated and terminate PB did not finish within this number of second, we will run the PB once again.
{% if devel %}
# TODO: swap to 3 also on production
# staging enables the third build group (aarch64, group2 below)
build_groups=3
{% else %}
build_groups=2
{% endif %}
group0_name=PC
group0_archs=i386,x86_64,i586
@ -61,6 +66,25 @@ group1_max_vm_total=8
group1_max_spawn_processes=2
{% endif %}
# Build group 2: aarch64 builders allocated through the resalloc server
group2_name=AARCH64
group2_archs=aarch64
group2_spawn_playbook=/home/copr/provision/builderpb_libvirt_aarch64.yml
group2_terminate_playbook=/home/copr/provision/terminatepb_libvirt_aarch64.yml
group2_vm_health_check_period=30
group2_vm_health_check_max_time=120
# TODO: switch devel/production
{% if devel %}
group2_max_vm_per_user=4
group2_max_vm_total=8
# we can not over-load hypervisors, there's max-spawn limit in resalloc config
group2_max_spawn_processes=8
{% else %}
group2_max_vm_per_user=2
group2_max_vm_total=4
group2_max_spawn_processes=4
{% endif %}
# directory where results are stored
# should be accessible from web using 'results_baseurl' URL
# no default

View file

@ -0,0 +1,50 @@
# TODO: s/devel/not devel/
{% if devel %}
# Production configuration. On each aarch64 host we have
# 2 guests VMs of
# - 10 VCPUs
# - 80 GB SWAP
# - 24 GB RAM
# - 8 GB root partition (it's sparse qcow2, so thin-provisioning)
aarch64_01_prod:
  max: 4
  max_starting: 2
  max_prealloc: 4
  cmd_new: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-new --swap-vol-size 80 --cpu-count 10 --ram-size 20480"
  cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-delete"
  tags:
  - aarch64

aarch64_02_prod:
  max: 4
  max_starting: 2
  max_prealloc: 4
  cmd_new: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-new --swap-vol-size 80 --cpu-count 10 --ram-size 20480"
  cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-delete"
  tags:
  - aarch64
{% else %}
# Development configuration. On each aarch64 host we have
# 2 guests vms of
# - 2 VCPUs (default)
# - 20 GB SWAP (default)
# - 4 GB RAM (default)
# - 8 GB root partition (it's sparse qcow2, so thin-provisioning)
aarch64_01_dev:
  max: 2
  max_starting: 2
  max_prealloc: 2
  cmd_new: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-new"
  cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-delete"
  tags:
  - aarch64

aarch64_02_dev:
  max: 2
  max_starting: 2
  max_prealloc: 2
  cmd_new: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-new"
  cmd_delete: "/var/lib/resallocserver/resalloc_provision/vm-aarch64-delete"
  tags:
  - aarch64
{% endif %}

View file

@ -0,0 +1,6 @@
# resalloc-server configuration
db_url: 'sqlite:////var/lib/resallocserver/db.sqlite'
logdir: '/var/log/resallocserver'
# Listen only on localhost!
hostname: 'localhost'
#port: 49100
loglevel: debug