distgit-bugzilla-sync/distgit_bugzilla_sync/script.py
# -*- coding: utf-8 -*-
#
# Copyright © 2013-2019 Red Hat, Inc.
#
# This copyrighted material is made available to anyone wishing to use, modify,
# copy, or redistribute it subject to the terms and conditions of the GNU
# General Public License v.2, or (at your option) any later version. This
# program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY expressed or implied, including the implied warranties of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details. You should have received a copy of the GNU
# General Public License along with this program; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA. Any Red Hat trademarks that are incorporated in the source
# code or documentation are not subject to the GNU General Public License and
# may only be used or replicated with the express permission of Red Hat, Inc.
#
# Red Hat Author(s): Toshio Kuratomi <tkuratom@redhat.com>
# Author(s): Mike Watters <valholla75@fedoraproject.org>
# Author(s): Pierre-Yves Chibon <pingou@pingoured.fr>
# Author(s): Matt Prahl <mprahl@redhat.com>
# Author(s): Ralph Bean <rbean@redhat.com>
# Author(s): Patrick Uiterwijk <puiterwijk@redhat.com>
#
'''
Sync information from Pagure into bugzilla.
This ... script takes information about package ownership and imports it
into bugzilla.
'''
import argparse
import datetime
from email.message import EmailMessage
import itertools
import json
from operator import itemgetter
import os
import re
import smtplib
import sys
import time
import traceback
import xmlrpc.client
from bugzilla import Bugzilla
import dogpile.cache
import fedora.client
from fedora.client.fas2 import AccountSystem
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import yaml
from . import package_summary
from .config import config, email_overrides, load_configuration
cache = dogpile.cache.make_region().configure(
'dogpile.cache.memory',
expiration_time=3600,
)
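# In-memory dogpile cache with a one-hour TTL; used further down via
# @cache.cache_on_arguments() to memoize the per-project override lookups
# done by _get_override_yaml().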
def retry_session():
session = requests.Session()
retry = Retry(
total=5,
read=5,
connect=5,
backoff_factor=0.3,
status_forcelist=(500, 502, 504),
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
return session
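# Illustrative usage (the URL is hypothetical):
#   session = retry_session()
#   response = session.get("https://example.org/some.json", timeout=60)
# Connection errors and 500/502/504 responses are retried up to five times
# with a small exponential backoff before the error propagates.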
def resilient_partial(fn, *initial, **kwargs):
""" A decorator that partially applies arguments.
It additionally catches all raised exceptions, prints them, but then returns
None instead of propagating the failures.
This is used to protect functions used in a threadpool. If one fails, we
want to know about it, but we don't want it to kill the whole program. So
catch its error, log it, but proceed.
"""
def wrapper(*additional):
try:
full = initial + additional
return fn(*full, **kwargs)
except Exception:
traceback.print_exc()
return None
wrapper.__name__ = fn.__name__
wrapper.__doc__ = fn.__doc__
return wrapper
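# Illustrative example (hypothetical call, not used by this script):
#   safe_get = resilient_partial(requests.get, timeout=30)
#   data = safe_get("https://example.org")   # returns None if the call raised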
class DataChangedError(Exception):
'''Raised when data we are manipulating changes while we're modifying it.'''
pass
def segment(iterable, chunk, fill=None):
'''Collect data into ``chunk``-sized blocks, padding the last one with ``fill``'''
args = [iter(iterable)] * chunk
return itertools.zip_longest(*args, fillvalue=fill)
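# For example (illustrative): segment([1, 2, 3, 4, 5], 2) yields the chunks
# (1, 2), (3, 4) and (5, None); the last chunk is padded with the fill value.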
class BugzillaProxy:
def __init__(self, bzServer, username, password, config):
self.bzXmlRpcServer = bzServer
self.username = username
self.password = password
self.server = Bugzilla(
url=self.bzXmlRpcServer,
user=self.username,
password=self.password)
self.productCache = {}
# Connect to the fedora account system
self.fas = AccountSystem(
base_url=config['fas']['url'],
username=config['fas']['username'],
password=config['fas']['password'])
self.config = config
try:
self.userCache = self.fas.people_by_key(
key='username',
fields=['bugzilla_email'])
except fedora.client.ServerError:
# Sometimes, building the userCache up front fails with a timeout.
# It's ok, we build the cache as-needed later in the script.
self.userCache = {}
def build_product_cache(self, pagure_project):
""" Cache the bugzilla info about each package in each product.
"""
products = {}
if self.config['bugzilla']['compat_api'] == 'getcomponentsdetails':
# Old API -- in python-bugzilla. But with current server, this
# gives ProxyError
for collection, product in self.config["products"].items():
self.productCache[collection] = self.server.getcomponentsdetails(product)
elif self.config['bugzilla']['compat_api'] == 'component.get':
# Undocumented in the partner-bugzilla API, but this way currently
# works
products = {}
for collection, product in self.config["products"].items():
# restrict the list of info returned to only the packages of
# interest
pkglist = [
project["name"]
for project in pagure_project
if product in project["products"]
]
for pkg_segment in segment(pkglist, self.config['bugzilla']['req_segment']):
# Build the query in the format bugzilla expects and strip the
# None values that segment() pads the final chunk with
query = [
dict(
product=self.config['products'][collection],
component=p
)
for p in pkg_segment
if p is not None
]
raw_data = self.server._proxy.Component.get(dict(names=query))
for package in raw_data['components']:
# Reformat data to be the same as what's returned from
# getcomponentsdetails
product = dict(
initialowner=package['default_assignee'],
description=package['description'],
initialqacontact=package['default_qa_contact'],
initialcclist=package['default_cc']
)
products[package['name'].lower()] = product
self.productCache[collection] = products
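# After this runs, productCache maps each collection to a dict keyed by the
# lowercased package name, with values shaped roughly like (illustrative):
#   {'initialowner': ..., 'description': ..., 'initialqacontact': ...,
#    'initialcclist': [...]}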
def _get_bugzilla_email(self, username):
'''Return the bugzilla email address for a user or group.
First looks in the username => bugzilla email cache. On a cache miss,
queries FAS for that user (or, for names starting with '@', for that
group's mailing list) and stores the result in the cache.
'''
try:
return self.userCache[username]['bugzilla_email'].lower()
except KeyError:
if username.startswith('@'):
group = self.fas.group_by_name(username[1:])
bz_email = group.mailing_list
if bz_email is None:
return
self.userCache[username] = {
'bugzilla_email': bz_email}
else:
person = self.fas.person_by_username(username)
bz_email = person.get('bugzilla_email', None)
if bz_email is None:
return
self.userCache[username] = {'bugzilla_email': bz_email}
return self.userCache[username]['bugzilla_email'].lower()
def add_edit_component(self, package, collection, owner, description=None,
qacontact=None, cclist=None):
'''Add or update a component to have the values specified.
'''
# Turn the cclist into something usable by bugzilla
initialCCList = []
for watcher in cclist:
bz_email = self._get_bugzilla_email(watcher)
if bz_email:
initialCCList.append(bz_email)
else:
print(f"** {watcher} has no bugzilla_email or mailing_list set "
f"({collection}/{package}) **")
# Add owner to the cclist so comaintainers taking over a bug don't
# have to do this manually
owner = self._get_bugzilla_email(owner)
if owner not in initialCCList:
initialCCList.append(owner)
# Lookup product
try:
product = self.productCache[collection]
except xmlrpc.client.Fault as e:
# Output something useful in args
e.args = (e.faultCode, e.faultString)
raise
except xmlrpc.client.ProtocolError as e:
e.args = ('ProtocolError', e.errcode, e.errmsg)
raise
pkgKey = package.lower()
if pkgKey in product:
# edit the package information
data = {}
# Grab the bugzilla email for the fields changeable via xmlrpc
if qacontact:
qacontact = self._get_bugzilla_email(qacontact)
else:
qacontact = 'extras-qa@fedoraproject.org'
# Check for changes to the owner, qacontact, or description
if product[pkgKey]['initialowner'] != owner:
data['initialowner'] = owner
if description and product[pkgKey]['description'] != description:
data['description'] = description
if qacontact and product[pkgKey]['initialqacontact'] != qacontact:
data['initialqacontact'] = qacontact
if len(product[pkgKey]['initialcclist']) != len(initialCCList):
data['initialcclist'] = initialCCList
else:
for ccMember in product[pkgKey]['initialcclist']:
if ccMember not in initialCCList:
data['initialcclist'] = initialCCList
break
if data:
# Changes occurred. Submit a request to change via xmlrpc
data['product'] = self.config['products'][collection]
data['component'] = package
if self.config["verbose"]:
print('[EDITCOMP] %s/%s' % (data["product"], data["component"]))
for key in ["initialowner", "description", "initialqacontact", "initialcclist"]:
if data.get(key):
print(f" {key} changed from {product[pkgKey][key]} "
f"to {data.get(key)}")
# FIXME: initialowner has been made mandatory for some
# reason. Asking dkl why.
data['initialowner'] = owner
if not self.config["dryrun"]:
try:
self.server.editcomponent(data)
except xmlrpc.client.Fault as e:
# Output something useful in args
e.args = (data, e.faultCode, e.faultString)
raise
except xmlrpc.client.ProtocolError as e:
e.args = ('ProtocolError', e.errcode, e.errmsg)
raise
else:
# Add component
if qacontact:
qacontact = self._get_bugzilla_email(qacontact)
else:
qacontact = 'extras-qa@fedoraproject.org'
data = {
'product': self.config['products'][collection],
'component': package,
'description': description or 'NA',
'initialowner': owner,
'initialqacontact': qacontact
}
if initialCCList:
data['initialcclist'] = initialCCList
if self.config["verbose"]:
print('[ADDCOMP] %s/%s' % (data["product"], data["component"]))
for key in ["initialowner", "description", "initialqacontact", "initialcclist"]:
if data.get(key):
print(f" {key} set to {data.get(key)}")
if not self.config["dryrun"]:
try:
self.server.addcomponent(data)
except xmlrpc.client.Fault as e:
# Output something useful in args
e.args = (data, e.faultCode, e.faultString)
raise
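# The payload handed to editcomponent()/addcomponent() above looks roughly
# like this (illustrative values):
#   {'product': 'Fedora', 'component': 'guake', 'description': '...',
#    'initialowner': 'someone@example.com',
#    'initialqacontact': 'extras-qa@fedoraproject.org',
#    'initialcclist': ['a@example.com', 'b@example.com']}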
def send_email(fromAddress, toAddress, subject, message, ccAddress=None):
'''Send an email if there's an error.
This will be replaced by sending messages to a log later.
'''
# Compare the environment *name*; ``env`` itself is the config dict.
if envname == 'staging':
# Send no email in staging...
pass
else:
msg = EmailMessage()
msg.add_header('To', ','.join(toAddress))
msg.add_header('From', fromAddress)
msg.add_header('Subject', subject)
if ccAddress is not None:
msg.add_header('Cc', ','.join(ccAddress))
toAddress = toAddress + ccAddress
msg.set_payload(message)
smtp = smtplib.SMTP('bastion')
smtp.sendmail(fromAddress, toAddress, msg.as_string())
smtp.quit()
def _get_pdc_branches(session, repo):
"""
Gets the branches on a project. This function is used for mapping.
:param repo: the project dict
:return: a list of the repo's branches
"""
branches_url = '{0}component-branches/'.format(env['pdc_url'])
params = dict(
global_component=repo['name'],
type=env['pdc_types'][repo['namespace']]
)
if config["verbose"]:
print('Querying {0} {1}'.format(branches_url, params))
rv = session.get(branches_url, params=params, timeout=60)
# If the project's branches can't be reported, just return no branches and
# it will be skipped later on
if not rv.ok:
print(('The connection to "{0}" failed with the status code {1} and '
'output "{2}"'.format(branches_url, rv.status_code, rv.text)),
file=sys.stderr)
return []
data = rv.json()
return [branch['name'] for branch in data['results']]
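# The PDC answer is expected to be JSON shaped roughly like (illustrative):
#   {'results': [{'name': 'rawhide', ...}, {'name': 'f31', ...}]}
# and only the branch names are kept.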
def _is_retired(product, project):
branches = project['branches']
if product == 'Fedora EPEL':
for branch, active in branches:
if re.match(r'^epel\d+$', branch):
if active:
return False
# No active branches means it is retired.
return True
else:
for branch, active in branches:
if active:
return False
return True
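# For example (illustrative data): a project with
#   branches = [('epel7', False), ('f31', True)]
# is retired for 'Fedora EPEL' (no active epel branch) but not for 'Fedora'.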
class ScriptExecError(RuntimeError):
def __init__(self, *args, **kwargs):
self.errorcode = kwargs.pop('errorcode', 1)
super().__init__(*args, **kwargs)
class DistgitBugzillaSync:
def notify_users(self, errors):
''' Go through the list of errors and, whenever the user's email address
can be extracted from the error message, use it to notify them about the
issue.
'''
data = {}
if os.path.exists(self.env['data_cache']):
try:
with open(self.env['data_cache']) as stream:
data = json.load(stream)
except Exception as err:
print('Could not read the json file at %s: \nError: %s' % (
self.env['data_cache'], err))
new_data = {}
seen = []
for error in errors:
notify_user = False
if 'The name ' in error and ' is not a valid username' in error:
user_email = error.split(' is not a valid username')[0].split(
'The name ')[1].strip()
now = datetime.datetime.utcnow()
# See if we already know about this user
if user_email in data and data[user_email]['last_update']:
last_update = datetime.datetime.fromtimestamp(
int(data[user_email]['last_update']))
# Only notify users once per hour
# Use total_seconds() so gaps longer than a day also count
if (now - last_update).total_seconds() >= 3600:
notify_user = True
else:
new_data[user_email] = data[user_email]
elif not data or user_email not in data:
notify_user = True
# Ensure we notify the user only once, no matter how many errors we
# got concerning them.
if user_email not in seen:
seen.append(user_email)
else:
notify_user = False
if notify_user:
send_email(
self.env['email_from'],
[user_email],
subject='Please fix your bugzilla.redhat.com account',
message=self.env['tmpl_user_email'],
ccAddress=self.env['notify_emails'],
)
new_data[user_email] = {
'last_update': time.mktime(now.timetuple())
}
with open(self.env['data_cache'], 'w') as stream:
json.dump(new_data, stream)
def get_cli_arguments(self):
""" Set the CLI argument parser and return the argument parsed.
"""
parser = argparse.ArgumentParser(
description='Script syncing information between Pagure and bugzilla'
)
parser.add_argument(
'--dry-run', dest='dryrun', action='store_true', default=False,
help='Do not actually make any changes - Overrides the configuration')
parser.add_argument(
'--verbose', dest='verbose', action='store_true', default=False,
help='Print actions verbosely - Overrides the configuration')
parser.add_argument(
'--debug', dest='debug', action='store_true', default=False,
help='Combination of --verbose and --dry-run')
parser.add_argument(
'--env', dest='env',
help='Run the script for a specific environment, overrides the one '
'set in the configuration file')
parser.add_argument('--add-config-file', metavar='CONFIG_FILE',
dest='addl_config_files', action='append',
help="File(s) from which to read overriding configuration")
parser.add_argument('--add-email-overrides-file', metavar='EMAIL_OVERRIDES_FILE',
dest='addl_email_overrides_files', action='append',
help="File(s) from which to read additional email overrides")
parser.add_argument(
'-p', '--project', dest='projects', nargs='+',
help='Update one or more projects (provided as namespace/name, '
'for example: rpms/guake), in all of their products')
self.args = parser.parse_args()
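# Example invocation (the executable name is illustrative):
#   distgit-bugzilla-sync --dry-run --verbose -p rpms/koji rpms/guake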
def get_pagure_project(self, project_list=None):
""" Builds a large list of all the projects on pagure.
Each item in that list is a dict containing:
- the namespace of the project
- the name of the project
- the point of contact of this project (ie: the default assignee
in bugzilla)
- the watchers of this project (ie: the initial CC list in bugzilla)
"""
# Get the initial ownership and CC data from pagure
# This part is easy.
poc_url = self.env['distgit_url'] + '/extras/pagure_poc.json'
if self.env["verbose"]:
print("Querying %r for points of contact." % poc_url)
pagure_namespace_to_poc = self.session.get(poc_url, timeout=120).json()
cc_url = self.env['distgit_url'] + '/extras/pagure_bz.json'
if self.env["verbose"]:
print("Querying %r for initial cc list." % cc_url)
pagure_namespace_to_cc = self.session.get(cc_url, timeout=120).json()
# Combine and collapse those two into a single list:
self.pagure_projects = []
if project_list:
project_list = set(tuple(p.split("/", 1)) for p in project_list)
for namespace, entries in pagure_namespace_to_poc.items():
for name, poc in entries.items():
if not project_list or (namespace, name) in project_list:
self.pagure_projects.append(dict(
namespace=namespace,
name=name,
poc=poc,
watchers=pagure_namespace_to_cc[namespace][name],
))
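# Each entry of self.pagure_projects now looks roughly like (illustrative
# values; the watchers structure mirrors whatever pagure_bz.json serves):
#   {'namespace': 'rpms', 'name': 'guake', 'poc': 'pingou', 'watchers': [...]}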
def add_branches_product_and_summary(self):
""" For each project retrieved this method adds branches, products
and summary information.
The branches are retrieved from PDC
The products are determined based on the branches.
The summary is coming from the primary.xml file from the repodata
of the rawhide repository in koji.
"""
branches_url = "/".join([
self.env['pdc_url'].split('rest_api')[0].rstrip("/"),
'extras/active_branches.json',
])
if self.env["verbose"]:
print("Querying %r for EOL information." % branches_url)
pdc_branches = self.session.get(branches_url, timeout=120).json()
for idx, project in enumerate(self.pagure_projects):
# Summary
summary = None
if project["namespace"] == "rpms":
summary = self.rpm_summary.get(project["name"])
project["summary"] = summary
# Branches
if project['namespace'] not in self.env['pdc_types']:
project['branches'] = []
project['products'] = []
if self.env["verbose"]:
print(
f'! Namespace {project["namespace"]} not found in the pdc_type '
f'configuration key, project {project["namespace"]}/{project["name"]} '
'ignored'
)
continue
pdc_type = self.env['pdc_types'][project['namespace']]
project['branches'] = pdc_branches.get(pdc_type, {}).get(project['name'], [])
if not project['branches']:
if self.env["verbose"]:
print(f"! No PDC branch found for {project['namespace']}/{project['name']}")
# Products
products = set()
for branch, active in project.get('branches'):
if re.match(r'^epel\d+$', branch):
products.add('Fedora EPEL')
else:
products.add(self.env['namespace_to_product'][project['namespace']])
project['products'] = list(products)
self.pagure_projects[idx] = project
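# Each project now additionally carries 'summary', 'branches' (pairs of
# branch name and an "active" flag, as served by PDC) and 'products'
# (e.g. 'Fedora', 'Fedora EPEL') -- the example values are illustrative.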
@cache.cache_on_arguments()
def _get_override_yaml(self, project, session):
pagure_override_url = '{0}/{1}/raw/master/f/{2}/{3}'.format(
self.env['pagure_url'].rstrip('/'),
self.env['bugzilla']['override_repo'],
project['namespace'],
project['name'],
)
if self.env["verbose"]:
print('Querying {0}'.format(pagure_override_url))
override_rv = session.get(pagure_override_url, timeout=30)
if override_rv.status_code == 200:
override_yaml = yaml.safe_load(override_rv.text)
return override_yaml.get('bugzilla_contact', {})
return {}
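# The override file is expected to contain YAML along these lines
# (illustrative usernames):
#   bugzilla_contact:
#     Fedora: someuser
#     Fedora EPEL: someotheruser
# i.e. a product name => FAS username mapping under 'bugzilla_contact'.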
@classmethod
def main(cls):
"""The entrypoint for running the script."""
dbs = cls()
try:
dbs.run()
except ScriptExecError as e:
print(str(e), file=sys.stderr)
sys.exit(e.errorcode)
else:
sys.exit(0)
def run(self):
"""Run the script."""
global envname, env, projects_dict
times = {
"start": time.time(),
}
self.get_cli_arguments()
load_configuration(addl_config_files=self.args.addl_config_files,
addl_email_overrides_files=self.args.addl_email_overrides_files)
self.config = config
envname = self.config['environment']
if self.args.env:
if self.args.env in self.config['environments']:
envname = self.args.env
else:
raise ScriptExecError(f"Invalid environment specified: {self.args.env}")
self.env = self.config['environments'][envname]
env = self.env  # also set the module-level ``env`` used by the global helpers
if self.args.debug:
self.env["verbose"] = True
self.env["dryrun"] = True
if self.args.verbose:
self.env["verbose"] = True
if self.args.dryrun:
self.env["dryrun"] = True
# Non-fatal errors to alert people about
errors = []
self.session = retry_session()
if self.env["verbose"]:
print("Building a cache of the rpm packages' summary")
self.rpm_summary = package_summary.get_package_summary()
self.get_pagure_project(self.args.projects)
self.add_branches_product_and_summary()
if self.env["verbose"]:
print(f"{len(self.pagure_projects)} projects to consider")
if not self.pagure_projects:
return
if self.env["verbose"]:
times["data structure end"] = time.time()
delta = times["data structure end"] - times["start"]
print("Ran for %s seconds -- ie: %.2f minutes" % (delta, delta/60.0))
print("Building FAS' cache")
# Initialize the connection to bugzilla
bugzilla = BugzillaProxy(self.env['bugzilla']['url'],
self.env['bugzilla']['user'],
self.env['bugzilla']['password'],
self.env)
if self.env["verbose"]:
times["FAS cache building end"] = time.time()
delta = times["FAS cache building end"] - times["data structure end"]
print(f"Ran for {delta} seconds -- ie: {delta/60} minutes")
if self.env["dryrun"]:
print("Querying bugzilla but not doing anything")
else:
print("Updating bugzilla")
bugzilla.build_product_cache(self.pagure_projects)
for project in sorted(self.pagure_projects, key=itemgetter('name')):
for product in project["products"]:
if product not in self.env['products']:
if self.env["verbose"]:
print(f"Ignoring: {product}/{project['name']}")
continue
owner = project["poc"]
# Check if the project is retired in PDC, and if so set assignee to orphan.
if _is_retired(product, project):
owner = 'orphan'
# Check if the Bugzilla ticket assignee has been manually overridden
override_yaml = self._get_override_yaml(project, self.session)
if override_yaml.get(product) \
and isinstance(override_yaml[product], str):
owner = override_yaml[product]
try:
bugzilla.add_edit_component(
package=project["name"],
collection=product,
owner=owner,
description=project['summary'],
qacontact=None,
cclist=project['watchers']
)
except ValueError as e:
# A username didn't have a bugzilla address
errors.append(str(e.args))
except DataChangedError as e:
# A Package or Collection was returned via xmlrpc but wasn't
# present when we tried to change it
errors.append(str(e.args))
except xmlrpc.client.ProtocolError as e:
# Unrecoverable and likely means that nothing is going to
# succeed.
errors.append(str(e.args))
break
except xmlrpc.client.Error as e:
# An error occurred in the xmlrpc call. Shouldn't happen, but
# we'd better see what it is
errors.append('%s -- %s' % (project["name"], e.args[-1]))
# Send notification of errors
if errors:
if self.env["verbose"] or self.env["dryrun"]:
print('[DEBUG]', '\n'.join(errors))
else:
self.notify_users(errors)
send_email(
self.env['email_from'],
self.env['notify_emails'],
'Errors while syncing bugzilla with the PackageDB',
self.env['tmpl_admin_email'].format(errors='\n'.join(errors))
)
else:
with open(self.env['data_cache'], 'w') as stream:
json.dump({}, stream)
if self.env["verbose"]:
times["end"] = time.time()
print(" ----------")
print("Building the data structure")
delta = times["data structure end"] - times["start"]
print(f" Ran on {delta:.2f} seconds -- ie: {delta/60:.2f} minutes")
print("Building the FAS cache")
delta = times["FAS cache building end"] - times["data structure end"]
print(f" Ran on {delta:.2f} seconds -- ie: {delta/60:.2f} minutes")
print("Interacting with bugzilla")
delta = times["end"] - times["FAS cache building end"]
print(f" Ran on {delta:.2f} seconds -- ie: {delta/60:.2f} minutes")
print("Total")
delta = times["end"] - times["start"]
print(f" Ran on {delta:.2f} seconds -- ie: {delta/60:.2f} minutes")
if __name__ == '__main__':
DistgitBugzillaSync.main()