Use multithread on Pagure and PDC queries

This commit is contained in:
Matt Prahl 2017-08-18 18:40:25 +00:00 committed by Ralph Bean
parent 11ace40beb
commit 27a90039fe

View file

@ -27,7 +27,7 @@ sync information from the Pagure into bugzilla
This short script takes information about package ownership and imports it This short script takes information about package ownership and imports it
into bugzilla. into bugzilla.
''' '''
from __future__ import print_function
import re import re
import argparse import argparse
import datetime import datetime
@ -39,6 +39,9 @@ import json
import xmlrpclib import xmlrpclib
import codecs import codecs
import smtplib import smtplib
import multiprocessing.pool
from math import ceil
from functools import partial
try: try:
from email.Message import Message from email.Message import Message
except ImportError: except ImportError:
@ -483,8 +486,10 @@ def _get_watchers_rv_json(pagure_project):
print('Querying {0}'.format(watchers_api_url)) print('Querying {0}'.format(watchers_api_url))
watchers_rv = session.get(watchers_api_url, timeout=60) watchers_rv = session.get(watchers_api_url, timeout=60)
if not watchers_rv.ok: if not watchers_rv.ok:
error_msg = base_error_msg.format( error_msg = ('The connection to "{0}" failed with the status code {1} '
watchers_api_url, watchers_rv.status_code, watchers_rv.text) 'and output "{2}"'.format(
watchers_api_url, watchers_rv.status_code,
watchers_rv.text))
raise RuntimeError(error_msg) raise RuntimeError(error_msg)
return watchers_rv.json() return watchers_rv.json()
@ -507,8 +512,10 @@ def _is_retired_in_pdc(product, project):
raise RuntimeError("Could not find %r in PDC." % project) raise RuntimeError("Could not find %r in PDC." % project)
branches = pdc_rv.json()['results'] branches = pdc_rv.json()['results']
if not branches: if not branches:
raise RuntimeError("No results for %r in PDC." % project) print("No results for %r in PDC." % project, file=sys.stderr)
return branches[0]['active'] # Assume it's not retired if we can't find out for sure
return False
return not branches[0]['active']
@cache.cache_on_arguments() @cache.cache_on_arguments()
@ -526,18 +533,100 @@ def _get_override_yaml(project):
return {} return {}
def pagure_project_to_acl_schema(pagure_project, product): @cache.cache_on_arguments()
def _get_package_summary_from_mdapi(namespace, repo, session=None):
summary = None
if namespace != 'rpms':
return summary
if session is None:
session = retry_session()
url = '{0}/rawhide/srcpkg/{1}'.format(MDAPIURL.rstrip('/'), repo)
if DRY_RUN:
print('Querying {0}'.format(url))
rv = session.get(url, timeout=60)
if rv.ok:
rv_json = rv.json()
summary = rv_json['summary']
elif not rv.ok and rv.status_code != 404:
error_msg = ('The connection to "{0}" failed with the status code {1} '
'and output "{2}"').format(url, rv.status_code, rv.text)
raise RuntimeError(error_msg)
return summary
def _get_pdc_project_name_and_branches(session, namespace, repo):
    """
    Gets the branches on a project. This function is used for mapping.
    :param session: the session object whose get() is used to query PDC
    :param namespace: string of the namespace the project is in
    :param repo: string of the project
    :return: a tuple with the repo name and a list of the repo's branches
    """
    branches_url = '{0}component-branches/'.format(PDCURL)
    params = dict(
        global_component=repo,
        type=PDC_TYPES[namespace]
    )
    if DRY_RUN:
        print('Querying {0} {1}'.format(branches_url, params))
    # The filters have to be sent with the request. Without them the query is
    # unfiltered and the branches returned are not specific to this repo.
    rv = session.get(branches_url, params=params, timeout=60)

    # If the project's branches can't be reported, just return no branches and
    # it will be skipped later on
    if not rv.ok:
        print(('The connection to "{0}" failed with the status code {1} and '
               'output "{2}"'.format(branches_url, rv.status_code, rv.text)),
              file=sys.stderr)
        return repo, []

    data = rv.json()
    return repo, [branch['name'] for branch in data['results']]
def _get_pagure_projects_from_page(session, namespace, page):
    """
    Fetches one page (100 entries per page, forks excluded) of the Pagure
    project listing for a namespace. This function is meant to be mapped over
    page numbers.
    :param session: the session object whose get() is used to query Pagure
    :param namespace: string of the namespace to query for projects
    :param page: int of the page to query at
    :return: the "projects" value of the JSON returned for that page
    :raises RuntimeError: when the response evaluates as false
    """
    url_template = ('{0}/api/0/projects?namespace={1}&page={2}&per_page=100&'
                    'fork=false')
    url = url_template.format(
        PAGURE_DIST_GIT_URL.rstrip('/'), namespace, page)
    if DRY_RUN:
        print('- Querying {0}'.format(url))

    response = session.get(url, timeout=120)
    if response:
        return response.json()['projects']

    # Report the failure on stderr as well as raising it
    requested_url = response.request.url
    print("Failed to talk to %r %r." % (requested_url, response),
          file=sys.stderr)
    raise RuntimeError('Failed to talk to {0} {1}.'.format(
        requested_url, response))
def _pagure_project_to_acl_schema(project_and_product, session=None):
""" """
This function translates the JSON of a Pagure project to what PkgDB used to This function translates the JSON of a Pagure project to what PkgDB used to
output in the Bugzilla API. output in the Bugzilla API. This function is used for mapping.
:param pagure_project: a dictionary of the JSON of a Pagure project :param project_and_product: a tuple containing the dictionary of the JSON
:return: a dictionary of the content that the Bugzilla API would output of a Pagure project and a string of the product (e.g. "Fedora",
"Fedora EPEL")
:param session: a requests session object or None
:return: a dictionary of the content that the PkgDB Bugzilla API would
return
""" """
project, product = project_and_product
if session is None:
session = retry_session() session = retry_session()
base_error_msg = ('The connection to "{0}" failed with the status code '
'{1} and output "{2}"')
watchers_rv_json = _get_watchers_rv_json(pagure_project) watchers_rv_json = _get_watchers_rv_json(project)
user_cc_list = [] user_cc_list = []
for user, watch_levels in watchers_rv_json['watchers'].items(): for user, watch_levels in watchers_rv_json['watchers'].items():
@ -545,28 +634,15 @@ def pagure_project_to_acl_schema(pagure_project, product):
if 'issues' in watch_levels: if 'issues' in watch_levels:
user_cc_list.append(user) user_cc_list.append(user)
summary = None summary = _get_package_summary_from_mdapi(
if pagure_project['namespace'] == 'rpms': project['namespace'], project['name'], session)
mdapi_url = '{0}/rawhide/srcpkg/{1}'.format(
MDAPIURL.rstrip('/'), pagure_project['name'])
if DRY_RUN:
print('Querying {0}'.format(mdapi_url))
mdapi_rv = session.get(mdapi_url, timeout=60)
if mdapi_rv.ok:
mdapi_rv_json = mdapi_rv.json()
summary = mdapi_rv_json['summary']
elif not mdapi_rv.ok and mdapi_rv.status_code != 404:
error_msg = base_error_msg.format(
mdapi_url, mdapi_rv.status_code, mdapi_rv.text)
raise RuntimeError(error_msg)
# Check if the branch is retired in PDC, and if so set assignee to orphan. # Check if the project is retired in PDC, and if so set assignee to orphan.
owner = pagure_project['access_users']['owner'][0] owner = project['access_users']['owner'][0]
if _is_retired_in_pdc(product, project): if _is_retired_in_pdc(product, project):
owner = 'extras-orphan@fedoraproject.org' owner = 'extras-orphan@fedoraproject.org'
# Check if the Bugzilla ticket assignee has been manually overridden # Check if the Bugzilla ticket assignee has been manually overridden
owner = pagure_project['access_users']['owner'][0]
override_yaml = _get_override_yaml(project) override_yaml = _get_override_yaml(project)
if override_yaml.get(product) \ if override_yaml.get(product) \
and isinstance(override_yaml[product], string_types): and isinstance(override_yaml[product], string_types):
@ -583,7 +659,11 @@ def pagure_project_to_acl_schema(pagure_project, product):
# No package has this set in PkgDB's API, so it can be safely turned # No package has this set in PkgDB's API, so it can be safely turned
# off and set to the defaults later on in the code # off and set to the defaults later on in the code
'qacontact': None, 'qacontact': None,
'summary': summary 'summary': summary,
# These two values are not part of original PkgDB RV, but they are
# useful
'product': product,
'project': project['name']
} }
@ -610,57 +690,99 @@ if __name__ == '__main__':
'Fedora Container': {}, 'Fedora Container': {},
'Fedora EPEL': {}, 'Fedora EPEL': {},
} }
pagure_rpms_api_url = ('{0}/api/0/projects?fork=false&namespace=rpms&page=1&'
'per_page=100'.format(
PAGURE_DIST_GIT_URL.rstrip('/')))
session = retry_session()
while True: session = retry_session()
pagure_namespace_to_project_lists = {}
pool = multiprocessing.pool.ThreadPool(8)
# Query for all the rpm and container projects and store them in
# pagure_namespace_to_projects
for namespace in ['rpms', 'container']:
first_page_url = ('{0}/api/0/projects?namespace={1}&fork=false&page=1'
'&per_page=1'.format(PAGURE_DIST_GIT_URL, namespace))
if DRY_RUN: if DRY_RUN:
print('Querying {0}'.format(pagure_rpms_api_url)) print('- Querying {0}'.format(first_page_url))
rv_json = session.get(pagure_rpms_api_url, timeout=120).json() first_page_rv = session.get(first_page_url, timeout=120)
for project in rv_json['projects']:
pagure_project_branches_api_url = ( if not bool(first_page_rv):
'{0}/api/0/rpms/{1}/git/branches' raise RuntimeError('Failed to talk to {0} {1}.'.format(
.format(PAGURE_DIST_GIT_URL.rstrip('/'), project['name'])) first_page_rv.request.url, first_page_rv))
branch_rv_json = session.get(
pagure_project_branches_api_url, timeout=60).json() total_projects = first_page_rv.json()['total_projects']
epel = False num_pages = int(ceil(total_projects / 100.0))
fedora = False
for branch in branch_rv_json['branches']: # Since we are going to multi-thread, we need to make a partial
if re.match(r'epel\d+', branch): # function call so that all the function needs is an iterable to run
p_get_pagure_projects_from_page = partial(
_get_pagure_projects_from_page, session, namespace)
pagure_namespace_to_project_lists[namespace] = pool.map(
p_get_pagure_projects_from_page, range(1, num_pages + 1))
# Flatten the list of lists (each page is a list of projects)
pagure_namespace_to_projects = {}
for namespace in ['rpms', 'container']:
pagure_namespace_to_projects[namespace] = []
for project_list in pagure_namespace_to_project_lists[namespace]:
pagure_namespace_to_projects[namespace] += project_list
# This is no longer needed, so we can save some RAM
del pagure_namespace_to_project_lists
# Now, we must get all the branches for the RPM projects we just queried.
# This will be stored in pagure_rpm_project_branches as a dictionary of
# {'python-requests': ['master', 'f27', 'f26']}
pagure_rpm_project_names = [project['name'] for project in
pagure_namespace_to_projects['rpms']]
p_get_pdc_project_name_and_branches = partial(
_get_pdc_project_name_and_branches, session, 'rpms')
pagure_rpm_project_branches = dict(pool.map(
p_get_pdc_project_name_and_branches, pagure_rpm_project_names))
# This is no longer needed, so we can save some RAM
del pagure_rpm_project_names
# Determine what products each project maps to based on its branches.
# pagure_rpms_project_products will be in the format of
# [(<project JSON dict>, 'Fedora'), ...] which will be used by a mapping
# function below
pagure_rpms_project_products = []
for project in pagure_namespace_to_projects['rpms']:
name = project['name']
products = []
branches = pagure_rpm_project_branches[name]
for branch in branches:
if re.match(r'^epel\d+$', branch):
epel = True epel = True
projects_dict['Fedora EPEL'][project['name']] = \ products.append('Fedora EPEL')
pagure_project_to_acl_schema(project, 'Fedora EPEL')
else: else:
fedora = True fedora = True
projects_dict['Fedora'][project['name']] = \ products.append('Fedora')
pagure_project_to_acl_schema(project, 'Fedora')
if fedora and epel: if 'Fedora' in products and 'Fedora EPEL' in products:
break break
if rv_json['pagination']['next']: for product in products:
pagure_rpms_api_url = rv_json['pagination']['next'] pagure_rpms_project_products.append((project, product))
else:
break
pagure_container_api_url = ( for project in pagure_namespace_to_projects['container']:
'{0}/api/0/projects?fork=false&namespace=container&page=1&per_page=100' pagure_rpms_project_products.append((project, 'Fedora Container'))
.format(PAGURE_DIST_GIT_URL))
while True:
if DRY_RUN:
print('Querying {0}'.format(pagure_container_api_url))
rv_json = session.get(pagure_container_api_url, timeout=120).json()
for project in rv_json['projects']:
project_pkgdb_schema = pagure_project_to_acl_schema(project)
projects_dict['Fedora Container'][project['name']] = \
project_pkgdb_schema
if rv_json['pagination']['next']: # Save some RAM since this large dict is no longer needed
pagure_container_api_url = rv_json['pagination']['next'] del pagure_namespace_to_projects
else:
break # Now, we must transform the data we collected into something that PkgDB
# would have returned
p_pagure_project_to_acl_schema = partial(
_pagure_project_to_acl_schema, session=session)
project_to_acl_schemas = pool.map(
p_pagure_project_to_acl_schema, pagure_rpms_project_products)
pool.close()
# Transform the data returned in project_to_acl_schemas to be an orderly
# dictionary for ease of use later on.
for rv in project_to_acl_schemas:
projects_dict[rv['product']][rv['project']] = rv
# This is no longer needed, so we can save some RAM
del project_to_acl_schemas
# Initialize the connection to bugzilla # Initialize the connection to bugzilla
bugzilla = Bugzilla(BZSERVER, BZUSER, BZPASS, projects_dict) bugzilla = Bugzilla(BZSERVER, BZUSER, BZPASS, projects_dict)