Use multithread on Pagure and PDC queries

2017-08-18 18:40:25 +00:00 · 2017-08-18 18:40:25 +00:00 · 27a90039fe
commit 27a90039fe
parent 11ace40beb
1 changed files with 197 additions and 75 deletions
--- a/roles/distgit/pagure/templates/pagure-sync-bugzilla.py.j2
+++ b/roles/distgit/pagure/templates/pagure-sync-bugzilla.py.j2
@ -27,7 +27,7 @@ sync information from the Pagure into bugzilla
 This short script takes information about package onwership and imports it
 into bugzilla.
 '''
-
+from __future__ import print_function
 import re
 import argparse
 import datetime
@ -39,6 +39,9 @@ import json
 import xmlrpclib
 import codecs
 import smtplib
+import multiprocessing.pool
+from math import ceil
+from functools import partial
 try:
    from email.Message import Message
 except ImportError:
@ -483,8 +486,10 @@ def _get_watchers_rv_json(pagure_project):
        print('Querying {0}'.format(watchers_api_url))
    watchers_rv = session.get(watchers_api_url, timeout=60)
    if not watchers_rv.ok:
-        error_msg = base_error_msg.format(
-            watchers_api_url, watchers_rv.status_code, watchers_rv.text)
+        error_msg = ('The connection to "{0}" failed with the status code {1} '
+                     'and output "{2}"'.format(
+                         watchers_api_url, watchers_rv.status_code,
+                         watchers_rv.text))
        raise RuntimeError(error_msg)
    return watchers_rv.json()

@ -507,8 +512,10 @@ def _is_retired_in_pdc(product, project):
        raise RuntimeError("Could not find %r in PDC." % project)
    branches = pdc_rv.json()['results']
    if not branches:
-        raise RuntimeError("No results for %r in PDC." % project)
-    return branches[0]['active']
+        print("No results for %r in PDC." % project, file=sys.stderr)
+        # Assume it's not retired if we can't find out for sure
+        return False
+    return not branches[0]['active']


@cache.cache_on_arguments()
@ -526,18 +533,100 @@ def _get_override_yaml(project):
    return {}


-def pagure_project_to_acl_schema(pagure_project, product):
+@cache.cache_on_arguments()
+def _get_package_summary_from_mdapi(namespace, repo, session=None):
+    """
+    Looks up the summary of a package's rawhide source package in mdapi.
+    :param namespace: string of the namespace the project is in; anything
+    other than "rpms" is not queried at all
+    :param repo: string of the project name used in the mdapi URL
+    :param session: a requests session object or None, in which case a new
+    retry session is created
+    :return: the summary string from mdapi, or None if the namespace is not
+    "rpms" or mdapi answered with a 404
+    :raises RuntimeError: if mdapi answers with any other failing status
+    """
+    # Only projects in the rpms namespace are looked up
+    if namespace != 'rpms':
+        return None
+
+    if session is None:
+        session = retry_session()
+
+    mdapi_url = '{0}/rawhide/srcpkg/{1}'.format(MDAPIURL.rstrip('/'), repo)
+    if DRY_RUN:
+        print('Querying {0}'.format(mdapi_url))
+
+    response = session.get(mdapi_url, timeout=60)
+    if response.ok:
+        return response.json()['summary']
+
+    # A 404 is tolerated and simply means there is no summary to report
+    if response.status_code == 404:
+        return None
+
+    raise RuntimeError(
+        ('The connection to "{0}" failed with the status code {1} '
+         'and output "{2}"').format(
+             mdapi_url, response.status_code, response.text))
+
+
+def _get_pdc_project_name_and_branches(session, namespace, repo):
+    """
+    Gets the branches on a project. This function is used for mapping.
+    :param session: a requests session object used to query PDC
+    :param namespace: string of the namespace the project is in
+    :param repo: string of the project
+    :return: a tuple with the repo name and a list of the repo's branches; the
+    list is empty if the query failed
+    """
+    branches_url = '{0}component-branches/'.format(PDCURL)
+    params = dict(
+        global_component=repo,
+        type=PDC_TYPES[namespace]
+    )
+    if DRY_RUN:
+        print('Querying {0} {1}'.format(branches_url, params))
+    # The filters have to be sent along with the request; the URL alone does
+    # not identify the project, so without them the query is not limited to
+    # this project's branches
+    rv = session.get(branches_url, params=params, timeout=60)
+
+    # If the project's branches can't be reported, just return no branches and
+    # it will be skipped later on
+    if not rv.ok:
+        print(('The connection to "{0}" failed with the status code {1} and '
+               'output "{2}"'.format(branches_url, rv.status_code, rv.text)),
+              file=sys.stderr)
+        return repo, []
+
+    data = rv.json()
+    return repo, [branch['name'] for branch in data['results']]
+
+
+def _get_pagure_projects_from_page(session, namespace, page):
+    """
+    Fetches a single page (100 per page) of the non-fork Pagure projects in a
+    namespace. This function is to be used for mapping.
+    :param session: a requests session object used to query Pagure
+    :param namespace: string of the namespace to query for projects
+    :param page: int of the page to query at
+    :return: list of projects on the page
+    :raises RuntimeError: if the response evaluates as a failure
+    """
+    base_url = PAGURE_DIST_GIT_URL.rstrip('/')
+    page_url = ('{0}/api/0/projects?namespace={1}&page={2}&per_page=100&'
+                'fork=false'.format(base_url, namespace, page))
+
+    if DRY_RUN:
+        print('- Querying {0}'.format(page_url))
+
+    response = session.get(page_url, timeout=120)
+    if response:
+        return response.json()['projects']
+
+    # The failure is reported on stderr in addition to being raised
+    requested_url = response.request.url
+    print("Failed to talk to %r %r." % (requested_url, response),
+          file=sys.stderr)
+    raise RuntimeError('Failed to talk to {0} {1}.'.format(
+        requested_url, response))
+
+
+def _pagure_project_to_acl_schema(project_and_product, session=None):
    """
    This function translates the JSON of a Pagure project to what PkgDB used to
-    output in the Bugzilla API.
-    :param pagure_project: a dictionary of the JSON of a Pagure project
-    :return: a dictionary of the content that the Bugzilla API would output
+    output in the Bugzilla API. This function is used for mapping.
+    :param project_and_product: a tuple containing the dictionary of the JSON
+    of a Pagure project and a string of the product (e.g. "Fedora",
+    "Fedora EPEL")
+    :param session: a requests session object or None
+    :return: a dictionary of the content that the PkgDB Bugzilla API would
+    return
    """
-    session = retry_session()
-    base_error_msg = ('The connection to "{0}" failed with the status code '
-                      '{1} and output "{2}"')
+    project, product = project_and_product
+    if session is None:
+        session = retry_session()

-    watchers_rv_json = _get_watchers_rv_json(pagure_project)
+    watchers_rv_json = _get_watchers_rv_json(project)

    user_cc_list = []
    for user, watch_levels in watchers_rv_json['watchers'].items():
@ -545,28 +634,15 @@ def pagure_project_to_acl_schema(pagure_project, product):
        if 'issues' in watch_levels:
            user_cc_list.append(user)

-    summary = None
-    if pagure_project['namespace'] == 'rpms':
-        mdapi_url = '{0}/rawhide/srcpkg/{1}'.format(
-            MDAPIURL.rstrip('/'), pagure_project['name'])
-        if DRY_RUN:
-            print('Querying {0}'.format(mdapi_url))
-        mdapi_rv = session.get(mdapi_url, timeout=60)
-        if mdapi_rv.ok:
-            mdapi_rv_json = mdapi_rv.json()
-            summary = mdapi_rv_json['summary']
-        elif not mdapi_rv.ok and mdapi_rv.status_code != 404:
-            error_msg = base_error_msg.format(
-                mdapi_url, mdapi_rv.status_code, mdapi_rv.text)
-            raise RuntimeError(error_msg)
+    summary = _get_package_summary_from_mdapi(
+        project['namespace'], project['name'], session)

-    # Check if the branch is retired in PDC, and if so set assignee to orphan.
-    owner = pagure_project['access_users']['owner'][0]
+    # Check if the project is retired in PDC, and if so set assignee to orphan.
+    owner = project['access_users']['owner'][0]
    if _is_retired_in_pdc(product, project):
        owner = 'extras-orphan@fedoraproject.org'

    # Check if the Bugzilla ticket assignee has been manually overridden
-    owner = pagure_project['access_users']['owner'][0]
    override_yaml = _get_override_yaml(project)
    if override_yaml.get(product) \
            and isinstance(override_yaml[product], string_types):
@ -583,7 +659,11 @@ def pagure_project_to_acl_schema(pagure_project, product):
        # No package has this set in PkgDB's API, so it can be safely turned
        # off and set to the defaults later on in the code
        'qacontact': None,
-        'summary': summary
+        'summary': summary,
+        # These two values are not part of original PkgDB RV, but they are
+        # useful
+        'product': product,
+        'project': project['name']
    }


@ -610,57 +690,99 @@ if __name__ == '__main__':
        'Fedora Container': {},
        'Fedora EPEL': {},
    }
-    pagure_rpms_api_url = ('{0}/api/0/projects?fork=false&namespace=rpms&page=1&'
-                           'per_page=100'.format(
-                               PAGURE_DIST_GIT_URL.rstrip('/')))
+
    session = retry_session()
+    pagure_namespace_to_project_lists = {}
+    pool = multiprocessing.pool.ThreadPool(8)

-    while True:
+    # Query for all the rpm and container projects and store them, one list
+    # per page, in pagure_namespace_to_project_lists
+    for namespace in ['rpms', 'container']:
+        first_page_url = ('{0}/api/0/projects?namespace={1}&fork=false&page=1'
+                         '&per_page=1'.format(PAGURE_DIST_GIT_URL, namespace))
        if DRY_RUN:
-            print('Querying {0}'.format(pagure_rpms_api_url))
-        rv_json = session.get(pagure_rpms_api_url, timeout=120).json()
-        for project in rv_json['projects']:
-            pagure_project_branches_api_url = (
-                '{0}/api/0/rpms/{1}/git/branches'
-                .format(PAGURE_DIST_GIT_URL.rstrip('/'), project['name']))
-            branch_rv_json = session.get(
-                pagure_project_branches_api_url, timeout=60).json()
-            epel = False
-            fedora = False
-            for branch in branch_rv_json['branches']:
-                if re.match(r'epel\d+', branch):
-                    epel = True
-                    projects_dict['Fedora EPEL'][project['name']] = \
-                        pagure_project_to_acl_schema(project, 'Fedora EPEL')
-                else:
-                    fedora = True
-                    projects_dict['Fedora'][project['name']] = \
-                        pagure_project_to_acl_schema(project, 'Fedora')
+            print('- Querying {0}'.format(first_page_url))
+        first_page_rv = session.get(first_page_url, timeout=120)

-                if fedora and epel:
-                    break
+        if not bool(first_page_rv):
+            raise RuntimeError('Failed to talk to {0} {1}.'.format(
+                first_page_rv.request.url, first_page_rv))

-        if rv_json['pagination']['next']:
-            pagure_rpms_api_url = rv_json['pagination']['next']
-        else:
-            break
+        total_projects = first_page_rv.json()['total_projects']
+        num_pages = int(ceil(total_projects / 100.0))

-    pagure_container_api_url = (
-        '{0}/api/0/projects?fork=false&namespace=container&page=1&per_page=100'
-        .format(PAGURE_DIST_GIT_URL))
-    while True:
-        if DRY_RUN:
-            print('Querying {0}'.format(pagure_container_api_url))
-        rv_json = session.get(pagure_container_api_url, timeout=120).json()
-        for project in rv_json['projects']:
-            project_pkgdb_schema = pagure_project_to_acl_schema(project)
-            projects_dict['Fedora Container'][project['name']] = \
-                project_pkgdb_schema
+        # Since we are going to multi-thread, we need to make a partial
+        # function call so that all the function needs is an iterable to run
+        p_get_pagure_projects_from_page = partial(
+            _get_pagure_projects_from_page, session, namespace)
+        pagure_namespace_to_project_lists[namespace] = pool.map(
+            p_get_pagure_projects_from_page, range(1, num_pages + 1))

-        if rv_json['pagination']['next']:
-            pagure_container_api_url = rv_json['pagination']['next']
-        else:
-            break
+    # Flatten the list of lists (each page is a list of projects)
+    pagure_namespace_to_projects = {}
+    for namespace in ['rpms', 'container']:
+        pagure_namespace_to_projects[namespace] = []
+        for project_list in pagure_namespace_to_project_lists[namespace]:
+            pagure_namespace_to_projects[namespace] += project_list
+    # This is no longer needed, so we can save some RAM
+    del pagure_namespace_to_project_lists
+
+    # Now, we must get all the branches for the RPM projects we just queried.
+    # This will be stored in pagure_rpm_project_branches as a dictionary of
+    # {'python-requests': ['master', 'f27', 'f26']}
+    pagure_rpm_project_names = [project['name'] for project in
+                                pagure_namespace_to_projects['rpms']]
+    p_get_pdc_project_name_and_branches = partial(
+        _get_pdc_project_name_and_branches, session, 'rpms')
+    pagure_rpm_project_branches = dict(pool.map(
+        p_get_pdc_project_name_and_branches, pagure_rpm_project_names))
+    # This is no longer needed, so we can save some RAM
+    del pagure_rpm_project_names
+
+    # Determine what products each project maps to based on its branches.
+    # pagure_rpms_project_products will be in the format of
+    # [(project, 'Fedora'), ...] where project is the Pagure project's JSON
+    # dictionary; it will be used by a mapping function below
+    pagure_rpms_project_products = []
+    for project in pagure_namespace_to_projects['rpms']:
+        name = project['name']
+        products = []
+        branches = pagure_rpm_project_branches[name]
+        for branch in branches:
+            if re.match(r'^epel\d+$', branch):
+                epel = True
+                products.append('Fedora EPEL')
+            else:
+                fedora = True
+                products.append('Fedora')
+
+            if 'Fedora' in products and 'Fedora EPEL' in products:
+                break
+
+        for product in products:
+            pagure_rpms_project_products.append((project, product))
+
+    for project in pagure_namespace_to_projects['container']:
+        pagure_rpms_project_products.append((project, 'Fedora Container'))
+
+    # Save some RAM since this large dict is no longer needed
+    del pagure_namespace_to_projects
+
+    # Now, we must transform the data we collected into something that PkgDB
+    # would have returned
+    p_pagure_project_to_acl_schema = partial(
+        _pagure_project_to_acl_schema, session=session)
+    project_to_acl_schemas = pool.map(
+        p_pagure_project_to_acl_schema, pagure_rpms_project_products)
+    pool.close()
+
+    # Transform the data returned in project_to_acl_schemas to be an orderly
+    # dictionary for ease of use later on.
+    for rv in project_to_acl_schemas:
+        projects_dict[rv['product']][rv['project']] = rv
+
+    # This is no longer needed, so we can save some RAM
+    del project_to_acl_schemas

    # Initialize the connection to bugzilla
    bugzilla = Bugzilla(BZSERVER, BZUSER, BZPASS, projects_dict)