Change the way we retrieve package summaries

Instead of relying on mdapi to provide the information, download the
latest version of the primary.xml file for the rawhide repository hosted
on koji and parse it to extract each package's summary.

The primary.xml file is cached on disk and only re-downloaded when its
checksum differs from the one advertised in repomd.xml.

Downloading the XML file takes time, but that cost is far outweighed
by what this approach saves over querying mdapi thousands of times.
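
In practice the new module boils down to a single call; a minimal usage
sketch mirroring the __main__ block added below (the variable name is
illustrative):

    import package_summary

    # Build the in-memory cache once, then look packages up locally.
    summaries = package_summary.get_package_summary()
    print(summaries.get("guake"))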

Signed-off-by: Pierre-Yves Chibon <pingou@pingoured.fr>
Author: Pierre-Yves Chibon <pingou@pingoured.fr>
Date: 2019-11-19 14:14:08 +01:00
Committed by: Nils Philippsen
parent b8e37f62fc
commit 4a9d30a9dd
2 changed files with 222 additions and 30 deletions

package_summary.py (new file)

@@ -0,0 +1,210 @@
#!/usr/bin/python3 -tt
"""
This module provides the functionality to download the latest primary.xml
database from koji on the rawhide repo.
Decompress that xml file (which are downloaded compressed).
Read its content and build a dictionary with the package names as keys
and their summaries as values.
This code can then be used to create an in-memory cache of this information
which can then later be re-used in other places.
This prevents relying on remote services such as mdapi (of which a lot of
code here is coming from) when needing to access the summary of a lot of
packages.
"""
import contextlib
import hashlib
import logging
import os
import time
import xml.etree.ElementTree as ET
import xml.sax

import defusedxml.sax
import requests

KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'

repomd_xml_namespace = {
    'repo': 'http://linux.duke.edu/metadata/repo',
    'rpm': 'http://linux.duke.edu/metadata/rpm',
}

log = logging.getLogger(__name__)
def download_db(name, repomd_url, archive):
    ''' Download the file at the given url and save it as archive. '''
    log.info('%s Downloading file: %s to %s' % (
        name.ljust(12), repomd_url, archive))
    response = requests.get(repomd_url, verify=True)
    with open(archive, 'wb') as stream:
        stream.write(response.content)
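

# The rawhide primary.xml archive weighs in at tens of megabytes, and
# download_db above holds the whole response in memory before writing it
# out. If that ever became a concern, a streaming variant could write the
# file chunk by chunk. A sketch (function name and chunk size are
# hypothetical) relying on requests' stream=True / iter_content API:
def download_db_streaming(name, repomd_url, archive, chunk_size=1024 * 1024):
    ''' Hypothetical variant of download_db that streams the file to disk
    instead of buffering it entirely in memory. '''
    log.info('%s Downloading file: %s to %s' % (
        name.ljust(12), repomd_url, archive))
    with requests.get(repomd_url, verify=True, stream=True) as response:
        response.raise_for_status()
        with open(archive, 'wb') as stream:
            for chunk in response.iter_content(chunk_size=chunk_size):
                stream.write(chunk)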
def decompress_db(name, archive, location):
    ''' Decompress the given archive at the specified location. '''
    log.info('%s Extracting %s to %s' % (name.ljust(12), archive, location))
    if archive.endswith('.xz'):
        import lzma
        with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:
            data = stream_xz.read()
        with open(location, 'wb') as stream:
            stream.write(data)
    elif archive.endswith('.tar.gz'):
        import tarfile
        with tarfile.open(archive) as tar:
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with open(location, 'wb') as out:
            with gzip.open(archive, 'rb') as inp:
                out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        with open(location, 'wb') as out:
            bzar = bz2.BZ2File(archive)
            out.write(bzar.read())
            bzar.close()
    else:
        raise NotImplementedError(archive)
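

# Note that tarfile.extractall in the '.tar.gz' branch above follows
# whatever member paths the archive declares, including '../' ones. Koji
# repodata is a trusted source, so the commit does not guard against this,
# but for untrusted archives a guarded extraction could look like this
# sketch (helper name hypothetical):
def safe_extractall(archive, location):
    ''' Hypothetical guarded variant of tar.extractall that refuses
    members which would escape the destination directory. '''
    import tarfile
    root = os.path.realpath(location)
    with tarfile.open(archive) as tar:
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(location, member.name))
            if target != root and not target.startswith(root + os.sep):
                raise ValueError('Refusing to extract %s' % member.name)
        tar.extractall(path=location)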
def needs_update(local_file, remote_sha, sha_type):
    ''' Compare sha of a local and remote file.
    Return True if our local file needs to be updated.
    '''
    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old old epel5 doesn't even know which sha it is using..
    if sha_type == 'sha':
        sha_type = 'sha1'

    hash = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        hash.update(f.read())
    local_sha = hash.hexdigest()
    if local_sha != remote_sha:
        return True

    return False
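

# needs_update reads the entire cached file into memory to hash it. A
# chunked variant keeps memory use flat no matter how large the file
# gets; a stdlib-only sketch (helper name hypothetical):
def file_digest(path, sha_type, chunk_size=1024 * 1024):
    ''' Hypothetical helper hashing a file in fixed-size chunks rather
    than reading it in one go. '''
    digest = hashlib.new(sha_type)
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()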
class PackageHandler(xml.sax.ContentHandler):
    ''' SAX handler building a {package name: summary} dictionary. '''

    def __init__(self):
        self.current_data = ""
        self.name = ""
        self.summary = ""
        self.output = {}
        self.pkg = {}

    # Called when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        if tag == "package":
            # A new package starts: store the previous one, if any.
            if self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.type = attributes["type"]
            self.pkg = {}

    # Called when character data is read
    def characters(self, content):
        if self.current_data == "summary":
            self.summary = content
        elif self.current_data == "name":
            self.name = content

    # Called when an element ends
    def endElement(self, tag):
        if self.current_data == "summary":
            self.pkg["summary"] = self.summary
        elif self.current_data == "name":
            self.pkg["name"] = self.name
        self.current_data = ""
def get_primary_xml(destfolder, url, name):
    ''' Retrieve the repo metadata at the given url and store it using
    the provided name.
    '''
    repomd_url = url + '/repomd.xml'
    response = requests.get(repomd_url, verify=True)
    if not bool(response):
        print('%s !! Failed to get %r %r' % (
            name.ljust(12), repomd_url, response))
        return

    # Parse the xml doc and get a list of locations and their shasum.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only the primary.xml files
    files = list((f, s, t) for f, s, t in files if 'primary.xml' in f)
    if not files:
        log.debug('No primary.xml could be found in %s' % url)
        return
    elif len(files) > 1:
        log.debug("More than one primary.xml could be found in %s" % url)
        return

    filename, shasum, shatype = files[0]
    repomd_url = url + '/' + filename

    # First, determine if the file has changed by comparing hash
    db = "distgit-bugzilla-sync-primary.xml"

    # Have we downloaded this before? Did it change?
    destfile = os.path.join(destfolder, db)
    if not needs_update(destfile, shasum, shatype):
        log.debug('%s No change of %s' % (name.ljust(12), repomd_url))
    else:
        # If it has changed, then download it and move it into place.
        archive = os.path.join(destfolder, filename)
        download_db(name, repomd_url, archive)
        decompress_db(name, archive, destfile)

    return destfile
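

# To make the repomd.xml parsing in get_primary_xml concrete, here is a
# minimal, hand-written sample of the document shape it expects (purely
# illustrative, not real koji output) and what the generator pipeline
# extracts from it (demo function name hypothetical):
def _demo_repomd_parsing():
    ''' Hypothetical demo of the repomd.xml parsing done above. '''
    sample = """\
<repomd xmlns="http://linux.duke.edu/metadata/repo">
  <data type="primary">
    <location href="repodata/abc123-primary.xml.gz"/>
    <open-checksum type="sha256">abc123</open-checksum>
  </data>
</repomd>
"""
    for node in ET.fromstring(sample):
        f = node.find('repo:location', repomd_xml_namespace)
        s = node.find('repo:open-checksum', repomd_xml_namespace)
        if f is not None and s is not None:
            # -> ('abc123-primary.xml.gz', 'abc123', 'sha256')
            print(f.attrib['href'].replace('repodata/', ''),
                  s.text, s.attrib['type'])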
def get_package_summary():
    ''' Build and return the {package name: summary} dictionary from the
    latest rawhide primary.xml on koji.
    '''
    start = time.time()
    primary_xml = get_primary_xml(
        "/var/tmp",
        KOJI_REPO + 'rawhide/latest/x86_64/repodata',
        "koji",
    )

    handler = PackageHandler()
    defusedxml.sax.parse(primary_xml, handler)
    delta = time.time() - start
    log.info(f"Parsed in {delta} seconds -- i.e. {delta/60} minutes")
    return handler.output
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    db = get_package_summary()
    print(f"guake: {db.get('guake')}")
    print(f"geany: {db.get('geany')}")
    print(f"kernel: {db.get('kernel')}")


@@ -53,6 +53,8 @@ from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 import yaml

+import package_summary

 env = 'staging'

@@ -536,31 +538,6 @@ def _get_override_yaml(project, session):
     return {}


-@cache.cache_on_arguments()
-def _get_package_summary_from_mdapi(namespace, repo, session=None):
-    summary = None
-    if namespace != 'rpms':
-        return summary
-    if session is None:
-        session = retry_session()
-    url = '{0}/rawhide/srcpkg/{1}'.format(MDAPIURL.rstrip('/'), repo)
-    if VERBOSE:
-        print('Querying {0}'.format(url))
-    rv = session.get(url, timeout=60)
-    if rv.ok:
-        rv_json = rv.json()
-        summary = rv_json['summary']
-    elif not rv.ok and rv.status_code != 404:
-        error_msg = ('The connection to "{0}" failed with the status code {1} '
-                     'and output "{2}"').format(url, rv.status_code, rv.text)
-        raise RuntimeError(error_msg)
-    return summary
-
-
 def _get_pdc_branches(session, repo):
     """
     Gets the branches on a project. This function is used for mapping.

@@ -604,7 +581,7 @@ def _is_retired(product, project):
         return True


-def _to_legacy_schema(product_and_project, session=None):
+def _to_legacy_schema(product_and_project_and_summary, session=None):
     """
     This function translates the JSON of a Pagure project to what PkgDB used to
     output in the Bugzilla API. This function is used for mapping.

@@ -615,7 +592,7 @@ def _to_legacy_schema(product_and_project, session=None):
     :return: a dictionary of the content that the PkgDB Bugzilla API would
         return
     """
-    product, project = product_and_project
+    product, project, rpm_summary = product_and_project_and_summary

     if session is None:
         session = retry_session()

@@ -623,8 +600,9 @@ def _to_legacy_schema(product_and_project, session=None):
     owner = project['poc']
     watchers = project['watchers']

-    summary = _get_package_summary_from_mdapi(
-        project['namespace'], project['name'], session)
+    summary = None
+    if project["namespace"] == "rpms":
+        summary = rpm_summary.get(project["name"])

     # Check if the project is retired in PDC, and if so set assignee to orphan.
     if _is_retired(product, project):

@@ -708,6 +686,10 @@ def main():
         print("Querying %r for initial cc list." % cc_url)
     pagure_namespace_to_cc = session.get(cc_url, timeout=120).json()

+    if VERBOSE:
+        print("Building a cache of the rpm packages' summary")
+    rpm_summary = package_summary.get_package_summary()
+
     # Combine and collapse those two into a single list:
     pagure_projects = []
     for namespace, entries in pagure_namespace_to_poc.items():

@@ -758,7 +740,7 @@ def main():
     # would have returned
     p_to_legacy_schema = resilient_partial(_to_legacy_schema, session=session)
     items = [
-        (product, project)
+        (product, project, rpm_summary)
         for project in pagure_projects
         for product in project['products']
     ]