Change the way we retrieve package summaries

Instead of relying on mdapi to provide the information, download the
latest version of the primary.xml file for the rawhide repository hosted
on koji and parse it to extract each package's summary.

The primary.xml file is cached on disk and only re-downloaded when its
checksum differs from the one advertised in repomd.xml.

Downloading the XML file takes time, but that cost is far outweighed
by what this approach saves over querying mdapi thousands of times.
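
In practice the new module boils down to a single call; a minimal usage
sketch mirroring the __main__ block added below (the variable name is
illustrative):

    import package_summary

    # Build the in-memory cache once, then look packages up locally.
    summaries = package_summary.get_package_summary()
    print(summaries.get("guake"))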

Signed-off-by: Pierre-Yves Chibon <pingou@pingoured.fr>
Author: Pierre-Yves Chibon <pingou@pingoured.fr>
Date: 2019-11-19 14:14:08 +01:00
Committed by: Nils Philippsen
parent b8e37f62fc
commit 4a9d30a9dd
2 changed files with 222 additions and 30 deletions

package_summary.py (new file)

@@ -0,0 +1,210 @@
#!/usr/bin/python3 -tt
"""
This module provides the functionality to download the latest primary.xml
database from koji on the rawhide repo.
Decompress that xml file (which are downloaded compressed).
Read its content and build a dictionary with the package names as keys
and their summaries as values.
This code can then be used to create an in-memory cache of this information
which can then later be re-used in other places.
This prevents relying on remote services such as mdapi (of which a lot of
code here is coming from) when needing to access the summary of a lot of
packages.
"""
import contextlib
import hashlib
import logging
import os
import time
import xml.etree.ElementTree as ET
import xml.sax

import defusedxml.sax
import requests

KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'

repomd_xml_namespace = {
    'repo': 'http://linux.duke.edu/metadata/repo',
    'rpm': 'http://linux.duke.edu/metadata/rpm',
}

log = logging.getLogger(__name__)
def download_db(name, repomd_url, archive):
    ''' Download the file at the given url and save it as archive. '''
    log.info('%s Downloading file: %s to %s' % (
        name.ljust(12), repomd_url, archive))
    response = requests.get(repomd_url, verify=True)
    with open(archive, 'wb') as stream:
        stream.write(response.content)
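

# The rawhide primary.xml archive weighs in at tens of megabytes, and
# download_db above holds the whole response in memory before writing it
# out. If that ever became a concern, a streaming variant could write the
# file chunk by chunk. A sketch (function name and chunk size are
# hypothetical) relying on requests' stream=True / iter_content API:
def download_db_streaming(name, repomd_url, archive, chunk_size=1024 * 1024):
    ''' Hypothetical variant of download_db that streams the file to disk
    instead of buffering it entirely in memory. '''
    log.info('%s Downloading file: %s to %s' % (
        name.ljust(12), repomd_url, archive))
    with requests.get(repomd_url, verify=True, stream=True) as response:
        response.raise_for_status()
        with open(archive, 'wb') as stream:
            for chunk in response.iter_content(chunk_size=chunk_size):
                stream.write(chunk)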
def decompress_db(name, archive, location):
    ''' Decompress the given archive at the specified location. '''
    log.info('%s Extracting %s to %s' % (name.ljust(12), archive, location))
    if archive.endswith('.xz'):
        import lzma
        with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:
            data = stream_xz.read()
        with open(location, 'wb') as stream:
            stream.write(data)
    elif archive.endswith('.tar.gz'):
        import tarfile
        with tarfile.open(archive) as tar:
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with open(location, 'wb') as out:
            with gzip.open(archive, 'rb') as inp:
                out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        with open(location, 'wb') as out:
            bzar = bz2.BZ2File(archive)
            out.write(bzar.read())
            bzar.close()
    else:
        raise NotImplementedError(archive)
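

# Note that tarfile.extractall in the '.tar.gz' branch above follows
# whatever member paths the archive declares, including '../' ones. Koji
# repodata is a trusted source, so the commit does not guard against this,
# but for untrusted archives a guarded extraction could look like this
# sketch (helper name hypothetical):
def safe_extractall(archive, location):
    ''' Hypothetical guarded variant of tar.extractall that refuses
    members which would escape the destination directory. '''
    import tarfile
    root = os.path.realpath(location)
    with tarfile.open(archive) as tar:
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(location, member.name))
            if target != root and not target.startswith(root + os.sep):
                raise ValueError('Refusing to extract %s' % member.name)
        tar.extractall(path=location)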
def needs_update(local_file, remote_sha, sha_type):
    ''' Compare sha of a local and remote file.
    Return True if our local file needs to be updated.
    '''
    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old old epel5 doesn't even know which sha it is using..
    if sha_type == 'sha':
        sha_type = 'sha1'

    hash = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        hash.update(f.read())
    local_sha = hash.hexdigest()
    if local_sha != remote_sha:
        return True

    return False
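

# needs_update reads the entire cached file into memory to hash it. A
# chunked variant keeps memory use flat no matter how large the file
# gets; a stdlib-only sketch (helper name hypothetical):
def file_digest(path, sha_type, chunk_size=1024 * 1024):
    ''' Hypothetical helper hashing a file in fixed-size chunks rather
    than reading it in one go. '''
    digest = hashlib.new(sha_type)
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()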
class PackageHandler(xml.sax.ContentHandler):
    ''' SAX handler building a {package name: summary} dictionary. '''

    def __init__(self):
        self.current_data = ""
        self.name = ""
        self.summary = ""
        self.output = {}
        self.pkg = {}

    # Called when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        if tag == "package":
            # A new package starts: store the previous one, if any.
            if self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.type = attributes["type"]
            self.pkg = {}

    # Called when character data is read
    def characters(self, content):
        if self.current_data == "summary":
            self.summary = content
        elif self.current_data == "name":
            self.name = content

    # Called when an element ends
    def endElement(self, tag):
        if self.current_data == "summary":
            self.pkg["summary"] = self.summary
        elif self.current_data == "name":
            self.pkg["name"] = self.name
        self.current_data = ""
def get_primary_xml(destfolder, url, name):
    ''' Retrieve the repo metadata at the given url and store it using
    the provided name.
    '''
    repomd_url = url + '/repomd.xml'
    response = requests.get(repomd_url, verify=True)
    if not bool(response):
        print('%s !! Failed to get %r %r' % (
            name.ljust(12), repomd_url, response))
        return

    # Parse the xml doc and get a list of locations and their shasum.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only the primary.xml files
    files = list((f, s, t) for f, s, t in files if 'primary.xml' in f)
    if not files:
        log.debug('No primary.xml could be found in %s' % url)
        return
    elif len(files) > 1:
        log.debug("More than one primary.xml could be found in %s" % url)
        return

    filename, shasum, shatype = files[0]
    repomd_url = url + '/' + filename

    # First, determine if the file has changed by comparing hash
    db = "distgit-bugzilla-sync-primary.xml"

    # Have we downloaded this before? Did it change?
    destfile = os.path.join(destfolder, db)
    if not needs_update(destfile, shasum, shatype):
        log.debug('%s No change of %s' % (name.ljust(12), repomd_url))
    else:
        # If it has changed, then download it and move it into place.
        archive = os.path.join(destfolder, filename)
        download_db(name, repomd_url, archive)
        decompress_db(name, archive, destfile)

    return destfile
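

# To make the repomd.xml parsing in get_primary_xml concrete, here is a
# minimal, hand-written sample of the document shape it expects (purely
# illustrative, not real koji output) and what the generator pipeline
# extracts from it (demo function name hypothetical):
def _demo_repomd_parsing():
    ''' Hypothetical demo of the repomd.xml parsing done above. '''
    sample = """\
<repomd xmlns="http://linux.duke.edu/metadata/repo">
  <data type="primary">
    <location href="repodata/abc123-primary.xml.gz"/>
    <open-checksum type="sha256">abc123</open-checksum>
  </data>
</repomd>
"""
    for node in ET.fromstring(sample):
        f = node.find('repo:location', repomd_xml_namespace)
        s = node.find('repo:open-checksum', repomd_xml_namespace)
        if f is not None and s is not None:
            # -> ('abc123-primary.xml.gz', 'abc123', 'sha256')
            print(f.attrib['href'].replace('repodata/', ''),
                  s.text, s.attrib['type'])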
def get_package_summary():
    ''' Build and return the {package name: summary} dictionary from the
    latest rawhide primary.xml on koji.
    '''
    start = time.time()
    primary_xml = get_primary_xml(
        "/var/tmp",
        KOJI_REPO + 'rawhide/latest/x86_64/repodata',
        "koji",
    )

    handler = PackageHandler()
    defusedxml.sax.parse(primary_xml, handler)
    delta = time.time() - start
    log.info(f"Parsed in {delta} seconds -- i.e. {delta/60} minutes")
    return handler.output
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    db = get_package_summary()
    print(f"guake: {db.get('guake')}")
    print(f"geany: {db.get('geany')}")
    print(f"kernel: {db.get('kernel')}")


@@ -53,6 +53,8 @@ from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
 import yaml

+import package_summary

 env = 'staging'

@@ -536,31 +538,6 @@ def _get_override_yaml(project, session):
     return {}


-@cache.cache_on_arguments()
-def _get_package_summary_from_mdapi(namespace, repo, session=None):
-    summary = None
-    if namespace != 'rpms':
-        return summary
-    if session is None:
-        session = retry_session()
-    url = '{0}/rawhide/srcpkg/{1}'.format(MDAPIURL.rstrip('/'), repo)
-    if VERBOSE:
-        print('Querying {0}'.format(url))
-    rv = session.get(url, timeout=60)
-    if rv.ok:
-        rv_json = rv.json()
-        summary = rv_json['summary']
-    elif not rv.ok and rv.status_code != 404:
-        error_msg = ('The connection to "{0}" failed with the status code {1} '
-                     'and output "{2}"').format(url, rv.status_code, rv.text)
-        raise RuntimeError(error_msg)
-    return summary
-
-
 def _get_pdc_branches(session, repo):
     """
     Gets the branches on a project. This function is used for mapping.

@@ -604,7 +581,7 @@ def _is_retired(product, project):
         return True


-def _to_legacy_schema(product_and_project, session=None):
+def _to_legacy_schema(product_and_project_and_summary, session=None):
     """
     This function translates the JSON of a Pagure project to what PkgDB used to
     output in the Bugzilla API. This function is used for mapping.

@@ -615,7 +592,7 @@ def _to_legacy_schema(product_and_project, session=None):
     :return: a dictionary of the content that the PkgDB Bugzilla API would
         return
     """
-    product, project = product_and_project
+    product, project, rpm_summary = product_and_project_and_summary

     if session is None:
         session = retry_session()

@@ -623,8 +600,9 @@ def _to_legacy_schema(product_and_project, session=None):
     owner = project['poc']
     watchers = project['watchers']

-    summary = _get_package_summary_from_mdapi(
-        project['namespace'], project['name'], session)
+    summary = None
+    if project["namespace"] == "rpms":
+        summary = rpm_summary.get(project["name"])

     # Check if the project is retired in PDC, and if so set assignee to orphan.
     if _is_retired(product, project):

@@ -708,6 +686,10 @@ def main():
         print("Querying %r for initial cc list." % cc_url)
     pagure_namespace_to_cc = session.get(cc_url, timeout=120).json()

+    if VERBOSE:
+        print("Building a cache of the rpm packages' summary")
+    rpm_summary = package_summary.get_package_summary()
+
     # Combine and collapse those two into a single list:
     pagure_projects = []
     for namespace, entries in pagure_namespace_to_poc.items():

@@ -758,7 +740,7 @@ def main():
     # would have returned
     p_to_legacy_schema = resilient_partial(_to_legacy_schema, session=session)
     items = [
-        (product, project)
+        (product, project, rpm_summary)
         for project in pagure_projects
         for product in project['products']
     ]