distgit-bugzilla-sync/distgit_bugzilla_sync/package_summaries.py
"""
This module provides the functionality to download the latest primary.xml
database from koji on the rawhide repo.
Decompress that xml file (which are downloaded compressed).
Read its content and build a dictionary with the package names as keys
and their summaries as values.
This code can then be used to create an in-memory cache of this information
which can then later be re-used in other places.
This prevents relying on remote services such as mdapi (of which a lot of
code here is coming from) when needing to access the summary of a lot of
packages.
"""
import contextlib
import hashlib
import logging
import os
import time
import xml.etree.ElementTree as ET
import xml.sax

import defusedxml.sax
import requests


KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'

repomd_xml_namespace = {
    'repo': 'http://linux.duke.edu/metadata/repo',
    'rpm': 'http://linux.duke.edu/metadata/rpm',
}

log = logging.getLogger(__name__)


def download_db(name, repomd_url, archive):
    ''' Download the file at the given URL into the archive location. '''
    log.info('%s Downloading file: %s to %s',
             name.ljust(12), repomd_url, archive)
    response = requests.get(repomd_url, verify=True)
    with open(archive, 'wb') as stream:
        stream.write(response.content)


def decompress_db(name, archive, location):
    ''' Decompress the given archive at the specified location. '''
    log.info('%s Extracting %s to %s', name.ljust(12), archive, location)
    if archive.endswith('.xz'):
        import lzma
        with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:
            data = stream_xz.read()
        with open(location, 'wb') as stream:
            stream.write(data)
    elif archive.endswith('.tar.gz'):
        import tarfile
        with tarfile.open(archive) as tar:
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with open(location, 'wb') as out:
            with gzip.open(archive, 'rb') as inp:
                out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        with open(location, 'wb') as out, bz2.BZ2File(archive) as bzar:
            out.write(bzar.read())
    else:
        raise NotImplementedError(archive)


def needs_update(local_file, remote_sha, sha_type):
    ''' Compare the hashes of a local and a remote file.
    Return True if our local file needs to be updated.
    '''
    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old epel5 doesn't even know which sha it is using...
    if sha_type == 'sha':
        sha_type = 'sha1'

    hasher = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        hasher.update(f.read())
    local_sha = hasher.hexdigest()

    return local_sha != remote_sha
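

# PackageHandler below streams through primary.xml, which (abridged) looks
# like this:
#
#     <metadata xmlns="http://linux.duke.edu/metadata/common" ...>
#       <package type="rpm">
#         <name>guake</name>
#         ...
#         <summary>...</summary>
#         ...
#       </package>
#       ...
#     </metadata>
#
# Only the <name> and <summary> of each <package> are kept.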


class PackageHandler(xml.sax.ContentHandler):
    ''' SAX content handler that builds a dictionary mapping package names
    to their summaries from a primary.xml file. '''

    def __init__(self):
        self.current_data = ""
        self.name = ""
        self.summary = ""
        self.output = {}
        self.pkg = {}

    # Called when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        if tag == "package":
            # Flush the previous package before starting a new one.
            if self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.type = attributes["type"]
            self.pkg = {}
            self.name = ""
            self.summary = ""

    # Called when character data is read; SAX may hand over the text of a
    # single element in several chunks, so accumulate rather than assign.
    def characters(self, content):
        if self.current_data == "summary":
            self.summary += content
        elif self.current_data == "name":
            self.name += content

    # Called when an element ends
    def endElement(self, tag):
        if tag == "summary":
            self.pkg["summary"] = self.summary
        elif tag == "name":
            self.pkg["name"] = self.name
        self.current_data = ""

    # Called when the document ends: flush the last package, which no
    # following start tag can flush.
    def endDocument(self):
        if self.pkg:
            self.output[self.pkg["name"]] = self.pkg["summary"]
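

# get_primary_xml below starts from repomd.xml, the repo metadata index,
# which (abridged) looks like this:
#
#     <repomd xmlns="http://linux.duke.edu/metadata/repo">
#       <data type="primary">
#         <location href="repodata/...-primary.xml.gz"/>
#         <open-checksum type="sha256">...</open-checksum>
#       </data>
#       ...
#     </repomd>
#
# The <location> href points at the compressed primary.xml file and the
# <open-checksum> is the checksum of its decompressed content -- which is
# what needs_update() compares the local, decompressed copy against.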


def get_primary_xml(destfolder, url, name):
    ''' Retrieve the repo metadata at the given url, then download and
    decompress the primary.xml file it references into destfolder.
    The name is only used to label log messages. Return the path to the
    decompressed file, or None on failure.
    '''
    repomd_url = url + '/repomd.xml'
    response = requests.get(repomd_url, verify=True)
    if not bool(response):
        log.warning('%s !! Failed to get %r %r',
                    name.ljust(12), repomd_url, response)
        return

    # Parse the xml doc and get a list of locations and their shasums.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only the primary.xml files
    files = list((f, s, t) for f, s, t in files if 'primary.xml' in f)

    if not files:
        log.debug('No primary.xml could be found in %s', url)
        return
    elif len(files) > 1:
        log.debug('More than one primary.xml could be found in %s', url)
        return

    filename, shasum, shatype = files[0]
    repomd_url = url + '/' + filename

    # First, determine if the file has changed by comparing hashes.
    db = "distgit-bugzilla-sync-primary.xml"

    # Have we downloaded this before? Did it change?
    destfile = os.path.join(destfolder, db)
    if not needs_update(destfile, shasum, shatype):
        log.debug('%s No change of %s', name.ljust(12), repomd_url)
    else:
        # If it has changed, then download it and decompress it into place.
        archive = os.path.join(destfolder, filename)
        download_db(name, repomd_url, archive)
        decompress_db(name, archive, destfile)

    return destfile


def get_package_summaries():
    start = time.time()

    primary_xml = get_primary_xml(
        "/var/tmp",
        KOJI_REPO + 'rawhide/latest/x86_64/repodata',
        "koji",
    )
    if primary_xml is None:
        # The metadata could not be retrieved; get_primary_xml has already
        # logged the reason.
        return {}

    handler = PackageHandler()
    defusedxml.sax.parse(primary_xml, handler)

    delta = time.time() - start
    log.info(f"Parsed in {delta} seconds -- i.e. {delta/60} minutes")

    return handler.output


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    db = get_package_summaries()
    print(f"guake: {db.get('guake')}")
    print(f"geany: {db.get('geany')}")
    print(f"kernel: {db.get('kernel')}")