more typo, plural, wording fixes

Signed-off-by: Nils Philippsen <nils@redhat.com>
Nils Philippsen 2019-11-22 14:58:13 +01:00
parent 82374d6739
commit 4a590170b0
2 changed files with 14 additions and 14 deletions


@@ -0,0 +1,207 @@
"""
This module provides the functionality to download the latest primary.xml
database from koji on the rawhide repo.
Decompress that xml file (which are downloaded compressed).
Read its content and build a dictionary with the package names as keys
and their summaries as values.
This code can then be used to create an in-memory cache of this information
which can then later be re-used in other places.
This prevents relying on remote services such as mdapi (of which a lot of
code here is coming from) when needing to access the summary of a lot of
packages.
"""
import contextlib
import hashlib
import logging
import os
import time
import xml.etree.ElementTree as ET
import xml.sax

import defusedxml.sax
import requests
KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'
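
# XML namespaces needed to query repomd.xml files with ElementTree below.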
repomd_xml_namespace = {
'repo': 'http://linux.duke.edu/metadata/repo',
'rpm': 'http://linux.duke.edu/metadata/rpm',
}
log = logging.getLogger(__name__)
def download_db(name, repomd_url, archive):
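    ''' Download the repomd database at repomd_url to the archive path. '''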
log.info('%s Downloading file: %s to %s' % (
name.ljust(12), repomd_url, archive))
    response = requests.get(repomd_url, verify=True)
    # Fail early rather than writing an error page into the archive.
    response.raise_for_status()
    with open(archive, 'wb') as stream:
        stream.write(response.content)
def decompress_db(name, archive, location):
''' Decompress the given archive at the specified location. '''
log.info('%s Extracting %s to %s' % (name.ljust(12), archive, location))
if archive.endswith('.xz'):
import lzma
with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:
data = stream_xz.read()
with open(location, 'wb') as stream:
stream.write(data)
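    # Note: check '.tar.gz' before '.gz', as '.tar.gz' names also end in
    # '.gz'.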
elif archive.endswith('.tar.gz'):
import tarfile
with tarfile.open(archive) as tar:
tar.extractall(path=location)
elif archive.endswith('.gz'):
import gzip
with open(location, 'wb') as out:
with gzip.open(archive, 'rb') as inp:
out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        with open(location, 'wb') as out:
            with bz2.BZ2File(archive) as bzar:
                out.write(bzar.read())
else:
raise NotImplementedError(archive)
def needs_update(local_file, remote_sha, sha_type):
''' Compare hash of a local and remote file.
Return True if our local file needs to be updated.
'''
if not os.path.isfile(local_file):
# If we have never downloaded this before, then obviously it has
# "changed"
return True
# Old epel5 doesn't even know which sha it is using...
if sha_type == 'sha':
sha_type = 'sha1'
    hasher = getattr(hashlib, sha_type)()
    with open(local_file, 'rb') as f:
        hasher.update(f.read())
    local_sha = hasher.hexdigest()
    return local_sha != remote_sha

class PackageHandler(xml.sax.ContentHandler):
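    ''' SAX content handler that extracts package names and summaries.

    A primary.xml file contains (simplified) entries of the form:

        <package type="rpm">
          <name>guake</name>
          ...
          <summary>Drop-down terminal for GNOME</summary>
          ...
        </package>

    This handler collects them into a {name: summary} dictionary.
    '''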
def __init__(self):
self.current_data = ""
self.name = ""
self.summary = ""
self.output = {}
self.pkg = {}
    # Called when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        if tag == "package":
            if self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.type = attributes["type"]
            self.pkg = {}
        elif tag == "name":
            self.name = ""
        elif tag == "summary":
            self.summary = ""
    # Called when character data is read; this may happen several times for
    # one element, so accumulate the content.
    def characters(self, content):
        if self.current_data == "summary":
            self.summary += content
        elif self.current_data == "name":
            self.name += content
    # Called when an element ends
    def endElement(self, tag):
        if tag == "summary":
            self.pkg["summary"] = self.summary
        elif tag == "name":
            self.pkg["name"] = self.name
        self.current_data = ""

    # Called when the document ends
    def endDocument(self):
        # Flush the last package, as no following <package> start tag will
        # do it.
        if self.pkg:
            self.output[self.pkg["name"]] = self.pkg["summary"]

def get_primary_xml(destfolder, url, name):
    ''' Retrieve the repo metadata at the given url and store it using
    the provided name.
'''
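    # A repomd.xml file lists the available metadata files in this
    # (simplified) form:
    #
    #   <data type="primary">
    #     <location href="repodata/...-primary.xml.gz"/>
    #     <open-checksum type="sha256">...</open-checksum>
    #   </data>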
repomd_url = url + '/repomd.xml'
response = requests.get(repomd_url, verify=True)
    if not response:
        log.warning('%s !! Failed to get %r %r' % (
            name.ljust(12), repomd_url, response))
        return
# Parse the xml doc and get a list of locations and their shasum.
files = ((
node.find('repo:location', repomd_xml_namespace),
node.find('repo:open-checksum', repomd_xml_namespace),
) for node in ET.fromstring(response.text))
# Extract out the attributes that we're really interested in.
files = (
(f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
for f, s in files if f is not None and s is not None
)
# Filter down to only the primary.xml files
files = list((f, s, t) for f, s, t in files if 'primary.xml' in f)
    if not files:
        log.debug('No primary.xml could be found in %s' % url)
        return
    elif len(files) > 1:
        log.debug('More than one primary.xml could be found in %s' % url)
        return
filename, shasum, shatype = files[0]
repomd_url = url + '/' + filename
# First, determine if the file has changed by comparing hash
db = "distgit-bugzilla-sync-primary.xml"
# Have we downloaded this before? Did it change?
destfile = os.path.join(destfolder, db)
if not needs_update(destfile, shasum, shatype):
log.debug('%s No change of %s' % (name.ljust(12), repomd_url))
else:
# If it has changed, then download it and move it into place.
archive = os.path.join(destfolder, filename)
download_db(name, repomd_url, archive)
decompress_db(name, archive, destfile)
return destfile
def get_package_summaries():
start = time.time()
primary_xml = get_primary_xml(
"/var/tmp",
KOJI_REPO + 'rawhide/latest/x86_64/repodata',
"koji",
)
    if not primary_xml:
        # Bail out if the repo metadata could not be retrieved.
        return {}
    handler = PackageHandler()
defusedxml.sax.parse(primary_xml, handler)
delta = time.time() - start
log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes")
return handler.output
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
db = get_package_summaries()
print(f"guake: {db.get('guake')}")
print(f"geany: {db.get('geany')}")
print(f"kernel: {db.get('kernel')}")