fix extracting summaries from primary.xml

The previous implementation using a SAX parser processed incomplete XML
elements, which would e.g. change BZ component description to only the
last couple characters of the RPM summary.

In the course of this change, consolidate on a single XML parser: the
defusedxml element tree.

fixes: #23

Signed-off-by: Nils Philippsen <nils@redhat.com>
This commit is contained in:
Nils Philippsen 2019-11-22 15:00:27 +01:00
parent 5e4a0e45a3
commit 81256bdd8a

View file

@ -16,12 +16,11 @@ import hashlib
import logging
import os
import time
import xml.etree.ElementTree as ET
import xml.sax
import defusedxml.sax
from defusedxml import cElementTree as etree
import requests
KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'
repomd_xml_namespace = {
@ -93,42 +92,6 @@ def needs_update(local_file, remote_sha, sha_type):
return False
class PackageHandler(xml.sax.ContentHandler):
    ''' SAX content handler collecting the summary of every <package>.

    After parsing, ``output`` maps the text of each package's <name>
    element to the text of its <summary> element.
    '''

    # Elements whose text content is recorded for the current package.
    _TEXT_TAGS = ("name", "summary")

    def __init__(self):
        super().__init__()
        self.current_data = ""
        self.name = ""
        self.summary = ""
        self.output = {}
        self.pkg = {}
        # Text chunks received so far for the element currently open.
        self._chunks = []

    # Call when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        # Start from an empty buffer so that an empty element can not
        # inherit text left over from an earlier one.
        self._chunks = []
        if tag == "package":
            # A missing "type" attribute must not abort the whole parse.
            self.type = attributes.get("type")
            self.pkg = {}

    # Call when character data is read
    def characters(self, content):
        # The parser may deliver the text of a single element in several
        # chunks (e.g. around entities or buffer boundaries), so the pieces
        # are accumulated rather than overwriting each other.
        if self.current_data in self._TEXT_TAGS:
            self._chunks.append(content)

    # Call when an element ends
    def endElement(self, tag):
        if tag == "package":
            # Store the package as soon as it is complete, so that the
            # last one in the document is not lost. Records lacking a
            # name or a summary are skipped.
            if "name" in self.pkg and "summary" in self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.pkg = {}
        elif tag == self.current_data and tag in self._TEXT_TAGS:
            text = "".join(self._chunks)
            # Keep the public ``name``/``summary`` attributes up to date.
            setattr(self, tag, text)
            self.pkg[tag] = text
        self._chunks = []
        self.current_data = ""
def get_primary_xml(destfolder, url, name):
''' Retrieve the repo metadata at the given url and store them using
the provided name.
@ -144,7 +107,7 @@ def get_primary_xml(destfolder, url, name):
files = ((
node.find('repo:location', repomd_xml_namespace),
node.find('repo:open-checksum', repomd_xml_namespace),
) for node in ET.fromstring(response.text))
) for node in etree.fromstring(response.text))
# Extract out the attributes that we're really interested in.
files = (
@ -182,6 +145,8 @@ def get_primary_xml(destfolder, url, name):
def get_package_summaries():
summaries = {}
start = time.time()
primary_xml = get_primary_xml(
@ -190,13 +155,28 @@ def get_package_summaries():
"koji",
)
handler = PackageHandler()
defusedxml.sax.parse(primary_xml, handler)
context = etree.iterparse(primary_xml, events=('start', 'end'))
root = None
# iterate over the rest of the primary.xml tree
for event, elem in context:
if not root:
root = elem
continue
if event == 'end' and elem.tag == 'package' and elem.get('type', 'rpm') == 'rpm':
name = elem.findtext('name')
summary = elem.findtext('summary')
if name is not None and summary is not None:
summaries[name] = summary
# remove package child from root element to keep memory consumption low
root.clear()
delta = time.time() - start
log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes")
return handler.output
return summaries
if __name__ == "__main__":