fix extracting summaries from primary.xml
The previous implementation, which used a SAX parser, processed incomplete chunks of XML element text, which would e.g. change a BZ component description to only the last couple of characters of the RPM summary. In the process, consolidate on a single XML parser: the defusedxml ElementTree one. Fixes: #23. Signed-off-by: Nils Philippsen <nils@redhat.com>
This commit is contained in:
parent
5e4a0e45a3
commit
81256bdd8a
1 changed files with 23 additions and 43 deletions
|
@ -16,12 +16,11 @@ import hashlib
|
|||
import logging
|
||||
import os
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
import xml.sax
|
||||
|
||||
import defusedxml.sax
|
||||
from defusedxml import cElementTree as etree
|
||||
import requests
|
||||
|
||||
|
||||
KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'
|
||||
|
||||
repomd_xml_namespace = {
|
||||
|
@ -93,42 +92,6 @@ def needs_update(local_file, remote_sha, sha_type):
|
|||
return False
|
||||
|
||||
|
||||
class PackageHandler(xml.sax.ContentHandler):
    """SAX handler collecting a ``{package name: summary}`` mapping.

    After parsing, the mapping is available as ``self.output``.

    A SAX parser is allowed to deliver the text of a single element in
    several ``characters()`` calls, so the text of ``<name>`` and
    ``<summary>`` is accumulated and only joined when the element ends.
    """

    def __init__(self):
        super().__init__()
        self.current_data = ""  # tag of the element currently being read
        self.name = ""  # text of the most recently completed <name>
        self.summary = ""  # text of the most recently completed <summary>
        self.output = {}  # package name -> summary
        self.pkg = {}  # fields gathered for the package being parsed
        self.type = None  # "type" attribute of the current <package>
        self._chunks = []  # text pieces of the element being read

    def _store_package(self):
        """Record the pending package in ``output`` if it is complete."""
        # Packages lacking a name or a summary are skipped instead of
        # raising KeyError.
        if "name" in self.pkg and "summary" in self.pkg:
            self.output[self.pkg["name"]] = self.pkg["summary"]

    # Call when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        # Start from an empty buffer so that an empty element yields ""
        # rather than text left over from an earlier element.
        self._chunks = []
        if tag == "package":
            # Flush a package whose end tag has not been seen.
            self._store_package()
            self.type = attributes.get("type", "rpm")
            self.pkg = {}

    # Call when a character is read
    def characters(self, content):
        # May be called more than once per text node: append, never
        # overwrite.
        if self.current_data in ("summary", "name"):
            self._chunks.append(content)

    # Call when an elements ends
    def endElement(self, tag):
        if self.current_data == "summary":
            self.summary = "".join(self._chunks)
            self.pkg["summary"] = self.summary
        elif self.current_data == "name":
            self.name = "".join(self._chunks)
            self.pkg["name"] = self.name

        if tag == "package":
            # Store on the closing tag so the last package of the document
            # is not lost.
            self._store_package()
            self.pkg = {}

        self._chunks = []
        self.current_data = ""
|
||||
|
||||
|
||||
def get_primary_xml(destfolder, url, name):
|
||||
''' Retrieve the repo metadata at the given url and store them using
|
||||
the provided name.
|
||||
|
@ -144,7 +107,7 @@ def get_primary_xml(destfolder, url, name):
|
|||
files = ((
|
||||
node.find('repo:location', repomd_xml_namespace),
|
||||
node.find('repo:open-checksum', repomd_xml_namespace),
|
||||
) for node in ET.fromstring(response.text))
|
||||
) for node in etree.fromstring(response.text))
|
||||
|
||||
# Extract out the attributes that we're really interested in.
|
||||
files = (
|
||||
|
@ -182,6 +145,8 @@ def get_primary_xml(destfolder, url, name):
|
|||
|
||||
|
||||
def get_package_summaries():
|
||||
summaries = {}
|
||||
|
||||
start = time.time()
|
||||
|
||||
primary_xml = get_primary_xml(
|
||||
|
@ -190,13 +155,28 @@ def get_package_summaries():
|
|||
"koji",
|
||||
)
|
||||
|
||||
handler = PackageHandler()
|
||||
defusedxml.sax.parse(primary_xml, handler)
|
||||
context = etree.iterparse(primary_xml, events=('start', 'end'))
|
||||
|
||||
root = None
|
||||
|
||||
# iterate over the rest of the primary.xml tree
|
||||
for event, elem in context:
|
||||
if not root:
|
||||
root = elem
|
||||
continue
|
||||
|
||||
if event == 'end' and elem.tag == 'package' and elem.get('type', 'rpm') == 'rpm':
|
||||
name = elem.findtext('name')
|
||||
summary = elem.findtext('summary')
|
||||
if name is not None and summary is not None:
|
||||
summaries[name] = summary
|
||||
# remove package child from root element to keep memory consumption low
|
||||
root.clear()
|
||||
|
||||
delta = time.time() - start
|
||||
log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes")
|
||||
|
||||
return handler.output
|
||||
return summaries
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue