From 81256bdd8a7074768a35a26661458fe6b73b0842 Mon Sep 17 00:00:00 2001 From: Nils Philippsen Date: Fri, 22 Nov 2019 15:00:27 +0100 Subject: [PATCH] fix extracting summaries from primary.xml The previous implementation using a SAX parser processed incomplete XML elements, which would e.g. change BZ component description to only the last couple of characters of the RPM summary. In the process, consolidate to use only one XML parser, the defusedxml element tree parser. fixes: #23 Signed-off-by: Nils Philippsen --- distgit_bugzilla_sync/package_summaries.py | 66 ++++++++-------------- 1 file changed, 23 insertions(+), 43 deletions(-) diff --git a/distgit_bugzilla_sync/package_summaries.py b/distgit_bugzilla_sync/package_summaries.py index 7166a3b..478b324 100644 --- a/distgit_bugzilla_sync/package_summaries.py +++ b/distgit_bugzilla_sync/package_summaries.py @@ -16,12 +16,11 @@ import hashlib import logging import os import time -import xml.etree.ElementTree as ET -import xml.sax -import defusedxml.sax +from defusedxml import cElementTree as etree import requests + KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/' repomd_xml_namespace = { @@ -93,42 +92,6 @@ def needs_update(local_file, remote_sha, sha_type): return False -class PackageHandler(xml.sax.ContentHandler): - def __init__(self): - self.current_data = "" - self.name = "" - self.summary = "" - self.output = {} - self.pkg = {} - - # Call when an element starts - def startElement(self, tag, attributes): - self.current_data = tag - if tag == "package": - if self.pkg: - self.output[self.pkg["name"]] = self.pkg["summary"] - self.type = attributes["type"] - self.pkg = {} - - # Call when a character is read - def characters(self, content): - if self.current_data == "summary": - self.summary = content - elif self.current_data == "name": - self.name = content - - # Call when an elements ends - def endElement(self, tag): - if self.current_data == "summary": - # print("Summary:", self.summary) - self.pkg["summary"] = 
self.summary - elif self.current_data == "name": - # print("name:", self.name) - self.pkg["name"] = self.name - - self.current_data = "" - - def get_primary_xml(destfolder, url, name): ''' Retrieve the repo metadata at the given url and store them using the provided name. @@ -144,7 +107,7 @@ def get_primary_xml(destfolder, url, name): files = (( node.find('repo:location', repomd_xml_namespace), node.find('repo:open-checksum', repomd_xml_namespace), - ) for node in ET.fromstring(response.text)) + ) for node in etree.fromstring(response.text)) # Extract out the attributes that we're really interested in. files = ( @@ -182,6 +145,8 @@ def get_primary_xml(destfolder, url, name): def get_package_summaries(): + summaries = {} + start = time.time() primary_xml = get_primary_xml( @@ -190,13 +155,28 @@ def get_package_summaries(): "koji", ) - handler = PackageHandler() - defusedxml.sax.parse(primary_xml, handler) + context = etree.iterparse(primary_xml, events=('start', 'end')) + + root = None + + # iterate over the rest of the primary.xml tree + for event, elem in context: + if not root: + root = elem + continue + + if event == 'end' and elem.tag == 'package' and elem.get('type', 'rpm') == 'rpm': + name = elem.findtext('name') + summary = elem.findtext('summary') + if name is not None and summary is not None: + summaries[name] = summary + # remove package child from root element to keep memory consumption low + root.clear() delta = time.time() - start log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes") - return handler.output + return summaries if __name__ == "__main__":