From 6e574a50673c0b28dc7a642c82c6d3bb5440f8db Mon Sep 17 00:00:00 2001 From: Nils Philippsen Date: Mon, 25 Nov 2019 17:55:10 +0100 Subject: [PATCH] straighten out finding primary.xml file Signed-off-by: Nils Philippsen --- distgit_bugzilla_sync/package_summaries.py | 52 +++++++++++++--------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/distgit_bugzilla_sync/package_summaries.py b/distgit_bugzilla_sync/package_summaries.py index bfbf474..1ad2e82 100644 --- a/distgit_bugzilla_sync/package_summaries.py +++ b/distgit_bugzilla_sync/package_summaries.py @@ -19,6 +19,7 @@ import time from defusedxml import cElementTree as etree import requests +from xml.etree.ElementTree import ParseError KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/' @@ -99,32 +100,39 @@ def get_primary_xml(destfolder, url, name): repomd_url = url + '/repomd.xml' response = requests.get(repomd_url, verify=True) if not bool(response): - print('%s !! Failed to get %r %r' % ( - name.ljust(12), repomd_url, response)) + log.warning('%-12s !! Failed to get %s %s', name, repomd_url, response) return - # Parse the xml doc and get a list of locations and their shasum. - files = (( - node.find('repo:location', repomd_xml_namespace), - node.find('repo:open-checksum', repomd_xml_namespace), - ) for node in etree.fromstring(response.text)) - - # Extract out the attributes that we're really interested in. - files = ( - (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type']) - for f, s in files if f is not None and s is not None - ) - - # Filter down to only the primary.xml files - files = [(f, s, t) for f, s, t in files if 'primary.xml' in f] - - if not files: - log.debug('No primary.xml could be found in %s' % url) - elif len(files) > 1: - log.debug("More than one primary.xml could be found in %s" % url) + try: + root = etree.fromstring(response.text) + except ParseError: + log.warning('%-12s !! Failed to parse %s %s', name, repomd_url, response) return - filename, shasum, shatype = files[0] + data_nodes = list(root.findall('repo:data[@type="primary"]', repomd_xml_namespace)) + if not data_nodes: + log.debug('No primary.xml could be found in %s', url) + return + elif len(data_nodes) > 1: + log.debug("More than one primary.xml could be found in %s", url) + return + + primary_node = data_nodes[0] + + location_node = primary_node.find('repo:location', repomd_xml_namespace) + if location_node is None or 'href' not in location_node.attrib: + log.debug('No valid location found for primary.xml in %s', url) + return + + cksuminfo_node = primary_node.find('repo:open-checksum', repomd_xml_namespace) + if cksuminfo_node is None or 'type' not in cksuminfo_node.attrib: + log.debug('No valid checksum information found for primary.xml in %s', url) + return + + filename = location_node.attrib['href'].replace('repodata/', '') + hash_digest = cksuminfo_node.text + hash_type = cksuminfo_node.attrib['type'] + repomd_url = url + '/' + filename # First, determine if the file has changed by comparing hash