# Source code for resyndicator.utils.sitemapparser

import re
from xml.etree import ElementTree

try:
    from xml.etree import cElementTree
except ImportError:  # xml.etree.cElementTree was removed in Python 3.9
    cElementTree = ElementTree


# Matches a brace-delimited span such as the "{namespace-uri}" prefix that
# ElementTree puts in front of namespaced tag names; strip_namespaces()
# substitutes every match with the empty string.
namespace_re = re.compile('{[^}]*}')


class SitemapParsingException(Exception):
    """Raised when a document is not a usable sitemap or sitemap index."""


def dictify(element):
    """Convert an etree element to a dictionary.

    An element with children becomes a dict mapping each child's tag to the
    recursively converted child; if several children share a tag, the last
    one wins. An element without children becomes its text with surrounding
    whitespace stripped, or ``''`` when it has no text at all.

    :param element: The etree element to convert.
    :returns: A (possibly nested) dict for elements with children,
        otherwise a string.
    """
    children = {str(child.tag): dictify(child) for child in element}
    if children:
        return children
    text = element.text
    # Empty elements such as <lastmod/> have text None; str(None) would
    # leak the literal string 'None' into the result.
    return '' if text is None else str(text).strip()
def strip_namespaces(root):
    """Remove namespace qualifiers from every tag in the tree, in place.

    :param root: The element whose own tag and all descendant tags are
        rewritten.
    """
    for node in root.iter():
        tag = node.tag
        # Comment nodes carry a function object instead of a string tag;
        # leave those untouched.
        if not isinstance(tag, str):
            continue
        node.tag = namespace_re.sub('', tag)
class SitemapIndex:
    """Parser class for sitemap indices.

    The document is parsed on construction. Every ``<sitemap>`` child of
    the root that contains a ``<loc>`` element is converted with
    ``dictify`` and stored in ``self.sitemapindex``; iterating over the
    instance yields those entries in document order.
    """

    def __init__(self, xml):
        """Parse ``xml`` and collect its sitemap entries.

        :param xml: The sitemap index document to parse.
        :raises SitemapParsingException: If parsing produces no root
            element or the root element is not ``sitemapindex``.
        """
        self.root = ElementTree.fromstring(xml)
        if self.root is None:
            raise SitemapParsingException('Unknown parsing error')
        strip_namespaces(self.root)
        self.sitemapindex = list(self._parse_sitemapindex(self.root))

    @staticmethod
    def _parse_sitemapindex(root):
        """Yield a dict for each ``<sitemap>`` child that has a ``<loc>``.

        :param root: Root element whose tags are already free of
            namespace qualifiers.
        :raises SitemapParsingException: If ``root`` is not a
            ``sitemapindex`` element.
        """
        if root.tag != 'sitemapindex':
            raise SitemapParsingException(
                'No sitemapindex found (tag: {tag})'.format(tag=root.tag))
        for sitemap in root:
            if sitemap.tag != 'sitemap':
                continue
            # Element.getchildren() was removed in Python 3.9; iterating an
            # element yields its direct children.
            if not any(child.tag == 'loc' for child in sitemap):
                continue
            yield dictify(sitemap)

    def __iter__(self):
        """Iterate over the parsed sitemap entries."""
        return iter(self.sitemapindex)
class Sitemap:
    """Parser class for sitemaps.

    The document is parsed on construction. Every ``<url>`` child of the
    root that contains a ``<loc>`` element is converted with ``dictify``
    and stored in ``self.urlset``; iterating over the instance yields
    those entries in document order.
    """

    def __init__(self, xml):
        """Parse ``xml`` and collect its URL entries.

        :param xml: The sitemap document to parse.
        :raises SitemapParsingException: If parsing produces no root
            element or the root element is not ``urlset``.
        """
        self.root = ElementTree.fromstring(xml)
        if self.root is None:
            raise SitemapParsingException('Unknown parsing error')
        strip_namespaces(self.root)
        self.urlset = list(self._parse_urlset(self.root))

    @staticmethod
    def _parse_urlset(root):
        """Yield a dict for each ``<url>`` child that has a ``<loc>``.

        :param root: Root element whose tags are already free of
            namespace qualifiers.
        :raises SitemapParsingException: If ``root`` is not a ``urlset``
            element.
        """
        if root.tag != 'urlset':
            raise SitemapParsingException(
                'No urlset found (tag: {tag})'.format(tag=root.tag))
        for url in root:
            if url.tag != 'url':
                continue
            # Element.getchildren() was removed in Python 3.9; iterating an
            # element yields its direct children.
            if not any(child.tag == 'loc' for child in url):
                continue
            yield dictify(url)

    def __iter__(self):
        """Iterate over the parsed URL entries."""
        return iter(self.urlset)