# Source code for resyndicator.utils.sitemapparser

import re
from xml.etree import ElementTree

try:
    # Deprecated alias of ElementTree; it was removed in Python 3.9.
    from xml.etree import cElementTree
except ImportError:
    cElementTree = ElementTree


# Matches a brace-delimited namespace qualifier such as ``{http://...}`` in an
# ElementTree tag like ``{http://...}urlset``; used to strip it from tags.
namespace_re = re.compile('{[^}]*}')


class SitemapParsingException(Exception):
    """Raised when a document cannot be used as a sitemap or sitemap index."""


def dictify(element):
    """Convert an etree element to a dictionary.

    Child elements become keys (their tag) mapped to their recursively
    converted value. An element without child elements is converted to its
    stripped text instead, or to an empty string when it has no text.

    Comments and processing instructions are skipped, because their ``tag``
    is a factory function rather than a string. If several children share
    a tag, the last one wins.

    :param element: The element to convert.
    :returns: A ``dict`` for elements with child elements, otherwise a
        ``str``.
    """
    children = {
        child.tag: dictify(child)
        for child in element
        if isinstance(child.tag, str)
    }
    if children:
        return children
    # ``text`` is None for empty elements such as ``<loc/>``; str(None) would
    # leak the literal string 'None' into the result.
    return (element.text or '').strip()


def strip_namespaces(root):
    """Remove namespace qualifiers from every tag in the tree, in place.

    :param root: Root element of the tree to rewrite; it and all of its
        descendants are visited.
    """
    for node in root.iter():
        tag = node.tag
        if not isinstance(tag, str):
            # Not a plain element: the tag can be the Comment factory
            # function, which has no namespace to strip.
            continue
        node.tag = namespace_re.sub('', tag)


class SitemapIndex:
    """Parser class for sitemap indices.

    The document is parsed on construction. Iterating over the instance
    yields one dictionary (as built by ``dictify``) for every ``<sitemap>``
    entry that has a ``<loc>`` child.

    :param xml: The sitemap index document, in any form accepted by
        ``ElementTree.fromstring``.
    :raises SitemapParsingException: If the root element is not
        ``<sitemapindex>``.
    """

    def __init__(self, xml):
        # Malformed XML makes fromstring() raise ElementTree.ParseError,
        # which is deliberately left to propagate unchanged.
        self.root = ElementTree.fromstring(xml)
        if self.root is None:
            # Defensive guard kept from the original implementation.
            raise SitemapParsingException('Unknown parsing error')
        strip_namespaces(self.root)
        self.sitemapindex = list(self._parse_sitemapindex(self.root))

    @staticmethod
    def _parse_sitemapindex(root):
        """Yield a dictionary for each usable ``<sitemap>`` child of *root*.

        :param root: Root element with namespaces already stripped.
        :raises SitemapParsingException: If *root* is not ``<sitemapindex>``.
        """
        if root.tag != 'sitemapindex':
            raise SitemapParsingException(
                'No sitemapindex found (tag: {tag})'.format(tag=root.tag))
        for sitemap in root:
            if sitemap.tag != 'sitemap':
                continue
            # Element.getchildren() was removed in Python 3.9. find() with a
            # plain tag name inspects direct children only, like the old
            # check did. Compare against None: an Element without children
            # is falsy.
            if sitemap.find('loc') is None:
                continue
            yield dictify(sitemap)

    def __iter__(self):
        """Iterate over the parsed sitemap entries."""
        return iter(self.sitemapindex)


class Sitemap:
    """Parser class for sitemaps.

    The document is parsed on construction. Iterating over the instance
    yields one dictionary (as built by ``dictify``) for every ``<url>``
    entry that has a ``<loc>`` child.

    :param xml: The sitemap document, in any form accepted by
        ``ElementTree.fromstring``.
    :raises SitemapParsingException: If the root element is not
        ``<urlset>``.
    """

    def __init__(self, xml):
        # Malformed XML makes fromstring() raise ElementTree.ParseError,
        # which is deliberately left to propagate unchanged.
        self.root = ElementTree.fromstring(xml)
        if self.root is None:
            # Defensive guard kept from the original implementation.
            raise SitemapParsingException('Unknown parsing error')
        strip_namespaces(self.root)
        self.urlset = list(self._parse_urlset(self.root))

    @staticmethod
    def _parse_urlset(root):
        """Yield a dictionary for each usable ``<url>`` child of *root*.

        :param root: Root element with namespaces already stripped.
        :raises SitemapParsingException: If *root* is not ``<urlset>``.
        """
        if root.tag != 'urlset':
            raise SitemapParsingException(
                'No urlset found (tag: {tag})'.format(tag=root.tag))
        for url in root:
            if url.tag != 'url':
                continue
            # Element.getchildren() was removed in Python 3.9. find() with a
            # plain tag name inspects direct children only, like the old
            # check did. Compare against None: an Element without children
            # is falsy.
            if url.find('loc') is None:
                continue
            yield dictify(url)

    def __iter__(self):
        """Iterate over the parsed URL entries."""
        return iter(self.urlset)