Source code for resyndicator.fetchers.sitemap
import json
import requests
from utilofies.stdlib import canonicalized
from ..utils import urn_from_string, stopwatch, process_time
from ..utils.logger import logger
from ..utils.sitemapparser import SitemapIndex, Sitemap
from .base import BaseEntryInterface, BaseFetcher, UnchangedException


class SitemapEntryInterface(BaseEntryInterface):
"""
    Entry interface that maps a raw sitemap entry onto the common entry fields.
"""
@property
def source(self):
"""Entry raw source as JSON inside an HTML snippet."""
return ('<details>\n'
'<summary>JSON Source</summary>\n'
'<div class="entry-source">{}</div>\n'
'</details>').format(json.dumps(
self.raw_entry, indent=4, sort_keys=True, default=str))
@property
def id(self):
"""Entry ID generated from URL."""
return urn_from_string(self.raw_entry['loc'])
@property
def updated(self):
"""Lastmod time of entry with fallback on publish times of video and news extensions."""
return process_time(self.raw_entry.get('lastmod')
or self.raw_entry.get('video', {}).get('publication_date')
or self.raw_entry.get('news', {}).get('publication_date'),
default_tz=self.fetcher.default_tz)
@property
def link(self):
"""Sitemap entry location (i.e. the URL)."""
return self.raw_entry['loc']
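# Illustrative sketch (not part of the original module): the raw_entry wrapped by
# the interface above is one parsed <url> element of a sitemap. Based on the keys
# accessed by the properties, it is assumed to look roughly like this:
#
#     raw_entry = {
#         'loc': 'https://example.com/articles/42',
#         'lastmod': '2020-01-01T12:00:00+00:00',
#         # Optional sitemap extensions, used as fallbacks for `updated`:
#         'video': {'publication_date': '2020-01-01T12:00:00+00:00'},
#         'news': {'publication_date': '2020-01-01T12:00:00+00:00'},
#     }
#
# `id` then becomes a URN derived from 'loc', `link` is 'loc' itself, and
# `updated` is the parsed 'lastmod' (or one of the publication dates).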


class SitemapFetcher(BaseFetcher):
"""
    Fetcher that supports plain sitemaps and recognizes some features of the video and news sitemap extensions.
"""
EntryInterface = SitemapEntryInterface
@staticmethod
@stopwatch
    def parse(response):
"""Return parsed sitemap."""
return Sitemap(response.content)
    def update(self):
        """Retrieve and parse the sitemap."""
response = self.retrieve()
self.raw_entries = self.parse(response)
@property
def id(self):
"""Sitemap ID generated from explicitly set URL."""
return urn_from_string(self.url)
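# Minimal usage sketch (an assumption; the exact constructor signatures come from
# BaseFetcher and BaseEntryInterface in .base and are not shown here):
#
#     fetcher = SitemapFetcher('https://example.com/sitemap.xml')
#     fetcher.update()                       # retrieve and parse the sitemap
#     for raw_entry in fetcher.raw_entries:
#         entry = SitemapEntryInterface(fetcher, raw_entry)  # hypothetical call
#         print(entry.link, entry.updated)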


class SitemapIndexFetcher(BaseFetcher):
"""
    This entry point, which distributes the sitemap URLs of a sitemap index to
    individual sitemap fetchers, is still a bit of a hack. It only supports one
    level of sitemap indices and circumvents the request scheduling, so it can
    block the scheduler for a while and send many consecutive requests to the
    same host.
"""
EntryInterface = SitemapEntryInterface
SitemapFetcher = SitemapFetcher
def __init__(self, *args, **kwargs):
super(SitemapIndexFetcher, self).__init__(*args, **kwargs)
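        # Response headers cached per sitemap URL; used to send conditional
        # requests (If-Modified-Since / If-None-Match) on later retrievals.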
self.response_headers = {}
@stopwatch
    def parse(self, response):
"""Parse the sitemap index."""
return SitemapIndex(response.content)
    def clean(self):
        """Reset the fetcher by clearing the parsed sitemap index."""
self.index = None
    def update(self):
"""Run the retrieval cycle that calls the sitemap fetcher internally."""
response = self.retrieve()
self.index = self.parse(response)
self.urls = []
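        # Fetch every sitemap listed in the index and collect its URL entries;
        # failed or unchanged sitemaps are skipped.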
for sitemap in self.index:
try:
response = self._retrieve_sitemap(sitemap['loc'])
except (IOError, requests.RequestException) as excp:
logger.error('Request exception %r for %s in index %s',
excp, sitemap['loc'], self.url)
except UnchangedException:
logger.info('Sitemap unchanged')
else:
self.urls.extend(self.SitemapFetcher.parse(response))
@property
def id(self):
"""Unique ID of the sitemap generated from explicitly set URL."""
return urn_from_string(self.url)
    def _retrieve_sitemap(self, url):
        """Wrapper around the requests library for retrieving the individual sitemaps of the index."""
        # Make the request conditional on the headers of the previous response for this URL.
        self.kwargs['headers'].update(canonicalized({
            'if-modified-since': self.response_headers.get(url, {}).get('last-modified'),
            'if-none-match': self.response_headers.get(url, {}).get('etag')}))
response = requests.get(url, **self.kwargs)
response.raise_for_status()
if response.url != url:
logger.info('Redirects to %s', response.url)
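        # A 304 Not Modified response (triggered by the conditional headers
        # above) means the sitemap is unchanged since the last retrieval.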
if response.status_code == 304:
raise UnchangedException
self.response_headers[url] = response.headers
return response
@property
def raw_entries(self):
"""The raw entries as returned by parser."""
return self.urls
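# End-to-end usage sketch (an assumption; the exact constructor signature comes
# from BaseFetcher in .base and is not shown here):
#
#     index_fetcher = SitemapIndexFetcher('https://example.com/sitemap_index.xml')
#     index_fetcher.update()           # fetches the index and every sitemap in it
#     for raw_entry in index_fetcher.raw_entries:
#         print(raw_entry['loc'])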