import re
import logging
import six
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
sitemap_urls = ()
sitemap_rules = [('', 'parse')]
sitemap_follow = ['']
sitemap_alternate_links = False
def __init__(self, *a, **kw):
super(SitemapSpider, self).__init__(*a, **kw)
self._cbs = []
for r, c in self.sitemap_rules:
if isinstance(c, six.string_types):
c = getattr(self, c)
self._cbs.append((regex(r), c))
self._follow = [regex(x) for x in self.sitemap_follow]
def start_requests(self):
for url in self.sitemap_urls:
yield Request(url, self._parse_sitemap)
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.text):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
logger.warning("Ignoring invalid sitemap: %(response)s",
{'response': response}, extra={'spider': self})
return
s = Sitemap(body)
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
yield Request(loc, callback=c)
break
def _get_sitemap_body(self, response):
"""Return the sitemap body contained in the given response,
or None if the response is not a sitemap.
"""
if isinstance(response, XmlResponse):
return response.body
elif is_gzipped(response):
return gunzip(response.body)
elif response.url.endswith('.xml'):
return response.body
elif response.url.endswith('.xml.gz'):
return gunzip(response.body)
def regex(x):
if isinstance(x, six.string_types):
return re.compile(x)
return x
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l