hemamaps / Scrapy (python)

Version: 1.1.1 

/ linkextractors / htmlparser.py

"""
HTMLParser-based link extractor
"""

import warnings

import six
from six.moves.html_parser import HTMLParser
from six.moves.urllib.parse import urljoin

from w3lib.url import safe_url_string

from scrapy.link import Link
from scrapy.utils.python import unique as unique_list
from scrapy.exceptions import ScrapyDeprecationWarning


class HtmlParserLinkExtractor(HTMLParser):

    def __init__(self, tag="a", attr="href", process=None, unique=False):
        HTMLParser.__init__(self)

        warnings.warn(
            "HtmlParserLinkExtractor is deprecated and will be removed in "
            "future releases. Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )

        self.scan_tag = tag if callable(tag) else lambda t: t == tag
        self.scan_attr = attr if callable(attr) else lambda a: a == attr
        self.process_attr = process if callable(process) else lambda v: v
        self.unique = unique

    def _extract_links(self, response_text, response_url, response_encoding):
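        # Parse the raw document, then resolve every collected link against the
        # page URL (or the <base href>, if one was seen) and normalize it.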
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            # Re-encode text URLs to bytes in the response encoding before joining.
            if isinstance(link.url, six.text_type):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret

    def extract_links(self, response):
        # wrapper needed to allow working directly with text
        return self._extract_links(response.body, response.url, response.encoding)

    def reset(self):
        HTMLParser.reset(self)

        self.base_url = None
        self.current_link = None
        self.links = []

    def handle_starttag(self, tag, attrs):
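        # Remember any <base href> for resolving relative links later, and open a
        # new Link whenever a matching tag/attribute pair is seen.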
        if tag == 'base':
            self.base_url = dict(attrs).get('href')
        if self.scan_tag(tag):
            for attr, value in attrs:
                if self.scan_attr(attr):
                    url = self.process_attr(value)
                    link = Link(url=url)
                    self.links.append(link)
                    self.current_link = link

    def handle_endtag(self, tag):
        if self.scan_tag(tag):
            self.current_link = None

    def handle_data(self, data):
        if self.current_link:
            self.current_link.text = self.current_link.text + data

    def matches(self, url):
        """This extractor matches with any url, since
        it doesn't contain any patterns"""
        return True
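

# Usage sketch (not part of the original module): drives the deprecated extractor
# through a scrapy.http.HtmlResponse. Assumes Python 2, since the byte-oriented
# feed()/decode() handling above follows Python 2 semantics; the URL and markup
# below are made up for illustration.
if __name__ == "__main__":
    from scrapy.http import HtmlResponse

    html = (
        b'<html><head><base href="http://example.com/docs/"></head>'
        b'<body><a href="page.html">Docs page</a>'
        b'<a href="page.html">Duplicate</a></body></html>'
    )
    response = HtmlResponse("http://example.com/", body=html, encoding="utf-8")

    # unique=True collapses links whose extracted href values are identical.
    extractor = HtmlParserLinkExtractor(unique=True)
    for link in extractor.extract_links(response):
        print(link)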