Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

aroundthecode / lxml   python

Repository URL to install this package:

/ html / __init__.py

# Copyright (c) 2004 Ian Bicking. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. Neither the name of Ian Bicking nor the names of its contributors may
# be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""The ``lxml.html`` tool set for HTML handling.
"""

from __future__ import absolute_import

# Names exported by ``from lxml.html import *``; each name is listed once.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']


import copy
import sys
import re
from functools import partial

try:
    # while unnecessary, importing from 'collections.abc' is the right way to do it
    from collections.abc import MutableMapping, MutableSet
except ImportError:
    from collections import MutableMapping, MutableSet

from .. import etree
from . import defs
from ._setmixin import SetMixin

try:
    from urlparse import urljoin
except ImportError:
    # Python 3
    from urllib.parse import urljoin

try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = (str, bytes)


def __fix_docstring(s):
    """Strip the string-literal prefix from lines that start with a literal.

    On Python 3 a leading ``u'`` (after optional whitespace at the start of a
    line) is rewritten to ``'``; on Python 2 the same is done for ``b'``.
    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    # Only the prefix that the running interpreter does not print is removed.
    prefix = "u" if sys.version_info[0] >= 3 else "b"
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)


# Namespace URI of XHTML.  The XPath expressions below bind it to the prefix
# 'x' so that they match both plain and XHTML-namespaced elements.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# <a> elements that carry a 'rel' attribute, at or below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose whitespace-normalised 'class' attribute contains $class_name
# as a complete space-delimited token (both sides are padded with a space
# before the substring test).
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements at or below the context node whose 'id' attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string() value of the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over case-insensitive ``url(...)`` matches; group 1 is the content
# of the parentheses, which may be double-quoted, single-quoted or bare.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over ``@import "..."`` matches (double-quoted form only); group 1
# is the quoted target.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# <label> elements anywhere in the document whose 'for' attribute equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
# NOTE(review): presumably used to split a space-separated 'archive'
# attribute -- the caller is not visible here, confirm.
_archive_re = re.compile(r'[^ ]+')
# Searches for everything up to the first ';' (with no '=' before it), then
# skips optional whitespace and an optional case-insensitive "url=" prefix;
# the remainder of the string is captured in the named group 'url'.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search


def _unquote_match(s, pos):
    """Strip one pair of matching surrounding quotes from ``s``.

    Returns a ``(text, position)`` tuple.  When ``s`` starts and ends with the
    same quote character (``"`` or ``'``), the quotes are removed and ``pos``
    is advanced by one; otherwise both values are returned unchanged.
    """
    for quote in ('"', "'"):
        if s[:1] == quote and s[-1:] == quote:
            return s[1:-1], pos + 1
    return s, pos


def _transform_result(typ, result):
    """Serialise ``result`` back to the type the caller originally passed in.

    ``typ`` is the input type: for a ``bytes`` subclass the result is
    serialised with ``tostring()`` as UTF-8, for a ``unicode`` subclass it is
    serialised with the 'unicode' encoding, and for any other type ``result``
    is handed back untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        # Not a string type: nothing to serialise.
        return result
    return tostring(result, encoding=encoding)


def _nons(tag):
    """Return ``tag`` without its XHTML namespace prefix.

    A string tag of the form ``{<XHTML namespace>...}name`` is reduced to the
    part after the last ``}``.  Anything else (non-string tags, tags without
    a namespace, tags in a different namespace) is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    if tag[0] != '{':
        return tag
    if tag[1:len(XHTML_NAMESPACE)+1] != XHTML_NAMESPACE:
        return tag
    return tag.split('}')[-1]


class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'

    The wrapper keeps no state of its own: every operation re-reads the
    'class' entry of the wrapped attribute mapping and writes the result
    straight back to it.
    """
    def __init__(self, attributes):
        # 'attributes' is the mapping holding the 'class' entry
        # (e.g. ``el.attrib``).
        self._attributes = attributes
        # Returns the current raw attribute value, '' when it is missing.
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError unless 'value' is a non-empty, whitespace-free name."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def _store(self, classes):
        """Write the list of names back, dropping the attribute when it is empty."""
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        self._store([name for name in self._get_class_value().split()
                     if name != value])

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # Cheap substring test first; only split when the name can be there.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated like in ``add()``: an empty name or one
        containing whitespace raises ValueError.  The attribute is only
        written after all names were accepted, so a rejected name leaves
        it untouched.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            self._check_name(value)
            if value not in classes:
                classes.append(value)
                extended = True
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
        except ValueError:
            # Not present yet: switch it on.
            classes.append(value)
            enabled = True
        else:
            enabled = False
        self._store(classes)
        return enabled


class HtmlMixin(object):

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # NOTE(review): the super() lookup is anchored at ``HtmlElement`` (which
        # is not defined in this part of the file) rather than at ``HtmlMixin``,
        # so the call resolves to whatever follows ``HtmlElement`` in the
        # instance's MRO.  Presumably that is the etree base ``set()`` --
        # confirm against the ``HtmlElement`` definition before changing this.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        The element's 'class' attribute, exposed as a set-like ``Classes`` view.
        """
        attributes = self.attrib
        return Classes(attributes)

    @classes.setter
    def classes(self, classes):
        # Assignment exists only to support in-place operators such as
        # "el.classes |= ...", so the new value must be a Classes wrapper.
        assert isinstance(classes, Classes)
        value = classes._get_class_value()
        if not value:
            # Nothing left: remove the attribute instead of storing ''.
            if self.get('class') is not None:
                del self.attrib['class']
            return
        self.set('class', value)

    @property
    def base_url(self):
        """
        The base URL that was given when the page was parsed.

        Combine it with a link via ``urlparse.urljoin(el.base_url, href)``
        to obtain an absolute URL.
        """
        tree = self.getroottree()
        return tree.docinfo.URL

    @property
    def forms(self):
        """
        A list of all <form> elements (plain or XHTML-namespaced) found
        at or below this element.
        """
        found = _forms_xpath(self)
        return found

    @property
    def body(self):
        """
        The document's <body> element.  May be accessed on a child
        element as well, since the lookup covers the whole document.

        Raises IndexError if the document has no <body>.
        """
        matches = self.xpath('//body|//x:body',
                             namespaces={'x': XHTML_NAMESPACE})
        return matches[0]

    @property
    def head(self):
        """
        The document's <head> element.  May be accessed on a child
        element as well, since the lookup covers the whole document.

        Raises IndexError if the document has no <head>.
        """
        matches = self.xpath('//head|//x:head',
                             namespaces={'x': XHTML_NAMESPACE})
        return matches[0]

    @property
    def label(self):
        """
        The <label> element associated with this element, if any.

        Reading returns the first <label> in the document whose 'for'
        attribute equals this element's 'id', or None when the element has
        no id or no such label exists.  Assigning a <label> element points
        its 'for' attribute at this element's id; deleting removes the
        'for' attribute from the currently associated label.
        """
        element_id = self.get('id')
        if element_id:
            matches = _label_xpath(self, id=element_id)
            if matches:
                return matches[0]
        return None

    @label.setter
    def label(self, label):
        element_id = self.get('id')
        if not element_id:
            # Without an id there is nothing for the label to refer to.
            message = (
                "You cannot set a label for an element (%r) that has no id"
                % self)
            raise TypeError(message)
        if _nons(label.tag) != 'label':
            message = (
                "You can only assign label to a label element (not %r)"
                % label)
            raise TypeError(message)
        label.set('for', element_id)

    @label.deleter
    def label(self):
        current = self.label
        if current is None:
            return
        del current.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
Loading ...