# Copyright (c) 2004 Ian Bicking. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. Neither the name of Ian Bicking nor the names of its contributors may
# be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""The ``lxml.html`` tool set for HTML handling.
"""
from __future__ import absolute_import
# Public names exported by a star-import of this module.  Each name is
# listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
import copy
import sys
import re
from functools import partial
try:
# while unnecessary, importing from 'collections.abc' is the right way to do it
from collections.abc import MutableMapping, MutableSet
except ImportError:
from collections import MutableMapping, MutableSet
from .. import etree
from . import defs
from ._setmixin import SetMixin
try:
from urlparse import urljoin
except ImportError:
# Python 3
from urllib.parse import urljoin
try:
unicode
except NameError:
# Python 3
unicode = str
try:
basestring
except NameError:
# Python 3
basestring = (str, bytes)
def __fix_docstring(s):
    """Strip string-literal prefixes from lines that start with a quoted value.

    On Python 3 a ``u'`` at the start of a line (after optional whitespace)
    is rewritten to ``'``; on Python 2 the same is done for ``b'``.  Falsy
    input (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    running_py3 = sys.version_info[0] >= 3
    # Only the prefix belonging to the *other* major version is removed.
    pattern = r"^(\s*)u'" if running_py3 else r"^(\s*)b'"
    return re.sub(pattern, r"\1'", s, flags=re.M)
# Namespace URI of XHTML.  The XPath expressions below bind it to the prefix
# "x" so that they match both un-namespaced and XHTML-namespaced elements.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
# Selects <a> elements that carry a "rel" attribute, searching the context
# node and all of its descendants.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
namespaces={'x':XHTML_NAMESPACE})
# Selects <option> elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
namespaces={'x':XHTML_NAMESPACE})
# Selects <form> elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Selects elements whose whitespace-normalised "class" attribute contains
# $class_name as a complete, space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Selects elements whose "id" attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Evaluates the XPath string() function on the context node.
_collect_string_content = etree.XPath("string()")
# Iterates over url(...) occurrences (case-insensitive); group 1 is the
# argument, which may be double-quoted, single-quoted or unquoted.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Iterates over @import "..." occurrences; only the double-quoted form is
# matched, and group 1 is the text between the quotes.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
# Selects <label> elements anywhere in the document whose "for" attribute
# equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
namespaces={'x':XHTML_NAMESPACE})
# Matches a run of non-space characters (its users are outside this section).
_archive_re = re.compile(r'[^ ]+')
# Searches for "<text>;<optional 'url='><rest>" (case-insensitive) and exposes
# <rest>, up to the end of the string, as the named group "url".
_parse_meta_refresh_url = re.compile(
r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
def _unquote_match(s, pos):
    """Remove one pair of matching surrounding quotes from ``s``.

    When ``s`` starts and ends with the same quote character (``"`` or
    ``'``), return the text between them together with ``pos + 1``, i.e. the
    position shifted past the removed opening quote.  Otherwise return
    ``s`` and ``pos`` untouched.
    """
    opening, closing = s[:1], s[-1:]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass the
    result is serialised with ``tostring()`` as UTF-8, for a ``unicode``
    subclass it is serialised as a unicode string, and for any other type
    ``result`` is handed back unchanged.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        # Not a string type: nothing to serialise.
        return result
    return tostring(result, encoding=encoding)
def _nons(tag):
    """Strip the XHTML namespace from a tag name, if it carries one.

    A string of the form ``{XHTML_NAMESPACE}localname`` is reduced to
    ``localname``.  Any other string, and any value that is not a
    ``basestring``, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    # Compare against the complete "{namespace}" prefix, closing brace
    # included, so that a different namespace whose URI merely starts with
    # the XHTML URI is left alone.  Slicing (instead of indexing tag[0])
    # also means an empty tag is returned as-is rather than raising
    # IndexError.
    prefix = '{' + XHTML_NAMESPACE + '}'
    if tag[:len(prefix)] == prefix:
        return tag[len(prefix):]
    return tag
class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.

    The collection is a live view: every operation re-reads the ``class``
    entry of the wrapped attribute mapping and writes changes straight back
    to it.  The order of the class names is preserved.

    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        """Wrap ``attributes``, a mapping that holds the 'class' entry."""
        self._attributes = attributes
        # Reads the current attribute value, or '' when it is absent.
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError if ``value`` is empty or contains whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        if value in classes:
            return
        classes.append(value)
        self._attributes['class'] = ' '.join(classes)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.  The 'class' attribute is
        deleted altogether once no class name is left.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = [name for name in self._get_class_value().split()
                   if name != value]
        if classes:
            self._attributes['class'] = ' '.join(classes)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        classes = self._get_class_value()
        # The substring test is a cheap pre-check; the split() test makes
        # sure that ``name`` is a complete class name.
        return name in classes and name in classes.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Every name is validated the same way as in ``add()``: an empty name
        or one containing whitespace raises ValueError, in which case the
        attribute is left unchanged.
        """
        classes = self._get_class_value().split()
        extended = False
        for value in values:
            self._check_name(value)
            if value not in classes:
                classes.append(value)
                extended = True
        # Written only once, after all names have passed validation.
        if extended:
            self._attributes['class'] = ' '.join(classes)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        classes = self._get_class_value().split()
        try:
            classes.remove(value)
            enabled = False
        except ValueError:
            classes.append(value)
            enabled = True
        if classes:
            self._attributes['class'] = ' '.join(classes)
        else:
            # Only reachable after removing the last remaining class name,
            # so the attribute is known to exist.
            del self._attributes['class']
        return enabled
class HtmlMixin(object):
def set(self, key, value=None):
    """set(self, key, value=None)

    Set the attribute ``key`` of this element to ``value``.

    Leaving ``value`` out, or passing None, creates a 'boolean' attribute
    that has no value, e.g. ``form.set('novalidate')`` results in
    "<form novalidate></form>".
    """
    # Delegate to the implementation that follows HtmlElement in the MRO.
    inherited_set = super(HtmlElement, self).set
    inherited_set(key, value)
@property
def classes(self):
    """
    The element's 'class' attribute, wrapped in a set-like ``Classes``
    object that operates on ``self.attrib``.
    """
    return Classes(self.attrib)

@classes.setter
def classes(self, classes):
    # Assignment exists only to support in-place operators such as
    # "el.classes |= ...", which hand back a Classes instance.
    assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc.
    class_value = classes._get_class_value()
    if not class_value:
        # Nothing left to store: drop the attribute if it is there.
        if self.get('class') is not None:
            del self.attrib['class']
        return
    self.set('class', class_value)
@property
def base_url(self):
    """
    The base URL that was given when the page was parsed, as recorded in
    the docinfo of this element's root tree.

    Combine it with a link via ``urlparse.urljoin(el.base_url, href)`` to
    obtain an absolute URL.
    """
    root_tree = self.getroottree()
    return root_tree.docinfo.URL
@property
def forms(self):
    """
    All <form> elements (plain or XHTML-namespaced) found at or below
    this element, as a list.
    """
    found_forms = _forms_xpath(self)
    return found_forms
@property
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError if the document contains no <body> element.
    """
    candidates = self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})
    return candidates[0]
@property
def head(self):
    """
    Return the <head> element.  Can be called from a child element
    to get the document's head.

    Raises IndexError if the document contains no <head> element.
    """
    candidates = self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})
    return candidates[0]
@property
def label(self):
    """
    Get or set any <label> element associated with this element.

    The association goes through this element's ``id``: the getter returns
    the first <label> whose ``for`` attribute equals that id, or None when
    the element has no id or no such label exists.  Assigning a label
    element sets its ``for`` attribute to this element's id; deleting
    removes the ``for`` attribute from the currently associated label.
    """
    element_id = self.get('id')
    if element_id:
        matches = _label_xpath(self, id=element_id)
        if matches:
            return matches[0]
    return None

@label.setter
def label(self, label):
    element_id = self.get('id')
    if not element_id:
        # Without an id there is nothing a label's "for" could point to.
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    tag_name = _nons(label.tag)
    if tag_name != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)

@label.deleter
def label(self):
    current = self.label
    if current is None:
        return
    del current.attrib['for']
def drop_tree(self):
"""
Removes this element from the tree, including its children and
text. The tail text is joined to the previous element or
parent.
"""
parent = self.getparent()
assert parent is not None
if self.tail:
previous = self.getprevious()
if previous is None:
parent.text = (parent.text or '') + self.tail
else:
Loading ...