html/clean.py · aroundthecode/lxml

Learn more » Push, build, and install RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages
aroundthecode / lxml python

Repository URL to install this package:
Version: 4.2.5

/ html / clean.py

"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the `Cleaner` class for
details.
"""

import re
import copy
try:
    from urlparse import urlsplit
    from urllib import unquote_plus
except ImportError:
    # Python 3
    from urllib.parse import urlsplit, unquote_plus
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, XHTML_NAMESPACE
from lxml.html import xhtml_to_html, _transform_result

try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    basestring = (str, bytes)


__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
#   whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
#   allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
#   metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
#   you don't always have to have the charset set, if the page has no charset
#   and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_is_image_dataurl = re.compile(
    r'^data:image/.+;base64', re.I).search
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
def _is_javascript_scheme(s):
    if _is_image_dataurl(s):
        return None
    return _is_possibly_malicious_scheme(s)

_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})


class Cleaner(object):
    """
    Instances cleans the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)
        if self.inline_style is None and 'inline_style' not in kw:
            self.inline_style = self.style

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than than -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', old)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
Loading ...
aroundthecode / lxml python

Version: 4.2.5

/ html / clean.py

Products

About

Resources

Contact Gemfury