# Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Bower components, Debian packages, RPM packages, NuGet packages
#
# squarecapadmin / bleach (python)
#
# Repository URL to install this package:
#
# Version: 1.4
#
# bleach/tests/test_links.py

try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

from html5lib.tokenizer import HTMLTokenizer
from nose.tools import eq_

from bleach import linkify, url_re, DEFAULT_CALLBACKS as DC
from bleach.tests.tools import in_


def test_url_re():
    """Check that url_re finds nothing in prose that contains an ellipsis."""
    def no_match(text):
        found = url_re.search(text)
        if found:
            # Report exactly which slice of the text was (wrongly) matched.
            begin, end = found.span()
            assert not found, 'matched {0!s}'.format(text[begin:end])
    yield no_match, 'just what i am looking for...it'


def test_empty():
    """Linkifying the empty string gives back the empty string."""
    result = linkify('')
    eq_('', result)


def test_simple_link():
    """http, https and scheme-less URLs embedded in text become links."""
    # Each case is (input text, acceptable outputs); the two outputs differ
    # only in the order of the href/rel attributes.
    cases = (
        ('a http://example.com link',
         ('a <a href="http://example.com" rel="nofollow">http://example.com'
          '</a> link',
          'a <a rel="nofollow" href="http://example.com">http://example.com'
          '</a> link')),
        ('a https://example.com link',
         ('a <a href="https://example.com" rel="nofollow">https://example.com'
          '</a> link',
          'a <a rel="nofollow" href="https://example.com">https://example.com'
          '</a> link')),
        ('a example.com link',
         ('a <a href="http://example.com" rel="nofollow">example.com</a> link',
          'a <a rel="nofollow" href="http://example.com">example.com</a> link')),
    )
    for text, accepted in cases:
        in_(accepted, linkify(text))


def test_trailing_slash():
    """URLs that end in a slash keep that slash inside the link."""
    # Either attribute order is acceptable in the output.
    templates = ('<a href="{0}" rel="nofollow">{0}</a>',
                 '<a rel="nofollow" href="{0}">{0}</a>')
    urls = ('http://examp.com/',
            'http://example.com/foo/',
            'http://example.com/foo/bar/')
    for url in urls:
        accepted = tuple(tpl.format(url) for tpl in templates)
        in_(accepted, linkify(url))


def test_mangle_link():
    """A callback may rewrite the href attribute of the link."""
    def filter_url(attrs, new=False):
        # Send the original target through a bouncer URL, URL-quoted.
        target = attrs['href']
        attrs['href'] = 'http://bouncer/?u={0!s}'.format(quote_plus(target))
        return attrs

    callbacks = DC + [filter_url]
    accepted = (
        '<a href="http://bouncer/?u=http%3A%2F%2Fexample.com" rel="nofollow">'
        'http://example.com</a>',
        '<a rel="nofollow" href="http://bouncer/?u=http%3A%2F%2Fexample.com">'
        'http://example.com</a>',
    )
    in_(accepted, linkify('http://example.com', callbacks))


def test_mangle_text():
    """A callback may replace the inner text of a link."""

    def force_text(attrs, new=False):
        attrs['_text'] = 'bar'
        return attrs

    source = 'http://ex.mp <a href="http://ex.mp/foo">foo</a>'
    expected = ('<a href="http://ex.mp">bar</a> '
                '<a href="http://ex.mp/foo">bar</a>')
    eq_(expected, linkify(source, [force_text]))


def test_email_link():
    """Email addresses become mailto links only when parse_email is true."""
    # Each case is (expected output(s), parse_email flag, input text).
    cases = (
        ('a james@example.com mailto', False, 'a james@example.com mailto'),
        ('a james@example.com.au mailto', False,
            'a james@example.com.au mailto'),
        ('a <a href="mailto:james@example.com">james@example.com</a> mailto',
            True, 'a james@example.com mailto'),
        ('aussie <a href="mailto:james@example.com.au">'
            'james@example.com.au</a> mailto', True,
            'aussie james@example.com.au mailto'),
        # This is kind of a pathological case. I guess we do our best here.
        (('email to <a href="james@example.com" rel="nofollow">'
          'james@example.com</a>',
          'email to <a rel="nofollow" href="james@example.com">'
          'james@example.com</a>'),
         True,
         'email to <a href="james@example.com">james@example.com</a>'),
    )

    def _check(expected, parse_email, text):
        actual = linkify(text, parse_email=parse_email)
        # A sequence of expectations means any one of them is acceptable.
        compare = in_ if isinstance(expected, (list, tuple)) else eq_
        compare(expected, actual)

    for case in cases:
        yield (_check,) + case


def test_email_link_escaping():
    """Quotes and '>' in an email's local part are escaped in the link."""
    # Each case is (expected output, input text).
    cases = (
        ('''<a href='mailto:"james"@example.com'>'''
            '''"james"@example.com</a>''',
            '"james"@example.com'),
        ('''<a href="mailto:&quot;j'ames&quot;@example.com">'''
            '''"j'ames"@example.com</a>''',
            '"j\'ames"@example.com'),
        ('''<a href='mailto:"ja>mes"@example.com'>'''
            '''"ja&gt;mes"@example.com</a>''',
            '"ja>mes"@example.com'),
    )

    def _check(expected, text):
        actual = linkify(text, parse_email=True)
        eq_(expected, actual)

    for expected, text in cases:
        yield _check, expected, text


def test_prevent_links():
    """A callback that returns None removes an existing link or stops a new
    one from being created."""

    def noop(attrs, new=False):
        return attrs

    def no_new_links(attrs, new=False):
        # Veto only links that linkify is about to create.
        return None if new else attrs

    def no_old_links(attrs, new=False):
        # Veto only links that were already present in the input.
        return attrs if new else None

    in_text = 'a ex.mp <a href="http://example.com">example</a>'
    out_text = 'a <a href="http://ex.mp">ex.mp</a> example'
    # Each case is (callback list, expected output, failure message).
    cases = (
        ([noop], ('a <a href="http://ex.mp">ex.mp</a> '
                  '<a href="http://example.com">example</a>'), 'noop'),
        ([no_new_links, noop], in_text, 'no new, noop'),
        ([noop, no_new_links], in_text, 'noop, no new'),
        ([no_old_links, noop], out_text, 'no old, noop'),
        ([noop, no_old_links], out_text, 'noop, no old'),
        ([no_old_links, no_new_links], 'a ex.mp example', 'no links'),
    )

    def _check(callbacks, expected, msg):
        actual = linkify(in_text, callbacks)
        eq_(expected, actual, msg)

    for callbacks, expected, msg in cases:
        yield _check, callbacks, expected, msg


def test_set_attrs():
    """A callback may add arbitrary attributes to a link."""

    def set_attr(attrs, new=False):
        attrs['rev'] = 'canonical'
        return attrs

    accepted = ('<a href="http://ex.mp" rev="canonical">ex.mp</a>',
                '<a rev="canonical" href="http://ex.mp">ex.mp</a>')
    actual = linkify('ex.mp', [set_attr])
    in_(accepted, actual)


def test_only_proto_links():
    """New links are created only when the text carries a protocol."""
    def only_proto(attrs, new=False):
        # Existing links pass through untouched; '_text' is only inspected
        # for links that are about to be created.
        if not new or attrs['_text'].startswith(('http:', 'https:')):
            return attrs
        return None

    source = 'a ex.mp http://ex.mp <a href="/foo">bar</a>'
    expected = ('a ex.mp <a href="http://ex.mp">http://ex.mp</a> '
                '<a href="/foo">bar</a>')
    actual = linkify(source, [only_proto])
    eq_(expected, actual)


def test_stop_email():
    """A callback returning None keeps an email link from being created."""
    def no_email(attrs, new=False):
        return None if attrs['href'].startswith('mailto:') else attrs

    text = 'do not link james@example.com'
    actual = linkify(text, parse_email=True, callbacks=[no_email])
    eq_(text, actual)


def test_tlds():
    """.com, .co.uk, .edu and bit.ly/fun are linked; 'example.xxx' and
    ' brie' come back unchanged."""
    # Either attribute order is acceptable in the output.
    templates = ('<a href="http://{0}" rel="nofollow">{0}</a>',
                 '<a rel="nofollow" href="http://{0}">{0}</a>')

    def assert_linked(text):
        accepted = tuple(tpl.format(text) for tpl in templates)
        in_(accepted, linkify(text))

    for domain in ('example.com', 'example.co.uk', 'example.edu'):
        assert_linked(domain)
    for plain in ('example.xxx', ' brie'):
        eq_(plain, linkify(plain))
    assert_linked('bit.ly/fun')


def test_escaping():
    """A stray '<' in plain text comes back HTML-escaped."""
    escaped = linkify('< unrelated')
    eq_('&lt; unrelated', escaped)


def test_nofollow_off():
    """With an empty callback list the link carries no rel attribute."""
    no_callbacks = []
    expected = '<a href="http://example.com">example.com</a>'
    eq_(expected, linkify('example.com', no_callbacks))


def test_link_in_html():
    """URLs nested inside other HTML elements are linked in place."""
    # Each case is (input markup, acceptable outputs).
    cases = (
        ('<i>http://yy.com</i>',
         ('<i><a href="http://yy.com" rel="nofollow">http://yy.com</a></i>',
          '<i><a rel="nofollow" href="http://yy.com">http://yy.com</a></i>')),
        ('<em><strong>http://xx.com</strong></em>',
         ('<em><strong><a href="http://xx.com" rel="nofollow">http://xx.com'
          '</a></strong></em>',
          '<em><strong><a rel="nofollow" href="http://xx.com">http://xx.com'
          '</a></strong></em>')),
    )
    for markup, accepted in cases:
        in_(accepted, linkify(markup))


def test_links_https():
    """A bare https URL is linked."""
    url = 'https://yy.com'
    accepted = ('<a href="https://yy.com" rel="nofollow">https://yy.com</a>',
                '<a rel="nofollow" href="https://yy.com">https://yy.com</a>')
    in_(accepted, linkify(url))


def test_add_rel_nofollow():
    """Verify that rel="nofollow" is added to an existing link."""
    existing = '<a href="http://yy.com">http://yy.com</a>'
    accepted = ('<a href="http://yy.com" rel="nofollow">http://yy.com</a>',
                '<a rel="nofollow" href="http://yy.com">http://yy.com</a>')
    in_(accepted, linkify(existing))


def test_url_with_path():
    """A URL with a multi-segment path is linked in full."""
    url = 'http://example.com/path/to/file'
    accepted = (
        '<a href="http://example.com/path/to/file" rel="nofollow">'
        'http://example.com/path/to/file</a>',
        '<a rel="nofollow" href="http://example.com/path/to/file">'
        'http://example.com/path/to/file</a>',
    )
    in_(accepted, linkify(url))


def test_link_ftp():
    """An ftp:// URL is linked."""
    url = 'ftp://ftp.mozilla.org/some/file'
    accepted = (
        '<a href="ftp://ftp.mozilla.org/some/file" rel="nofollow">'
        'ftp://ftp.mozilla.org/some/file</a>',
        '<a rel="nofollow" href="ftp://ftp.mozilla.org/some/file">'
        'ftp://ftp.mozilla.org/some/file</a>',
    )
    in_(accepted, linkify(url))


def test_link_query():
    """Query strings are kept as part of the linked URL."""
    # {0} is the href, {1} is the visible link text.
    templates = ('<a href="{0}" rel="nofollow">{1}</a>',
                 '<a rel="nofollow" href="{0}">{1}</a>')
    # Each case is (input text, expected href).
    cases = (
        ('http://xx.com/?test=win', 'http://xx.com/?test=win'),
        ('xx.com/?test=win', 'http://xx.com/?test=win'),
        ('xx.com?test=win', 'http://xx.com?test=win'),
    )
    for text, href in cases:
        accepted = tuple(tpl.format(href, text) for tpl in templates)
        in_(accepted, linkify(text))


def test_link_fragment():
    """A '#fragment' suffix stays inside the linked URL."""
    url = 'http://xx.com/path#frag'
    accepted = (
        '<a href="http://xx.com/path#frag" rel="nofollow">'
        'http://xx.com/path#frag</a>',
        '<a rel="nofollow" href="http://xx.com/path#frag">'
        'http://xx.com/path#frag</a>',
    )
    in_(accepted, linkify(url))


def test_link_entities():
    """An '&' in a URL appears as '&amp;' in both the href and the text."""
    url = 'http://xx.com/?a=1&b=2'
    accepted = (
        '<a href="http://xx.com/?a=1&amp;b=2" rel="nofollow">'
        'http://xx.com/?a=1&amp;b=2</a>',
        '<a rel="nofollow" href="http://xx.com/?a=1&amp;b=2">'
        'http://xx.com/?a=1&amp;b=2</a>',
    )
    in_(accepted, linkify(url))


def test_escaped_html():
    """If I pass in escaped HTML, it should probably come out escaped."""
    escaped = '&lt;em&gt;strong&lt;/em&gt;'
    actual = linkify(escaped)
    eq_(escaped, actual)


def test_link_http_complete():
    """A URL with credentials, path, query and fragment is linked whole."""
    raw_url = 'https://user:pass@ftp.mozilla.org/x/y.exe?a=b&c=d&e#f'
    # The same URL as it should appear in the output, with '&' escaped.
    escaped_url = ('https://user:pass@ftp.mozilla.org/x/y.exe'
                   '?a=b&amp;c=d&amp;e#f')
    templates = ('<a href="{0}" rel="nofollow">{0}</a>',
                 '<a rel="nofollow" href="{0}">{0}</a>')
    accepted = tuple(tpl.format(escaped_url) for tpl in templates)
    in_(accepted, linkify(raw_url))


def test_non_url():
    """document.vulnerable should absolutely not be linkified."""
    text = 'document.vulnerable'
    actual = linkify(text)
    eq_(text, actual)


def test_javascript_url():
    """javascript: urls should never be linkified."""
    text = 'javascript:document.vulnerable'
    actual = linkify(text)
    eq_(text, actual)


def test_unsafe_url():
    """Any unsafe char ({}[]<>, etc.) in the path should end URL scanning."""
    text = 'All your{"xx.yy.com/grover.png"}base are'
    accepted = (
        'All your{"<a href="http://xx.yy.com/grover.png" '
        'rel="nofollow">xx.yy.com/grover.png</a>"}base are',
        'All your{"<a rel="nofollow" href="http://xx.yy.com/grover.png"'
        '>xx.yy.com/grover.png</a>"}base are',
    )
    in_(accepted, linkify(text))


def test_skip_pre():
    """skip_pre=True leaves bare URLs inside <pre> alone; links that already
    exist inside <pre> still get rel="nofollow"."""
    source = 'http://xx.com <pre>http://xx.com</pre>'
    # Expected when text inside <pre> is skipped.
    outside_only = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        '<pre>http://xx.com</pre>',
        '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
        '<pre>http://xx.com</pre>',
    )
    # Expected when text inside <pre> is linkified as well.
    everywhere = (
        '<a href="http://xx.com" rel="nofollow">http://xx.com</a> '
        '<pre><a href="http://xx.com" rel="nofollow">http://xx.com'
        '</a></pre>',
        '<a rel="nofollow" href="http://xx.com">http://xx.com</a> '
        '<pre><a rel="nofollow" href="http://xx.com">http://xx.com'
        '</a></pre>',
    )
    in_(outside_only, linkify(source, skip_pre=True))
    in_(everywhere, linkify(source))

    prelinked = '<pre><a href="http://xx.com">xx</a></pre>'
    with_nofollow = (
        '<pre><a href="http://xx.com" rel="nofollow">xx</a></pre>',
        '<pre><a rel="nofollow" href="http://xx.com">xx</a></pre>',
    )
    in_(with_nofollow, linkify(prelinked))
    in_(with_nofollow, linkify(prelinked, skip_pre=True))


def test_libgl():
    """libgl.so.1 should not be linkified."""
    filename = 'libgl.so.1'
    eq_('libgl.so.1', linkify(filename))


def test_end_of_sentence():
    """example.com. should match, with the trailing dots left outside the
    link."""
    # {0} is the URL without scheme, {1} the trailing punctuation.
    link_templates = ('<a href="http://{0!s}" rel="nofollow">{0!s}</a>{1!s}',
                      '<a rel="nofollow" href="http://{0!s}">{0!s}</a>{1!s}')

    def check(u, p):
        text = '{0!s}{1!s}'.format(u, p)
        accepted = [tpl.format(u, p) for tpl in link_templates]
        in_(accepted, linkify(text))

    cases = (
        ('example.com', '.'),
        ('example.com', '...'),
        ('ex.com/foo', '.'),
        ('ex.com/foo', '....'),
    )

    for url, dots in cases:
        yield check, url, dots


def test_end_of_clause():
    """example.com/foo, shouldn't include the ,"""
    text = 'ex.com/foo, bar'
    accepted = (
        '<a href="http://ex.com/foo" rel="nofollow">ex.com/foo</a>, bar',
        '<a rel="nofollow" href="http://ex.com/foo">ex.com/foo</a>, bar',
    )
    in_(accepted, linkify(text))


def test_sarcasm():
    """Jokes should crash.<sarcasm/>

    In this test the <sarcasm/> tag is expected to come back escaped.
    """
    actual = linkify('Yeah right <sarcasm/>')
    eq_('Yeah right &lt;sarcasm/&gt;', actual)


def test_wrapping_parentheses():
    """URLs wrapped in parentheses should not include them."""
    # Placeholders: {0} text before the link, {1} href without "http://",
    # {2} visible link text, {3} text after the link.
    templates = ('{0!s}<a href="http://{1!s}" rel="nofollow">{2!s}</a>{3!s}',
                 '{0!s}<a rel="nofollow" href="http://{1!s}">{2!s}</a>{3!s}')

    # Each case is (input text, values for the four placeholders).
    cases = (
        ('(example.com)', ('(', 'example.com', 'example.com', ')')),
        ('(example.com/)', ('(', 'example.com/', 'example.com/', ')')),
        ('(example.com/foo)', ('(', 'example.com/foo',
         'example.com/foo', ')')),
        ('(((example.com/))))', ('(((', 'example.com/)',
         'example.com/)', ')))')),
        ('example.com/))', ('', 'example.com/))', 'example.com/))', '')),
        ('http://en.wikipedia.org/wiki/Test_(assessment)',
         ('', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', '')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment)',
          'http://en.wikipedia.org/wiki/Test_(assessment)', ')')),
        ('((http://en.wikipedia.org/wiki/Test_(assessment))',
         ('((', 'en.wikipedia.org/wiki/Test_(assessment',
          'http://en.wikipedia.org/wiki/Test_(assessment', '))')),
        ('(http://en.wikipedia.org/wiki/Test_(assessment)))',
         ('(', 'en.wikipedia.org/wiki/Test_(assessment))',
          'http://en.wikipedia.org/wiki/Test_(assessment))', ')')),
        ('(http://en.wikipedia.org/wiki/)Test_(assessment',
         ('(', 'en.wikipedia.org/wiki/)Test_(assessment',
          'http://en.wikipedia.org/wiki/)Test_(assessment', '')),
    )

    def check(test, expected_output):
        accepted = [tpl.format(*expected_output) for tpl in templates]
        in_(accepted, linkify(test))

    for text, parts in cases:
        yield check, text, parts


def test_ports():
    """URLs can contain port numbers."""
    # {0} is the linked URL, {1} whatever text is left over after the link.
    templates = ('<a href="{0}" rel="nofollow">{0}</a>{1}',
                 '<a rel="nofollow" href="{0}">{0}</a>{1}')

    # Each case is (input text, (linked URL, trailing text)).
    cases = (
        ('http://foo.com:8000', ('http://foo.com:8000', '')),
        ('http://foo.com:8000/', ('http://foo.com:8000/', '')),
        ('http://bar.com:xkcd', ('http://bar.com', ':xkcd')),
        ('http://foo.com:81/bar', ('http://foo.com:81/bar', '')),
        ('http://foo.com:', ('http://foo.com', ':')),
    )

    def check(test, output):
        accepted = [tpl.format(*output) for tpl in templates]
        in_(accepted, linkify(test))

    for text, parts in cases:
        yield check, text, parts


def test_tokenizer():
    """Linkify doesn't always have to sanitize.

    By default the <x> element comes back escaped; with
    tokenizer=HTMLTokenizer the markup is returned unchanged.
    """
    raw = '<em>test<x></x></em>'
    sanitized = '<em>test&lt;x&gt;&lt;/x&gt;</em>'
    eq_(sanitized, linkify(raw))
    eq_(raw, linkify(raw, tokenizer=HTMLTokenizer))


def test_ignore_bad_protocols():
    """Text glued in front of 'http://' is not treated as a protocol."""
    unlinkable = 'foohttp://bar'
    eq_('foohttp://bar', linkify(unlinkable))

    # Only the domain after the bogus prefix is expected to be linked.
    accepted = (
        'fohttp://<a href="http://exampl.com" rel="nofollow">exampl.com</a>',
        'fohttp://<a rel="nofollow" href="http://exampl.com">exampl.com</a>',
    )
    in_(accepted, linkify('fohttp://exampl.com'))


def test_max_recursion_depth():
    """If we hit the max recursion depth, just return the string."""
    depth = 2000
    # 2000 nested <em> elements wrapped around a single word.
    nested = ''.join(('<em>' * depth, 'foo', '</em>' * depth))
    eq_(nested, linkify(nested))


def test_link_emails_and_urls():
    """parse_email=True shouldn't prevent URLs from getting linkified."""
    text = 'http://example.com person@example.com'
    accepted = (
        '<a href="http://example.com" rel="nofollow">'
        'http://example.com</a> <a href="mailto:person@example.com">'
        'person@example.com</a>',
        '<a rel="nofollow" href="http://example.com">'
        'http://example.com</a> <a href="mailto:person@example.com">'
        'person@example.com</a>',
    )
    actual = linkify(text, parse_email=True)
    in_(accepted, actual)


def test_links_case_insensitive():
    """Protocols and domain names are case insensitive."""
    url = 'HTTP://EXAMPLE.COM'
    # The upper-case spelling is expected in both the href and the text.
    templates = ('<a href="{0}" rel="nofollow">{0}</a>',
                 '<a rel="nofollow" href="{0}">{0}</a>')
    accepted = tuple(tpl.format(url) for tpl in templates)
    in_(accepted, linkify(url))


def test_elements_inside_links():
    """Child elements of an existing link are kept when rel is added."""
    source_template = '<a href="#">{0}</a>'
    linked_templates = ('<a href="#" rel="nofollow">{0}</a>',
                        '<a rel="nofollow" href="#">{0}</a>')
    for inner in ('hello<br>', '<strong>bold</strong> hello<br>'):
        accepted = tuple(tpl.format(inner) for tpl in linked_templates)
        in_(accepted, linkify(source_template.format(inner)))