Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
crossover / opt / cxoffice / lib / python / cxhtmlutils.py
Size: Mime:
# (c) Copyright 2018. CodeWeavers, Inc.

import re
import sys

import cxutils

# Regular expressions for parse_meta_refresh()
#
# _REFRESH_RE and _CONTENT_RE are bytes patterns: they are applied to the raw
# attributes of the <meta> tag. The remaining patterns are text patterns: they
# are applied to the value of the content attribute once it has been
# converted to unicode and HTML-unescaped.

# Identifies an http-equiv=refresh attribute; the value quotes are optional.
_REFRESH_RE = re.compile(b'http-equiv\\s*=\\s*["\']?refresh["\']?', re.IGNORECASE)
# Captures the value of the content attribute, which must be quoted. The
# closing quote must be the same character as the opening one.
_CONTENT_RE = re.compile(b'content\\s*=\\s*(?P<quote>["\'])(?P<content>.*?)(?P=quote)', re.IGNORECASE)
# Captures the (possibly empty) delay at the start of the content value. It
# must be followed by a semi-colon, by whitespace, or by the end of the value.
_CONTENT_DELAY_RE = re.compile(r'^(?P<delay>[0-9]*)(?:\s*;|\s+|$)')

# Matches the optional 'url=' prefix in front of the target URL.
# Firefox still processes the redirect if the 'url=' is omitted, for instance
# if content="0;http://...".
_CONTENT_PREFIX_RE = re.compile(r'^url\s*=\s*', re.IGNORECASE)

# The target URL may or may not be quoted.
# URLs cannot contain single or double quotes. They may contain spaces,
# although they really should not. The captured URL never ends with a space.
_CONTENT_URL_RE = re.compile(r'^["\']?\s*(?P<url>[^"\']*[^ "\'])')

def parse_meta_refresh(meta):
    """Given a <meta ... /> tag string, returns a (match, delay, url) triplet.

    meta is normally the raw bytes of the tag's attributes. On Python 3 a
    text string is accepted too: it is encoded as UTF-8 before being parsed.

    match - True if the string is an http-equiv=refresh <meta> tag.
    delay - None if the content field is missing or invalid, and the delay
            in seconds before the redirect happens otherwise. An omitted
            delay (content=";url") counts as 0.
    url   - None if no URL was specified (reload the same page), and the
            URL to load otherwise.
    """
    if sys.version_info >= (3,) and isinstance(meta, str):
        # The tag-level regular expressions are bytes patterns and would
        # raise a TypeError if applied to a text string.
        meta = meta.encode('utf-8')

    if not _REFRESH_RE.search(meta):
        # This is not an http-equiv <meta> tag
        return (False, None, None)

    match = _CONTENT_RE.search(meta)
    if not match:
        return (True, None, None)

    # Note that older versions of HTMLParser pretty much only
    # decode '&amp;' which luckily is mostly what we're after.
    if sys.version_info < (3,):
        import HTMLParser # pylint: disable=E0401
        unescape = HTMLParser.HTMLParser().unescape
    else:
        import html # pylint: disable=E0401
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is its replacement and is available from Python 3.4 on.
        unescape = getattr(html, 'unescape', None)
        if unescape is None:
            # Python 3.0 - 3.3
            import html.parser # pylint: disable=E0401
            unescape = html.parser.HTMLParser().unescape
    content = unescape(cxutils.string_to_unicode(match.group('content'))).lstrip()

    match = _CONTENT_DELAY_RE.search(content)
    if not match:
        return (True, None, None)

    if match.group('delay') == "":
        # Firefox treats a missing delay as an immediate refresh
        delay = 0
    else:
        delay = int(match.group('delay'))
    content = content[match.end():].lstrip()

    # The 'url=' prefix is optional
    match = _CONTENT_PREFIX_RE.search(content)
    if match:
        content = content[match.end():].lstrip()

    match = _CONTENT_URL_RE.search(content)
    if match:
        url = match.group('url').lstrip()
    else:
        url = None
    return (True, delay, url)


# Regular expressions for is_html()
_HTML_RE = re.compile(br'<(?:!doctype html|html|meta|p|script|title|\?xml)[ >]', re.IGNORECASE)
_BODY_RE = re.compile(br'<body[ >]', re.IGNORECASE)
_META_RE = re.compile(br'<meta (?P<attrs>[^>]+)>', re.IGNORECASE)

def is_html(filename, size):
    """Checks whether the specified file is an html file, or similar, and if
    it is, whether it redirects us elsewhere.

    filename - The path of the file to check.
    size     - The maximum number of bytes to read from the file, or None to
               read all of it.

    Returns an (is_html, redirect) tuple. is_html is True if the data looks
    like HTML and False otherwise, in which case redirect is False too.

    The redirect value can be one of:
    False - If the page does not contain a redirect.
    True  - If the page implicitly redirects to itself.
    A url - If the page explicitly specifies the target URL.
    None  - If we do not know if the page contains a redirect, typically
            because we don't have the full file yet.
    """
    with open(filename, 'rb') as content:
        if size is None:
            data = content.read()
        else:
            data = content.read(size)

    if not _HTML_RE.search(data):
        # Not an HTML file
        return (False, False)

    redirect = None
    for match_meta in _META_RE.finditer(data):
        match, delay, url = parse_meta_refresh(match_meta.group('attrs'))
        if not match:
            continue

        if delay is None or delay > 60:
            # Either there is a refresh directive but it's invalid, in which
            # case we still take its presence as meaning we have seen enough.
            # Or the delay is too long: a user would not wait over a minute
            # (if that long) to get redirected to the installer binary.
            # Longer delays would only be used to refresh the HTML page (and
            # display new ads) so ignore them.
            redirect = False
        elif url is None:
            redirect = True
        else:
            redirect = url

        # In theory there could be more than one refresh directive and in
        # that case the one with the shortest delay wins!
        # Just ignore that special case for now.
        break

    # Refresh directives belong in the header so reaching the body without
    # finding one means there is no redirect.
    if redirect is None and _BODY_RE.search(data):
        redirect = False
    return (True, redirect)


#####
#
# Some test code
#
#####

import sys

def test_meta_refresh(fmt, delay, url):
    """Builds a refresh <meta> tag and checks how parse_meta_refresh() parses it.

    fmt   - The template of the content attribute. It takes the delay and the
            url, in that order, for those that are not None.
    delay - The expected delay, or None if fmt has no delay placeholder.
    url   - The expected url, or None if fmt has no url placeholder.

    Prints the result and exits the process with status 1 on a mismatch.
    """
    escaped_url = None if url is None else url.replace("&", "&amp;")
    if delay is None and url is None:
        content = fmt
    elif delay is None:
        content = fmt % escaped_url
        # A url with no explicit delay is expected to yield a zero delay
        delay = 0
    elif url is None:
        content = fmt % delay
    else:
        content = fmt % (delay, escaped_url)

    # Pick the quote character that does not clash with the content
    quote = '"' if "'" in content else "'"
    meta = "http-equiv='refresh' content=" + quote + content + quote

    match, meta_delay, meta_url = parse_meta_refresh(meta)
    print("content=[%s]" % content)
    print(" -> match=%s delay=[%s] url=[%s]" % (match, meta_delay, meta_url))
    print('')

    failures = (
        (not match, "*** failed to match ***"),
        (delay != meta_delay, "*** bad delay ***"),
        (url != meta_url, "*** bad url ***"),
    )
    for failed, message in failures:
        if failed:
            print(message)
            sys.exit(1)

def main():
    """Runs test_meta_refresh() on each (fmt, delay, url) test case, in order.

    The expected results are based on the behavior of Firefox and Chrome.
    """
    cases = (
        # Simple reloads
        ("%s", 2, None),
        ("%s;", 2, None),
        (" %s ; ", 2, None),

        # Invalid delay values
        ("2a", None, None),
        ("2a;", None, None),
        ("http://www.codeweavers.com", None, None),
        ("http://www.codeweavers.com;", None, None),

        # Standard delay + redirect
        ("%s;%s", 2, "http://www.codeweavers.com"),
        ("%s;url=%s", 2, "http://www.codeweavers.com"),
        ("%s;URL=%s", 2, "http://www.codeweavers.com"),
        (" %s ;  %s ", 2, "http://www.codeweavers.com"),
        (" %s ; url = %s ", 2, "http://www.codeweavers.com"),

        # A space instead of a semi-colon works too!
        (" %s %s ", 2, "http://www.codeweavers.com"),

        # But there must be a delay
        ("  http://www.codeweavers.com", None, None),
        ("  url=http://www.codeweavers.com", None, None),

        # Quoted urls are valid too!
        ("%s;url='%s'", 2, "http://www.codeweavers.com"),
        ('%s;url="%s"', 2, "http://www.codeweavers.com"),
        # even with lots of spaces all around
        ("%s ; url = ' %s ' ", 2, "http://www.codeweavers.com"),

        # More complex URLs
        ("%s; %s ", 2, "http://www.codeweavers.com/?param=1;2"),
        ("%s %s ", 2, "http://www.codeweavers.com/?param=1;2"),
        ("%s; url=%s", 5, "https://www.codeweavers.com/path/1.2/File.exe?p1=&p2=1234&p3=whitequeen"),
        # or even broken ones
        ("%s; %s ", 2, "http://www.codeweavers.com/?param=1;2 3"),
        ("%s %s ", 2, "http://www.codeweavers.com/?param=1;2 3"),

        # Firefox also accepts an implicit 0 delay. Not Chrome.
        (";%s", None, "http://www.codeweavers.com"),
        (" ; %s ", None, "http://www.codeweavers.com"),
        (";url='%s'", None, "http://www.codeweavers.com"),
    )
    for fmt, delay, url in cases:
        test_meta_refresh(fmt, delay, url)


if __name__ == '__main__':
    main()