Repository URL to install this package:
Version:
19.0.2-1 ▾
|
# (c) Copyright 2018. CodeWeavers, Inc.
import re
import sys
import cxutils
# Regular expressions for parse_meta_refresh()
# The first two are bytes patterns: they are matched against the raw
# attributes of the <meta> tag.
# Detects the http-equiv=refresh attribute, whether its value is quoted or not.
_REFRESH_RE = re.compile(b'http-equiv\\s*=\\s*["\']?refresh["\']?', re.IGNORECASE)
# Captures the value of the content attribute in the 'content' group. The
# value must be quoted and extends up to the matching closing quote.
_CONTENT_RE = re.compile(b'content\\s*=\\s*(?P<quote>["\'])(?P<content>.*?)(?P=quote)', re.IGNORECASE)
# The remaining ones are text patterns: they are matched against the content
# value once parse_meta_refresh() has converted and unescaped it.
# The delay is a possibly empty run of digits which must be followed by a
# semi-colon, by whitespace or by the end of the string.
_CONTENT_DELAY_RE = re.compile(r'^(?P<delay>[0-9]*)(?:\s*;|\s+|$)')
# Firefox still processes the redirect if the 'url=' is omitted, for instance
# if content="0;http://...".
_CONTENT_PREFIX_RE = re.compile(r'^url\s*=\s*', re.IGNORECASE)
# The target URL may or may not be quoted.
# URLs cannot contain single or double quotes but may contain spaces although
# they really should not. The captured URL never ends with a space.
_CONTENT_URL_RE = re.compile(r'^["\']?\s*(?P<url>[^"\']*[^ "\'])')
def parse_meta_refresh(meta):
    """Given a <meta ... /> tag string, returns a (match, delay, url) triplet.

    meta  - The attributes of the <meta> tag as a bytes string (a plain str
            on Python 2) since it is matched against bytes patterns.

    match - True if the string is an http-equiv=refresh <meta> tag.
    delay - None if the content field is missing or invalid, and the delay
            in seconds before the redirect happens otherwise. A missing
            delay, as in content=";http://...", counts as 0.
    url   - None if no URL was specified (reload the same page), and the
            URL to load otherwise.
    """
    if not _REFRESH_RE.search(meta):
        # This is not an http-equiv=refresh <meta> tag
        return (False, None, None)

    match = _CONTENT_RE.search(meta)
    if not match:
        return (True, None, None)

    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
    # Python 3.9, so use html.unescape() on Python 3.
    # Note that older versions of HTMLParser pretty much only
    # decode '&amp;' which luckily is mostly what we're after.
    if sys.version_info < (3,):
        import HTMLParser # pylint: disable=E0401
        unescape = HTMLParser.HTMLParser().unescape
    else:
        import html # pylint: disable=E0401
        unescape = html.unescape
    content = unescape(cxutils.string_to_unicode(match.group('content'))).lstrip()

    match = _CONTENT_DELAY_RE.search(content)
    if not match:
        # The content does not start with a valid delay
        return (True, None, None)
    if match.group('delay') == "":
        # Firefox treats an omitted delay as 0
        delay = 0
    else:
        delay = int(match.group('delay'))
    content = content[match.end():].lstrip()

    # The 'url=' prefix is optional
    match = _CONTENT_PREFIX_RE.search(content)
    if match:
        content = content[match.end():].lstrip()

    match = _CONTENT_URL_RE.search(content)
    if match:
        url = match.group('url').lstrip()
    else:
        # No target means the page reloads itself
        url = None
    return (True, delay, url)
# Regular expressions for is_html()
# These are bytes patterns: they are matched against the raw file data.
# Any of these opening tags marks the data as HTML or similar.
_HTML_RE = re.compile(br'<(?:!doctype html|html|meta|p|script|title|\?xml)[ >]', re.IGNORECASE)
# Matches the start of the <body> tag.
_BODY_RE = re.compile(br'<body[ >]', re.IGNORECASE)
# Captures everything between '<meta ' and the next '>' in the 'attrs' group.
_META_RE = re.compile(br'<meta (?P<attrs>[^>]+)>', re.IGNORECASE)
def is_html(filename, size):
    """Checks whether the specified file is an html file, or similar, and if
    it is, whether it redirects us elsewhere.

    filename - The path of the file to check.
    size     - The maximum number of bytes to read from the file, or None to
               read all of it.

    Returns an (is_html, redirect) tuple where is_html is True if the data
    looks like HTML. If it does not, redirect is False.

    Otherwise the redirect value can be one of:
      False - If the page does not contain a redirect.
      True  - If the page implicitly redirects to itself.
      A url - If the page explicitly specifies the target URL.
      None  - If we do not know if the page contains a redirect, typically
              because we don't have the full file yet.
    """
    with open(filename, 'rb') as content:
        if size is None:
            data = content.read()
        else:
            data = content.read(size)

    if not _HTML_RE.search(data):
        # Not an HTML file
        return (False, False)

    redirect = None
    for match_meta in _META_RE.finditer(data):
        match, delay, url = parse_meta_refresh(match_meta.group('attrs'))
        if not match:
            continue

        if delay is None or delay > 60:
            # Either there is a refresh directive but it's invalid, in which
            # case we still take its presence as meaning we have seen
            # enough; or the delay is too long.
            # A user would not wait over a minute (if that long) to get
            # redirected to the installer binary. Longer delays would only
            # be used to refresh the HTML page (and display new ads).
            # So ignore redirects with long delays.
            redirect = False
        elif url is None:
            redirect = True
        else:
            redirect = url
        # In theory there could be more than one refresh directive and in
        # that case the one with the shortest delay wins!
        # Just ignore that special case for now.
        break

    if redirect is None and _BODY_RE.search(data):
        # The refresh directives would have come before the body
        redirect = False
    return (True, redirect)
#####
#
# Some test code
#
#####
import sys
def test_meta_refresh(fmt, delay, url):
    """Builds a refresh <meta> tag from the parameters and checks that
    parse_meta_refresh() returns the expected values.

    fmt   - The %-format string for the value of the content attribute. It
            must have one placeholder for each of delay and url that is not
            None, in that order.
    delay - The delay to insert in fmt, which is also the delay the parser
            is expected to return. If None and a url is specified, the
            parser is expected to return 0. If both are None the parser is
            expected to reject the content and return None.
    url   - The URL to insert in fmt and that the parser is expected to
            return, or None.

    Prints the result and exits the process with status 1 on a mismatch.
    """
    # Ampersands are escaped in HTML attributes so escape them here to check
    # that the parser unescapes them.
    if url is None:
        escaped_url = None
    else:
        escaped_url = url.replace("&", "&amp;")

    if delay is None:
        if escaped_url is None:
            content = fmt
        else:
            content = fmt % escaped_url
            # A URL with no delay is an immediate redirect
            delay = 0
    elif escaped_url is None:
        content = fmt % delay
    else:
        content = fmt % (delay, escaped_url)

    # Use whichever quote does not conflict with the quotes in the content
    if "'" in content:
        quote = '"'
    else:
        quote = "'"
    meta = "http-equiv='refresh' content=" + quote + content + quote
    # parse_meta_refresh() matches bytes patterns so on Python 3 the text
    # string must be encoded first. On Python 2 str is already bytes.
    if not isinstance(meta, bytes):
        meta = meta.encode('utf-8')

    match, meta_delay, meta_url = parse_meta_refresh(meta)
    print("content=[%s]" % content)
    print(" -> match=%s delay=[%s] url=[%s]" % (match, meta_delay, meta_url))
    print('')
    if not match:
        print("*** failed to match ***")
        sys.exit(1)
    if delay != meta_delay:
        print("*** bad delay ***")
        sys.exit(1)
    if url != meta_url:
        print("*** bad url ***")
        sys.exit(1)
def main():
    """Runs every parse_meta_refresh() check in order.

    Each case is a (content format, delay, url) triplet that is handed to
    test_meta_refresh(), which exits the process on the first failure.
    """
    # Tests based on the behavior of Firefox and Chrome
    cases = (
        # Simple reloads
        ("%s", 2, None),
        ("%s;", 2, None),
        (" %s ; ", 2, None),

        # Invalid delay values
        ("2a", None, None),
        ("2a;", None, None),
        ("http://www.codeweavers.com", None, None),
        ("http://www.codeweavers.com;", None, None),

        # Standard delay + redirect
        ("%s;%s", 2, "http://www.codeweavers.com"),
        ("%s;url=%s", 2, "http://www.codeweavers.com"),
        ("%s;URL=%s", 2, "http://www.codeweavers.com"),
        (" %s ; %s ", 2, "http://www.codeweavers.com"),
        (" %s ; url = %s ", 2, "http://www.codeweavers.com"),

        # A space instead of a semi-colon works too!
        (" %s %s ", 2, "http://www.codeweavers.com"),

        # But there must be a delay
        (" http://www.codeweavers.com", None, None),
        (" url=http://www.codeweavers.com", None, None),

        # Quoted urls are valid too!
        ("%s;url='%s'", 2, "http://www.codeweavers.com"),
        ('%s;url="%s"', 2, "http://www.codeweavers.com"),
        # even with lots of spaces all around
        ("%s ; url = ' %s ' ", 2, "http://www.codeweavers.com"),

        # More complex URLs
        ("%s; %s ", 2, "http://www.codeweavers.com/?param=1;2"),
        ("%s %s ", 2, "http://www.codeweavers.com/?param=1;2"),
        ("%s; url=%s", 5, "https://www.codeweavers.com/path/1.2/File.exe?p1=&p2=1234&p3=whitequeen"),
        # or even broken ones
        ("%s; %s ", 2, "http://www.codeweavers.com/?param=1;2 3"),
        ("%s %s ", 2, "http://www.codeweavers.com/?param=1;2 3"),

        # Firefox also accepts an implicit 0 delay. Not Chrome.
        (";%s", None, "http://www.codeweavers.com"),
        (" ; %s ", None, "http://www.codeweavers.com"),
        (";url='%s'", None, "http://www.codeweavers.com"),
    )
    for content_fmt, expected_delay, expected_url in cases:
        test_meta_refresh(content_fmt, expected_delay, expected_url)
# Only run the tests when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()