# Package index listing (no repository URL was captured with this page).
# Package: chardet  |  Version: 3.0.4  |  File: chardet/test.py
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""
from __future__ import with_statement
import textwrap
from difflib import ndiff
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
try:
import hypothesis.strategies as st
from hypothesis import given, assume, settings, Verbosity
HAVE_HYPOTHESIS = True
except ImportError:
HAVE_HYPOTHESIS = False
import pytest
import chardet
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain model.
# Encodings whose test directories gen_test_params() skips for now.
MISSING_ENCODINGS = set((
    'iso-8859-2',
    'iso-8859-6',
    'windows-1250',
    'windows-1254',
    'windows-1256',
))

# Test files that gen_test_params() marks as expected failures (xfail).
EXPECTED_FAILURES = set((
    'tests/iso-8859-7-greek/disabled.gr.xml',
    'tests/iso-8859-9-turkish/divxplanet.com.xml',
    'tests/iso-8859-9-turkish/subtitle.srt',
    'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt',
))
def gen_test_params():
    """Yield ``(path, encoding)`` test cases for ``test_encoding_detection``.

    Every sub-directory of the ``tests`` directory next to this file is named
    after an encoding, optionally followed by a language suffix (for example
    ``iso-8859-7-greek``).  Each ``.html``, ``.txt``, ``.xml`` or ``.srt``
    file inside such a directory becomes one test case.  Directories whose
    encoding is in ``MISSING_ENCODINGS`` are skipped, and files listed in
    ``EXPECTED_FAILURES`` are marked as expected failures.
    """
    base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
    language_suffixes = ('-arabic', '-bulgarian', '-cyrillic', '-greek',
                         '-hebrew', '-hungarian', '-turkish')
    wanted_extensions = ('.html', '.txt', '.xml', '.srt')

    # Sorted so the order of generated cases does not depend on the order in
    # which the file system happens to list entries.
    for dir_name in sorted(listdir(base_path)):
        path = join(base_path, dir_name)
        # Skip files in tests directory
        if not isdir(path):
            continue
        # Remove language suffixes from encoding if present
        encoding = dir_name.lower()
        for postfix in language_suffixes:
            if encoding.endswith(postfix):
                encoding = encoding[:-len(postfix)]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have of encoding for
        for file_name in sorted(listdir(path)):
            ext = splitext(file_name)[1].lower()
            if ext not in wanted_extensions:
                continue
            full_path = join(path, file_name)
            test_case = full_path, encoding
            # EXPECTED_FAILURES entries are spelled 'tests/<dir>/<file>' with
            # forward slashes.  Build the lookup key the same way instead of
            # using full_path, which varies with the working directory and
            # the OS path separator.
            failure_key = '/'.join(('tests', dir_name, file_name))
            if failure_key in EXPECTED_FAILURES:
                if hasattr(pytest, 'param'):
                    # Supported way to attach a mark to one parameter set.
                    test_case = pytest.param(*test_case,
                                             marks=pytest.mark.xfail)
                else:
                    # Older pytest without pytest.param: legacy form.
                    test_case = pytest.mark.xfail(test_case)
            yield test_case
@pytest.mark.parametrize('file_name, encoding', gen_test_params())
def test_encoding_detection(file_name, encoding):
    """Check that chardet's guess for ``file_name`` decodes like ``encoding``.

    The detected encoding name does not have to equal ``encoding``: the test
    only fails when the names differ *and* decoding the file with each of
    them produces different text.
    """
    with open(file_name, 'rb') as handle:
        raw = handle.read()
    result = chardet.detect(raw)

    # Text as it should look; empty when Python has no codec for ``encoding``.
    try:
        expected_text = raw.decode(encoding)
    except LookupError:
        expected_text = ''

    # Text as decoded with chardet's guess; empty when the guess is missing,
    # unknown to Python, or unable to decode the bytes.
    try:
        detected_text = raw.decode(result['encoding'])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_text = ''

    same_name = (bool(result) and
                 (result['encoding'] or '').lower() == encoding)

    # Only care about mismatches that would actually result in different
    # behavior when decoding
    acceptable = same_name or expected_text == detected_text

    diff = ''
    if not acceptable:
        def as_wrapped_lines(text):
            # Wrap at 100 columns so the character diff stays readable.
            wrapped = '\n'.join(textwrap.wrap(text, 100)) + '\n'
            return wrapped.splitlines(True)

        diff = ''.join(ndiff(as_wrapped_lines(expected_text),
                             as_wrapped_lines(detected_text)))

    assert acceptable, ("Expected %s, but got %s for %s. Character "
                        "differences: \n%s" % (encoding,
                                               result,
                                               file_name,
                                               diff))
if HAVE_HYPOTHESIS:
    class JustALengthIssue(Exception):
        """Raised when appending text to an undetected input makes detection work."""

    @pytest.mark.xfail
    @given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                                 'utf-32', 'iso-8859-7',
                                                 'iso-8859-8', 'windows-1255']),
           st.randoms())
    @settings(max_examples=200)
    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
        """Property test: validly encoded text should get some detected encoding.

        ``txt`` is encoded with ``enc`` (examples that cannot be encoded are
        discarded).  When chardet returns no encoding for the result, a nested
        property appends further text and must raise ``JustALengthIssue`` to
        show that the failure goes away with more input.
        """
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            # Discard examples that ``enc`` cannot represent.
            assume(False)
        detected = chardet.detect(data)['encoding']
        if detected is None:
            with pytest.raises(JustALengthIssue):
                # NOTE(review): ``max_shrinks`` may not be accepted by newer
                # Hypothesis releases -- confirm against the version in use.
                @given(st.text(), random=rnd)
                @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
                def string_poisons_following_text(suffix):
                    try:
                        extended = (txt + suffix).encode(enc)
                    except UnicodeEncodeError:
                        assume(False)
                    result = chardet.detect(extended)
                    if result and result['encoding'] is not None:
                        raise JustALengthIssue()

                # Run the nested property.  Without this call nothing inside
                # the ``pytest.raises`` block can raise, so the block could
                # never succeed.
                string_poisons_following_text()