from __future__ import print_function
from functools import partial
import os
import re
import threading
import numpy as np
from numpy.random import rand
import pytest
from pandas.compat import (
PY3, BytesIO, StringIO, is_platform_windows, map, reload, zip)
from pandas.errors import ParserError
import pandas.util._test_decorators as td
from pandas import (
DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv)
import pandas.util.testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf, network
from pandas.io.common import URLError, file_path_to_url
import pandas.io.html
from pandas.io.html import read_html
# Directory name of this test module's file, as given by ``__file__``.
HERE = os.path.dirname(__file__)
@pytest.fixture(params=['chinese_utf-16.html', 'chinese_utf-32.html',
                        'chinese_utf-8.html', 'letz_latin1.html'])
def html_encoding_file(request, datapath):
    """Yield, one per parametrization, the path of an HTML encoding file.

    The file name comes from ``request.param`` and is resolved through the
    ``datapath`` fixture under ``io/data/html_encoding``.
    """
    filename = request.param
    return datapath('io', 'data', 'html_encoding', filename)
def assert_framelist_equal(list1, list2, *args, **kwargs):
    """Assert that two lists of DataFrames are pairwise equal and non-empty.

    Parameters
    ----------
    list1, list2 : list
        Sequences expected to hold only DataFrame objects.
    *args, **kwargs
        Forwarded unchanged to ``tm.assert_frame_equal`` for every pair.

    Raises
    ------
    AssertionError
        If the lengths differ, if any element is not a DataFrame, if a pair
        of frames compares unequal, or if a compared frame is empty.
    """
    size1, size2 = len(list1), len(list2)
    assert size1 == size2, ('lists are not of equal size '
                            'len(list1) == {0}, '
                            'len(list2) == {1}'.format(size1, size2))

    pairs = list(zip(list1, list2))
    only_frames = all(
        isinstance(left, DataFrame) and isinstance(right, DataFrame)
        for left, right in pairs)
    assert only_frames, 'not all list elements are DataFrames'

    for left, right in pairs:
        tm.assert_frame_equal(left, right, *args, **kwargs)
        # Equality was just established, so checking one side is enough.
        assert not left.empty, 'frames are both empty'
@td.skip_if_no('bs4')
def test_bs4_version_fails(monkeypatch, datapath):
    """Check that read_html raises when bs4 reports version '4.2'.

    ``bs4.__version__`` is patched to ``'4.2'`` and the bs4 flavor is then
    expected to raise a ValueError whose message matches "minimum version".
    """
    import bs4

    spam_path = datapath("io", "data", "spam.html")
    monkeypatch.setattr(bs4, '__version__', '4.2')
    with pytest.raises(ValueError, match="minimum version"):
        read_html(spam_path, flavor='bs4')
def test_invalid_flavor():
    """Check that an unknown ``flavor`` raises a ValueError naming it."""
    bad_flavor = "invalid flavor"
    # The braces are escaped because the message is used as a regex.
    expected = r"\{" + bad_flavor + r"\} is not a valid set of flavors"

    with pytest.raises(ValueError, match=expected):
        read_html("google.com", "google", flavor=bad_flavor)
@td.skip_if_no('bs4')
@td.skip_if_no('lxml')
def test_same_ordering(datapath):
    """Check that the lxml and bs4 flavors return equal lists of frames.

    ``valid_markup.html`` is parsed once per flavor (lxml first, then bs4)
    with ``index_col=0`` and the two results are compared pairwise.
    """
    markup_path = datapath('io', 'data', 'valid_markup.html')
    from_lxml, from_bs4 = [
        read_html(markup_path, index_col=0, flavor=[name])
        for name in ('lxml', 'bs4')]
    assert_framelist_equal(from_lxml, from_bs4)
@pytest.mark.parametrize("flavor", [
    # The bs4 flavor is skipped when bs4 itself is missing.  The previous
    # condition only looked at lxml even though its reason said 'No bs4', so
    # a missing bs4 produced errors instead of skips.
    # NOTE(review): the lxml requirement is retained from the original
    # condition so no previously skipped run becomes active -- confirm
    # whether the bs4-flavored tests really need lxml.
    pytest.param('bs4', marks=[
        pytest.mark.skipif(not td.safe_import('bs4'), reason='No bs4'),
        pytest.mark.skipif(not td.safe_import('lxml'), reason='No lxml')]),
    pytest.param('lxml', marks=pytest.mark.skipif(
        not td.safe_import('lxml'), reason='No lxml'))], scope="class")
class TestReadHtml(object):
    """Tests for ``read_html``, run once per parser flavor.

    The class is parametrized over ``flavor`` ('bs4' and 'lxml').  The
    autouse fixtures below store the data file paths on the instance and
    bind ``self.read_html`` to ``read_html`` with the current flavor.
    """

    @pytest.fixture(autouse=True)
    def set_files(self, datapath):
        """Store the paths of spam.html and banklist.html on the instance."""
        self.spam_data = datapath('io', 'data', 'spam.html')
        # Keyword arguments passed to ``open`` when a test reads spam.html
        # directly; an explicit encoding is only supplied on Python 3.
        self.spam_data_kwargs = {}
        if PY3:
            self.spam_data_kwargs['encoding'] = 'UTF-8'
        self.banklist_data = datapath("io", "data", "banklist.html")

    @pytest.fixture(autouse=True, scope="function")
    def set_defaults(self, flavor, request):
        """Bind ``self.read_html`` to ``read_html`` for the current flavor."""
        self.read_html = partial(read_html, flavor=flavor)
        yield

    def test_to_html_compat(self):
        """Round-trip a random frame through ``to_html`` and ``read_html``."""
        # Values are formatted to three decimals and converted back to float
        # before the round trip, so the comparison is against those values.
        df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
                  r_idx_names=False).applymap('{0:.3f}'.format).astype(float)
        out = df.to_html()
        res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0]
        tm.assert_frame_equal(res, df)

    @network
    def test_banklist_url(self):
        """Two different match strings on the FDIC URL give equal frames."""
        url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
        df1 = self.read_html(url, 'First Federal Bank of Florida',
                             attrs={"id": 'table'})
        df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'})

        assert_framelist_equal(df1, df2)

    @network
    def test_spam_url(self):
        """Two different match strings on the USDA URL give equal frames."""
        url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&'
               'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
        df1 = self.read_html(url, '.*Water.*')
        df2 = self.read_html(url, 'Unit')

        assert_framelist_equal(df1, df2)

    @pytest.mark.slow
    def test_banklist(self):
        """Regex and literal matches on banklist.html give equal frames."""
        df1 = self.read_html(self.banklist_data, '.*Florida.*',
                             attrs={'id': 'table'})
        df2 = self.read_html(self.banklist_data, 'Metcalf Bank',
                             attrs={'id': 'table'})

        assert_framelist_equal(df1, df2)

    def test_spam(self):
        """Both matches on spam.html agree; first cell and column checked."""
        df1 = self.read_html(self.spam_data, '.*Water.*')
        df2 = self.read_html(self.spam_data, 'Unit')
        assert_framelist_equal(df1, df2)

        assert df1[0].iloc[0, 0] == 'Proximates'
        assert df1[0].columns[0] == 'Nutrient'

    def test_spam_no_match(self):
        """Without a match argument every result is a DataFrame."""
        dfs = self.read_html(self.spam_data)
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_banklist_no_match(self):
        """With only ``attrs`` given every result is a DataFrame."""
        dfs = self.read_html(self.banklist_data, attrs={'id': 'table'})
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_spam_header(self):
        """``header=2`` yields 'Proximates' as the first column label."""
        df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
        assert df.columns[0] == 'Proximates'
        assert not df.empty

    def test_skiprows_int(self):
        """``skiprows`` given as an int."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=1)

        assert_framelist_equal(df1, df2)

    def test_skiprows_xrange(self):
        """``skiprows`` given as a ``range`` object."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0]
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0]
        tm.assert_frame_equal(df1, df2)

    def test_skiprows_list(self):
        """``skiprows`` given as lists holding the same rows in either order."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2])
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1])

        assert_framelist_equal(df1, df2)

    def test_skiprows_set(self):
        """``skiprows`` given as sets holding the same rows."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows={1, 2})
        df2 = self.read_html(self.spam_data, 'Unit', skiprows={2, 1})

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice(self):
        """``skiprows`` given as a one-row slice.

        This test previously passed the int ``1`` and therefore duplicated
        ``test_skiprows_int`` without exercising a slice at all.
        """
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(1))
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(1))

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice_short(self):
        """``skiprows`` given as a slice with only a stop value."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2))
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice_long(self):
        """Forward and reversed slices over rows 2-4 give equal frames."""
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5))
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1))

        assert_framelist_equal(df1, df2)

    def test_skiprows_ndarray(self):
        """``skiprows`` given as a NumPy array."""
        df1 = self.read_html(self.spam_data, '.*Water.*',
                             skiprows=np.arange(2))
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2))

        assert_framelist_equal(df1, df2)

    def test_skiprows_invalid(self):
        """A string ``skiprows`` raises TypeError."""
        with pytest.raises(TypeError, match=('is not a valid type '
                                             'for skipping rows')):
            self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')

    def test_index(self):
        """``index_col=0`` gives equal frames for both match strings."""
        df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0)
        df2 = self.read_html(self.spam_data, 'Unit', index_col=0)
        assert_framelist_equal(df1, df2)

    def test_header_and_index_no_types(self):
        """``header=1`` with ``index_col=0`` for both match strings."""
        df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
                             index_col=0)
        df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
        assert_framelist_equal(df1, df2)

    def test_header_and_index_with_types(self):
        """Same calls as ``test_header_and_index_no_types``."""
        df1 = self.read_html(self.spam_data, '.*Water.*', header=1,
                             index_col=0)
        df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0)
        assert_framelist_equal(df1, df2)

    def test_infer_types(self):
        """Same calls as ``test_index``; see the issue note below."""
        # 10892 infer_types removed
        df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0)
        df2 = self.read_html(self.spam_data, 'Unit', index_col=0)
        assert_framelist_equal(df1, df2)

    def test_string_io(self):
        """Input supplied as ``StringIO`` objects."""
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data1 = StringIO(f.read())

        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data2 = StringIO(f.read())

        df1 = self.read_html(data1, '.*Water.*')
        df2 = self.read_html(data2, 'Unit')
        assert_framelist_equal(df1, df2)

    def test_string(self):
        """Input supplied as the file's contents in a string."""
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            data = f.read()

        df1 = self.read_html(data, '.*Water.*')
        df2 = self.read_html(data, 'Unit')

        assert_framelist_equal(df1, df2)

    def test_file_like(self):
        """Input supplied as open file objects."""
        with open(self.spam_data, **self.spam_data_kwargs) as f:
            df1 = self.read_html(f, '.*Water.*')

        with open(self.spam_data, **self.spam_data_kwargs) as f:
            df2 = self.read_html(f, 'Unit')

        assert_framelist_equal(df1, df2)

    @network
    def test_bad_url_protocol(self):
        """A ``git://`` URL raises URLError."""
        with pytest.raises(URLError):
            self.read_html('git://github.com', match='.*Water.*')

    @network
    def test_invalid_url(self):
        """An unresolvable host raises URLError or a 'No tables found' error.

        A ValueError escaping the ``pytest.raises`` block is accepted as long
        as its message contains 'No tables found'.
        """
        try:
            with pytest.raises(URLError):
                self.read_html('http://www.a23950sdfa908sd.com',
                               match='.*Water.*')
        except ValueError as e:
            assert 'No tables found' in str(e)

    @pytest.mark.slow
    def test_file_url(self):
        """banklist.html addressed through a file URL yields DataFrames."""
        url = self.banklist_data
        dfs = self.read_html(file_path_to_url(os.path.abspath(url)),
                             'First',
                             attrs={'id': 'table'})
        assert isinstance(dfs, list)
        for df in dfs:
            assert isinstance(df, DataFrame)

    @pytest.mark.slow
    def test_invalid_table_attrs(self):
        """An ``id`` that matches no table raises 'No tables found'."""
        url = self.banklist_data
        with pytest.raises(ValueError, match='No tables found'):
            self.read_html(url, 'First Federal Bank of Florida',
                           attrs={'id': 'tasdfable'})

    def _bank_data(self, *args, **kwargs):
        """Read banklist.html with match 'Metcalf' and ``id='table'``.

        Extra positional and keyword arguments are forwarded to
        ``self.read_html``.
        """
        return self.read_html(self.banklist_data, 'Metcalf',
                              attrs={'id': 'table'}, *args, **kwargs)

    @pytest.mark.slow
    def test_multiindex_header(self):
        """A two-row ``header`` produces MultiIndex columns."""
        df = self._bank_data(header=[0, 1])[0]
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_index(self):
        """A two-column ``index_col`` produces a MultiIndex index."""
        df = self._bank_data(index_col=[0, 1])[0]
        assert isinstance(df.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index(self):
        """Two-level header and index both come back as MultiIndex."""
        df = self._bank_data(header=[0, 1], index_col=[0, 1])[0]
        assert isinstance(df.columns, MultiIndex)
        assert isinstance(df.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_skiprows_tuples(self):
        """``tupleize_cols=True`` emits a FutureWarning; columns are an Index."""
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            df = self._bank_data(header=[0, 1], skiprows=1,
                                 tupleize_cols=True)[0]
            assert isinstance(df.columns, Index)

    @pytest.mark.slow
    def test_multiindex_header_skiprows(self):
        """A two-row header combined with ``skiprows=1``."""
        df = self._bank_data(header=[0, 1], skiprows=1)[0]
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index_skiprows(self):
        """Two-level header and index combined with ``skiprows=1``."""
        df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0]
        assert isinstance(df.index, MultiIndex)
        assert isinstance(df.columns, MultiIndex)

    @pytest.mark.slow
    def test_regex_idempotency(self):
        """An already compiled pattern is accepted as ``match``."""
        url = self.banklist_data
        # The pattern is deliberately compiled twice.
        dfs = self.read_html(file_path_to_url(os.path.abspath(url)),
                             match=re.compile(re.compile('Florida')),
                             attrs={'id': 'table'})
        assert isinstance(dfs, list)
        for df in dfs:
            assert isinstance(df, DataFrame)

    def test_negative_skiprows(self):
        """A negative ``skiprows`` raises ValueError."""
        msg = r'\(you passed a negative value\)'
        with pytest.raises(ValueError, match=msg):
            self.read_html(self.spam_data, 'Water', skiprows=-1)
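# NOTE(review): "Loading ..." -- this line is not Python source and looks like page-extraction residue; confirm and remove.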