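# NOTE: the lines below are package-index page chrome captured with the file,
# kept as comments so the module remains valid Python.
#
# Learn more » Push, build, and install: RubyGems, npm packages, Python
# packages, Maven artifacts, PHP packages, Go Modules, Bower components,
# Debian packages, RPM packages, NuGet packages
#
# Repository: agriconnect / pandas (python)
# Repository URL to install this package:
# Version: 0.24.2
# File: / tests / io / test_html.py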

from __future__ import print_function

from functools import partial
import os
import re
import threading

import numpy as np
from numpy.random import rand
import pytest

from pandas.compat import (
    PY3, BytesIO, StringIO, is_platform_windows, map, reload, zip)
from pandas.errors import ParserError
import pandas.util._test_decorators as td

from pandas import (
    DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv)
import pandas.util.testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf, network

from pandas.io.common import URLError, file_path_to_url
import pandas.io.html
from pandas.io.html import read_html

HERE = os.path.dirname(__file__)


@pytest.fixture(params=[
    'chinese_utf-16.html',
    'chinese_utf-32.html',
    'chinese_utf-8.html',
    'letz_latin1.html',
])
def html_encoding_file(request, datapath):
    """Return the data path for the current encoded HTML sample.

    The fixture is parametrized over the file names listed in ``params``;
    the current name is handed to ``datapath`` after the components
    ``'io'``, ``'data'`` and ``'html_encoding'``.
    """
    filename = request.param
    return datapath('io', 'data', 'html_encoding', filename)


def assert_framelist_equal(list1, list2, *args, **kwargs):
    """Assert that two lists hold pairwise-equal, non-empty DataFrames.

    Parameters
    ----------
    list1, list2 : list
        Sequences to compare; they must have the same length and contain
        only DataFrame objects.
    *args, **kwargs
        Forwarded to ``tm.assert_frame_equal`` for every pair of frames.

    Raises
    ------
    AssertionError
        If the lengths differ, an element is not a DataFrame, a pair of
        frames is not equal, or an (equal) pair of frames is empty.
    """
    n_left, n_right = len(list1), len(list2)
    size_msg = ('lists are not of equal size '
                'len(list1) == {0}, '
                'len(list2) == {1}'.format(n_left, n_right))
    assert n_left == n_right, size_msg

    only_frames = all(isinstance(left, DataFrame) and
                      isinstance(right, DataFrame)
                      for left, right in zip(list1, list2))
    assert only_frames, 'not all list elements are DataFrames'

    for left, right in zip(list1, list2):
        tm.assert_frame_equal(left, right, *args, **kwargs)
        # The frames compared equal above, so checking one side is enough
        # to reject the case where both are empty.
        assert not left.empty, 'frames are both empty'


@td.skip_if_no('bs4')
def test_bs4_version_fails(monkeypatch, datapath):
    """read_html raises ValueError when bs4 reports version '4.2'."""
    import bs4
    # Patch the reported version for the duration of this test only.
    monkeypatch.setattr(bs4, '__version__', '4.2')
    expected_error = pytest.raises(ValueError, match="minimum version")
    with expected_error:
        read_html(datapath("io", "data", "spam.html"), flavor='bs4')


def test_invalid_flavor():
    """An unknown ``flavor`` makes read_html raise ValueError."""
    flavor = "invalid flavor"
    url = "google.com"
    # The expected message shows the flavor wrapped in literal braces, so
    # the braces are escaped in the regex.
    msg = "".join([r"\{", flavor, r"\} is not a valid set of flavors"])

    with pytest.raises(ValueError, match=msg):
        read_html(url, "google", flavor=flavor)


@td.skip_if_no('bs4')
@td.skip_if_no('lxml')
def test_same_ordering(datapath):
    """The 'lxml' and 'bs4' flavors return equal lists of frames."""
    filename = datapath('io', 'data', 'valid_markup.html')
    parse = partial(read_html, filename, index_col=0)
    from_lxml = parse(flavor=['lxml'])
    from_bs4 = parse(flavor=['bs4'])
    assert_framelist_equal(from_lxml, from_bs4)


@pytest.mark.parametrize("flavor", [
    # Each flavor is skipped when its own parser package cannot be imported.
    # NOTE(review): the 'bs4' flavor may additionally need html5lib - confirm.
    pytest.param('bs4', marks=pytest.mark.skipif(
        not td.safe_import('bs4'), reason='No bs4')),
    pytest.param('lxml', marks=pytest.mark.skipif(
        not td.safe_import('lxml'), reason='No lxml'))], scope="class")
class TestReadHtml(object):
    """Tests for ``read_html``, parametrized over the 'bs4' and 'lxml' flavors.

    ``self.read_html`` is bound to the current flavor by the autouse
    ``set_defaults`` fixture, so the tests never pass ``flavor`` themselves.
    """

    @pytest.fixture(autouse=True)
    def set_files(self, datapath):
        """Store the sample file paths and ``open`` kwargs on the instance."""
        self.spam_data = datapath('io', 'data', 'spam.html')
        self.spam_data_kwargs = {}
        if PY3:
            # Only on Python 3 is an explicit encoding passed to ``open``.
            self.spam_data_kwargs['encoding'] = 'UTF-8'
        self.banklist_data = datapath("io", "data", "banklist.html")

    @pytest.fixture(autouse=True, scope="function")
    def set_defaults(self, flavor, request):
        """Bind ``self.read_html`` to ``read_html`` with the current flavor."""
        self.read_html = partial(read_html, flavor=flavor)
        yield

    def test_to_html_compat(self):
        """A frame written with ``to_html`` is read back unchanged."""
        raw = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
                   r_idx_names=False)
        # Values are formatted to 3 decimals and converted back to float
        # before the frame is rendered.
        expected = raw.applymap('{0:.3f}'.format).astype(float)
        html = expected.to_html()
        frames = self.read_html(html, attrs={'class': 'dataframe'},
                                index_col=0)
        result = frames[0]
        tm.assert_frame_equal(result, expected)

    @network
    def test_banklist_url(self):
        """Both match strings give equal frames for the FDIC URL."""
        url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
        by_florida = self.read_html(url, 'First Federal Bank of Florida',
                                    attrs={"id": 'table'})
        by_metcalf = self.read_html(url, 'Metcalf Bank',
                                    attrs={'id': 'table'})

        assert_framelist_equal(by_florida, by_metcalf)

    @network
    def test_spam_url(self):
        """Both match patterns give equal frames for the USDA URL."""
        base = 'http://ndb.nal.usda.gov/ndb/foods/show/300772'
        query = ('?fg=&man=&lfacet=&format=&count=&max=25&offset=&sort=&'
                 'qlookup=spam')
        url = base + query
        by_water = self.read_html(url, '.*Water.*')
        by_unit = self.read_html(url, 'Unit')

        assert_framelist_equal(by_water, by_unit)

    @pytest.mark.slow
    def test_banklist(self):
        """Both match patterns give equal frames for the local bank list."""
        patterns = ('.*Florida.*', 'Metcalf Bank')
        by_florida, by_metcalf = [
            self.read_html(self.banklist_data, pattern, attrs={'id': 'table'})
            for pattern in patterns]

        assert_framelist_equal(by_florida, by_metcalf)

    def test_spam(self):
        """Both patterns give equal frames; spot-check the first frame."""
        by_water = self.read_html(self.spam_data, '.*Water.*')
        by_unit = self.read_html(self.spam_data, 'Unit')
        assert_framelist_equal(by_water, by_unit)

        first = by_water[0]
        assert first.iloc[0, 0] == 'Proximates'
        assert first.columns[0] == 'Nutrient'

    def test_spam_no_match(self):
        """Without a match pattern every returned item is a DataFrame."""
        frames = self.read_html(self.spam_data)
        assert all(isinstance(frame, DataFrame) for frame in frames)

    def test_banklist_no_match(self):
        """Selecting by ``attrs`` alone returns only DataFrame objects."""
        table_attrs = {'id': 'table'}
        frames = self.read_html(self.banklist_data, attrs=table_attrs)
        assert all(isinstance(frame, DataFrame) for frame in frames)

    def test_spam_header(self):
        """With ``header=2`` the first column label is 'Proximates'."""
        frames = self.read_html(self.spam_data, '.*Water.*', header=2)
        first = frames[0]
        assert first.columns[0] == 'Proximates'
        assert not first.empty

    def test_skiprows_int(self):
        """An integer ``skiprows`` gives equal frames for both patterns."""
        by_water = self.read_html(self.spam_data, '.*Water.*', skiprows=1)
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows=1)

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_xrange(self):
        """A ``range`` for ``skiprows`` gives the same first frame twice."""
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=range(2))
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows=range(2))
        first_water, first_unit = by_water[0], by_unit[0]
        tm.assert_frame_equal(first_water, first_unit)

    def test_skiprows_list(self):
        """List ``skiprows`` in either order gives equal frames."""
        ascending, descending = [1, 2], [2, 1]
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=ascending)
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows=descending)

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_set(self):
        """A set ``skiprows`` gives equal frames for both patterns."""
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows={1, 2})
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows={2, 1})

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_slice(self):
        """A slice ``skiprows`` gives equal frames for both patterns.

        ``slice(1)`` is passed so that a slice is actually exercised; with
        the plain integer ``1`` this test was an exact duplicate of
        ``test_skiprows_int``.
        """
        df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(1))
        df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(1))

        assert_framelist_equal(df1, df2)

    def test_skiprows_slice_short(self):
        """``skiprows=slice(2)`` gives equal frames for both patterns."""
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=slice(2))
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows=slice(2))

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_slice_long(self):
        """A forward and a reversed slice give equal frames."""
        forward = slice(2, 5)
        backward = slice(4, 1, -1)
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=forward)
        by_unit = self.read_html(self.spam_data, 'Unit', skiprows=backward)

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_ndarray(self):
        """A NumPy array ``skiprows`` gives equal frames for both patterns."""
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  skiprows=np.arange(2))
        by_unit = self.read_html(self.spam_data, 'Unit',
                                 skiprows=np.arange(2))

        assert_framelist_equal(by_water, by_unit)

    def test_skiprows_invalid(self):
        """A string ``skiprows`` raises TypeError."""
        expected = 'is not a valid type for skipping rows'
        with pytest.raises(TypeError, match=expected):
            self.read_html(self.spam_data, '.*Water.*', skiprows='asdf')

    def test_index(self):
        """``index_col=0`` gives equal frames for both patterns."""
        by_water = self.read_html(self.spam_data, '.*Water.*', index_col=0)
        by_unit = self.read_html(self.spam_data, 'Unit', index_col=0)
        assert_framelist_equal(by_water, by_unit)

    def test_header_and_index_no_types(self):
        """``header=1`` with ``index_col=0`` gives equal frames."""
        options = dict(header=1, index_col=0)
        by_water = self.read_html(self.spam_data, '.*Water.*', **options)
        by_unit = self.read_html(self.spam_data, 'Unit', **options)
        assert_framelist_equal(by_water, by_unit)

    def test_header_and_index_with_types(self):
        """``header=1`` with ``index_col=0`` gives equal frames.

        NOTE(review): the calls are identical to those in
        ``test_header_and_index_no_types``; nothing type-specific is passed.
        """
        by_water = self.read_html(self.spam_data, '.*Water.*',
                                  header=1, index_col=0)
        by_unit = self.read_html(self.spam_data, 'Unit',
                                 header=1, index_col=0)
        assert_framelist_equal(by_water, by_unit)

    def test_infer_types(self):
        """``index_col=0`` gives equal frames for both patterns."""
        # 10892 infer_types removed
        by_water, by_unit = [
            self.read_html(self.spam_data, pattern, index_col=0)
            for pattern in ('.*Water.*', 'Unit')]
        assert_framelist_equal(by_water, by_unit)

    def test_string_io(self):
        """read_html accepts StringIO buffers holding the spam markup."""
        def load_buffer():
            # Each parse gets its own buffer built from a fresh file read.
            with open(self.spam_data, **self.spam_data_kwargs) as handle:
                return StringIO(handle.read())

        water_buffer = load_buffer()
        unit_buffer = load_buffer()

        by_water = self.read_html(water_buffer, '.*Water.*')
        by_unit = self.read_html(unit_buffer, 'Unit')
        assert_framelist_equal(by_water, by_unit)

    def test_string(self):
        """read_html accepts the spam markup as a plain string."""
        with open(self.spam_data, **self.spam_data_kwargs) as handle:
            markup = handle.read()

        by_water = self.read_html(markup, '.*Water.*')
        by_unit = self.read_html(markup, 'Unit')

        assert_framelist_equal(by_water, by_unit)

    def test_file_like(self):
        """read_html accepts an open file object as input."""
        results = []
        for pattern in ('.*Water.*', 'Unit'):
            # The file is reopened for every pattern, one parse per handle.
            with open(self.spam_data, **self.spam_data_kwargs) as handle:
                results.append(self.read_html(handle, pattern))

        by_water, by_unit = results
        assert_framelist_equal(by_water, by_unit)

    @network
    def test_bad_url_protocol(self):
        """A ``git://`` URL makes read_html raise URLError."""
        bad_url = 'git://github.com'
        with pytest.raises(URLError):
            self.read_html(bad_url, match='.*Water.*')

    @network
    def test_invalid_url(self):
        """An invalid host raises URLError or a 'No tables found' ValueError."""
        bad_url = 'http://www.a23950sdfa908sd.com'
        try:
            with pytest.raises(URLError):
                self.read_html(bad_url, match='.*Water.*')
        except ValueError as err:
            # A ValueError is tolerated only with this specific message.
            assert 'No tables found' in str(err)

    @pytest.mark.slow
    def test_file_url(self):
        """read_html accepts the bank list given as a file URL."""
        absolute_path = os.path.abspath(self.banklist_data)
        file_url = file_path_to_url(absolute_path)
        frames = self.read_html(file_url, 'First', attrs={'id': 'table'})
        assert isinstance(frames, list)
        assert all(isinstance(frame, DataFrame) for frame in frames)

    @pytest.mark.slow
    def test_invalid_table_attrs(self):
        """An unmatched table ``id`` raises 'No tables found'."""
        bogus_attrs = {'id': 'tasdfable'}
        with pytest.raises(ValueError, match='No tables found'):
            self.read_html(self.banklist_data,
                           'First Federal Bank of Florida',
                           attrs=bogus_attrs)

    def _bank_data(self, *args, **kwargs):
        """Parse the bank list table matching 'Metcalf'.

        Extra positional and keyword arguments are forwarded to
        ``self.read_html``; its return value is returned unchanged.
        """
        table_attrs = {'id': 'table'}
        frames = self.read_html(self.banklist_data, 'Metcalf',
                                attrs=table_attrs, *args, **kwargs)
        return frames

    @pytest.mark.slow
    def test_multiindex_header(self):
        """``header=[0, 1]`` gives MultiIndex columns."""
        frames = self._bank_data(header=[0, 1])
        first = frames[0]
        assert isinstance(first.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_index(self):
        """``index_col=[0, 1]`` gives a MultiIndex row index."""
        frames = self._bank_data(index_col=[0, 1])
        first = frames[0]
        assert isinstance(first.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index(self):
        """List ``header`` and ``index_col`` give MultiIndex on both axes."""
        frames = self._bank_data(header=[0, 1], index_col=[0, 1])
        first = frames[0]
        assert isinstance(first.columns, MultiIndex)
        assert isinstance(first.index, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_skiprows_tuples(self):
        """``tupleize_cols=True`` warns and still returns Index columns."""
        expect_warning = tm.assert_produces_warning(FutureWarning,
                                                    check_stacklevel=False)
        with expect_warning:
            frames = self._bank_data(header=[0, 1], skiprows=1,
                                     tupleize_cols=True)
            first = frames[0]
            assert isinstance(first.columns, Index)

    @pytest.mark.slow
    def test_multiindex_header_skiprows(self):
        """``header=[0, 1]`` with ``skiprows=1`` gives MultiIndex columns."""
        frames = self._bank_data(header=[0, 1], skiprows=1)
        first = frames[0]
        assert isinstance(first.columns, MultiIndex)

    @pytest.mark.slow
    def test_multiindex_header_index_skiprows(self):
        """List ``header``/``index_col`` plus ``skiprows=1``: both MultiIndex."""
        frames = self._bank_data(header=[0, 1], index_col=[0, 1],
                                 skiprows=1)
        first = frames[0]
        assert isinstance(first.index, MultiIndex)
        assert isinstance(first.columns, MultiIndex)

    @pytest.mark.slow
    def test_regex_idempotency(self):
        """A pattern compiled twice is accepted as ``match``."""
        file_url = file_path_to_url(os.path.abspath(self.banklist_data))
        compiled_once = re.compile('Florida')
        # Compile the already-compiled pattern again on purpose.
        compiled_twice = re.compile(compiled_once)
        frames = self.read_html(file_url, match=compiled_twice,
                                attrs={'id': 'table'})
        assert isinstance(frames, list)
        assert all(isinstance(frame, DataFrame) for frame in frames)

    def test_negative_skiprows(self):
        """A negative ``skiprows`` raises ValueError."""
        expected = r'\(you passed a negative value\)'
        expect_error = pytest.raises(ValueError, match=expected)
        with expect_error:
            self.read_html(self.spam_data, 'Water', skiprows=-1)
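# Loading ...  (package-viewer placeholder, not part of the module; any
# remaining file content is not shown here)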