from collections import OrderedDict
import contextlib
from datetime import date, datetime, time, timedelta
from distutils.version import LooseVersion
from functools import partial
import os
import warnings
from warnings import catch_warnings
import numpy as np
from numpy import nan
import pytest
from pandas.compat import PY36, BytesIO, iteritems, map, range, u
import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.config import get_option, set_option
import pandas.util.testing as tm
from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf
from pandas.io.common import URLError
from pandas.io.excel import (
ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter,
read_excel, register_writer)
from pandas.io.formats.excel import ExcelFormatter
from pandas.io.parsers import read_csv
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = DataFrame(_seriesd)[:10]
_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10]
_tsframe = tm.makeTimeDataFrame()[:5]
_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'
@contextlib.contextmanager
def ignore_xlrd_time_clock_warning():
"""
Context manager to ignore warnings raised by the xlrd library,
regarding the deprecation of `time.clock` in Python 3.7.
"""
with warnings.catch_warnings():
warnings.filterwarnings(
action='ignore',
message='time.clock has been deprecated',
category=DeprecationWarning)
yield
@td.skip_if_no('xlrd', '1.0.0')
class SharedItems(object):
@pytest.fixture(autouse=True)
def setup_method(self, datapath):
self.dirpath = datapath("io", "data")
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.tsframe = _tsframe.copy()
self.mixed_frame = _mixed_frame.copy()
def get_csv_refdf(self, basename):
"""
Obtain the reference data from read_csv with the Python engine.
Parameters
----------
basename : str
File base name, excluding file extension.
Returns
-------
dfref : DataFrame
"""
pref = os.path.join(self.dirpath, basename + '.csv')
dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python')
return dfref
def get_excelfile(self, basename, ext):
"""
Return test data ExcelFile instance.
Parameters
----------
basename : str
File base name, excluding file extension.
Returns
-------
excel : io.excel.ExcelFile
"""
return ExcelFile(os.path.join(self.dirpath, basename + ext))
def get_exceldf(self, basename, ext, *args, **kwds):
"""
Return test data DataFrame.
Parameters
----------
basename : str
File base name, excluding file extension.
Returns
-------
df : DataFrame
"""
pth = os.path.join(self.dirpath, basename + ext)
return read_excel(pth, *args, **kwds)
class ReadingTestsBase(SharedItems):
# This is based on ExcelWriterBase
@pytest.fixture(autouse=True, params=['xlrd', None])
def set_engine(self, request):
func_name = "get_exceldf"
old_func = getattr(self, func_name)
new_func = partial(old_func, engine=request.param)
setattr(self, func_name, new_func)
yield
setattr(self, func_name, old_func)
@td.skip_if_no("xlrd", "1.0.1") # see gh-22682
def test_usecols_int(self, ext):
df_ref = self.get_csv_refdf("test1")
df_ref = df_ref.reindex(columns=["A", "B", "C"])
# usecols as int
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
with ignore_xlrd_time_clock_warning():
df1 = self.get_exceldf("test1", ext, "Sheet1",
index_col=0, usecols=3)
# usecols as int
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
with ignore_xlrd_time_clock_warning():
df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
index_col=0, usecols=3)
# parse_cols instead of usecols, usecols as int
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
with ignore_xlrd_time_clock_warning():
df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
index_col=0, parse_cols=3)
# TODO add index to xls file)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
tm.assert_frame_equal(df3, df_ref, check_names=False)
@td.skip_if_no('xlrd', '1.0.1') # GH-22682
def test_usecols_list(self, ext):
dfref = self.get_csv_refdf('test1')
dfref = dfref.reindex(columns=['B', 'C'])
df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
usecols=[0, 2, 3])
df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, usecols=[0, 2, 3])
with tm.assert_produces_warning(FutureWarning):
with ignore_xlrd_time_clock_warning():
df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, parse_cols=[0, 2, 3])
# TODO add index to xls file)
tm.assert_frame_equal(df1, dfref, check_names=False)
tm.assert_frame_equal(df2, dfref, check_names=False)
tm.assert_frame_equal(df3, dfref, check_names=False)
@td.skip_if_no('xlrd', '1.0.1') # GH-22682
def test_usecols_str(self, ext):
dfref = self.get_csv_refdf('test1')
df1 = dfref.reindex(columns=['A', 'B', 'C'])
df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
usecols='A:D')
df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, usecols='A:D')
with tm.assert_produces_warning(FutureWarning):
with ignore_xlrd_time_clock_warning():
df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, parse_cols='A:D')
# TODO add index to xls, read xls ignores index name ?
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
tm.assert_frame_equal(df4, df1, check_names=False)
df1 = dfref.reindex(columns=['B', 'C'])
df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
usecols='A,C,D')
df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, usecols='A,C,D')
# TODO add index to xls file
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
df1 = dfref.reindex(columns=['B', 'C'])
df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
usecols='A,C:D')
df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
index_col=0, usecols='A,C:D')
tm.assert_frame_equal(df2, df1, check_names=False)
tm.assert_frame_equal(df3, df1, check_names=False)
@pytest.mark.parametrize("usecols", [
[0, 1, 3], [0, 3, 1],
[1, 0, 3], [1, 3, 0],
[3, 0, 1], [3, 1, 0],
])
def test_usecols_diff_positional_int_columns_order(self, ext, usecols):
expected = self.get_csv_refdf("test1")[["A", "C"]]
result = self.get_exceldf("test1", ext, "Sheet1",
index_col=0, usecols=usecols)
tm.assert_frame_equal(result, expected, check_names=False)
@pytest.mark.parametrize("usecols", [
["B", "D"], ["D", "B"]
])
def test_usecols_diff_positional_str_columns_order(self, ext, usecols):
expected = self.get_csv_refdf("test1")[["B", "D"]]
expected.index = range(len(expected))
result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols)
tm.assert_frame_equal(result, expected, check_names=False)
def test_read_excel_without_slicing(self, ext):
expected = self.get_csv_refdf("test1")
result = self.get_exceldf("test1", ext, "Sheet1", index_col=0)
tm.assert_frame_equal(result, expected, check_names=False)
def test_usecols_excel_range_str(self, ext):
expected = self.get_csv_refdf("test1")[["C", "D"]]
result = self.get_exceldf("test1", ext, "Sheet1",
index_col=0, usecols="A,D:E")
tm.assert_frame_equal(result, expected, check_names=False)
def test_usecols_excel_range_str_invalid(self, ext):
msg = "Invalid column name: E1"
with pytest.raises(ValueError, match=msg):
self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1")
def test_index_col_label_error(self, ext):
msg = "list indices must be integers.*, not str"
with pytest.raises(TypeError, match=msg):
self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
usecols=["A", "C"])
def test_index_col_empty(self, ext):
# see gh-9208
result = self.get_exceldf("test1", ext, "Sheet3",
index_col=["A", "B", "C"])
expected = DataFrame(columns=["D", "E", "F"],
index=MultiIndex(levels=[[]] * 3,
codes=[[]] * 3,
names=["A", "B", "C"]))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [None, 2])
def test_index_col_with_unnamed(self, ext, index_col):
# see gh-18792
result = self.get_exceldf("test1", ext, "Sheet4",
index_col=index_col)
expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
columns=["Unnamed: 0", "col1", "col2"])
if index_col:
expected = expected.set_index(expected.columns[index_col])
tm.assert_frame_equal(result, expected)
def test_usecols_pass_non_existent_column(self, ext):
msg = ("Usecols do not match columns, "
"columns expected but not found: " + r"\['E'\]")
with pytest.raises(ValueError, match=msg):
self.get_exceldf("test1", ext, usecols=["E"])
def test_usecols_wrong_type(self, ext):
msg = ("'usecols' must either be list-like of "
"all strings, all unicode, all integers or a callable.")
with pytest.raises(ValueError, match=msg):
self.get_exceldf("test1", ext, usecols=["E1", 0])
def test_excel_stop_iterator(self, ext):
parsed = self.get_exceldf('test2', ext, 'Sheet1')
expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
tm.assert_frame_equal(parsed, expected)
def test_excel_cell_error_na(self, ext):
parsed = self.get_exceldf('test3', ext, 'Sheet1')
expected = DataFrame([[np.nan]], columns=['Test'])
tm.assert_frame_equal(parsed, expected)
def test_excel_passes_na(self, ext):
excel = self.get_excelfile('test4', ext)
parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
na_values=['apple'])
expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
na_values=['apple'])
expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
# 13967
excel = self.get_excelfile('test5', ext)
parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
na_values=['apple'])
expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
na_values=['apple'])
expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
@td.skip_if_no('xlrd', '1.0.1') # GH-22682
Loading ...