Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / io / test_excel.py

from collections import OrderedDict
import contextlib
from datetime import date, datetime, time, timedelta
from distutils.version import LooseVersion
from functools import partial
import os
import warnings
from warnings import catch_warnings

import numpy as np
from numpy import nan
import pytest

from pandas.compat import PY36, BytesIO, iteritems, map, range, u
import pandas.util._test_decorators as td

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.config import get_option, set_option
import pandas.util.testing as tm
from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf

from pandas.io.common import URLError
from pandas.io.excel import (
    ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter,
    read_excel, register_writer)
from pandas.io.formats.excel import ExcelFormatter
from pandas.io.parsers import read_csv

_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = DataFrame(_seriesd)[:10]
_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10]
_tsframe = tm.makeTimeDataFrame()[:5]
_mixed_frame = _frame.copy()
_mixed_frame['foo'] = 'bar'


@contextlib.contextmanager
def ignore_xlrd_time_clock_warning():
    """
    Context manager to ignore warnings raised by the xlrd library,
    regarding the deprecation of `time.clock` in Python 3.7.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(
            action='ignore',
            message='time.clock has been deprecated',
            category=DeprecationWarning)
        yield


@td.skip_if_no('xlrd', '1.0.0')
class SharedItems(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "data")
        self.frame = _frame.copy()
        self.frame2 = _frame2.copy()
        self.tsframe = _tsframe.copy()
        self.mixed_frame = _mixed_frame.copy()

    def get_csv_refdf(self, basename):
        """
        Obtain the reference data from read_csv with the Python engine.

        Parameters
        ----------

        basename : str
            File base name, excluding file extension.

        Returns
        -------

        dfref : DataFrame
        """
        pref = os.path.join(self.dirpath, basename + '.csv')
        dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python')
        return dfref

    def get_excelfile(self, basename, ext):
        """
        Return test data ExcelFile instance.

        Parameters
        ----------

        basename : str
            File base name, excluding file extension.

        Returns
        -------

        excel : io.excel.ExcelFile
        """
        return ExcelFile(os.path.join(self.dirpath, basename + ext))

    def get_exceldf(self, basename, ext, *args, **kwds):
        """
        Return test data DataFrame.

        Parameters
        ----------

        basename : str
            File base name, excluding file extension.

        Returns
        -------

        df : DataFrame
        """
        pth = os.path.join(self.dirpath, basename + ext)
        return read_excel(pth, *args, **kwds)


class ReadingTestsBase(SharedItems):
    # This is based on ExcelWriterBase

    @pytest.fixture(autouse=True, params=['xlrd', None])
    def set_engine(self, request):
        func_name = "get_exceldf"
        old_func = getattr(self, func_name)
        new_func = partial(old_func, engine=request.param)
        setattr(self, func_name, new_func)
        yield
        setattr(self, func_name, old_func)

    @td.skip_if_no("xlrd", "1.0.1")  # see gh-22682
    def test_usecols_int(self, ext):

        df_ref = self.get_csv_refdf("test1")
        df_ref = df_ref.reindex(columns=["A", "B", "C"])

        # usecols as int
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            with ignore_xlrd_time_clock_warning():
                df1 = self.get_exceldf("test1", ext, "Sheet1",
                                       index_col=0, usecols=3)

        # usecols as int
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            with ignore_xlrd_time_clock_warning():
                df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
                                       index_col=0, usecols=3)

        # parse_cols instead of usecols, usecols as int
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            with ignore_xlrd_time_clock_warning():
                df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
                                       index_col=0, parse_cols=3)

        # TODO add index to xls file)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)
        tm.assert_frame_equal(df3, df_ref, check_names=False)

    @td.skip_if_no('xlrd', '1.0.1')  # GH-22682
    def test_usecols_list(self, ext):

        dfref = self.get_csv_refdf('test1')
        dfref = dfref.reindex(columns=['B', 'C'])
        df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
                               usecols=[0, 2, 3])
        df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                               index_col=0, usecols=[0, 2, 3])

        with tm.assert_produces_warning(FutureWarning):
            with ignore_xlrd_time_clock_warning():
                df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                                       index_col=0, parse_cols=[0, 2, 3])

        # TODO add index to xls file)
        tm.assert_frame_equal(df1, dfref, check_names=False)
        tm.assert_frame_equal(df2, dfref, check_names=False)
        tm.assert_frame_equal(df3, dfref, check_names=False)

    @td.skip_if_no('xlrd', '1.0.1')  # GH-22682
    def test_usecols_str(self, ext):

        dfref = self.get_csv_refdf('test1')

        df1 = dfref.reindex(columns=['A', 'B', 'C'])
        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
                               usecols='A:D')
        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                               index_col=0, usecols='A:D')

        with tm.assert_produces_warning(FutureWarning):
            with ignore_xlrd_time_clock_warning():
                df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                                       index_col=0, parse_cols='A:D')

        # TODO add index to xls, read xls ignores index name ?
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)
        tm.assert_frame_equal(df4, df1, check_names=False)

        df1 = dfref.reindex(columns=['B', 'C'])
        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
                               usecols='A,C,D')
        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                               index_col=0, usecols='A,C,D')
        # TODO add index to xls file
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)

        df1 = dfref.reindex(columns=['B', 'C'])
        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
                               usecols='A,C:D')
        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
                               index_col=0, usecols='A,C:D')
        tm.assert_frame_equal(df2, df1, check_names=False)
        tm.assert_frame_equal(df3, df1, check_names=False)

    @pytest.mark.parametrize("usecols", [
        [0, 1, 3], [0, 3, 1],
        [1, 0, 3], [1, 3, 0],
        [3, 0, 1], [3, 1, 0],
    ])
    def test_usecols_diff_positional_int_columns_order(self, ext, usecols):
        expected = self.get_csv_refdf("test1")[["A", "C"]]
        result = self.get_exceldf("test1", ext, "Sheet1",
                                  index_col=0, usecols=usecols)
        tm.assert_frame_equal(result, expected, check_names=False)

    @pytest.mark.parametrize("usecols", [
        ["B", "D"], ["D", "B"]
    ])
    def test_usecols_diff_positional_str_columns_order(self, ext, usecols):
        expected = self.get_csv_refdf("test1")[["B", "D"]]
        expected.index = range(len(expected))

        result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols)
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_read_excel_without_slicing(self, ext):
        expected = self.get_csv_refdf("test1")
        result = self.get_exceldf("test1", ext, "Sheet1", index_col=0)
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_usecols_excel_range_str(self, ext):
        expected = self.get_csv_refdf("test1")[["C", "D"]]
        result = self.get_exceldf("test1", ext, "Sheet1",
                                  index_col=0, usecols="A,D:E")
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_usecols_excel_range_str_invalid(self, ext):
        msg = "Invalid column name: E1"

        with pytest.raises(ValueError, match=msg):
            self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1")

    def test_index_col_label_error(self, ext):
        msg = "list indices must be integers.*, not str"

        with pytest.raises(TypeError, match=msg):
            self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
                             usecols=["A", "C"])

    def test_index_col_empty(self, ext):
        # see gh-9208
        result = self.get_exceldf("test1", ext, "Sheet3",
                                  index_col=["A", "B", "C"])
        expected = DataFrame(columns=["D", "E", "F"],
                             index=MultiIndex(levels=[[]] * 3,
                                              codes=[[]] * 3,
                                              names=["A", "B", "C"]))
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("index_col", [None, 2])
    def test_index_col_with_unnamed(self, ext, index_col):
        # see gh-18792
        result = self.get_exceldf("test1", ext, "Sheet4",
                                  index_col=index_col)
        expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
                             columns=["Unnamed: 0", "col1", "col2"])
        if index_col:
            expected = expected.set_index(expected.columns[index_col])

        tm.assert_frame_equal(result, expected)

    def test_usecols_pass_non_existent_column(self, ext):
        msg = ("Usecols do not match columns, "
               "columns expected but not found: " + r"\['E'\]")

        with pytest.raises(ValueError, match=msg):
            self.get_exceldf("test1", ext, usecols=["E"])

    def test_usecols_wrong_type(self, ext):
        msg = ("'usecols' must either be list-like of "
               "all strings, all unicode, all integers or a callable.")

        with pytest.raises(ValueError, match=msg):
            self.get_exceldf("test1", ext, usecols=["E1", 0])

    def test_excel_stop_iterator(self, ext):

        parsed = self.get_exceldf('test2', ext, 'Sheet1')
        expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
        tm.assert_frame_equal(parsed, expected)

    def test_excel_cell_error_na(self, ext):

        parsed = self.get_exceldf('test3', ext, 'Sheet1')
        expected = DataFrame([[np.nan]], columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

    def test_excel_passes_na(self, ext):

        excel = self.get_excelfile('test4', ext)

        parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
                            na_values=['apple'])
        expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
                            na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        # 13967
        excel = self.get_excelfile('test5', ext)

        parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
                            na_values=['apple'])
        expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
                            na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

    @td.skip_if_no('xlrd', '1.0.1')  # GH-22682
Loading ...