diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 112d14795d9bf..f9926cd26d8da 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -49,7 +49,6 @@ def ignore_xlrd_time_clock_warning(): yield -@td.skip_if_no('xlrd', '1.0.0') class SharedItems: @pytest.fixture(autouse=True) @@ -60,6 +59,20 @@ def setup_method(self, datapath): self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() + +@td.skip_if_no('xlrd', '1.0.0') +class ReadingTestsBase(SharedItems): + # This is based on ExcelWriterBase + + @pytest.fixture(autouse=True, params=['xlrd', None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. @@ -114,19 +127,6 @@ def get_exceldf(self, basename, ext, *args, **kwds): pth = os.path.join(self.dirpath, basename + ext) return read_excel(pth, *args, **kwds) - -class ReadingTestsBase(SharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=['xlrd', None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): @@ -565,74 +565,6 @@ def test_read_excel_blank_with_header(self, ext): actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) - def test_read_one_empty_col_no_header(self, ext, header, expected): - # xref gh-12292 - filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) - def test_read_one_empty_col_with_header(self, ext, header, expected): - filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_set_column_names_in_parameter(self, ext): - # GH 12870 : pass down column names associated with - # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) - - refdf.columns = ['A', 'B'] - - with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) - - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) - def test_date_conversion_overflow(self, ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], @@ -741,7 +673,6 @@ def test_read_from_file_url(self, ext): tm.assert_frame_equal(url_table, local_table) - @td.skip_if_no('pathlib') def test_read_from_pathlib_path(self, ext): # GH12655 @@ -780,32 +711,6 @@ def test_reader_closes_file(self, ext): assert f.closed - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_creating_and_reading_multiple_sheets(self, ext): - # see gh-9450 - # - # Test reading multiple sheets, from a runtime - # created Excel file with multiple sheets. - def tdf(col_sheet_name): - d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[col_sheet_name]) - - sheets = ["AAA", "BBB", "CCC"] - - dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets, dfs)) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in dfs.items(): - df.to_excel(ew, sheetname) - - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) - - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) - def test_reader_seconds(self, ext): # Test reading times with and without milliseconds. GH5945. @@ -902,84 +807,6 @@ def test_read_excel_multiindex_header_only(self, ext): expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) tm.assert_frame_equal(result, expected) - @td.skip_if_no("xlsxwriter") - def test_read_excel_multiindex_empty_level(self, ext): - # see gh-12453 - with ensure_clean(ext) as path: - df = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlsxwriter") - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) - @pytest.mark.parametrize("c_idx_levels", [1, 3]) - @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels): - # see gh-4679 - with ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") - - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = r_idx_names or r_idx_levels <= 1 - - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[0, :] = np.nan - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - def test_excel_old_index_format(self, ext): # see gh-4679 filename = "test_index_name_pre17" + ext @@ -1054,30 +881,6 @@ def test_read_excel_chunksize(self, ext): pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_read_excel_parse_dates(self, ext): - # see gh-11544, gh-12051 - df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) - df2 = df.copy() - df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - - with ensure_clean(ext) as pth: - df2.to_excel(pth) - - res = read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) - - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) - tm.assert_frame_equal(df, res) - def test_read_excel_skiprows_list(self, ext): # GH 4903 actual = pd.read_excel(os.path.join(self.dirpath, @@ -1141,6 +944,208 @@ def test_read_excel_squeeze(self, ext): tm.assert_series_equal(actual, expected) +@td.skip_if_no('xlrd', '1.0.0') +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestRoundTrip: + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, 'with_header', index=False, header=True) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], + [3, 'baz']], columns=['a', 'b']) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, 'Data_no_head', + header=False, index=False) + refdf.to_excel(writer, 'Data_with_head', index=False) + + refdf.columns = ['A', 'B'] + + with ExcelFile(pth) as reader: + xlsdf_no_head = read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = read_excel(reader, 'Data_with_head', + index_col=None, names=['A', 'B']) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheetname) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with ensure_clean(ext) as path: + df = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} + }) + + expected = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + expected = pd.DataFrame({ + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) + tm.assert_frame_equal(df, res) + + +@td.skip_if_no('xlrd', '1.0.0') @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader(ReadingTestsBase): """