Skip to content

Commit adc47d8

Browse files
gfyoung authored and jreback committed
TST: Test unnamed columns with index_col for Excel (pandas-dev#23874)
1 parent 66abbc3 commit adc47d8

File tree

6 files changed

+62
-12
lines changed

6 files changed

+62
-12
lines changed

doc/source/whatsnew/v0.24.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -1428,7 +1428,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
14281428
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
14291429
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
14301430
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
1431-
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
1431+
- Bug in :meth:`read_excel()` in which column names were sometimes not properly converted to ``str`` in Python 2.x (:issue:`23874`)
1432+
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were parsed anyway (:issue:`18792`, :issue:`20480`)
14321433
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
14331434
- :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`)
14341435

pandas/io/excel.py

+42-5
Original file line numberDiff line numberDiff line change
@@ -662,10 +662,14 @@ def _parse_cell(cell_contents, cell_typ):
662662

663663
output[asheetname] = parser.read(nrows=nrows)
664664

665-
if ((not squeeze or isinstance(output[asheetname], DataFrame))
666-
and header_names):
667-
output[asheetname].columns = output[
668-
asheetname].columns.set_names(header_names)
665+
if not squeeze or isinstance(output[asheetname], DataFrame):
666+
if header_names:
667+
output[asheetname].columns = output[
668+
asheetname].columns.set_names(header_names)
669+
elif compat.PY2:
670+
output[asheetname].columns = _maybe_convert_to_string(
671+
output[asheetname].columns)
672+
669673
except EmptyDataError:
670674
# No Data, return an empty DataFrame
671675
output[asheetname] = DataFrame()
@@ -810,6 +814,39 @@ def _trim_excel_header(row):
810814
return row
811815

812816

817+
def _maybe_convert_to_string(row):
    """
    Convert the elements of a row from Unicode to ``str`` on Python 2.x.

    The conversion is all-or-nothing: if any element is not string-like,
    or ``str`` raises ``UnicodeEncodeError`` for it, the row is handed
    back untouched. When ``compat.PY2`` is false the row is returned
    as-is.

    Parameters
    ----------
    row : array-like
        The row of data to convert.

    Returns
    -------
    converted : array-like
        A list of ``str`` elements when every element was converted,
        otherwise the original ``row``.
    """
    if not compat.PY2:
        return row

    as_str = []

    for element in row:
        # A single non-string element disqualifies the whole row.
        if not isinstance(element, compat.string_types):
            return row

        try:
            as_str.append(str(element))
        except UnicodeEncodeError:
            # Element cannot be represented as ``str``; keep the
            # original row rather than a partially converted one.
            return row

    return as_str
848+
849+
813850
def _fill_mi_header(row, control_row):
814851
"""Forward fills blank entries in row, but only inside the same parent index
815852
@@ -838,7 +875,7 @@ def _fill_mi_header(row, control_row):
838875
control_row[i] = False
839876
last = row[i]
840877

841-
return row, control_row
878+
return _maybe_convert_to_string(row), control_row
842879

843880
# fill blank if index_col not None
844881

pandas/tests/io/data/test1.xls

512 Bytes
Binary file not shown.

pandas/tests/io/data/test1.xlsm

895 Bytes
Binary file not shown.

pandas/tests/io/data/test1.xlsx

896 Bytes
Binary file not shown.

pandas/tests/io/test_excel.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,18 @@ def test_index_col_empty(self, ext):
264264
names=["A", "B", "C"]))
265265
tm.assert_frame_equal(result, expected)
266266

267+
@pytest.mark.parametrize("index_col", [None, 2])
def test_index_col_with_unnamed(self, ext, index_col):
    # see gh-18792
    #
    # Read "Sheet4" of the "test1" workbook and check that the column
    # labelled "Unnamed: 0" is kept in the result, both without an index
    # column and when the column at position ``index_col`` is the index.
    result = self.get_exceldf("test1", ext, "Sheet4",
                              index_col=index_col)
    expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
                         columns=["Unnamed: 0", "col1", "col2"])

    # Compare against None explicitly: 0 is a valid column position and
    # must not be mistaken for "no index column".
    if index_col is not None:
        expected = expected.set_index(expected.columns[index_col])

    tm.assert_frame_equal(result, expected)
278+
267279
def test_usecols_pass_non_existent_column(self, ext):
268280
msg = ("Usecols do not match columns, "
269281
"columns expected but not found: " + r"\['E'\]")
@@ -923,9 +935,9 @@ def test_read_excel_multiindex_empty_level(self, ext):
923935
})
924936

925937
expected = DataFrame({
926-
("One", u"x"): {0: 1},
927-
("Two", u"X"): {0: 3},
928-
("Two", u"Y"): {0: 7},
938+
("One", "x"): {0: 1},
939+
("Two", "X"): {0: 3},
940+
("Two", "Y"): {0: 7},
929941
("Zero", "Unnamed: 4_level_1"): {0: 0}
930942
})
931943

@@ -942,9 +954,9 @@ def test_read_excel_multiindex_empty_level(self, ext):
942954

943955
expected = pd.DataFrame({
944956
("Beg", "Unnamed: 1_level_1"): {0: 0},
945-
("Middle", u"x"): {0: 1},
946-
("Tail", u"X"): {0: 3},
947-
("Tail", u"Y"): {0: 7}
957+
("Middle", "x"): {0: 1},
958+
("Tail", "X"): {0: 3},
959+
("Tail", "Y"): {0: 7}
948960
})
949961

950962
df.to_excel(path)

0 commit comments

Comments
 (0)