diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d0dddb19f4c93..2fbd23352feb1 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1402,7 +1402,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) -- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) +- Bug in :meth:`read_excel()` in which column names were sometimes not being properly converted to string in Python 2.x (:issue:`23874`) +- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were being parsed anyway (:issue:`18792`, :issue:`20480`) - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 1328713736b03..880ff5a56804f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -662,10 +662,14 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = parser.read(nrows=nrows) - if ((not squeeze or isinstance(output[asheetname], DataFrame)) - and header_names): - output[asheetname].columns = output[ - asheetname].columns.set_names(header_names) + if not squeeze or isinstance(output[asheetname], DataFrame): + if
header_names: + output[asheetname].columns = output[ + asheetname].columns.set_names(header_names) + elif compat.PY2: + output[asheetname].columns = _maybe_convert_to_string( + output[asheetname].columns) + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() @@ -810,6 +814,39 @@ def _trim_excel_header(row): return row +def _maybe_convert_to_string(row): + """ + Convert elements in a row to string from Unicode. + + This is purely a Python 2.x patch and is performed ONLY when all + elements of the row are string-like. + + Parameters + ---------- + row : array-like + The row of data to convert. + + Returns + ------- + converted : array-like + """ + if compat.PY2: + converted = [] + + for i in range(len(row)): + if isinstance(row[i], compat.string_types): + try: + converted.append(str(row[i])) + except UnicodeEncodeError: + break + else: + break + else: + row = converted + + return row + + def _fill_mi_header(row, control_row): """Forward fills blank entries in row, but only inside the same parent index @@ -838,7 +875,7 @@ def _fill_mi_header(row, control_row): control_row[i] = False last = row[i] - return row, control_row + return _maybe_convert_to_string(row), control_row # fill blank if index_col not None diff --git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/test1.xls index a5940b2cfa6c2..faf5dc84700c9 100644 Binary files a/pandas/tests/io/data/test1.xls and b/pandas/tests/io/data/test1.xls differ diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm index 981c303b7bd30..f93c57ab7f857 100644 Binary files a/pandas/tests/io/data/test1.xlsm and b/pandas/tests/io/data/test1.xlsm differ diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx index 8f011d0687521..a437d838fe130 100644 Binary files a/pandas/tests/io/data/test1.xlsx and b/pandas/tests/io/data/test1.xlsx differ diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 
741d03a8dc0c2..7cc1f1899db98 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -264,6 +264,18 @@ def test_index_col_empty(self, ext): names=["A", "B", "C"])) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("index_col", [None, 2]) + def test_index_col_with_unnamed(self, ext, index_col): + # see gh-18792 + result = self.get_exceldf("test1", ext, "Sheet4", + index_col=index_col) + expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], + columns=["Unnamed: 0", "col1", "col2"]) + if index_col: + expected = expected.set_index(expected.columns[index_col]) + + tm.assert_frame_equal(result, expected) + def test_usecols_pass_non_existent_column(self, ext): msg = ("Usecols do not match columns, " "columns expected but not found: " + r"\['E'\]") @@ -923,9 +935,9 @@ def test_read_excel_multiindex_empty_level(self, ext): }) expected = DataFrame({ - ("One", u"x"): {0: 1}, - ("Two", u"X"): {0: 3}, - ("Two", u"Y"): {0: 7}, + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, ("Zero", "Unnamed: 4_level_1"): {0: 0} }) @@ -942,9 +954,9 @@ def test_read_excel_multiindex_empty_level(self, ext): expected = pd.DataFrame({ ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", u"x"): {0: 1}, - ("Tail", u"X"): {0: 3}, - ("Tail", u"Y"): {0: 7} + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} }) df.to_excel(path)