BUG: Patch Python 2.x non-conversion of Unicode

gfyoung · gfyoung · commit 78618500f17b · 2018-11-28T05:43:42.000-08:00
For column names only.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1402,6 +1402,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
 - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
 - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
+- Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`)
 - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`)
 - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
 - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`)
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -662,10 +662,14 @@ def _parse_cell(cell_contents, cell_typ):
 
                 output[asheetname] = parser.read(nrows=nrows)
 
-                if ((not squeeze or isinstance(output[asheetname], DataFrame))
-                        and header_names):
-                    output[asheetname].columns = output[
-                        asheetname].columns.set_names(header_names)
+                if not squeeze or isinstance(output[asheetname], DataFrame):
+                    if header_names:
+                        output[asheetname].columns = output[
+                            asheetname].columns.set_names(header_names)
+                    elif compat.PY2:
+                        output[asheetname].columns = _maybe_convert_to_string(
+                            output[asheetname].columns)
+
             except EmptyDataError:
                 # No Data, return an empty DataFrame
                 output[asheetname] = DataFrame()
@@ -810,6 +814,39 @@ def _trim_excel_header(row):
     return row
 
 
+def _maybe_convert_to_string(row):
+    """
+    Convert elements in a row to string from Unicode.
+
+    This is purely a Python 2.x patch and is performed ONLY when all
+    elements of the row are string-like.
+
+    Parameters
+    ----------
+    row : array-like
+        The row of data to convert.
+
+    Returns
+    -------
+    converted : array-like
+    """
+    if compat.PY2:
+        converted = []
+
+        for i in range(len(row)):
+            if isinstance(row[i], compat.string_types):
+                try:
+                    converted.append(str(row[i]))
+                except UnicodeEncodeError:
+                    break
+            else:
+                break
+        else:
+            row = converted
+
+    return row
+
+
 def _fill_mi_header(row, control_row):
     """Forward fills blank entries in row, but only inside the same parent index
 
@@ -838,7 +875,7 @@ def _fill_mi_header(row, control_row):
             control_row[i] = False
             last = row[i]
 
-    return row, control_row
+    return _maybe_convert_to_string(row), control_row
 
 # fill blank if index_col not None