BUG: 'Unnamed' != unnamed column in CSV (#23687)

gfyoung · jreback · commit 4a6a9187ce14 · 2018-11-16T09:02:12.000-05:00
An incorrect criterion was causing errors when
the specified header rows appeared to capture
a seemingly unnamed row, just because its column
names contained the string "Unnamed".
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1359,6 +1359,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 - Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
+- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
 - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
 - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1387,22 +1387,20 @@ def extract(r):
         columns = lzip(*[extract(r) for r in header])
         names = ic + columns
 
-        def tostr(x):
-            return str(x) if not isinstance(x, compat.string_types) else x
-
-        # if we find 'Unnamed' all of a single level, then our header was too
-        # long
+        # If we find unnamed columns all in a single
+        # level, then our header was too long.
         for n in range(len(columns[0])):
-            if all('Unnamed' in tostr(c[n]) for c in columns):
+            if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):
                 raise ParserError(
                     "Passed header=[%s] are too many rows for this "
                     "multi_index of columns"
                     % ','.join(str(x) for x in self.header)
                 )
 
-        # clean the column names (if we have an index_col)
+        # Clean the column names (if we have an index_col).
         if len(ic):
-            col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None
+            col_names = [r[0] if (len(r[0]) and
+                                  r[0] not in self.unnamed_cols) else None
                          for r in header]
         else:
             col_names = [None] * len(header)
diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
@@ -11,6 +11,7 @@
 import pytest
 
 from pandas.compat import StringIO, lrange, u
+from pandas.errors import ParserError
 
 from pandas import DataFrame, Index, MultiIndex
 import pandas.util.testing as tm
@@ -360,3 +361,47 @@ def test_mangles_multi_index(self):
                                   ('A', 'one.1.1'), ('B', 'two'),
                                   ('B', 'two.1')]))
         tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize("index_col", [None, [0]])
+    @pytest.mark.parametrize("columns", [None,
+                                         (["", "Unnamed"]),
+                                         (["Unnamed", ""]),
+                                         (["Unnamed", "NotUnnamed"])])
+    def test_multi_index_unnamed(self, index_col, columns):
+        # see gh-23687
+        #
+        # When specifying a multi-index header, make sure that
+        # we don't error just because one of the rows in our header
+        # has ALL column names containing the string "Unnamed". The
+        # correct condition to check is whether the row contains
+        # ALL columns that did not have names (and instead were given
+        # placeholder ones).
+        header = [0, 1]
+
+        if index_col is None:
+            data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
+        else:
+            data = (",".join([""] + (columns or ["", ""])) +
+                    "\n,0,1\n0,2,3\n1,4,5\n")
+
+        if columns is None:
+            msg = (r"Passed header=\[0,1\] are too "
+                   r"many rows for this multi_index of columns")
+            with pytest.raises(ParserError, match=msg):
+                self.read_csv(StringIO(data), header=header,
+                              index_col=index_col)
+        else:
+            result = self.read_csv(StringIO(data), header=header,
+                                   index_col=index_col)
+            template = "Unnamed: {i}_level_0"
+            exp_columns = []
+
+            for i, col in enumerate(columns):
+                if not col:  # Unnamed.
+                    col = template.format(i=i if index_col is None else i + 1)
+
+                exp_columns.append(col)
+
+            columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
+            expected = DataFrame([[2, 3], [4, 5]], columns=columns)
+            tm.assert_frame_equal(result, expected)