Skip to content

Commit 4a6a918

Browse files
gfyoung authored and jreback committed
BUG: 'Unnamed' != unnamed column in CSV (#23687)
A false criterion was causing errors when the specified headers appeared to capture a seemingly unnamed row, just because the row's column names had the string "Unnamed" in them.
1 parent e98032d commit 4a6a918

File tree

3 files changed

+52
-8
lines changed

3 files changed

+52
-8
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1359,6 +1359,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13591359
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13601360
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
13611361
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
1362+
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23687`)
13621363
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13631364
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13641365
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/io/parsers.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -1387,22 +1387,20 @@ def extract(r):
13871387
columns = lzip(*[extract(r) for r in header])
13881388
names = ic + columns
13891389

1390-
def tostr(x):
1391-
return str(x) if not isinstance(x, compat.string_types) else x
1392-
1393-
# if we find 'Unnamed' all of a single level, then our header was too
1394-
# long
1390+
# If we find unnamed columns all in a single
1391+
# level, then our header was too long.
13951392
for n in range(len(columns[0])):
1396-
if all('Unnamed' in tostr(c[n]) for c in columns):
1393+
if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):
13971394
raise ParserError(
13981395
"Passed header=[%s] are too many rows for this "
13991396
"multi_index of columns"
14001397
% ','.join(str(x) for x in self.header)
14011398
)
14021399

1403-
# clean the column names (if we have an index_col)
1400+
# Clean the column names (if we have an index_col).
14041401
if len(ic):
1405-
col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None
1402+
col_names = [r[0] if (len(r[0]) and
1403+
r[0] not in self.unnamed_cols) else None
14061404
for r in header]
14071405
else:
14081406
col_names = [None] * len(header)

pandas/tests/io/parser/header.py

+45
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pytest
1212

1313
from pandas.compat import StringIO, lrange, u
14+
from pandas.errors import ParserError
1415

1516
from pandas import DataFrame, Index, MultiIndex
1617
import pandas.util.testing as tm
@@ -360,3 +361,47 @@ def test_mangles_multi_index(self):
360361
('A', 'one.1.1'), ('B', 'two'),
361362
('B', 'two.1')]))
362363
tm.assert_frame_equal(df, expected)
364+
365+
@pytest.mark.parametrize("index_col", [None, [0]])
366+
@pytest.mark.parametrize("columns", [None,
367+
(["", "Unnamed"]),
368+
(["Unnamed", ""]),
369+
(["Unnamed", "NotUnnamed"])])
370+
def test_multi_index_unnamed(self, index_col, columns):
371+
# see gh-23687
372+
#
373+
# When specifying a multi-index header, make sure that
374+
# we don't error just because one of the rows in our header
375+
# has ALL column names containing the string "Unnamed". The
376+
# correct condition to check is whether the row contains
377+
# ALL columns that did not have names (and instead were given
378+
# placeholder ones).
379+
header = [0, 1]
380+
381+
if index_col is None:
382+
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
383+
else:
384+
data = (",".join([""] + (columns or ["", ""])) +
385+
"\n,0,1\n0,2,3\n1,4,5\n")
386+
387+
if columns is None:
388+
msg = (r"Passed header=\[0,1\] are too "
389+
r"many rows for this multi_index of columns")
390+
with pytest.raises(ParserError, match=msg):
391+
self.read_csv(StringIO(data), header=header,
392+
index_col=index_col)
393+
else:
394+
result = self.read_csv(StringIO(data), header=header,
395+
index_col=index_col)
396+
template = "Unnamed: {i}_level_0"
397+
exp_columns = []
398+
399+
for i, col in enumerate(columns):
400+
if not col: # Unnamed.
401+
col = template.format(i=i if index_col is None else i + 1)
402+
403+
exp_columns.append(col)
404+
405+
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
406+
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
407+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)