Skip to content

Commit ec41e07

Browse files
committed
BUG: 'Unnamed' != unnamed column in CSV
A faulty criterion was causing errors when the specified header rows appeared to capture an unnamed row merely because their column names contained the string "Unnamed".
1 parent a197837 commit ec41e07

File tree

3 files changed

+45
-6
lines changed

3 files changed

+45
-6
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13071307
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13081308
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
13091309
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
1310+
- Bug in :func:`read_csv()` in which unnamed columns were being improperly identified when extracting a multi-index (:issue:`23686`)
13101311
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13111312
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13121313
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/io/parsers.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -1387,22 +1387,23 @@ def extract(r):
13871387
columns = lzip(*[extract(r) for r in header])
13881388
names = ic + columns
13891389

1390-
def tostr(x):
1390+
def to_str(x):
13911391
return str(x) if not isinstance(x, compat.string_types) else x
13921392

1393-
# if we find 'Unnamed' all of a single level, then our header was too
1394-
# long
1393+
# If we find unnamed columns all in a single
1394+
# level, then our header was too long.
13951395
for n in range(len(columns[0])):
1396-
if all('Unnamed' in tostr(c[n]) for c in columns):
1396+
if all(to_str(c[n]) in self.unnamed_cols for c in columns):
13971397
raise ParserError(
13981398
"Passed header=[%s] are too many rows for this "
13991399
"multi_index of columns"
14001400
% ','.join(str(x) for x in self.header)
14011401
)
14021402

1403-
# clean the column names (if we have an index_col)
1403+
# Clean the column names (if we have an index_col).
14041404
if len(ic):
1405-
col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None
1405+
col_names = [r[0] if (len(r[0]) and
1406+
r[0] not in self.unnamed_cols) else None
14061407
for r in header]
14071408
else:
14081409
col_names = [None] * len(header)

pandas/tests/io/parser/header.py

+37
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pytest
1212

1313
from pandas.compat import StringIO, lrange, u
14+
from pandas.errors import ParserError
1415

1516
from pandas import DataFrame, Index, MultiIndex
1617
import pandas.util.testing as tm
@@ -360,3 +361,39 @@ def test_mangles_multi_index(self):
360361
('A', 'one.1.1'), ('B', 'two'),
361362
('B', 'two.1')]))
362363
tm.assert_frame_equal(df, expected)
364+
365+
@pytest.mark.parametrize("index_col", [None, [0]])
366+
@pytest.mark.parametrize("columns", [None,
367+
(["", "Unnamed"]),
368+
(["Unnamed", ""]),
369+
(["Unnamed", "NotUnnamed"])])
370+
def test_multi_index_unnamed(self, index_col, columns):
371+
header = [0, 1]
372+
373+
if index_col is None:
374+
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
375+
else:
376+
data = (",".join([""] + (columns or ["", ""])) +
377+
"\n,0,1\n0,2,3\n1,4,5\n")
378+
379+
if columns is None:
380+
msg = (r"Passed header=\[0,1\] are too "
381+
r"many rows for this multi_index of columns")
382+
with pytest.raises(ParserError, match=msg):
383+
self.read_csv(StringIO(data), header=header,
384+
index_col=index_col)
385+
else:
386+
result = self.read_csv(StringIO(data), header=header,
387+
index_col=index_col)
388+
template = "Unnamed: {i}_level_0"
389+
exp_columns = []
390+
391+
for i, col in enumerate(columns):
392+
if not col: # Unnamed.
393+
col = template.format(i=i if index_col is None else i + 1)
394+
395+
exp_columns.append(col)
396+
397+
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
398+
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
399+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)