
Commit 2668351

BUG: Fix handling of missing CSV MI column names
Previously, only the first index name was replaced with `None`, and it was replaced whenever it merely contained the string "Unnamed". Now every index name is replaced with `None`, but only when it was one of the generated placeholder names.
1 parent b9fc22d commit 2668351
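A minimal reproduction of the behaviour this commit targets (a sketch for illustration, not part of the change itself): a CSV is read with two index columns, and only the first of them has a header name.

    import pandas as pd
    from io import StringIO

    # Two index columns; the first is named "foo", the second has no header name.
    data = "foo,,col\na,c,1\na,d,2\nb,c,3\nb,d,4"

    df = pd.read_csv(StringIO(data), index_col=[0, 1])

    # With this fix, every index level whose header was a generated placeholder
    # ("Unnamed: <i>") reports its name as None; previously the second level
    # kept the literal placeholder string as its name.
    print(list(df.index.names))  # ['foo', None]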

File tree

4 files changed: +59 -24 lines

  doc/source/whatsnew/v0.24.0.txt
  pandas/_libs/parsers.pyx
  pandas/io/parsers.py
  pandas/tests/io/parser/index_col.py

doc/source/whatsnew/v0.24.0.txt

+1
@@ -1275,6 +1275,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`)
 - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
 - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
+- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)

 Plotting
 ^^^^^^^^

pandas/_libs/parsers.pyx

+9 -6
@@ -302,6 +302,7 @@ cdef class TextReader:
 object tupleize_cols
 object usecols
 list dtype_cast_order
+set unnamed_cols
 set noconvert

 def __cinit__(self, source,
@@ -536,7 +537,7 @@
 self.header = [ header ]

 self.names = names
-self.header, self.table_width = self._get_header()
+self.header, self.table_width, self.unnamed_cols = self._get_header()

 if not self.table_width:
 raise EmptyDataError("No columns to parse from file")
@@ -727,6 +728,8 @@
 cdef StringPath path = _string_path(self.c_encoding)

 header = []
+unnamed_cols = set()
+
 if self.parser.header_start >= 0:

 # Header is in the file
@@ -758,7 +761,6 @@
 start = self.parser.line_start[hr]

 counts = {}
-unnamed_count = 0
 for i in range(field_count):
 word = self.parser.words[start + i]

@@ -776,8 +778,7 @@
 .format(i=i, lvl=level))
 else:
 name = 'Unnamed: {i}'.format(i=i)
-unnamed_count += 1
-
+unnamed_cols.add(name)
 count = counts.get(name, 0)

 if not self.has_mi_columns and self.mangle_dupe_cols:
@@ -798,6 +799,8 @@
 lc = len(this_header)
 ic = (len(self.index_col) if self.index_col
 is not None else 0)
+unnamed_count = len(unnamed_cols)
+
 if lc != unnamed_count and lc - ic > unnamed_count:
 hr -= 1
 self.parser_start -= 1
@@ -830,7 +833,7 @@
 if self.parser.lines < 1:
 self._tokenize_rows(1)

-return None, self.parser.line_fields[0]
+return None, self.parser.line_fields[0], unnamed_cols

 # Corner case, not enough lines in the file
 if self.parser.lines < data_line + 1:
@@ -864,7 +867,7 @@
 elif self.allow_leading_cols and passed_count < field_count:
 self.leading_cols = field_count - passed_count

-return header, field_count
+return header, field_count, unnamed_cols

 def read(self, rows=None):
 """

pandas/io/parsers.py

+31 -18
@@ -1265,6 +1265,7 @@ def __init__(self, kwds):
 self.prefix = kwds.pop('prefix', None)

 self.index_col = kwds.get('index_col', None)
+self.unnamed_cols = set()
 self.index_names = None
 self.col_names = None

@@ -1374,7 +1375,8 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names,
 # clean the index_names
 index_names = header.pop(-1)
 index_names, names, index_col = _clean_index_names(index_names,
-self.index_col)
+self.index_col,
+self.unnamed_cols)

 # extract the columns
 field_count = len(header[0])
@@ -1454,7 +1456,8 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
 if not self._name_processed:
 (self.index_names, _,
 self.index_col) = _clean_index_names(list(columns),
-self.index_col)
+self.index_col,
+self.unnamed_cols)
 self._name_processed = True
 index = self._get_complex_date_index(data, columns)
 index = self._agg_index(index, try_parse_dates=False)
@@ -1732,6 +1735,7 @@ def __init__(self, src, **kwds):
 kwds['usecols'] = self.usecols

 self._reader = parsers.TextReader(src, **kwds)
+self.unnamed_cols = self._reader.unnamed_cols

 passed_names = self.names is None

@@ -1792,7 +1796,8 @@ def __init__(self, src, **kwds):
 self._name_processed = True
 (index_names, self.names,
 self.index_col) = _clean_index_names(self.names,
-self.index_col)
+self.index_col,
+self.unnamed_cols)

 if self.index_names is None:
 self.index_names = index_names
@@ -1966,7 +1971,8 @@ def _get_index_names(self):

 if self._reader.leading_cols == 0 and self.index_col is not None:
 (idx_names, names,
-self.index_col) = _clean_index_names(names, self.index_col)
+self.index_col) = _clean_index_names(names, self.index_col,
+self.unnamed_cols)

 return names, idx_names

@@ -2112,7 +2118,8 @@ def __init__(self, f, **kwds):
 # Get columns in two steps: infer from data, then
 # infer column indices from self.usecols if it is specified.
 self._col_indices = None
-self.columns, self.num_original_columns = self._infer_columns()
+(self.columns, self.num_original_columns,
+self.unnamed_cols) = self._infer_columns()

 # Now self.columns has the set of columns that we will process.
 # The original set is stored in self.original_columns.
@@ -2367,6 +2374,8 @@ def _infer_columns(self):
 names = self.names
 num_original_columns = 0
 clear_buffer = True
+unnamed_cols = set()
+
 if self.header is not None:
 header = self.header

@@ -2400,24 +2409,25 @@
 if clear_buffer:
 self._clear_buffer()
 columns.append([None] * len(columns[-1]))
-return columns, num_original_columns
+return columns, num_original_columns, unnamed_cols

 if not self.names:
 raise EmptyDataError(
 "No columns to parse from file")

 line = self.names[:]

-unnamed_count = 0
 this_columns = []
 for i, c in enumerate(line):
 if c == '':
 if have_mi_columns:
-this_columns.append('Unnamed: %d_level_%d'
-% (i, level))
+col_name = ("Unnamed: {i}_level_{level}"
+.format(i=i, level=level))
 else:
-this_columns.append('Unnamed: %d' % i)
-unnamed_count += 1
+col_name = "Unnamed: {i}".format(i=i)
+
+unnamed_cols.add(col_name)
+this_columns.append(col_name)
 else:
 this_columns.append(c)

@@ -2443,6 +2453,8 @@
 lc = len(this_columns)
 ic = (len(self.index_col)
 if self.index_col is not None else 0)
+unnamed_count = len(unnamed_cols)
+
 if lc != unnamed_count and lc - ic > unnamed_count:
 clear_buffer = False
 this_columns = [None] * lc
@@ -2513,7 +2525,7 @@
 columns = [names]
 num_original_columns = ncols

-return columns, num_original_columns
+return columns, num_original_columns, unnamed_cols

 def _handle_usecols(self, columns, usecols_key):
 """
@@ -2879,7 +2891,8 @@ def _get_index_name(self, columns):
 else:
 # Case 2
 (index_name, columns_,
-self.index_col) = _clean_index_names(columns, self.index_col)
+self.index_col) = _clean_index_names(columns, self.index_col,
+self.unnamed_cols)

 return index_name, orig_names, columns

@@ -3178,7 +3191,7 @@ def _clean_na_values(na_values, keep_default_na=True):
 return na_values, na_fvalues


-def _clean_index_names(columns, index_col):
+def _clean_index_names(columns, index_col, unnamed_cols):
 if not _is_index_col(index_col):
 return None, columns, index_col

@@ -3203,10 +3216,10 @@ def _clean_index_names(columns, index_col):
 columns.remove(name)
 index_names.append(name)

-# hack
-if (isinstance(index_names[0], compat.string_types) and
-'Unnamed' in index_names[0]):
-index_names[0] = None
+# Only clean index names that were placeholders.
+for i, name in enumerate(index_names):
+if isinstance(name, compat.string_types) and name in unnamed_cols:
+index_names[i] = None

 return index_names, columns, index_col
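The key behavioural change sits in `_clean_index_names`: instead of nulling only the first index name whenever it merely contained the substring "Unnamed", the helper now nulls every index name that is one of the recorded placeholders. A standalone sketch of just that step (the helper below is hypothetical; the real function also prunes `columns`, normalizes `index_col`, and checks `compat.string_types` rather than `str`):

    def null_placeholder_names(index_names, unnamed_cols):
        # Replace only names that were generated as placeholders.
        return [None if isinstance(name, str) and name in unnamed_cols else name
                for name in index_names]

    # Every generated placeholder is dropped, not only the first one:
    null_placeholder_names(['foo', 'Unnamed: 1'], {'Unnamed: 1'})
    # -> ['foo', None]

    # A genuine column name that merely contains "Unnamed" is left alone,
    # which the old substring check could not guarantee:
    null_placeholder_names(['NotReallyUnnamed', 'bar'], set())
    # -> ['NotReallyUnnamed', 'bar']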

pandas/tests/io/parser/index_col.py

+18
@@ -141,3 +141,21 @@ def test_empty_with_index_col_false(self):
 result = self.read_csv(StringIO(data), index_col=False)
 expected = DataFrame([], columns=['x', 'y'])
 tm.assert_frame_equal(result, expected)
+
+@pytest.mark.parametrize("index_names", [
+["", ""],
+["foo", ""],
+["", "bar"],
+["foo", "bar"],
+["NotReallyUnnamed", "Unnamed: 0"],
+])
+def test_multi_index_naming(self, index_names):
+# We don't want empty index names being replaced with "Unnamed: 0"
+data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
+result = self.read_csv(StringIO(data), index_col=[0, 1])
+
+expected = DataFrame({"col": [1, 2, 3, 4]},
+index=MultiIndex.from_product([["a", "b"],
+["c", "d"]]))
+expected.index.names = [name if name else None for name in index_names]
+tm.assert_frame_equal(result, expected)
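Spelled out for one of the parametrizations above, index_names == ["foo", ""], the generated CSV and the expected result look roughly like this (a sketch using pandas.read_csv directly rather than the test mixin's self.read_csv):

    from io import StringIO
    import pandas as pd

    # data == "foo,,col\na,c,1\na,d,2\nb,c,3\nb,d,4", i.e. a header row
    # "foo,,col" followed by four data rows.
    data = ",".join(["foo", ""] + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])

    result = pd.read_csv(StringIO(data), index_col=[0, 1])

    # The unnamed second index level comes back as None, not "Unnamed: 1".
    assert list(result.index.names) == ["foo", None]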
