Commit 9b8294b

gfyoung authored and Pingviinituutti committed
BUG: Fix handling of missing CSV MI column names (pandas-dev#23484)
1 parent afb68be commit 9b8294b

4 files changed, +80 -22 lines changed

doc/source/whatsnew/v0.24.0.txt

+1
@@ -1283,6 +1283,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`)
 - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
 - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
+- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)

 Plotting
 ^^^^^^^^

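Behaviorally, the whatsnew entry above boils down to the following (a minimal, illustrative sketch assuming a pandas build that includes this fix; the CSV content is made up):

from io import StringIO

import pandas as pd

# Two blank header cells for the index levels, then a named data column.
data = ",,col\na,c,1\na,d,2\nb,c,3\nb,d,4"
df = pd.read_csv(StringIO(data), index_col=[0, 1])

# The missing MultiIndex names stay missing instead of leaking the
# parser's internal "Unnamed: 0"-style placeholder names.
assert list(df.index.names) == [None, None]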
pandas/_libs/parsers.pyx

+15 -4
@@ -302,6 +302,7 @@ cdef class TextReader:
         object tupleize_cols
         object usecols
         list dtype_cast_order
+        set unnamed_cols
         set noconvert

     def __cinit__(self, source,
@@ -536,7 +537,7 @@ cdef class TextReader:
                 self.header = [ header ]

         self.names = names
-        self.header, self.table_width = self._get_header()
+        self.header, self.table_width, self.unnamed_cols = self._get_header()

         if not self.table_width:
             raise EmptyDataError("No columns to parse from file")
@@ -720,13 +721,15 @@ cdef class TextReader:
         cdef:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
             char *word
-            object name
+            object name, old_name
             int status
             int64_t hr, data_line
             char *errors = "strict"
         cdef StringPath path = _string_path(self.c_encoding)

         header = []
+        unnamed_cols = set()
+
         if self.parser.header_start >= 0:

             # Header is in the file
@@ -759,6 +762,7 @@ cdef class TextReader:

                 counts = {}
                 unnamed_count = 0
+
                 for i in range(field_count):
                     word = self.parser.words[start + i]

@@ -770,6 +774,9 @@ cdef class TextReader:
                         name = PyUnicode_Decode(word, strlen(word),
                                                 self.c_encoding, errors)

+                    # We use this later when collecting placeholder names.
+                    old_name = name
+
                     if name == '':
                         if self.has_mi_columns:
                             name = ('Unnamed: {i}_level_{lvl}'
@@ -786,6 +793,9 @@ cdef class TextReader:
                             name = '%s.%d' % (name, count)
                             count = counts.get(name, 0)

+                    if old_name == '':
+                        unnamed_cols.add(name)
+
                     this_header.append(name)
                     counts[name] = count + 1

@@ -798,6 +808,7 @@ cdef class TextReader:
                         lc = len(this_header)
                         ic = (len(self.index_col) if self.index_col
                               is not None else 0)
+
                         if lc != unnamed_count and lc - ic > unnamed_count:
                             hr -= 1
                             self.parser_start -= 1
@@ -830,7 +841,7 @@ cdef class TextReader:
             if self.parser.lines < 1:
                 self._tokenize_rows(1)

-            return None, self.parser.line_fields[0]
+            return None, self.parser.line_fields[0], unnamed_cols

         # Corner case, not enough lines in the file
         if self.parser.lines < data_line + 1:
@@ -864,7 +875,7 @@ cdef class TextReader:
         elif self.allow_leading_cols and passed_count < field_count:
             self.leading_cols = field_count - passed_count

-        return header, field_count
+        return header, field_count, unnamed_cols

     def read(self, rows=None):
         """

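For readers not fluent in Cython, the placeholder-naming logic that `_get_header` now tracks can be sketched in plain Python roughly as follows (a simplified illustration only; `has_mi_columns`, `level`, and the duplicate-name mangling are pared down, and the function name is hypothetical):

def name_header_row(fields, has_mi_columns=False, level=0):
    """Assign placeholder names to blank header cells and record which
    final names were generated from blanks (cf. ``unnamed_cols``)."""
    header = []
    unnamed_cols = set()

    for i, name in enumerate(fields):
        old_name = name  # remember whether the cell started out blank

        if name == '':
            if has_mi_columns:
                name = 'Unnamed: {i}_level_{lvl}'.format(i=i, lvl=level)
            else:
                name = 'Unnamed: {i}'.format(i=i)

        if old_name == '':
            # Only names the parser invented are recorded; a column the
            # user really called "Unnamed: 0" is not.
            unnamed_cols.add(name)

        header.append(name)

    return header, unnamed_cols


# e.g. ['', 'b', ''] -> (['Unnamed: 0', 'b', 'Unnamed: 2'],
#                        {'Unnamed: 0', 'Unnamed: 2'})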
pandas/io/parsers.py

+36 -18
@@ -1265,6 +1265,7 @@ def __init__(self, kwds):
         self.prefix = kwds.pop('prefix', None)

         self.index_col = kwds.get('index_col', None)
+        self.unnamed_cols = set()
         self.index_names = None
         self.col_names = None

@@ -1374,7 +1375,8 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names,
         # clean the index_names
         index_names = header.pop(-1)
         index_names, names, index_col = _clean_index_names(index_names,
-                                                           self.index_col)
+                                                           self.index_col,
+                                                           self.unnamed_cols)

         # extract the columns
         field_count = len(header[0])
@@ -1454,7 +1456,8 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
             if not self._name_processed:
                 (self.index_names, _,
                  self.index_col) = _clean_index_names(list(columns),
-                                                      self.index_col)
+                                                      self.index_col,
+                                                      self.unnamed_cols)
                 self._name_processed = True
             index = self._get_complex_date_index(data, columns)
             index = self._agg_index(index, try_parse_dates=False)
@@ -1732,6 +1735,7 @@ def __init__(self, src, **kwds):
             kwds['usecols'] = self.usecols

         self._reader = parsers.TextReader(src, **kwds)
+        self.unnamed_cols = self._reader.unnamed_cols

         passed_names = self.names is None

@@ -1792,7 +1796,8 @@ def __init__(self, src, **kwds):
             self._name_processed = True
             (index_names, self.names,
              self.index_col) = _clean_index_names(self.names,
-                                                  self.index_col)
+                                                  self.index_col,
+                                                  self.unnamed_cols)

             if self.index_names is None:
                 self.index_names = index_names
@@ -1966,7 +1971,8 @@ def _get_index_names(self):

         if self._reader.leading_cols == 0 and self.index_col is not None:
             (idx_names, names,
-             self.index_col) = _clean_index_names(names, self.index_col)
+             self.index_col) = _clean_index_names(names, self.index_col,
+                                                  self.unnamed_cols)

         return names, idx_names

@@ -2112,7 +2118,8 @@ def __init__(self, f, **kwds):
         # Get columns in two steps: infer from data, then
         # infer column indices from self.usecols if it is specified.
         self._col_indices = None
-        self.columns, self.num_original_columns = self._infer_columns()
+        (self.columns, self.num_original_columns,
+         self.unnamed_cols) = self._infer_columns()

         # Now self.columns has the set of columns that we will process.
         # The original set is stored in self.original_columns.
@@ -2367,6 +2374,8 @@ def _infer_columns(self):
         names = self.names
         num_original_columns = 0
         clear_buffer = True
+        unnamed_cols = set()
+
         if self.header is not None:
             header = self.header

@@ -2400,24 +2409,27 @@ def _infer_columns(self):
                         if clear_buffer:
                             self._clear_buffer()
                         columns.append([None] * len(columns[-1]))
-                        return columns, num_original_columns
+                        return columns, num_original_columns, unnamed_cols

                     if not self.names:
                         raise EmptyDataError(
                             "No columns to parse from file")

                     line = self.names[:]

-                unnamed_count = 0
                 this_columns = []
+                this_unnamed_cols = []
+
                 for i, c in enumerate(line):
                     if c == '':
                         if have_mi_columns:
-                            this_columns.append('Unnamed: %d_level_%d'
-                                                % (i, level))
+                            col_name = ("Unnamed: {i}_level_{level}"
+                                        .format(i=i, level=level))
                         else:
-                            this_columns.append('Unnamed: %d' % i)
-                            unnamed_count += 1
+                            col_name = "Unnamed: {i}".format(i=i)
+
+                        this_unnamed_cols.append(i)
+                        this_columns.append(col_name)
                     else:
                         this_columns.append(c)

@@ -2443,12 +2455,17 @@ def _infer_columns(self):
                     lc = len(this_columns)
                     ic = (len(self.index_col)
                           if self.index_col is not None else 0)
+                    unnamed_count = len(this_unnamed_cols)
+
                     if lc != unnamed_count and lc - ic > unnamed_count:
                         clear_buffer = False
                         this_columns = [None] * lc
                         self.buf = [self.buf[-1]]

                 columns.append(this_columns)
+                unnamed_cols.update({this_columns[i]
+                                     for i in this_unnamed_cols})
+
                 if len(columns) == 1:
                     num_original_columns = len(this_columns)
@@ -2513,7 +2530,7 @@ def _infer_columns(self):
                     columns = [names]
                     num_original_columns = ncols

-        return columns, num_original_columns
+        return columns, num_original_columns, unnamed_cols

     def _handle_usecols(self, columns, usecols_key):
         """
@@ -2879,7 +2896,8 @@ def _get_index_name(self, columns):
         else:
             # Case 2
             (index_name, columns_,
-             self.index_col) = _clean_index_names(columns, self.index_col)
+             self.index_col) = _clean_index_names(columns, self.index_col,
+                                                  self.unnamed_cols)

         return index_name, orig_names, columns

@@ -3178,7 +3196,7 @@ def _clean_na_values(na_values, keep_default_na=True):
     return na_values, na_fvalues


-def _clean_index_names(columns, index_col):
+def _clean_index_names(columns, index_col, unnamed_cols):
     if not _is_index_col(index_col):
         return None, columns, index_col

@@ -3203,10 +3221,10 @@ def _clean_index_names(columns, index_col):
             columns.remove(name)
             index_names.append(name)

-    # hack
-    if (isinstance(index_names[0], compat.string_types) and
-            'Unnamed' in index_names[0]):
-        index_names[0] = None
+    # Only clean index names that were placeholders.
+    for i, name in enumerate(index_names):
+        if isinstance(name, compat.string_types) and name in unnamed_cols:
+            index_names[i] = None

     return index_names, columns, index_col

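The net effect of the `_clean_index_names` change can be seen in a standalone sketch (hypothetical helper names, using plain `str` instead of `compat.string_types`; the real function also resolves `index_col` positions and labels):

def clean_index_names_old(index_names):
    # Pre-fix heuristic: any leading name containing "Unnamed" was dropped,
    # even if the user genuinely had a column called "Unnamed: 0".
    if isinstance(index_names[0], str) and 'Unnamed' in index_names[0]:
        index_names[0] = None
    return index_names


def clean_index_names_new(index_names, unnamed_cols):
    # Post-fix: only names known to be parser-generated placeholders are
    # dropped, and every index level is checked, not just the first.
    for i, name in enumerate(index_names):
        if isinstance(name, str) and name in unnamed_cols:
            index_names[i] = None
    return index_names


# A real column literally named "Unnamed: 0" now survives:
print(clean_index_names_old(["Unnamed: 0", "bar"]))          # [None, 'bar']
print(clean_index_names_new(["Unnamed: 0", "bar"], set()))   # ['Unnamed: 0', 'bar']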
pandas/tests/io/parser/index_col.py

+28
@@ -141,3 +141,31 @@ def test_empty_with_index_col_false(self):
         result = self.read_csv(StringIO(data), index_col=False)
         expected = DataFrame([], columns=['x', 'y'])
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("index_names", [
+        ["", ""],
+        ["foo", ""],
+        ["", "bar"],
+        ["foo", "bar"],
+        ["NotReallyUnnamed", "Unnamed: 0"],
+    ])
+    def test_multi_index_naming(self, index_names):
+        # We don't want empty index names being replaced with "Unnamed: 0"
+        data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
+        result = self.read_csv(StringIO(data), index_col=[0, 1])
+
+        expected = DataFrame({"col": [1, 2, 3, 4]},
+                             index=MultiIndex.from_product([["a", "b"],
+                                                            ["c", "d"]]))
+        expected.index.names = [name if name else None for name in index_names]
+        tm.assert_frame_equal(result, expected)
+
+    def test_multi_index_naming_not_all_at_beginning(self):
+        data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
+        result = self.read_csv(StringIO(data), index_col=[0, 2])
+
+        expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]},
+                             index=MultiIndex(
+                                 levels=[['a', 'b'], [1, 2, 3, 4]],
+                                 labels=[[0, 0, 1, 1], [0, 1, 2, 3]]))
+        tm.assert_frame_equal(result, expected)
