Skip to content

Commit f862b52

Browse files
Ben Kandel authored and jreback committed
BUG: Fix parse empty df
closes #14515

This commit fixes a bug where `read_csv` failed when given a file with a multiindex header and empty content. Because pandas reads index names as a separate line following the header lines, the reader looks for the line with index names in it. If the content of the dataframe is empty, the reader will choke. This bug surfaced after #6618 stopped writing an extra line after multiindex columns, which led to a situation where pandas could write CSVs that it couldn't then read.

This commit changes that behavior by explicitly checking if the index name row exists, and processing it correctly if it doesn't.

Author: Ben Kandel <[email protected]>

Closes #14596 from bkandel/fix-parse-empty-df and squashes the following commits:

32e3b0a [Ben Kandel] lint
e6b1237 [Ben Kandel] lint
fedfff8 [Ben Kandel] fix multiindex column parsing
518982d [Ben Kandel] move to 0.19.2
fc23e5c [Ben Kandel] fix errant this_columns
3d9bbdd [Ben Kandel] whatsnew
68eadf3 [Ben Kandel] Modify test.
17e44dd [Ben Kandel] fix python parser too
72adaf2 [Ben Kandel] remove unnecessary test
bfe0423 [Ben Kandel] typo
2f64d57 [Ben Kandel] pep8
b8200e4 [Ben Kandel] BUG: read_csv with empty df
1 parent f609640 commit f862b52

File tree

5 files changed

+41
-7
lines changed

5 files changed

+41
-7
lines changed

doc/source/whatsnew/v0.19.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Bug Fixes
2929

3030
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
3131
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
32-
32+
- Bug in ``pd.read_csv`` where reading a file fails if the number of header rows is equal to the number of lines in the file (:issue:`14515`)
3333

3434

3535

pandas/io/parsers.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -1509,10 +1509,11 @@ def read(self, nrows=None):
15091509
if self._first_chunk:
15101510
self._first_chunk = False
15111511
names = self._maybe_dedup_names(self.orig_names)
1512-
15131512
index, columns, col_dict = _get_empty_meta(
15141513
names, self.index_col, self.index_names,
15151514
dtype=self.kwds.get('dtype'))
1515+
columns = self._maybe_make_multi_index_columns(
1516+
columns, self.col_names)
15161517

15171518
if self.usecols is not None:
15181519
columns = self._filter_usecols(columns)
@@ -1979,8 +1980,11 @@ def read(self, rows=None):
19791980
if not len(content): # pragma: no cover
19801981
# DataFrame with the right metadata, even though it's length 0
19811982
names = self._maybe_dedup_names(self.orig_names)
1982-
return _get_empty_meta(names, self.index_col,
1983-
self.index_names)
1983+
index, columns, col_dict = _get_empty_meta(
1984+
names, self.index_col, self.index_names)
1985+
columns = self._maybe_make_multi_index_columns(
1986+
columns, self.col_names)
1987+
return index, columns, col_dict
19841988

19851989
# handle new style for names in index
19861990
count_empty_content_vals = count_empty_vals(content[0])
@@ -2083,6 +2087,12 @@ def _infer_columns(self):
20832087
# We have an empty file, so check
20842088
# if columns are provided. That will
20852089
# serve as the 'line' for parsing
2090+
if have_mi_columns and hr > 0:
2091+
if clear_buffer:
2092+
self._clear_buffer()
2093+
columns.append([None] * len(columns[-1]))
2094+
return columns, num_original_columns
2095+
20862096
if not self.names:
20872097
raise EmptyDataError(
20882098
"No columns to parse from file")

pandas/io/tests/parser/common.py

+22
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self):
606606
expected = self.read_csv(StringIO(data), index_col=[1, 0])
607607
tm.assert_frame_equal(df, expected, check_names=False)
608608

609+
def test_multi_index_blank_df(self):
610+
# GH 14515
611+
data = """a,b
612+
"""
613+
df = self.read_csv(StringIO(data), header=[0])
614+
expected = DataFrame(columns=['a', 'b'])
615+
tm.assert_frame_equal(df, expected)
616+
round_trip = self.read_csv(StringIO(
617+
expected.to_csv(index=False)), header=[0])
618+
tm.assert_frame_equal(round_trip, expected)
619+
620+
data_multiline = """a,b
621+
c,d
622+
"""
623+
df2 = self.read_csv(StringIO(data_multiline), header=[0, 1])
624+
cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
625+
expected2 = DataFrame(columns=cols)
626+
tm.assert_frame_equal(df2, expected2)
627+
round_trip = self.read_csv(StringIO(
628+
expected2.to_csv(index=False)), header=[0, 1])
629+
tm.assert_frame_equal(round_trip, expected2)
630+
609631
def test_no_unnamed_index(self):
610632
data = """ id c0 c1 c2
611633
0 1 0 a b

pandas/parser.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,9 @@ cdef class TextReader:
717717
start = self.parser.line_start[0]
718718

719719
# e.g., if header=3 and file only has 2 lines
720-
elif self.parser.lines < hr + 1:
720+
elif (self.parser.lines < hr + 1
721+
and not isinstance(self.orig_header, list)) or (
722+
self.parser.lines < hr):
721723
msg = self.orig_header
722724
if isinstance(msg, list):
723725
msg = "[%s], len of %d," % (
@@ -940,7 +942,7 @@ cdef class TextReader:
940942
raise_parser_error('Error tokenizing data', self.parser)
941943
footer = self.skipfooter
942944

943-
if self.parser_start == self.parser.lines:
945+
if self.parser_start >= self.parser.lines:
944946
raise StopIteration
945947
self._end_clock('Tokenization')
946948

pandas/tests/frame/test_to_csv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def _make_frame(names=None):
587587
df = _make_frame(True)
588588
df.to_csv(path, tupleize_cols=False)
589589

590-
for i in [5, 6, 7]:
590+
for i in [6, 7]:
591591
msg = 'len of {i}, but only 5 lines in file'.format(i=i)
592592
with assertRaisesRegexp(ParserError, msg):
593593
read_csv(path, tupleize_cols=False,

0 commit comments

Comments
 (0)