Skip to content

Commit 57482f7

Browse files
Ben Kandel, jorisvandenbossche
Ben Kandel
authored and committed
[Backport pandas-dev#14596] BUG: Fix parse empty df
Closes pandas-dev#14515. This commit fixes a bug where `read_csv` failed when given a file with a multiindex header and empty content. Because pandas reads index names as a separate line following the header lines, the reader looks for the line with index names in it. If the content of the dataframe is empty, the reader will choke. This bug surfaced after pandas-dev#6618 stopped writing an extra line after multiindex columns, which led to a situation where pandas could write CSVs that it couldn't then read. This commit changes that behavior by explicitly checking if the index name row exists, and processing it correctly if it doesn't. Author: Ben Kandel <[email protected]> Closes pandas-dev#14596 from bkandel/fix-parse-empty-df and squashes the following commits: 32e3b0a [Ben Kandel] lint e6b1237 [Ben Kandel] lint fedfff8 [Ben Kandel] fix multiindex column parsing 518982d [Ben Kandel] move to 0.19.2 fc23e5c [Ben Kandel] fix errant this_columns 3d9bbdd [Ben Kandel] whatsnew 68eadf3 [Ben Kandel] Modify test. 17e44dd [Ben Kandel] fix python parser too 72adaf2 [Ben Kandel] remove unnecessary test bfe0423 [Ben Kandel] typo 2f64d57 [Ben Kandel] pep8 b8200e4 [Ben Kandel] BUG: read_csv with empty df (cherry picked from commit f862b52)
1 parent 56c6171 commit 57482f7

File tree

5 files changed

+41
-7
lines changed

5 files changed

+41
-7
lines changed

doc/source/whatsnew/v0.19.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Bug Fixes
2929

3030
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
3131
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
32-
32+
- Bug in ``pd.read_csv`` where reading a file failed if the number of header rows was equal to the number of lines in the file (:issue:`14515`)
3333

3434

3535

pandas/io/parsers.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -1509,10 +1509,11 @@ def read(self, nrows=None):
15091509
if self._first_chunk:
15101510
self._first_chunk = False
15111511
names = self._maybe_dedup_names(self.orig_names)
1512-
15131512
index, columns, col_dict = _get_empty_meta(
15141513
names, self.index_col, self.index_names,
15151514
dtype=self.kwds.get('dtype'))
1515+
columns = self._maybe_make_multi_index_columns(
1516+
columns, self.col_names)
15161517

15171518
if self.usecols is not None:
15181519
columns = self._filter_usecols(columns)
@@ -1979,8 +1980,11 @@ def read(self, rows=None):
19791980
if not len(content): # pragma: no cover
19801981
# DataFrame with the right metadata, even though it's length 0
19811982
names = self._maybe_dedup_names(self.orig_names)
1982-
return _get_empty_meta(names, self.index_col,
1983-
self.index_names)
1983+
index, columns, col_dict = _get_empty_meta(
1984+
names, self.index_col, self.index_names)
1985+
columns = self._maybe_make_multi_index_columns(
1986+
columns, self.col_names)
1987+
return index, columns, col_dict
19841988

19851989
# handle new style for names in index
19861990
count_empty_content_vals = count_empty_vals(content[0])
@@ -2083,6 +2087,12 @@ def _infer_columns(self):
20832087
# We have an empty file, so check
20842088
# if columns are provided. That will
20852089
# serve as the 'line' for parsing
2090+
if have_mi_columns and hr > 0:
2091+
if clear_buffer:
2092+
self._clear_buffer()
2093+
columns.append([None] * len(columns[-1]))
2094+
return columns, num_original_columns
2095+
20862096
if not self.names:
20872097
raise EmptyDataError(
20882098
"No columns to parse from file")

pandas/io/tests/parser/common.py

+22
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,28 @@ def test_multi_index_no_level_names(self):
606606
expected = self.read_csv(StringIO(data), index_col=[1, 0])
607607
tm.assert_frame_equal(df, expected, check_names=False)
608608

609+
def test_multi_index_blank_df(self):
610+
# GH 14515
611+
data = """a,b
612+
"""
613+
df = self.read_csv(StringIO(data), header=[0])
614+
expected = DataFrame(columns=['a', 'b'])
615+
tm.assert_frame_equal(df, expected)
616+
round_trip = self.read_csv(StringIO(
617+
expected.to_csv(index=False)), header=[0])
618+
tm.assert_frame_equal(round_trip, expected)
619+
620+
data_multiline = """a,b
621+
c,d
622+
"""
623+
df2 = self.read_csv(StringIO(data_multiline), header=[0, 1])
624+
cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
625+
expected2 = DataFrame(columns=cols)
626+
tm.assert_frame_equal(df2, expected2)
627+
round_trip = self.read_csv(StringIO(
628+
expected2.to_csv(index=False)), header=[0, 1])
629+
tm.assert_frame_equal(round_trip, expected2)
630+
609631
def test_no_unnamed_index(self):
610632
data = """ id c0 c1 c2
611633
0 1 0 a b

pandas/parser.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,9 @@ cdef class TextReader:
714714
start = self.parser.line_start[0]
715715

716716
# e.g., if header=3 and file only has 2 lines
717-
elif self.parser.lines < hr + 1:
717+
elif (self.parser.lines < hr + 1
718+
and not isinstance(self.orig_header, list)) or (
719+
self.parser.lines < hr):
718720
msg = self.orig_header
719721
if isinstance(msg, list):
720722
msg = "[%s], len of %d," % (
@@ -937,7 +939,7 @@ cdef class TextReader:
937939
raise_parser_error('Error tokenizing data', self.parser)
938940
footer = self.skipfooter
939941

940-
if self.parser_start == self.parser.lines:
942+
if self.parser_start >= self.parser.lines:
941943
raise StopIteration
942944
self._end_clock('Tokenization')
943945

pandas/tests/frame/test_to_csv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ def _make_frame(names=None):
587587
df = _make_frame(True)
588588
df.to_csv(path, tupleize_cols=False)
589589

590-
for i in [5, 6, 7]:
590+
for i in [6, 7]:
591591
msg = 'len of {i}, but only 5 lines in file'.format(i=i)
592592
with assertRaisesRegexp(CParserError, msg):
593593
read_csv(path, tupleize_cols=False,

0 commit comments

Comments
 (0)