Skip to content

ENH: read_{csv,table} look for index columns in row after header with C engine #7591

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 30, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ Enhancements

- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in its values (:issue:`7485`)

- ``read_csv`` and ``read_table`` can now read index columns from the first
line after the header when using the C engine (:issue:`6893`)


.. _whatsnew_0141.performance:

Expand Down
4 changes: 4 additions & 0 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,6 +1079,10 @@ def __init__(self, src, **kwds):

self.orig_names = self.names

# index_col may be specified on line after the header
if self.index_col is None:
self.index_col = self._reader.index_col

if not self._has_complex_date_col:
if (self._reader.leading_cols == 0 and
_is_index_col(self.index_col)):
Expand Down
43 changes: 11 additions & 32 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,21 +1568,22 @@ def test_converter_return_string_bug(self):
self.assertEqual(df2['Number1'].dtype, float)

def test_read_table_buglet_4x_multiindex(self):
# GH 6607
# Parsing multi-level index currently causes an error in the C parser.
# Temporarily copied to TestPythonParser.
# Here test that CParserError is raised:

with tm.assertRaises(CParserError):
text = """ A B C D E
text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

# it works!
df = self.read_table(StringIO(text), sep='\s+')
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
# it works!
df = self.read_table(StringIO(text), sep='\s+')
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))

# GH 6893
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
columns=list('abcABC'), index=list('abc'))
actual = self.read_table(StringIO(data), sep='\s+')
tm.assert_frame_equal(actual, expected)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merge test which previously failed with engine='c' into ParserTests (from TestPythonParser) so it gets run for all engines.


def test_read_csv_parse_simple_list(self):
text = """foo
Expand Down Expand Up @@ -2713,28 +2714,6 @@ def test_decompression_regex_sep(self):
self.assertRaises(ValueError, self.read_csv,
path, compression='bz3')

def test_read_table_buglet_4x_multiindex(self):
# GH 6607
# This is a copy which should eventually be merged into ParserTests
# when the issue with multi-level index is fixed in the C parser.

text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

# it works!
df = self.read_table(StringIO(text), sep='\s+')
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))

# GH 6893
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
columns=list('abcABC'), index=list('abc'))
actual = self.read_table(StringIO(data), sep='\s+')
tm.assert_frame_equal(actual, expected)

class TestFwfColspaceSniffing(tm.TestCase):
def test_full_file(self):
# File with all values
Expand Down
69 changes: 57 additions & 12 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,17 @@ cdef class TextReader:
raise IOError('Expected file path name or file-like object,'
' got %s type' % type(source))

cdef _word2name(self, word, char *errors):
    # Convert one raw C-string token (`word`) from the tokenizer into a
    # Python string, honouring the reader's configured encoding.
    # `errors` is the codec error-handling policy (e.g. "strict").
    if self.c_encoding == NULL and not PY3:
        # Python 2 with no explicit encoding: keep the raw bytes as-is.
        name = PyBytes_FromString(word)
    else:
        if self.c_encoding == NULL or self.c_encoding == b'utf-8':
            # Default path: PyUnicode_FromString decodes assuming UTF-8.
            name = PyUnicode_FromString(word)
        else:
            # Arbitrary user-specified encoding: decode explicitly with
            # the requested error policy.
            name = PyUnicode_Decode(word, strlen(word),
                                    self.c_encoding, errors)
    return name

cdef _get_header(self):
# header is now a list of lists, so field_count should use header[0]

Expand Down Expand Up @@ -612,16 +623,7 @@ cdef class TextReader:
counts = {}
unnamed_count = 0
for i in range(field_count):
word = self.parser.words[start + i]

if self.c_encoding == NULL and not PY3:
name = PyBytes_FromString(word)
else:
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
name = PyUnicode_FromString(word)
else:
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, errors)
name = self._word2name(self.parser.words[start + i], errors)

if name == '':
if self.has_mi_columns:
Expand Down Expand Up @@ -685,13 +687,56 @@ cdef class TextReader:
else: # not self.has_usecols:

field_count = self.parser.line_fields[data_line]
passed_count = len(header[0])

# #6893: look for index columns on first line after header

# hack: temporarily set expected_fields to prevent parser from
# raising if it sees extra columns
ex_fields = self.parser.expected_fields
self.parser.expected_fields = field_count

datapos = self.parser.datapos # save position
self._tokenize_rows(1)
self.parser.expected_fields = ex_fields # restore expected_fields

if self.parser.lines == data_line + 2:
field_count_next = self.parser.line_fields[data_line + 1]

if field_count_next > field_count:
# found extra columns in the second row after the header
# check whether previous row contains index columns
start = self.parser.line_start[data_line]

line = [self._word2name(self.parser.words[start + i], errors)
for i in range(self.parser.line_fields[data_line])]

# remove trailing empty fields
while not line[-1]:
line.pop()

if passed_count + len(line) == field_count_next:
for h in header:
for c in reversed(line):
h.insert(0, c)

field_count = field_count_next
passed_count = field_count
self.index_col = line
self.parser_start += 1

else:
# hack: didn't find index columns, back up a line and
# let the parser code handle this...
self.parser.datapos = datapos
self.parser.lines -= 1
self.parser.file_lines -= 1
self.parser.line_fields[self.parser.lines] = 0

# #2981
if self.names is not None:
field_count = max(field_count, len(self.names))

passed_count = len(header[0])

# if passed_count > field_count:
# raise CParserError('Column names have %d fields, '
# 'data has %d fields'
Expand Down