Skip to content

Commit 2c603e1

Browse files
committed
Merge pull request #7591 from mcwitt/parse-index-cols-c
ENH: read_{csv,table} look for index columns in row after header with C engine
2 parents 49a86f1 + 6934d0d commit 2c603e1

File tree

4 files changed

+75
-44
lines changed

4 files changed

+75
-44
lines changed

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ Enhancements
136136

137137
- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in its values (:issue:`7485`)
138138

139+
- ``read_csv`` and ``read_table`` can now read index columns from the first
140+
line after the header when using the C engine (:issue:`6893`)
141+
139142

140143
.. _whatsnew_0141.performance:
141144

pandas/io/parsers.py

+4
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,10 @@ def __init__(self, src, **kwds):
10851085

10861086
self.orig_names = self.names
10871087

1088+
# index_col may be specified on line after the header
1089+
if self.index_col is None:
1090+
self.index_col = self._reader.index_col
1091+
10881092
if not self._has_complex_date_col:
10891093
if (self._reader.leading_cols == 0 and
10901094
_is_index_col(self.index_col)):

pandas/io/tests/test_parsers.py

+11-32
Original file line numberDiff line numberDiff line change
@@ -1568,21 +1568,22 @@ def test_converter_return_string_bug(self):
15681568
self.assertEqual(df2['Number1'].dtype, float)
15691569

15701570
def test_read_table_buglet_4x_multiindex(self):
1571-
# GH 6607
1572-
# Parsing multi-level index currently causes an error in the C parser.
1573-
# Temporarily copied to TestPythonParser.
1574-
# Here test that CParserError is raised:
1575-
1576-
with tm.assertRaises(CParserError):
1577-
text = """ A B C D E
1571+
text = """ A B C D E
15781572
one two three four
15791573
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
15801574
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
15811575
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
15821576

1583-
# it works!
1584-
df = self.read_table(StringIO(text), sep='\s+')
1585-
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
1577+
# it works!
1578+
df = self.read_table(StringIO(text), sep='\s+')
1579+
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
1580+
1581+
# GH 6893
1582+
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
1583+
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
1584+
columns=list('abcABC'), index=list('abc'))
1585+
actual = self.read_table(StringIO(data), sep='\s+')
1586+
tm.assert_frame_equal(actual, expected)
15861587

15871588
def test_line_comment(self):
15881589
data = """# empty
@@ -2772,28 +2773,6 @@ def test_decompression_regex_sep(self):
27722773
self.assertRaises(ValueError, self.read_csv,
27732774
path, compression='bz3')
27742775

2775-
def test_read_table_buglet_4x_multiindex(self):
2776-
# GH 6607
2777-
# This is a copy which should eventually be merged into ParserTests
2778-
# when the issue with multi-level index is fixed in the C parser.
2779-
2780-
text = """ A B C D E
2781-
one two three four
2782-
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
2783-
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
2784-
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
2785-
2786-
# it works!
2787-
df = self.read_table(StringIO(text), sep='\s+')
2788-
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
2789-
2790-
# GH 6893
2791-
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
2792-
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
2793-
columns=list('abcABC'), index=list('abc'))
2794-
actual = self.read_table(StringIO(data), sep='\s+')
2795-
tm.assert_frame_equal(actual, expected)
2796-
27972776
class TestFwfColspaceSniffing(tm.TestCase):
27982777
def test_full_file(self):
27992778
# File with all values

pandas/parser.pyx

+57-12
Original file line numberDiff line numberDiff line change
@@ -576,6 +576,17 @@ cdef class TextReader:
576576
raise IOError('Expected file path name or file-like object,'
577577
' got %s type' % type(source))
578578

579+
cdef _word2name(self, word, char *errors):
580+
if self.c_encoding == NULL and not PY3:
581+
name = PyBytes_FromString(word)
582+
else:
583+
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
584+
name = PyUnicode_FromString(word)
585+
else:
586+
name = PyUnicode_Decode(word, strlen(word),
587+
self.c_encoding, errors)
588+
return name
589+
579590
cdef _get_header(self):
580591
# header is now a list of lists, so field_count should use header[0]
581592

@@ -614,16 +625,7 @@ cdef class TextReader:
614625
counts = {}
615626
unnamed_count = 0
616627
for i in range(field_count):
617-
word = self.parser.words[start + i]
618-
619-
if self.c_encoding == NULL and not PY3:
620-
name = PyBytes_FromString(word)
621-
else:
622-
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
623-
name = PyUnicode_FromString(word)
624-
else:
625-
name = PyUnicode_Decode(word, strlen(word),
626-
self.c_encoding, errors)
628+
name = self._word2name(self.parser.words[start + i], errors)
627629

628630
if name == '':
629631
if self.has_mi_columns:
@@ -687,13 +689,56 @@ cdef class TextReader:
687689
else: # not self.has_usecols:
688690

689691
field_count = self.parser.line_fields[data_line]
692+
passed_count = len(header[0])
693+
694+
# #6893: look for index columns on first line after header
695+
696+
# hack: temporarily set expected_fields to prevent parser from
697+
# raising if it sees extra columns
698+
ex_fields = self.parser.expected_fields
699+
self.parser.expected_fields = field_count
700+
701+
datapos = self.parser.datapos # save position
702+
self._tokenize_rows(1)
703+
self.parser.expected_fields = ex_fields # restore expected_fields
704+
705+
if self.parser.lines == data_line + 2:
706+
field_count_next = self.parser.line_fields[data_line + 1]
707+
708+
if field_count_next > field_count:
709+
# found extra columns in the second row after the header
710+
# check whether previous row contains index columns
711+
start = self.parser.line_start[data_line]
712+
713+
line = [self._word2name(self.parser.words[start + i], errors)
714+
for i in range(self.parser.line_fields[data_line])]
715+
716+
# remove trailing empty fields
717+
while not line[-1]:
718+
line.pop()
719+
720+
if passed_count + len(line) == field_count_next:
721+
for h in header:
722+
for c in reversed(line):
723+
h.insert(0, c)
724+
725+
field_count = field_count_next
726+
passed_count = field_count
727+
self.index_col = line
728+
self.parser_start += 1
729+
730+
else:
731+
# hack: didn't find index columns, back up a line and
732+
# let the parser code handle this...
733+
self.parser.datapos = datapos
734+
self.parser.lines -= 1
735+
self.parser.file_lines -= 1
736+
self.parser.line_fields[self.parser.lines] = 0
690737

691738
# #2981
692739
if self.names is not None:
693740
field_count = max(field_count, len(self.names))
694741

695-
passed_count = len(header[0])
696-
697742
# if passed_count > field_count:
698743
# raise CParserError('Column names have %d fields, '
699744
# 'data has %d fields'

0 commit comments

Comments
 (0)