Skip to content

Commit 6934d0d

Browse files
committed
ENH: look for index columns in row after header
1 parent 181431f commit 6934d0d

File tree

4 files changed

+75
-44
lines changed

4 files changed

+75
-44
lines changed

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ Enhancements
136136

137137
- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in its values (:issue:`7485`)
138138

139+
- ``read_csv`` and ``read_table`` can now read index columns from the first
140+
line after the header when using the C engine (:issue:`6893`)
141+
139142

140143
.. _whatsnew_0141.performance:
141144

pandas/io/parsers.py

+4
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,10 @@ def __init__(self, src, **kwds):
10791079

10801080
self.orig_names = self.names
10811081

1082+
# index_col may be specified on line after the header
1083+
if self.index_col is None:
1084+
self.index_col = self._reader.index_col
1085+
10821086
if not self._has_complex_date_col:
10831087
if (self._reader.leading_cols == 0 and
10841088
_is_index_col(self.index_col)):

pandas/io/tests/test_parsers.py

+11-32
Original file line numberDiff line numberDiff line change
@@ -1568,21 +1568,22 @@ def test_converter_return_string_bug(self):
15681568
self.assertEqual(df2['Number1'].dtype, float)
15691569

15701570
def test_read_table_buglet_4x_multiindex(self):
1571-
# GH 6607
1572-
# Parsing multi-level index currently causes an error in the C parser.
1573-
# Temporarily copied to TestPythonParser.
1574-
# Here test that CParserError is raised:
1575-
1576-
with tm.assertRaises(CParserError):
1577-
text = """ A B C D E
1571+
text = """ A B C D E
15781572
one two three four
15791573
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
15801574
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
15811575
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
15821576

1583-
# it works!
1584-
df = self.read_table(StringIO(text), sep='\s+')
1585-
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
1577+
# it works!
1578+
df = self.read_table(StringIO(text), sep='\s+')
1579+
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
1580+
1581+
# GH 6893
1582+
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
1583+
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
1584+
columns=list('abcABC'), index=list('abc'))
1585+
actual = self.read_table(StringIO(data), sep='\s+')
1586+
tm.assert_frame_equal(actual, expected)
15861587

15871588
def test_read_csv_parse_simple_list(self):
15881589
text = """foo
@@ -2713,28 +2714,6 @@ def test_decompression_regex_sep(self):
27132714
self.assertRaises(ValueError, self.read_csv,
27142715
path, compression='bz3')
27152716

2716-
def test_read_table_buglet_4x_multiindex(self):
2717-
# GH 6607
2718-
# This is a copy which should eventually be merged into ParserTests
2719-
# when the issue with multi-level index is fixed in the C parser.
2720-
2721-
text = """ A B C D E
2722-
one two three four
2723-
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
2724-
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
2725-
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
2726-
2727-
# it works!
2728-
df = self.read_table(StringIO(text), sep='\s+')
2729-
self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
2730-
2731-
# GH 6893
2732-
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
2733-
expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
2734-
columns=list('abcABC'), index=list('abc'))
2735-
actual = self.read_table(StringIO(data), sep='\s+')
2736-
tm.assert_frame_equal(actual, expected)
2737-
27382717
class TestFwfColspaceSniffing(tm.TestCase):
27392718
def test_full_file(self):
27402719
# File with all values

pandas/parser.pyx

+57-12
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,17 @@ cdef class TextReader:
574574
raise IOError('Expected file path name or file-like object,'
575575
' got %s type' % type(source))
576576

577+
cdef _word2name(self, word, char *errors):
578+
if self.c_encoding == NULL and not PY3:
579+
name = PyBytes_FromString(word)
580+
else:
581+
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
582+
name = PyUnicode_FromString(word)
583+
else:
584+
name = PyUnicode_Decode(word, strlen(word),
585+
self.c_encoding, errors)
586+
return name
587+
577588
cdef _get_header(self):
578589
# header is now a list of lists, so field_count should use header[0]
579590

@@ -612,16 +623,7 @@ cdef class TextReader:
612623
counts = {}
613624
unnamed_count = 0
614625
for i in range(field_count):
615-
word = self.parser.words[start + i]
616-
617-
if self.c_encoding == NULL and not PY3:
618-
name = PyBytes_FromString(word)
619-
else:
620-
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
621-
name = PyUnicode_FromString(word)
622-
else:
623-
name = PyUnicode_Decode(word, strlen(word),
624-
self.c_encoding, errors)
626+
name = self._word2name(self.parser.words[start + i], errors)
625627

626628
if name == '':
627629
if self.has_mi_columns:
@@ -685,13 +687,56 @@ cdef class TextReader:
685687
else: # not self.has_usecols:
686688

687689
field_count = self.parser.line_fields[data_line]
690+
passed_count = len(header[0])
691+
692+
# #6893: look for index columns on first line after header
693+
694+
# hack: temporarily set expected_fields to prevent parser from
695+
# raising if it sees extra columns
696+
ex_fields = self.parser.expected_fields
697+
self.parser.expected_fields = field_count
698+
699+
datapos = self.parser.datapos # save position
700+
self._tokenize_rows(1)
701+
self.parser.expected_fields = ex_fields # restore expected_fields
702+
703+
if self.parser.lines == data_line + 2:
704+
field_count_next = self.parser.line_fields[data_line + 1]
705+
706+
if field_count_next > field_count:
707+
# found extra columns in the second row after the header
708+
# check whether previous row contains index columns
709+
start = self.parser.line_start[data_line]
710+
711+
line = [self._word2name(self.parser.words[start + i], errors)
712+
for i in range(self.parser.line_fields[data_line])]
713+
714+
# remove trailing empty fields
715+
while not line[-1]:
716+
line.pop()
717+
718+
if passed_count + len(line) == field_count_next:
719+
for h in header:
720+
for c in reversed(line):
721+
h.insert(0, c)
722+
723+
field_count = field_count_next
724+
passed_count = field_count
725+
self.index_col = line
726+
self.parser_start += 1
727+
728+
else:
729+
# hack: didn't find index columns, back up a line and
730+
let the parser code handle this...
731+
self.parser.datapos = datapos
732+
self.parser.lines -= 1
733+
self.parser.file_lines -= 1
734+
self.parser.line_fields[self.parser.lines] = 0
688735

689736
# #2981
690737
if self.names is not None:
691738
field_count = max(field_count, len(self.names))
692739

693-
passed_count = len(header[0])
694-
695740
# if passed_count > field_count:
696741
# raise CParserError('Column names have %d fields, '
697742
# 'data has %d fields'

0 commit comments

Comments (0)