Commit 31cac55

Revert "Merge pull request #7591 from mcwitt/parse-index-cols-c"
This reverts commit 2c603e1, reversing changes made to 49a86f1.
1 parent 160419e · commit 31cac55

4 files changed (+44, -75 lines)
doc/source/v0.14.1.txt (-3 lines)

@@ -136,9 +136,6 @@ Enhancements
   non-unique labels along *item* axis (``index``, ``columns`` and ``items``
   respectively) (:issue:`7370`).
 
-- ``read_csv`` and ``read_table`` can now read index columns from the first
-  line after the header when using the C engine (:issue:`6893`)
-
 
 .. _whatsnew_0141.performance:
 

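The enhancement removed from the whatsnew entry above (GH 6893) let the C engine pick up index column names from a short line placed directly below the header. As a point of reference, here is a small sketch built from the data in the test that this commit also removes; it only shows the input shape and the frame the old test expected, not behaviour the C engine supports after this revert:

from pandas import DataFrame

# Input shape from the removed GH 6893 test: the header names only the data
# columns (A B C), the line below it carries the index column names (a b c),
# and every data row has three extra leading fields.
data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'

# Frame the removed test asserted while the enhancement was in place:
# columns A, B, C with a three-level index built from the a, b and c fields.
expected = DataFrame.from_records([(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
                                  columns=list('abcABC'), index=list('abc'))

# Before this revert the C engine produced `expected` from
#     read_table(StringIO(data), sep='\s+')
# afterwards the extra line is no longer treated as index column names.
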
pandas/io/parsers.py (-4 lines)

@@ -1085,10 +1085,6 @@ def __init__(self, src, **kwds):
 
         self.orig_names = self.names
 
-        # index_col may be specified on line after the header
-        if self.index_col is None:
-            self.index_col = self._reader.index_col
-
         if not self._has_complex_date_col:
             if (self._reader.leading_cols == 0 and
                     _is_index_col(self.index_col)):

pandas/io/tests/test_parsers.py (+32, -11 lines)

@@ -1568,22 +1568,21 @@ def test_converter_return_string_bug(self):
         self.assertEqual(df2['Number1'].dtype, float)
 
     def test_read_table_buglet_4x_multiindex(self):
-        text = """ A B C D E
+        # GH 6607
+        # Parsing multi-level index currently causes an error in the C parser.
+        # Temporarily copied to TestPythonParser.
+        # Here test that CParserError is raised:
+
+        with tm.assertRaises(CParserError):
+            text = """ A B C D E
 one two three four
 a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
 
-        # it works!
-        df = self.read_table(StringIO(text), sep='\s+')
-        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
-
-        # GH 6893
-        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
-        expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
-                                          columns=list('abcABC'), index=list('abc'))
-        actual = self.read_table(StringIO(data), sep='\s+')
-        tm.assert_frame_equal(actual, expected)
+            # it works!
+            df = self.read_table(StringIO(text), sep='\s+')
+            self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
 
     def test_line_comment(self):
         data = """# empty

@@ -2773,6 +2772,28 @@ def test_decompression_regex_sep(self):
         self.assertRaises(ValueError, self.read_csv,
                           path, compression='bz3')
 
+    def test_read_table_buglet_4x_multiindex(self):
+        # GH 6607
+        # This is a copy which should eventually be merged into ParserTests
+        # when the issue with multi-level index is fixed in the C parser.
+
+        text = """ A B C D E
+one two three four
+a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
+a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
+x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
+
+        # it works!
+        df = self.read_table(StringIO(text), sep='\s+')
+        self.assertEqual(df.index.names, ('one', 'two', 'three', 'four'))
+
+        # GH 6893
+        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
+        expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
+                                          columns=list('abcABC'), index=list('abc'))
+        actual = self.read_table(StringIO(data), sep='\s+')
+        tm.assert_frame_equal(actual, expected)
+
 class TestFwfColspaceSniffing(tm.TestCase):
     def test_full_file(self):
         # File with all values

pandas/parser.pyx (+12, -57 lines)

@@ -576,17 +576,6 @@ cdef class TextReader:
             raise IOError('Expected file path name or file-like object,'
                           ' got %s type' % type(source))
 
-    cdef _word2name(self, word, char *errors):
-        if self.c_encoding == NULL and not PY3:
-            name = PyBytes_FromString(word)
-        else:
-            if self.c_encoding == NULL or self.c_encoding == b'utf-8':
-                name = PyUnicode_FromString(word)
-            else:
-                name = PyUnicode_Decode(word, strlen(word),
-                                        self.c_encoding, errors)
-        return name
-
     cdef _get_header(self):
         # header is now a list of lists, so field_count should use header[0]
 

@@ -625,7 +614,16 @@ cdef class TextReader:
                 counts = {}
                 unnamed_count = 0
                 for i in range(field_count):
-                    name = self._word2name(self.parser.words[start + i], errors)
+                    word = self.parser.words[start + i]
+
+                    if self.c_encoding == NULL and not PY3:
+                        name = PyBytes_FromString(word)
+                    else:
+                        if self.c_encoding == NULL or self.c_encoding == b'utf-8':
+                            name = PyUnicode_FromString(word)
+                        else:
+                            name = PyUnicode_Decode(word, strlen(word),
+                                                    self.c_encoding, errors)
 
                     if name == '':
                         if self.has_mi_columns:

@@ -689,56 +687,13 @@ cdef class TextReader:
         else: # not self.has_usecols:
 
            field_count = self.parser.line_fields[data_line]
-           passed_count = len(header[0])
-
-           # #6893: look for index columns on first line after header
-
-           # hack: temporarily set expected_fields to prevent parser from
-           # raising if it sees extra columns
-           ex_fields = self.parser.expected_fields
-           self.parser.expected_fields = field_count
-
-           datapos = self.parser.datapos  # save position
-           self._tokenize_rows(1)
-           self.parser.expected_fields = ex_fields  # restore expected_fields
-
-           if self.parser.lines == data_line + 2:
-               field_count_next = self.parser.line_fields[data_line + 1]
-
-               if field_count_next > field_count:
-                   # found extra columns in the second row after the header
-                   # check whether previous row contains index columns
-                   start = self.parser.line_start[data_line]
-
-                   line = [self._word2name(self.parser.words[start + i], errors)
-                           for i in range(self.parser.line_fields[data_line])]
-
-                   # remove trailing empty fields
-                   while not line[-1]:
-                       line.pop()
-
-                   if passed_count + len(line) == field_count_next:
-                       for h in header:
-                           for c in reversed(line):
-                               h.insert(0, c)
-
-                       field_count = field_count_next
-                       passed_count = field_count
-                       self.index_col = line
-                       self.parser_start += 1
-
-               else:
-                   # hack: didn't find index columns, back up a line and
-                   # let the parser code hande this...
-                   self.parser.datapos = datapos
-                   self.parser.lines -= 1
-                   self.parser.file_lines -= 1
-                   self.parser.line_fields[self.parser.lines] = 0
 
            # #2981
            if self.names is not None:
                field_count = max(field_count, len(self.names))
 
+           passed_count = len(header[0])
+
            # if passed_count > field_count:
            #     raise CParserError('Column names have %d fields, '
            #                        'data has %d fields'

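The large block removed above is the GH 6893 heuristic inside TextReader._get_header: tokenize one extra row, and if that row is wider than the line directly under the header, check whether the header's field count plus the short line's non-empty fields accounts for the extra width; if it does, treat those fields as index column names and prepend them to the header. Below is a rough pure-Python sketch of that decision, simplified from the Cython original; the helper name and list-based inputs are mine, and the real code works on the C tokenizer's word buffers and also rewinds the tokenizer when the check fails.

def find_index_names(header, first_line, second_line):
    """Sketch of the reverted GH 6893 heuristic.

    header      : list of header rows (each a list of column names)
    first_line  : fields of the line directly below the header
    second_line : fields of the line after that
    Returns the detected index column names, or None.
    """
    passed_count = len(header[0])        # named columns in the header
    field_count = len(first_line)        # fields on the line under the header
    field_count_next = len(second_line)  # fields on the following line

    if field_count_next <= field_count:
        # The next row is not wider, so there are no unaccounted-for
        # leading columns: nothing to treat as an index.
        return None

    # Drop trailing empty fields, as the removed code did.
    candidate = list(first_line)
    while candidate and not candidate[-1]:
        candidate.pop()

    if passed_count + len(candidate) == field_count_next:
        # The extra fields in the wider row are exactly covered by the
        # candidate names: prepend them to every header row and report
        # them as the index columns.
        for h in header:
            for name in reversed(candidate):
                h.insert(0, name)
        return candidate

    return None


# Example mirroring the removed test data (' A B C\na b c\n1 3 7 0 3 6'):
header = [['A', 'B', 'C']]
print(find_index_names(header, ['a', 'b', 'c'], ['1', '3', '7', '0', '3', '6']))
# -> ['a', 'b', 'c']; header becomes [['a', 'b', 'c', 'A', 'B', 'C']]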