From 8651a438d898f5e94b52f451e52039a721649acb Mon Sep 17 00:00:00 2001 From: Matt Wittmann Date: Tue, 29 Apr 2014 16:25:30 -0700 Subject: [PATCH] BUG: fix reading multi-level index in python parser --- doc/source/release.rst | 1 + pandas/io/parsers.py | 9 +++++---- pandas/io/tests/test_parsers.py | 11 +++++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 064fd3cf12b2f..b5a11091779ec 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -457,6 +457,7 @@ Bug Fixes - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`) - Bug in C parser with leading whitespace (:issue:`3374`) - Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines +- Bug in python parser with explicit multi-index in row following column header (:issue:`6893`) - Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`) - Bug in ``DataFrame.apply`` with functions that used \*args`` or \*\*kwargs and returned an empty result (:issue:`6952`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b439ca5c61aeb..4898fabfcd2b4 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1383,7 +1383,7 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: (index_names, - self.orig_names, columns_) = self._get_index_name(self.columns) + self.orig_names, self.columns) = self._get_index_name(self.columns) self._name_processed = True if self.index_names is None: self.index_names = index_names @@ -1811,8 +1811,9 @@ def _get_index_name(self, columns): columns.insert(0, c) # Update list of original names to include all indices. - self.num_original_columns = len(next_line) - return line, columns, orig_names + orig_names = list(columns) + self.num_original_columns = len(columns) + return line, orig_names, columns if implicit_first_cols > 0: # Case 1 @@ -1824,7 +1825,7 @@ def _get_index_name(self, columns): else: # Case 2 - (index_name, columns, + (index_name, columns_, self.index_col) = _clean_index_names(columns, self.index_col) return index_name, orig_names, columns diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 872e719eaa630..2a31eb9608001 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1569,7 +1569,7 @@ def test_converter_return_string_bug(self): def test_read_table_buglet_4x_multiindex(self): # GH 6607 - # Parsing multiindex columns currently causes an error in the C parser. + # Parsing multi-level index currently causes an error in the C parser. # Temporarily copied to TestPythonParser. # Here test that CParserError is raised: @@ -2692,7 +2692,7 @@ def test_decompression_regex_sep(self): def test_read_table_buglet_4x_multiindex(self): # GH 6607 # This is a copy which should eventually be merged into ParserTests - # when the issue with multiindex columns is fixed in the C parser. + # when the issue with multi-level index is fixed in the C parser. text = """ A B C D E one two three four @@ -2704,6 +2704,13 @@ def test_read_table_buglet_4x_multiindex(self): df = self.read_table(StringIO(text), sep='\s+') self.assertEquals(df.index.names, ('one', 'two', 'three', 'four')) + # GH 6893 + data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9' + expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)], + columns=list('abcABC'), index=list('abc')) + actual = self.read_table(StringIO(data), sep='\s+') + tm.assert_frame_equal(actual, expected) + class TestFwfColspaceSniffing(tm.TestCase): def test_full_file(self): # File with all values