diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 612840e82e3ff..904347acbb655 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2568,6 +2568,52 @@ def read_table(self, *args, **kwds): kwds['buffer_lines'] = 2 return read_table(*args, **kwds) + def test_list_of_one_header(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +""" + df = self.read_csv(StringIO(data), header=[0]) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected = DataFrame(values, columns=['A', 'B', 'C']) + + tm.assert_frame_equal(df, expected) + + def test_list_of_multiple_headers(self): + data = """A,B,C +a,b,c +1,2,3 +4,5,6 +7,8,9 +""" + df = self.read_csv(StringIO(data), header=[0,1]) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected_columns = pd.MultiIndex.from_arrays([['A', 'B', 'C'], ['a', 'b', 'c']]) + expected = DataFrame(values, columns=expected_columns) + + tm.assert_frame_equal(df, expected) + + def test_list_of_multiple_headers_with_duplicated_column_pairs(self): + data = """A,A,A,A,A,B,B +a,b,b,b,c,c,c +1,2,3,4,5,6,7 +1,2,3,4,5,6,7 +1,2,3,4,5,6,7 +""" + df = self.read_csv(StringIO(data), header=[0,1]) + + values = [[1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7]] + expected_columns = pd.MultiIndex.from_arrays([ + ['A', 'A', 'A', 'A', 'A', 'B', 'B'], + ['a', 'b', 'b.1', 'b.2', 'c', 'c', 'c.1']]) + expected = DataFrame(values, columns=expected_columns) + + tm.assert_frame_equal(df, expected) + + def test_compact_ints(self): data = ('0,1,0,0\n' '1,1,0,0\n' diff --git a/pandas/parser.pyx b/pandas/parser.pyx index bb93097debf71..c4df3581e03cc 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -30,6 +30,7 @@ import numpy as np cimport util import pandas.lib as lib +from pandas.compat import lzip import time import os @@ -460,7 +461,8 @@ cdef class TextReader: self.parser_start = 0 self.header = [] else: - if isinstance(header, list) and len(header): + if isinstance(header, list) and 
len(header) >= 2: + # FIXME # need to artificially skip the final line # which is still a header line header = list(header) @@ -473,6 +475,11 @@ cdef class TextReader: self.has_mi_columns = 1 self.header = header else: + # if the header is a list with length 1 + # set the header as the only element in the list + if isinstance(header, list) and len(header) == 1: + header = header[0] + self.parser.header_start = header self.parser.header_end = header self.parser.header = header @@ -586,6 +593,7 @@ cdef class TextReader: char *errors = "strict" header = [] + is_duplicated = False if self.parser.header_start >= 0: @@ -633,6 +641,9 @@ cdef class TextReader: count = counts.get(name, 0) if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: this_header.append('%s.%d' % (name, count)) + + # for warning later + is_duplicated = True else: this_header.append(name) counts[name] = count + 1 @@ -653,6 +664,43 @@ cdef class TextReader: data_line = hr + 1 header.append(this_header) + # + # Append a seq number for the duplicated column pairs + # + # i.e. 
[['a', 'a', 'a', 'b'], + # ['A', 'A', 'B', 'C']] + # ==> + # [['a', 'a', 'a', 'b'], + # ['A', 'A.1', 'B', 'C']] + # + if self.has_mi_columns: + + # zip the header, so that we can easily find the duplicated pair + header = lzip(*header) + + counts = {} + for i, column in enumerate(header): + + # Check whether the column is duplicated + count = counts.get(column, 0) + if count > 0: + # + # FIXME + # Since we've added an extra header line (search FIXME in this page) + # Append an incremental seq number to the second-last element + # + tmp_column = list(column) + tmp_column[-2] = '%s.%d' % (tmp_column[-2], count) + header[i] = tuple(tmp_column) + + # for warning later + is_duplicated = True + + counts[column] = count + 1 + + # unzip the header + header = lzip(*header) + if self.names is not None: header = [ self.names ] @@ -710,6 +758,9 @@ cdef class TextReader: elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count + if self.mangle_dupe_cols and is_duplicated: + warnings.warn('Duplicated columns have been mangled', DtypeWarning) + return header, field_count cdef _implicit_index_count(self):