From 7f86033f43b1d205c956677366e8f9c3ae775041 Mon Sep 17 00:00:00 2001 From: waitingkuo Date: Tue, 28 Jan 2014 22:58:14 +0800 Subject: [PATCH 1/4] TST: add test cases for parsing duplicated multiple-column header --- pandas/io/tests/test_parsers.py | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 612840e82e3ff..904347acbb655 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2568,6 +2568,52 @@ def read_table(self, *args, **kwds): kwds['buffer_lines'] = 2 return read_table(*args, **kwds) + def test_list_of_one_header(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +""" + df = self.read_csv(StringIO(data), header=[0]) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected = DataFrame(values, columns=['A', 'B', 'C']) + + tm.assert_frame_equal(df, expected) + + def test_list_of_multiple_headers(self): + data = """A,B,C +a,b,c +1,2,3 +4,5,6 +7,8,9 +""" + df = self.read_csv(StringIO(data), header=[0,1]) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected_columns = pd.MultiIndex.from_arrays([['A', 'B', 'C'], ['a', 'b', 'c']]) + expected = DataFrame(values, columns=expected_columns) + + tm.assert_frame_equal(df, expected) + + def test_list_of_multiple_headers_with_duplicated_column_pairs(self): + data = """A,A,A,A,A,B,B +a,b,b,b,c,c,c +1,2,3,4,5,6,7 +1,2,3,4,5,6,7 +1,2,3,4,5,6,7 +""" + df = self.read_csv(StringIO(data), header=[0,1]) + + values = [[1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7], [1, 2, 3, 4, 5, 6, 7]] + expected_columns = pd.MultiIndex.from_arrays([ + ['A', 'A', 'A', 'A', 'A', 'B', 'B'], + ['a', 'b', 'b.1', 'b.2', 'c', 'c', 'c.1']]) + expected = DataFrame(values, columns=expected_columns) + + tm.assert_frame_equal(df, expected) + + def test_compact_ints(self): data = ('0,1,0,0\n' '1,1,0,0\n' From 08ed42615284277738f54724c496b4ff6597b9c8 Mon Sep 17 00:00:00 2001 From: waitingkuo Date: Tue, 28 Jan 2014 23:00:06 
+0800 Subject: [PATCH 2/4] BUG: fix the bug when parsing multiple-column header --- pandas/parser.pyx | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index bb93097debf71..2445ddc50c9a6 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -460,7 +460,8 @@ cdef class TextReader: self.parser_start = 0 self.header = [] else: - if isinstance(header, list) and len(header): + if isinstance(header, list) and len(header) >= 2: + # FIXME # need to artifically skip the final line # which is still a header line header = list(header) @@ -473,6 +474,11 @@ cdef class TextReader: self.has_mi_columns = 1 self.header = header else: + # if the header is a list with length 1 + # set the header as the only element in the list + if isinstance(header, list) and len(header) == 1: + header = header[0] + self.parser.header_start = header self.parser.header_end = header self.parser.header = header @@ -653,6 +659,40 @@ cdef class TextReader: data_line = hr + 1 header.append(this_header) + # + # Append a seq number for the duplicated column pairs + # + # e.g. 
[['a', 'a', 'a', 'b'], + # ['A', 'A', 'B', 'C']] + # ==> + # [['a', 'a', 'a', 'b'], + # ['A', 'A.1', 'B', 'C']] + # + if self.has_mi_columns: + + # zip the header, so that we can easily find the duplicated pair + header = zip(*header) + + counts = {} + for i, column in enumerate(header): + + # Check whether the column is duplicated + count = counts.get(column, 0) + if count > 0: + # + # FIXME + # Since we've added an extra header line (search FIXME in this page) + # Append an incremental seq number to the second-last element + # + tmp_column = list(column) + tmp_column[-2] = '%s.%d' % (tmp_column[-2], count) + header[i] = tuple(tmp_column) + + counts[column] = count + 1 + + # unzip the header + header = [list(x) for x in zip(*header)] + if self.names is not None: header = [ self.names ] From b7079a6653ff18684525b63f8d24222c59d218b8 Mon Sep 17 00:00:00 2001 From: waitingkuo Date: Wed, 29 Jan 2014 21:23:40 +0800 Subject: [PATCH 3/4] BUG: use lzip instead of zip to fix py3 compatibility issue --- pandas/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 2445ddc50c9a6..aaf66737f2176 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -30,6 +30,7 @@ import numpy as np cimport util import pandas.lib as lib +from pandas.compat import lzip import time import os @@ -671,7 +672,7 @@ cdef class TextReader: if self.has_mi_columns: # zip the header, so that we can easily find the duplicated pair - header = zip(*header) + header = lzip(*header) counts = {} for i, column in enumerate(header): @@ -691,7 +692,7 @@ cdef class TextReader: counts[column] = count + 1 # unzip the header - header = [list(x) for x in zip(*header)] + header = lzip(*header) if self.names is not None: header = [ self.names ] From 6c3566308ad8031b788adbc33981ba3ac5e1e1f8 Mon Sep 17 00:00:00 2001 From: waitingkuo Date: Thu, 13 Mar 2014 21:33:15 +0800 Subject: [PATCH 4/4] ENH: give warning if duplicated columns have been found --- 
pandas/parser.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index aaf66737f2176..c4df3581e03cc 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -593,6 +593,7 @@ cdef class TextReader: char *errors = "strict" header = [] + is_duplicated = False if self.parser.header_start >= 0: @@ -640,6 +641,9 @@ cdef class TextReader: count = counts.get(name, 0) if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: this_header.append('%s.%d' % (name, count)) + + # for warning later + is_duplicated = True else: this_header.append(name) counts[name] = count + 1 @@ -689,6 +693,9 @@ cdef class TextReader: tmp_column[-2] = '%s.%d' % (tmp_column[-2], count) header[i] = tuple(tmp_column) + # for warning later + is_duplicated = True + counts[column] = count + 1 # unzip the header @@ -751,6 +758,9 @@ cdef class TextReader: elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count + if self.mangle_dupe_cols and is_duplicated: + warnings.warn('Duplicated columns have been mangled', DtypeWarning) + return header, field_count cdef _implicit_index_count(self):