From 7f86033f43b1d205c956677366e8f9c3ae775041 Mon Sep 17 00:00:00 2001
From: waitingkuo <waitingkuo0527@gmail.com>
Date: Tue, 28 Jan 2014 22:58:14 +0800
Subject: [PATCH 1/4] TST: add test cases for parsing duplicated
 multiple-column header

---
 pandas/io/tests/test_parsers.py | 46 +++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 612840e82e3ff..904347acbb655 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -2568,6 +2568,52 @@ def read_table(self, *args, **kwds):
         kwds['buffer_lines'] = 2
         return read_table(*args, **kwds)
 
+    def test_list_of_one_header(self):
+        # a one-element header list should yield flat, single-level columns
+        data = ('A,B,C\n'
+                '1,2,3\n'
+                '4,5,6\n'
+                '7,8,9\n')
+        result = self.read_csv(StringIO(data), header=[0])
+
+        rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+        expected = DataFrame(rows, columns=['A', 'B', 'C'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_list_of_multiple_headers(self):
+        # two header rows should yield two-level MultiIndex columns
+        data = ('A,B,C\n'
+                'a,b,c\n'
+                '1,2,3\n'
+                '4,5,6\n'
+                '7,8,9\n')
+        result = self.read_csv(StringIO(data), header=[0, 1])
+
+        rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+        levels = [['A', 'B', 'C'], ['a', 'b', 'c']]
+        expected = DataFrame(rows, columns=pd.MultiIndex.from_arrays(levels))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_list_of_multiple_headers_with_duplicated_column_pairs(self):
+        # repeated (level 0, level 1) pairs are de-duplicated by appending
+        # '.N' to the last level
+        data = ('A,A,A,A,A,B,B\n'
+                'a,b,b,b,c,c,c\n'
+                '1,2,3,4,5,6,7\n'
+                '1,2,3,4,5,6,7\n'
+                '1,2,3,4,5,6,7\n')
+        result = self.read_csv(StringIO(data), header=[0, 1])
+
+        rows = [[1, 2, 3, 4, 5, 6, 7] for _ in range(3)]
+        levels = [['A', 'A', 'A', 'A', 'A', 'B', 'B'],
+                  ['a', 'b', 'b.1', 'b.2', 'c', 'c', 'c.1']]
+        expected = DataFrame(rows, columns=pd.MultiIndex.from_arrays(levels))
+
+        tm.assert_frame_equal(result, expected)
+
+
     def test_compact_ints(self):
         data = ('0,1,0,0\n'
                 '1,1,0,0\n'

From 08ed42615284277738f54724c496b4ff6597b9c8 Mon Sep 17 00:00:00 2001
From: waitingkuo <waitingkuo0527@gmail.com>
Date: Tue, 28 Jan 2014 23:00:06 +0800
Subject: [PATCH 2/4] BUG: fix the bug when parsing multiple-column header

---
 pandas/parser.pyx | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index bb93097debf71..2445ddc50c9a6 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -460,7 +460,8 @@ cdef class TextReader:
             self.parser_start = 0
             self.header = []
         else:
-            if isinstance(header, list) and len(header):
+            if isinstance(header, list) and len(header) >= 2:
+                # FIXME
                 # need to artifically skip the final line
                 # which is still a header line
                 header = list(header)
@@ -473,6 +474,11 @@ cdef class TextReader:
                 self.has_mi_columns = 1
                 self.header = header
             else:
+                # if the header is a list with length 1
+                #   set the header as the only element in the list
+                if isinstance(header, list) and len(header) == 1:
+                    header = header[0]
+
                 self.parser.header_start = header
                 self.parser.header_end = header
                 self.parser.header = header
@@ -653,6 +659,40 @@ cdef class TextReader:
                 data_line = hr + 1
                 header.append(this_header)
 
+            #
+            # Append a sequence number to duplicated column pairs
+            #
+            # e.g. [['a', 'a', 'a', 'b'],
+            #       ['A', 'A', 'B', 'C']]
+            #   ==>
+            #      [['a', 'a',   'a', 'b'],
+            #       ['A', 'A.1', 'B', 'C']]
+            #
+            if self.has_mi_columns:
+
+                # zip the header, so that we can easily find the duplicated pair
+                header = zip(*header)
+
+                counts = {}
+                for i, column in enumerate(header):
+
+                    # Check whether the column is duplicated
+                    count = counts.get(column, 0)
+                    if count > 0:
+                        #
+                        # FIXME
+                        # An extra header line was appended earlier (search FIXME in this
+                        # file), so the seq number goes on the second-to-last element
+                        #
+                        tmp_column = list(column)
+                        tmp_column[-2] = '%s.%d' % (tmp_column[-2], count)
+                        header[i] = tuple(tmp_column)
+
+                    counts[column] = count + 1
+
+                # unzip the header
+                header = [list(x) for x in zip(*header)]
+
             if self.names is not None:
                 header = [ self.names ]
 

From b7079a6653ff18684525b63f8d24222c59d218b8 Mon Sep 17 00:00:00 2001
From: waitingkuo <waitingkuo0527@gmail.com>
Date: Wed, 29 Jan 2014 21:23:40 +0800
Subject: [PATCH 3/4] BUG: use lzip instead of zip to fix py3 compatibility issue

---
 pandas/parser.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 2445ddc50c9a6..aaf66737f2176 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -30,6 +30,7 @@ import numpy as np
 cimport util
 
 import pandas.lib as lib
+from pandas.compat import lzip
 
 import time
 import os
@@ -671,7 +672,7 @@ cdef class TextReader:
             if self.has_mi_columns:
 
                 # zip the header, so that we can easily find the duplicated pair
-                header = zip(*header)
+                header = lzip(*header)
 
                 counts = {}
                 for i, column in enumerate(header):
@@ -691,7 +692,7 @@ cdef class TextReader:
                     counts[column] = count + 1
 
                 # unzip the header
-                header = [list(x) for x in zip(*header)]
+                header = [list(x) for x in lzip(*header)]
 
             if self.names is not None:
                 header = [ self.names ]

From 6c3566308ad8031b788adbc33981ba3ac5e1e1f8 Mon Sep 17 00:00:00 2001
From: waitingkuo <waitingkuo0527@gmail.com>
Date: Thu, 13 Mar 2014 21:33:15 +0800
Subject: [PATCH 4/4] ENH: give warning if duplicated columns have been found

---
 pandas/parser.pyx | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index aaf66737f2176..c4df3581e03cc 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -593,6 +593,7 @@ cdef class TextReader:
             char *errors = "strict"
 
         header = []
+        is_duplicated = False
 
         if self.parser.header_start >= 0:
 
@@ -640,6 +641,9 @@ cdef class TextReader:
                     count = counts.get(name, 0)
                     if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns:
                         this_header.append('%s.%d' % (name, count))
+
+                        # for warning later
+                        is_duplicated = True
                     else:
                         this_header.append(name)
                     counts[name] = count + 1
@@ -689,6 +693,9 @@ cdef class TextReader:
                         tmp_column[-2] = '%s.%d' % (tmp_column[-2], count)
                         header[i] = tuple(tmp_column)
 
+                        # for warning later
+                        is_duplicated = True
+
                     counts[column] = count + 1
 
                 # unzip the header
@@ -751,6 +758,9 @@ cdef class TextReader:
             elif self.allow_leading_cols and passed_count < field_count:
                 self.leading_cols = field_count - passed_count
 
+        if self.mangle_dupe_cols and is_duplicated:
+            warnings.warn('Duplicated columns have been mangled', UserWarning)
+
         return header, field_count
 
     cdef _implicit_index_count(self):