Merge pull request #3511 from y-p/PR_mangle_dupe_cols_option

y-p · y-p · commit 4ecb04efc4a8 · 2013-05-05T02:14:07.000-07:00
ENH: add mode.mangle_dupe_cols option GH3468
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -45,6 +45,11 @@ pandas 0.11.1
     **KeyError** if **key** is not a valid store object.
   - The repr() for (Multi)Index now obeys display.max_seq_items rather
     than numpy threshold print options. (GH3426_, GH3466_)
+  - Added mangle_dupe_cols option to read_table/csv, allowing users
+    to control legacy behaviour re dupe cols (A, A.1, A.2 vs A, A ) (GH3468_)
+    Note: The default value will change in 0.12 to the "no mangle" behaviour.
+    If your code relies on the current mangling behaviour (A, A.1, A.2),
+    explicitly specify mangle_dupe_cols=True in your calls.
 
 **Bug Fixes**
 
@@ -87,6 +92,7 @@ pandas 0.11.1
 .. _GH3038: https://github.com/pydata/pandas/issues/3038
 .. _GH3510: https://github.com/pydata/pandas/issues/3510
 .. _GH3437: https://github.com/pydata/pandas/issues/3437
+.. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
 .. _GH3457: https://github.com/pydata/pandas/issues/3457
 .. _GH3477: https://github.com/pydata/pandas/issues/3477
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -266,7 +266,8 @@ def _read(filepath_or_buffer, kwds):
     'verbose': False,
     'encoding': None,
     'squeeze': False,
-    'compression': None
+    'compression': None,
+    'mangle_dupe_cols': True,
 }
 
 
@@ -352,7 +353,9 @@ def parser_f(filepath_or_buffer,
 
                  verbose=False,
                  encoding=None,
-                 squeeze=False):
+                 squeeze=False,
+                 mangle_dupe_cols=True
+                 ):
 
         # Alias sep -> delimiter.
         if delimiter is None:
@@ -408,7 +411,9 @@ def parser_f(filepath_or_buffer,
                     warn_bad_lines=warn_bad_lines,
                     error_bad_lines=error_bad_lines,
                     low_memory=low_memory,
-                    buffer_lines=buffer_lines)
+                    buffer_lines=buffer_lines,
+                    mangle_dupe_cols=mangle_dupe_cols
+            )
 
         return _read(filepath_or_buffer, kwds)
 
@@ -1154,6 +1159,7 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
+        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
 
         self.has_index_names = False
         if 'has_index_names' in kwds:
@@ -1335,12 +1341,13 @@ def _infer_columns(self):
                 else:
                     columns.append(c)
 
-            counts = {}
-            for i, col in enumerate(columns):
-                cur_count = counts.get(col, 0)
-                if cur_count > 0:
-                    columns[i] = '%s.%d' % (col, cur_count)
-                counts[col] = cur_count + 1
+            if self.mangle_dupe_cols:
+                counts = {}
+                for i, col in enumerate(columns):
+                    cur_count = counts.get(col, 0)
+                    if cur_count > 0:
+                        columns[i] = '%s.%d' % (col, cur_count)
+                    counts[col] = cur_count + 1
 
             self._clear_buffer()
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -589,14 +589,21 @@ def test_string_nas(self):
         tm.assert_frame_equal(result, expected)
 
     def test_duplicate_columns(self):
-        data = """A,A,B,B,B
-1,2,3,4,5
-6,7,8,9,10
-11,12,13,14,15
-"""
-        df = self.read_table(StringIO(data), sep=',')
-        self.assert_(np.array_equal(df.columns,
-                                    ['A', 'A.1', 'B', 'B.1', 'B.2']))
+        for engine in ['python', 'c']:
+            data = """A,A,B,B,B
+    1,2,3,4,5
+    6,7,8,9,10
+    11,12,13,14,15
+    """
+            # check default behaviour
+            df = self.read_table(StringIO(data), sep=',',engine=engine)
+            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
+
+            df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False)
+            self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])
+
+            df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True)
+            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
 
     def test_csv_mixed_type(self):
         data = """A,B,C
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -249,6 +249,7 @@ cdef class TextReader:
         object dtype
         object encoding
         object compression
+        object mangle_dupe_cols
         set noconvert, usecols
 
     def __cinit__(self, source,
@@ -298,11 +299,14 @@ cdef class TextReader:
                   buffer_lines=None,
                   skiprows=None,
                   skip_footer=0,
-                  verbose=False):
+                  verbose=False,
+                  mangle_dupe_cols=True):
 
         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize
 
+        self.mangle_dupe_cols=mangle_dupe_cols
+
         # For timekeeping
         self.clocks = []
 
@@ -571,8 +575,9 @@ cdef class TextReader:
                 if name == '':
                     name = 'Unnamed: %d' % i
 
+
                 count = counts.get(name, 0)
-                if count > 0:
+                if count > 0 and self.mangle_dupe_cols:
                     header.append('%s.%d' % (name, count))
                 else:
                     header.append(name)