
Commit 0bd5e77

Author: y-p

Merge pull request #3458 from y-p/GH3454
BUG: to_csv handles cols= reordering, dupe cols GH3454
2 parents: 67ad556 + d5d9534

3 files changed: +77 -6 lines


RELEASE.rst (+4 -1)
@@ -48,7 +48,8 @@ pandas 0.12.0
   - Fixed an esoteric excel reading bug, xlrd >= 0.9.0 now required for excel
     support. Should provide python3 support (for reading) which has been
     lacking. (GH3164_)
-  - Fix to_csv issue when having a large number of rows and ``NaT`` in some
+  - Addressed handling of dupe columns in df.to_csv, in both the new and legacy engines (GH3454_, GH3457_)
+  - Fix to_csv issue when having a large number of rows and ``NaT`` in some
     columns (GH3437_)
   - ``.loc`` was not raising when passed an integer list (GH3449_)
   - Unordered time series selection was misbehaving when using label slicing (GH3448_)
@@ -57,6 +58,8 @@ pandas 0.12.0
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
 .. _GH3379: https://github.com/pydata/pandas/issues/3379
+.. _GH3454: https://github.com/pydata/pandas/issues/3454
+.. _GH3457: https://github.com/pydata/pandas/issues/3457
 .. _GH3038: https://github.com/pydata/pandas/issues/3038
 .. _GH3437: https://github.com/pydata/pandas/issues/3437
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
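
For context, a minimal sketch of what the GH3454 entry refers to. The file name is illustrative, and cols= is the 0.12-era keyword (later pandas versions renamed it to columns=):

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})

# Before the fix, a cols= selection in a different order than df.columns
# was not reliably honored; after it, the requested selection and
# ordering are what lands in the file.
df.to_csv('reordered.csv', cols=['C', 'A'])

# expected header: ",C,A" rather than ",A,B,C"
print(open('reordered.csv').readline())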

pandas/core/format.py (+21 -4)
@@ -772,6 +772,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
 
         self.engine = engine  # remove for 0.12
         self.obj = obj
+
         self.path_or_buf = path_or_buf
         self.sep = sep
         self.na_rep = na_rep
@@ -789,13 +790,27 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
 
         self.line_terminator = line_terminator
 
-        if cols is None:
-            cols = obj.columns
+        # GH3457
+        if not self.obj.columns.is_unique and engine == 'python':
+            msg = "columns.is_unique == False not supported with engine='python'"
+            raise NotImplementedError(msg)
 
+        if cols is not None:
+            if isinstance(cols, Index):
+                cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
+            else:
+                cols = list(cols)
+            self.obj = self.obj.loc[:, cols]
+
+        # update columns to include possible multiplicity of dupes
+        # and make sure cols is just a list of labels
+        cols = self.obj.columns
         if isinstance(cols, Index):
             cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
         else:
             cols = list(cols)
+
+        # save it
         self.cols = cols
@@ -804,7 +819,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
         self.data = [None] * ncols
 
         if self.obj.columns.is_unique:
-            self.colname_map = dict((k, i) for i, k in enumerate(obj.columns))
+            self.colname_map = dict((k, i) for i, k in enumerate(self.obj.columns))
         else:
             ks = [set(x.items) for x in self.blocks]
             u = len(reduce(lambda a, x: a.union(x), ks, set()))
@@ -1024,7 +1039,9 @@ def _save_chunk(self, start_i, end_i):
                 # self.data is a preallocated list
                 self.data[self.colname_map[k]] = d[j]
         else:
-            for i in range(len(self.cols)):
+            # self.obj should contain a proper view of the dataframe,
+            # with the specified ordering of cols if cols was specified
+            for i in range(len(self.obj.columns)):
                 self.data[i] = self.obj.icol(i).values[slicer].tolist()
 
         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
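
The net effect of the format.py changes, as a usage sketch (the engine= keyword is the transitional option flagged by the "remove for 0.12" comment above, and the file name is illustrative):

import pandas as pd

# Duplicate column labels now round-trip through the default (new) engine.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
df.to_csv('dupes.csv')  # header written as ",a,a,b"

# The legacy python engine raises instead of writing ambiguous output (GH3457).
try:
    df.to_csv('dupes.csv', engine='python')
except NotImplementedError as exc:
    print(exc)  # columns.is_unique == False not supported with engine='python'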

pandas/tests/test_frame.py (+52 -1)
@@ -28,6 +28,7 @@
 from pandas.util.testing import (assert_almost_equal,
                                  assert_series_equal,
                                  assert_frame_equal,
+                                 makeCustomDataframe as mkdf,
                                  ensure_clean)
 from pandas.util import py3compat
 from pandas.util.compat import OrderedDict
@@ -4621,9 +4622,59 @@ def test_to_csv_from_csv(self):
         xp.columns = map(int, xp.columns)
         assert_frame_equal(xp, rs)
 
+    def test_to_csv_cols_reordering(self):
+        # GH3454
+        import pandas as pd
+
+        def _check_df(df, cols=None):
+            with ensure_clean() as path:
+                df.to_csv(path, cols=cols, engine='python')
+                rs_p = pd.read_csv(path, index_col=0)
+                df.to_csv(path, cols=cols, chunksize=chunksize)
+                rs_c = pd.read_csv(path, index_col=0)
+
+            if cols:
+                df = df[cols]
+            assert (rs_c.columns == rs_p.columns).all()
+            assert_frame_equal(df, rs_c, check_names=False)
+
+        chunksize = 5
+        N = int(chunksize * 2.5)
+
+        df = mkdf(N, 3)
+        cs = df.columns
+        cols = [cs[2], cs[0]]
+        _check_df(df, cols)
+
+    def test_to_csv_legacy_raises_on_dupe_cols(self):
+        df = mkdf(10, 3)
+        df.columns = ['a', 'a', 'b']
+        with ensure_clean() as path:
+            self.assertRaises(NotImplementedError, df.to_csv, path, engine='python')
+
+    def test_to_csv_new_dupe_cols(self):
+        import pandas as pd
+        def _check_df(df, cols=None):
+            with ensure_clean() as path:
+                df.to_csv(path, cols=cols, chunksize=chunksize)
+                rs_c = pd.read_csv(path, index_col=0)
+                rs_c.columns = df.columns
+                assert_frame_equal(df, rs_c, check_names=False)
+
+        chunksize = 5
+        N = int(chunksize * 2.5)
+
+        # dupe cols
+        df = mkdf(N, 3)
+        df.columns = ['a', 'a', 'b']
+        _check_df(df, None)
+
+        # dupe cols with selection
+        cols = ['b', 'a']
+        _check_df(df, cols)
+
     @slow
     def test_to_csv_moar(self):
-        from pandas.util.testing import makeCustomDataframe as mkdf
         path = '__tmp_to_csv_moar__'
 
         def _do_test(df, path, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None,
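
The new tests lean on makeCustomDataframe (mkdf) from pandas.util.testing, which builds frames with synthetic, position-encoded labels. Roughly (a sketch; the exact label format is the helper's internal convention):

from pandas.util.testing import makeCustomDataframe as mkdf

df = mkdf(5, 3)      # 5 rows, 3 columns
print(df.columns)    # synthetic labels along the lines of C_l0_g0, C_l0_g1, C_l0_g2
print(df.index[:2])  # row labels in the same style, e.g. R_l0_g0, R_l0_g1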
