diff --git a/RELEASE.rst b/RELEASE.rst index 6f55b7cd4490f..aac34c6cf8a5e 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -48,7 +48,8 @@ pandas 0.12.0 - Fixed an esoteric excel reading bug, xlrd>= 0.9.0 now required for excel support. Should provide python3 support (for reading) which has been lacking. (GH3164_) - - Fix to_csv issue when having a large number of rows and ``NaT`` in some + - Addressed handling of duplicate columns in ``df.to_csv`` for both the new and legacy (``engine='python'``) writers (GH3454_, GH3457_) + - Fix to_csv issue when having a large number of rows and ``NaT`` in some columns (GH3437_) - ``.loc`` was not raising when passed an integer list (GH3449_) - Unordered time series selection was misbehaving when using label slicing (GH3448_) @@ -57,6 +58,8 @@ pandas 0.12.0 .. _GH3164: https://github.com/pydata/pandas/issues/3164 .. _GH3251: https://github.com/pydata/pandas/issues/3251 .. _GH3379: https://github.com/pydata/pandas/issues/3379 +.. _GH3454: https://github.com/pydata/pandas/issues/3454 +.. _GH3457: https://github.com/pydata/pandas/issues/3457 .. _GH3038: https://github.com/pydata/pandas/issues/3038 .. _GH3437: https://github.com/pydata/pandas/issues/3437 .. 
_GH3455: https://github.com/pydata/pandas/issues/3455 diff --git a/pandas/core/format.py b/pandas/core/format.py index 22a1f99c6e2d9..7226bd14e5576 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -772,6 +772,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.engine = engine # remove for 0.12 self.obj = obj + self.path_or_buf = path_or_buf self.sep = sep self.na_rep = na_rep @@ -789,13 +790,27 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.line_terminator = line_terminator - if cols is None: - cols = obj.columns + #GH3457 + if not self.obj.columns.is_unique and engine == 'python': + msg= "columns.is_unique == False not supported with engine='python'" + raise NotImplementedError(msg) + if cols is not None: + if isinstance(cols,Index): + cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) + else: + cols=list(cols) + self.obj = self.obj.loc[:,cols] + + # update columns to include possible multiplicity of dupes + # and make sure cols is just a list of labels + cols = self.obj.columns if isinstance(cols,Index): cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) else: cols=list(cols) + + # save it self.cols = cols # preallocate data 2d list @@ -804,7 +819,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.data =[None] * ncols if self.obj.columns.is_unique: - self.colname_map = dict((k,i) for i,k in enumerate(obj.columns)) + self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns)) else: ks = [set(x.items) for x in self.blocks] u = len(reduce(lambda a,x: a.union(x),ks,set())) @@ -1024,7 +1039,9 @@ def _save_chunk(self, start_i, end_i): # self.data is a preallocated list self.data[self.colname_map[k]] = d[j] else: - for i in range(len(self.cols)): + # self.obj should contain a proper view of the dataframes + # with the specified ordering of cols if cols was specified + for i in 
range(len(self.obj.columns)): self.data[i] = self.obj.icol(i).values[slicer].tolist() ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6bba9f6d32efc..530128a100d0b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -28,6 +28,7 @@ from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, + makeCustomDataframe as mkdf, ensure_clean) from pandas.util import py3compat from pandas.util.compat import OrderedDict @@ -4621,9 +4622,59 @@ def test_to_csv_from_csv(self): xp.columns = map(int,xp.columns) assert_frame_equal(xp,rs) + def test_to_csv_cols_reordering(self): + # GH3454 + import pandas as pd + + def _check_df(df,cols=None): + with ensure_clean() as path: + df.to_csv(path,cols = cols,engine='python') + rs_p = pd.read_csv(path,index_col=0) + df.to_csv(path,cols = cols,chunksize=chunksize) + rs_c = pd.read_csv(path,index_col=0) + + if cols: + df = df[cols] + assert (rs_c.columns==rs_p.columns).all() + assert_frame_equal(df,rs_c,check_names=False) + + chunksize=5 + N = int(chunksize*2.5) + + df= mkdf(N, 3) + cs = df.columns + cols = [cs[2],cs[0]] + _check_df(df,cols) + + def test_to_csv_legacy_raises_on_dupe_cols(self): + df= mkdf(10, 3) + df.columns = ['a','a','b'] + with ensure_clean() as path: + self.assertRaises(NotImplementedError,df.to_csv,path,engine='python') + + def test_to_csv_new_dupe_cols(self): + import pandas as pd + def _check_df(df,cols=None): + with ensure_clean() as path: + df.to_csv(path,cols = cols,chunksize=chunksize) + rs_c = pd.read_csv(path,index_col=0) + rs_c.columns = df.columns + assert_frame_equal(df,rs_c,check_names=False) + + chunksize=5 + N = int(chunksize*2.5) + + # dupe cols + df= mkdf(N, 3) + df.columns = ['a','a','b'] + _check_df(df,None) + + # dupe cols with selection + cols = ['b','a'] + _check_df(df,cols) + @slow def test_to_csv_moar(self): - from 
pandas.util.testing import makeCustomDataframe as mkdf path = '__tmp_to_csv_moar__' def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None,