Merge pull request #11681 from nbonnotte/to_csv-formatting-11553

jreback · jreback · commit f295c0acd6cd · 2015-12-27T12:24:32.000-05:00
API: DataFrame.to_csv formatting parameters for float indexes
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -325,6 +325,7 @@ Bug Fixes
 
 
 - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`)
+
 - Bug in ``Index`` creation from ``Timestamp`` with mixed tz coerces to UTC (:issue:`11488`)
 - Bug in ``to_numeric`` where it does not raise if input is more than one dimension (:issue:`11776`)
 
@@ -348,4 +349,6 @@ Bug Fixes
 
 - Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`)
 
+- Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`)
+
 - Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`)
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -6,7 +6,7 @@
 import sys
 
 from pandas.core.base import PandasObject
-from pandas.core.common import adjoin, notnull
+from pandas.core.common import adjoin, isnull, notnull
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas import compat
 from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
@@ -1631,6 +1631,7 @@ def _save_chunk(self, start_i, end_i):
         ix = data_index.to_native_types(slicer=slicer,
                                         na_rep=self.na_rep,
                                         float_format=self.float_format,
+                                        decimal=self.decimal,
                                         date_format=self.date_format,
                                         quoting=self.quoting)
 
@@ -1983,14 +1984,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
 class GenericArrayFormatter(object):
 
     def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
-                 space=12, float_format=None, justify='right'):
+                 space=12, float_format=None, justify='right',
+                 decimal='.', quoting=None):
         self.values = values
         self.digits = digits
         self.na_rep = na_rep
         self.space = space
         self.formatter = formatter
         self.float_format = float_format
         self.justify = justify
+        self.decimal = decimal
+        self.quoting = quoting
 
     def get_result(self):
         fmt_values = self._format_strings()
@@ -2101,6 +2105,42 @@ def _format_strings(self):
 
         return fmt_values
 
+    def get_formatted_data(self):
+        """Returns the array with its float values converted into strings using
+        the parameters given at initialisation.
+
+        Note: the method `.get_result()` does something similar, but with a
+        fixed-width output suitable for screen printing. The output here is not
+        fixed-width.
+        """
+        values = self.values
+        mask = isnull(values)
+
+        # ``formatter`` is applied to each non-null value; it combines the
+        # printf-style pattern in self.float_format (if any) with the decimal
+        # separator in self.decimal. It stays None when neither is customised.
+        formatter = None
+        if self.float_format and self.decimal != '.':
+            formatter = lambda v: (
+                (self.float_format % v).replace('.', self.decimal, 1))
+        elif self.decimal != '.':  # no float format
+            formatter = lambda v: str(v).replace('.', self.decimal, 1)
+        elif self.float_format:  # no special decimal separator
+            formatter = lambda v: self.float_format % v
+
+        if formatter is None and not self.quoting:
+            values = values.astype(str)
+        else:
+            values = np.array(values, dtype='object')
+
+        values[mask] = self.na_rep
+        if formatter:
+            imask = (~mask).ravel()
+            values.flat[imask] = np.array(
+                [formatter(val) for val in values.ravel()[imask]])
+
+        return values
+
 
 class IntArrayFormatter(GenericArrayFormatter):
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -3926,6 +3926,14 @@ def _convert_slice_indexer(self, key, kind=None):
         # translate to locations
         return self.slice_indexer(key.start, key.stop, key.step)
 
+    def _format_native_types(self, na_rep='', float_format=None,
+                             decimal='.', quoting=None, **kwargs):
+        from pandas.core.format import FloatArrayFormatter
+        formatter = FloatArrayFormatter(self.values, na_rep=na_rep,
+                                        float_format=float_format,
+                                        decimal=decimal, quoting=quoting)
+        return formatter.get_formatted_data()
+
     def get_value(self, series, key):
         """ we always want to get an index value, never a value """
         if not np.isscalar(key):
@@ -4448,12 +4456,27 @@ def _reference_duplicate_name(self, name):
         # count the times name equals an element in self.names.
         return sum(name == n for n in self.names) > 1
 
-    def _format_native_types(self, **kwargs):
-        # we go through the levels and format them
-        levels = [level._format_native_types(**kwargs)
-                  for level in self.levels]
-        mi = MultiIndex(levels=levels, labels=self.labels, names=self.names,
+    def _format_native_types(self, na_rep='nan', **kwargs):
+        new_levels = []
+        new_labels = []
+
+        # go through the levels and format them
+        for level, label in zip(self.levels, self.labels):
+            level = level._format_native_types(na_rep=na_rep, **kwargs)
+            # add nan values, if there are any
+            mask = (label == -1)
+            if mask.any():
+                nan_index = len(level)
+                level = np.append(level, na_rep)
+                label = label.values()
+                label[mask] = nan_index
+            new_levels.append(level)
+            new_labels.append(label)
+
+        # reconstruct the multi-index
+        mi = MultiIndex(levels=new_levels, labels=new_labels, names=self.names,
                         sortorder=self.sortorder, verify_integrity=False)
+
         return mi.values
 
     @property
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1390,28 +1390,12 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.'
         values = self.values
         if slicer is not None:
             values = values[:, slicer]
-        mask = isnull(values)
-
-        formatter = None
-        if float_format and decimal != '.':
-            formatter = lambda v : (float_format % v).replace('.',decimal,1)
-        elif decimal != '.':
-            formatter = lambda v : ('%g' % v).replace('.',decimal,1)
-        elif float_format:
-            formatter = lambda v : float_format % v
 
-        if formatter is None and not quoting:
-            values = values.astype(str)
-        else:
-            values = np.array(values, dtype='object')
-
-        values[mask] = na_rep
-        if formatter:
-            imask = (~mask).ravel()
-            values.flat[imask] = np.array(
-                [formatter(val) for val in values.ravel()[imask]])
-
-        return values
+        from pandas.core.format import FloatArrayFormatter
+        formatter = FloatArrayFormatter(values, na_rep=na_rep,
+                                        float_format=float_format,
+                                        decimal=decimal, quoting=quoting)
+        return formatter.get_formatted_data()
 
     def should_store(self, value):
         # when inserting a column should not coerce integers to floats
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -2932,6 +2932,52 @@ def test_to_csv_decimal(self):
         expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
         self.assertEqual(df.to_csv(decimal=',',sep=';', float_format = '%.2f'), expected_float_format)
 
+        # GH 11553: testing if decimal is taken into account for '0.0'
+        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
+        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
+        self.assertEqual(
+            df.to_csv(index=False, decimal='^'), expected)
+
+        # same but for an index
+        self.assertEqual(
+            df.set_index('a').to_csv(decimal='^'), expected)
+
+        # same for a multi-index
+        self.assertEqual(
+            df.set_index(['a', 'b']).to_csv(decimal="^"), expected)
+
+    def test_to_csv_float_format(self):
+        # testing if float_format is taken into account for the index
+        # GH 11553
+        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
+        expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'
+        self.assertEqual(
+            df.set_index('a').to_csv(float_format='%.2f'), expected)
+
+        # same for a multi-index
+        self.assertEqual(
+            df.set_index(['a', 'b']).to_csv(float_format='%.2f'), expected)
+
+    def test_to_csv_na_rep(self):
+        # testing if NaN values are correctly represented in the index
+        # GH 11553
+        df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n0.0,0,2\n_,1,3\n"
+        self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
+        self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)
+
+        # now with an index containing only NaNs
+        df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n_,0,2\n_,1,3\n"
+        self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
+        self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)
+
+        # check that na_rep does not break anything when there are no NaNs
+        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
+        expected = "a,b,c\n0,0,2\n0,1,3\n"
+        self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
+        self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)
+
     def test_to_csv_date_format(self):
         # GH 10209
         df_sec = DataFrame({'A': pd.date_range('20130101',periods=5,freq='s')})