diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index b443bb74e98ea..4614ce9acf3d5 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -325,6 +325,7 @@ Bug Fixes - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`) + - Bug in ``Index`` creation from ``Timestamp`` with mixed tz coerces to UTC (:issue:`11488`) - Bug in ``to_numeric`` where it does not raise if input is more than one dimension (:issue:`11776`) @@ -348,4 +349,6 @@ Bug Fixes - Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`) +- Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`) + - Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`) diff --git a/pandas/core/format.py b/pandas/core/format.py index 07f16a5ef480a..91ac6f11f4ae9 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -6,7 +6,7 @@ import sys from pandas.core.base import PandasObject -from pandas.core.common import adjoin, notnull +from pandas.core.common import adjoin, isnull, notnull from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u, @@ -1631,6 +1631,7 @@ def _save_chunk(self, start_i, end_i): ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format, + decimal=self.decimal, date_format=self.date_format, quoting=self.quoting) @@ -1983,7 +1984,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', class GenericArrayFormatter(object): def __init__(self, values, digits=7, formatter=None, na_rep='NaN', - space=12, float_format=None, justify='right'): + space=12, float_format=None, justify='right', + decimal='.', quoting=None): self.values = values self.digits = digits self.na_rep = na_rep @@ -1991,6 +1993,8 @@ def 
__init__(self, values, digits=7, formatter=None, na_rep='NaN', self.formatter = formatter self.float_format = float_format self.justify = justify + self.decimal = decimal + self.quoting = quoting def get_result(self): fmt_values = self._format_strings() @@ -2101,6 +2105,42 @@ def _format_strings(self): return fmt_values + def get_formatted_data(self): + """Returns the array with its float values converted into strings using + the parameters given at initialisation. + + Note: the method `.get_result()` does something similar, but with a + fixed-width output suitable for screen printing. The output here is not + fixed-width. + """ + values = self.values + mask = isnull(values) + + # the following variable is to be applied on each value to format it + # according to the string containing the float format, self.float_format + # and the character to use as decimal separator, self.decimal + formatter = None + if self.float_format and self.decimal != '.': + formatter = lambda v: ( + (self.float_format % v).replace('.', self.decimal, 1)) + elif self.decimal != '.': # no float format + formatter = lambda v: str(v).replace('.', self.decimal, 1) + elif self.float_format: # no special decimal separator + formatter = lambda v: self.float_format % v + + if formatter is None and not self.quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = self.na_rep + if formatter: + imask = (~mask).ravel() + values.flat[imask] = np.array( + [formatter(val) for val in values.ravel()[imask]]) + + return values + class IntArrayFormatter(GenericArrayFormatter): diff --git a/pandas/core/index.py b/pandas/core/index.py index 552eb7fb81180..8325e16515b90 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3926,6 +3926,14 @@ def _convert_slice_indexer(self, key, kind=None): # translate to locations return self.slice_indexer(key.start, key.stop, key.step) + def _format_native_types(self, na_rep='', float_format=None, + decimal='.', 
quoting=None, **kwargs): + from pandas.core.format import FloatArrayFormatter + formatter = FloatArrayFormatter(self.values, na_rep=na_rep, + float_format=float_format, + decimal=decimal, quoting=quoting) + return formatter.get_formatted_data() + def get_value(self, series, key): """ we always want to get an index value, never a value """ if not np.isscalar(key): @@ -4448,12 +4456,27 @@ def _reference_duplicate_name(self, name): # count the times name equals an element in self.names. return sum(name == n for n in self.names) > 1 - def _format_native_types(self, **kwargs): - # we go through the levels and format them - levels = [level._format_native_types(**kwargs) - for level in self.levels] - mi = MultiIndex(levels=levels, labels=self.labels, names=self.names, + def _format_native_types(self, na_rep='nan', **kwargs): + new_levels = [] + new_labels = [] + + # go through the levels and format them + for level, label in zip(self.levels, self.labels): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = (label == -1) + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + label = label.values() + label[mask] = nan_index + new_levels.append(level) + new_labels.append(label) + + # reconstruct the multi-index + mi = MultiIndex(levels=new_levels, labels=new_labels, names=self.names, sortorder=self.sortorder, verify_integrity=False) + return mi.values @property diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 123dca9f3ee5c..961f5f437baf9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1390,28 +1390,12 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.' 
values = self.values if slicer is not None: values = values[:, slicer] - mask = isnull(values) - - formatter = None - if float_format and decimal != '.': - formatter = lambda v : (float_format % v).replace('.',decimal,1) - elif decimal != '.': - formatter = lambda v : ('%g' % v).replace('.',decimal,1) - elif float_format: - formatter = lambda v : float_format % v - if formatter is None and not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype='object') - - values[mask] = na_rep - if formatter: - imask = (~mask).ravel() - values.flat[imask] = np.array( - [formatter(val) for val in values.ravel()[imask]]) - - return values + from pandas.core.format import FloatArrayFormatter + formatter = FloatArrayFormatter(values, na_rep=na_rep, + float_format=float_format, + decimal=decimal, quoting=quoting) + return formatter.get_formatted_data() def should_store(self, value): # when inserting a column should not coerce integers to floats diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index f2290877676fa..6c4e4dd844fc9 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2932,6 +2932,52 @@ def test_to_csv_decimal(self): expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n' self.assertEqual(df.to_csv(decimal=',',sep=';', float_format = '%.2f'), expected_float_format) + # GH 11553: testing if decimal is taken into account for '0.0' + df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1}) + expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n' + self.assertEqual( + df.to_csv(index=False, decimal='^'), expected) + + # same but for an index + self.assertEqual( + df.set_index('a').to_csv(decimal='^'), expected) + + # same for a multi-index + self.assertEqual( + df.set_index(['a', 'b']).to_csv(decimal="^"), expected) + + def test_to_csv_float_format(self): + # testing if float_format is taken into account for the index + # GH 11553 + df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1}) + expected = 
'a,b,c\n0,2.20,1\n1,3.30,1\n' + self.assertEqual( + df.set_index('a').to_csv(float_format='%.2f'), expected) + + # same for a multi-index + self.assertEqual( + df.set_index(['a', 'b']).to_csv(float_format='%.2f'), expected) + + def test_to_csv_na_rep(self): + # testing if NaN values are correctly represented in the index + # GH 11553 + df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n0.0,0,2\n_,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + + # now with an index containing only NaNs + df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n_,0,2\n_,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + + # check if na_rep parameter does not break anything when no NaN + df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]}) + expected = "a,b,c\n0,0,2\n0,1,3\n" + self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected) + self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected) + def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({'A': pd.date_range('20130101',periods=5,freq='s')})