Skip to content

Commit f295c0a

Browse files
committed
Merge pull request #11681 from nbonnotte/to_csv-formatting-11553
API: DataFrame.to_csv formatting parameters for float indexes
2 parents c45dc76 + 9302811 commit f295c0a

File tree

5 files changed

+124
-28
lines changed

5 files changed

+124
-28
lines changed

doc/source/whatsnew/v0.18.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Bug Fixes
325325

326326

327327
- Bug where timezone info was lost when broadcasting a scalar datetime to a ``DataFrame`` (:issue:`11682`)
328+
328329
- Bug in ``Index`` creation from ``Timestamp`` objects with mixed timezones, where the values were coerced to UTC (:issue:`11488`)
329330
- Bug in ``to_numeric`` where it does not raise if the input has more than one dimension (:issue:`11776`)
330331

@@ -348,4 +349,6 @@ Bug Fixes
348349

349350
- Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`)
350351

352+
- Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`)
353+
351354
- Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`)

pandas/core/format.py

+42-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import sys
77

88
from pandas.core.base import PandasObject
9-
from pandas.core.common import adjoin, notnull
9+
from pandas.core.common import adjoin, isnull, notnull
1010
from pandas.core.index import Index, MultiIndex, _ensure_index
1111
from pandas import compat
1212
from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
@@ -1631,6 +1631,7 @@ def _save_chunk(self, start_i, end_i):
16311631
ix = data_index.to_native_types(slicer=slicer,
16321632
na_rep=self.na_rep,
16331633
float_format=self.float_format,
1634+
decimal=self.decimal,
16341635
date_format=self.date_format,
16351636
quoting=self.quoting)
16361637

@@ -1983,14 +1984,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
19831984
class GenericArrayFormatter(object):
19841985

19851986
def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
1986-
space=12, float_format=None, justify='right'):
1987+
space=12, float_format=None, justify='right',
1988+
decimal='.', quoting=None):
19871989
self.values = values
19881990
self.digits = digits
19891991
self.na_rep = na_rep
19901992
self.space = space
19911993
self.formatter = formatter
19921994
self.float_format = float_format
19931995
self.justify = justify
1996+
self.decimal = decimal
1997+
self.quoting = quoting
19941998

19951999
def get_result(self):
19962000
fmt_values = self._format_strings()
@@ -2101,6 +2105,42 @@ def _format_strings(self):
21012105

21022106
return fmt_values
21032107

2108+
def get_formatted_data(self):
    """Return the array with its float values converted into strings.

    The conversion uses the parameters given at initialisation:
    ``float_format`` (a ``%``-style format string), ``decimal`` (the
    character used as decimal separator), ``na_rep`` (the replacement
    for missing values) and ``quoting``.

    Note: the method `.get_result()` does something similar, but with a
    fixed-width output suitable for screen printing. The output here is not
    fixed-width.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``self.values``. It has a string
        dtype when neither a formatter nor quoting is in use, and an
        object dtype otherwise.
    """
    values = self.values
    mask = isnull(values)

    # read the settings once so the lambdas below close over plain locals
    float_format = self.float_format
    decimal = self.decimal
    na_rep = self.na_rep

    # the formatter is applied to each non-null value; it stays None when
    # the plain string conversion of the array is already what we want
    formatter = None
    if float_format and decimal != '.':
        formatter = lambda v: (float_format % v).replace('.', decimal, 1)
    elif decimal != '.':  # no float format
        formatter = lambda v: str(v).replace('.', decimal, 1)
    elif float_format:  # no special decimal separator
        formatter = lambda v: float_format % v

    if formatter is None and not self.quoting:
        values = values.astype(str)
        if mask.any():
            # astype(str) produces a fixed-width string dtype; widen it
            # when needed so that a long na_rep is not silently truncated
            na_dtype = np.array(na_rep, dtype=values.dtype.kind).dtype
            if na_dtype.itemsize > values.dtype.itemsize:
                values = values.astype(na_dtype)
    else:
        values = np.array(values, dtype='object')

    values[mask] = na_rep
    if formatter is not None:
        # only the non-null entries go through the formatter
        imask = (~mask).ravel()
        values.flat[imask] = np.array(
            [formatter(val) for val in values.ravel()[imask]])

    return values
2143+
21042144

21052145
class IntArrayFormatter(GenericArrayFormatter):
21062146

pandas/core/index.py

+28-5
Original file line numberDiff line numberDiff line change
@@ -3926,6 +3926,14 @@ def _convert_slice_indexer(self, key, kind=None):
39263926
# translate to locations
39273927
return self.slice_indexer(key.start, key.stop, key.step)
39283928

3929+
def _format_native_types(self, na_rep='', float_format=None,
                         decimal='.', quoting=None, **kwargs):
    """Render the values of this index as strings.

    The work is delegated to ``FloatArrayFormatter.get_formatted_data``,
    which receives ``na_rep``, ``float_format``, ``decimal`` and
    ``quoting``. Any additional keyword arguments are accepted but not
    used.
    """
    # imported inside the method, as in the original code
    from pandas.core.format import FloatArrayFormatter

    options = dict(na_rep=na_rep, float_format=float_format,
                   decimal=decimal, quoting=quoting)
    return FloatArrayFormatter(self.values, **options).get_formatted_data()
3936+
39293937
def get_value(self, series, key):
39303938
""" we always want to get an index value, never a value """
39313939
if not np.isscalar(key):
@@ -4448,12 +4456,27 @@ def _reference_duplicate_name(self, name):
44484456
# count the times name equals an element in self.names.
44494457
return sum(name == n for n in self.names) > 1
44504458

4451-
def _format_native_types(self, **kwargs):
4452-
# we go through the levels and format them
4453-
levels = [level._format_native_types(**kwargs)
4454-
for level in self.levels]
4455-
mi = MultiIndex(levels=levels, labels=self.labels, names=self.names,
4459+
def _format_native_types(self, na_rep='nan', **kwargs):
    """Format every level of the multi-index and return the new values.

    Each level is formatted through its own ``_format_native_types``.
    Where a label array contains ``-1``, ``na_rep`` is appended to the
    formatted level and those labels are pointed at the appended entry.
    A new ``MultiIndex`` is then built from the formatted levels and its
    ``values`` are returned.
    """
    formatted_levels = []
    remapped_labels = []

    for lev, lab in zip(self.levels, self.labels):
        lev_strings = lev._format_native_types(na_rep=na_rep, **kwargs)

        missing = (lab == -1)
        if missing.any():
            # the appended na_rep entry sits right after the existing ones
            na_position = len(lev_strings)
            lev_strings = np.append(lev_strings, na_rep)
            lab = lab.values()
            lab[missing] = na_position

        formatted_levels.append(lev_strings)
        remapped_labels.append(lab)

    # reconstruct the multi-index from the formatted pieces
    rebuilt = MultiIndex(levels=formatted_levels, labels=remapped_labels,
                         names=self.names, sortorder=self.sortorder,
                         verify_integrity=False)
    return rebuilt.values
44584481

44594482
@property

pandas/core/internals.py

+5-21
Original file line numberDiff line numberDiff line change
@@ -1390,28 +1390,12 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.'
13901390
values = self.values
13911391
if slicer is not None:
13921392
values = values[:, slicer]
1393-
mask = isnull(values)
1394-
1395-
formatter = None
1396-
if float_format and decimal != '.':
1397-
formatter = lambda v : (float_format % v).replace('.',decimal,1)
1398-
elif decimal != '.':
1399-
formatter = lambda v : ('%g' % v).replace('.',decimal,1)
1400-
elif float_format:
1401-
formatter = lambda v : float_format % v
14021393

1403-
if formatter is None and not quoting:
1404-
values = values.astype(str)
1405-
else:
1406-
values = np.array(values, dtype='object')
1407-
1408-
values[mask] = na_rep
1409-
if formatter:
1410-
imask = (~mask).ravel()
1411-
values.flat[imask] = np.array(
1412-
[formatter(val) for val in values.ravel()[imask]])
1413-
1414-
return values
1394+
from pandas.core.format import FloatArrayFormatter
1395+
formatter = FloatArrayFormatter(values, na_rep=na_rep,
1396+
float_format=float_format,
1397+
decimal=decimal, quoting=quoting)
1398+
return formatter.get_formatted_data()
14151399

14161400
def should_store(self, value):
14171401
# when inserting a column should not coerce integers to floats

pandas/tests/test_format.py

+46
Original file line numberDiff line numberDiff line change
@@ -2932,6 +2932,52 @@ def test_to_csv_decimal(self):
29322932
expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
29332933
self.assertEqual(df.to_csv(decimal=',',sep=';', float_format = '%.2f'), expected_float_format)
29342934

2935+
# GH 11553: testing if decimal is taken into account for '0.0'
2936+
df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
2937+
expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
2938+
self.assertEqual(
2939+
df.to_csv(index=False, decimal='^'), expected)
2940+
2941+
# same but for an index
2942+
self.assertEqual(
2943+
df.set_index('a').to_csv(decimal='^'), expected)
2944+
2945+
# same for a multi-index
2946+
self.assertEqual(
2947+
df.set_index(['a', 'b']).to_csv(decimal="^"), expected)
2948+
2949+
def test_to_csv_float_format(self):
    """GH 11553: ``float_format`` must be applied to the index as well."""
    df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
    expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'

    # first a single-column index, then the same check for a multi-index
    for index_keys in ('a', ['a', 'b']):
        result = df.set_index(index_keys).to_csv(float_format='%.2f')
        self.assertEqual(result, expected)
2960+
2961+
def test_to_csv_na_rep(self):
    """GH 11553: NaN values in the index must be written as ``na_rep``."""
    cases = [
        # index with one NaN among its values
        ([0, np.NaN], "a,b,c\n0.0,0,2\n_,1,3\n"),
        # index containing only NaNs
        (np.NaN, "a,b,c\n_,0,2\n_,1,3\n"),
        # na_rep must not break anything when there is no NaN
        (0, "a,b,c\n0,0,2\n0,1,3\n"),
    ]

    for a_values, expected in cases:
        df = DataFrame({'a': a_values, 'b': [0, 1], 'c': [2, 3]})
        # single-column index first, then the multi-index variant
        for index_keys in ('a', ['a', 'b']):
            result = df.set_index(index_keys).to_csv(na_rep='_')
            self.assertEqual(result, expected)
2980+
29352981
def test_to_csv_date_format(self):
29362982
# GH 10209
29372983
df_sec = DataFrame({'A': pd.date_range('20130101',periods=5,freq='s')})

0 commit comments

Comments
 (0)