Skip to content

API: DataFrame.to_csv formatting parameters for float indexes #11681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 27, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ Bug Fixes


- Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`)

- Bug in ``Index`` creation from ``Timestamp`` with mixed tz coerces to UTC (:issue:`11488`)
- Bug in ``to_numeric`` where it does not raise if input is more than one dimension (:issue:`11776`)

Expand All @@ -348,4 +349,6 @@ Bug Fixes

- Bug in ``read_sql`` with pymysql connections failing to return chunked data (:issue:`11522`)

- Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`)

- Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`)
44 changes: 42 additions & 2 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys

from pandas.core.base import PandasObject
from pandas.core.common import adjoin, notnull
from pandas.core.common import adjoin, isnull, notnull
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas import compat
from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
Expand Down Expand Up @@ -1631,6 +1631,7 @@ def _save_chunk(self, start_i, end_i):
ix = data_index.to_native_types(slicer=slicer,
na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting)

Expand Down Expand Up @@ -1983,14 +1984,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
class GenericArrayFormatter(object):

def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
space=12, float_format=None, justify='right'):
space=12, float_format=None, justify='right',
decimal='.', quoting=None):
self.values = values
self.digits = digits
self.na_rep = na_rep
self.space = space
self.formatter = formatter
self.float_format = float_format
self.justify = justify
self.decimal = decimal
self.quoting = quoting

def get_result(self):
fmt_values = self._format_strings()
Expand Down Expand Up @@ -2101,6 +2105,42 @@ def _format_strings(self):

return fmt_values

def get_formatted_data(self):
    """Return the values as an array formatted for non-fixed-width output.

    Float values are converted according to the parameters given at
    initialisation:

    * ``self.float_format`` -- applied with the ``%`` operator, so it is
      expected to be a %-style format string (e.g. ``'%.2f'``);
    * ``self.decimal`` -- character substituted for the first ``'.'`` of
      each formatted value;
    * ``self.na_rep`` -- value stored at every position where
      ``isnull(self.values)`` is true;
    * ``self.quoting`` -- when truthy and no per-value formatting is
      needed, the non-null values are kept as objects instead of being
      converted to strings.

    Note: the method `.get_result()` does something similar, but with a
    fixed-width output suitable for screen printing. The output here is not
    fixed-width.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``self.values``; ``self.values``
        itself is not modified.  The dtype is a string dtype when neither
        formatting nor quoting is requested, ``object`` otherwise.
    """
    values = self.values
    mask = isnull(values)

    # read the settings once instead of on every formatted value
    float_format = self.float_format
    decimal = self.decimal
    needs_formatting = bool(float_format) or decimal != '.'

    def format_value(v):
        # render one non-null value: optional %-format first, then swap
        # the first '.' for the requested decimal separator
        text = float_format % v if float_format else str(v)
        if decimal != '.':
            text = text.replace('.', decimal, 1)
        return text

    if not needs_formatting and not self.quoting:
        # plain case: let numpy stringify everything in one go
        values = values.astype(str)
    else:
        # object dtype can hold the original values and strings side by side
        values = np.array(values, dtype='object')

    values[mask] = self.na_rep
    if needs_formatting:
        # only touch the non-null positions; nulls already hold na_rep
        imask = (~mask).ravel()
        values.flat[imask] = np.array(
            [format_value(val) for val in values.ravel()[imask]])

    return values


class IntArrayFormatter(GenericArrayFormatter):

Expand Down
33 changes: 28 additions & 5 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3926,6 +3926,14 @@ def _convert_slice_indexer(self, key, kind=None):
# translate to locations
return self.slice_indexer(key.start, key.stop, key.step)

def _format_native_types(self, na_rep='', float_format=None,
decimal='.', quoting=None, **kwargs):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is basically identical to `FloatBlock.to_native_types` in core/internals.py, let's pull both of those out into a single method on `FloatArrayFormatter` in core/format.py and call it `.get_formatted_data()`. See how that works out. The existing routines there are for screen printing (which are necessarily different from to_csv / index formatting).

from pandas.core.format import FloatArrayFormatter
formatter = FloatArrayFormatter(self.values, na_rep=na_rep,
float_format=float_format,
decimal=decimal, quoting=quoting)
return formatter.get_formatted_data()

def get_value(self, series, key):
""" we always want to get an index value, never a value """
if not np.isscalar(key):
Expand Down Expand Up @@ -4448,12 +4456,27 @@ def _reference_duplicate_name(self, name):
# count the times name equals an element in self.names.
return sum(name == n for n in self.names) > 1

def _format_native_types(self, **kwargs):
# we go through the levels and format them
levels = [level._format_native_types(**kwargs)
for level in self.levels]
mi = MultiIndex(levels=levels, labels=self.labels, names=self.names,
def _format_native_types(self, na_rep='nan', **kwargs):
    """Format every level and return the values of the rebuilt index.

    Each level is formatted through its own ``_format_native_types``
    (receiving ``na_rep`` and any extra keyword arguments).  Where a label
    array contains ``-1``, ``na_rep`` is appended to the formatted level
    and those labels are redirected to the appended entry.
    """
    formatted_levels = []
    remapped_labels = []

    for lev, codes in zip(self.levels, self.labels):
        lev_strs = lev._format_native_types(na_rep=na_rep, **kwargs)

        missing = (codes == -1)
        if missing.any():
            # give the missing entries a real slot holding na_rep, placed
            # right after the existing formatted level values
            na_position = len(lev_strs)
            lev_strs = np.append(lev_strs, na_rep)
            # NOTE(review): presumably ``.values()`` hands back an array
            # that is safe to assign into -- confirm
            codes = codes.values()
            codes[missing] = na_position

        formatted_levels.append(lev_strs)
        remapped_labels.append(codes)

    # rebuild a multi-index from the formatted pieces and expose its values
    rebuilt = MultiIndex(levels=formatted_levels, labels=remapped_labels,
                         names=self.names, sortorder=self.sortorder,
                         verify_integrity=False)
    return rebuilt.values

@property
Expand Down
26 changes: 5 additions & 21 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1390,28 +1390,12 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.'
values = self.values
if slicer is not None:
values = values[:, slicer]
mask = isnull(values)

formatter = None
if float_format and decimal != '.':
formatter = lambda v : (float_format % v).replace('.',decimal,1)
elif decimal != '.':
formatter = lambda v : ('%g' % v).replace('.',decimal,1)
elif float_format:
formatter = lambda v : float_format % v

if formatter is None and not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype='object')

values[mask] = na_rep
if formatter:
imask = (~mask).ravel()
values.flat[imask] = np.array(
[formatter(val) for val in values.ravel()[imask]])

return values
from pandas.core.format import FloatArrayFormatter
formatter = FloatArrayFormatter(values, na_rep=na_rep,
float_format=float_format,
decimal=decimal, quoting=quoting)
return formatter.get_formatted_data()

def should_store(self, value):
# when inserting a column should not coerce integers to floats
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2932,6 +2932,52 @@ def test_to_csv_decimal(self):
expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
self.assertEqual(df.to_csv(decimal=',',sep=';', float_format = '%.2f'), expected_float_format)

# GH 11553: testing if decimal is taken into account for '0.0'
df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
self.assertEqual(
df.to_csv(index=False, decimal='^'), expected)

# same but for an index
self.assertEqual(
df.set_index('a').to_csv(decimal='^'), expected)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Are there tests for quoting? If not, can you add a couple? Thanks.


# same for a multi-index
self.assertEqual(
df.set_index(['a', 'b']).to_csv(decimal="^"), expected)

def test_to_csv_float_format(self):
    """Check that ``to_csv`` applies ``float_format`` to index values.

    Regression test for GH 11553.
    """
    # testing if float_format is taken into account for the index
    # GH 11553
    df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
    expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'
    # integer index: only the float column ``b`` is reformatted here
    self.assertEqual(
        df.set_index('a').to_csv(float_format='%.2f'), expected)

    # same for a multi-index (``b`` is now a float level of the index)
    self.assertEqual(
        df.set_index(['a', 'b']).to_csv(float_format='%.2f'), expected)

    # a single float index, so float_format is exercised on the index itself
    expected_float_index = 'b,a,c\n2.20,0,1\n3.30,1,1\n'
    self.assertEqual(
        df.set_index('b').to_csv(float_format='%.2f'),
        expected_float_index)

def test_to_csv_na_rep(self):
    """Check that ``to_csv`` writes ``na_rep`` for NaN values in the index.

    Regression test for GH 11553.  Each scenario is checked with a single
    index on ``a`` and with a multi-index on ``['a', 'b']``; both must
    produce the same CSV text.
    """
    # testing if NaN values are correctly represented in the index
    # GH 11553
    # ``np.nan`` is used rather than the ``np.NaN`` alias, which newer
    # NumPy releases no longer provide
    df = DataFrame({'a': [0, np.nan], 'b': [0, 1], 'c': [2, 3]})
    expected = "a,b,c\n0.0,0,2\n_,1,3\n"
    self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
    self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)

    # now with an index containing only NaNs
    df = DataFrame({'a': np.nan, 'b': [0, 1], 'c': [2, 3]})
    expected = "a,b,c\n_,0,2\n_,1,3\n"
    self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
    self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)

    # check if na_rep parameter does not break anything when no NaN
    df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
    expected = "a,b,c\n0,0,2\n0,1,3\n"
    self.assertEqual(df.set_index('a').to_csv(na_rep='_'), expected)
    self.assertEqual(df.set_index(['a', 'b']).to_csv(na_rep='_'), expected)

def test_to_csv_date_format(self):
# GH 10209
df_sec = DataFrame({'A': pd.date_range('20130101',periods=5,freq='s')})
Expand Down