Skip to content

ENH: to_csv() date formatting #4313

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 12, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ New features
- Clipboard functionality now works with PySide (:issue:`4282`)
- New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
- Auto-detect field widths in read_fwf when unspecified (:issue:`4488`)
- ``to_csv()`` now outputs datetime objects according to a specified format string
via the ``date_format`` keyword (:issue:`4313`)


Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ API changes
and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not
support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`)

- ``to_csv`` now takes a ``date_format`` keyword argument that specifies how
output datetime objects should be formatted. Datetimes encountered in the
index, columns, and values will all have this formatting applied. (:issue:`4313`)

Prior Version Deprecations/Changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
59 changes: 44 additions & 15 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import itertools
import csv

from pandas.tseries.period import PeriodIndex
from pandas.tseries.period import PeriodIndex, DatetimeIndex

docstring_to_string = """
Parameters
Expand Down Expand Up @@ -850,7 +850,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
line_terminator='\n', chunksize=None, engine=None,
tupleize_cols=False, quotechar='"'):
tupleize_cols=False, quotechar='"', date_format=None):

self.engine = engine # remove for 0.13
self.obj = obj
Expand All @@ -877,6 +877,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,

self.line_terminator = line_terminator

self.date_format = date_format

#GH3457
if not self.obj.columns.is_unique and engine == 'python':
msg= "columns.is_unique == False not supported with engine='python'"
Expand All @@ -893,7 +895,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,

if cols is not None:
if isinstance(cols,Index):
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format,
date_format=date_format)
else:
cols=list(cols)
self.obj = self.obj.loc[:,cols]
Expand All @@ -902,7 +905,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
# and make sure sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols,Index):
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format,
date_format=date_format)
else:
cols=list(cols)

Expand All @@ -923,6 +927,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
if isinstance(obj.index, PeriodIndex):
self.data_index = obj.index.to_timestamp()

if isinstance(self.data_index, DatetimeIndex) and date_format is not None:
self.data_index = Index([x.strftime(date_format) if notnull(x) else '' for x in self.data_index])

self.nlevels = getattr(self.data_index, 'nlevels', 1)
if not index:
self.nlevels = 0
Expand All @@ -931,15 +938,10 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
# invoked by df.to_csv(engine=python)
def _helper_csv(self, writer, na_rep=None, cols=None,
header=True, index=True,
index_label=None, float_format=None):
index_label=None, float_format=None, date_format=None):
if cols is None:
cols = self.columns

series = {}
for k, v in compat.iteritems(self.obj._series):
series[k] = v.values


has_aliases = isinstance(header, (tuple, list, np.ndarray))
if has_aliases or header:
if index:
Expand Down Expand Up @@ -981,10 +983,34 @@ def _helper_csv(self, writer, na_rep=None, cols=None,
encoded_cols = list(cols)
writer.writerow(encoded_cols)

if date_format is None:
date_formatter = lambda x: lib.Timestamp(x)._repr_base
else:
def strftime_with_nulls(x):
x = lib.Timestamp(x)
if notnull(x):
return x.strftime(date_format)

date_formatter = lambda x: strftime_with_nulls(x)

data_index = self.obj.index

if isinstance(self.obj.index, PeriodIndex):
data_index = self.obj.index.to_timestamp()

if isinstance(data_index, DatetimeIndex) and date_format is not None:
data_index = Index([date_formatter(x) for x in data_index])

values = self.obj.copy()
values.index = data_index
values.columns = values.columns.to_native_types(na_rep=na_rep,float_format=float_format,
date_format=date_format)
values = values[cols]

series = {}
for k, v in compat.iteritems(values._series):
series[k] = v.values

nlevels = getattr(data_index, 'nlevels', 1)
for j, idx in enumerate(data_index):
row_fields = []
Expand All @@ -1000,8 +1026,8 @@ def _helper_csv(self, writer, na_rep=None, cols=None,

if float_format is not None and com.is_float(val):
val = float_format % val
elif isinstance(val, np.datetime64):
val = lib.Timestamp(val)._repr_base
elif isinstance(val, (np.datetime64, lib.Timestamp)):
val = date_formatter(val)

row_fields.append(val)

Expand Down Expand Up @@ -1031,7 +1057,7 @@ def save(self):
self._helper_csv(self.writer, na_rep=self.na_rep,
float_format=self.float_format, cols=self.cols,
header=self.header, index=self.index,
index_label=self.index_label)
index_label=self.index_label, date_format=self.date_format)

else:
self._save()
Expand Down Expand Up @@ -1150,13 +1176,16 @@ def _save_chunk(self, start_i, end_i):
slicer = slice(start_i,end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format, date_format=self.date_format)

for i, item in enumerate(b.items):

# self.data is a preallocated list
self.data[self.column_map[b][i]] = d[i]

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format, date_format=self.date_format)

lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
line_terminator='\n', chunksize=None,
tupleize_cols=False, **kwds):
tupleize_cols=False, date_format=None, **kwds):
r"""Write DataFrame to a comma-separated values (csv) file

Parameters
Expand Down Expand Up @@ -1073,6 +1073,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
tupleize_cols : boolean, default False
write multi_index columns as a list of tuples (if True)
or new (expanded format) if False)
date_format : string, default None
Format string for datetime objects.
"""
if nanRep is not None: # pragma: no cover
warnings.warn("nanRep is deprecated, use na_rep",
Expand All @@ -1088,7 +1090,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
index_label=index_label, mode=mode,
chunksize=chunksize, engine=kwds.get(
"engine"),
tupleize_cols=tupleize_cols)
tupleize_cols=tupleize_cols,
date_format=date_format)
formatter.save()

def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from pandas.tslib import Timestamp
from pandas import compat
from pandas.compat import range, lrange, lmap, callable, map, zip
from pandas.compat import range, lrange, lmap, callable, map, zip, u
from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type

class Block(PandasObject):
Expand Down Expand Up @@ -1396,7 +1396,7 @@ def fillna(self, value, inplace=False, downcast=None):
return [self if inplace else make_block(values, self.items,
self.ref_items, fastpath=True)]

def to_native_types(self, slicer=None, na_rep=None, **kwargs):
def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.values
Expand All @@ -1409,8 +1409,14 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
na_rep = 'NaT'
rvalues[mask] = na_rep
imask = (-mask).ravel()
rvalues.flat[imask] = np.array(
[Timestamp(val)._repr_base for val in values.ravel()[imask]], dtype=object)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use a date_formatter function (like above); you don't need to handle nulls here as that is already handled in the imask (eg you will get NO nulls in the list comprehension)

if date_format is None:
date_formatter = lambda x: Timestamp(x)._repr_base
else:
date_formatter = lambda x: Timestamp(x).strftime(date_format)

rvalues.flat[imask] = np.array([date_formatter(val) for val in
values.ravel()[imask]], dtype=object)

return rvalues.tolist()

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2129,7 +2129,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,

def to_csv(self, path, index=True, sep=",", na_rep='',
float_format=None, header=False,
index_label=None, mode='w', nanRep=None, encoding=None):
index_label=None, mode='w', nanRep=None, encoding=None,
date_format=None):
"""
Write Series to a comma-separated values (csv) file

Expand All @@ -2154,13 +2155,15 @@ def to_csv(self, path, index=True, sep=",", na_rep='',
encoding : string, optional
a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
date_format: string, default None
Format string for datetime objects.
"""
from pandas.core.frame import DataFrame
df = DataFrame(self)
df.to_csv(path, index=index, sep=sep, na_rep=na_rep,
float_format=float_format, header=header,
index_label=index_label, mode=mode, nanRep=nanRep,
encoding=encoding)
encoding=encoding, date_format=date_format)

def dropna(self):
"""
Expand Down
47 changes: 47 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11407,6 +11407,53 @@ def test_isin_with_string_scalar(self):
with tm.assertRaises(TypeError):
df.isin('aaa')

def test_to_csv_date_format(self):
from pandas import to_datetime
pname = '__tmp_to_csv_date_format__'
with ensure_clean(pname) as path:
for engine in [None, 'python']:
dt_index = self.tsframe.index
datetime_frame = DataFrame({'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index)

datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine)
# Check that the data was put in the specified format
test = read_csv(path, index_col=0)

datetime_frame_int = datetime_frame.applymap(lambda x: int(x.strftime('%Y%m%d')))
datetime_frame_int.index = datetime_frame_int.index.map(lambda x: int(x.strftime('%Y%m%d')))

assert_frame_equal(test, datetime_frame_int)

datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine)
# Check that the data was put in the specified format
test = read_csv(path, index_col=0)
datetime_frame_str = datetime_frame.applymap(lambda x: x.strftime('%Y-%m-%d'))
datetime_frame_str.index = datetime_frame_str.index.map(lambda x: x.strftime('%Y-%m-%d'))

assert_frame_equal(test, datetime_frame_str)

# Check that columns get converted
datetime_frame_columns = datetime_frame.T

datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine)

test = read_csv(path, index_col=0)

datetime_frame_columns = datetime_frame_columns.applymap(lambda x: int(x.strftime('%Y%m%d')))
# Columns don't get converted to ints by read_csv
datetime_frame_columns.columns = datetime_frame_columns.columns.map(lambda x: x.strftime('%Y%m%d'))

assert_frame_equal(test, datetime_frame_columns)

# test NaTs
nat_index = to_datetime(['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
nat_frame = DataFrame({'A': nat_index}, index=nat_index)

nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine)

test = read_csv(path, parse_dates=[0, 1], index_col=0)

assert_frame_equal(test, nat_frame)

def skip_if_no_ne(engine='numexpr'):
if engine == 'numexpr':
Expand Down
23 changes: 15 additions & 8 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import numpy as np

from pandas.core.common import (isnull, _NS_DTYPE, _INT64_DTYPE,
is_list_like,_values_from_object, _maybe_box)
is_list_like,_values_from_object, _maybe_box,
notnull)
from pandas.core.index import Index, Int64Index, _Identity
import pandas.compat as compat
from pandas.compat import u
Expand Down Expand Up @@ -599,23 +600,29 @@ def __contains__(self, key):
def _format_with_header(self, header, **kwargs):
return header + self._format_native_types(**kwargs)

def _format_native_types(self, na_rep=u('NaT'), **kwargs):
def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs):
data = list(self)

# tz formatter or time formatter
zero_time = time(0, 0)
for d in data:
if d.time() != zero_time or d.tzinfo is not None:
return [u('%s') % x for x in data]
if date_format is None:
for d in data:
if d.time() != zero_time or d.tzinfo is not None:
return [u('%s') % x for x in data]

values = np.array(data, dtype=object)
mask = isnull(self.values)
values[mask] = na_rep

imask = -mask
values[imask] = np.array([u('%d-%.2d-%.2d') % (dt.year, dt.month,
dt.day)
for dt in values[imask]])

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make date_formatter a function here

if date_format is None:
date_formatter = lambda x: u('%d-%.2d-%.2d' % (x.year, x.month, x.day))
else:
date_formatter = lambda x: u(x.strftime(date_format))

values[imask] = np.array([date_formatter(dt) for dt in values[imask]])

return values.tolist()

def isin(self, values):
Expand Down
10 changes: 10 additions & 0 deletions vb_suite/io_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,13 @@ def create_cols(name):
" parse_dates=['foo'])")
read_parse_dates_iso8601 = Benchmark(stmt, setup,
start_date=datetime(2012, 3, 1))

setup = common_setup + """
rng = date_range('1/1/2000', periods=1000)
data = DataFrame(rng, index=rng)
"""

stmt = ("data.to_csv('__test__.csv', date_format='%Y%m%d')")

frame_to_csv_date_formatting = Benchmark(stmt, setup,
start_date=datetime(2013, 9, 1))