diff --git a/doc/source/release.rst b/doc/source/release.rst index 7776ee1efba4f..494b22c7ff403 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -60,6 +60,9 @@ New features - Clipboard functionality now works with PySide (:issue:`4282`) - New ``extract`` string method returns regex matches more conveniently (:issue:`4685`) - Auto-detect field widths in read_fwf when unspecified (:issue:`4488`) + - ``to_csv()`` now outputs datetime objects according to a specified format string + via the ``date_format`` keyword (:issue:`4313`) + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index c6a4c280ca4bb..7b470606f333b 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -87,6 +87,9 @@ API changes and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`) + - ``to_csv`` now takes a ``date_format`` keyword argument that specifies how + output datetime objects should be formatted. Datetimes encountered in the + index, columns, and values will all have this formatting applied. (:issue:`4313`) Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/format.py b/pandas/core/format.py index 190ef3fb5f1ab..4f2d9f214ce6e 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -18,7 +18,7 @@ import itertools import csv -from pandas.tseries.period import PeriodIndex +from pandas.tseries.period import PeriodIndex, DatetimeIndex docstring_to_string = """ Parameters @@ -850,7 +850,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, - tupleize_cols=False, quotechar='"'): + tupleize_cols=False, quotechar='"', date_format=None): self.engine = engine # remove for 0.13 self.obj = obj @@ -877,6 +877,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.line_terminator = line_terminator + self.date_format = date_format + #GH3457 if not self.obj.columns.is_unique and engine == 'python': msg= "columns.is_unique == False not supported with engine='python'" @@ -893,7 +895,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, if cols is not None: if isinstance(cols,Index): - cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) + cols = cols.to_native_types(na_rep=na_rep,float_format=float_format, + date_format=date_format) else: cols=list(cols) self.obj = self.obj.loc[:,cols] @@ -902,7 +905,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols,Index): - cols = cols.to_native_types(na_rep=na_rep,float_format=float_format) + cols = cols.to_native_types(na_rep=na_rep,float_format=float_format, + date_format=date_format) else: cols=list(cols) @@ -923,6 +927,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, if isinstance(obj.index, PeriodIndex): self.data_index = obj.index.to_timestamp() + if isinstance(self.data_index, DatetimeIndex) and date_format is not None: + self.data_index = Index([x.strftime(date_format) if notnull(x) else '' for x in self.data_index]) + self.nlevels = getattr(self.data_index, 'nlevels', 1) if not index: self.nlevels = 0 @@ -931,15 +938,10 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, # invoked by df.to_csv(engine=python) def _helper_csv(self, writer, na_rep=None, cols=None, header=True, index=True, - index_label=None, float_format=None): + index_label=None, float_format=None, date_format=None): if cols is None: cols = self.columns - series = {} - for k, v in compat.iteritems(self.obj._series): - series[k] = v.values - - has_aliases = isinstance(header, (tuple, list, np.ndarray)) if has_aliases or header: if index: @@ -981,10 +983,34 @@ def _helper_csv(self, writer, na_rep=None, cols=None, encoded_cols = list(cols) writer.writerow(encoded_cols) + if date_format is None: + date_formatter = lambda x: lib.Timestamp(x)._repr_base + else: + def strftime_with_nulls(x): + x = lib.Timestamp(x) + if notnull(x): + return x.strftime(date_format) + + date_formatter = lambda x: strftime_with_nulls(x) + data_index = self.obj.index + if isinstance(self.obj.index, PeriodIndex): data_index = self.obj.index.to_timestamp() + if isinstance(data_index, DatetimeIndex) and date_format is not None: + data_index = Index([date_formatter(x) for x in data_index]) + + values = self.obj.copy() + values.index = data_index + values.columns = values.columns.to_native_types(na_rep=na_rep,float_format=float_format, + date_format=date_format) + values = values[cols] + + series = {} + for k, v in compat.iteritems(values._series): + series[k] = v.values + nlevels = getattr(data_index, 'nlevels', 1) for j, idx in enumerate(data_index): row_fields = [] @@ -1000,8 +1026,8 @@ def _helper_csv(self, writer, na_rep=None, cols=None, if float_format is not None and com.is_float(val): val = float_format % val - elif isinstance(val, np.datetime64): - val = lib.Timestamp(val)._repr_base + elif isinstance(val, (np.datetime64, lib.Timestamp)): + val = date_formatter(val) row_fields.append(val) @@ -1031,7 +1057,7 @@ def save(self): self._helper_csv(self.writer, na_rep=self.na_rep, float_format=self.float_format, cols=self.cols, header=self.header, index=self.index, - index_label=self.index_label) + index_label=self.index_label, date_format=self.date_format) else: self._save() @@ -1150,13 +1176,16 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i,end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, + float_format=self.float_format, date_format=self.date_format) + for i, item in enumerate(b.items): # self.data is a preallocated list self.data[self.column_map[b][i]] = d[i] - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format) + ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, + float_format=self.float_format, date_format=self.date_format) lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index daaf9d9966635..126ed9242ecdd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1030,7 +1030,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, - tupleize_cols=False, **kwds): + tupleize_cols=False, date_format=None, **kwds): r"""Write DataFrame to a comma-separated values (csv) file Parameters @@ -1073,6 +1073,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, tupleize_cols : boolean, default False write multi_index columns as a list of tuples (if True) or new (expanded format) if False) + date_format : string, default None + Format string for datetime objects. """ if nanRep is not None: # pragma: no cover warnings.warn("nanRep is deprecated, use na_rep", @@ -1088,7 +1090,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, index_label=index_label, mode=mode, chunksize=chunksize, engine=kwds.get( "engine"), - tupleize_cols=tupleize_cols) + tupleize_cols=tupleize_cols, + date_format=date_format) formatter.save() def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 070745d73b307..137b99858f506 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -22,7 +22,7 @@ from pandas.tslib import Timestamp from pandas import compat -from pandas.compat import range, lrange, lmap, callable, map, zip +from pandas.compat import range, lrange, lmap, callable, map, zip, u from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type class Block(PandasObject): @@ -1396,7 +1396,7 @@ def fillna(self, value, inplace=False, downcast=None): return [self if inplace else make_block(values, self.items, self.ref_items, fastpath=True)] - def to_native_types(self, slicer=None, na_rep=None, **kwargs): + def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -1409,8 +1409,14 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): na_rep = 'NaT' rvalues[mask] = na_rep imask = (-mask).ravel() - rvalues.flat[imask] = np.array( - [Timestamp(val)._repr_base for val in values.ravel()[imask]], dtype=object) + + if date_format is None: + date_formatter = lambda x: Timestamp(x)._repr_base + else: + date_formatter = lambda x: Timestamp(x).strftime(date_format) + + rvalues.flat[imask] = np.array([date_formatter(val) for val in + values.ravel()[imask]], dtype=object) return rvalues.tolist() diff --git a/pandas/core/series.py b/pandas/core/series.py index d185939d6abc9..cd339d7201a83 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2129,7 +2129,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, header=False, - index_label=None, mode='w', nanRep=None, encoding=None): + index_label=None, mode='w', nanRep=None, encoding=None, + date_format=None): """ Write Series to a comma-separated values (csv) file @@ -2154,13 +2155,15 @@ def to_csv(self, path, index=True, sep=",", na_rep='', encoding : string, optional a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 + date_format: string, default None + Format string for datetime objects. """ from pandas.core.frame import DataFrame df = DataFrame(self) df.to_csv(path, index=index, sep=sep, na_rep=na_rep, float_format=float_format, header=header, index_label=index_label, mode=mode, nanRep=nanRep, - encoding=encoding) + encoding=encoding, date_format=date_format) def dropna(self): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 64a45d344f2a9..543a9ddad489b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11407,6 +11407,53 @@ def test_isin_with_string_scalar(self): with tm.assertRaises(TypeError): df.isin('aaa') + def test_to_csv_date_format(self): + from pandas import to_datetime + pname = '__tmp_to_csv_date_format__' + with ensure_clean(pname) as path: + for engine in [None, 'python']: + dt_index = self.tsframe.index + datetime_frame = DataFrame({'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) + + datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + + datetime_frame_int = datetime_frame.applymap(lambda x: int(x.strftime('%Y%m%d'))) + datetime_frame_int.index = datetime_frame_int.index.map(lambda x: int(x.strftime('%Y%m%d'))) + + assert_frame_equal(test, datetime_frame_int) + + datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.applymap(lambda x: x.strftime('%Y-%m-%d')) + datetime_frame_str.index = datetime_frame_str.index.map(lambda x: x.strftime('%Y-%m-%d')) + + assert_frame_equal(test, datetime_frame_str) + + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + + datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine) + + test = read_csv(path, index_col=0) + + datetime_frame_columns = datetime_frame_columns.applymap(lambda x: int(x.strftime('%Y%m%d'))) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map(lambda x: x.strftime('%Y%m%d')) + + assert_frame_equal(test, datetime_frame_columns) + + # test NaTs + nat_index = to_datetime(['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) + nat_frame = DataFrame({'A': nat_index}, index=nat_index) + + nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + + test = read_csv(path, parse_dates=[0, 1], index_col=0) + + assert_frame_equal(test, nat_frame) def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 33c90d3714e8a..f13aa14698b6e 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -7,7 +7,8 @@ import numpy as np from pandas.core.common import (isnull, _NS_DTYPE, _INT64_DTYPE, - is_list_like,_values_from_object, _maybe_box) + is_list_like,_values_from_object, _maybe_box, + notnull) from pandas.core.index import Index, Int64Index, _Identity import pandas.compat as compat from pandas.compat import u @@ -599,23 +600,29 @@ def __contains__(self, key): def _format_with_header(self, header, **kwargs): return header + self._format_native_types(**kwargs) - def _format_native_types(self, na_rep=u('NaT'), **kwargs): + def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): data = list(self) # tz formatter or time formatter zero_time = time(0, 0) - for d in data: - if d.time() != zero_time or d.tzinfo is not None: - return [u('%s') % x for x in data] + if date_format is None: + for d in data: + if d.time() != zero_time or d.tzinfo is not None: + return [u('%s') % x for x in data] values = np.array(data, dtype=object) mask = isnull(self.values) values[mask] = na_rep imask = -mask - values[imask] = np.array([u('%d-%.2d-%.2d') % (dt.year, dt.month, - dt.day) - for dt in values[imask]]) + + if date_format is None: + date_formatter = lambda x: u('%d-%.2d-%.2d' % (x.year, x.month, x.day)) + else: + date_formatter = lambda x: u(x.strftime(date_format)) + + values[imask] = np.array([date_formatter(dt) for dt in values[imask]]) + return values.tolist() def isin(self, values): diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py index 892e9820011c8..4fc14459e83f5 100644 --- a/vb_suite/io_bench.py +++ b/vb_suite/io_bench.py @@ -88,3 +88,13 @@ def create_cols(name): " parse_dates=['foo'])") read_parse_dates_iso8601 = Benchmark(stmt, setup, start_date=datetime(2012, 3, 1)) + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1000) +data = DataFrame(rng, index=rng) +""" + +stmt = ("data.to_csv('__test__.csv', date_format='%Y%m%d')") + +frame_to_csv_date_formatting = Benchmark(stmt, setup, + start_date=datetime(2013, 9, 1))