ENH: Enable automatic writing of dates to Stata files

bashtage · jreback · commit 0fe5a345f90e · 2016-07-21T06:58:27.000-04:00
Automatically select type %tc for datetime64[ns] columns. Change ValueErrors to NotImplementedError for unsupported types. Add tests for select exceptions. Improve to_stata and StataWriter docstrings. Closes pandas-dev#12259, closes pandas-dev#13710.
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -310,6 +310,7 @@ Other enhancements
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
 - ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`)
+- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
 
 .. _whatsnew_0190.api:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1473,31 +1473,42 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
 
         Parameters
         ----------
-        fname : file path or buffer
-            Where to save the dta file.
+        fname : str or buffer
+            String path or file-like object
         convert_dates : dict
-            Dictionary mapping column of datetime types to the stata internal
-            format that you want to use for the dates. Options are
-            'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
-            number or a name.
+            Dictionary mapping columns containing datetime types to stata internal
+            format to use when writing the dates. Options are 'tc', 'td', 'tm',
+            'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+            Datetime columns that do not have a conversion type specified will be
+            converted to 'tc'. Raises NotImplementedError if a datetime column has
+            timezone information
         write_index : bool
             Write the index to Stata dataset.
         encoding : str
-            Default is latin-1. Note that Stata does not support unicode.
+            Default is latin-1. Unicode is not supported
         byteorder : str
-            Can be ">", "<", "little", or "big". The default is None which uses
-            `sys.byteorder`
+            Can be ">", "<", "little", or "big". Default is `sys.byteorder`
         time_stamp : datetime
-            A date time to use when writing the file.  Can be None, in which
-            case the current time is used.
+            A datetime to use as file creation date.  Default is the current time
         dataset_label : str
-            A label for the data set.  Should be 80 characters or smaller.
+            A label for the data set.  Must be 80 characters or smaller.
 
         .. versionadded:: 0.19.0
 
         variable_labels : dict
-            Dictionary containing columns as keys and variable labels as
-            values. Each label must be 80 characters or smaller.
+            Dictionary containing columns as keys and variable labels as values.
+            Each label must be 80 characters or smaller.
+
+        Raises
+        ------
+        NotImplementedError
+            * If datetimes contain timezone information
+            * Column dtype is not representable in Stata
+        ValueError
+            * Columns listed in convert_dates are neither datetime64[ns]
+              nor datetime.datetime
+            * Column listed in convert_dates is not in DataFrame
+            * Categorical label contains more than 32,000 characters
 
         Examples
         --------
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -432,7 +432,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
         d = parse_dates_safe(dates, year=True)
         conv_dates = d.year
     else:
-        raise ValueError("fmt %s not understood" % fmt)
+        raise ValueError("Format %s is not a known Stata date format" % fmt)
 
     conv_dates = Series(conv_dates, dtype=np.float64)
     missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
@@ -1709,7 +1709,7 @@ def _convert_datetime_to_stata_type(fmt):
                "%tq", "th", "%th", "ty", "%ty"]:
         return np.float64  # Stata expects doubles for SIFs
     else:
-        raise ValueError("fmt %s not understood" % fmt)
+        raise NotImplementedError("Format %s not implemented" % fmt)
 
 
 def _maybe_convert_to_int_keys(convert_dates, varlist):
@@ -1721,9 +1721,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist):
             new_dict.update({varlist.index(key): convert_dates[key]})
         else:
             if not isinstance(key, int):
-                raise ValueError(
-                    "convert_dates key is not in varlist and is not an int"
-                )
+                raise ValueError("convert_dates key must be a "
+                                 "column or an integer")
             new_dict.update({key: convert_dates[key]})
     return new_dict
 
@@ -1763,8 +1762,7 @@ def _dtype_to_stata_type(dtype, column):
     elif dtype == np.int8:
         return chr(251)
     else:  # pragma : no cover
-        raise ValueError("Data type %s not currently understood. "
-                         "Please report an error to the developers." % dtype)
+        raise NotImplementedError("Data type %s not supported." % dtype)
 
 
 def _dtype_to_default_stata_fmt(dtype, column):
@@ -1801,35 +1799,36 @@ def _dtype_to_default_stata_fmt(dtype, column):
     elif dtype == np.int8 or dtype == np.int16:
         return "%8.0g"
     else:  # pragma : no cover
-        raise ValueError("Data type %s not currently understood. "
-                         "Please report an error to the developers." % dtype)
+        raise NotImplementedError("Data type %s not supported." % dtype)
 
 
 class StataWriter(StataParser):
     """
-    A class for writing Stata binary dta files from array-like objects
+    A class for writing Stata binary dta files
 
     Parameters
     ----------
-    fname : file path or buffer
-        Where to save the dta file.
-    data : array-like
-        Array-like input to save. Pandas objects are also accepted.
+    fname : str or buffer
+        String path or file-like object
+    data : DataFrame
+        Input to save
     convert_dates : dict
-        Dictionary mapping column of datetime types to the stata internal
-        format that you want to use for the dates. Options are
-        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
-        number or a name.
+        Dictionary mapping columns containing datetime types to stata internal
+        format to use when writing the dates. Options are 'tc', 'td', 'tm',
+        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+        Datetime columns that do not have a conversion type specified will be
+        converted to 'tc'. Raises NotImplementedError if a datetime column has
+        timezone information
+    write_index : bool
+        Write the index to Stata dataset.
     encoding : str
-        Default is latin-1. Note that Stata does not support unicode.
+        Default is latin-1. Unicode is not supported
     byteorder : str
-        Can be ">", "<", "little", or "big". The default is None which uses
-        `sys.byteorder`
+        Can be ">", "<", "little", or "big". Default is `sys.byteorder`
     time_stamp : datetime
-        A date time to use when writing the file.  Can be None, in which
-        case the current time is used.
+        A datetime to use as file creation date.  Default is the current time
     dataset_label : str
-        A label for the data set.  Should be 80 characters or smaller.
+        A label for the data set.  Must be 80 characters or smaller.
 
     .. versionadded:: 0.19.0
 
@@ -1843,6 +1842,17 @@ class StataWriter(StataParser):
         The StataWriter instance has a write_file method, which will
         write the file to the given `fname`.
 
+    Raises
+    ------
+    NotImplementedError
+        * If datetimes contain timezone information
+        * Column dtype is not representable in Stata
+    ValueError
+        * Columns listed in convert_dates are neither datetime64[ns]
+          nor datetime.datetime
+        * Column listed in convert_dates is not in DataFrame
+        * Categorical label contains more than 32,000 characters
+
     Examples
     --------
     >>> import pandas as pd
@@ -1861,7 +1871,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
         super(StataWriter, self).__init__(encoding)
-        self._convert_dates = convert_dates
+        self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
         self._time_stamp = time_stamp
         self._data_label = data_label
@@ -2041,15 +2051,22 @@ def _prepare_pandas(self, data):
         self.varlist = data.columns.tolist()
 
         dtypes = data.dtypes
-        if self._convert_dates is not None:
-            self._convert_dates = _maybe_convert_to_int_keys(
-                self._convert_dates, self.varlist
+
+        # Ensure all date columns are converted
+        for col in data:
+            if col in self._convert_dates:
+                continue
+            if is_datetime64_dtype(data[col]):
+                self._convert_dates[col] = 'tc'
+
+        self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
+                                                         self.varlist)
+        for key in self._convert_dates:
+            new_type = _convert_datetime_to_stata_type(
+                self._convert_dates[key]
             )
-            for key in self._convert_dates:
-                new_type = _convert_datetime_to_stata_type(
-                    self._convert_dates[key]
-                )
-                dtypes[key] = np.dtype(new_type)
+            dtypes[key] = np.dtype(new_type)
+
         self.typlist = []
         self.fmtlist = []
         for col, dtype in dtypes.iteritems():
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -11,17 +11,17 @@
 
 import nose
 import numpy as np
+from pandas.tslib import NaT
 
 import pandas as pd
 import pandas.util.testing as tm
 from pandas import compat
 from pandas.compat import iterkeys
 from pandas.core.frame import DataFrame, Series
-from pandas.types.common import is_categorical_dtype
-from pandas.tslib import NaT
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
                              PossiblePrecisionLoss, StataMissingValue)
+from pandas.types.common import is_categorical_dtype
 
 
 class TestStata(tm.TestCase):
@@ -1165,6 +1165,52 @@ def test_write_variable_label_errors(self):
             with tm.ensure_clean() as path:
                 original.to_stata(path, variable_labels=variable_labels_long)
 
+    def test_default_date_conversion(self):
+        # GH 12259
+        dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+                 dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+                 dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path, write_index=False)
+            reread = read_stata(path, convert_dates=True)
+            tm.assert_frame_equal(original, reread)
+
+            original.to_stata(path,
+                              write_index=False,
+                              convert_dates={'dates': 'tc'})
+            direct = read_stata(path, convert_dates=True)
+            tm.assert_frame_equal(reread, direct)
+
+    def test_unsupported_type(self):
+        original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
+
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
+    def test_unsupported_datetype(self):
+        dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+                 dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+                 dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path, convert_dates={'dates': 'tC'})
+
+        dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong')
+        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
+                                 'strs': ['apple', 'banana', 'cherry'],
+                                 'dates': dates})
+        with tm.assertRaises(NotImplementedError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],