BUG: Don't overflow PeriodIndex in to_csv (pandas-dev#15984)

gfyoung · jreback · commit 7ee73ffcfd1c · 2017-04-13T22:11:33.000Z
* BUG: Don't overflow PeriodIndex in to_csv. Closes gh-15982. * TST: Test to_native_types for Period/DatetimeIndex
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -1344,6 +1344,7 @@ I/O
 - Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
 - Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`)
 - Bug in ``pd.read_csv()`` in which the ``skipfooter`` parameter was not being properly validated (:issue:`15925`)
+- Bug in ``DataFrame.to_csv()`` in which there was numeric overflow when a ``PeriodIndex`` containing dates outside the ``Timestamp`` range was being written (:issue:`15982`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
 - Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
@@ -1564,10 +1564,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         self.chunksize = int(chunksize)
 
         self.data_index = obj.index
-        if isinstance(obj.index, PeriodIndex):
-            self.data_index = obj.index.to_timestamp()
-
-        if (isinstance(self.data_index, DatetimeIndex) and
+        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
                 date_format is not None):
             self.data_index = Index([x.strftime(date_format) if notnull(x) else
                                      '' for x in self.data_index])
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -1820,7 +1820,26 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
         return header + result
 
     def to_native_types(self, slicer=None, **kwargs):
-        """ slice and dice then format """
+        """
+        Format specified values of `self` and return them.
+
+        Parameters
+        ----------
+        slicer : int, array-like
+            An indexer into `self` that specifies which values
+            are used in the formatting process.
+        kwargs : dict
+            Options for specifying how the values should be formatted.
+            These options include the following:
+
+            1) na_rep : str
+                The value that serves as a placeholder for NULL values
+            2) quoting : bool or None
+                Whether or not there are quoted values in `self`
+            3) date_format : str
+                The format used to represent date-like values
+        """
+
         values = self
         if slicer is not None:
             values = values[slicer]
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -1143,3 +1143,31 @@ def test_to_csv_quoting(self):
         df = df.set_index(['a', 'b'])
         expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
         self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
+
+    def test_period_index_date_overflow(self):
+        # see gh-15982
+
+        dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
+        index = pd.PeriodIndex(dates, freq="D")
+
+        df = pd.DataFrame([4, 5, 6], index=index)
+        result = df.to_csv()
+
+        expected = ',0\n1990-01-01,4\n2000-01-01,5\n3005-01-01,6\n'
+        assert result == expected
+
+        date_format = "%m-%d-%Y"
+        result = df.to_csv(date_format=date_format)
+
+        expected = ',0\n01-01-1990,4\n01-01-2000,5\n01-01-3005,6\n'
+        assert result == expected
+
+        # Overflow with pd.NaT
+        dates = ["1990-01-01", pd.NaT, "3005-01-01"]
+        index = pd.PeriodIndex(dates, freq="D")
+
+        df = pd.DataFrame([4, 5, 6], index=index)
+        result = df.to_csv()
+
+        expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n'
+        assert result == expected
diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py
@@ -0,0 +1,47 @@
+from pandas import DatetimeIndex
+
+import numpy as np
+
+import pandas.util.testing as tm
+import pandas as pd
+
+
+def test_to_native_types():
+    index = DatetimeIndex(freq='1D', periods=3, start='2017-01-01')
+
+    # First, with no arguments.
+    expected = np.array(['2017-01-01', '2017-01-02',
+                         '2017-01-03'], dtype=object)
+
+    result = index.to_native_types()
+    tm.assert_numpy_array_equal(result, expected)
+
+    # No NaN values, so na_rep has no effect
+    result = index.to_native_types(na_rep='pandas')
+    tm.assert_numpy_array_equal(result, expected)
+
+    # Make sure slicing works
+    expected = np.array(['2017-01-01', '2017-01-03'], dtype=object)
+
+    result = index.to_native_types([0, 2])
+    tm.assert_numpy_array_equal(result, expected)
+
+    # Make sure date formatting works
+    expected = np.array(['01-2017-01', '01-2017-02',
+                         '01-2017-03'], dtype=object)
+
+    result = index.to_native_types(date_format='%m-%Y-%d')
+    tm.assert_numpy_array_equal(result, expected)
+
+    # NULL object handling should work
+    index = DatetimeIndex(['2017-01-01', pd.NaT, '2017-01-03'])
+    expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
+
+    result = index.to_native_types()
+    tm.assert_numpy_array_equal(result, expected)
+
+    expected = np.array(['2017-01-01', 'pandas',
+                         '2017-01-03'], dtype=object)
+
+    result = index.to_native_types(na_rep='pandas')
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py
@@ -0,0 +1,48 @@
+from pandas import PeriodIndex
+
+import numpy as np
+
+import pandas.util.testing as tm
+import pandas as pd
+
+
+def test_to_native_types():
+    index = PeriodIndex(['2017-01-01', '2017-01-02',
+                         '2017-01-03'], freq='D')
+
+    # First, with no arguments.
+    expected = np.array(['2017-01-01', '2017-01-02',
+                         '2017-01-03'], dtype='<U10')
+
+    result = index.to_native_types()
+    tm.assert_numpy_array_equal(result, expected)
+
+    # No NaN values, so na_rep has no effect
+    result = index.to_native_types(na_rep='pandas')
+    tm.assert_numpy_array_equal(result, expected)
+
+    # Make sure slicing works
+    expected = np.array(['2017-01-01', '2017-01-03'], dtype='<U10')
+
+    result = index.to_native_types([0, 2])
+    tm.assert_numpy_array_equal(result, expected)
+
+    # Make sure date formatting works
+    expected = np.array(['01-2017-01', '01-2017-02',
+                         '01-2017-03'], dtype='<U10')
+
+    result = index.to_native_types(date_format='%m-%Y-%d')
+    tm.assert_numpy_array_equal(result, expected)
+
+    # NULL object handling should work
+    index = PeriodIndex(['2017-01-01', pd.NaT, '2017-01-03'], freq='D')
+    expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
+
+    result = index.to_native_types()
+    tm.assert_numpy_array_equal(result, expected)
+
+    expected = np.array(['2017-01-01', 'pandas',
+                         '2017-01-03'], dtype=object)
+
+    result = index.to_native_types(na_rep='pandas')
+    tm.assert_numpy_array_equal(result, expected)