Skip to content

Commit 7ee73ff

Browse files
gfyoung authored and jreback committed
BUG: Don't overflow PeriodIndex in to_csv (pandas-dev#15984)
* BUG: Don't overflow PeriodIndex in to_csv. Closes pandas-dev gh-15982.
* TST: Test to_native_types for Period/DatetimeIndex
1 parent f2ed595 commit 7ee73ff

File tree

6 files changed

+145
-5
lines changed

6 files changed

+145
-5
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1344,6 +1344,7 @@ I/O
13441344
- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
13451345
- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`)
13461346
- Bug in ``pd.read_csv()`` in which the ``skipfooter`` parameter was not being properly validated (:issue:`15925`)
1347+
- Bug in ``DataFrame.to_csv()`` in which there was numeric overflow when a ``PeriodIndex`` containing dates outside the ``Timestamp`` range was being written (:issue:`15982`)
13471348
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
13481349
- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
13491350
- Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`)

pandas/formats/format.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1564,10 +1564,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
15641564
self.chunksize = int(chunksize)
15651565

15661566
self.data_index = obj.index
1567-
if isinstance(obj.index, PeriodIndex):
1568-
self.data_index = obj.index.to_timestamp()
1569-
1570-
if (isinstance(self.data_index, DatetimeIndex) and
1567+
if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
15711568
date_format is not None):
15721569
self.data_index = Index([x.strftime(date_format) if notnull(x) else
15731570
'' for x in self.data_index])

pandas/indexes/base.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -1820,7 +1820,26 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
18201820
return header + result
18211821

18221822
def to_native_types(self, slicer=None, **kwargs):
1823-
""" slice and dice then format """
1823+
"""
1824+
Format specified values of `self` and return them.
1825+
1826+
Parameters
1827+
----------
1828+
slicer : int, array-like
1829+
An indexer into `self` that specifies which values
1830+
are used in the formatting process.
1831+
kwargs : dict
1832+
Options for specifying how the values should be formatted.
1833+
These options include the following:
1834+
1835+
1) na_rep : str
1836+
The value that serves as a placeholder for NULL values
1837+
2) quoting : bool or None
1838+
Whether or not there are quoted values in `self`
1839+
3) date_format : str
1840+
The format used to represent date-like values
1841+
"""
1842+
18241843
values = self
18251844
if slicer is not None:
18261845
values = values[slicer]

pandas/tests/frame/test_to_csv.py

+28
Original file line numberDiff line numberDiff line change
@@ -1143,3 +1143,31 @@ def test_to_csv_quoting(self):
11431143
df = df.set_index(['a', 'b'])
11441144
expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
11451145
self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
1146+
1147+
def test_period_index_date_overflow(self):
1148+
# see gh-15982
1149+
1150+
dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
1151+
index = pd.PeriodIndex(dates, freq="D")
1152+
1153+
df = pd.DataFrame([4, 5, 6], index=index)
1154+
result = df.to_csv()
1155+
1156+
expected = ',0\n1990-01-01,4\n2000-01-01,5\n3005-01-01,6\n'
1157+
assert result == expected
1158+
1159+
date_format = "%m-%d-%Y"
1160+
result = df.to_csv(date_format=date_format)
1161+
1162+
expected = ',0\n01-01-1990,4\n01-01-2000,5\n01-01-3005,6\n'
1163+
assert result == expected
1164+
1165+
# Overflow with pd.NaT
1166+
dates = ["1990-01-01", pd.NaT, "3005-01-01"]
1167+
index = pd.PeriodIndex(dates, freq="D")
1168+
1169+
df = pd.DataFrame([4, 5, 6], index=index)
1170+
result = df.to_csv()
1171+
1172+
expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n'
1173+
assert result == expected
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from pandas import DatetimeIndex
2+
3+
import numpy as np
4+
5+
import pandas.util.testing as tm
6+
import pandas as pd
7+
8+
9+
def test_to_native_types():
10+
index = DatetimeIndex(freq='1D', periods=3, start='2017-01-01')
11+
12+
# First, with no arguments.
13+
expected = np.array(['2017-01-01', '2017-01-02',
14+
'2017-01-03'], dtype=object)
15+
16+
result = index.to_native_types()
17+
tm.assert_numpy_array_equal(result, expected)
18+
19+
# No NaN values, so na_rep has no effect
20+
result = index.to_native_types(na_rep='pandas')
21+
tm.assert_numpy_array_equal(result, expected)
22+
23+
# Make sure slicing works
24+
expected = np.array(['2017-01-01', '2017-01-03'], dtype=object)
25+
26+
result = index.to_native_types([0, 2])
27+
tm.assert_numpy_array_equal(result, expected)
28+
29+
# Make sure date formatting works
30+
expected = np.array(['01-2017-01', '01-2017-02',
31+
'01-2017-03'], dtype=object)
32+
33+
result = index.to_native_types(date_format='%m-%Y-%d')
34+
tm.assert_numpy_array_equal(result, expected)
35+
36+
# NULL object handling should work
37+
index = DatetimeIndex(['2017-01-01', pd.NaT, '2017-01-03'])
38+
expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
39+
40+
result = index.to_native_types()
41+
tm.assert_numpy_array_equal(result, expected)
42+
43+
expected = np.array(['2017-01-01', 'pandas',
44+
'2017-01-03'], dtype=object)
45+
46+
result = index.to_native_types(na_rep='pandas')
47+
tm.assert_numpy_array_equal(result, expected)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from pandas import PeriodIndex
2+
3+
import numpy as np
4+
5+
import pandas.util.testing as tm
6+
import pandas as pd
7+
8+
9+
def test_to_native_types():
10+
index = PeriodIndex(['2017-01-01', '2017-01-02',
11+
'2017-01-03'], freq='D')
12+
13+
# First, with no arguments.
14+
expected = np.array(['2017-01-01', '2017-01-02',
15+
'2017-01-03'], dtype='<U10')
16+
17+
result = index.to_native_types()
18+
tm.assert_numpy_array_equal(result, expected)
19+
20+
# No NaN values, so na_rep has no effect
21+
result = index.to_native_types(na_rep='pandas')
22+
tm.assert_numpy_array_equal(result, expected)
23+
24+
# Make sure slicing works
25+
expected = np.array(['2017-01-01', '2017-01-03'], dtype='<U10')
26+
27+
result = index.to_native_types([0, 2])
28+
tm.assert_numpy_array_equal(result, expected)
29+
30+
# Make sure date formatting works
31+
expected = np.array(['01-2017-01', '01-2017-02',
32+
'01-2017-03'], dtype='<U10')
33+
34+
result = index.to_native_types(date_format='%m-%Y-%d')
35+
tm.assert_numpy_array_equal(result, expected)
36+
37+
# NULL object handling should work
38+
index = PeriodIndex(['2017-01-01', pd.NaT, '2017-01-03'], freq='D')
39+
expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object)
40+
41+
result = index.to_native_types()
42+
tm.assert_numpy_array_equal(result, expected)
43+
44+
expected = np.array(['2017-01-01', 'pandas',
45+
'2017-01-03'], dtype=object)
46+
47+
result = index.to_native_types(na_rep='pandas')
48+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)