Skip to content

Commit 9d10b76

Browse files
gfyoung authored and jorisvandenbossche committed
BUG: Don't parse index column as numeric when parse_dates=True (pandas-dev#14077)
When a thousands parameter is specified and the index column is being parsed as dates, do not interpret occurrences of the thousands character in that column's data (e.g. '.' used as a date separator) as thousands separators. Closes pandas-dev gh-14066.
1 parent 670435a commit 9d10b76

File tree

4 files changed

+64
-21
lines changed

4 files changed

+64
-21
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,7 @@ Bug Fixes
11261126
- Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`)
11271127
- Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`)
11281128

1129+
- Bug in ``pd.read_csv()`` where the index columns were being incorrectly parsed when parsed as dates with a ``thousands`` parameter (:issue:`14066`)
11291130
- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
11301131
- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
11311132

pandas/io/parsers.py

+15
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,13 @@ def _set(x):
14741474
else:
14751475
_set(val)
14761476

1477+
elif self.parse_dates:
1478+
if isinstance(self.index_col, list):
1479+
for k in self.index_col:
1480+
_set(k)
1481+
elif self.index_col is not None:
1482+
_set(self.index_col)
1483+
14771484
def set_error_bad_lines(self, status):
14781485
self._reader.set_error_bad_lines(int(status))
14791486

@@ -1856,6 +1863,14 @@ def _set(x):
18561863
_set(k)
18571864
else:
18581865
_set(val)
1866+
1867+
elif self.parse_dates:
1868+
if isinstance(self.index_col, list):
1869+
for k in self.index_col:
1870+
_set(k)
1871+
elif self.index_col is not None:
1872+
_set(self.index_col)
1873+
18591874
return noconvert_columns
18601875

18611876
def _make_reader(self, f):

pandas/io/tests/parser/parse_dates.py

+32
Original file line numberDiff line numberDiff line change
@@ -458,3 +458,35 @@ def test_parse_dates_empty_string(self):
458458
result = self.read_csv(StringIO(data), parse_dates=["Date"],
459459
na_filter=False)
460460
self.assertTrue(result['Date'].isnull()[1])
461+
462+
def test_parse_dates_noconvert_thousands(self):
463+
# see gh-14066
464+
data = 'a\n04.15.2016'
465+
466+
expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
467+
result = self.read_csv(StringIO(data), parse_dates=['a'],
468+
thousands='.')
469+
tm.assert_frame_equal(result, expected)
470+
471+
exp_index = DatetimeIndex(['2016-04-15'], name='a')
472+
expected = DataFrame(index=exp_index)
473+
result = self.read_csv(StringIO(data), index_col=0,
474+
parse_dates=True, thousands='.')
475+
tm.assert_frame_equal(result, expected)
476+
477+
data = 'a,b\n04.15.2016,09.16.2013'
478+
479+
expected = DataFrame([[datetime(2016, 4, 15),
480+
datetime(2013, 9, 16)]],
481+
columns=['a', 'b'])
482+
result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
483+
thousands='.')
484+
tm.assert_frame_equal(result, expected)
485+
486+
expected = DataFrame([[datetime(2016, 4, 15),
487+
datetime(2013, 9, 16)]],
488+
columns=['a', 'b'])
489+
expected = expected.set_index(['a', 'b'])
490+
result = self.read_csv(StringIO(data), index_col=[0, 1],
491+
parse_dates=True, thousands='.')
492+
tm.assert_frame_equal(result, expected)

pandas/io/tests/parser/usecols.py

+16-21
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,12 @@
55
for all of the parsers defined in parsers.py
66
"""
77

8-
from datetime import datetime
98
import nose
109

1110
import numpy as np
1211
import pandas.util.testing as tm
1312

14-
from pandas import DataFrame
13+
from pandas import DataFrame, Index
1514
from pandas.lib import Timestamp
1615
from pandas.compat import StringIO
1716

@@ -99,35 +98,31 @@ def test_usecols_index_col_False(self):
9998

10099
def test_usecols_index_col_conflict(self):
101100
# see gh-4201: test that index_col as integer reflects usecols
102-
data = """SecId,Time,Price,P2,P3
103-
10000,2013-5-11,100,10,1
104-
500,2013-5-12,101,11,1
105-
"""
106-
expected = DataFrame({'Price': [100, 101]}, index=[
107-
datetime(2013, 5, 11), datetime(2013, 5, 12)])
108-
expected.index.name = 'Time'
101+
data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
102+
expected = DataFrame({'c': [1, 2]}, index=Index(
103+
['a', 'b'], name='b'))
109104

110-
df = self.read_csv(StringIO(data), usecols=[
111-
'Time', 'Price'], parse_dates=True, index_col=0)
105+
df = self.read_csv(StringIO(data), usecols=['b', 'c'],
106+
index_col=0)
112107
tm.assert_frame_equal(expected, df)
113108

114-
df = self.read_csv(StringIO(data), usecols=[
115-
'Time', 'Price'], parse_dates=True, index_col='Time')
109+
df = self.read_csv(StringIO(data), usecols=['b', 'c'],
110+
index_col='b')
116111
tm.assert_frame_equal(expected, df)
117112

118-
df = self.read_csv(StringIO(data), usecols=[
119-
1, 2], parse_dates=True, index_col='Time')
113+
df = self.read_csv(StringIO(data), usecols=[1, 2],
114+
index_col='b')
120115
tm.assert_frame_equal(expected, df)
121116

122-
df = self.read_csv(StringIO(data), usecols=[
123-
1, 2], parse_dates=True, index_col=0)
117+
df = self.read_csv(StringIO(data), usecols=[1, 2],
118+
index_col=0)
124119
tm.assert_frame_equal(expected, df)
125120

126121
expected = DataFrame(
127-
{'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)})
128-
expected = expected.set_index(['Price', 'P2'])
129-
df = self.read_csv(StringIO(data), usecols=[
130-
'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
122+
{'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
123+
expected = expected.set_index(['b', 'c'])
124+
df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
125+
index_col=['b', 'c'])
131126
tm.assert_frame_equal(expected, df)
132127

133128
def test_usecols_implicit_index_col(self):

0 commit comments

Comments
 (0)