BUG: Don't parse index column as numeric when parse_dates=True (pandas-dev#14077)

gfyoung · jorisvandenbossche · commit 9d10b76fa223 · 2016-08-27T15:31:30.000+02:00
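When a thousands parameter is specified and the index column is parsed as dates (parse_dates=True), the date strings may themselves contain the thousands character as a separator (e.g. '.' in '04.15.2016'). Exclude the index column(s) from numeric conversion in that case, so the character is not stripped as a thousands separator before date parsing. Closes gh-14066.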
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -1126,6 +1126,7 @@ Bug Fixes
 - Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`)
 - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`)
 
+- Bug in ``pd.read_csv()`` where index columns parsed as dates (``parse_dates=True``) were incorrectly converted to numeric first when a ``thousands`` parameter was specified (:issue:`14066`)
 - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
 - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1474,6 +1474,13 @@ def _set(x):
                 else:
                     _set(val)
 
+        elif self.parse_dates:
+            if isinstance(self.index_col, list):
+                for k in self.index_col:
+                    _set(k)
+            elif self.index_col is not None:
+                _set(self.index_col)
+
     def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
 
@@ -1856,6 +1863,14 @@ def _set(x):
                         _set(k)
                 else:
                     _set(val)
+
+        elif self.parse_dates:
+            if isinstance(self.index_col, list):
+                for k in self.index_col:
+                    _set(k)
+            elif self.index_col is not None:
+                _set(self.index_col)
+
         return noconvert_columns
 
     def _make_reader(self, f):
diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py
@@ -458,3 +458,35 @@ def test_parse_dates_empty_string(self):
         result = self.read_csv(StringIO(data), parse_dates=["Date"],
                                na_filter=False)
         self.assertTrue(result['Date'].isnull()[1])
+
+    def test_parse_dates_noconvert_thousands(self):
+        # see gh-14066
+        data = 'a\n04.15.2016'
+
+        expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
+        result = self.read_csv(StringIO(data), parse_dates=['a'],
+                               thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        exp_index = DatetimeIndex(['2016-04-15'], name='a')
+        expected = DataFrame(index=exp_index)
+        result = self.read_csv(StringIO(data), index_col=0,
+                               parse_dates=True, thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        data = 'a,b\n04.15.2016,09.16.2013'
+
+        expected = DataFrame([[datetime(2016, 4, 15),
+                               datetime(2013, 9, 16)]],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
+                               thousands='.')
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[datetime(2016, 4, 15),
+                               datetime(2013, 9, 16)]],
+                             columns=['a', 'b'])
+        expected = expected.set_index(['a', 'b'])
+        result = self.read_csv(StringIO(data), index_col=[0, 1],
+                               parse_dates=True, thousands='.')
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -5,13 +5,12 @@
 for all of the parsers defined in parsers.py
 """
 
-from datetime import datetime
 import nose
 
 import numpy as np
 import pandas.util.testing as tm
 
-from pandas import DataFrame
+from pandas import DataFrame, Index
 from pandas.lib import Timestamp
 from pandas.compat import StringIO
 
@@ -99,35 +98,31 @@ def test_usecols_index_col_False(self):
 
     def test_usecols_index_col_conflict(self):
         # see gh-4201: test that index_col as integer reflects usecols
-        data = """SecId,Time,Price,P2,P3
-10000,2013-5-11,100,10,1
-500,2013-5-12,101,11,1
-"""
-        expected = DataFrame({'Price': [100, 101]}, index=[
-            datetime(2013, 5, 11), datetime(2013, 5, 12)])
-        expected.index.name = 'Time'
+        data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
+        expected = DataFrame({'c': [1, 2]}, index=Index(
+            ['a', 'b'], name='b'))
 
-        df = self.read_csv(StringIO(data), usecols=[
-            'Time', 'Price'], parse_dates=True, index_col=0)
+        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
+                           index_col=0)
         tm.assert_frame_equal(expected, df)
 
-        df = self.read_csv(StringIO(data), usecols=[
-            'Time', 'Price'], parse_dates=True, index_col='Time')
+        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
+                           index_col='b')
         tm.assert_frame_equal(expected, df)
 
-        df = self.read_csv(StringIO(data), usecols=[
-            1, 2], parse_dates=True, index_col='Time')
+        df = self.read_csv(StringIO(data), usecols=[1, 2],
+                           index_col='b')
         tm.assert_frame_equal(expected, df)
 
-        df = self.read_csv(StringIO(data), usecols=[
-            1, 2], parse_dates=True, index_col=0)
+        df = self.read_csv(StringIO(data), usecols=[1, 2],
+                           index_col=0)
         tm.assert_frame_equal(expected, df)
 
         expected = DataFrame(
-            {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)})
-        expected = expected.set_index(['Price', 'P2'])
-        df = self.read_csv(StringIO(data), usecols=[
-            'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
+            {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
+        expected = expected.set_index(['b', 'c'])
+        df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
+                           index_col=['b', 'c'])
         tm.assert_frame_equal(expected, df)
 
     def test_usecols_implicit_index_col(self):