BUG: Fix parse_dates processing with usecols and C engine

gfyoung · gfyoung · commit 83caa3b3852f · 2016-04-06T14:33:58.000+01:00
Fixes a bug in the handling of 'parse_dates' with the C engine when 'usecols' is also specified: the wrong indices (those of the filtered column names) were being used to determine which date columns the C engine should skip during dtype conversion. The correct indices are those of the original (unfiltered) column names, since those are the ones used later in the actual data processing. Closes gh-9755.
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -236,3 +236,9 @@ Bug Fixes
 - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
 - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
 - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
+
+
+
+
+
+- Bug in ``read_csv`` with the C engine where specifying ``usecols`` and ``parse_dates`` simultaneously caused the wrong columns to be excluded from dtype conversion (:issue:`9755`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1157,18 +1157,21 @@ def __init__(self, src, **kwds):
             else:
                 self.names = lrange(self._reader.table_width)
 
-        # If the names were inferred (not passed by user) and usedcols is
-        # defined, then ensure names refers to the used columns, not the
-        # document's columns.
-        if self.usecols and passed_names:
-            col_indices = []
-            for u in self.usecols:
-                if isinstance(u, string_types):
-                    col_indices.append(self.names.index(u))
-                else:
-                    col_indices.append(u)
-            self.names = [n for i, n in enumerate(self.names)
-                          if i in col_indices]
+        # gh-9755
+        #
+        # need to set orig_names here first
+        # so that proper indexing can be done
+        # with _set_noconvert_columns
+        #
+        # once names has been filtered, we will
+        # then set orig_names again to names
+        self.orig_names = self.names[:]
+
+        if self.usecols:
+            if len(self.names) > len(self.usecols):
+                self.names = [n for i, n in enumerate(self.names)
+                              if (i in self.usecols or n in self.usecols)]
+
             if len(self.names) < len(self.usecols):
                 raise ValueError("Usecols do not match names.")
 
@@ -1194,13 +1197,17 @@ def __init__(self, src, **kwds):
         self._implicit_index = self._reader.leading_cols > 0
 
     def _set_noconvert_columns(self):
-        names = self.names
+        names = self.orig_names
+        usecols = self.usecols
 
         def _set(x):
-            if com.is_integer(x):
-                self._reader.set_noconvert(x)
-            else:
-                self._reader.set_noconvert(names.index(x))
+            if usecols and com.is_integer(x):
+                x = list(usecols)[x]
+
+            if not com.is_integer(x):
+                x = names.index(x)
+
+            self._reader.set_noconvert(x)
 
         if isinstance(self.parse_dates, list):
             for val in self.parse_dates:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2682,12 +2682,84 @@ def test_uneven_lines_with_usecols(self):
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 1]
+        usecols = ['a', 'b']
         df = self.read_csv(StringIO(csv), usecols=usecols)
         tm.assert_frame_equal(df, expected)
 
-        usecols = ['a', 'b']
-        df = self.read_csv(StringIO(csv), usecols=usecols)
+    def test_usecols_with_parse_dates(self):
+        # See gh-9755
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_full_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('abcde')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_parse_dates_and_usecol_names(self):
+        # See gh-9755
+        s = """0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+        parse_dates = [[1, 2]]
+        names = list('acd')
+
+        cols = {
+            'a'  : [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[0, 2, 3],
+                      parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        df = read_csv(StringIO(s), names=names,
+                      usecols=[3, 0, 2],
+                      parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)