REF: Unify _set_noconvert_dtype_columns for parsers (#39365)

phofl · web-flow · commit bc3adf2afa80 · 2021-01-27T09:01:35.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -325,6 +325,7 @@ I/O
 - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`)
 - Bug in :meth:`~HDFStore.put` raising a wrong ``TypeError`` when saving a DataFrame with non-string dtype (:issue:`34274`)
 - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
+- Bug in :func:`read_csv` applying the thousands separator to date columns when the column should be parsed for dates and ``usecols`` is specified for ``engine="python"`` (:issue:`39365`)
 - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
 - :func:`read_excel` now respects :func:`set_option` (:issue:`34252`)
 - Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -172,6 +172,8 @@ def __init__(self, kwds):
 
         self._first_chunk = True
 
+        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
+
         self.handles: Optional[IOHandles] = None
 
     def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -546,6 +548,74 @@ def _convert_to_ndarrays(
                 print(f"Filled {na_count} NA values in column {c!s}")
         return result
 
+    def _set_noconvert_dtype_columns(
+        self, col_indices: List[int], names: List[Union[int, str]]
+    ) -> Set[int]:
+        """
+        Set the columns that should not undergo dtype conversions.
+
+        Currently, any column that is involved with date parsing will not
+        undergo such conversions. If usecols is specified, the positions of the columns
+        not to cast are relative to the usecols, not to all columns.
+
+        Parameters
+        ----------
+        col_indices: The indices specifying order and positions of the columns
+        names: The column names whose order corresponds to the order
+               of col_indices
+
+        Returns
+        -------
+        A set of integers containing the positions of the columns not to convert.
+        """
+        usecols: Optional[Union[List[int], List[str]]]
+        noconvert_columns = set()
+        if self.usecols_dtype == "integer":
+            # A set of integers will be converted to a list in
+            # the correct order every single time.
+            usecols = sorted(self.usecols)
+        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
+            # The names attribute should have the correct columns
+            # in the proper order for indexing with parse_dates.
+            usecols = col_indices
+        else:
+            # Usecols is empty.
+            usecols = None
+
+        def _set(x) -> int:
+            if usecols is not None and is_integer(x):
+                x = usecols[x]
+
+            if not is_integer(x):
+                x = col_indices[names.index(x)]
+
+            return x
+
+        if isinstance(self.parse_dates, list):
+            for val in self.parse_dates:
+                if isinstance(val, list):
+                    for k in val:
+                        noconvert_columns.add(_set(k))
+                else:
+                    noconvert_columns.add(_set(val))
+
+        elif isinstance(self.parse_dates, dict):
+            for val in self.parse_dates.values():
+                if isinstance(val, list):
+                    for k in val:
+                        noconvert_columns.add(_set(k))
+                else:
+                    noconvert_columns.add(_set(val))
+
+        elif self.parse_dates:
+            if isinstance(self.index_col, list):
+                for k in self.index_col:
+                    noconvert_columns.add(_set(k))
+            elif self.index_col is not None:
+                noconvert_columns.add(_set(self.index_col))
+
+        return noconvert_columns
+
     def _infer_types(self, values, na_values, try_num_bool=True):
         """
         Infer types of values, possibly casting
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -1,8 +1,6 @@
 import pandas._libs.parsers as parsers
 from pandas._typing import FilePathOrBuffer
 
-from pandas.core.dtypes.common import is_integer
-
 from pandas.core.indexes.api import ensure_index_from_sequences
 
 from pandas.io.parsers.base_parser import ParserBase, is_index_col
@@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
         kwds["allow_leading_cols"] = self.index_col is not False
 
         # GH20529, validate usecol arg before TextReader
-        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
         kwds["usecols"] = self.usecols
 
         # open handles
@@ -159,58 +156,11 @@ def _set_noconvert_columns(self):
         Currently, any column that is involved with date parsing will not
         undergo such conversions.
         """
-        names = self.orig_names
-        if self.usecols_dtype == "integer":
-            # A set of integers will be converted to a list in
-            # the correct order every single time.
-            usecols = list(self.usecols)
-            usecols.sort()
-        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
-            # The names attribute should have the correct columns
-            # in the proper order for indexing with parse_dates.
-            usecols = self.names[:]
-        else:
-            # Usecols is empty.
-
-            # pandas\io\parsers.py:2030: error: Incompatible types in
-            # assignment (expression has type "None", variable has type
-            # "List[Any]")  [assignment]
-            usecols = None  # type: ignore[assignment]
-
-        def _set(x):
-            if usecols is not None and is_integer(x):
-                x = usecols[x]
-
-            if not is_integer(x):
-                # assert for mypy, names is List or None, None would error when calling
-                # .index()
-                assert names is not None
-                x = names.index(x)
-
-            self._reader.set_noconvert(x)
-
-        if isinstance(self.parse_dates, list):
-            for val in self.parse_dates:
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif isinstance(self.parse_dates, dict):
-            for val in self.parse_dates.values():
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif self.parse_dates:
-            if isinstance(self.index_col, list):
-                for k in self.index_col:
-                    _set(k)
-            elif self.index_col is not None:
-                _set(self.index_col)
+        assert self.orig_names is not None
+        col_indices = [self.orig_names.index(x) for x in self.names]
+        noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names)
+        for col in noconvert_columns:
+            self._reader.set_noconvert(col)
 
     def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -3,7 +3,7 @@
 from io import StringIO
 import re
 import sys
-from typing import Iterator, List, Optional, cast
+from typing import Iterator, List, Optional, Set, cast
 
 import numpy as np
 
@@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
         self.skipinitialspace = kwds["skipinitialspace"]
         self.lineterminator = kwds["lineterminator"]
         self.quoting = kwds["quoting"]
-        self.usecols, _ = self._validate_usecols_arg(kwds["usecols"])
         self.skip_blank_lines = kwds["skip_blank_lines"]
 
         self.warn_bad_lines = kwds["warn_bad_lines"]
@@ -136,10 +135,12 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
             self._col_indices = list(range(len(self.columns)))
 
         self._validate_parse_dates_presence(self.columns)
+        no_thousands_columns: Optional[Set[int]] = None
         if self.parse_dates:
-            self._no_thousands_columns = self._set_no_thousands_columns()
-        else:
-            self._no_thousands_columns = None
+            no_thousands_columns = self._set_noconvert_dtype_columns(
+                self._col_indices, self.columns
+            )
+        self._no_thousands_columns = no_thousands_columns
 
         if len(self.decimal) != 1:
             raise ValueError("Only length-1 decimal markers supported")
@@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
             )
         self.num = re.compile(regex)
 
-    def _set_no_thousands_columns(self):
-        # Create a set of column ids that are not to be stripped of thousands
-        # operators.
-        noconvert_columns = set()
-
-        def _set(x):
-            if is_integer(x):
-                noconvert_columns.add(x)
-            else:
-                assert self._col_indices is not None
-                col_indices = self._col_indices
-                noconvert_columns.add(col_indices[self.columns.index(x)])
-
-        if isinstance(self.parse_dates, list):
-            for val in self.parse_dates:
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif isinstance(self.parse_dates, dict):
-            for val in self.parse_dates.values():
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif self.parse_dates:
-            if isinstance(self.index_col, list):
-                for k in self.index_col:
-                    _set(k)
-            elif self.index_col is not None:
-                _set(self.index_col)
-
-        return noconvert_columns
-
     def _make_reader(self, f):
         sep = self.delimiter
 
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers):
     result = parser.read_csv(data, parse_dates=["B"], names=["B"])
     expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_date_parser_usecols_thousands(all_parsers):
+    # GH#39365
+    data = """A,B,C
+    1,3,20-09-01-01
+    2,4,20-09-01-01
+    """
+
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO(data),
+        parse_dates=[1],
+        usecols=[1, 2],
+        thousands="-",
+    )
+    expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
+    tm.assert_frame_equal(result, expected)