REF: Unify _set_noconvert_dtype_columns for parsers

phofl · phofl · commit 19e5d3539d79 · 2021-01-24T01:54:59.000+01:00
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -172,6 +172,8 @@ def __init__(self, kwds):
 
         self._first_chunk = True
 
+        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
+
         self.handles: Optional[IOHandles] = None
 
     def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -546,6 +548,65 @@ def _convert_to_ndarrays(
                 print(f"Filled {na_count} NA values in column {c!s}")
         return result
 
+    def _set_noconvert_dtype_columns(self, col_indices, names):
+        """
+        Collect the columns that should not undergo dtype conversions.
+
+        Currently these are the columns involved with date parsing. Returns a
+        set of the resolved column entries; nothing is modified on ``self``.
+        """
+        noconvert_columns = set()
+        if self.usecols_dtype == "integer":
+            # Sort the integer usecols so that a positional entry indexes
+            # into them in ascending column order.
+            usecols = list(self.usecols)
+            usecols.sort()
+        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
+            # Callable or otherwise non-empty usecols: positional entries
+            # are looked up in col_indices as supplied by the caller.
+            usecols = col_indices
+        else:
+            # No usecols to index into (usecols_dtype is "empty" or None).
+
+            # mypy infers the type of usecols from the assignments above
+            # (List[Any]), so assigning None here is reported as an
+            # incompatible assignment; hence the ignore below.
+            usecols = None  # type: ignore[assignment]
+
+        def _set(x):
+            if usecols is not None and is_integer(x):
+                x = usecols[x]
+            # Anything still non-integer is a label; map it via names.
+            if not is_integer(x):
+                x = col_indices[names.index(x)]
+
+            noconvert_columns.add(x)
+
+        if isinstance(self.parse_dates, list):
+            for val in self.parse_dates:
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+
+        elif isinstance(self.parse_dates, dict):
+            for val in self.parse_dates.values():
+                if isinstance(val, list):
+                    for k in val:
+                        _set(k)
+                else:
+                    _set(val)
+
+        elif self.parse_dates:
+            if isinstance(self.index_col, list):
+                for k in self.index_col:
+                    _set(k)
+            elif self.index_col is not None:
+                _set(self.index_col)
+
+        return noconvert_columns
+
     def _infer_types(self, values, na_values, try_num_bool=True):
         """
         Infer types of values, possibly casting
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -1,8 +1,6 @@
 import pandas._libs.parsers as parsers
 from pandas._typing import FilePathOrBuffer
 
-from pandas.core.dtypes.common import is_integer
-
 from pandas.core.indexes.api import ensure_index_from_sequences
 
 from pandas.io.parsers.base_parser import ParserBase, is_index_col
@@ -19,7 +17,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
         kwds["allow_leading_cols"] = self.index_col is not False
 
         # GH20529, validate usecol arg before TextReader
-        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
         kwds["usecols"] = self.usecols
 
         # open handles
@@ -159,58 +156,11 @@ def _set_noconvert_columns(self):
         Currently, any column that is involved with date parsing will not
         undergo such conversions.
         """
-        names = self.orig_names
-        if self.usecols_dtype == "integer":
-            # A set of integers will be converted to a list in
-            # the correct order every single time.
-            usecols = list(self.usecols)
-            usecols.sort()
-        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
-            # The names attribute should have the correct columns
-            # in the proper order for indexing with parse_dates.
-            usecols = self.names[:]
-        else:
-            # Usecols is empty.
-
-            # pandas\io\parsers.py:2030: error: Incompatible types in
-            # assignment (expression has type "None", variable has type
-            # "List[Any]")  [assignment]
-            usecols = None  # type: ignore[assignment]
-
-        def _set(x):
-            if usecols is not None and is_integer(x):
-                x = usecols[x]
-
-            if not is_integer(x):
-                # assert for mypy, names is List or None, None would error when calling
-                # .index()
-                assert names is not None
-                x = names.index(x)
-
-            self._reader.set_noconvert(x)
-
-        if isinstance(self.parse_dates, list):
-            for val in self.parse_dates:
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif isinstance(self.parse_dates, dict):
-            for val in self.parse_dates.values():
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif self.parse_dates:
-            if isinstance(self.index_col, list):
-                for k in self.index_col:
-                    _set(k)
-            elif self.index_col is not None:
-                _set(self.index_col)
+        assert self.orig_names is not None
+        col_indices = [self.orig_names.index(x) for x in self.names]
+        noconvert_columns = self._set_noconvert_dtype_columns(col_indices, self.names)
+        for col in noconvert_columns:
+            self._reader.set_noconvert(col)
 
     def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -53,7 +53,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
         self.skipinitialspace = kwds["skipinitialspace"]
         self.lineterminator = kwds["lineterminator"]
         self.quoting = kwds["quoting"]
-        self.usecols, _ = self._validate_usecols_arg(kwds["usecols"])
         self.skip_blank_lines = kwds["skip_blank_lines"]
 
         self.warn_bad_lines = kwds["warn_bad_lines"]
@@ -137,7 +136,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
 
         self._validate_parse_dates_presence(self.columns)
         if self.parse_dates:
-            self._no_thousands_columns = self._set_no_thousands_columns()
+            self._no_thousands_columns = self._set_noconvert_dtype_columns(
+                self._col_indices, self.columns
+            )
         else:
             self._no_thousands_columns = None
 
@@ -155,44 +156,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
             )
         self.num = re.compile(regex)
 
-    def _set_no_thousands_columns(self):
-        # Create a set of column ids that are not to be stripped of thousands
-        # operators.
-        noconvert_columns = set()
-
-        def _set(x):
-            if is_integer(x):
-                noconvert_columns.add(x)
-            else:
-                assert self._col_indices is not None
-                col_indices = self._col_indices
-                noconvert_columns.add(col_indices[self.columns.index(x)])
-
-        if isinstance(self.parse_dates, list):
-            for val in self.parse_dates:
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif isinstance(self.parse_dates, dict):
-            for val in self.parse_dates.values():
-                if isinstance(val, list):
-                    for k in val:
-                        _set(k)
-                else:
-                    _set(val)
-
-        elif self.parse_dates:
-            if isinstance(self.index_col, list):
-                for k in self.index_col:
-                    _set(k)
-            elif self.index_col is not None:
-                _set(self.index_col)
-
-        return noconvert_columns
-
     def _make_reader(self, f):
         sep = self.delimiter
 
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1603,3 +1603,21 @@ def test_date_parser_and_names(all_parsers):
     result = parser.read_csv(data, parse_dates=["B"], names=["B"])
     expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_date_parser_usecols_thousands(all_parsers):
+    # GH# (TODO: issue number): thousands="-" must not mangle the date column
+    data = """A,B,C
+    1,3,20-09-01-01
+    2,4,20-09-01-01
+    """
+
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO(data),
+        parse_dates=[1],
+        usecols=[1, 2],
+        thousands="-",
+    )
+    expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
+    tm.assert_frame_equal(result, expected)