Skip to content

Commit bc0022e

Browse files
jbrockmendel authored; Yi Wei committed
TYP: parsers (pandas-dev#52993)
* REF: avoid modifying self.index_col. * REF: remove unused; do less in init. * REF: simplify parsers calls. * Use orig_names. * Simplify. * Stronger typing (sorta). * TYP/REF: make parsers less stateful. * Final.
1 parent: 88b6396 · commit: bc0022e

File tree

4 files changed

+48
-46
lines changed

4 files changed

+48
-46
lines changed

pandas/io/parsers/arrow_parser_wrapper.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@ def _parse_kwds(self):
3636
encoding: str | None = self.kwds.get("encoding")
3737
self.encoding = "utf-8" if encoding is None else encoding
3838

39-
self.usecols, self.usecols_dtype = self._validate_usecols_arg(
40-
self.kwds["usecols"]
41-
)
4239
na_values = self.kwds["na_values"]
4340
if isinstance(na_values, dict):
4441
raise ValueError(
@@ -121,13 +118,15 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
121118
# we only need the frame not the names
122119
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
123120
if self.index_col is not None:
121+
index_to_set = self.index_col.copy()
124122
for i, item in enumerate(self.index_col):
125123
if is_integer(item):
126-
self.index_col[i] = frame.columns[item]
124+
index_to_set[i] = frame.columns[item]
127125
# String case
128126
elif item not in frame.columns:
129127
raise ValueError(f"Index {item} invalid")
130-
frame.set_index(self.index_col, drop=True, inplace=True)
128+
129+
frame.set_index(index_to_set, drop=True, inplace=True)
131130
# Clear names if headerless and no name given
132131
if self.header is None and not multi_index_named:
133132
frame.index.names = [None] * len(frame.index.names)

pandas/io/parsers/base_parser.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,17 @@ class BadLineHandleMethod(Enum):
102102
WARN = 1
103103
SKIP = 2
104104

105-
_implicit_index: bool = False
105+
_implicit_index: bool
106106
_first_chunk: bool
107+
keep_default_na: bool
108+
dayfirst: bool
109+
cache_dates: bool
110+
keep_date_col: bool
111+
usecols_dtype: str | None
107112

108113
def __init__(self, kwds) -> None:
114+
self._implicit_index = False
115+
109116
self.names = kwds.get("names")
110117
self.orig_names: Sequence[Hashable] | None = None
111118

@@ -155,15 +162,19 @@ def __init__(self, kwds) -> None:
155162

156163
# validate index_col that only contains integers
157164
if self.index_col is not None:
158-
if not (
165+
# In this case we can pin down index_col as list[int]
166+
if is_integer(self.index_col):
167+
self.index_col = [self.index_col]
168+
elif not (
159169
is_list_like(self.index_col, allow_sets=False)
160170
and all(map(is_integer, self.index_col))
161-
or is_integer(self.index_col)
162171
):
163172
raise ValueError(
164173
"index_col must only contain row numbers "
165174
"when specifying a multi-index header"
166175
)
176+
else:
177+
self.index_col = list(self.index_col)
167178

168179
self._name_processed = False
169180

@@ -428,6 +439,7 @@ def _get_name(icol):
428439

429440
return index
430441

442+
@final
431443
def _clean_mapping(self, mapping):
432444
"""converts col numbers to names"""
433445
if not isinstance(mapping, dict):
@@ -656,6 +668,7 @@ def _set(x) -> int:
656668

657669
return noconvert_columns
658670

671+
@final
659672
def _infer_types(
660673
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
661674
) -> tuple[ArrayLike, int]:
@@ -760,6 +773,7 @@ def _infer_types(
760773

761774
return result, na_count
762775

776+
@final
763777
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
764778
"""
765779
Cast values to specified type
@@ -847,6 +861,7 @@ def _do_date_conversions(
847861
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
848862
...
849863

864+
@final
850865
def _do_date_conversions(
851866
self,
852867
names: Sequence[Hashable] | Index,
@@ -868,6 +883,7 @@ def _do_date_conversions(
868883

869884
return names, data
870885

886+
@final
871887
def _check_data_length(
872888
self,
873889
columns: Sequence[Hashable],
@@ -911,6 +927,7 @@ def _evaluate_usecols(
911927
) -> set[str]:
912928
...
913929

930+
@final
914931
def _evaluate_usecols(
915932
self,
916933
usecols: Callable[[Hashable], object] | set[str] | set[int],
@@ -927,6 +944,7 @@ def _evaluate_usecols(
927944
return {i for i, name in enumerate(names) if usecols(name)}
928945
return usecols
929946

947+
@final
930948
def _validate_usecols_names(self, usecols, names: Sequence):
931949
"""
932950
Validates that all usecols are present in a given
@@ -958,6 +976,7 @@ def _validate_usecols_names(self, usecols, names: Sequence):
958976

959977
return usecols
960978

979+
@final
961980
def _validate_usecols_arg(self, usecols):
962981
"""
963982
Validate the 'usecols' parameter.
@@ -1007,6 +1026,7 @@ def _validate_usecols_arg(self, usecols):
10071026
return usecols, usecols_dtype
10081027
return usecols, None
10091028

1029+
@final
10101030
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
10111031
if not is_index_col(index_col):
10121032
return None, columns, index_col
@@ -1044,11 +1064,13 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
10441064

10451065
return index_names, columns, index_col
10461066

1047-
def _get_empty_meta(
1048-
self, columns, index_col, index_names, dtype: DtypeArg | None = None
1049-
):
1067+
@final
1068+
def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
10501069
columns = list(columns)
10511070

1071+
index_col = self.index_col
1072+
index_names = self.index_names
1073+
10521074
# Convert `dtype` to a defaultdict of some kind.
10531075
# This will enable us to write `dtype[col_name]`
10541076
# without worrying about KeyError issues later on.
@@ -1319,7 +1341,7 @@ def _try_convert_dates(
13191341
return new_name, new_col, colnames
13201342

13211343

1322-
def _get_na_values(col, na_values, na_fvalues, keep_default_na):
1344+
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
13231345
"""
13241346
Get the NaN values for a given column.
13251347

pandas/io/parsers/c_parser_wrapper.py

+1-14
Original file line numberDiff line numberDiff line change
@@ -245,9 +245,7 @@ def read(
245245
)
246246
index, columns, col_dict = self._get_empty_meta(
247247
names,
248-
self.index_col,
249-
self.index_names,
250-
dtype=self.kwds.get("dtype"),
248+
dtype=self.dtype,
251249
)
252250
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
253251

@@ -344,17 +342,6 @@ def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
344342
]
345343
return names
346344

347-
def _get_index_names(self):
348-
names = list(self._reader.header[0])
349-
idx_names = None
350-
351-
if self._reader.leading_cols == 0 and self.index_col is not None:
352-
(idx_names, names, self.index_col) = self._clean_index_names(
353-
names, self.index_col
354-
)
355-
356-
return names, idx_names
357-
358345
def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
359346
if try_parse_dates and self._should_parse_dates(index):
360347
values = self._date_conv(

pandas/io/parsers/python_parser.py

+14-20
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
EmptyDataError,
2929
ParserError,
3030
)
31+
from pandas.util._decorators import cache_readonly
3132

3233
from pandas.core.dtypes.common import (
3334
is_bool_dtype,
@@ -65,6 +66,8 @@
6566

6667

6768
class PythonParser(ParserBase):
69+
_no_thousands_columns: set[int]
70+
6871
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
6972
"""
7073
Workhorse function for processing nested list into DataFrame
@@ -97,8 +100,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
97100
self.quoting = kwds["quoting"]
98101
self.skip_blank_lines = kwds["skip_blank_lines"]
99102

100-
self.names_passed = kwds["names"] or None
101-
102103
self.has_index_names = False
103104
if "has_index_names" in kwds:
104105
self.has_index_names = kwds["has_index_names"]
@@ -116,7 +117,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
116117
self.data = cast(Iterator[str], f)
117118
else:
118119
assert hasattr(f, "readline")
119-
self._make_reader(f)
120+
self.data = self._make_reader(f)
120121

121122
# Get columns in two steps: infer from data, then
122123
# infer column indices from self.usecols if it is specified.
@@ -148,9 +149,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
148149
# multiple date column thing turning into a real spaghetti factory
149150

150151
if not self._has_complex_date_col:
151-
(index_names, self.orig_names, self.columns) = self._get_index_name(
152-
self.columns
153-
)
152+
(index_names, self.orig_names, self.columns) = self._get_index_name()
154153
self._name_processed = True
155154
if self.index_names is None:
156155
self.index_names = index_names
@@ -164,6 +163,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
164163
if len(self.decimal) != 1:
165164
raise ValueError("Only length-1 decimal markers supported")
166165

166+
@cache_readonly
167+
def num(self) -> re.Pattern:
167168
decimal = re.escape(self.decimal)
168169
if self.thousands is None:
169170
regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
@@ -173,9 +174,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
173174
rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
174175
rf"([0-9]?(E|e)\-?[0-9]+)?$"
175176
)
176-
self.num = re.compile(regex)
177+
return re.compile(regex)
177178

178-
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
179+
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]):
179180
sep = self.delimiter
180181

181182
if sep is None or len(sep) == 1:
@@ -237,10 +238,7 @@ def _read():
237238

238239
reader = _read()
239240

240-
# error: Incompatible types in assignment (expression has type "_reader",
241-
# variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
242-
# TextIOWrapper, mmap, None]")
243-
self.data = reader # type: ignore[assignment]
241+
return reader
244242

245243
def read(
246244
self, rows: int | None = None
@@ -270,11 +268,8 @@ def read(
270268
self.index_col, # type: ignore[has-type]
271269
),
272270
)
273-
# error: Cannot determine type of 'index_col'
274271
index, columns, col_dict = self._get_empty_meta(
275272
names,
276-
self.index_col, # type: ignore[has-type]
277-
self.index_names,
278273
self.dtype,
279274
)
280275
conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
@@ -908,10 +903,8 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
908903
def _clear_buffer(self) -> None:
909904
self.buf = []
910905

911-
_implicit_index = False
912-
913906
def _get_index_name(
914-
self, columns: Sequence[Hashable]
907+
self,
915908
) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
916909
"""
917910
Try several cases to get lines:
@@ -924,6 +917,7 @@ def _get_index_name(
924917
1 lists index columns and row 0 lists normal columns.
925918
2) Get index from the columns if it was listed.
926919
"""
920+
columns: Sequence[Hashable] = self.orig_names
927921
orig_names = list(columns)
928922
columns = list(columns)
929923

@@ -1317,8 +1311,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
13171311
self.infer_nrows = kwds.pop("infer_nrows")
13181312
PythonParser.__init__(self, f, **kwds)
13191313

1320-
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
1321-
self.data = FixedWidthReader(
1314+
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader:
1315+
return FixedWidthReader(
13221316
f,
13231317
self.colspecs,
13241318
self.delimiter,

0 commit comments

Comments (0)