
Commit 4558e7d

TYP: Type parts of python parser (pandas-dev#45015)
* TYP: Type python parser
* Fix bug
* Fix assignment issue
* Address conflicts
* Remove unnecessary changes
* Adjust
1 parent de6b11d commit 4558e7d

File tree: 3 files changed (+54 -33 lines changed)

pandas/io/parsers/base_parser.py (+12 -11)
@@ -303,9 +303,7 @@ def _extract_multi_indexer_columns(
 
         # clean the index_names
         index_names = header.pop(-1)
-        index_names, _, _ = self._clean_index_names(
-            index_names, self.index_col, self.unnamed_cols
-        )
+        index_names, _, _ = self._clean_index_names(index_names, self.index_col)
 
         # extract the columns
         field_count = len(header[0])
@@ -381,21 +379,24 @@ def _maybe_make_multi_index_columns(
         return columns
 
     @final
-    def _make_index(self, data, alldata, columns, indexnamerow=False):
+    def _make_index(
+        self, data, alldata, columns, indexnamerow=False
+    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
+        index: Index | None
         if not is_index_col(self.index_col) or not self.index_col:
             index = None
 
         elif not self._has_complex_date_col:
-            index = self._get_simple_index(alldata, columns)
-            index = self._agg_index(index)
+            simple_index = self._get_simple_index(alldata, columns)
+            index = self._agg_index(simple_index)
         elif self._has_complex_date_col:
             if not self._name_processed:
                 (self.index_names, _, self.index_col) = self._clean_index_names(
-                    list(columns), self.index_col, self.unnamed_cols
+                    list(columns), self.index_col
                 )
                 self._name_processed = True
-            index = self._get_complex_date_index(data, columns)
-            index = self._agg_index(index, try_parse_dates=False)
+            date_index = self._get_complex_date_index(data, columns)
+            index = self._agg_index(date_index, try_parse_dates=False)
 
         # add names for the index
         if indexnamerow:
@@ -966,7 +967,7 @@ def _validate_usecols_arg(self, usecols):
             return usecols, usecols_dtype
         return usecols, None
 
-    def _clean_index_names(self, columns, index_col, unnamed_cols):
+    def _clean_index_names(self, columns, index_col):
         if not is_index_col(index_col):
             return None, columns, index_col
 
@@ -998,7 +999,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols):
 
         # Only clean index names that were placeholders.
         for i, name in enumerate(index_names):
-            if isinstance(name, str) and name in unnamed_cols:
+            if isinstance(name, str) and name in self.unnamed_cols:
                 index_names[i] = None
 
         return index_names, columns, index_col
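
Note on the pattern above (not part of the commit): `_make_index` now declares the eventual type of `index` once and gives intermediate results their own names (`simple_index`, `date_index`) instead of re-binding one variable to several types. A minimal, self-contained sketch of that pattern, with hypothetical names:

from __future__ import annotations


def load_raw() -> str:
    # Stand-in for a step that returns a differently typed intermediate value.
    return "42"


def load(flag: bool) -> int | None:
    result: int | None  # annotated once, assigned in every branch
    if not flag:
        result = None
    else:
        raw_value = load_raw()  # the str intermediate lives under its own name
        result = int(raw_value)  # the annotated name only ever holds int | None
    return result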

pandas/io/parsers/c_parser_wrapper.py (+11 -8)
@@ -172,7 +172,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds):
                 self.names,  # type: ignore[has-type]
                 # error: Cannot determine type of 'index_col'
                 self.index_col,  # type: ignore[has-type]
-                self.unnamed_cols,
             )
 
             if self.index_names is None:
@@ -220,6 +219,8 @@ def read(
         Sequence[Hashable] | MultiIndex,
         Mapping[Hashable, ArrayLike],
     ]:
+        index: Index | MultiIndex | None
+        column_names: Sequence[Hashable] | MultiIndex
         try:
             if self.low_memory:
                 chunks = self._reader.read_low_memory(nrows)
@@ -284,7 +285,12 @@ def read(
             data_tups = sorted(data.items())
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
-            names, date_data = self._do_date_conversions(names, data)
+            column_names, date_data = self._do_date_conversions(names, data)
+
+            # maybe create a mi on the columns
+            column_names = self._maybe_make_multi_index_columns(
+                column_names, self.col_names
+            )
 
         else:
             # rename dict keys
@@ -308,12 +314,9 @@ def read(
             data = {k: v for k, (i, v) in zip(names, data_tups)}
 
             names, date_data = self._do_date_conversions(names, data)
-            index, names = self._make_index(date_data, alldata, names)
-
-            # maybe create a mi on the columns
-            conv_names = self._maybe_make_multi_index_columns(names, self.col_names)
+            index, column_names = self._make_index(date_data, alldata, names)
 
-        return index, conv_names, date_data
+        return index, column_names, date_data
 
     def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
         # hackish
@@ -330,7 +333,7 @@ def _get_index_names(self):
 
         if self._reader.leading_cols == 0 and self.index_col is not None:
             (idx_names, names, self.index_col) = self._clean_index_names(
-                names, self.index_col, self.unnamed_cols
+                names, self.index_col
             )
 
         return names, idx_names
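
Note on the `read` changes above (not part of the commit): mypy infers a variable's type from its first binding, so re-assigning a value of a different type to `names` would be rejected; the result therefore moves to a separately annotated `column_names`. A small sketch of the same idea, with hypothetical names:

from __future__ import annotations


def as_labels(raw: list[str]) -> tuple[str, ...]:
    return tuple(raw)


def demo(raw: list[str]) -> tuple[str, ...]:
    # Re-binding raw here would fail type checking:
    # raw = as_labels(raw)  # error: incompatible types in assignment
    column_names: tuple[str, ...] = as_labels(raw)  # fresh, annotated name
    return column_names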

pandas/io/parsers/python_parser.py (+31 -14)
@@ -13,6 +13,7 @@
     DefaultDict,
     Hashable,
     Iterator,
+    List,
     Literal,
     Mapping,
     Sequence,
@@ -37,6 +38,11 @@
 from pandas.core.dtypes.common import is_integer
 from pandas.core.dtypes.inference import is_dict_like
 
+from pandas import (
+    Index,
+    MultiIndex,
+)
+
 from pandas.io.parsers.base_parser import (
     ParserBase,
     parser_defaults,
@@ -167,7 +173,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
         )
         self.num = re.compile(regex)
 
-    def _make_reader(self, f) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         sep = self.delimiter
 
         if sep is None or len(sep) == 1:
@@ -198,10 +204,11 @@ class MyDialect(csv.Dialect):
                     self.pos += 1
                     line = f.readline()
                     lines = self._check_comments([[line]])[0]
+                lines_str = cast(List[str], lines)
 
                 # since `line` was a string, lines will be a list containing
                 # only a single string
-                line = lines[0]
+                line = lines_str[0]
 
                 self.pos += 1
                 self.line_pos += 1
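
For context (hypothetical example, not pandas code): `typing.cast` is a runtime no-op that only narrows the static type, which is what `lines_str = cast(List[str], lines)` relies on when the surrounding code knows the nested lists hold strings.

from typing import List, cast


def first_cell(rows: list[list[object]]) -> str:
    row = rows[0]
    # The caller guarantees strings here; cast() changes nothing at runtime,
    # it only tells the type checker to treat row as List[str].
    row_str = cast(List[str], row)
    return row_str[0]


print(first_cell([["a", "b"]]))  # prints "a"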
@@ -233,7 +240,11 @@ def _read():
         # TextIOWrapper, mmap, None]")
         self.data = reader  # type: ignore[assignment]
 
-    def read(self, rows: int | None = None):
+    def read(
+        self, rows: int | None = None
+    ) -> tuple[
+        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
+    ]:
         try:
             content = self._get_lines(rows)
         except StopIteration:
@@ -273,9 +284,11 @@ def read(self, rows: int | None = None):
         conv_data = self._convert_data(data)
         columns, conv_data = self._do_date_conversions(columns, conv_data)
 
-        index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)
+        index, result_columns = self._make_index(
+            conv_data, alldata, columns, indexnamerow
+        )
 
-        return index, columns, conv_data
+        return index, result_columns, conv_data
 
     def _exclude_implicit_index(
         self,
@@ -586,7 +599,7 @@ def _handle_usecols(
             self._col_indices = sorted(col_indices)
         return columns
 
-    def _buffered_line(self):
+    def _buffered_line(self) -> list[Scalar]:
         """
         Return a line from buffer, filling buffer if required.
         """
@@ -878,7 +891,9 @@ def _clear_buffer(self) -> None:
 
     _implicit_index = False
 
-    def _get_index_name(self, columns: list[Hashable]):
+    def _get_index_name(
+        self, columns: list[Hashable]
+    ) -> tuple[list[Hashable] | None, list[Hashable], list[Hashable]]:
         """
         Try several cases to get lines:
 
@@ -943,8 +958,8 @@ def _get_index_name(self, columns: list[Hashable]):
 
         else:
             # Case 2
-            (index_name, columns_, self.index_col) = self._clean_index_names(
-                columns, self.index_col, self.unnamed_cols
+            (index_name, _, self.index_col) = self._clean_index_names(
+                columns, self.index_col
             )
 
         return index_name, orig_names, columns
@@ -1036,7 +1051,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
         ]
         return zipped_content
 
-    def _get_lines(self, rows: int | None = None):
+    def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
         lines = self.buf
         new_rows = None
 
@@ -1133,7 +1148,7 @@ class FixedWidthReader(abc.Iterator):
 
     def __init__(
         self,
-        f: IO[str],
+        f: IO[str] | ReadCsvBuffer[str],
         colspecs: list[tuple[int, int]] | Literal["infer"],
         delimiter: str | None,
         comment: str | None,
@@ -1230,14 +1245,16 @@ def detect_colspecs(
         return edge_pairs
 
     def __next__(self) -> list[str]:
+        # Argument 1 to "next" has incompatible type "Union[IO[str],
+        # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
         if self.buffer is not None:
             try:
                 line = next(self.buffer)
             except StopIteration:
                 self.buffer = None
-                line = next(self.f)
+                line = next(self.f)  # type: ignore[arg-type]
         else:
-            line = next(self.f)
+            line = next(self.f)  # type: ignore[arg-type]
         # Note: 'colspecs' is a sequence of half-open intervals.
         return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
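For context (hypothetical types, not the pandas classes): the `# type: ignore[arg-type]` above is scoped to a single error code and is paired with a comment quoting the mypy message, so unrelated errors on the same line still surface. A minimal sketch of that style:

from __future__ import annotations

from typing import Iterator, Sequence, Union


def first_line(src: Union[Iterator[str], Sequence[str]]) -> str:
    # error: Argument 1 to "next" has incompatible type
    # "Union[Iterator[str], Sequence[str]]"; expected "SupportsNext[str]"
    # Callers always pass an iterator, so only this error code is silenced.
    return next(src)  # type: ignore[arg-type]


print(first_line(iter(["a", "b"])))  # prints "a"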

@@ -1254,7 +1271,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
         self.infer_nrows = kwds.pop("infer_nrows")
         PythonParser.__init__(self, f, **kwds)
 
-    def _make_reader(self, f: IO[str]) -> None:
+    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
         self.data = FixedWidthReader(
             f,
             self.colspecs,
