TYP: Misc changes for pandas-stubs; use Protocol to avoid str in Sequence (#55263)

twoertwein · web-flow · commit 1f167626987f · 2023-09-26T08:48:56.000-07:00
* TYP: misc changes for pandas-stubs test

* rewrite the changes from #47233 using SequenceNotStr

* pyupgrade
diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -24,6 +24,7 @@
     Type as type_t,
     TypeVar,
     Union,
+    overload,
 )
 
 import numpy as np
@@ -85,6 +86,8 @@
     # Name "npt._ArrayLikeInt_co" is not defined  [name-defined]
     NumpySorter = Optional[npt._ArrayLikeInt_co]  # type: ignore[name-defined]
 
+    from typing import SupportsIndex
+
     if sys.version_info >= (3, 10):
         from typing import TypeGuard  # pyright: ignore[reportUnusedImport]
     else:
@@ -109,18 +112,48 @@
 
 # list-like
 
-# Cannot use `Sequence` because a string is a sequence, and we don't want to
-# accept that.  Could refine if https://github.com/python/typing/issues/256 is
-# resolved to differentiate between Sequence[str] and str
-ListLike = Union[AnyArrayLike, list, tuple, range]
+# from https://github.com/hauntsaninja/useful_types
+# includes Sequence-like objects but excludes str and bytes
+_T_co = TypeVar("_T_co", covariant=True)
+
+
+class SequenceNotStr(Protocol[_T_co]):
+    @overload
+    def __getitem__(self, index: SupportsIndex, /) -> _T_co:
+        ...
+
+    @overload
+    def __getitem__(self, index: slice, /) -> Sequence[_T_co]:
+        ...
+
+    def __contains__(self, value: object, /) -> bool:
+        ...
+
+    def __len__(self) -> int:
+        ...
+
+    def __iter__(self) -> Iterator[_T_co]:
+        ...
+
+    def index(self, value: Any, /, start: int = 0, stop: int = ...) -> int:
+        ...
+
+    def count(self, value: Any, /) -> int:
+        ...
+
+    def __reversed__(self) -> Iterator[_T_co]:
+        ...
+
+
+ListLike = Union[AnyArrayLike, SequenceNotStr, range]
 
 # scalars
 
 PythonScalar = Union[str, float, bool]
 DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"]
 PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
 Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date]
-IntStrT = TypeVar("IntStrT", int, str)
+IntStrT = TypeVar("IntStrT", bound=Union[int, str])
 
 
 # timestamp and timedelta convertible types
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -240,6 +240,7 @@
         Renamer,
         Scalar,
         Self,
+        SequenceNotStr,
         SortKind,
         StorageOptions,
         Suffixes,
@@ -1187,7 +1188,7 @@ def to_string(
         buf: None = ...,
         columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | list[str] = ...,
+        header: bool | SequenceNotStr[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1212,7 +1213,7 @@ def to_string(
         buf: FilePath | WriteBuffer[str],
         columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | list[str] = ...,
+        header: bool | SequenceNotStr[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1250,7 +1251,7 @@ def to_string(
         buf: FilePath | WriteBuffer[str] | None = None,
         columns: Axes | None = None,
         col_space: int | list[int] | dict[Hashable, int] | None = None,
-        header: bool | list[str] = True,
+        header: bool | SequenceNotStr[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: fmt.FormattersType | None = None,
@@ -10563,9 +10564,9 @@ def merge(
         self,
         right: DataFrame | Series,
         how: MergeHow = "inner",
-        on: IndexLabel | None = None,
-        left_on: IndexLabel | None = None,
-        right_on: IndexLabel | None = None,
+        on: IndexLabel | AnyArrayLike | None = None,
+        left_on: IndexLabel | AnyArrayLike | None = None,
+        right_on: IndexLabel | AnyArrayLike | None = None,
         left_index: bool = False,
         right_index: bool = False,
         sort: bool = False,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -72,6 +72,7 @@
     Renamer,
     Scalar,
     Self,
+    SequenceNotStr,
     SortKind,
     StorageOptions,
     Suffixes,
@@ -3273,7 +3274,7 @@ def to_latex(
         self,
         buf: None = ...,
         columns: Sequence[Hashable] | None = ...,
-        header: bool_t | list[str] = ...,
+        header: bool_t | SequenceNotStr[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3300,7 +3301,7 @@ def to_latex(
         self,
         buf: FilePath | WriteBuffer[str],
         columns: Sequence[Hashable] | None = ...,
-        header: bool_t | list[str] = ...,
+        header: bool_t | SequenceNotStr[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3330,7 +3331,7 @@ def to_latex(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
         columns: Sequence[Hashable] | None = None,
-        header: bool_t | list[str] = True,
+        header: bool_t | SequenceNotStr[str] = True,
         index: bool_t = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
@@ -301,7 +301,7 @@ def describe_timestamp_as_categorical_1d(
     names = ["count", "unique"]
     objcounts = data.value_counts()
     count_unique = len(objcounts[objcounts != 0])
-    result = [data.count(), count_unique]
+    result: list[float | Timestamp] = [data.count(), count_unique]
     dtype = None
     if count_unique > 0:
         top, freq = objcounts.index[0], objcounts.iloc[0]
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1541,7 +1541,7 @@ def count(self):
 
         return result
 
-    def quantile(self, q: float | AnyArrayLike = 0.5, **kwargs):
+    def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs):
         """
         Return value at the given quantile.
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -138,9 +138,9 @@ def merge(
     left: DataFrame | Series,
     right: DataFrame | Series,
     how: MergeHow = "inner",
-    on: IndexLabel | None = None,
-    left_on: IndexLabel | None = None,
-    right_on: IndexLabel | None = None,
+    on: IndexLabel | AnyArrayLike | None = None,
+    left_on: IndexLabel | AnyArrayLike | None = None,
+    right_on: IndexLabel | AnyArrayLike | None = None,
     left_index: bool = False,
     right_index: bool = False,
     sort: bool = False,
@@ -187,9 +187,9 @@ def merge(
 def _cross_merge(
     left: DataFrame,
     right: DataFrame,
-    on: IndexLabel | None = None,
-    left_on: IndexLabel | None = None,
-    right_on: IndexLabel | None = None,
+    on: IndexLabel | AnyArrayLike | None = None,
+    left_on: IndexLabel | AnyArrayLike | None = None,
+    right_on: IndexLabel | AnyArrayLike | None = None,
     left_index: bool = False,
     right_index: bool = False,
     sort: bool = False,
@@ -239,7 +239,9 @@ def _cross_merge(
     return res
 
 
-def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
+def _groupby_and_merge(
+    by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces
+):
     """
     groupby & merge; we are always performing a left-by type operation
 
@@ -255,7 +257,7 @@ def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
         by = [by]
 
     lby = left.groupby(by, sort=False)
-    rby: groupby.DataFrameGroupBy | None = None
+    rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None
 
     # if we can groupby the rhs
     # then we can get vastly better perf
@@ -295,8 +297,8 @@ def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
 
 
 def merge_ordered(
-    left: DataFrame,
-    right: DataFrame,
+    left: DataFrame | Series,
+    right: DataFrame | Series,
     on: IndexLabel | None = None,
     left_on: IndexLabel | None = None,
     right_on: IndexLabel | None = None,
@@ -737,9 +739,9 @@ def __init__(
         left: DataFrame | Series,
         right: DataFrame | Series,
         how: MergeHow | Literal["asof"] = "inner",
-        on: IndexLabel | None = None,
-        left_on: IndexLabel | None = None,
-        right_on: IndexLabel | None = None,
+        on: IndexLabel | AnyArrayLike | None = None,
+        left_on: IndexLabel | AnyArrayLike | None = None,
+        right_on: IndexLabel | AnyArrayLike | None = None,
         left_index: bool = False,
         right_index: bool = False,
         sort: bool = True,
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2141,7 +2141,7 @@ def groupby(
     # Statistics, overridden ndarray methods
 
     # TODO: integrate bottleneck
-    def count(self):
+    def count(self) -> int:
         """
         Return number of non-NA/null observations in the Series.
 
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import writers as libwriters
+from pandas._typing import SequenceNotStr
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.generic import (
@@ -109,7 +110,7 @@ def decimal(self) -> str:
         return self.fmt.decimal
 
     @property
-    def header(self) -> bool | list[str]:
+    def header(self) -> bool | SequenceNotStr[str]:
         return self.fmt.header
 
     @property
@@ -213,7 +214,7 @@ def _need_to_save_header(self) -> bool:
         return bool(self._has_aliases or self.header)
 
     @property
-    def write_cols(self) -> Sequence[Hashable]:
+    def write_cols(self) -> SequenceNotStr[Hashable]:
         if self._has_aliases:
             assert not isinstance(self.header, bool)
             if len(self.header) != len(self.cols):
@@ -224,7 +225,7 @@ def write_cols(self) -> Sequence[Hashable]:
         else:
             # self.cols is an ndarray derived from Index._format_native_types,
             #  so its entries are strings, i.e. hashable
-            return cast(Sequence[Hashable], self.cols)
+            return cast(SequenceNotStr[Hashable], self.cols)
 
     @property
     def encoded_labels(self) -> list[Hashable]:
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -105,6 +105,7 @@
         FloatFormatType,
         FormattersType,
         IndexLabel,
+        SequenceNotStr,
         StorageOptions,
         WriteBuffer,
     )
@@ -566,7 +567,7 @@ def __init__(
         frame: DataFrame,
         columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool | list[str] = True,
+        header: bool | SequenceNotStr[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -3161,8 +3161,6 @@ def dtype_backend_data() -> DataFrame:
 @pytest.fixture
 def dtype_backend_expected():
     def func(storage, dtype_backend, conn_name):
-        string_array: StringArray | ArrowStringArray
-        string_array_na: StringArray | ArrowStringArray
         if storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))