Return _T_STR from the StringMethods string-returning methods, except for slice, which keeps returning T because it preserves the input type

MarcoGorelli · MarcoGorelli · commit 208a55ca3427 · 2025-03-10T12:11:03.000Z
diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi
@@ -51,8 +51,8 @@ class StringMethods(
     Generic[T, _T_EXPANDING, _T_BOOL, _T_LIST_STR, _T_INT, _T_BYTES, _T_STR, _T_OBJECT],
 ):
     def __init__(self, data: T) -> None: ...
-    def __getitem__(self, key: slice | int) -> T: ...
-    def __iter__(self) -> T: ...
+    def __getitem__(self, key: slice | int) -> _T_STR: ...
+    def __iter__(self) -> _T_STR: ...
     @overload
     def cat(
         self,
@@ -79,7 +79,7 @@ class StringMethods(
         sep: str = ...,
         na_rep: str | None = ...,
         join: JoinHow = ...,
-    ) -> T: ...
+    ) -> _T_STR: ...
     @overload
     def split(
         self, pat: str = ..., *, n: int = ..., expand: Literal[True], regex: bool = ...
@@ -121,7 +121,7 @@ class StringMethods(
     def rpartition(self, sep: str, expand: Literal[False]) -> _T_OBJECT: ...
     @overload
     def rpartition(self, *, expand: Literal[False]) -> _T_OBJECT: ...
-    def get(self, i: int) -> T: ...
+    def get(self, i: int) -> _T_STR: ...
     def join(self, sep: str) -> _T_STR: ...
     def contains(
         self,
@@ -142,29 +142,29 @@ class StringMethods(
         case: bool | None = ...,
         flags: int = ...,
         regex: bool = ...,
-    ) -> T: ...
-    def repeat(self, repeats: int | Sequence[int]) -> T: ...
+    ) -> _T_STR: ...
+    def repeat(self, repeats: int | Sequence[int]) -> _T_STR: ...
     def pad(
         self,
         width: int,
         side: Literal["left", "right", "both"] = ...,
         fillchar: str = ...,
-    ) -> T: ...
-    def center(self, width: int, fillchar: str = ...) -> T: ...
-    def ljust(self, width: int, fillchar: str = ...) -> T: ...
-    def rjust(self, width: int, fillchar: str = ...) -> T: ...
-    def zfill(self, width: int) -> T: ...
+    ) -> _T_STR: ...
+    def center(self, width: int, fillchar: str = ...) -> _T_STR: ...
+    def ljust(self, width: int, fillchar: str = ...) -> _T_STR: ...
+    def rjust(self, width: int, fillchar: str = ...) -> _T_STR: ...
+    def zfill(self, width: int) -> _T_STR: ...
     def slice(
         self, start: int | None = ..., stop: int | None = ..., step: int | None = ...
     ) -> T: ...
     def slice_replace(
         self, start: int | None = ..., stop: int | None = ..., repl: str | None = ...
-    ) -> T: ...
+    ) -> _T_STR: ...
     def decode(self, encoding: str, errors: str = ...) -> _T_STR: ...
     def encode(self, encoding: str, errors: str = ...) -> _T_BYTES: ...
-    def strip(self, to_strip: str | None = ...) -> T: ...
-    def lstrip(self, to_strip: str | None = ...) -> T: ...
-    def rstrip(self, to_strip: str | None = ...) -> T: ...
+    def strip(self, to_strip: str | None = ...) -> _T_STR: ...
+    def lstrip(self, to_strip: str | None = ...) -> _T_STR: ...
+    def rstrip(self, to_strip: str | None = ...) -> _T_STR: ...
     def wrap(
         self,
         width: int,
@@ -173,9 +173,9 @@ class StringMethods(
         drop_whitespace: bool | None = ...,
         break_long_words: bool | None = ...,
         break_on_hyphens: bool | None = ...,
-    ) -> T: ...
+    ) -> _T_STR: ...
     def get_dummies(self, sep: str = ...) -> _T_EXPANDING: ...
-    def translate(self, table: dict[int, int | str | None] | None) -> T: ...
+    def translate(self, table: dict[int, int | str | None] | None) -> _T_STR: ...
     def count(self, pat: str, flags: int = ...) -> _T_INT: ...
     def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _T_BOOL: ...
     def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _T_BOOL: ...
@@ -193,16 +193,16 @@ class StringMethods(
     def extractall(self, pat: str, flags: int = ...) -> pd.DataFrame: ...
     def find(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
     def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
-    def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> T: ...
+    def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> _T_STR: ...
     def index(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
     def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
     def len(self) -> _T_INT: ...
-    def lower(self) -> T: ...
-    def upper(self) -> T: ...
-    def title(self) -> T: ...
-    def capitalize(self) -> T: ...
-    def swapcase(self) -> T: ...
-    def casefold(self) -> T: ...
+    def lower(self) -> _T_STR: ...
+    def upper(self) -> _T_STR: ...
+    def title(self) -> _T_STR: ...
+    def capitalize(self) -> _T_STR: ...
+    def swapcase(self) -> _T_STR: ...
+    def casefold(self) -> _T_STR: ...
     def isalnum(self) -> _T_BOOL: ...
     def isalpha(self) -> _T_BOOL: ...
     def isdigit(self) -> _T_BOOL: ...
@@ -215,5 +215,5 @@ class StringMethods(
     def fullmatch(
         self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
     ) -> _T_BOOL: ...
-    def removeprefix(self, prefix: str) -> T: ...
-    def removesuffix(self, suffix: str) -> T: ...
+    def removeprefix(self, prefix: str) -> _T_STR: ...
+    def removesuffix(self, suffix: str) -> _T_STR: ...
diff --git a/tests/test_string_accessors.py b/tests/test_string_accessors.py
@@ -13,70 +13,21 @@
 
 
 DATA = ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+DATA_BYTES = [b"applep", b"bananap"]
 
 
 def test_string_accessors_type_preserving_series() -> None:
-    s = pd.Series(DATA)
-    _check = functools.partial(check, klass=pd.Series, dtype=str)
-    _check(assert_type(s.str.capitalize(), "pd.Series[str]"))
-    _check(assert_type(s.str.casefold(), "pd.Series[str]"))
-    check(assert_type(s.str.cat(sep="X"), str), str)
-    _check(assert_type(s.str.center(10), "pd.Series[str]"))
-    _check(assert_type(s.str.get(2), "pd.Series[str]"))
-    _check(assert_type(s.str.ljust(80), "pd.Series[str]"))
-    _check(assert_type(s.str.lower(), "pd.Series[str]"))
-    _check(assert_type(s.str.lstrip("a"), "pd.Series[str]"))
-    _check(assert_type(s.str.normalize("NFD"), "pd.Series[str]"))
-    _check(assert_type(s.str.pad(80, "right"), "pd.Series[str]"))
-    _check(assert_type(s.str.removeprefix("a"), "pd.Series[str]"))
-    _check(assert_type(s.str.removesuffix("e"), "pd.Series[str]"))
-    _check(assert_type(s.str.repeat(2), "pd.Series[str]"))
-    _check(assert_type(s.str.replace("a", "X"), "pd.Series[str]"))
-    _check(assert_type(s.str.rjust(80), "pd.Series[str]"))
-    _check(assert_type(s.str.rstrip(), "pd.Series[str]"))
-    _check(assert_type(s.str.slice(0, 4, 2), "pd.Series[str]"))
-    _check(assert_type(s.str.slice_replace(0, 2, "XX"), "pd.Series[str]"))
-    _check(assert_type(s.str.strip(), "pd.Series[str]"))
-    _check(assert_type(s.str.swapcase(), "pd.Series[str]"))
-    _check(assert_type(s.str.title(), "pd.Series[str]"))
-    _check(
-        assert_type(s.str.translate({241: "n"}), "pd.Series[str]"),
-    )
-    _check(assert_type(s.str.upper(), "pd.Series[str]"))
-    _check(assert_type(s.str.wrap(80), "pd.Series[str]"))
-    _check(assert_type(s.str.zfill(10), "pd.Series[str]"))
+    s_str = pd.Series(DATA)
+    s_bytes = pd.Series(DATA_BYTES)
+    check(assert_type(s_str.str.slice(0, 4, 2), "pd.Series[str]"), pd.Series, str)
+    check(assert_type(s_bytes.str.slice(0, 4, 2), "pd.Series[bytes]"), pd.Series, bytes)
 
 
 def test_string_accessors_type_preserving_index() -> None:
-    idx = pd.Index(DATA)
-    _check = functools.partial(check, klass=pd.Index, dtype=str)
-    _check(assert_type(idx.str.capitalize(), "pd.Index[str]"))
-    _check(assert_type(idx.str.casefold(), "pd.Index[str]"))
-    check(assert_type(idx.str.cat(sep="X"), str), str)
-    _check(assert_type(idx.str.center(10), "pd.Index[str]"))
-    _check(assert_type(idx.str.get(2), "pd.Index[str]"))
-    _check(assert_type(idx.str.ljust(80), "pd.Index[str]"))
-    _check(assert_type(idx.str.lower(), "pd.Index[str]"))
-    _check(assert_type(idx.str.lstrip("a"), "pd.Index[str]"))
-    _check(assert_type(idx.str.normalize("NFD"), "pd.Index[str]"))
-    _check(assert_type(idx.str.pad(80, "right"), "pd.Index[str]"))
-    _check(assert_type(idx.str.removeprefix("a"), "pd.Index[str]"))
-    _check(assert_type(idx.str.removesuffix("e"), "pd.Index[str]"))
-    _check(assert_type(idx.str.repeat(2), "pd.Index[str]"))
-    _check(assert_type(idx.str.replace("a", "X"), "pd.Index[str]"))
-    _check(assert_type(idx.str.rjust(80), "pd.Index[str]"))
-    _check(assert_type(idx.str.rstrip(), "pd.Index[str]"))
-    _check(assert_type(idx.str.slice(0, 4, 2), "pd.Index[str]"))
-    _check(assert_type(idx.str.slice_replace(0, 2, "XX"), "pd.Index[str]"))
-    _check(assert_type(idx.str.strip(), "pd.Index[str]"))
-    _check(assert_type(idx.str.swapcase(), "pd.Index[str]"))
-    _check(assert_type(idx.str.title(), "pd.Index[str]"))
-    _check(
-        assert_type(idx.str.translate({241: "n"}), "pd.Index[str]"),
-    )
-    _check(assert_type(idx.str.upper(), "pd.Index[str]"))
-    _check(assert_type(idx.str.wrap(80), "pd.Index[str]"))
-    _check(assert_type(idx.str.zfill(10), "pd.Index[str]"))
+    idx_str = pd.Index(DATA)
+    idx_bytes = pd.Index(DATA_BYTES)
+    check(assert_type(idx_str.str.slice(0, 4, 2), "pd.Index[str]"), pd.Index, str)
+    check(assert_type(idx_bytes.str.slice(0, 4, 2), "pd.Index[bytes]"), pd.Index, bytes)
 
 
 def test_string_accessors_boolean_series():
@@ -158,21 +109,73 @@ def test_string_accessors_integer_index():
 
 
 def test_string_accessors_string_series():
-    s = pd.Series([b"a1", b"b2", b"c3"])
+    s = pd.Series(DATA)
     _check = functools.partial(check, klass=pd.Series, dtype=str)
-    _check(assert_type(s.str.decode("utf-8"), "pd.Series[str]"))
-    s2 = pd.Series([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
-    _check(assert_type(s2.str.join("-"), "pd.Series[str]"))
+    _check(assert_type(s.str.capitalize(), "pd.Series[str]"))
+    _check(assert_type(s.str.casefold(), "pd.Series[str]"))
+    check(assert_type(s.str.cat(sep="X"), str), str)
+    _check(assert_type(s.str.center(10), "pd.Series[str]"))
+    _check(assert_type(s.str.get(2), "pd.Series[str]"))
+    _check(assert_type(s.str.ljust(80), "pd.Series[str]"))
+    _check(assert_type(s.str.lower(), "pd.Series[str]"))
+    _check(assert_type(s.str.lstrip("a"), "pd.Series[str]"))
+    _check(assert_type(s.str.normalize("NFD"), "pd.Series[str]"))
+    _check(assert_type(s.str.pad(80, "right"), "pd.Series[str]"))
+    _check(assert_type(s.str.removeprefix("a"), "pd.Series[str]"))
+    _check(assert_type(s.str.removesuffix("e"), "pd.Series[str]"))
+    _check(assert_type(s.str.repeat(2), "pd.Series[str]"))
+    _check(assert_type(s.str.replace("a", "X"), "pd.Series[str]"))
+    _check(assert_type(s.str.rjust(80), "pd.Series[str]"))
+    _check(assert_type(s.str.rstrip(), "pd.Series[str]"))
+    _check(assert_type(s.str.slice_replace(0, 2, "XX"), "pd.Series[str]"))
+    _check(assert_type(s.str.strip(), "pd.Series[str]"))
+    _check(assert_type(s.str.swapcase(), "pd.Series[str]"))
+    _check(assert_type(s.str.title(), "pd.Series[str]"))
+    _check(
+        assert_type(s.str.translate({241: "n"}), "pd.Series[str]"),
+    )
+    _check(assert_type(s.str.upper(), "pd.Series[str]"))
+    _check(assert_type(s.str.wrap(80), "pd.Series[str]"))
+    _check(assert_type(s.str.zfill(10), "pd.Series[str]"))
+    s_bytes = pd.Series([b"a1", b"b2", b"c3"])
+    _check(assert_type(s_bytes.str.decode("utf-8"), "pd.Series[str]"))
+    s_list = pd.Series([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
+    _check(assert_type(s_list.str.join("-"), "pd.Series[str]"))
 
 
 def test_string_accessors_string_index():
-    idx = pd.Index([b"a1", b"b2", b"c3"])
+    idx = pd.Index(DATA)
     _check = functools.partial(check, klass=pd.Index, dtype=str)
-    _check(assert_type(idx.str.decode("utf-8"), "pd.Index[str]"))
-    idx2: "pd.Index[list]" = pd.Index(
-        [["apple", "banana"], ["cherry", "date"], [1, "eggplant"]]
+    _check(assert_type(idx.str.capitalize(), "pd.Index[str]"))
+    _check(assert_type(idx.str.casefold(), "pd.Index[str]"))
+    check(assert_type(idx.str.cat(sep="X"), str), str)
+    _check(assert_type(idx.str.center(10), "pd.Index[str]"))
+    _check(assert_type(idx.str.get(2), "pd.Index[str]"))
+    _check(assert_type(idx.str.ljust(80), "pd.Index[str]"))
+    _check(assert_type(idx.str.lower(), "pd.Index[str]"))
+    _check(assert_type(idx.str.lstrip("a"), "pd.Index[str]"))
+    _check(assert_type(idx.str.normalize("NFD"), "pd.Index[str]"))
+    _check(assert_type(idx.str.pad(80, "right"), "pd.Index[str]"))
+    _check(assert_type(idx.str.removeprefix("a"), "pd.Index[str]"))
+    _check(assert_type(idx.str.removesuffix("e"), "pd.Index[str]"))
+    _check(assert_type(idx.str.repeat(2), "pd.Index[str]"))
+    _check(assert_type(idx.str.replace("a", "X"), "pd.Index[str]"))
+    _check(assert_type(idx.str.rjust(80), "pd.Index[str]"))
+    _check(assert_type(idx.str.rstrip(), "pd.Index[str]"))
+    _check(assert_type(idx.str.slice_replace(0, 2, "XX"), "pd.Index[str]"))
+    _check(assert_type(idx.str.strip(), "pd.Index[str]"))
+    _check(assert_type(idx.str.swapcase(), "pd.Index[str]"))
+    _check(assert_type(idx.str.title(), "pd.Index[str]"))
+    _check(
+        assert_type(idx.str.translate({241: "n"}), "pd.Index[str]"),
     )
-    _check(assert_type(idx2.str.join("-"), "pd.Index[str]"))
+    _check(assert_type(idx.str.upper(), "pd.Index[str]"))
+    _check(assert_type(idx.str.wrap(80), "pd.Index[str]"))
+    _check(assert_type(idx.str.zfill(10), "pd.Index[str]"))
+    idx_bytes = pd.Index([b"a1", b"b2", b"c3"])
+    _check(assert_type(idx_bytes.str.decode("utf-8"), "pd.Index[str]"))
+    idx_list = pd.Index([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
+    _check(assert_type(idx_list.str.join("-"), "pd.Index[str]"))
 
 
 def test_string_accessors_bytes_series():
@@ -325,6 +328,12 @@ def test_series_overloads_cat():
     )
     unknown_s = pd.DataFrame({"a": list("abcdefg")})["a"]
     check(assert_type(s.str.cat(unknown_s, sep=";"), "pd.Series[str]"), pd.Series, str)
+    check(assert_type(unknown_s.str.cat(s, sep=";"), "pd.Series[str]"), pd.Series, str)
+    check(
+        assert_type(unknown_s.str.cat(unknown_s, sep=";"), "pd.Series[str]"),
+        pd.Series,
+        str,
+    )
 
 
 def test_index_overloads_cat():
@@ -351,6 +360,14 @@ def test_index_overloads_cat():
     check(
         assert_type(idx.str.cat(unknown_idx, sep=";"), "pd.Index[str]"), pd.Index, str
     )
+    check(
+        assert_type(unknown_idx.str.cat(idx, sep=";"), "pd.Index[str]"), pd.Index, str
+    )
+    check(
+        assert_type(unknown_idx.str.cat(unknown_idx, sep=";"), "pd.Index[str]"),
+        pd.Index,
+        str,
+    )
 
 
 def test_series_overloads_extract():