Skip to content

Introduce UnknownSeries and UnknownIndex, type core.strings.pyi using them #1146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
0d48f36
make typing in pandas_stubs.core.strings.pyi strict, add UnknownSeries…
MarcoGorelli Mar 6, 2025
ca10bf2
undo pyproject.toml changes
MarcoGorelli Mar 6, 2025
4b8183d
use class, use pyright: strict
MarcoGorelli Mar 6, 2025
def6eea
update pyright
MarcoGorelli Mar 6, 2025
9c5b33a
reduce diff
MarcoGorelli Mar 6, 2025
9b63e3f
fixup
MarcoGorelli Mar 6, 2025
fd6188a
fixup
MarcoGorelli Mar 6, 2025
bcdd40e
include UnknownSeries in str.cat
MarcoGorelli Mar 6, 2025
dba1bda
move UnknownSeries and UnknownIndex location
MarcoGorelli Mar 6, 2025
6a31e87
use typealias
MarcoGorelli Mar 7, 2025
5edf982
use Series[str] as .cat return type
MarcoGorelli Mar 7, 2025
9a47508
use -> T so it matches other .str methods like .str.upper
MarcoGorelli Mar 7, 2025
0fabb99
use _TS2 for findall
MarcoGorelli Mar 7, 2025
427a707
add test to cover passing UnknownSeries to cat
MarcoGorelli Mar 7, 2025
de28385
preserve type in series.str
MarcoGorelli Mar 7, 2025
e40d245
simplify
MarcoGorelli Mar 7, 2025
92dc75d
use Mapping instead of dict as it is invariant
MarcoGorelli Mar 7, 2025
231b54d
fixup
MarcoGorelli Mar 7, 2025
45b8da0
split out into separate file
MarcoGorelli Mar 8, 2025
385b1bd
split out into separate file
MarcoGorelli Mar 8, 2025
412b1ab
type check boolean return values
MarcoGorelli Mar 8, 2025
2463ce9
integer return type
MarcoGorelli Mar 8, 2025
b0cade6
integer return type
MarcoGorelli Mar 8, 2025
29710a4
strings and bytes
MarcoGorelli Mar 8, 2025
3298868
list
MarcoGorelli Mar 8, 2025
5dfa7fa
expanding
MarcoGorelli Mar 8, 2025
3d581a8
fixup
MarcoGorelli Mar 8, 2025
005759c
keep fixing
MarcoGorelli Mar 8, 2025
aca32d5
keep fixing
MarcoGorelli Mar 8, 2025
b244308
overloads cat
MarcoGorelli Mar 8, 2025
0d1fc59
fixup str.extract
MarcoGorelli Mar 8, 2025
7ccfa0d
rename for clarity
MarcoGorelli Mar 8, 2025
b4839a0
lint
MarcoGorelli Mar 8, 2025
17e280f
annotate idx2 as per mypy's request
MarcoGorelli Mar 8, 2025
208a55c
return _T_STR, except for `slice` because that one preserves the input…
MarcoGorelli Mar 10, 2025
3dc660e
mypy fixup
MarcoGorelli Mar 10, 2025
b2d4657
disallow .str on certain series types
Dr-Irv Mar 10, 2025
ce7575e
Revert "disallow .str on certain series types"
MarcoGorelli Mar 11, 2025
98cb162
Merge remote-tracking branch 'upstream/main' into strict-strings-typing
MarcoGorelli Mar 11, 2025
3e24de0
use Index of list[str]
MarcoGorelli Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pandas-stubs/core/indexes/base.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ from typing import (
Any,
ClassVar,
Literal,
TypeAlias,
final,
overload,
)
Expand Down Expand Up @@ -455,6 +456,8 @@ class Index(IndexOpsMixin[S1]):
),
) -> Self: ...

UnknownIndex: TypeAlias = Index[Any]

def ensure_index_from_sequences(
sequences: Sequence[Sequence[Dtype]], names: list[str] = ...
) -> Index: ...
Expand Down
4 changes: 3 additions & 1 deletion pandas-stubs/core/series.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1156,7 +1156,7 @@ class Series(IndexOpsMixin[S1], NDFrame):
@property
def str(
self,
) -> StringMethods[Series, DataFrame, Series[bool], Series[list[str]]]: ...
) -> StringMethods[Self, DataFrame, Series[bool], Series[list[str]]]: ...
@property
def dt(self) -> CombinedDatetimelikeProperties: ...
@property
Expand Down Expand Up @@ -2295,3 +2295,5 @@ class IntervalSeries(Series[Interval[_OrderableT]], Generic[_OrderableT]):
@property
def array(self) -> IntervalArray: ...
def diff(self, periods: int = ...) -> Never: ...

UnknownSeries: TypeAlias = Series[Any]
56 changes: 39 additions & 17 deletions pandas-stubs/core/strings.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# pyright: strict
from collections.abc import (
Callable,
Sequence,
Expand All @@ -12,6 +13,7 @@ from typing import (
)

import numpy as np
import numpy.typing as npt
import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -21,8 +23,10 @@ from pandas import (
)
from pandas.core.base import NoNewAttributesMixin

from pandas._libs.tslibs.nattype import NaTType
from pandas._typing import (
JoinHow,
Scalar,
T,
np_ndarray_bool,
)
Expand Down Expand Up @@ -58,7 +62,9 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
@overload
def cat(
self,
others: Series | pd.Index | pd.DataFrame | np.ndarray | list[Any],
others: (
Series[str] | Index[str] | pd.DataFrame | npt.NDArray[np.str_] | list[str]
),
sep: str = ...,
na_rep: str | None = ...,
join: JoinHow = ...,
Expand Down Expand Up @@ -89,27 +95,31 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
@overload
def partition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
@overload
def partition(self, sep: str, expand: Literal[False]) -> T: ...
def partition(
self, sep: str, expand: Literal[False]
) -> pd.Series[type[object]]: ...
@overload
def partition(self, *, expand: Literal[False]) -> T: ...
def partition(self, *, expand: Literal[False]) -> pd.Series[type[object]]: ...
@overload
def rpartition(self, sep: str = ...) -> pd.DataFrame: ...
@overload
def rpartition(self, *, expand: Literal[True]) -> pd.DataFrame: ...
@overload
def rpartition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
@overload
def rpartition(self, sep: str, expand: Literal[False]) -> T: ...
def rpartition(
self, sep: str, expand: Literal[False]
) -> pd.Series[type[object]]: ...
@overload
def rpartition(self, *, expand: Literal[False]) -> T: ...
def rpartition(self, *, expand: Literal[False]) -> pd.Series[type[object]]: ...
def get(self, i: int) -> T: ...
def join(self, sep: str) -> T: ...
def contains(
self,
pat: str | re.Pattern,
pat: str | re.Pattern[str],
case: bool = ...,
flags: int = ...,
na=...,
na: Scalar | NaTType | None = ...,
regex: bool = ...,
) -> Series[bool]: ...
def match(
Expand All @@ -118,7 +128,7 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
def replace(
self,
pat: str,
repl: str | Callable[[re.Match], str],
repl: str | Callable[[re.Match[str]], str],
n: int = ...,
case: bool | None = ...,
flags: int = ...,
Expand All @@ -141,8 +151,8 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
def slice_replace(
self, start: int | None = ..., stop: int | None = ..., repl: str | None = ...
) -> T: ...
def decode(self, encoding: str, errors: str = ...) -> T: ...
def encode(self, encoding: str, errors: str = ...) -> T: ...
def decode(self, encoding: str, errors: str = ...) -> Series[str]: ...
def encode(self, encoding: str, errors: str = ...) -> Series[bytes]: ...
def strip(self, to_strip: str | None = ...) -> T: ...
def lstrip(self, to_strip: str | None = ...) -> T: ...
def rstrip(self, to_strip: str | None = ...) -> T: ...
Expand All @@ -160,21 +170,33 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
def count(self, pat: str, flags: int = ...) -> Series[int]: ...
def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
def findall(self, pat: str, flags: int = ...) -> Series: ...
def findall(self, pat: str, flags: int = ...) -> _TS2: ...
@overload
def extract(
self, pat: str, flags: int = ..., *, expand: Literal[True] = ...
) -> pd.DataFrame: ...
@overload
def extract(self, pat: str, flags: int, expand: Literal[False]) -> T: ...
def extract(
self, pat: str, flags: int, expand: Literal[False]
) -> Series[type[object]]: ...
@overload
def extract(self, pat: str, flags: int = ..., *, expand: Literal[False]) -> T: ...
def extract(
self, pat: str, flags: int = ..., *, expand: Literal[False]
) -> Series[type[object]]: ...
def extractall(self, pat: str, flags: int = ...) -> pd.DataFrame: ...
def find(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def find(
self, sub: str, start: int = ..., end: int | None = ...
) -> Series[int]: ...
def rfind(
self, sub: str, start: int = ..., end: int | None = ...
) -> Series[int]: ...
def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> T: ...
def index(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def index(
self, sub: str, start: int = ..., end: int | None = ...
) -> Series[int]: ...
def rindex(
self, sub: str, start: int = ..., end: int | None = ...
) -> Series[int]: ...
def len(self) -> Series[int]: ...
def lower(self) -> T: ...
def upper(self) -> T: ...
Expand Down
113 changes: 77 additions & 36 deletions tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,13 @@
OffsetSeries,
TimedeltaSeries,
TimestampSeries,
UnknownSeries,
)
else:
TimedeltaSeries: TypeAlias = pd.Series
TimestampSeries: TypeAlias = pd.Series
OffsetSeries: TypeAlias = pd.Series
UnknownSeries: TypeAlias = pd.Series

if TYPE_CHECKING:
from pandas._typing import (
Expand All @@ -88,6 +90,7 @@
)
from pandas._typing import np_ndarray_int # noqa: F401


# Tests will use numpy 2.1 in python 3.10 or later
# From Numpy 2.1 __init__.pyi
_DTypeKind: TypeAlias = Literal[
Expand Down Expand Up @@ -1574,31 +1577,32 @@ def test_string_accessors():
)
s2 = pd.Series([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
s3 = pd.Series(["a1", "b2", "c3"])
check(assert_type(s.str.capitalize(), pd.Series), pd.Series)
check(assert_type(s.str.casefold(), pd.Series), pd.Series)
s4 = pd.Series([b"a1", b"b2", b"c3"])
check(assert_type(s.str.capitalize(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.casefold(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.cat(sep="X"), str), str)
check(assert_type(s.str.center(10), pd.Series), pd.Series)
check(assert_type(s.str.center(10), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.contains("a"), "pd.Series[bool]"), pd.Series, np.bool_)
check(
assert_type(s.str.contains(re.compile(r"a")), "pd.Series[bool]"),
pd.Series,
np.bool_,
)
check(assert_type(s.str.count("pp"), "pd.Series[int]"), pd.Series, np.integer)
check(assert_type(s.str.decode("utf-8"), pd.Series), pd.Series)
check(assert_type(s.str.encode("latin-1"), pd.Series), pd.Series)
check(assert_type(s4.str.decode("utf-8"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.encode("latin-1"), "pd.Series[bytes]"), pd.Series, bytes)
check(assert_type(s.str.endswith("e"), "pd.Series[bool]"), pd.Series, np.bool_)
check(
assert_type(s.str.endswith(("e", "f")), "pd.Series[bool]"), pd.Series, np.bool_
)
check(assert_type(s3.str.extract(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
check(assert_type(s3.str.extractall(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
check(assert_type(s.str.find("p"), pd.Series), pd.Series)
check(assert_type(s.str.findall("pp"), pd.Series), pd.Series)
check(assert_type(s.str.find("p"), "pd.Series[int]"), pd.Series, np.int64)
check(assert_type(s.str.findall("pp"), "pd.Series[list[str]]"), pd.Series, list)
check(assert_type(s.str.fullmatch("apple"), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s.str.get(2), pd.Series), pd.Series)
check(assert_type(s.str.get(2), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.get_dummies(), pd.DataFrame), pd.DataFrame)
check(assert_type(s.str.index("p"), pd.Series), pd.Series)
check(assert_type(s.str.index("p"), "pd.Series[int]"), pd.Series, np.int64)
check(assert_type(s.str.isalnum(), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s.str.isalpha(), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s.str.isdecimal(), "pd.Series[bool]"), pd.Series, np.bool_)
Expand All @@ -1610,20 +1614,20 @@ def test_string_accessors():
check(assert_type(s.str.isupper(), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s2.str.join("-"), pd.Series), pd.Series)
check(assert_type(s.str.len(), "pd.Series[int]"), pd.Series, np.integer)
check(assert_type(s.str.ljust(80), pd.Series), pd.Series)
check(assert_type(s.str.lower(), pd.Series), pd.Series)
check(assert_type(s.str.lstrip("a"), pd.Series), pd.Series)
check(assert_type(s.str.ljust(80), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.lower(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.lstrip("a"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.match("pp"), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s.str.normalize("NFD"), pd.Series), pd.Series)
check(assert_type(s.str.pad(80, "right"), pd.Series), pd.Series)
check(assert_type(s.str.normalize("NFD"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.pad(80, "right"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.partition("p"), pd.DataFrame), pd.DataFrame)
check(assert_type(s.str.removeprefix("a"), pd.Series), pd.Series)
check(assert_type(s.str.removesuffix("e"), pd.Series), pd.Series)
check(assert_type(s.str.repeat(2), pd.Series), pd.Series)
check(assert_type(s.str.replace("a", "X"), pd.Series), pd.Series)
check(assert_type(s.str.rfind("e"), pd.Series), pd.Series)
check(assert_type(s.str.rindex("p"), pd.Series), pd.Series)
check(assert_type(s.str.rjust(80), pd.Series), pd.Series)
check(assert_type(s.str.removeprefix("a"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.removesuffix("e"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.repeat(2), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.replace("a", "X"), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.rfind("e"), "pd.Series[int]"), pd.Series, np.int64)
check(assert_type(s.str.rindex("p"), "pd.Series[int]"), pd.Series, np.int64)
check(assert_type(s.str.rjust(80), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.rpartition("p"), pd.DataFrame), pd.DataFrame)
check(assert_type(s.str.rsplit("a"), "pd.Series[list[str]]"), pd.Series, list)
check(assert_type(s.str.rsplit("a", expand=True), pd.DataFrame), pd.DataFrame)
Expand All @@ -1632,9 +1636,11 @@ def test_string_accessors():
pd.Series,
list,
)
check(assert_type(s.str.rstrip(), pd.Series), pd.Series)
check(assert_type(s.str.slice(0, 4, 2), pd.Series), pd.Series)
check(assert_type(s.str.slice_replace(0, 2, "XX"), pd.Series), pd.Series)
check(assert_type(s.str.rstrip(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.slice(0, 4, 2), "pd.Series[str]"), pd.Series, str)
check(
assert_type(s.str.slice_replace(0, 2, "XX"), "pd.Series[str]"), pd.Series, str
)
check(assert_type(s.str.split("a"), "pd.Series[list[str]]"), pd.Series, list)
# GH 194
check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame)
Expand All @@ -1649,13 +1655,17 @@ def test_string_accessors():
pd.Series,
np.bool_,
)
check(assert_type(s.str.strip(), pd.Series), pd.Series)
check(assert_type(s.str.swapcase(), pd.Series), pd.Series)
check(assert_type(s.str.title(), pd.Series), pd.Series)
check(assert_type(s.str.translate(None), pd.Series), pd.Series)
check(assert_type(s.str.upper(), pd.Series), pd.Series)
check(assert_type(s.str.wrap(80), pd.Series), pd.Series)
check(assert_type(s.str.zfill(10), pd.Series), pd.Series)
check(assert_type(s.str.strip(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.swapcase(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.title(), "pd.Series[str]"), pd.Series, str)
check(
assert_type(s.str.translate({241: "n"}), "pd.Series[str]"),
pd.Series,
str,
)
check(assert_type(s.str.upper(), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.wrap(80), "pd.Series[str]"), pd.Series, str)
check(assert_type(s.str.zfill(10), "pd.Series[str]"), pd.Series, str)


def test_series_overloads_cat():
Expand All @@ -1665,9 +1675,23 @@ def test_series_overloads_cat():
check(assert_type(s.str.cat(sep=";"), str), str)
check(assert_type(s.str.cat(None, sep=";"), str), str)
check(
assert_type(s.str.cat(["A", "B", "C", "D", "E", "F", "G"], sep=";"), pd.Series),
assert_type(
s.str.cat(["A", "B", "C", "D", "E", "F", "G"], sep=";"),
"pd.Series[str]",
),
pd.Series,
str,
)
check(
assert_type(
s.str.cat(pd.Series(["A", "B", "C", "D", "E", "F", "G"]), sep=";"),
"pd.Series[str]",
),
pd.Series,
str,
)
unknown_s: UnknownSeries = pd.DataFrame({"a": ["a", "b"]})["a"]
check(assert_type(s.str.cat(unknown_s, sep=";"), "pd.Series[str]"), pd.Series, str)


def test_series_overloads_partition():
Expand All @@ -1686,13 +1710,21 @@ def test_series_overloads_partition():
check(
assert_type(s.str.partition(sep=";", expand=True), pd.DataFrame), pd.DataFrame
)
check(assert_type(s.str.partition(sep=";", expand=False), pd.Series), pd.Series)
check(
assert_type(s.str.partition(sep=";", expand=False), "pd.Series[type[object]]"),
pd.Series,
object,
)

check(assert_type(s.str.rpartition(sep=";"), pd.DataFrame), pd.DataFrame)
check(
assert_type(s.str.rpartition(sep=";", expand=True), pd.DataFrame), pd.DataFrame
)
check(assert_type(s.str.rpartition(sep=";", expand=False), pd.Series), pd.Series)
check(
assert_type(s.str.rpartition(sep=";", expand=False), "pd.Series[type[object]]"),
pd.Series,
object,
)


def test_series_overloads_extract():
Expand All @@ -1703,10 +1735,19 @@ def test_series_overloads_extract():
check(
assert_type(s.str.extract(r"[ab](\d)", expand=True), pd.DataFrame), pd.DataFrame
)
check(assert_type(s.str.extract(r"[ab](\d)", expand=False), pd.Series), pd.Series)
check(
assert_type(s.str.extract(r"[ab](\d)", re.IGNORECASE, False), pd.Series),
assert_type(
s.str.extract(r"[ab](\d)", expand=False), "pd.Series[type[object]]"
),
pd.Series,
object,
)
check(
assert_type(
s.str.extract(r"[ab](\d)", re.IGNORECASE, False), "pd.Series[type[object]]"
),
pd.Series,
object,
)


Expand Down