From 38d5815bb9efbc43524e62eb71f6b0c33efd1072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 9 Mar 2022 22:41:32 -0500 Subject: [PATCH 01/10] TYP: sort_index --- pandas/_typing.py | 6 ++-- pandas/core/frame.py | 77 ++++++++++++++++++++++++++++++++------- pandas/core/generic.py | 62 ++++++++++++++++++++++++++++---- pandas/core/series.py | 82 ++++++++++++++++++++++++++++++++++-------- pandas/core/sorting.py | 16 ++++++--- 5 files changed, 204 insertions(+), 39 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index cabf0e8275d08..3b11aada96a15 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -307,9 +307,11 @@ def closed(self) -> bool: XMLParsers = Literal["lxml", "etree"] # Interval closed type - IntervalClosedType = Literal["left", "right", "both", "neither"] # datetime and NaTType - DatetimeNaTType = Union[datetime, "NaTType"] + +# sort_index +SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] +NaPosition = Literal["first", "last"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a554537896ab..ccc890d84fd17 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -61,10 +61,12 @@ IndexKeyFunc, IndexLabel, Level, + NaPosition, PythonFuncType, ReadBuffer, Renamer, Scalar, + SortKind, StorageOptions, Suffixes, TimedeltaConvertibleTypes, @@ -6422,19 +6424,68 @@ def sort_values( # type: ignore[override] else: return result.__finalize__(self, method="sort_values") - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame: + ... + + @overload def sort_index( + self, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> DataFrame | None: + ... + + # error: Signature of "sort_index" incompatible with supertype "NDFrame" + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def sort_index( # type: ignore[override] self, axis: Axis = 0, level: Level | None = None, ascending: bool | int | Sequence[bool | int] = True, inplace: bool = False, - kind: str = "quicksort", - na_position: str = "last", + kind: SortKind = "quicksort", + na_position: NaPosition = "last", sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, - ): + ) -> DataFrame | None: """ Sort object by labels (along an axis). @@ -6525,15 +6576,15 @@ def sort_index( d 4 """ return super().sort_index( - axis, - level, - ascending, - inplace, - kind, - na_position, - sort_remaining, - ignore_index, - key, + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, ) def value_counts( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 80af10383e24e..320ba572954d4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -50,9 +50,11 @@ JSONSerializable, Level, Manager, + NaPosition, NDFrameT, RandomState, Renamer, + SortKind, StorageOptions, T, TimedeltaConvertibleTypes, @@ -4672,18 +4674,66 @@ def sort_values( """ raise AbstractMethodError(self) + @overload def sort_index( self, - axis=0, - level=None, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool_t | int | Sequence[bool_t | int] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self: NDFrameT, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool_t | int | Sequence[bool_t | int] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> NDFrameT: + ... + + @overload + def sort_index( + self: NDFrameT, + *, + axis: Axis = ..., + level: Level | None = ..., + ascending: bool_t | int | Sequence[bool_t | int] = ..., + inplace: bool_t = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool_t = ..., + ignore_index: bool_t = ..., + key: IndexKeyFunc = ..., + ) -> NDFrameT | None: + ... + + def sort_index( + self: NDFrameT, + axis: Axis = 0, + level: Level | None = None, ascending: bool_t | int | Sequence[bool_t | int] = True, inplace: bool_t = False, - kind: str = "quicksort", - na_position: str = "last", + kind: SortKind = "quicksort", + na_position: NaPosition = "last", sort_remaining: bool_t = True, ignore_index: bool_t = False, key: IndexKeyFunc = None, - ): + ) -> NDFrameT | None: inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) @@ -4704,7 +4754,7 @@ def sort_index( if ignore_index: result.index = default_index(len(self)) if inplace: - return + return None else: return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 9b80bfe509634..ff006d6219006 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -39,7 +39,10 @@ DtypeObj, FillnaOptions, IndexKeyFunc, + Level, + NaPosition, SingleManager, + SortKind, StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -3571,15 +3574,66 @@ def sort_values( else: return result.__finalize__(self, method="sort_values") - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + # error: Signature of "sort_index" incompatible with supertype "NDFrame" + @overload # type: ignore[override] def sort_index( self, - axis=0, - level=None, + *, + axis: Literal[0] = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> None: + ... + + @overload + def sort_index( + self, + *, + axis: Literal[0] = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> Series: + ... + + @overload + def sort_index( + self, + *, + axis: Literal[0] = ..., + level: Level | None = ..., + ascending: bool | int | Sequence[bool | int] = ..., + inplace: bool = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + sort_remaining: bool = ..., + ignore_index: bool = ..., + key: IndexKeyFunc = ..., + ) -> Series | None: + ... + + # error: Argument 1 of "sort_index" is incompatible with supertype "NDFrame"; + # supertype defines the argument type as "Union[str, int]" + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def sort_index( # type: ignore[override] + self, + axis: Literal[0] = 0, + level: Level | None = None, ascending: bool | int | Sequence[bool | int] = True, inplace: bool = False, - kind: str = "quicksort", - na_position: str = "last", + kind: SortKind = "quicksort", + na_position: NaPosition = "last", sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, @@ -3720,15 +3774,15 @@ def sort_index( """ return super().sort_index( - axis, - level, - ascending, - inplace, - kind, - na_position, - sort_remaining, - ignore_index, - key, + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, ) def argsort(self, axis=0, kind="quicksort", order=None) -> Series: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d41947510c1bb..b2b08a4ad9644 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -22,7 +22,10 @@ from pandas._libs.hashtable import unique_label_indices from pandas._typing import ( IndexKeyFunc, + Level, + NaPosition, Shape, + SortKind, npt, ) @@ -47,10 +50,10 @@ def get_indexer_indexer( target: Index, - level: str | int | list[str] | list[int], + level: Level | list[Level] | None, ascending: Sequence[bool | int] | bool | int, - kind: str, - na_position: str, + kind: SortKind, + na_position: NaPosition, sort_remaining: bool, key: IndexKeyFunc, ) -> npt.NDArray[np.intp] | None: @@ -92,8 +95,13 @@ def get_indexer_indexer( ): return None + # error: Argument "ascending" to "nargsort" has incompatible type + # "Union[Sequence[Union[bool, int]], bool, int]"; expected "bool" indexer = nargsort( - target, kind=kind, ascending=ascending, na_position=na_position + target, + kind=kind, + ascending=ascending, # type: ignore[arg-type] + na_position=na_position, ) return indexer From e1473f9bdc3a0d75bc64300b6158a0240d443eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 11 Mar 2022 22:48:19 -0500 Subject: [PATCH 02/10] return type and sub-classes --- pandas/core/frame.py | 15 +++++++++------ pandas/core/series.py | 15 +++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21f2cd4c788cb..d5c163c37d250 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,6 +26,7 @@ Iterator, Literal, Sequence, + TypeVar, cast, overload, ) @@ -222,6 +223,8 @@ from pandas.io.formats.style import Styler + DataFrameT = TypeVar("DataFrameT", bound="DataFrame") + # --------------------------------------------------------------------- # Docstring templates @@ -6446,7 +6449,7 @@ def sort_index( @overload def sort_index( - self, + self: DataFrameT, *, axis: Axis = ..., level: Level | None = ..., @@ -6457,12 +6460,12 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame: + ) -> DataFrameT: ... @overload def sort_index( - self, + self: DataFrameT, *, axis: Axis = ..., level: Level | None = ..., @@ -6473,13 +6476,13 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrame | None: + ) -> DataFrameT | None: ... # error: Signature of "sort_index" incompatible with supertype "NDFrame" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( # type: ignore[override] - self, + self: DataFrameT, axis: Axis = 0, level: Level | None = None, ascending: bool | int | Sequence[bool | int] = True, @@ -6489,7 +6492,7 @@ def sort_index( # type: ignore[override] sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, - ) -> DataFrame | None: + ) -> DataFrameT | None: """ Sort object by labels (along an axis). diff --git a/pandas/core/series.py b/pandas/core/series.py index 4ec747a9e3bfa..e382676dbe593 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,6 +13,7 @@ Iterable, Literal, Sequence, + TypeVar, Union, cast, overload, @@ -162,6 +163,8 @@ from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.resample import Resampler + SeriesT = TypeVar("SeriesT", bound="Series") + __all__ = ["Series"] _shared_doc_kwargs = { @@ -3597,7 +3600,7 @@ def sort_index( @overload def sort_index( - self, + self: SeriesT, *, axis: Literal[0] = ..., level: Level | None = ..., @@ -3608,12 +3611,12 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Series: + ) -> SeriesT: ... @overload def sort_index( - self, + self: SeriesT, *, axis: Literal[0] = ..., level: Level | None = ..., @@ -3624,14 +3627,14 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> Series | None: + ) -> SeriesT | None: ... # error: Argument 1 of "sort_index" is incompatible with supertype "NDFrame"; # supertype defines the argument type as "Union[str, int]" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( # type: ignore[override] - self, + self: SeriesT, axis: Literal[0] = 0, level: Level | None = None, ascending: bool | int | Sequence[bool | int] = True, @@ -3641,7 +3644,7 @@ def sort_index( # type: ignore[override] sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, - ): + ) -> SeriesT | None: """ Sort Series by index labels. From 5dd297c5e587c2025cc8046b4244f2dcf9d5d9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 12 Mar 2022 14:53:06 -0500 Subject: [PATCH 03/10] remove int; add cast --- pandas/core/frame.py | 8 ++++---- pandas/core/generic.py | 8 ++++---- pandas/core/series.py | 8 ++++---- pandas/core/sorting.py | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d5c163c37d250..b8ac5627049fb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6437,7 +6437,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[True], kind: SortKind = ..., na_position: NaPosition = ..., @@ -6453,7 +6453,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[False] = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -6469,7 +6469,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: bool = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -6485,7 +6485,7 @@ def sort_index( # type: ignore[override] self: DataFrameT, axis: Axis = 0, level: Level | None = None, - ascending: bool | int | Sequence[bool | int] = True, + ascending: bool | Sequence[bool] = True, inplace: bool = False, kind: SortKind = "quicksort", na_position: NaPosition = "last", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 97292ce608469..0dc636da768a8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4691,7 +4691,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool_t | int | Sequence[bool_t | int] = ..., + ascending: bool_t | Sequence[bool_t] = ..., inplace: Literal[True], kind: SortKind = ..., na_position: NaPosition = ..., @@ -4707,7 +4707,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool_t | int | Sequence[bool_t | int] = ..., + ascending: bool_t | Sequence[bool_t] = ..., inplace: Literal[False] = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -4723,7 +4723,7 @@ def sort_index( *, axis: Axis = ..., level: Level | None = ..., - ascending: bool_t | int | Sequence[bool_t | int] = ..., + ascending: bool_t | Sequence[bool_t] = ..., inplace: bool_t = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -4737,7 +4737,7 @@ def sort_index( self: NDFrameT, axis: Axis = 0, level: Level | None = None, - ascending: bool_t | int | Sequence[bool_t | int] = True, + ascending: bool_t | Sequence[bool_t] = True, inplace: bool_t = False, kind: SortKind = "quicksort", na_position: NaPosition = "last", diff --git a/pandas/core/series.py b/pandas/core/series.py index e382676dbe593..a5bd366dd81ea 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3588,7 +3588,7 @@ def sort_index( *, axis: Literal[0] = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[True], kind: SortKind = ..., na_position: NaPosition = ..., @@ -3604,7 +3604,7 @@ def sort_index( *, axis: Literal[0] = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: Literal[False] = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -3620,7 +3620,7 @@ def sort_index( *, axis: Literal[0] = ..., level: Level | None = ..., - ascending: bool | int | Sequence[bool | int] = ..., + ascending: bool | Sequence[bool] = ..., inplace: bool = ..., kind: SortKind = ..., na_position: NaPosition = ..., @@ -3637,7 +3637,7 @@ def sort_index( # type: ignore[override] self: SeriesT, axis: Literal[0] = 0, level: Level | None = None, - ascending: bool | int | Sequence[bool | int] = True, + ascending: bool | Sequence[bool] = True, inplace: bool = False, kind: SortKind = "quicksort", na_position: NaPosition = "last", diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b2b08a4ad9644..16facfc915e40 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -9,6 +9,7 @@ Hashable, Iterable, Sequence, + cast, ) import warnings @@ -51,7 +52,7 @@ def get_indexer_indexer( target: Index, level: Level | list[Level] | None, - ascending: Sequence[bool | int] | bool | int, + ascending: Sequence[bool] | bool, kind: SortKind, na_position: NaPosition, sort_remaining: bool, @@ -95,12 +96,11 @@ def get_indexer_indexer( ): return None - # error: Argument "ascending" to "nargsort" has incompatible type - # "Union[Sequence[Union[bool, int]], bool, int]"; expected "bool" + # ascending can only be a Sequence for MultiIndex indexer = nargsort( target, kind=kind, - ascending=ascending, # type: ignore[arg-type] + ascending=cast(bool, ascending), na_position=na_position, ) return indexer From 3889f41f9f779438610d65fe2a2b35c711215a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 13 Mar 2022 09:24:03 -0400 Subject: [PATCH 04/10] move SeriesT and DataFrameT --- pandas/_typing.py | 3 +++ pandas/core/frame.py | 4 +--- pandas/core/series.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 3b11aada96a15..73bce03e51528 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -103,6 +103,9 @@ # Series is passed into a function, a Series is always returned and if a DataFrame is # passed in, a DataFrame is always returned. NDFrameT = TypeVar("NDFrameT", bound="NDFrame") +# and for sub-types of DataFrame/Series +DataFrameT = TypeVar("DataFrameT", bound="DataFrame") +SeriesT = TypeVar("SeriesT", bound="Series") Axis = Union[str, int] IndexLabel = Union[Hashable, Sequence[Hashable]] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8ac5627049fb..efd841c11791c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -26,7 +26,6 @@ Iterator, Literal, Sequence, - TypeVar, cast, overload, ) @@ -52,6 +51,7 @@ Axis, ColspaceArgType, CompressionOptions, + DataFrameT, Dtype, DtypeObj, FilePath, @@ -223,8 +223,6 @@ from pandas.io.formats.style import Styler - DataFrameT = TypeVar("DataFrameT", bound="DataFrame") - # --------------------------------------------------------------------- # Docstring templates diff --git a/pandas/core/series.py b/pandas/core/series.py index a5bd366dd81ea..1da1c75cba1c0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,7 +13,6 @@ Iterable, Literal, Sequence, - TypeVar, Union, cast, overload, @@ -42,6 +41,7 @@ IndexKeyFunc, Level, NaPosition, + SeriesT, SingleManager, SortKind, StorageOptions, @@ -163,8 +163,6 @@ from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.resample import Resampler - SeriesT = TypeVar("SeriesT", bound="Series") - __all__ = ["Series"] _shared_doc_kwargs = { From ce6f938bde33f6140a63e7b89d7795deba8a3f37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 13 Mar 2022 11:41:40 -0400 Subject: [PATCH 05/10] return Series/DataFrame --- pandas/_typing.py | 3 --- pandas/core/frame.py | 13 ++++++------- pandas/core/series.py | 13 ++++++------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 73bce03e51528..3b11aada96a15 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -103,9 +103,6 @@ # Series is passed into a function, a Series is always returned and if a DataFrame is # passed in, a DataFrame is always returned. NDFrameT = TypeVar("NDFrameT", bound="NDFrame") -# and for sub-types of DataFrame/Series -DataFrameT = TypeVar("DataFrameT", bound="DataFrame") -SeriesT = TypeVar("SeriesT", bound="Series") Axis = Union[str, int] IndexLabel = Union[Hashable, Sequence[Hashable]] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efd841c11791c..5ddaf1eba8960 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -51,7 +51,6 @@ Axis, ColspaceArgType, CompressionOptions, - DataFrameT, Dtype, DtypeObj, FilePath, @@ -6447,7 +6446,7 @@ def sort_index( @overload def sort_index( - self: DataFrameT, + self, *, axis: Axis = ..., level: Level | None = ..., @@ -6458,12 +6457,12 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrameT: + ) -> DataFrame: ... @overload def sort_index( - self: DataFrameT, + self, *, axis: Axis = ..., level: Level | None = ..., @@ -6474,13 +6473,13 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> DataFrameT | None: + ) -> DataFrame | None: ... # error: Signature of "sort_index" incompatible with supertype "NDFrame" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( # type: ignore[override] - self: DataFrameT, + self, axis: Axis = 0, level: Level | None = None, ascending: bool | Sequence[bool] = True, @@ -6490,7 +6489,7 @@ def sort_index( # type: ignore[override] sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, - ) -> DataFrameT | None: + ) -> DataFrame | None: """ Sort object by labels (along an axis). diff --git a/pandas/core/series.py b/pandas/core/series.py index 1da1c75cba1c0..fc1bb6c1a35a1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -41,7 +41,6 @@ IndexKeyFunc, Level, NaPosition, - SeriesT, SingleManager, SortKind, StorageOptions, @@ -3598,7 +3597,7 @@ def sort_index( @overload def sort_index( - self: SeriesT, + self, *, axis: Literal[0] = ..., level: Level | None = ..., @@ -3609,12 +3608,12 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> SeriesT: + ) -> Series: ... @overload def sort_index( - self: SeriesT, + self, *, axis: Literal[0] = ..., level: Level | None = ..., @@ -3625,14 +3624,14 @@ def sort_index( sort_remaining: bool = ..., ignore_index: bool = ..., key: IndexKeyFunc = ..., - ) -> SeriesT | None: + ) -> Series | None: ... # error: Argument 1 of "sort_index" is incompatible with supertype "NDFrame"; # supertype defines the argument type as "Union[str, int]" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( # type: ignore[override] - self: SeriesT, + self, axis: Literal[0] = 0, level: Level | None = None, ascending: bool | Sequence[bool] = True, @@ -3642,7 +3641,7 @@ def sort_index( # type: ignore[override] sort_remaining: bool = True, ignore_index: bool = False, key: IndexKeyFunc = None, - ) -> SeriesT | None: + ) -> Series | None: """ Sort Series by index labels. From 17141e85f4ebf47b755c6e34a5a1fcb8da58ad27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sun, 13 Mar 2022 12:41:30 -0400 Subject: [PATCH 06/10] more NDFrame-compatible Series --- pandas/core/series.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index fc1bb6c1a35a1..6b87ea3e91c69 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3578,12 +3578,11 @@ def sort_values( else: return result.__finalize__(self, method="sort_values") - # error: Signature of "sort_index" incompatible with supertype "NDFrame" - @overload # type: ignore[override] + @overload def sort_index( self, *, - axis: Literal[0] = ..., + axis: Axis = ..., level: Level | None = ..., ascending: bool | Sequence[bool] = ..., inplace: Literal[True], @@ -3599,7 +3598,7 @@ def sort_index( def sort_index( self, *, - axis: Literal[0] = ..., + axis: Axis = ..., level: Level | None = ..., ascending: bool | Sequence[bool] = ..., inplace: Literal[False] = ..., @@ -3615,7 +3614,7 @@ def sort_index( def sort_index( self, *, - axis: Literal[0] = ..., + axis: Axis = ..., level: Level | None = ..., ascending: bool | Sequence[bool] = ..., inplace: bool = ..., @@ -3627,12 +3626,11 @@ def sort_index( ) -> Series | None: ... - # error: Argument 1 of "sort_index" is incompatible with supertype "NDFrame"; - # supertype defines the argument type as "Union[str, int]" + # error: Signature of "sort_index" incompatible with supertype "NDFrame" @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( # type: ignore[override] self, - axis: Literal[0] = 0, + axis: Axis = 0, level: Level | None = None, ascending: bool | Sequence[bool] = True, inplace: bool = False, From 122aa994eb2944b330cb1cf25d1d22964c390aa5 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 12 Mar 2022 10:11:05 -0500 Subject: [PATCH 07/10] BUG: replace with value also being replaced (#46335) --- doc/source/whatsnew/v1.4.2.rst | 1 + pandas/core/internals/blocks.py | 15 ++++++++++++--- pandas/tests/frame/methods/test_replace.py | 7 +++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst index 2bdbeb0ab6991..06f1f406c3816 100644 --- a/doc/source/whatsnew/v1.4.2.rst +++ b/doc/source/whatsnew/v1.4.2.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` killing python process when invalid file input was given for ``engine="c"`` (:issue:`45957`) - Fixed memory performance regression in :meth:`Series.fillna` when called on a :class:`DataFrame` column with ``inplace=True`` (:issue:`46149`) - Provided an alternative solution for passing custom Excel formats in :meth:`.Styler.to_excel`, which was a regression based on stricter CSS validation. Examples available in the documentation for :meth:`.Styler.format` (:issue:`46152`) +- Fixed regression in :meth:`DataFrame.replace` when a replacement value was also a target for replacement (:issue:`46335`) - Fixed regression in :meth:`DataFrame.loc.__setitem__` losing :class:`MultiIndex` names if :class:`DataFrame` was empty before (:issue:`46317`) - diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3693edbae7d95..69f66973d0954 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -612,9 +612,18 @@ def replace( else: # split so that we only upcast where necessary - return self.split_and_operate( - type(self).replace, to_replace, value, inplace=True - ) + blocks = [] + for i, nb in enumerate(self._split()): + blocks.extend( + type(self).replace( + nb, + to_replace=to_replace, + value=value, + inplace=True, + mask=mask[i : i + 1], + ) + ) + return blocks @final def _replace_regex( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 6b53ef400e53d..2eb300a8905b8 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1542,3 +1542,10 @@ def test_replace_regex_dtype_frame(self, regex): expected_df2 = DataFrame({"A": [1], "B": ["1"]}) result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) + + def test_replace_with_value_also_being_replaced(self): + # GH46306 + df = DataFrame({"A": [0, 1, 2], "B": [1, 0, 2]}) + result = df.replace({0: 1, 1: np.nan}) + expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]}) + tm.assert_frame_equal(result, expected) From 35b9e120d25baada0445f28999486624a3def387 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Mar 2022 13:07:22 -0800 Subject: [PATCH 08/10] REF: de-duplicate libjoin (#46256) --- pandas/_libs/join.pyx | 238 ++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 149 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b908fa2c65e4d..3fc97e3660120 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -93,10 +93,13 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, with nogil: # First pass, determine size of result set, do not use the NA group for i in range(1, max_groups + 1): - if right_count[i] > 0: - count += left_count[i] * right_count[i] + lc = left_count[i] + rc = right_count[i] + + if rc > 0: + count += lc * rc else: - count += left_count[i] + count += lc left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) @@ -679,7 +682,8 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, - tolerance=None): + tolerance=None, + bint use_hashtable=True): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos @@ -701,12 +705,13 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, left_indexer = np.empty(left_size, dtype=np.intp) right_indexer = np.empty(left_size, dtype=np.intp) - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + if use_hashtable: + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) right_pos = 0 for left_pos in range(left_size): @@ -718,19 +723,25 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, if allow_exact_matches: while (right_pos < right_size and right_values[right_pos] <= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos += 1 else: while (right_pos < right_size and right_values[right_pos] < left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos += 1 right_pos -= 1 # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) + if use_hashtable: + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + else: + found_right_pos = right_pos + left_indexer[left_pos] = left_pos right_indexer[left_pos] = found_right_pos @@ -748,7 +759,8 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=1, - tolerance=None): + tolerance=None, + bint use_hashtable=True): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos @@ -770,12 +782,13 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, left_indexer = np.empty(left_size, dtype=np.intp) right_indexer = np.empty(left_size, dtype=np.intp) - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + if use_hashtable: + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) right_pos = right_size - 1 for left_pos in range(left_size - 1, -1, -1): @@ -787,19 +800,26 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, if allow_exact_matches: while (right_pos >= 0 and right_values[right_pos] >= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos -= 1 else: while (right_pos >= 0 and right_values[right_pos] > left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos -= 1 right_pos += 1 # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) + if use_hashtable: + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + else: + found_right_pos = (right_pos + if right_pos != right_size else -1) + left_indexer[left_pos] = left_pos right_indexer[left_pos] = found_right_pos @@ -820,15 +840,7 @@ def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, tolerance=None): cdef: - Py_ssize_t left_size, right_size, i - ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - numeric_t bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) + ndarray[intp_t] bli, bri, fli, fri # search both forward and backward bli, bri = asof_join_backward_on_X_by_Y( @@ -848,6 +860,27 @@ def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, tolerance, ) + return _choose_smaller_timestamp(left_values, right_values, bli, bri, fli, fri) + + +cdef _choose_smaller_timestamp( + numeric_t[:] left_values, + numeric_t[:] right_values, + ndarray[intp_t] bli, + ndarray[intp_t] bri, + ndarray[intp_t] fli, + ndarray[intp_t] fri, +): + cdef: + ndarray[intp_t] left_indexer, right_indexer + Py_ssize_t left_size, i + numeric_t bdiff, fdiff + + left_size = len(left_values) + + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) + for i in range(len(bri)): # choose timestamp from right with smaller difference if bri[i] != -1 and fri[i] != -1: @@ -870,106 +903,30 @@ def asof_join_backward(numeric_t[:] left_values, bint allow_exact_matches=True, tolerance=None): - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[intp_t] left_indexer, right_indexer - bint has_tolerance = False - numeric_t tolerance_ = 0 - numeric_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = True - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer + return asof_join_backward_on_X_by_Y( + left_values, + right_values, + None, + None, + allow_exact_matches=allow_exact_matches, + tolerance=tolerance, + use_hashtable=False, + ) def asof_join_forward(numeric_t[:] left_values, numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[intp_t] left_indexer, right_indexer - bint has_tolerance = False - numeric_t tolerance_ = 0 - numeric_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = True - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = (right_pos - if right_pos != right_size else -1) - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != right_size: - diff = right_values[right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer + return asof_join_forward_on_X_by_Y( + left_values, + right_values, + None, + None, + allow_exact_matches=allow_exact_matches, + tolerance=tolerance, + use_hashtable=False, + ) def asof_join_nearest(numeric_t[:] left_values, @@ -978,14 +935,7 @@ def asof_join_nearest(numeric_t[:] left_values, tolerance=None): cdef: - Py_ssize_t left_size, i - ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - numeric_t bdiff, fdiff - - left_size = len(left_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) + ndarray[intp_t] bli, bri, fli, fri # search both forward and backward bli, bri = asof_join_backward(left_values, right_values, @@ -993,14 +943,4 @@ def asof_join_nearest(numeric_t[:] left_values, fli, fri = asof_join_forward(left_values, right_values, allow_exact_matches, tolerance) - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer + return _choose_smaller_timestamp(left_values, right_values, bli, bri, fli, fri) From 8adb13d9e14af272bf40899fe8c485e8dbfcb633 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 12 Mar 2022 23:57:59 +0100 Subject: [PATCH 09/10] BUG: DataFrame.getattribute raising if columns have dtype string (#46301) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/indexes/base.py | 3 ++- pandas/tests/frame/indexing/test_getitem.py | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 368983b0a55c0..8dac952874f89 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -394,6 +394,7 @@ Indexing - Bug in :meth:`Series.mask` with ``inplace=True`` or setting values with a boolean mask with small integer dtypes incorrectly raising (:issue:`45750`) - Bug in :meth:`DataFrame.mask` with ``inplace=True`` and ``ExtensionDtype`` columns incorrectly raising (:issue:`45577`) - Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) +- Bug in :meth:`DataFrame.__getattribute__` raising ``AttributeError`` if columns have ``"string"`` dtype (:issue:`46185`) - Bug in indexing on a :class:`DatetimeIndex` with a ``np.str_`` key incorrectly raising (:issue:`45580`) - Bug in :meth:`CategoricalIndex.get_indexer` when index contains ``NaN`` values, resulting in elements that are in target but not present in the index to be mapped to the index of the NaN element, instead of -1 (:issue:`45361`) - Bug in setting large integer values into :class:`Series` with ``float32`` or ``float16`` dtype incorrectly altering these values instead of coercing to ``float64`` dtype (:issue:`45844`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c673848bc022a..77cae57e82f3b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -97,6 +97,7 @@ is_object_dtype, is_scalar, is_signed_integer_dtype, + is_string_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, @@ -5276,7 +5277,7 @@ def _can_hold_identifiers_and_holds_name(self, name) -> bool: https://github.com/pandas-dev/pandas/issues/19764 """ - if self.is_object() or self.is_categorical(): + if self.is_object() or is_string_dtype(self.dtype) or self.is_categorical(): return name in self return False diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 0d4ab84175aab..8cc8b487ff44f 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -72,6 +72,13 @@ def test_getitem_sparse_column_return_type_and_dtype(self): result = df.loc[:, "A"] tm.assert_series_equal(result, expected) + def test_getitem_string_columns(self): + # GH#46185 + df = DataFrame([[1, 2]], columns=Index(["A", "B"], dtype="string")) + result = df.A + expected = df["A"] + tm.assert_series_equal(result, expected) + class TestGetitemListLike: def test_getitem_list_missing_key(self): From 808ff73e5279223c33c4bf73c744b6731c97c571 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 12 Mar 2022 16:20:47 -0800 Subject: [PATCH 10/10] TST: Avoid stack overflow on Windows CI with recursion test (#46345) --- pandas/tests/dtypes/test_inference.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index bf13e6b7b4629..15f6e82419049 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -17,6 +17,7 @@ import itertools from numbers import Number import re +import sys import numpy as np import pytest @@ -205,8 +206,14 @@ def foo(): inference.is_list_like([]) foo() - with tm.external_error_raised(RecursionError): - foo() + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + with tm.external_error_raised(RecursionError): + foo() + finally: + sys.setrecursionlimit(rec_limit) def test_is_list_like_iter_is_none():