From 12f9adf93813b577163b229e011d9c04fe5e332f Mon Sep 17 00:00:00 2001 From: Patrick Schleiter <4884221+pschleiter@users.noreply.github.com> Date: Tue, 18 Apr 2023 14:33:02 +0000 Subject: [PATCH 1/8] Specify method in reindex for class dataframe Specify parser in to_xml of class dataframe Update doc string to_orc in class dataframe Specify engine in to_parquet in class dataframe --- pandas/_typing.py | 3 +++ pandas/core/frame.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index de02a549856ab..916e9ace59543 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -376,6 +376,9 @@ def closed(self) -> bool: # join JoinHow = Literal["left", "right", "inner", "outer"] +# reindex +ReindexMethod = Union[FillnaOptions ,Literal["nearest"]] + MatplotlibColor = Union[str, Sequence[float]] TimeGrouperOrigin = Union[ "Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5341b87c39676..44054ec2e8d3f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -237,6 +237,8 @@ ValueKeyFunc, WriteBuffer, npt, + ReindexMethod, + XMLParsers, ) from pandas.core.groupby.generic import DataFrameGroupBy @@ -2763,7 +2765,7 @@ def to_markdown( def to_parquet( self, path: None = ..., - engine: str = ..., + engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., @@ -2776,7 +2778,7 @@ def to_parquet( def to_parquet( self, path: FilePath | WriteBuffer[bytes], - engine: str = ..., + engine: Literal["auto", "pyarrow", "fastparquet"] = ..., compression: str | None = ..., index: bool | None = ..., partition_cols: list[str] | None = ..., @@ -2789,7 +2791,7 @@ def to_parquet( def to_parquet( self, path: FilePath | WriteBuffer[bytes] | None = None, - engine: str = "auto", + engine: Literal["auto", 
"pyarrow", "fastparquet"] = "auto", compression: str | None = "snappy", index: bool | None = None, partition_cols: list[str] | None = None, @@ -2919,7 +2921,7 @@ def to_orc( we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. - engine : str, default 'pyarrow' + engine : {'pyarrow'}, default 'pyarrow' ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. @@ -3155,7 +3157,7 @@ def to_xml( encoding: str = "utf-8", xml_declaration: bool | None = True, pretty_print: bool | None = True, - parser: str | None = "lxml", + parser: XMLParsers | None = "lxml", stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, @@ -4988,7 +4990,7 @@ def reindex( index=None, columns=None, axis: Axis | None = None, - method: str | None = None, + method: ReindexMethod | None = None, copy: bool | None = None, level: Level | None = None, fill_value: Scalar | None = np.nan, From fc65e3a40ce34ae6405f48012e55d2f9d08574e7 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Tue, 18 Apr 2023 16:37:14 +0200 Subject: [PATCH 2/8] undo changes and adding None as an optional argument type for validate argument of join and merge method Change byteorder argument typing for to_stata method to literal, added definition in pandas/_typing.py Change if_exists argument typing for to_gbq method to literal, added definition in pandas/_typing.py Change orient argument typing for from_dict method to literal, added definition in pandas/_typing.py Change how argument typing for to_timestamp method to literal, added definition in pandas/_typing.py Change validate argument typing for merge and join methods to literal, added definition in pandas/_typing.py Change na_action arguments typing for 
applymap method to literal, added definition in pandas/_typing.py Change join and errors arguments typing for update method to literal, added definition in pandas/_typing.py Change keep argument typing for nlargest and nsmallest to literal, added definition in pandas/_typing.py Specify the kind and na_position more precisely in sort_values, reusing type definitions in pandas/_typing.py --- pandas/_typing.py | 40 ++++++++++++++++++++++++++++++++++++++++ pandas/core/frame.py | 39 ++++++++++++++++++++++++++------------- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index de02a549856ab..0dfe9d2b7d0b5 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -132,6 +132,8 @@ ] Timezone = Union[str, tzinfo] +ToTimestampHow = Literal["s", "e", "start", "end"] + # NDFrameT is stricter and ensures that the same subclass of NDFrame always is # used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a # Series is passed into a function, a Series is always returned and if a DataFrame is @@ -303,6 +305,9 @@ def closed(self) -> bool: str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]] ] +# Arguments for nsmalles and n_largest +NsmallestNlargestKeep = Literal["first", "last", "all"] + # Arguments for fillna() FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] @@ -372,9 +377,29 @@ def closed(self) -> bool: # merge MergeHow = Literal["left", "right", "inner", "outer", "cross"] +MergeValidate = Literal[ + "one_to_one", + "1:1", + "one_to_many", + "1:m", + "many_to_one", + "m:1", + "many_to_many", + "m:m", +] # join JoinHow = Literal["left", "right", "inner", "outer"] +JoinValidate = Literal[ + "one_to_one", + "1:1", + "one_to_many", + "1:m", + "many_to_one", + "m:1", + "many_to_many", + "m:m", +] MatplotlibColor = Union[str, Sequence[float]] TimeGrouperOrigin = Union[ @@ -390,3 +415,18 @@ def closed(self) -> bool: ] AlignJoin = Literal["outer", "inner", "left", "right"] DtypeBackend = 
Literal["pyarrow", "numpy_nullable"] + +# update +UpdateJoin = Literal["left"] + +# applymap +NaAction = Literal["None", "ignore"] + +# from_dict[ +FromDictOrient = Literal["columns", "index", "tight"] + +# to_gbc +ToGbqIfexist = Literal["fail", "replace", "append"] + +# to_stata +ToStataByteorder = Literal[">", "<", "little", "big"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5341b87c39676..085e2c6eb2182 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -219,12 +219,17 @@ FloatFormatType, FormattersType, Frequency, + FromDictOrient, IgnoreRaise, IndexKeyFunc, IndexLabel, + JoinValidate, Level, MergeHow, + MergeValidate, + NaAction, NaPosition, + NsmallestNlargestKeep, PythonFuncType, QuantileInterpolation, ReadBuffer, @@ -234,6 +239,10 @@ SortKind, StorageOptions, Suffixes, + ToGbqIfexist, + ToStataByteorder, + ToTimestampHow, + UpdateJoin, ValueKeyFunc, WriteBuffer, npt, @@ -1637,7 +1646,7 @@ def __rmatmul__(self, other) -> DataFrame: def from_dict( cls, data: dict, - orient: str = "columns", + orient: FromDictOrient = "columns", dtype: Dtype | None = None, columns: Axes | None = None, ) -> DataFrame: @@ -1981,7 +1990,7 @@ def to_gbq( project_id: str | None = None, chunksize: int | None = None, reauth: bool = False, - if_exists: str = "fail", + if_exists: ToGbqIfexist = "fail", auth_local_webserver: bool = True, table_schema: list[dict[str, str]] | None = None, location: str | None = None, @@ -2535,7 +2544,7 @@ def to_stata( *, convert_dates: dict[Hashable, str] | None = None, write_index: bool = True, - byteorder: str | None = None, + byteorder: ToStataByteorder | None = None, time_stamp: datetime.datetime | None = None, data_label: str | None = None, variable_labels: dict[Hashable, str] | None = None, @@ -6521,8 +6530,8 @@ def sort_values( axis: Axis = ..., ascending=..., inplace: Literal[False] = ..., - kind: str = ..., - na_position: str = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., ignore_index: bool = ..., 
key: ValueKeyFunc = ..., ) -> DataFrame: @@ -7077,7 +7086,9 @@ def value_counts( return counts - def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + def nlargest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. @@ -7184,7 +7195,9 @@ def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFram """ return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: + def nsmallest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. @@ -8348,10 +8361,10 @@ def combiner(x, y): def update( self, other, - join: str = "left", + join: UpdateJoin = "left", overwrite: bool = True, filter_func=None, - errors: str = "ignore", + errors: IgnoreRaise = "ignore", ) -> None: """ Modify in place using non-NA values from another DataFrame. @@ -9857,7 +9870,7 @@ def infer(x): return self.apply(infer).__finalize__(self, "map") def applymap( - self, func: PythonFuncType, na_action: str | None = None, **kwargs + self, func: PythonFuncType, na_action: NaAction = None, **kwargs ) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -9969,7 +9982,7 @@ def join( lsuffix: str = "", rsuffix: str = "", sort: bool = False, - validate: str | None = None, + validate: JoinValidate | None = None, ) -> DataFrame: """ Join columns of another DataFrame. 
@@ -10211,7 +10224,7 @@ def merge( suffixes: Suffixes = ("_x", "_y"), copy: bool | None = None, indicator: str | bool = False, - validate: str | None = None, + validate: MergeValidate | None = None, ) -> DataFrame: from pandas.core.reshape.merge import merge @@ -11506,7 +11519,7 @@ def quantile( def to_timestamp( self, freq: Frequency | None = None, - how: str = "start", + how: ToTimestampHow = "start", axis: Axis = 0, copy: bool | None = None, ) -> DataFrame: From 7e1c7c9e7374b157eecd1d3e24698590d09ce565 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Tue, 18 Apr 2023 17:34:08 +0200 Subject: [PATCH 3/8] removing none from literal and adding it to the argument of applymap --- pandas/_typing.py | 4 ++-- pandas/core/frame.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index c7f023b11ad41..41247a90678dd 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -402,7 +402,7 @@ def closed(self) -> bool: ] # reindex -ReindexMethod = Union[FillnaOptions ,Literal["nearest"]] +ReindexMethod = Union[FillnaOptions, Literal["nearest"]] MatplotlibColor = Union[str, Sequence[float]] TimeGrouperOrigin = Union[ @@ -423,7 +423,7 @@ def closed(self) -> bool: UpdateJoin = Literal["left"] # applymap -NaAction = Literal["None", "ignore"] +NaAction = Literal["ignore"] # from_dict[ FromDictOrient = Literal["columns", "index", "tight"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2f947ad780ad0..8c8125f5cf788 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -233,6 +233,7 @@ PythonFuncType, QuantileInterpolation, ReadBuffer, + ReindexMethod, Renamer, Scalar, Self, @@ -245,9 +246,8 @@ UpdateJoin, ValueKeyFunc, WriteBuffer, - npt, - ReindexMethod, XMLParsers, + npt, ) from pandas.core.groupby.generic import DataFrameGroupBy @@ -9872,7 +9872,7 @@ def infer(x): return self.apply(infer).__finalize__(self, "map") def applymap( - self, 
func: PythonFuncType, na_action: NaAction = None, **kwargs + self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs ) -> DataFrame: """ Apply a function to a Dataframe elementwise. From 748c47f31a581cab79e8a1be3a1253bacff61274 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Tue, 18 Apr 2023 18:33:17 +0200 Subject: [PATCH 4/8] adding reindex literal to super class NDFrame as it violates the Liskov substitution principle otherwise --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 582a043a8a78a..839093a215cba 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,6 +69,7 @@ NaPosition, NDFrameT, RandomState, + ReindexMethod, Renamer, Scalar, Self, @@ -5153,7 +5154,7 @@ def reindex( index=None, columns=None, axis: Axis | None = None, - method: str | None = None, + method: ReindexMethod | None = None, copy: bool_t | None = None, level: Level | None = None, fill_value: Scalar | None = np.nan, From 7ff3b1905aaeb09823e5ca244a29b5d5054af9a2 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Tue, 18 Apr 2023 20:17:05 +0200 Subject: [PATCH 5/8] adding reindex literal to super class NDFrame as it violates the Liskov substitution principle otherwise --- pandas/core/series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index adb16c2f2dd55..4a82eafadb09e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -167,6 +167,7 @@ NumpySorter, NumpyValueArrayLike, QuantileInterpolation, + ReindexMethod, Renamer, Scalar, Self, @@ -4725,7 +4726,7 @@ def reindex( # type: ignore[override] index=None, *, axis: Axis | None = None, - method: str | None = None, + method: ReindexMethod | None = None, copy: bool | None = None, level: Level | None = None, fill_value: Scalar 
| None = None, From fd99bc433e61c4fdd8a8b92d7c0c7d896fb35001 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Tue, 18 Apr 2023 21:32:16 +0200 Subject: [PATCH 6/8] adding literal to missing.py --- pandas/core/missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index aaed431f890d3..585ad50ad9069 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -25,6 +25,7 @@ Axis, AxisInt, F, + ReindexMethod, npt, ) from pandas.compat._optional import import_optional_dependency @@ -949,7 +950,7 @@ def get_fill_func(method, ndim: int = 1): return {"pad": _pad_2d, "backfill": _backfill_2d}[method] -def clean_reindex_fill_method(method) -> str | None: +def clean_reindex_fill_method(method) -> ReindexMethod | None: return clean_fill_method(method, allow_nearest=True) From 2d5667d5eaa834188e5e2df7b8dbba59a462445c Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Wed, 19 Apr 2023 07:42:59 +0200 Subject: [PATCH 7/8] ignore type for orient in from_dict method of frame due to mypy error --- pandas/_typing.py | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 41247a90678dd..62fe31fec26b7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -425,7 +425,7 @@ def closed(self) -> bool: # applymap NaAction = Literal["ignore"] -# from_dict[ +# from_dict FromDictOrient = Literal["columns", "index", "tight"] # to_gbc diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c8125f5cf788..bd298b8d723b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1735,7 +1735,7 @@ def from_dict( c 2 4 """ index = None - orient = orient.lower() + orient = orient.lower() # type: ignore[assignment] if orient == "index": if len(data) > 0: # TODO speed up Series case From 
c007b53b63747255ab8064db04187a78d2cc9030 Mon Sep 17 00:00:00 2001 From: benedikt-mangold <48798074+benedikt-mangold@users.noreply.github.com> Date: Thu, 20 Apr 2023 10:16:18 +0200 Subject: [PATCH 8/8] pulling main and resolving merge conflict --- pandas/_typing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_typing.py b/pandas/_typing.py index 71b8e0777b6dc..e162f7f1662ee 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -363,6 +363,9 @@ def closed(self) -> bool: SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] NaPosition = Literal["first", "last"] +# Arguments for nsmalles and n_largest +NsmallestNlargestKeep = Literal["first", "last", "all"] + # quantile interpolation QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"]