From 8061f943426b06dfb57e656a70318464d96dda08 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Thu, 17 Jun 2021 09:39:19 +0200 Subject: [PATCH 01/13] BUG: reordering the index labels --- pandas/core/series.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d974cbec4286..e96b5c6bb4545 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -129,6 +129,7 @@ SingleArrayManager, SingleBlockManager, ) +from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -3661,15 +3662,24 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: mask = isna(values) if mask.any(): - result = Series(-1, index=self.index, name=self.name, dtype="int64") - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) - return self._constructor(result, index=self.index).__finalize__( - self, method="argsort" + mask_ret = Series(-1, index=self.index[mask], dtype="int64", name=self.name) + notmask_result = np.argsort(values[~mask], kind=kind) + notmask_ret = Series( + notmask_result, + index=self.index[~mask][notmask_result], + dtype="int64", + name=self.name, ) + + # result = Series(-1, index=self.index, name=self.name, dtype="int64") + # notmask = ~mask + # result[notmask] = np.argsort(values[notmask], kind=kind) + ret = concat([notmask_ret, mask_ret]) + return ret.__finalize__(self, method="argsort") else: + result = np.argsort(values, kind=kind) return self._constructor( - np.argsort(values, kind=kind), index=self.index, dtype="int64" + result, index=self.index[result], dtype="int64" ).__finalize__(self, method="argsort") def nlargest(self, n=5, keep="first") -> Series: From f0c4d1a1480e8585e8c40d8e690c35cad82b4d08 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Thu, 17 Jun 2021 11:36:14 +0200 Subject: [PATCH 02/13] BUG: reordering the index labels --- pandas/core/series.py | 55 +++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index e96b5c6bb4545..1c104218415c7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -129,7 +129,6 @@ SingleArrayManager, SingleBlockManager, ) -from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -3658,29 +3657,39 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: -------- numpy.ndarray.argsort : Returns the indices that would sort this array. 
""" - values = self._values + values = self.to_numpy() + res = np.argsort(values, kind=kind) + res_ser = Series(res, index=self.index[res], dtype="int64", name=self.name) mask = isna(values) - - if mask.any(): - mask_ret = Series(-1, index=self.index[mask], dtype="int64", name=self.name) - notmask_result = np.argsort(values[~mask], kind=kind) - notmask_ret = Series( - notmask_result, - index=self.index[~mask][notmask_result], - dtype="int64", - name=self.name, - ) - - # result = Series(-1, index=self.index, name=self.name, dtype="int64") - # notmask = ~mask - # result[notmask] = np.argsort(values[notmask], kind=kind) - ret = concat([notmask_ret, mask_ret]) - return ret.__finalize__(self, method="argsort") - else: - result = np.argsort(values, kind=kind) - return self._constructor( - result, index=self.index[result], dtype="int64" - ).__finalize__(self, method="argsort") + res_ser[self.index[mask]] = -1 # set na tp -1 + return res_ser.__finalize__(self, method="argsort") + + # if mask.any(): + # result = np.argsort(values, kind=kind) + # result_series = Series(result, index=self.index[result], dtype="int64", + # name=self.name) + # + # + # mask_ret = Series(-1, index=self.index[mask], dtype="int64", + # name=self.name) + # notmask_result = np.argsort(values[~mask], kind=kind) + # notmask_ret = Series( + # notmask_result, + # index=self.index[~mask][notmask_result], + # dtype="int64", + # name=self.name, + # ) + # + # # result = Series(-1, index=self.index, name=self.name, dtype="int64") + # # notmask = ~mask + # # result[notmask] = np.argsort(values[notmask], kind=kind) + # ret = concat([notmask_ret, mask_ret]) + # return ret.__finalize__(self, method="argsort") + # else: + # result = np.argsort(values, kind=kind) + # return self._constructor( + # result, index=self.index[result], dtype="int64" + # ).__finalize__(self, method="argsort") def nlargest(self, n=5, keep="first") -> Series: """ From fa848c08a97de0382f7773acf22b3aa2280d0c6e Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Thu, 17 Jun 2021 22:43:52 +0200 Subject: [PATCH 03/13] amend tests --- pandas/core/series.py | 68 ++++++++++++--------- pandas/tests/series/methods/test_argsort.py | 13 ++-- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c104218415c7..cd4593e7c98e0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3634,8 +3634,8 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: """ Return the integer indices that would sort the Series values. - Override ndarray.argsort. Argsorts the value, omitting NA/null values, - and places the result in the same locations as the non-NA values. + Override ndarray.argsort. NA/null sort indices are replaced by -1. The index + is sorted so that index labels correspond to the integer indices Parameters ---------- @@ -3656,41 +3656,49 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: See Also -------- numpy.ndarray.argsort : Returns the indices that would sort this array. + + Examples + -------- + + Argsorting a basic Series. + + >>> series = Series([30, 10, 20], index=["high", "low", "mid"], name="xy") + >>> series.argsort() + low 1 + mid 2 + high 0 + Name: xy, dtype: int64 + + Argsorting a Series with null values. + + >>> series = Series([30, 10, np.nan, 20], name="xy", + ... 
index=["high", "low", "null", "mid"]) + >>> series.argsort() + low 1 + mid 3 + high 0 + null -1 + Name: xy, dtype: int64 """ values = self.to_numpy() + mask = isna(values) + object_and_nulls = self.dtype == "object" and any(mask) + if object_and_nulls: + # if an object/string array has nans convert to last ascii character '~' + values = np.where(isna(values), "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", values) res = np.argsort(values, kind=kind) + if object_and_nulls: + if values[res[-1]] != "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~": + # then another string has lower sort index than the substitute + # so return the original, unedited numpy TypeError + raise TypeError( + "'<' not supported between instances of 'float' and 'str'" + ) + res_ser = Series(res, index=self.index[res], dtype="int64", name=self.name) - mask = isna(values) res_ser[self.index[mask]] = -1 # set na tp -1 return res_ser.__finalize__(self, method="argsort") - # if mask.any(): - # result = np.argsort(values, kind=kind) - # result_series = Series(result, index=self.index[result], dtype="int64", - # name=self.name) - # - # - # mask_ret = Series(-1, index=self.index[mask], dtype="int64", - # name=self.name) - # notmask_result = np.argsort(values[~mask], kind=kind) - # notmask_ret = Series( - # notmask_result, - # index=self.index[~mask][notmask_result], - # dtype="int64", - # name=self.name, - # ) - # - # # result = Series(-1, index=self.index, name=self.name, dtype="int64") - # # notmask = ~mask - # # result[notmask] = np.argsort(values[notmask], kind=kind) - # ret = concat([notmask_ret, mask_ret]) - # return ret.__finalize__(self, method="argsort") - # else: - # result = np.argsort(values, kind=kind) - # return self._constructor( - # result, index=self.index[result], dtype="int64" - # ).__finalize__(self, method="argsort") - def nlargest(self, n=5, keep="first") -> Series: """ Return the largest `n` elements. 
diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 7a545378ef402..e4b156ef5c15e 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -20,8 +20,9 @@ def _check_accum_op(self, name, ser, check_dtype=True): ts = ser.copy() ts[::2] = np.NaN - result = func(ts)[1::2] - expected = func(np.array(ts.dropna())) + result = func(ts) + expected = func(ts.to_numpy()) + expected[int(len(expected) / 2) :] = -1 tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) @@ -53,8 +54,12 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, kind="mergesort") qexpected = np.argsort(s.values, kind="quicksort") - tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) - tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) + tm.assert_series_equal( + mindexer.astype(np.intp), Series(mexpected, index=s.index[mexpected]) + ) + tm.assert_series_equal( + qindexer.astype(np.intp), Series(qexpected, index=s.index[qexpected]) + ) msg = ( r"ndarray Expected type , " r"found instead" From 701d3c867f73f7dd131626aa63b71ebd58a0885f Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Fri, 18 Jun 2021 11:01:12 +0200 Subject: [PATCH 04/13] update method and amend tests --- pandas/core/series.py | 41 +++++++++++++++----------- pandas/tests/extension/base/methods.py | 4 +-- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cd4593e7c98e0..76b3a1109a3e8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3680,24 +3680,31 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: null -1 Name: xy, dtype: int64 """ - values = self.to_numpy() - mask = isna(values) - object_and_nulls = self.dtype == "object" and any(mask) - if object_and_nulls: - # if an object/string array has nans convert to last ascii character '~' - values = np.where(isna(values), "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", values) - res = np.argsort(values, kind=kind) - if object_and_nulls: - if values[res[-1]] != "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~": - # then another string has lower sort index than the substitute - # so return the original, unedited numpy TypeError - raise TypeError( - "'<' not supported between instances of 'float' and 'str'" - ) + values = self.array + na_mask = isna(values) + if not any(na_mask): + res = np.argsort(values, kind=kind) + res_ser = Series(res, index=self.index[res], dtype="int64", name=self.name) + return res_ser.__finalize__(self, method="argsort") + else: + # GH 42090 + # count the missing index values that will be added to not na results + notna_na_cumsum = na_mask.cumsum()[~na_mask] + notna_argsort = np.argsort(values[~na_mask]) + notna_argsort += notna_na_cumsum[notna_argsort] + # combine the notna and na series + na_res_ser = Series( + -1, index=self.index[na_mask], dtype="int64", name=self.name + ) + notna_res_ser = Series( + notna_argsort, + index=self.index[notna_argsort], + dtype="int64", + name=self.name, + ) + from pandas.core.reshape.concat import concat - res_ser = Series(res, index=self.index[res], dtype="int64", name=self.name) - res_ser[self.index[mask]] = -1 # set na tp -1 - return res_ser.__finalize__(self, method="argsort") + return concat([notna_res_ser, na_res_ser]) def nlargest(self, n=5, keep="first") -> Series: """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index ca9c2acb9fd12..69bb91c0825ee 100644 
--- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -71,7 +71,7 @@ def test_apply_simple_series(self, data): def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() - expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) + expected = pd.Series(np.array([2, 0, 1], dtype=np.int64), index=[2, 0, 1]) self.assert_series_equal(result, expected) def test_argsort_missing_array(self, data_missing_for_sorting): @@ -84,7 +84,7 @@ def test_argsort_missing_array(self, data_missing_for_sorting): def test_argsort_missing(self, data_missing_for_sorting): result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) + expected = pd.Series(np.array([2, 0, -1], dtype=np.int64), index=[2, 0, 1]) self.assert_series_equal(result, expected) def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): From c86f78dbe7b2dffb2c311406a6c38c1c81726476 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Fri, 18 Jun 2021 11:06:49 +0200 Subject: [PATCH 05/13] special if for SparseArray --- pandas/core/series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 76b3a1109a3e8..8f72825ec7adb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3689,6 +3689,9 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: else: # GH 42090 # count the missing index values that will be added to not na results + if isinstance(na_mask, pandas.core.arrays.sparse.SparseArray): + # avoid RecursionError + na_mask = np.asarray(na_mask) notna_na_cumsum = na_mask.cumsum()[~na_mask] notna_argsort = np.argsort(values[~na_mask]) notna_argsort += notna_na_cumsum[notna_argsort] From 2d92c342dc8aba2b0043e0ece7b949f4a7db4688 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Fri, 18 Jun 2021 13:59:48 +0200 Subject: [PATCH 06/13] mypy fix --- pandas/core/series.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8f72825ec7adb..79da2e58f9c64 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3680,7 +3680,7 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: null -1 Name: xy, dtype: int64 """ - values = self.array + values = self.values na_mask = isna(values) if not any(na_mask): res = np.argsort(values, kind=kind) @@ -3707,7 +3707,11 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: ) from pandas.core.reshape.concat import concat - return concat([notna_res_ser, na_res_ser]) + ret_ser = concat([notna_res_ser, na_res_ser]).__finalize__( + self, method="argsort" + ) + assert isinstance(ret_ser, Series) # mypy: concat 2 Series so is OK + return ret_ser def nlargest(self, n=5, keep="first") -> Series: """ From 94fefdce9a3e2f8ec00964fdbd8d0ad4c6218976 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Wed, 7 Jul 2021 08:16:02 +0200 Subject: [PATCH 07/13] whats new 1.4.0 --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5b09b62fa9e88..d2d88f84b506a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -212,7 +212,7 @@ Interval Indexing ^^^^^^^^ - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic 
(:issue:`24892`) -- +- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct (:issue:`42056`, :issue:`12694`) Missing ^^^^^^^ From 944f08b88f983002a1520717f8464e7aacb4dec6 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Thu, 8 Jul 2021 10:44:11 +0200 Subject: [PATCH 08/13] add possible return formats with "first" "last" and tests --- pandas/core/series.py | 61 ++++++++++++++++----- pandas/tests/series/methods/test_argsort.py | 17 ++++++ 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 79f81b9b9ff13..2ccfd7c096a73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3633,12 +3633,12 @@ def sort_index( key, ) - def argsort(self, axis=0, kind="quicksort", order=None) -> Series: + def argsort(self, axis=0, kind="quicksort", order=None, na_position=None) -> Series: """ Return the integer indices that would sort the Series values. - Override ndarray.argsort. NA/null sort indices are replaced by -1. The index - is sorted so that index labels correspond to the integer indices + Override ndarray.argsort. The index is also sorted so that index labels + correspond to the integer indices. Parameters ---------- @@ -3649,20 +3649,26 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + na_position : {"first", "last", None} + Puts NaNs at the beginning if *first*; *last* puts NaNs at the end. + Defaults to *None*, which puts NaNs at the end an gives them all a sorting + index of '-1'. + + .. versionadded:: 1.4.0 Returns ------- Series[np.intp] - Positions of values within the sort order with -1 indicating - nan values. + Positions of values within the sort order with associated sorted index. See Also -------- numpy.ndarray.argsort : Returns the indices that would sort this array. + Series.idxmax : Return the row label of the maximum value. + Series.idxmin : Return the row label of the minimum value. Examples -------- - Argsorting a basic Series. 
>>> series = Series([30, 10, 20], index=["high", "low", "mid"], name="xy") @@ -3682,25 +3688,53 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: high 0 null -1 Name: xy, dtype: int64 + + Argsorting a Series using ``na_position`` + + >>> series.argsort(na_position="first") + null 2 + low 1 + mid 3 + high 0 + Name: xy, dtype: int64 """ values = self.values na_mask = isna(values) - if not any(na_mask): + n_na = na_mask.sum() + if n_na == 0: # number of NaN values is zero res = np.argsort(values, kind=kind) res_ser = Series(res, index=self.index[res], dtype="int64", name=self.name) return res_ser.__finalize__(self, method="argsort") else: # GH 42090 - # count the missing index values that will be added to not na results if isinstance(na_mask, pandas.core.arrays.sparse.SparseArray): # avoid RecursionError na_mask = np.asarray(na_mask) + + # Do the not_na argsort: + # count the missing index values within arrays added to not_na results notna_na_cumsum = na_mask.cumsum()[~na_mask] + # argsort the values excluding the nans notna_argsort = np.argsort(values[~na_mask]) + # add to these the indexes where nans have been removed notna_argsort += notna_na_cumsum[notna_argsort] - # combine the notna and na series + + # Do the na argsort: + if na_position is None: + na_argsort = -1 + elif na_position == "first" or na_position == "last": + # count the missing index values within arrays added to na results + na_notna_cumsum = (~na_mask).cumsum()[na_mask] + # argsort the nans + na_argsort = np.arange(n_na) + # add to these the indexes where not nans have been removed + na_argsort += na_notna_cumsum + else: + raise ValueError("`na_position` must be one of {'first', 'last', None}") + + # create and combine the Series: na_res_ser = Series( - -1, index=self.index[na_mask], dtype="int64", name=self.name + na_argsort, index=self.index[na_mask], dtype="int64", name=self.name ) notna_res_ser = Series( notna_argsort, @@ -3710,9 +3744,10 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> Series: ) from pandas.core.reshape.concat import concat - ret_ser = concat([notna_res_ser, na_res_ser]).__finalize__( - self, method="argsort" - ) + concat_order = [notna_res_ser, na_res_ser] + if na_position == "first": + concat_order = [na_res_ser, notna_res_ser] + ret_ser = concat(concat_order).__finalize__(self, method="argsort") assert isinstance(ret_ser, Series) # mypy: concat 2 Series so is OK return ret_ser diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index e4b156ef5c15e..0d7470e3faca2 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -70,3 +70,20 @@ def test_argsort_stable(self): def test_argsort_preserve_name(self, datetime_series): result = datetime_series.argsort() assert result.name == datetime_series.name + + def test_na_pos_raises(self): + s = Series([0, np.nan]) + with pytest.raises(ValueError, match="`na_position` must be one of"): + s.argsort(na_position="bad input") + + @pytest.mark.parametrize( + "na_position, expected", + [ + (None, Series([2, 0, -1, -1], index=["c", "a", "b", "d"])), + ("first", Series([1, 3, 2, 0], index=["b", "d", "c", "a"])), + ("last", Series([2, 0, 1, 3], index=["c", "a", "b", "d"])), + ], + ) + def test_na_position(self, na_position, expected): + s = Series([2, np.nan, 1, np.nan], index=["a", "b", "c", "d"]) + tm.assert_series_equal(s.argsort(na_position=na_position), expected) From 372a8db8345afa813b2c00a48acad952d472f426 Mon Sep 17 00:00:00 
2001 From: "JHM Darbyshire (iMac)" Date: Thu, 8 Jul 2021 10:46:21 +0200 Subject: [PATCH 09/13] default --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2ccfd7c096a73..afc534066b340 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3649,7 +3649,7 @@ def argsort(self, axis=0, kind="quicksort", order=None, na_position=None) -> Ser information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. - na_position : {"first", "last", None} + na_position : {None, "first", "last"} Puts NaNs at the beginning if *first*; *last* puts NaNs at the end. Defaults to *None*, which puts NaNs at the end an gives them all a sorting index of '-1'. From 0e76ee36a4b73c0033ba6eb8c8d15cd1b3b764ff Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (MBP)" Date: Sat, 18 Sep 2021 09:46:45 +0200 Subject: [PATCH 10/13] whats new --- doc/source/whatsnew/v1.4.0.rst | 58 +++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index cc9eeb5d661b5..64ec6e4ba07e6 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -185,6 +185,62 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is res +.. _whatsnew_140.notable_bug_fixes.argsort_bug_fix: + +Argsort now returns NumPy like +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The current implementation of :meth:`Series.argsort` suffers from two main issues: + + - Returning a useful argsort array which is only applicable to the values *after* null elements have been removed. + - A confusing and static index for the returned Series object which has, in general, random argsort indexes. + +For example with: + +.. ipython:: python + + ser = pd.Series([2, np.nan, np.nan, 1], index=["high", "null", "null2", "low"]) + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: ser.argsort() + Out[5]: + high 1 + null -1 + null2 -1 + low 0 + dtype: int64 + +Firstly we observe that, for elements that are not null, the argsort has returned *1* and *0*. These are the +correct sorting indices when nulls are removed, i.e. the lowest element has index *1* and the highest element has index *0*. +If the nulls are not removed the sorting indices for the lowest element is actually *3* and the highest is *0*. + +Secondly, the argsort values for null elements returned is *-1* and those values have been positioned in the array +associated with the previous index labels for "null" and "null2". As indicated above, however, the relationship +between the index and the argsort values does not hold for the not nulls values, since the index is not sorted. + +*New behaviour*: + +The new behaviour replicates the behaviour in ``numpy.argsort``, whilst extending the functionality to work +with pandas nullable data types, and providing a *-1* return value for part backwards compatability, and also +sorting the index to maintain a sensible and useful association with the index and argsort value. + +.. ipython:: + + ser.argsort() + +The additional ``na_position`` argument allows ``numpy`` replication with an associated series index. + +.. ipython:: + + ser.argsort(na_position="first") + +.. ipython:: + + ser.argsort(na_position="last") + .. 
_whatsnew_140.notable_bug_fixes.notable_bug_fix3: notable_bug_fix3 @@ -424,7 +480,7 @@ Indexing - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) -- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct (:issue:`42056`, :issue:`12694`) +- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct - From a78b9ff37650bf6349a853c183b5c205b18ccc3e Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Sat, 18 Sep 2021 14:12:19 +0200 Subject: [PATCH 11/13] whats new --- doc/source/whatsnew/v1.4.0.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4b92a8771173c..b7d7686d6e467 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -224,20 +224,20 @@ between the index and the argsort values does not hold for the not nulls values, *New behaviour*: The new behaviour replicates the behaviour in ``numpy.argsort``, whilst extending the functionality to work -with pandas nullable data types, and providing a *-1* return value for part backwards compatability, and also +with pandas nullable data types, and providing a *-1* return value for part backwards compatibility, and also sorting the index to maintain a sensible and useful association with the index and argsort value. -.. ipython:: +.. ipython:: python ser.argsort() The additional ``na_position`` argument allows ``numpy`` replication with an associated series index. -.. ipython:: +.. ipython:: python ser.argsort(na_position="first") -.. ipython:: +.. 
ipython:: python ser.argsort(na_position="last") @@ -480,7 +480,7 @@ Indexing - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) -- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct +- Bug in :class:`Series.argsort` where the index was misaligned and the indexed values were not correct (:issue:`12694`, :issue:`42056`) - From 220fc67d689f4ae6e99e8cc9a4322e538552d4c0 Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Sat, 2 Oct 2021 12:49:25 +0200 Subject: [PATCH 12/13] Merge remote-tracking branch 'upstream/master' into argsort_labelling_index_sorted # Conflicts: # doc/source/whatsnew/v1.4.0.rst --- .github/workflows/ci.yml | 9 + .github/workflows/pre-commit.yml | 2 + .pre-commit-config.yaml | 14 +- asv_bench/benchmarks/groupby.py | 32 +++ asv_bench/benchmarks/indexing_engines.py | 2 +- asv_bench/benchmarks/io/style.py | 14 + asv_bench/benchmarks/sparse.py | 17 +- ci/code_checks.sh | 11 + .../development/contributing_codebase.rst | 7 +- doc/source/index.rst.template | 3 + doc/source/user_guide/basics.rst | 4 + doc/source/user_guide/groupby.rst | 4 +- doc/source/user_guide/merging.rst | 14 +- doc/source/user_guide/visualization.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 1 + doc/source/whatsnew/v1.3.4.rst | 6 +- environment.yml | 1 + pandas/__init__.py | 124 +++++++- pandas/_config/config.py | 30 +- pandas/_libs/algos.pxd | 4 +- pandas/_libs/algos.pyi | 46 +-- pandas/_libs/algos.pyx | 142 +++++----- pandas/_libs/arrays.pyi | 4 +- pandas/_libs/dtypes.pxd | 48 ++++ pandas/_libs/groupby.pyi | 6 +- pandas/_libs/groupby.pyx | 162 +++++------ pandas/_libs/hashtable.pyi | 67 ++--- pandas/_libs/hashtable.pyx | 8 +- pandas/_libs/index.pyi | 8 +- pandas/_libs/index.pyx | 156 ++++++---- pandas/_libs/index_class_helper.pxi.in | 39 +-- pandas/_libs/internals.pyi | 2 +- pandas/_libs/internals.pyx | 30 +- pandas/_libs/join.pyi | 26 +- pandas/_libs/join.pyx | 108 +++---- pandas/_libs/lib.pyi | 55 ++-- pandas/_libs/lib.pyx | 17 +- pandas/_libs/reshape.pyx | 28 +- pandas/_libs/sparse.pyi | 2 +- pandas/_libs/src/parser/tokenizer.c | 28 +- pandas/_libs/tslibs/dtypes.pyi | 2 +- pandas/_libs/tslibs/dtypes.pyx | 4 +- pandas/_libs/tslibs/fields.pyi | 2 +- pandas/_libs/tslibs/nattype.pyx | 6 +- pandas/_libs/tslibs/offsets.pyx | 12 +- pandas/_libs/tslibs/period.pyi | 20 +- pandas/_libs/tslibs/period.pyx | 2 +- pandas/_libs/tslibs/strptime.pyi | 4 +- pandas/_libs/tslibs/strptime.pyx | 4 +- pandas/_libs/tslibs/timestamps.pyx | 20 +- pandas/_libs/tslibs/timezones.pyx | 16 +- pandas/_libs/tslibs/tzconversion.pyi | 4 +- pandas/_libs/tslibs/vectorized.pyi | 12 +- pandas/_libs/util.pxd | 15 - pandas/_libs/window/aggregations.pyx | 29 +- pandas/_testing/_io.py | 16 +- pandas/_typing.py | 10 +- pandas/core/_numba/__init__.py | 0 pandas/core/_numba/executor.py | 59 ++++ pandas/core/_numba/kernels/__init__.py | 3 + pandas/core/_numba/kernels/mean_.py | 119 ++++++++ pandas/core/algorithms.py | 20 +- pandas/core/apply.py | 68 +++-- pandas/core/array_algos/take.py | 9 +- pandas/core/arrays/_mixins.py | 9 +- pandas/core/arrays/base.py | 5 +- 
pandas/core/arrays/datetimelike.py | 4 +- pandas/core/arrays/interval.py | 4 +- pandas/core/arrays/sparse/accessor.py | 8 +- pandas/core/arrays/sparse/array.py | 50 +++- pandas/core/arrays/string_arrow.py | 13 +- pandas/core/base.py | 53 +++- pandas/core/computation/align.py | 8 +- pandas/core/computation/expr.py | 14 +- pandas/core/construction.py | 9 +- pandas/core/describe.py | 8 +- pandas/core/dtypes/cast.py | 4 + pandas/core/frame.py | 7 +- pandas/core/generic.py | 266 ++++++++++-------- pandas/core/groupby/generic.py | 145 ++-------- pandas/core/groupby/groupby.py | 206 ++++++++------ pandas/core/groupby/grouper.py | 36 ++- pandas/core/groupby/ops.py | 62 ++-- pandas/core/indexes/base.py | 45 ++- pandas/core/indexes/datetimelike.py | 257 ++++++++--------- pandas/core/indexes/extension.py | 48 ---- pandas/core/indexes/interval.py | 7 +- pandas/core/indexes/period.py | 11 +- pandas/core/internals/base.py | 23 +- pandas/core/internals/blocks.py | 45 +-- pandas/core/internals/managers.py | 6 +- pandas/core/internals/ops.py | 14 +- pandas/core/nanops.py | 2 + pandas/core/resample.py | 40 ++- pandas/core/reshape/merge.py | 17 +- pandas/core/reshape/tile.py | 2 + pandas/core/sample.py | 8 +- pandas/core/series.py | 17 +- pandas/core/sorting.py | 3 +- pandas/core/tools/timedeltas.py | 1 + pandas/core/window/ewm.py | 18 +- pandas/core/window/expanding.py | 4 +- pandas/core/window/rolling.py | 67 ++++- pandas/io/__init__.py | 12 + pandas/io/clipboard/__init__.py | 10 +- pandas/io/formats/__init__.py | 8 + pandas/io/formats/format.py | 2 +- pandas/io/formats/style.py | 17 +- pandas/io/formats/style_render.py | 16 +- pandas/io/formats/xml.py | 2 - pandas/io/html.py | 1 - pandas/io/json/_table_schema.py | 4 +- pandas/io/parsers/arrow_parser_wrapper.py | 15 + pandas/io/pytables.py | 28 +- pandas/io/xml.py | 8 +- pandas/plotting/_matplotlib/boxplot.py | 14 +- pandas/tests/api/test_api.py | 38 ++- pandas/tests/apply/test_frame_apply.py | 35 ++- pandas/tests/apply/test_frame_transform.py | 21 +- pandas/tests/apply/test_series_apply.py | 39 ++- pandas/tests/arrays/sparse/test_accessor.py | 16 ++ pandas/tests/arrays/sparse/test_array.py | 48 ++-- pandas/tests/extension/test_numpy.py | 13 + pandas/tests/frame/methods/test_explode.py | 12 +- .../tests/groupby/aggregate/test_aggregate.py | 90 +++++- pandas/tests/groupby/aggregate/test_other.py | 6 +- pandas/tests/groupby/test_groupby.py | 5 +- pandas/tests/groupby/test_grouping.py | 2 +- .../tests/indexes/base_class/test_indexing.py | 42 ++- pandas/tests/indexes/test_numpy_compat.py | 2 +- .../indexing/multiindex/test_multiindex.py | 24 +- pandas/tests/indexing/test_at.py | 21 ++ pandas/tests/internals/test_internals.py | 4 +- pandas/tests/io/parser/common/test_index.py | 1 - .../io/parser/dtypes/test_dtypes_basic.py | 23 ++ pandas/tests/io/parser/test_index_col.py | 2 - pandas/tests/io/test_parquet.py | 23 ++ pandas/tests/resample/test_resample_api.py | 3 +- pandas/tests/series/methods/test_rename.py | 24 ++ pandas/tests/tools/test_to_datetime.py | 22 ++ pandas/tests/window/test_numba.py | 34 ++- pandas/tseries/__init__.py | 11 + pandas/util/_depr_module.py | 107 ------- pyproject.toml | 85 +++++- requirements-dev.txt | 1 + 145 files changed, 2615 insertions(+), 1561 deletions(-) create mode 100644 pandas/_libs/dtypes.pxd create mode 100644 pandas/core/_numba/__init__.py create mode 100644 pandas/core/_numba/executor.py create mode 100644 pandas/core/_numba/kernels/__init__.py create mode 100644 pandas/core/_numba/kernels/mean_.py delete mode 100644 
pandas/util/_depr_module.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 93c17997a95f7..c8f5f0385732f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,15 @@ jobs: environment-file: ${{ env.ENV_FILE }} use-only-tar-bz2: true + - name: Install node.js (for pyright) + uses: actions/setup-node@v2 + with: + node-version: "16" + + - name: Install pyright + # note: keep version in sync with .pre-commit-config.yaml + run: npm install -g pyright@1.1.171 + - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0609755678d78..9afb0e34f1075 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -13,6 +13,8 @@ jobs: concurrency: group: ${{ github.ref }}-pre-commit cancel-in-progress: ${{github.event_name == 'pull_request'}} + env: + SKIP: pyright steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eb0abe1829fc6..5a26ed150a3ed 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: - id: absolufy-imports files: ^pandas/ - repo: https://github.com/python/black - rev: 21.7b0 + rev: 21.9b0 hooks: - id: black - repo: https://github.com/codespell-project/codespell @@ -58,7 +58,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.23.3 + rev: v2.29.0 hooks: - id: pyupgrade args: [--py38-plus] @@ -81,6 +81,16 @@ repos: - flake8-comprehensions==3.1.0 - flake8-bugbear==21.3.2 - pandas-dev-flaker==0.2.0 +- repo: local + hooks: + - id: pyright + name: pyright + entry: pyright + language: node + pass_filenames: false + types: [python] + # note: keep version in sync with .github/workflows/ci.yml + additional_dependencies: ['pyright@1.1.171'] - repo: local hooks: - id: flake8-rst diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index e41f185e08443..86322661a4e8a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -603,6 +603,38 @@ def time_sum(self): self.df.groupby(["a"])["b"].sum() +class String: + # GH#41596 + param_names = ["dtype", "method"] + params = [ + ["str", "string[python]"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + cols = list("abcdefghjkl") + self.df = DataFrame( + np.random.randint(0, 100, size=(1_000_000, len(cols))), + columns=cols, + dtype=dtype, + ) + + def time_str_func(self, dtype, method): + self.df.groupby("a")[self.df.columns[1:]].agg(method) + + class Categories: def setup(self): N = 10 ** 5 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index dc9576bc94d4c..0cbc300ee2fc4 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,5 +1,5 @@ """ -Benchmarks in this fiel depend exclusively on code in _libs/ +Benchmarks in this file depend exclusively on code in _libs/ If a PR does not edit anything in _libs, it is very unlikely that benchmarks in this file will be affected. 
diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index d3934942b7dc4..f0902c9c2c328 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -34,6 +34,14 @@ def peakmem_classes_render(self, cols, rows): self._style_classes() self.st._render_html(True, True) + def time_tooltips_render(self, cols, rows): + self._style_tooltips() + self.st._render_html(True, True) + + def peakmem_tooltips_render(self, cols, rows): + self._style_tooltips() + self.st._render_html(True, True) + def time_format_render(self, cols, rows): self._style_format() self.st._render_html(True, True) @@ -77,3 +85,9 @@ def _style_apply_format_hide(self): self.st.format("{:.3f}") self.st.hide_index(self.st.index[1:]) self.st.hide_columns(self.st.columns[1:]) + + def _style_tooltips(self): + ttips = DataFrame("abc", index=self.df.index[::2], columns=self.df.columns[::2]) + self.st = self.df.style.set_tooltips(ttips) + self.st.hide_index(self.st.index[12:]) + self.st.hide_columns(self.st.columns[12:]) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ff1c4c92fe551..8969beb17f2e3 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -95,11 +95,11 @@ class ToCooFrame: def setup(self): N = 10000 k = 10 - arr = np.full((N, k), np.nan) + arr = np.zeros((N, k), dtype=float) arr[0, 0] = 3.0 arr[12, 7] = -1.0 arr[0, 9] = 11.2 - self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float")) + self.df = pd.DataFrame(arr, dtype=pd.SparseDtype("float", fill_value=0.0)) def time_to_coo(self): self.df.sparse.to_coo() @@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill): self.sp_arr.take(indices, allow_fill=allow_fill) +class GetItem: + def setup(self): + N = 1_000_000 + arr = make_array(N, 1e-5, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_integer_indexing(self): + self.sp_arr[78] + + def time_slice(self): + self.sp_arr[1:] + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7e4b5775af317..21250789fde9f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -84,6 +84,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Cython Doctests' ; echo $MSG + python -m pytest --doctest-cython pandas/_libs + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DOCSTRINGS ### @@ -104,6 +108,13 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then MSG='Performing static analysis using mypy' ; echo $MSG mypy pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + + # run pyright, if it is installed + if command -v pyright &> /dev/null ; then + MSG='Performing static analysis using pyright' ; echo $MSG + pyright + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi fi exit $RET diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 6fe0fa82236c3..eeba46a46dfd9 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -395,12 +395,17 @@ This module will ultimately house types for repeatedly used concepts like "path- Validating type hints ~~~~~~~~~~~~~~~~~~~~~ -pandas uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running +pandas uses `mypy `_ and `pyright `_ to statically analyze the code base and type hints. 
After making any change you can ensure your type hints are correct by running .. code-block:: shell mypy pandas + # let pre-commit setup and run pyright + pre-commit run --all-files pyright + # or if pyright is installed (requires node.js) + pyright + .. _contributing.ci: Testing with continuous integration diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 51a6807b30e2a..3b440122c2b97 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -12,6 +12,9 @@ pandas documentation **Download documentation**: `PDF Version `__ | `Zipped HTML `__ +**Previous versions**: Documentation of previous pandas versions is available at +`pandas.pydata.org `__. + **Useful links**: `Binary Installers `__ | `Source Repository `__ | diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 82c8a27bec3a5..a589ad96ca7d9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1045,6 +1045,9 @@ not noted for a particular column will be ``NaN``: Mixed dtypes ++++++++++++ +.. deprecated:: 1.4.0 + Attempting to determine which columns cannot be aggregated and silently dropping them from the results is deprecated and will be removed in a future version. If any porition of the columns or operations provided fail, the call to ``.agg`` will raise. + When presented with mixed dtypes that cannot aggregate, ``.agg`` will only take the valid aggregations. This is similar to how ``.groupby.agg`` works. @@ -1061,6 +1064,7 @@ aggregations. This is similar to how ``.groupby.agg`` works. mdf.dtypes .. ipython:: python + :okwarning: mdf.agg(["min", "sum"]) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2cd6efe592277..0fb59c50efa74 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -578,7 +578,7 @@ column, which produces an aggregated result with a hierarchical index: .. ipython:: python - grouped.agg([np.sum, np.mean, np.std]) + grouped[["C", "D"]].agg([np.sum, np.mean, np.std]) The resulting aggregations are named for the functions themselves. If you @@ -597,7 +597,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python ( - grouped.agg([np.sum, np.mean, np.std]).rename( + grouped[["C", "D"]].agg([np.sum, np.mean, np.std]).rename( columns={"sum": "foo", "mean": "bar", "std": "baz"} ) ) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 09b3d3a8c96df..cee12c6939b25 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -562,7 +562,7 @@ all standard database join operations between ``DataFrame`` or named ``Series`` (hierarchical), the number of levels must match the number of join keys from the right DataFrame or Series. * ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series -* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults +* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``, ``'cross'``. Defaults to ``inner``. See below for more detailed description of each method. * ``sort``: Sort the result DataFrame by the join keys in lexicographical order. 
Defaults to ``True``, setting to ``False`` will improve performance @@ -707,6 +707,7 @@ either the left or right tables, the values in the joined table will be ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames ``inner``, ``INNER JOIN``, Use intersection of keys from both frames + ``cross``, ``CROSS JOIN``, Create the cartesian product of rows of both frames .. ipython:: python @@ -751,6 +752,17 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); +.. ipython:: python + + result = pd.merge(left, right, how="cross") + +.. ipython:: python + :suppress: + + @savefig merging_merge_cross.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); + You can merge a mult-indexed Series and a DataFrame, if the names of the MultiIndex correspond to the columns from the DataFrame. Transform the Series to a DataFrame using :meth:`Series.reset_index` before merging, diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 4a4c3624aa614..fd0af7583f5dc 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1651,7 +1651,7 @@ remedy this, ``DataFrame`` plotting supports the use of the ``colormap`` argumen which accepts either a Matplotlib `colormap `__ or a string that is a name of a colormap registered with Matplotlib. A visualization of the default matplotlib colormaps is available `here -`__. +`__. As matplotlib does not directly support colormaps for line-based plots, the colors are selected based on an even spacing determined by the number of columns diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 8d9821e53e30c..239431b7621c6 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -105,6 +105,7 @@ aggregations. This is similar to how groupby ``.agg()`` works. (:issue:`15015`) df.dtypes .. ipython:: python + :okwarning: df.agg(['min', 'sum']) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 6212f2c6f3399..05667264ad9af 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -14,13 +14,16 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`) - Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`) - Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`) - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`) +- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`) - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`) - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`) - Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`) -- +- Fixed regression in :meth:`DataFrame.explode` raising ``AssertionError`` when ``column`` is any scalar which is not a string (:issue:`43314`) +- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) .. --------------------------------------------------------------------------- @@ -29,6 +32,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`) +- Fixed bug in :meth:`Series.aggregate` not passing the first ``args`` to the user supplied ``func`` in certain cases (:issue:`43357`) .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index 733bd06fbe12f..00fb5b0580200 100644 --- a/environment.yml +++ b/environment.yml @@ -122,3 +122,4 @@ dependencies: - types-PyMySQL - types-pytz - types-setuptools + - pytest-cython diff --git a/pandas/__init__.py b/pandas/__init__.py index 294b092e33c58..9505d0481ee19 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -131,7 +131,7 @@ qcut, ) -import pandas.api +from pandas import api, arrays, errors, io, plotting, testing, tseries from pandas.util._print_versions import show_versions from pandas.io.api import ( @@ -170,8 +170,6 @@ from pandas.io.json import _json_normalize as json_normalize from pandas.util._tester import test -import pandas.testing -import pandas.arrays # use the closest tagged version if possible from pandas._version import get_versions @@ -181,7 +179,6 @@ __git_version__ = v.get("full-revisionid") del get_versions, v - # GH 27101 __deprecated_num_index_names = ["Float64Index", "Int64Index", "UInt64Index"] @@ -303,3 +300,122 @@ def __getattr__(name): - Time series-specific functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging. """ + +# Use __all__ to let type checkers know what is part of the public API. +# Pandas is not (yet) a py.typed library: the public API is determined +# based on the documentation. 
+__all__ = [ + "BooleanDtype", + "Categorical", + "CategoricalDtype", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "DatetimeTZDtype", + "ExcelFile", + "ExcelWriter", + "Flags", + "Float32Dtype", + "Float64Dtype", + "Grouper", + "HDFStore", + "Index", + "IndexSlice", + "Int16Dtype", + "Int32Dtype", + "Int64Dtype", + "Int8Dtype", + "Interval", + "IntervalDtype", + "IntervalIndex", + "MultiIndex", + "NA", + "NaT", + "NamedAgg", + "NumericIndex", + "Period", + "PeriodDtype", + "PeriodIndex", + "RangeIndex", + "Series", + "SparseDtype", + "StringDtype", + "Timedelta", + "TimedeltaIndex", + "Timestamp", + "UInt16Dtype", + "UInt32Dtype", + "UInt64Dtype", + "UInt8Dtype", + "api", + "array", + "arrays", + "bdate_range", + "concat", + "crosstab", + "cut", + "date_range", + "describe_option", + "errors", + "eval", + "factorize", + "get_dummies", + "get_option", + "infer_freq", + "interval_range", + "io", + "isna", + "isnull", + "json_normalize", + "lreshape", + "melt", + "merge", + "merge_asof", + "merge_ordered", + "notna", + "notnull", + "offsets", + "option_context", + "options", + "period_range", + "pivot", + "pivot_table", + "plotting", + "qcut", + "read_clipboard", + "read_csv", + "read_excel", + "read_feather", + "read_fwf", + "read_gbq", + "read_hdf", + "read_html", + "read_json", + "read_orc", + "read_parquet", + "read_pickle", + "read_sas", + "read_spss", + "read_sql", + "read_sql_query", + "read_sql_table", + "read_stata", + "read_table", + "read_xml", + "reset_option", + "set_eng_float_format", + "set_option", + "show_versions", + "test", + "testing", + "timedelta_range", + "to_datetime", + "to_numeric", + "to_pickle", + "to_timedelta", + "tseries", + "unique", + "value_counts", + "wide_to_long", +] diff --git a/pandas/_config/config.py b/pandas/_config/config.py index ed48ff7ae08c6..e2e6bbe8db7cc 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -50,7 +50,6 @@ from __future__ import annotations -from collections import namedtuple from contextlib import ( ContextDecorator, contextmanager, @@ -60,14 +59,28 @@ Any, Callable, Iterable, + NamedTuple, cast, ) import warnings from pandas._typing import F -DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") -RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") + +class DeprecatedOption(NamedTuple): + key: str + msg: str | None + rkey: str | None + removal_ver: str | None + + +class RegisteredOption(NamedTuple): + key: str + defval: object + doc: str + validator: Callable[[object], Any] | None + cb: Callable[[str], Any] | None + # holds deprecated option metadata _deprecated_options: dict[str, DeprecatedOption] = {} @@ -425,7 +438,7 @@ def register_option( key: str, defval: object, doc: str = "", - validator: Callable[[Any], Any] | None = None, + validator: Callable[[object], Any] | None = None, cb: Callable[[str], Any] | None = None, ) -> None: """ @@ -497,7 +510,10 @@ def register_option( def deprecate_option( - key: str, msg: str | None = None, rkey: str | None = None, removal_ver=None + key: str, + msg: str | None = None, + rkey: str | None = None, + removal_ver: str | None = None, ) -> None: """ Mark option `key` as deprecated, if code attempts to access this option, @@ -523,7 +539,7 @@ def deprecate_option( re-routed to `rkey` including set/get/reset. rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). used by the default message if no `msg` is specified. 
- removal_ver : optional + removal_ver : str, optional Specifies the version in which this option will be removed. used by the default message if no `msg` is specified. @@ -823,7 +839,7 @@ def inner(x) -> None: return inner -def is_nonnegative_int(value: int | None) -> None: +def is_nonnegative_int(value: object) -> None: """ Verify that value is None or a positive int. diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 4f7cc9345ed30..fdeff2ed11805 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -1,7 +1,7 @@ -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t -cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil +cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil cdef enum TiebreakEnumType: TIEBREAK_AVERAGE diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index fdec60a84a708..bff7d117c97da 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -50,16 +50,16 @@ def kth_smallest( def nancorr( mat: np.ndarray, # const float64_t[:, :] - cov: bool = False, - minp=None, + cov: bool = ..., + minp=..., ) -> np.ndarray: ... # ndarray[float64_t, ndim=2] def nancorr_spearman( mat: np.ndarray, # ndarray[float64_t, ndim=2] - minp: int = 1, + minp: int = ..., ) -> np.ndarray: ... # ndarray[float64_t, ndim=2] def nancorr_kendall( mat: np.ndarray, # ndarray[float64_t, ndim=2] - minp: int = 1, + minp: int = ..., ) -> np.ndarray: ... # ndarray[float64_t, ndim=2] # ---------------------------------------------------------------------- @@ -77,36 +77,36 @@ def nancorr_kendall( # uint16_t # uint8_t -def validate_limit(nobs: int | None, limit=None) -> int: ... +def validate_limit(nobs: int | None, limit=...) -> int: ... def pad( old: np.ndarray, # ndarray[algos_t] new: np.ndarray, # ndarray[algos_t] - limit=None, + limit=..., ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def pad_inplace( values: np.ndarray, # algos_t[:] mask: np.ndarray, # uint8_t[:] - limit=None, + limit=..., ) -> None: ... def pad_2d_inplace( values: np.ndarray, # algos_t[:, :] mask: np.ndarray, # const uint8_t[:, :] - limit=None, + limit=..., ) -> None: ... def backfill( old: np.ndarray, # ndarray[algos_t] new: np.ndarray, # ndarray[algos_t] - limit=None, + limit=..., ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def backfill_inplace( values: np.ndarray, # algos_t[:] mask: np.ndarray, # uint8_t[:] - limit=None, + limit=..., ) -> None: ... def backfill_2d_inplace( values: np.ndarray, # algos_t[:, :] mask: np.ndarray, # const uint8_t[:, :] - limit=None, + limit=..., ) -> None: ... def is_monotonic( arr: np.ndarray, # ndarray[algos_t, ndim=1] @@ -150,18 +150,18 @@ def diff_2d( ) -> None: ... def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ... def ensure_object(arr: object) -> npt.NDArray[np.object_]: ... -def ensure_complex64(arr: object, copy=True) -> npt.NDArray[np.complex64]: ... -def ensure_complex128(arr: object, copy=True) -> npt.NDArray[np.complex128]: ... -def ensure_float64(arr: object, copy=True) -> npt.NDArray[np.float64]: ... -def ensure_float32(arr: object, copy=True) -> npt.NDArray[np.float32]: ... -def ensure_int8(arr: object, copy=True) -> npt.NDArray[np.int8]: ... -def ensure_int16(arr: object, copy=True) -> npt.NDArray[np.int16]: ... -def ensure_int32(arr: object, copy=True) -> npt.NDArray[np.int32]: ... -def ensure_int64(arr: object, copy=True) -> npt.NDArray[np.int64]: ... -def ensure_uint8(arr: object, copy=True) -> npt.NDArray[np.uint8]: ... 
-def ensure_uint16(arr: object, copy=True) -> npt.NDArray[np.uint16]: ... -def ensure_uint32(arr: object, copy=True) -> npt.NDArray[np.uint32]: ... -def ensure_uint64(arr: object, copy=True) -> npt.NDArray[np.uint64]: ... +def ensure_complex64(arr: object, copy=...) -> npt.NDArray[np.complex64]: ... +def ensure_complex128(arr: object, copy=...) -> npt.NDArray[np.complex128]: ... +def ensure_float64(arr: object, copy=...) -> npt.NDArray[np.float64]: ... +def ensure_float32(arr: object, copy=...) -> npt.NDArray[np.float32]: ... +def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ... +def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ... +def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ... +def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ... +def ensure_uint8(arr: object, copy=...) -> npt.NDArray[np.uint8]: ... +def ensure_uint16(arr: object, copy=...) -> npt.NDArray[np.uint16]: ... +def ensure_uint32(arr: object, copy=...) -> npt.NDArray[np.uint32]: ... +def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ... def take_1d_int8_int8( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... ) -> None: ... diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a18cfc41d1e2e..82f9280870d59 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -45,6 +45,11 @@ from numpy cimport ( cnp.import_array() cimport pandas._libs.util as util +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + numeric_object_t, + numeric_t, +) from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -54,10 +59,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) -from pandas._libs.util cimport ( - get_nat, - numeric, -) +from pandas._libs.util cimport get_nat import pandas._libs.missing as missing @@ -239,9 +241,9 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): return indexer.base, counts.base -cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: +cdef inline Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil: cdef: - numeric t + numeric_t t # cython doesn't allow pointer dereference so use array syntax t = a[0] @@ -250,7 +252,7 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: return 0 -cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil: +cdef inline numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil: """ See kth_smallest.__doc__. The additional parameter n specifies the maximum number of elements considered in arr, needed for compatibility with usage @@ -258,7 +260,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog """ cdef: Py_ssize_t i, j, l, m - numeric x + numeric_t x l = 0 m = n - 1 @@ -290,7 +292,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog @cython.boundscheck(False) @cython.wraparound(False) -def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: +def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t: """ Compute the kth smallest value in arr. Note that the input array will be modified. 
@@ -308,7 +310,7 @@ def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: The kth smallest value in arr """ cdef: - numeric result + numeric_t result with nogil: result = kth_smallest_c(&arr[0], k, arr.shape[0]) @@ -513,20 +515,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr # ---------------------------------------------------------------------- -ctypedef fused algos_t: - float64_t - float32_t - object - int64_t - int32_t - int16_t - int8_t - uint64_t - uint32_t - uint16_t - uint8_t - - def validate_limit(nobs: int | None, limit=None) -> int: """ Check that the `limit` argument is a positive integer. @@ -555,12 +543,16 @@ def validate_limit(nobs: int | None, limit=None) -> int: @cython.boundscheck(False) @cython.wraparound(False) -def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def pad( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, next_val + numeric_object_t cur, next_val int lim, fill_count = 0 nleft = len(old) @@ -613,10 +605,10 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N - algos_t val + numeric_object_t val uint8_t prev_mask int lim, fill_count = 0 @@ -645,10 +637,10 @@ def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): +def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K - algos_t val + numeric_object_t val int lim, fill_count = 0 K, N = (values).shape @@ -701,12 +693,16 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def backfill( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, prev + numeric_object_t cur, prev int lim, fill_count = 0 nleft = len(old) @@ -758,11 +754,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: return indexer -def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): pad_inplace(values[::-1], mask[::-1], limit=limit) -def backfill_2d_inplace(algos_t[:, :] values, +def backfill_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit) @@ -770,7 +766,7 @@ def backfill_2d_inplace(algos_t[:, :] values, @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): +def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): """ Returns ------- @@ -781,7 +777,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ cdef: Py_ssize_t i, n - algos_t prev, cur + numeric_object_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 bint is_unique = 1 @@ -801,7 +797,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): if timelike and 
arr[0] == NPY_NAT: return False, False, True - if algos_t is not object: + if numeric_object_t is not object: with nogil: prev = arr[0] for i in range(1, n): @@ -860,34 +856,30 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): # rank_1d, rank_2d # ---------------------------------------------------------------------- -ctypedef fused rank_t: - object - float64_t - uint64_t - int64_t - - -cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): +cdef iu_64_floating_obj_t get_rank_nan_fill_val( + bint rank_nans_highest, + iu_64_floating_obj_t[:] _=None +): """ Return the value we'll use to represent missing values when sorting depending on if we'd like missing values to end up at the top/bottom. (The second parameter is unused, but needed for fused type specialization) """ if rank_nans_highest: - if rank_t is object: + if iu_64_floating_obj_t is object: return Infinity() - elif rank_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return util.INT64_MAX - elif rank_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return util.UINT64_MAX else: return np.inf else: - if rank_t is object: + if iu_64_floating_obj_t is object: return NegInfinity() - elif rank_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return NPY_NAT - elif rank_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return 0 else: return -np.inf @@ -896,7 +888,7 @@ cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None): @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - ndarray[rank_t, ndim=1] values, + ndarray[iu_64_floating_obj_t, ndim=1] values, const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", @@ -909,7 +901,7 @@ def rank_1d( Parameters ---------- - values : array of rank_t values to be ranked + values : array of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. 
If not called @@ -939,11 +931,11 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray[rank_t, ndim=1] masked_vals - rank_t[:] masked_vals_memview + ndarray[iu_64_floating_obj_t, ndim=1] masked_vals + iu_64_floating_obj_t[:] masked_vals_memview uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask - rank_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -964,21 +956,22 @@ def rank_1d( check_labels = labels is not None # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - if rank_t is object and values.dtype != np.object_: + if iu_64_floating_obj_t is object and values.dtype != np.object_: masked_vals = values.astype('O') else: masked_vals = values.copy() - if rank_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj(masked_vals) - elif rank_t is int64_t and is_datetimelike: + elif iu_64_floating_obj_t is int64_t and is_datetimelike: mask = (masked_vals == NPY_NAT).astype(np.uint8) - elif rank_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) @@ -990,7 +983,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) if nans_rank_highest: order = [masked_vals, mask] else: @@ -1037,7 +1030,7 @@ cdef void rank_sorted_1d( int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) - rank_t[:] masked_vals, + iu_64_floating_obj_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1061,7 +1054,7 @@ cdef void rank_sorted_1d( if labels is None. sort_indexer : intp_t[:] Array of indices which sorts masked_vals - masked_vals : rank_t[:] + masked_vals : iu_64_floating_obj_t[:] The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] Array where entries are True if the value is missing, False otherwise. 
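The rank_1d hunks above lean on a Cython feature worth calling out: a fused-type cdef function can be specialized explicitly by indexing it with a type, as in get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest), even though no argument of that type is passed. A minimal, self-contained sketch of the pattern follows; the names number_t, nan_fill and demo are illustrative and not part of the patch.

# cython: language_level=3
from numpy cimport float64_t, int64_t

ctypedef fused number_t:
    int64_t
    float64_t

cdef number_t nan_fill(bint highest, number_t[:] _=None):
    # The unused memoryview argument exists only so Cython could infer the
    # specialization from an argument; callers below index explicitly instead.
    if number_t is int64_t:
        # compile-time branch: only this arm remains in the int64_t version
        return 1 if highest else -1
    else:
        return float("inf") if highest else float("-inf")

def demo(bint highest):
    # Indexing the fused cdef function picks the specialization directly,
    # mirroring get_rank_nan_fill_val[iu_64_floating_obj_t](...) above.
    cdef float64_t f = nan_fill[float64_t](highest)
    cdef int64_t i = nan_fill[int64_t](highest)
    return f, i

The explicit index is what lets rank_1d keep a single nan_fill_val variable of the fused type while deciding the sentinel from a plain bint flag.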
@@ -1093,7 +1086,7 @@ cdef void rank_sorted_1d( # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil - if rank_t is object: + if iu_64_floating_obj_t is object: with gil: for i in range(N): at_end = i == N - 1 @@ -1301,7 +1294,7 @@ cdef void rank_sorted_1d( def rank_2d( - ndarray[rank_t, ndim=2] in_arr, + ndarray[iu_64_floating_obj_t, ndim=2] in_arr, int axis=0, bint is_datetimelike=False, ties_method="average", @@ -1316,13 +1309,13 @@ def rank_2d( Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous int64_t[::1] grp_sizes - ndarray[rank_t, ndim=2] values - rank_t[:, :] masked_vals + ndarray[iu_64_floating_obj_t, ndim=2] values + iu_64_floating_obj_t[:, :] masked_vals intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest - rank_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -1332,24 +1325,25 @@ def rank_2d( keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() - if rank_t is object: + if iu_64_floating_obj_t is object: if values.dtype != np.object_: values = values.astype('O') nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) - if rank_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj2d(values).view(np.uint8) - elif rank_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(values).view(np.uint8) # int64 and datetimelike diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 67af9653fc75a..76e4321cc0c77 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -25,10 +25,10 @@ class NDArrayBacked: @property def nbytes(self) -> int: ... def copy(self): ... - def delete(self, loc, axis=0): ... + def delete(self, loc, axis=...): ... def swapaxes(self, axis1, axis2): ... def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... def reshape(self, *args, **kwargs): ... - def ravel(self, order="C"): ... + def ravel(self, order=...): ... @property def T(self): ... 
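The new file that follows, pandas/_libs/dtypes.pxd, is the centerpiece of this refactor: the fused types previously redefined per module (numeric, algos_t, rank_t, groupby_t, join_t, asof_t, reshape_t) are collected in one shared header, and the remaining hunks mostly swap the local names for cimports of the shared ones. A minimal sketch of the consumer pattern, assuming an illustrative module name and function that are not part of the patch:

# fused_demo.pyx -- illustrative consumer of the shared fused types
from pandas._libs.dtypes cimport numeric_t

def count_positive(numeric_t[:] values):
    # One generic kernel; Cython emits a specialization for every dtype
    # listed in numeric_t (int8..int64, uint8..uint64, float32, float64).
    cdef Py_ssize_t i
    cdef Py_ssize_t n = values.shape[0]
    cdef Py_ssize_t count = 0
    for i in range(n):
        if values[i] > 0:
            count += 1
    return count

Keeping a single definition means the algos, groupby, join and reshape kernels below all agree on exactly which dtypes they are instantiated for.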
diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd new file mode 100644 index 0000000000000..f87a1525b15fd --- /dev/null +++ b/pandas/_libs/dtypes.pxd @@ -0,0 +1,48 @@ +""" +Common location for shared fused types +""" + +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) + +# All numeric types except complex +ctypedef fused numeric_t: + int8_t + int16_t + int32_t + int64_t + + uint8_t + uint16_t + uint32_t + uint64_t + + float32_t + float64_t + +# All numeric types + object, doesn't include complex +ctypedef fused numeric_object_t: + numeric_t + object + +# i64 + u64 + all float types +ctypedef fused iu_64_floating_t: + float64_t + float32_t + int64_t + uint64_t + +# i64 + u64 + all float types + object +ctypedef fused iu_64_floating_obj_t: + iu_64_floating_t + object diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 59b952cd46c56..0450a3483d346 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -6,9 +6,9 @@ from pandas._typing import npt def group_median_float64( out: np.ndarray, # ndarray[float64_t, ndim=2] - counts: np.ndarray, # ndarray[int64_t] + counts: npt.NDArray[np.int64], values: np.ndarray, # ndarray[float64_t, ndim=2] - labels: np.ndarray, # ndarray[int64_t] + labels: npt.NDArray[np.int64], min_count: int = ..., # Py_ssize_t ) -> None: ... def group_cumprod_float64( @@ -37,7 +37,7 @@ def group_fillna_indexer( out: np.ndarray, # ndarray[intp_t] labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], - mask: np.ndarray, # ndarray[uint8_t] + mask: npt.NDArray[np.uint8], direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b8700aa473d03..1e05ef443d516 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -31,10 +31,7 @@ from numpy.math cimport NAN cnp.import_array() from pandas._libs.algos cimport kth_smallest_c -from pandas._libs.util cimport ( - get_nat, - numeric, -) +from pandas._libs.util cimport get_nat from pandas._libs.algos import ( ensure_platform_int, @@ -43,6 +40,11 @@ from pandas._libs.algos import ( take_2d_axis1_float64_float64, ) +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + iu_64_floating_t, + numeric_t, +) from pandas._libs.missing cimport checknull @@ -200,8 +202,8 @@ def group_cumprod_float64(float64_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumsum(numeric[:, ::1] out, - ndarray[numeric, ndim=2] values, +def group_cumsum(numeric_t[:, ::1] out, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, int ngroups, is_datetimelike, @@ -230,8 +232,8 @@ def group_cumsum(numeric[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val, y, t - numeric[:, ::1] accum, compensation + numeric_t val, y, t + numeric_t[:, ::1] accum, compensation intp_t lab N, K = (values).shape @@ -249,7 +251,7 @@ def group_cumsum(numeric[:, ::1] out, # For floats, use Kahan summation to reduce floating-point # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) - if numeric == float32_t or numeric == float64_t: + if numeric_t == float32_t or numeric_t == float64_t: if val == val: y = val - compensation[lab, j] t = accum[lab, j] + y @@ -480,6 +482,12 @@ ctypedef fused add_t: complex128_t object +ctypedef fused mean_t: + float64_t + float32_t + complex64_t + complex128_t + @cython.wraparound(False) @cython.boundscheck(False) @@ -669,9 +677,9 @@ 
def group_var(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_mean(floating[:, ::1] out, +def group_mean(mean_t[:, ::1] out, int64_t[::1] counts, - ndarray[floating, ndim=2] values, + ndarray[mean_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -711,8 +719,8 @@ def group_mean(floating[:, ::1] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count, y, t, nan_val - floating[:, ::1] sumx, compensation + mean_t val, count, y, t, nan_val + mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -805,7 +813,7 @@ def group_ohlc(floating[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t, ndim=2] out, - ndarray[numeric, ndim=1] values, + ndarray[numeric_t, ndim=1] values, ndarray[intp_t] labels, ndarray[uint8_t] mask, const intp_t[:] sort_indexer, @@ -921,23 +929,15 @@ def group_quantile(ndarray[float64_t, ndim=2] out, # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -ctypedef fused rank_t: - float64_t - float32_t - int64_t - uint64_t - object - - -cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: - if rank_t is object: +cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil: + if iu_64_floating_obj_t is object: # Should never be used, but we need to avoid the `val != val` below # or else cython will raise about gil acquisition. raise NotImplementedError - elif rank_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return is_datetimelike and val == NPY_NAT - elif rank_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: # There is no NA value for uint64 return False else: @@ -945,12 +945,12 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: # GH#31710 use memorviews once cython 0.30 is released so we can -# use `const rank_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_last(rank_t[:, ::1] out, +def group_last(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[rank_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1) -> None: """ @@ -958,8 +958,8 @@ def group_last(rank_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -970,14 +970,14 @@ def group_last(rank_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if rank_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1019,9 +1019,9 @@ def group_last(rank_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if rank_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif rank_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1037,12 +1037,12 @@ def group_last(rank_t[:, ::1] out, # GH#31710 use memorviews once cython 0.30 is released so we can 
-# use `const rank_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_nth(rank_t[:, ::1] out, +def group_nth(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[rank_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int64_t min_count=-1, int64_t rank=1, @@ -1052,8 +1052,8 @@ def group_nth(rank_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -1064,14 +1064,14 @@ def group_nth(rank_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if rank_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1116,9 +1116,9 @@ def group_nth(rank_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if rank_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif rank_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1135,7 +1135,7 @@ def group_nth(rank_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, ::1] out, - ndarray[rank_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, str ties_method="average", @@ -1147,7 +1147,7 @@ def group_rank(float64_t[:, ::1] out, ---------- out : np.ndarray[np.float64, ndim=2] Values to which this method will write its results. - values : np.ndarray of rank_t values to be ranked + values : np.ndarray of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values` @@ -1202,18 +1202,12 @@ def group_rank(float64_t[:, ::1] out, # ---------------------------------------------------------------------- # TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - @cython.wraparound(False) @cython.boundscheck(False) -cdef group_min_max(groupby_t[:, ::1] out, +cdef group_min_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1225,7 +1219,7 @@ cdef group_min_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store result in. 
counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm @@ -1254,8 +1248,8 @@ cdef group_min_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - groupby_t val, nan_val - ndarray[groupby_t, ndim=2] group_min_or_max + iu_64_floating_t val, nan_val + ndarray[iu_64_floating_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs bint uses_mask = mask is not None @@ -1270,10 +1264,10 @@ cdef group_min_max(groupby_t[:, ::1] out, nobs = np.zeros((out).shape, dtype=np.int64) group_min_or_max = np.empty_like(out) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: group_min_or_max[:] = -_int64_max if compute_max else _int64_max nan_val = NPY_NAT - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: # NB: We do not define nan_val because there is no such thing # for uint64_t. We carefully avoid having to reference it in this # case. @@ -1311,7 +1305,7 @@ cdef group_min_max(groupby_t[:, ::1] out, for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: - if groupby_t is uint64_t: + if iu_64_floating_t is uint64_t: runtime_error = True break else: @@ -1330,9 +1324,9 @@ cdef group_min_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_max(groupby_t[:, ::1] out, +def group_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1354,9 +1348,9 @@ def group_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(groupby_t[:, ::1] out, +def group_min(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1378,8 +1372,8 @@ def group_min(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef group_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef group_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, int ngroups, @@ -1391,9 +1385,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store cummin/max in. - values : np.ndarray[groupby_t, ndim=2] + values : np.ndarray[iu_64_floating_t, ndim=2] Values to take cummin/max of. mask : np.ndarray[bool] or None If not None, indices represent missing values, @@ -1415,12 +1409,12 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. 
""" cdef: - groupby_t[:, ::1] accum + iu_64_floating_t[:, ::1] accum accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: accum[:] = 0 if compute_max else np.iinfo(np.uint64).max else: accum[:] = -np.inf if compute_max else np.inf @@ -1433,10 +1427,10 @@ cdef group_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint is_datetimelike, bint compute_max): @@ -1446,12 +1440,12 @@ cdef cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval, na_val + iu_64_floating_t val, mval, na_val uint8_t[:, ::1] seen_na intp_t lab bint na_possible - if groupby_t is float64_t or groupby_t is float32_t: + if iu_64_floating_t is float64_t or iu_64_floating_t is float32_t: na_val = NaN na_possible = True elif is_datetimelike: @@ -1492,11 +1486,11 @@ cdef cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef masked_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef masked_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint compute_max): """ @@ -1505,7 +1499,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval + iu_64_floating_t val, mval uint8_t[:, ::1] seen_na intp_t lab @@ -1536,8 +1530,8 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummin(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, @@ -1558,8 +1552,8 @@ def group_cummin(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummax(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 951703e04d5a3..bf7df5776896b 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -1,11 +1,12 @@ from typing import ( - Any, Hashable, Literal, ) import numpy as np +from pandas._typing import npt + def unique_label_indices( labels: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... @@ -20,11 +21,11 @@ class ObjectFactorizer(Factorizer): uniques: ObjectVector def factorize( self, - values: np.ndarray, # ndarray[object] + values: npt.NDArray[np.object_], sort: bool = ..., na_sentinel=..., na_value=..., - ) -> np.ndarray: ... # np.ndarray[intp] + ) -> npt.NDArray[np.intp]: ... class Int64Factorizer(Factorizer): table: Int64HashTable @@ -35,77 +36,77 @@ class Int64Factorizer(Factorizer): sort: bool = ..., na_sentinel=..., na_value=..., - ) -> np.ndarray: ... # np.ndarray[intp] + ) -> npt.NDArray[np.intp]: ... class Int64Vector: def __init__(self): ... 
def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64] + def to_array(self) -> npt.NDArray[np.int64]: ... class Int32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32] + def to_array(self) -> npt.NDArray[np.int32]: ... class Int16Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16] + def to_array(self) -> npt.NDArray[np.int16]: ... class Int8Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8] + def to_array(self) -> npt.NDArray[np.int8]: ... class UInt64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64] + def to_array(self) -> npt.NDArray[np.uint64]: ... class UInt32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint32] + def to_array(self) -> npt.NDArray[np.uint32]: ... class UInt16Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16] + def to_array(self) -> npt.NDArray[np.uint16]: ... class UInt8Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8] + def to_array(self) -> npt.NDArray[np.uint8]: ... class Float64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64] + def to_array(self) -> npt.NDArray[np.float64]: ... class Float32Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32] + def to_array(self) -> npt.NDArray[np.float32]: ... class Complex128Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128] + def to_array(self) -> npt.NDArray[np.complex128]: ... class Complex64Vector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64] + def to_array(self) -> npt.NDArray[np.complex64]: ... class StringVector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[object] + def to_array(self) -> npt.NDArray[np.object_]: ... class ObjectVector: def __init__(self): ... def __len__(self) -> int: ... - def to_array(self) -> np.ndarray: ... # np.ndarray[object] + def to_array(self) -> npt.NDArray[np.object_]: ... class HashTable: # NB: The base HashTable class does _not_ actually have these methods; @@ -132,7 +133,7 @@ class HashTable: def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] - ) -> np.ndarray: ... # np.ndarray[np.intp] + ) -> npt.NDArray[np.intp]: ... def get_labels( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -140,14 +141,14 @@ class HashTable: count_prior: int = ..., na_sentinel: int = ..., na_value: object = ..., - ) -> np.ndarray: ... # np.ndarray[intp_t] + ) -> npt.NDArray[np.intp]: ... def unique( self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., ) -> tuple[ np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], + npt.NDArray[np.intp], ] | np.ndarray: ... 
# np.ndarray[subclass-specific] def _unique( self, @@ -160,7 +161,7 @@ class HashTable: return_inverse: bool = ..., ) -> tuple[ np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], + npt.NDArray[np.intp], ] | np.ndarray: ... # np.ndarray[subclass-specific] def factorize( self, @@ -168,10 +169,7 @@ class HashTable: na_sentinel: int = ..., na_value: object = ..., mask=..., - ) -> tuple[ - np.ndarray, # np.ndarray[subclass-specific] - np.ndarray, # np.ndarray[np.intp], - ]: ... + ) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... class Complex64HashTable(HashTable): ... @@ -183,10 +181,7 @@ class Int64HashTable(HashTable): def get_labels_groupby( self, values: np.ndarray, # const int64_t[:] - ) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.int64] - ]: ... + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ... class Int32HashTable(HashTable): ... class Int16HashTable(HashTable): ... @@ -201,32 +196,32 @@ class PyObjectHashTable(HashTable): ... def duplicated_int64( values: np.ndarray, # const int64_t[:] values keep: Literal["last", "first", False] = ..., -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... # TODO: Is it actually bool or is it uint8? def mode_int64( values: np.ndarray, # const int64_t[:] values dropna: bool, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def value_count_int64( values: np.ndarray, # const int64_t[:] dropna: bool, -) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] # np.ndarray[np.int64] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ... def value_count( values: np.ndarray, dropna: bool, -) -> tuple[np.ndarray, np.ndarray,]: ... # np.ndarray[np.int64] +) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values] # arr and values should have same dtype def ismember( arr: np.ndarray, values: np.ndarray, -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def object_hash(obj) -> int: ... def objects_are_equal(a, b) -> bool: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 132435701bddb..3eb7bcc673cd4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -98,7 +98,8 @@ cdef class ObjectFactorizer(Factorizer): -------- Factorize values with nans replaced by na_sentinel - >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) + >>> fac = ObjectFactorizer(3) + >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ cdef: @@ -142,8 +143,9 @@ cdef class Int64Factorizer(Factorizer): -------- Factorize values with nans replaced by na_sentinel - >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) - array([ 0, 1, 20]) + >>> fac = Int64Factorizer(3) + >>> fac.factorize(np.array([1,2,3]), na_sentinel=20) + array([0, 1, 2]) """ cdef: ndarray[intp_t] labels diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 9f526dd3fe653..18ee216d5c4a1 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -10,7 +10,7 @@ class IndexEngine: def __contains__(self, val: object) -> bool: ... # -> int | slice | np.ndarray[bool] def get_loc(self, val: object) -> int | slice | np.ndarray: ... 
- def sizeof(self, deep: bool = False) -> int: ... + def sizeof(self, deep: bool = ...) -> int: ... def __sizeof__(self) -> int: ... @property def is_unique(self) -> bool: ... @@ -18,12 +18,6 @@ class IndexEngine: def is_monotonic_increasing(self) -> bool: ... @property def is_monotonic_decreasing(self) -> bool: ... - def get_backfill_indexer( - self, other: np.ndarray, limit: int | None = ... - ) -> npt.NDArray[np.intp]: ... - def get_pad_indexer( - self, other: np.ndarray, limit: int | None = ... - ) -> npt.NDArray[np.intp]: ... @property def is_mapping_populated(self) -> bool: ... def clear_mapping(self): ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 31b507e9b7800..f4d59962c111e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,5 +1,3 @@ -import warnings - cimport cython import numpy as np @@ -34,7 +32,11 @@ from pandas._libs import ( algos, hashtable as _hash, ) -from pandas._libs.missing import checknull + +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) cdef inline bint is_definitely_invalid_key(object val): @@ -45,10 +47,64 @@ cdef inline bint is_definitely_invalid_key(object val): return False +cdef ndarray _get_bool_indexer(ndarray values, object val): + """ + Return a ndarray[bool] of locations where val matches self.values. + + If val is not NA, this is equivalent to `self.values == val` + """ + # Caller is responsible for ensuring _check_type has already been called + cdef: + ndarray[uint8_t, ndim=1, cast=True] indexer + Py_ssize_t i + object item + + if values.descr.type_num == cnp.NPY_OBJECT: + # i.e. values.dtype == object + if not checknull(val): + indexer = values == val + + else: + # We need to check for _matching_ NA values + indexer = np.empty(len(values), dtype=np.uint8) + + for i in range(len(values)): + item = values[i] + indexer[i] = is_matching_na(item, val) + + else: + if util.is_nan(val): + indexer = np.isnan(values) + else: + indexer = values == val + + return indexer.view(bool) + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 +cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val): + """ + Possibly unpack a boolean mask to a single indexer. + """ + # Returns ndarray[bool] or int + cdef: + ndarray[intp_t, ndim=1] found + int count + + found = np.where(indexer)[0] + count = len(found) + + if count > 1: + return indexer + if count == 1: + return int(found[0]) + + raise KeyError(val) + + @cython.freelist(32) cdef class IndexEngine: @@ -81,17 +137,14 @@ cdef class IndexEngine: if is_definitely_invalid_key(val): raise TypeError(f"'{val}' is an invalid key") + self._check_type(val) + if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(val) values = self.values - self._check_type(val) - try: - loc = _bin_search(values, val) # .searchsorted(val, side='left') - except TypeError: - # GH#35788 e.g. val=None with float64 values - raise KeyError(val) + loc = self._searchsorted_left(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: @@ -102,13 +155,21 @@ cdef class IndexEngine: if not self.unique: return self._get_loc_duplicates(val) - self._check_type(val) - try: return self.mapping.get_item(val) - except (TypeError, ValueError, OverflowError): + except OverflowError as err: # GH#41775 OverflowError e.g. 
if we are uint64 and val is -1 - raise KeyError(val) + # or if we are int64 and value is np.iinfo(np.int64).max+1 + # (the uint64 with -1 case should actually be excluded by _check_type) + raise KeyError(val) from err + + cdef Py_ssize_t _searchsorted_left(self, val) except? -1: + """ + See ObjectEngine._searchsorted_left.__doc__. + """ + # Caller is responsible for ensuring _check_type has already been called + loc = self.values.searchsorted(val, side="left") + return loc cdef inline _get_loc_duplicates(self, object val): # -> Py_ssize_t | slice | ndarray[bool] @@ -122,6 +183,7 @@ cdef class IndexEngine: right = values.searchsorted(val, side='right') except TypeError: # e.g. GH#29189 get_loc(None) with a Float64Index + # 2021-09-29 Now only reached for object-dtype raise KeyError(val) diff = right - left @@ -139,26 +201,8 @@ cdef class IndexEngine: cdef: ndarray[uint8_t, ndim=1, cast=True] indexer - indexer = self.values == val - return self._unpack_bool_indexer(indexer, val) - - cdef _unpack_bool_indexer(self, - ndarray[uint8_t, ndim=1, cast=True] indexer, - object val): - # Returns ndarray[bool] or int - cdef: - ndarray[intp_t, ndim=1] found - int count - - found = np.where(indexer)[0] - count = len(found) - - if count > 1: - return indexer - if count == 1: - return int(found[0]) - - raise KeyError(val) + indexer = _get_bool_indexer(self.values, val) + return _unpack_bool_indexer(indexer, val) def sizeof(self, deep: bool = False) -> int: """ return the sizeof our mapping """ @@ -217,12 +261,6 @@ cdef class IndexEngine: cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) - def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: - return algos.backfill(self.values, other, limit=limit) - - def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: - return algos.pad(self.values, other, limit=limit) - cdef _make_hash_table(self, Py_ssize_t n): raise NotImplementedError @@ -242,16 +280,13 @@ cdef class IndexEngine: values = self.values self.mapping = self._make_hash_table(len(values)) - self._call_map_locations(values) + self.mapping.map_locations(values) if len(self.mapping) == len(values): self.unique = 1 self.need_unique_check = 0 - cdef void _call_map_locations(self, ndarray values): - self.mapping.map_locations(values) - def clear_mapping(self): self.mapping = None self.need_monotonic_check = 1 @@ -287,7 +322,6 @@ cdef class IndexEngine: Py_ssize_t i, j, n, n_t, n_alloc bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True - self._ensure_mapping_populated() values = self.values stargets = set(targets) @@ -373,6 +407,11 @@ cdef class IndexEngine: cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: + # GH#1757 ndarray.searchsorted is not safe to use with array of tuples + # (treats a tuple `val` as a sequence of keys instead of a single key), + # so we implement something similar. + # This is equivalent to the stdlib's bisect.bisect_left + cdef: Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval @@ -405,6 +444,15 @@ cdef class ObjectEngine(IndexEngine): cdef _make_hash_table(self, Py_ssize_t n): return _hash.PyObjectHashTable(n) + cdef Py_ssize_t _searchsorted_left(self, val) except? 
-1: + # using values.searchsorted here would treat a tuple `val` as a sequence + # instead of a single key, so we use a different implementation + try: + loc = _bin_search(self.values, val) + except TypeError as err: + raise KeyError(val) from err + return loc + cdef class DatetimeEngine(Int64Engine): @@ -418,19 +466,12 @@ cdef class DatetimeEngine(Int64Engine): def __contains__(self, val: object) -> bool: # We assume before we get here: # - val is hashable - cdef: - int64_t loc, conv - - conv = self._unbox_scalar(val) - if self.over_size_threshold and self.is_monotonic_increasing: - if not self.is_unique: - return self._get_loc_duplicates(conv) - values = self.values - loc = values.searchsorted(conv, side='left') - return values[loc] == conv - - self._ensure_mapping_populated() - return conv in self.mapping + self._unbox_scalar(val) + try: + self.get_loc(val) + return True + except KeyError: + return False cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=True) @@ -703,7 +744,6 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray: - # target: MultiIndex indexer = self._base.get_indexer_non_unique(self, target) return indexer diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index dfce411c09e66..7a2bbec96e413 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -36,41 +36,16 @@ cdef class {{name}}Engine(IndexEngine): {{if name not in {'Float64', 'Float32'} }} if not util.is_integer_object(val): raise KeyError(val) + {{if name.startswith("U")}} + if val < 0: + # cannot have negative values with unsigned int dtype + raise KeyError(val) + {{endif}} {{else}} - if util.is_bool_object(val): - # avoid casting to True -> 1.0 + if not util.is_integer_object(val) and not util.is_float_object(val): + # in particular catch bool and avoid casting True -> 1.0 raise KeyError(val) {{endif}} - cdef void _call_map_locations(self, ndarray[{{dtype}}_t] values): - self.mapping.map_locations(values) - - cdef _maybe_get_bool_indexer(self, object val): - # Returns ndarray[bool] or int - cdef: - ndarray[uint8_t, ndim=1, cast=True] indexer - ndarray[{{dtype}}_t, ndim=1] values - - self._check_type(val) - - values = self.values - try: - with warnings.catch_warnings(): - # e.g. if values is float64 and `val` is a str, suppress warning - warnings.filterwarnings("ignore", category=FutureWarning) - {{if name in {'Float64', 'Float32'} }} - if util.is_nan(val): - indexer = np.isnan(values) - else: - indexer = values == val - {{else}} - indexer = values == val - {{endif}} - except TypeError: - # if the equality above returns a bool, cython will raise TypeError - # when trying to cast it to ndarray - raise KeyError(val) - - return self._unpack_bool_indexer(indexer, val) {{endfor}} diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index da18084da92f9..6a90fbc729580 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -79,7 +79,7 @@ class BlockManager: _blknos: np.ndarray _blklocs: np.ndarray def __init__( - self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=True + self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=... ): ... def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... def _rebuild_blknos_and_blklocs(self) -> None: ... 
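One behavioural simplification in the index.pyx hunks above is that DatetimeEngine.__contains__ no longer re-implements the lookup; it simply asks whether get_loc succeeds. A standalone sketch of that pattern, with an Engine class that is purely illustrative and far simpler than the real engines:

cdef class Engine:
    cdef list values

    def __init__(self, values):
        self.values = list(values)

    def get_loc(self, val):
        # a linear scan stands in for the hash-table / searchsorted lookups
        cdef Py_ssize_t i
        for i in range(len(self.values)):
            if self.values[i] == val:
                return i
        raise KeyError(val)

    def __contains__(self, val):
        # membership is defined as "get_loc succeeds", so __contains__ and
        # get_loc can never disagree about which keys are present
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

The cost is one extra try/except on the hit path, but it removes a second copy of the monotonic/unique branching that previously had to stay in sync with get_loc.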
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 1a0697f097494..87709ac6c33bf 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -24,7 +24,10 @@ cnp.import_array() from pandas._libs.algos import ensure_int64 from pandas._libs.arrays cimport NDArrayBacked -from pandas._libs.util cimport is_integer_object +from pandas._libs.util cimport ( + is_array, + is_integer_object, +) @cython.final @@ -61,8 +64,15 @@ cdef class BlockPlacement: self._has_array = True else: # Cython memoryview interface requires ndarray to be writeable. - arr = np.require(val, dtype=np.intp, requirements='W') - assert arr.ndim == 1, arr.shape + if ( + not is_array(val) + or not cnp.PyArray_ISWRITEABLE(val) + or (val).descr.type_num != cnp.NPY_INTP + ): + arr = np.require(val, dtype=np.intp, requirements='W') + else: + arr = val + # Caller is responsible for ensuring arr.ndim == 1 self._as_array = arr self._has_array = True @@ -254,11 +264,13 @@ cdef class BlockPlacement: if slc is not None and slc.step == 1: new_slc = slice(slc.start * factor, slc.stop * factor, 1) - new_placement = np.arange(new_slc.start, new_slc.stop, dtype=np.intp) + # equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp) + new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP) else: # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None` mapped = [ - np.arange(x * factor, (x + 1) * factor, dtype=np.intp) + # equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp) + cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP) for x in self ] new_placement = np.concatenate(mapped) @@ -681,15 +693,17 @@ cdef class BlockManager: cnp.npy_intp length = self.shape[0] SharedBlock blk BlockPlacement bp + ndarray[intp_t] new_blknos, new_blklocs # equiv: np.empty(length, dtype=np.intp) new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) - new_blknos.fill(-1) - new_blklocs.fill(-1) + # equiv: new_blknos.fill(-1) + cnp.PyArray_FILLWBYTE(new_blknos, -1) + cnp.PyArray_FILLWBYTE(new_blklocs, -1) for blkno, blk in enumerate(self.blocks): - bp = blk.mgr_locs + bp = blk._mgr_locs # Iterating over `bp` is a faster equivalent to # new_blknos[bp.indexer] = blkno # new_blklocs[bp.indexer] = np.arange(len(bp)) diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 5b0e50ca76840..3a22aa439b7be 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -11,7 +11,7 @@ def left_outer_join( left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, - sort: bool = True, + sort: bool = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def full_outer_join( left: np.ndarray, # const intp_t[:] @@ -54,40 +54,40 @@ def asof_join_backward_on_X_by_Y( right_values: np.ndarray, # asof_t[:] left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_forward_on_X_by_Y( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... 
def asof_join_nearest_on_X_by_Y( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] left_by_values: np.ndarray, # by_t[:] right_by_values: np.ndarray, # by_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_backward( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_forward( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def asof_join_nearest( left_values: np.ndarray, # asof_t[:] right_values: np.ndarray, # asof_t[:] - allow_exact_matches: bool = True, - tolerance=None, + allow_exact_matches: bool = ..., + tolerance=..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b6acf8914c0a6..c9a4b49f90037 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -4,17 +4,9 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, intp_t, ndarray, - uint8_t, - uint16_t, - uint32_t, uint64_t, ) @@ -22,6 +14,11 @@ cnp.import_array() from pandas._libs.algos import groupsort_indexer +from pandas._libs.dtypes cimport ( + numeric_object_t, + numeric_t, +) + @cython.wraparound(False) @cython.boundscheck(False) @@ -257,31 +254,20 @@ def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: # left_join_indexer, inner_join_indexer, outer_join_indexer # ---------------------------------------------------------------------- -ctypedef fused join_t: - float64_t - float32_t - object - int8_t - int16_t - int32_t - int64_t - uint8_t - uint16_t - uint32_t - uint64_t - - # Joins on ordered, unique indices # right might contain non-unique values @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer_unique( + ndarray[numeric_object_t] left, + ndarray[numeric_object_t] right +): cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t] indexer - join_t lval, rval + numeric_object_t lval, rval i = 0 j = 0 @@ -322,15 +308,15 @@ def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -425,15 +411,15 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. 
""" cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -518,12 +504,12 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -656,26 +642,14 @@ from pandas._libs.hashtable cimport ( UInt64HashTable, ) -ctypedef fused asof_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float - float64_t - ctypedef fused by_t: object int64_t uint64_t -def asof_join_backward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -685,8 +659,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -743,8 +717,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=1, @@ -754,8 +728,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -812,8 +786,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -822,7 +796,7 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) @@ -865,8 +839,8 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, # asof_join # ---------------------------------------------------------------------- -def asof_join_backward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -874,8 +848,8 @@ def asof_join_backward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + 
numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -918,8 +892,8 @@ def asof_join_backward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -927,8 +901,8 @@ def asof_join_forward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -972,15 +946,15 @@ def asof_join_forward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 25fdec1bfca63..b88a2e4c28cfb 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -5,6 +5,7 @@ from typing import ( Any, Callable, Generator, + Hashable, Literal, overload, ) @@ -30,10 +31,10 @@ i8max: int u8max: int def item_from_zerodim(val: object) -> object: ... -def infer_dtype(value: object, skipna: bool = True) -> str: ... +def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... def is_scalar(val: object) -> bool: ... -def is_list_like(obj: object, allow_sets: bool = True) -> bool: ... +def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_period(val: object) -> bool: ... def is_interval(val: object) -> bool: ... def is_decimal(val: object) -> bool: ... @@ -45,22 +46,22 @@ def is_interval_array(values: np.ndarray) -> bool: ... def is_datetime64_array(values: np.ndarray) -> bool: ... def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... -def is_time_array(values: np.ndarray, skipna: bool = False): ... -def is_date_array(values: np.ndarray, skipna: bool = False): ... -def is_datetime_array(values: np.ndarray, skipna: bool = False): ... -def is_string_array(values: np.ndarray, skipna: bool = False): ... -def is_float_array(values: np.ndarray, skipna: bool = False): ... -def is_integer_array(values: np.ndarray, skipna: bool = False): ... -def is_bool_array(values: np.ndarray, skipna: bool = False): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... -def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... -def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... -def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... +def is_time_array(values: np.ndarray, skipna: bool = ...): ... +def is_date_array(values: np.ndarray, skipna: bool = ...): ... +def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... +def is_string_array(values: np.ndarray, skipna: bool = ...): ... +def is_float_array(values: np.ndarray, skipna: bool = ...): ... +def is_integer_array(values: np.ndarray, skipna: bool = ...): ... 
+def is_bool_array(values: np.ndarray, skipna: bool = ...): ... +def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... +def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... +def fast_unique_multiple_list(lists: list, sort: bool = ...) -> list: ... +def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ... def map_infer( arr: np.ndarray, f: Callable[[Any], Any], - convert: bool = True, - ignore_na: bool = False, + convert: bool = ..., + ignore_na: bool = ..., ) -> np.ndarray: ... @overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray def maybe_convert_objects( @@ -131,16 +132,16 @@ def maybe_convert_objects( def maybe_convert_numeric( values: npt.NDArray[np.object_], na_values: set, - convert_empty: bool = True, - coerce_numeric: bool = False, + convert_empty: bool = ..., + coerce_numeric: bool = ..., convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... @overload def maybe_convert_numeric( values: npt.NDArray[np.object_], na_values: set, - convert_empty: bool = True, - coerce_numeric: bool = False, + convert_empty: bool = ..., + coerce_numeric: bool = ..., *, convert_to_masked_nullable: Literal[True], ) -> tuple[np.ndarray, np.ndarray]: ... @@ -148,10 +149,10 @@ def maybe_convert_numeric( # TODO: restrict `arr`? def ensure_string_array( arr, - na_value: object = np.nan, - convert_na_value: bool = True, - copy: bool = True, - skipna: bool = True, + na_value: object = ..., + convert_na_value: bool = ..., + copy: bool = ..., + skipna: bool = ..., ) -> npt.NDArray[np.object_]: ... def infer_datetimelike_array( arr: npt.NDArray[np.object_], @@ -169,7 +170,7 @@ def tuples_to_object_array( ) -> ndarray_obj_2d: ... # TODO: can we be more specific about rows? -def to_object_array(rows: object, min_width: int = 0) -> ndarray_obj_2d: ... +def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ... def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ... def maybe_booleans_to_slice( mask: npt.NDArray[np.uint8], @@ -197,7 +198,7 @@ def indices_fast( labels: np.ndarray, # const int64_t[:] keys: list, sorted_labels: list[npt.NDArray[np.int64]], -) -> dict: ... +) -> dict[Hashable, npt.NDArray[np.intp]]: ... def generate_slices( labels: np.ndarray, ngroups: int # const intp_t[:] ) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... @@ -214,8 +215,8 @@ def get_level_sorter( def generate_bins_dt64( values: npt.NDArray[np.int64], binner: np.ndarray, # const int64_t[:] - closed: object = "left", - hasnans: bool = False, + closed: object = ..., + hasnans: bool = ..., ) -> np.ndarray: ... 
# np.ndarray[np.int64, ndim=1] def array_equivalent_object( left: np.ndarray, # object[:] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c9548a7e05fc5..4cc8d1ac2f60e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -175,6 +175,7 @@ def is_scalar(val: object) -> bool: Examples -------- + >>> import datetime >>> dt = datetime.datetime(2018, 10, 3) >>> pd.api.types.is_scalar(dt) True @@ -256,11 +257,12 @@ def is_iterator(obj: object) -> bool: Examples -------- + >>> import datetime >>> is_iterator((x for x in [])) True >>> is_iterator([1, 2, 3]) False - >>> is_iterator(datetime(2017, 1, 1)) + >>> is_iterator(datetime.datetime(2017, 1, 1)) False >>> is_iterator("foo") False @@ -727,14 +729,19 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - result[i] = str(val) + if not isinstance(val, np.floating): + # f"{val}" is faster than str(val) + result[i] = f"{val}" + else: + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) else: if convert_na_value: val = na_value if skipna: result[i] = val else: - result[i] = str(val) + result[i] = f"{val}" return result @@ -1071,11 +1078,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: Examples -------- + >>> import datetime >>> is_list_like([1, 2, 3]) True >>> is_list_like({1, 2, 3}) True - >>> is_list_like(datetime(2017, 1, 1)) + >>> is_list_like(datetime.datetime(2017, 1, 1)) False >>> is_list_like("foo") False @@ -1350,6 +1358,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Examples -------- + >>> import datetime >>> infer_dtype(['foo', 'bar']) 'string' diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 959d83a55d4f3..9d3b80b321537 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -2,17 +2,9 @@ import cython from cython import Py_ssize_t from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, ndarray, uint8_t, - uint16_t, - uint32_t, - uint64_t, ) import numpy as np @@ -21,27 +13,15 @@ cimport numpy as cnp cnp.import_array() +from pandas._libs.dtypes cimport numeric_object_t from pandas._libs.lib cimport c_is_list_like -ctypedef fused reshape_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float32_t - float64_t - object - @cython.wraparound(False) @cython.boundscheck(False) -def unstack(reshape_t[:, :] values, const uint8_t[:] mask, +def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, - reshape_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: + numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ Transform long values to wide new_values. @@ -60,7 +40,7 @@ def unstack(reshape_t[:, :] values, const uint8_t[:] mask, cdef: Py_ssize_t i, j, w, nulls, s, offset - if reshape_t is not object: + if numeric_object_t is not object: # evaluated at compile-time with nogil: for i in range(stride): diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index aff1ed6cef74c..c75c36bd61fcd 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -28,7 +28,7 @@ class SparseIndex: class IntIndex(SparseIndex): indices: npt.NDArray[np.int32] def __init__( - self, length: int, indices: Sequence[int], check_integrity: bool = True + self, length: int, indices: Sequence[int], check_integrity: bool = ... ): ... 
class BlockIndex(SparseIndex): diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49797eea59ddc..6785bf628919a 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1784,6 +1784,8 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; + // Skip leading whitespace. + while (isspace_ascii(*p)) p++; // Copy Leading sign if (*p == '+' || *p == '-') { *dst++ = *p++; @@ -1798,10 +1800,25 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, *dst++ = '.'; p++; } - // Copy the remainder of the string as is. - strncpy(dst, p, length + 1 - (p - s)); + // Copy fractional part after decimal (if any) + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + // Copy exponent if any + if (toupper_ascii(*p) == toupper_ascii('E')) { + *dst++ = *p++; + // Copy leading exponent sign (if any) + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy exponent digits + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + } + *dst++ = '\0'; // terminate if (endpos != NULL) - *endpos = (char *)(s + length); + *endpos = (char *)p; return s_copy; } @@ -1839,6 +1856,11 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, PyGILState_Release(gstate); free(pc); + if (skip_trailing && q != NULL && *q != p) { + while (isspace_ascii(**q)) { + (*q)++; + } + } return r; } diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index f6a8d7887ced1..9dbf9d082d8cc 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -9,7 +9,7 @@ class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode # actually __cinit__ - def __new__(self, code: int): ... + def __new__(cls, code: int): ... def freq_group_code(self) -> int: ... def date_offset(self) -> BaseOffset: ... @classmethod diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index f79ffd2d425c4..56be8bbfdcad2 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -227,7 +227,7 @@ class Resolution(Enum): Examples -------- >>> Resolution.from_attrname('second') - 2 + >>> Resolution.from_attrname('second') == Resolution.RESO_SEC True @@ -244,7 +244,7 @@ class Resolution(Enum): Examples -------- >>> Resolution.get_reso_from_freq('H') - 4 + >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR True diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index c6ebb1618f1f2..cbf91f2bcaf76 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -31,7 +31,7 @@ def isleapyear_arr( def build_isocalendar_sarray( dtindex: npt.NDArray[np.int64], # const int64_t[:] ) -> np.ndarray: ... -def get_locale_names(name_type: str, locale: object = None): ... +def get_locale_names(name_type: str, locale: object = ...): ... 
class RoundTo: @property diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 4ff6be25127c8..521927cd910ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -581,7 +581,7 @@ class NaTType(_NaT): Examples -------- - >>> pd.Timestamp.utcnow() + >>> pd.Timestamp.utcnow() # doctest: +SKIP Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC') """, ) @@ -705,7 +705,7 @@ class NaTType(_NaT): Examples -------- - >>> pd.Timestamp.now() + >>> pd.Timestamp.now() # doctest: +SKIP Timestamp('2020-11-16 22:06:16.378782') Analogous for ``pd.NaT``: @@ -730,7 +730,7 @@ class NaTType(_NaT): Examples -------- - >>> pd.Timestamp.today() + >>> pd.Timestamp.today() # doctest: +SKIP Timestamp('2020-11-16 22:37:39.969883') Analogous for ``pd.NaT``: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4c9b681452c0a..ea714ce0162bc 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1929,7 +1929,7 @@ cdef class BYearEnd(YearOffset): Examples -------- - >>> from pandas.tseries.offset import BYearEnd + >>> from pandas.tseries.offsets import BYearEnd >>> ts = pd.Timestamp('2020-05-24 05:01:15') >>> ts - BYearEnd() Timestamp('2019-12-31 05:01:15') @@ -1955,7 +1955,7 @@ cdef class BYearBegin(YearOffset): Examples -------- - >>> from pandas.tseries.offset import BYearBegin + >>> from pandas.tseries.offsets import BYearBegin >>> ts = pd.Timestamp('2020-05-24 05:01:15') >>> ts + BYearBegin() Timestamp('2021-01-01 05:01:15') @@ -2090,7 +2090,7 @@ cdef class BQuarterEnd(QuarterOffset): Examples -------- - >>> from pandas.tseries.offset import BQuarterEnd + >>> from pandas.tseries.offsets import BQuarterEnd >>> ts = pd.Timestamp('2020-05-24 05:01:15') >>> ts + BQuarterEnd() Timestamp('2020-06-30 05:01:15') @@ -2118,7 +2118,7 @@ cdef class BQuarterBegin(QuarterOffset): Examples -------- - >>> from pandas.tseries.offset import BQuarterBegin + >>> from pandas.tseries.offsets import BQuarterBegin >>> ts = pd.Timestamp('2020-05-24 05:01:15') >>> ts + BQuarterBegin() Timestamp('2020-06-01 05:01:15') @@ -2228,7 +2228,7 @@ cdef class BusinessMonthEnd(MonthOffset): Examples -------- - >>> from pandas.tseries.offset import BMonthEnd + >>> from pandas.tseries.offsets import BMonthEnd >>> ts = pd.Timestamp('2020-05-24 05:01:15') >>> ts + BMonthEnd() Timestamp('2020-05-29 05:01:15') @@ -2247,7 +2247,7 @@ cdef class BusinessMonthBegin(MonthOffset): Examples -------- - >>> from pandas.tseries.offset import BMonthBegin + >>> from pandas.tseries.offsets import BMonthBegin >>> ts=pd.Timestamp('2020-05-24 05:01:15') >>> ts + BMonthBegin() Timestamp('2020-06-01 05:01:15') diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index e5455fa55c5ef..4f7505fd7e792 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -58,16 +58,16 @@ class Period: # error: "__new__" must return a class instance (got "Union[Period, NaTType]") def __new__( # type: ignore[misc] cls, - value=None, - freq=None, - ordinal=None, - year=None, - month=None, - quarter=None, - day=None, - hour=None, - minute=None, - second=None, + value=..., + freq=..., + ordinal=..., + year=..., + month=..., + quarter=..., + day=..., + hour=..., + minute=..., + second=..., ) -> Period | NaTType: ... @classmethod def _maybe_convert_freq(cls, freq) -> BaseOffset: ... 
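Note on the offsets.pyx docstring fixes above: `pandas.tseries.offset` is not an importable module, so the examples need the plural `pandas.tseries.offsets`; likewise the `# doctest: +SKIP` markers in nattype.pyx are there because `Timestamp.now()`, `today()` and `utcnow()` return wall-clock-dependent values a doctest cannot reproduce. A minimal doctest-style check of the corrected import path, illustrative only and not part of the patch (the expected output is copied from the BYearEnd example above):

    >>> import pandas as pd
    >>> from pandas.tseries.offsets import BYearEnd  # plural module name imports cleanly
    >>> ts = pd.Timestamp('2020-05-24 05:01:15')
    >>> ts - BYearEnd()
    Timestamp('2019-12-31 05:01:15')
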
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c53c8635c10e9..0998cb7b0c21e 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2375,7 +2375,7 @@ cdef class _Period(PeriodMixin): >>> >>> a = Period(freq='D', year=2001, month=1, day=1) >>> a.strftime('%d-%b-%Y') - '01-Jan-2006' + '01-Jan-2001' >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 01, 2001 was a Monday' """ diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index fd88bc6938294..8e1acb2ff0d38 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -5,8 +5,8 @@ from pandas._typing import npt def array_strptime( values: npt.NDArray[np.object_], fmt: str | None, - exact: bool = True, - errors: str = "raise", + exact: bool = ..., + errors: str = ..., ) -> tuple[np.ndarray, np.ndarray]: ... # first ndarray is M8[ns], second is object ndarray of tzinfo | None diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index e7fb38db2aa17..d214694fb659d 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -20,10 +20,10 @@ from numpy cimport ( ndarray, ) +from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_nat_strings as nat_strings, - checknull_with_nat, ) from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, @@ -134,7 +134,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' iresult[i] = NPY_NAT continue else: - if checknull_with_nat(val): + if checknull_with_nat_and_na(val): iresult[i] = NPY_NAT continue else: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1094fac6d8504..613da5a691736 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -909,16 +909,20 @@ cdef class _Timestamp(ABCTimestamp): Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') - >>> ts.to_period(freq='Y) # Year end frequency - numpy.datetime64('2020-03-14T15:32:52.192548651') + >>> # Year end frequency + >>> ts.to_period(freq='Y') + Period('2020', 'A-DEC') - >>> ts.to_period(freq='M') # Month end frequency + >>> # Month end frequency + >>> ts.to_period(freq='M') Period('2020-03', 'M') - >>> ts.to_period(freq='W') # Weekly frequency + >>> # Weekly frequency + >>> ts.to_period(freq='W') Period('2020-03-09/2020-03-15', 'W-SUN') - >>> ts.to_period(freq='Q') # Quarter end frequency + >>> # Quarter end frequency + >>> ts.to_period(freq='Q') Period('2020Q1', 'Q-DEC') """ from pandas import Period @@ -1059,7 +1063,7 @@ class Timestamp(_Timestamp): Examples -------- - >>> pd.Timestamp.now() + >>> pd.Timestamp.now() # doctest: +SKIP Timestamp('2020-11-16 22:06:16.378782') Analogous for ``pd.NaT``: @@ -1087,7 +1091,7 @@ class Timestamp(_Timestamp): Examples -------- - >>> pd.Timestamp.today() + >>> pd.Timestamp.today() # doctest: +SKIP Timestamp('2020-11-16 22:37:39.969883') Analogous for ``pd.NaT``: @@ -1106,7 +1110,7 @@ class Timestamp(_Timestamp): Examples -------- - >>> pd.Timestamp.utcnow() + >>> pd.Timestamp.utcnow() # doctest: +SKIP Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC') """ return cls.now(UTC) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 0809033b02934..60f90cc17ae34 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -385,25 +385,23 @@ def tz_standardize(tz: tzinfo) -> tzinfo: ------- tzinfo 
- Examples: + Examples -------- + >>> from datetime import datetime + >>> from pytz import timezone + >>> tz = timezone('US/Pacific').normalize( + ... datetime(2014, 1, 1, tzinfo=pytz.utc) + ... ).tzinfo >>> tz - >>> tz_standardize(tz) + >>> tz = timezone('US/Pacific') >>> tz - >>> tz_standardize(tz) - - >>> tz - dateutil.tz.tz.tzutc - - >>> tz_standardize(tz) - dateutil.tz.tz.tzutc """ if treat_tz_as_pytz(tz): return pytz.timezone(str(tz)) diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 614c97a1ae0cc..e1a0263cf59ef 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -16,6 +16,6 @@ def tz_convert_from_utc_single(val: np.int64, tz: tzinfo) -> np.int64: ... def tz_localize_to_utc( vals: npt.NDArray[np.int64], tz: tzinfo | None, - ambiguous: str | bool | Iterable[bool] | None = None, - nonexistent: str | timedelta | np.timedelta64 | None = None, + ambiguous: str | bool | Iterable[bool] | None = ..., + nonexistent: str | timedelta | np.timedelta64 | None = ..., ) -> npt.NDArray[np.int64]: ... diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index a53bab26ff42b..e9a39a6a75a39 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -17,7 +17,7 @@ def dt64arr_to_periodarr( ) -> npt.NDArray[np.int64]: ... # np.ndarray[np.int64, ndim=1] def is_date_array_normalized( stamps: npt.NDArray[np.int64], # const int64_t[:] - tz: tzinfo | None = None, + tz: tzinfo | None = ..., ) -> bool: ... def normalize_i8_timestamps( stamps: npt.NDArray[np.int64], # const int64_t[:] @@ -25,12 +25,12 @@ def normalize_i8_timestamps( ) -> npt.NDArray[np.int64]: ... def get_resolution( stamps: npt.NDArray[np.int64], # const int64_t[:] - tz: tzinfo | None = None, + tz: tzinfo | None = ..., ) -> Resolution: ... def ints_to_pydatetime( arr: npt.NDArray[np.int64], # const int64_t[:}] - tz: tzinfo | None = None, - freq: str | BaseOffset | None = None, - fold: bool = False, - box: str = "datetime", + tz: tzinfo | None = ..., + freq: str | BaseOffset | None = ..., + fold: bool = ..., + box: str = ..., ) -> npt.NDArray[np.object_]: ... 
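Note on the recurring stub-file edits in this patch (lib.pyi, strptime.pyi, tzconversion.pyi, vectorized.pyi and the other .pyi hunks above): in type stubs, default values are conventionally written as `...` rather than repeating the runtime value, so the stub stays purely declarative and cannot drift out of sync with the implementation. A small sketch of the convention using a hypothetical function (names are illustrative, not pandas APIs):

    # implementation module, e.g. example.py -- hypothetical
    from __future__ import annotations
    from datetime import tzinfo
    import numpy as np

    def normalize_stamps(
        stamps: np.ndarray, tz: tzinfo | None = None, box: str = "datetime"
    ) -> np.ndarray:
        # the runtime code carries the real default values
        return stamps

    # matching stub, e.g. example.pyi -- hypothetical
    from datetime import tzinfo
    import numpy as np

    def normalize_stamps(
        stamps: np.ndarray,
        tz: tzinfo | None = ...,   # ellipsis means "has a default"; value not restated
        box: str = ...,
    ) -> np.ndarray: ...
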
diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index be22fc368c28f..df88c896ac593 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -16,18 +16,3 @@ cdef extern from "src/headers/stdint.h": enum: INT32_MIN enum: INT64_MAX enum: INT64_MIN - - -ctypedef fused numeric: - cnp.int8_t - cnp.int16_t - cnp.int32_t - cnp.int64_t - - cnp.uint8_t - cnp.uint16_t - cnp.uint32_t - cnp.uint64_t - - cnp.float32_t - cnp.float64_t diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ea52bd24a3689..29fe20090875b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -20,15 +20,14 @@ from numpy cimport ( cnp.import_array() -cdef extern from "src/headers/cmath" namespace "std": +cdef extern from "../src/headers/cmath" namespace "std": bint isnan(float64_t) nogil bint notnan(float64_t) nogil int signbit(float64_t) nogil float64_t sqrt(float64_t x) nogil from pandas._libs.algos import is_monotonic - -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t cdef extern from "../src/skiplist.h": @@ -851,18 +850,18 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # https://github.com/pydata/bottleneck -cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: +cdef inline numeric_t init_mm(numeric_t ai, Py_ssize_t *nobs, bint is_max) nogil: - if numeric in cython.floating: + if numeric_t in cython.floating: if ai == ai: nobs[0] = nobs[0] + 1 elif is_max: - if numeric == cython.float: + if numeric_t == cython.float: ai = MINfloat32 else: ai = MINfloat64 else: - if numeric == cython.float: + if numeric_t == cython.float: ai = MAXfloat32 else: ai = MAXfloat64 @@ -873,18 +872,18 @@ cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: return ai -cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: +cdef inline void remove_mm(numeric_t aold, Py_ssize_t *nobs) nogil: """ remove a value from the mm calc """ - if numeric in cython.floating and aold == aold: + if numeric_t in cython.floating and aold == aold: nobs[0] = nobs[0] - 1 -cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, - numeric value) nogil: +cdef inline numeric_t calc_mm(int64_t minp, Py_ssize_t nobs, + numeric_t value) nogil: cdef: - numeric result + numeric_t result - if numeric in cython.floating: + if numeric_t in cython.floating: if nobs >= minp: result = value else: @@ -940,13 +939,13 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[numeric] values, +cdef _roll_min_max(ndarray[numeric_t] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t minp, bint is_max): cdef: - numeric ai + numeric_t ai int64_t curr_win_size, start Py_ssize_t i, k, nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 32224cdd4d04e..c7113e663789b 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -4,15 +4,13 @@ from functools import wraps import gzip from typing import ( + TYPE_CHECKING, Any, Callable, ) import zipfile -from pandas._typing import ( - FilePathOrBuffer, - FrameOrSeries, -) +from pandas._typing import FilePathOrBuffer from pandas.compat import ( get_lzma_file, import_lzma, @@ -24,6 +22,12 @@ from pandas.io.common import urlopen +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + _RAISE_NETWORK_ERROR_DEFAULT = 
False lzma = import_lzma() @@ -272,7 +276,9 @@ def can_connect(url, error_classes=None): # File-IO -def round_trip_pickle(obj: Any, path: FilePathOrBuffer | None = None) -> FrameOrSeries: +def round_trip_pickle( + obj: Any, path: FilePathOrBuffer | None = None +) -> DataFrame | Series: """ Pickle an object and then read it again. diff --git a/pandas/_typing.py b/pandas/_typing.py index 9ed31dc3738f3..9c20eb12dc7fc 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -101,11 +101,11 @@ ] Timezone = Union[str, tzinfo] -# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is -# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a +# NDFrameT is stricter and ensures that the same subclass of NDFrame always is +# used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a # Series is passed into a function, a Series is always returned and if a DataFrame is # passed in, a DataFrame is always returned. -FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") +NDFrameT = TypeVar("NDFrameT", bound="NDFrame") Axis = Union[str, int] IndexLabel = Union[Hashable, Sequence[Hashable]] @@ -219,6 +219,10 @@ PositionalIndexer = Union[ScalarIndexer, SequenceIndexer] PositionalIndexerTuple = Tuple[PositionalIndexer, PositionalIndexer] PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple] +if TYPE_CHECKING: + TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]] +else: + TakeIndexer = Any # Windowing rank methods WindowingRankType = Literal["average", "min", "max"] diff --git a/pandas/core/_numba/__init__.py b/pandas/core/_numba/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py new file mode 100644 index 0000000000000..c666bb1a0ad4b --- /dev/null +++ b/pandas/core/_numba/executor.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import Callable + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + get_jit_arguments, +) + + +def generate_shared_aggregator( + func: Callable[..., Scalar], + engine_kwargs: dict[str, bool] | None, + cache_key_str: str, +): + """ + Generate a Numba function that loops over the columns 2D object and applies + a 1D numba kernel over each column. + + Parameters + ---------- + func : function + aggregation function to be applied to each column + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + cache_key_str: str + string to access the compiled function of the form + _ e.g. 
rolling_mean, groupby_mean + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs, None) + + cache_key = (func, cache_key_str) + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] + + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def column_looper( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + ): + result = np.empty((len(start), values.shape[1]), dtype=np.float64) + for i in numba.prange(values.shape[1]): + result[:, i] = func(values[:, i], start, end, min_periods) + return result + + return column_looper diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py new file mode 100644 index 0000000000000..eb43de1e0d979 --- /dev/null +++ b/pandas/core/_numba/kernels/__init__.py @@ -0,0 +1,3 @@ +from pandas.core._numba.kernels.mean_ import sliding_mean + +__all__ = ["sliding_mean"] diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py new file mode 100644 index 0000000000000..32ea505513ed0 --- /dev/null +++ b/pandas/core/_numba/kernels/mean_.py @@ -0,0 +1,119 @@ +""" +Numba 1D aggregation kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +import numba +import numpy as np + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def is_monotonic_increasing(bounds: np.ndarray) -> bool: + """Check if int64 values are monotonically increasing.""" + n = len(bounds) + if n < 2: + return True + prev = bounds[0] + for i in range(1, n): + cur = bounds[i] + if cur < prev: + return False + prev = cur + return True + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def add_mean( + val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float +) -> tuple[int, float, int, float]: + if not np.isnan(val): + nobs += 1 + y = val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + if val < 0: + neg_ct += 1 + return nobs, sum_x, neg_ct, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def remove_mean( + val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float +) -> tuple[int, float, int, float]: + if not np.isnan(val): + nobs -= 1 + y = -val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + if val < 0: + neg_ct -= 1 + return nobs, sum_x, neg_ct, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_mean( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, +) -> np.ndarray: + N = len(start) + nobs = 0 + sum_x = 0.0 + neg_ct = 0 + compensation_add = 0.0 + compensation_remove = 0.0 + + is_monotonic_increasing_bounds = is_monotonic_increasing( + start + ) and is_monotonic_increasing(end) + + output = np.empty(N, dtype=np.float64) + + for i in range(N): + s = start[i] + e = end[i] + if i == 0 or not is_monotonic_increasing_bounds: + for j in range(s, e): + val = values[j] + nobs, sum_x, neg_ct, compensation_add = add_mean( + val, nobs, sum_x, neg_ct, compensation_add + ) + else: + for j in range(start[i - 1], s): + val = values[j] + nobs, sum_x, neg_ct, compensation_remove = remove_mean( + val, nobs, sum_x, neg_ct, compensation_remove + ) + + for j in range(end[i - 1], e): + val = values[j] + nobs, sum_x, neg_ct, compensation_add = add_mean( + val, nobs, sum_x, neg_ct, compensation_add + ) + 
+ if nobs >= min_periods and nobs > 0: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + result = 0 + elif neg_ct == nobs and result > 0: + result = 0 + else: + result = np.nan + + output[i] = result + + if not is_monotonic_increasing_bounds: + nobs = 0 + sum_x = 0.0 + neg_ct = 0 + compensation_remove = 0.0 + + return output diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 181cb46325708..77cd73fdfe91b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,6 +28,7 @@ ArrayLike, DtypeObj, Scalar, + TakeIndexer, npt, ) from pandas.util._decorators import doc @@ -95,6 +96,7 @@ ) from pandas.core.arrays import ( DatetimeArray, + ExtensionArray, TimedeltaArray, ) @@ -1431,7 +1433,11 @@ def get_indexer(current_indexer, other_indexer): def take( - arr, indices: np.ndarray, axis: int = 0, allow_fill: bool = False, fill_value=None + arr, + indices: TakeIndexer, + axis: int = 0, + allow_fill: bool = False, + fill_value=None, ): """ Take elements from an array. @@ -1441,7 +1447,7 @@ def take( arr : array-like or scalar value Non array-likes (sequences/scalars without a dtype) are coerced to an ndarray. - indices : sequence of integers + indices : sequence of int or one-dimensional np.ndarray of int Indices to be taken. axis : int, default 0 The axis over which to select values. @@ -1530,7 +1536,7 @@ def take( def searchsorted( arr: ArrayLike, - value: NumpyValueArrayLike, + value: NumpyValueArrayLike | ExtensionArray, side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: @@ -1604,14 +1610,14 @@ def searchsorted( value = cast(int, dtype.type(value)) else: value = pd_array(cast(ArrayLike, value), dtype=dtype) - elif not ( - is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) - ): + else: # E.g. if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value arr = ensure_wrapped_if_datetimelike(arr) - return arr.searchsorted(value, side=side, sorter=sorter) + # Argument 1 to "searchsorted" of "ndarray" has incompatible type + # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike" + return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type] # ---- # diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7555fb50f16af..291ad2b071665 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -4,6 +4,7 @@ from collections import defaultdict from functools import partial import inspect +import re from typing import ( TYPE_CHECKING, Any, @@ -30,9 +31,10 @@ AggFuncTypeDict, AggObjType, Axis, - FrameOrSeries, + NDFrameT, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( @@ -271,8 +273,9 @@ def transform_dict_like(self, func): "No transform functions were provided", }: raise err - elif not isinstance(err, TypeError): - all_type_errors = False + else: + if not isinstance(err, TypeError): + all_type_errors = False failed_names.append(name) # combine results if not results: @@ -280,12 +283,11 @@ def transform_dict_like(self, func): raise klass("Transform function failed") if len(failed_names) > 0: warnings.warn( - f"{failed_names} did not transform successfully and did not raise " - f"a TypeError. If any error is raised except for TypeError, " - f"this will raise in a future version of pandas. 
" + f"{failed_names} did not transform successfully. If any error is " + f"raised, this will raise in a future version of pandas. " f"Drop these columns/ops to avoid this warning.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return concat(results, axis=1) @@ -335,6 +337,13 @@ def agg_list_like(self) -> DataFrame | Series: results = [] keys = [] + failed_names = [] + + depr_nuisance_columns_msg = ( + "{} did not aggregate successfully. If any error is " + "raised this will raise in a future version of pandas. " + "Drop these columns/ops to avoid this warning." + ) # degenerate case if selected_obj.ndim == 1: @@ -344,7 +353,7 @@ def agg_list_like(self) -> DataFrame | Series: new_res = colg.aggregate(a) except TypeError: - pass + failed_names.append(com.get_callable_name(a) or a) else: results.append(new_res) @@ -358,20 +367,39 @@ def agg_list_like(self) -> DataFrame | Series: for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: - new_res = colg.aggregate(arg) + # Capture and suppress any warnings emitted by us in the call + # to agg below, but pass through any warnings that were + # generated otherwise. + # This is necessary because of https://bugs.python.org/issue29672 + # See GH #43741 for more details + with warnings.catch_warnings(record=True) as record: + new_res = colg.aggregate(arg) + if len(record) > 0: + match = re.compile(depr_nuisance_columns_msg.format(".*")) + for warning in record: + if re.match(match, str(warning.message)): + failed_names.append(col) + else: + warnings.warn_explicit( + message=warning.message, + category=warning.category, + filename=warning.filename, + lineno=warning.lineno, + ) + except (TypeError, DataError): - pass + failed_names.append(col) except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named - pass + failed_names.append(col) elif "no results" in str(err): # reached in test_frame_apply.test_nuiscance_columns # where the colg.aggregate(arg) ends up going through # the selected_obj.ndim == 1 branch above with arg == ["sum"] # on a datetime64[ns] column - pass + failed_names.append(col) else: raise else: @@ -384,6 +412,13 @@ def agg_list_like(self) -> DataFrame | Series: if not len(results): raise ValueError("no results") + if len(failed_names) > 0: + warnings.warn( + depr_nuisance_columns_msg.format(failed_names), + FutureWarning, + stacklevel=find_stack_level(), + ) + try: concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: @@ -1051,7 +1086,6 @@ def agg(self): result = super().agg() if result is None: f = self.f - args = self.args kwargs = self.kwargs # string, list-like, and dict-like are entirely handled in super @@ -1070,9 +1104,9 @@ def agg(self): # then .agg and .apply would have different semantics if the # operation is actually defined on the Series, e.g. 
str try: - result = self.obj.apply(f, *args, **kwargs) + result = self.obj.apply(f) except (ValueError, AttributeError, TypeError): - result = f(self.obj, *args, **kwargs) + result = f(self.obj) return result @@ -1120,7 +1154,7 @@ def apply_standard(self) -> DataFrame | Series: class GroupByApply(Apply): def __init__( self, - obj: GroupBy[FrameOrSeries], + obj: GroupBy[NDFrameT], func: AggFuncType, args, kwargs, @@ -1338,7 +1372,7 @@ def _make_unique_kwarg_list( def relabel_result( - result: FrameOrSeries, + result: DataFrame | Series, func: dict[str, list[Callable | str]], columns: Iterable[Hashable], order: Iterable[int], diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 201e177d8bb10..6d350cfa2c1d6 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -178,14 +178,7 @@ def take_1d( """ if not isinstance(arr, np.ndarray): # ExtensionArray -> dispatch to their method - - # error: Argument 1 to "take" of "ExtensionArray" has incompatible type - # "ndarray"; expected "Sequence[int]" - return arr.take( - indexer, # type: ignore[arg-type] - fill_value=fill_value, - allow_fill=allow_fill, - ) + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if not allow_fill: return arr.take(indexer) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4c7ccc2f16477..e43e66fed8957 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -22,6 +22,7 @@ ScalarIndexer, SequenceIndexer, Shape, + TakeIndexer, npt, type_t, ) @@ -101,7 +102,7 @@ def _validate_scalar(self, value): def take( self: NDArrayBackedExtensionArrayT, - indices: Sequence[int], + indices: TakeIndexer, *, allow_fill: bool = False, fill_value: Any = None, @@ -112,9 +113,7 @@ def take( new_data = take( self._ndarray, - # error: Argument 2 to "take" has incompatible type "Sequence[int]"; - # expected "ndarray" - indices, # type: ignore[arg-type] + indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis, @@ -323,7 +322,7 @@ def __repr__(self) -> str: # ------------------------------------------------------------------------ # __array_function__ methods - def putmask(self: NDArrayBackedExtensionArrayT, mask: np.ndarray, value) -> None: + def putmask(self, mask: np.ndarray, value) -> None: """ Analogue to np.putmask(self, mask, value) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 088c44334495c..8ee5a4a2d913a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -33,6 +33,7 @@ ScalarIndexer, SequenceIndexer, Shape, + TakeIndexer, npt, ) from pandas.compat import set_function_name @@ -1076,7 +1077,7 @@ def repeat(self, repeats: int | Sequence[int], axis: int | None = None): def take( self: ExtensionArrayT, - indices: Sequence[int], + indices: TakeIndexer, *, allow_fill: bool = False, fill_value: Any = None, @@ -1086,7 +1087,7 @@ def take( Parameters ---------- - indices : sequence of int + indices : sequence of int or one-dimensional np.ndarray of int Indices to be taken. allow_fill : bool, default False How to handle negative values in `indices`. 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 36e7a090214ed..2c9796e826825 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1228,7 +1228,9 @@ def _addsub_object_array(self, other: np.ndarray, op): ) return result - def _time_shift(self, periods: int, freq=None): + def _time_shift( + self: DatetimeLikeArrayT, periods: int, freq=None + ) -> DatetimeLikeArrayT: """ Shift each value by `periods`. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 732bdb112b8c3..68365613c8c77 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -934,9 +934,7 @@ def copy(self: IntervalArrayT) -> IntervalArrayT: def isna(self) -> np.ndarray: return isna(self._left) - def shift( - self: IntervalArrayT, periods: int = 1, fill_value: object = None - ) -> IntervalArray: + def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: if not len(self) or periods == 0: return self.copy() diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 90af1cf082981..3bbe936befea9 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -340,10 +340,14 @@ def to_coo(self): cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.iteritems()): - row = ser.array.sp_index.to_int_index().indices + sp_arr = ser.array + if sp_arr.fill_value != 0: + raise ValueError("fill value must be 0 when converting to COO matrix") + + row = sp_arr.sp_index.to_int_index().indices cols.append(np.repeat(col, len(row))) rows.append(row) - data.append(ser.array.sp_values.astype(dtype, copy=False)) + data.append(sp_arr.sp_values.astype(dtype, copy=False)) cols = np.concatenate(cols) rows = np.concatenate(rows) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 57f9c7262bce3..312cf387a8472 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -892,13 +892,39 @@ def __getitem__( elif isinstance(key, tuple): data_slice = self.to_dense()[key] elif isinstance(key, slice): - # special case to preserve dtypes - if key == slice(None): - return self.copy() - # TODO: this logic is surely elsewhere - # TODO: this could be more efficient - indices = np.arange(len(self), dtype=np.int32)[key] - return self.take(indices) + + # Avoid densifying when handling contiguous slices + if key.step is None or key.step == 1: + start = 0 if key.start is None else key.start + if start < 0: + start += len(self) + + end = len(self) if key.stop is None else key.stop + if end < 0: + end += len(self) + + indices = self.sp_index.to_int_index().indices + keep_inds = np.flatnonzero((indices >= start) & (indices < end)) + sp_vals = self.sp_values[keep_inds] + + sp_index = indices[keep_inds].copy() + + # If we've sliced to not include the start of the array, all our indices + # should be shifted. 
NB: here we are careful to also not shift by a + # negative value for a case like [0, 1][-100:] where the start index + # should be treated like 0 + if start > 0: + sp_index -= start + + # Length of our result should match applying this slice to a range + # of the length of our original array + new_len = len(range(len(self))[key]) + new_sp_index = make_sparse_index(new_len, sp_index, self.kind) + return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) + else: + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) + else: # TODO: I think we can avoid densifying when masking a # boolean SparseArray with another. Need to look at the @@ -1571,9 +1597,6 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): else: return type(self)(result) - def __abs__(self): - return np.abs(self) - # ------------------------------------------------------------------------ # Ops # ------------------------------------------------------------------------ @@ -1655,6 +1678,9 @@ def __neg__(self) -> SparseArray: def __invert__(self) -> SparseArray: return self._unary_method(operator.invert) + def __abs__(self) -> SparseArray: + return self._unary_method(operator.abs) + # ---------- # Formatting # ----------- @@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntInde def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: index: SparseIndex - if kind == "block" or isinstance(kind, BlockIndex): + if kind == "block": locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) - elif kind == "integer" or isinstance(kind, IntIndex): + elif kind == "integer": index = IntIndex(length, indices) else: # pragma: no cover raise ValueError("must be block or integer type") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9411d3535e06f..c7d08f7873c09 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Sequence, Union, cast, overload, @@ -24,6 +23,7 @@ Scalar, ScalarIndexer, SequenceIndexer, + TakeIndexer, npt, ) from pandas.compat import ( @@ -307,9 +307,7 @@ def __getitem__( if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item.dtype): - # error: Argument 1 to "take" of "ArrowStringArray" has incompatible - # type "ndarray"; expected "Sequence[int]" - return self.take(item) # type: ignore[arg-type] + return self.take(item) elif is_bool_dtype(item.dtype): return type(self)(self._data.filter(item)) else: @@ -513,14 +511,17 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: self[k] = v def take( - self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + self, + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, ): """ Take elements from an array. Parameters ---------- - indices : sequence of int + indices : sequence of int or one-dimensional np.ndarray of int Indices to be taken. allow_fill : bool, default False How to handle negative values in `indices`. 
diff --git a/pandas/core/base.py b/pandas/core/base.py index b0993bbc619dc..24fa362eea9c3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,6 +14,7 @@ TypeVar, cast, final, + overload, ) import numpy as np @@ -22,8 +23,8 @@ from pandas._typing import ( ArrayLike, DtypeObj, - FrameOrSeries, IndexLabel, + NDFrameT, Shape, npt, ) @@ -181,13 +182,13 @@ class SpecificationError(Exception): pass -class SelectionMixin(Generic[FrameOrSeries]): +class SelectionMixin(Generic[NDFrameT]): """ mixin implementing the selection & aggregation interface on a group-like object sub-classes need to define: obj, exclusions """ - obj: FrameOrSeries + obj: NDFrameT _selection: IndexLabel | None = None exclusions: frozenset[Hashable] _internal_names = ["_cache", "__setstate__"] @@ -221,7 +222,11 @@ def _obj_with_exclusions(self): return self.obj[self._selection_list] if len(self.exclusions) > 0: - return self.obj.drop(self.exclusions, axis=1) + # equivalent to `self.obj.drop(self.exclusions, axis=1) + # but this avoids consolidating and making a copy + return self.obj._drop_axis( + self.exclusions, axis=1, consolidate=False, only_slice=True + ) else: return self.obj @@ -1226,14 +1231,50 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): 0 # wrong result, correct would be 1 """ + # This overload is needed so that the call to searchsorted in + # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result + + @overload + # The following ignore is also present in numpy/__init__.pyi + # Possibly a mypy bug?? + # error: Overloaded function signatures 1 and 2 overlap with incompatible + # return types [misc] + def searchsorted( # type: ignore[misc] + self, + value: npt._ScalarLike_co, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> np.intp: + ... + + @overload + def searchsorted( + self, + value: npt.ArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp]: + ... 
+ @doc(_shared_docs["searchsorted"], klass="Index") def searchsorted( self, - value: NumpyValueArrayLike, + value: NumpyValueArrayLike | ExtensionArray, side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: - return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) + + values = self._values + if not isinstance(values, np.ndarray): + # Going through EA.searchsorted directly improves performance GH#38083 + return values.searchsorted(value, side=side, sorter=sorter) + + return algorithms.searchsorted( + values, + value, + side=side, + sorter=sorter, + ) def drop_duplicates(self, keep="first"): duplicated = self._duplicated(keep=keep) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 148b37cafbf78..a4bd0270f9451 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -15,7 +15,6 @@ import numpy as np -from pandas._typing import FrameOrSeries from pandas.errors import PerformanceWarning from pandas.core.dtypes.generic import ( @@ -28,14 +27,15 @@ from pandas.core.computation.common import result_type_many if TYPE_CHECKING: + from pandas.core.generic import NDFrame from pandas.core.indexes.api import Index def _align_core_single_unary_op( term, -) -> tuple[partial | type[FrameOrSeries], dict[str, Index] | None]: +) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: - typ: partial | type[FrameOrSeries] + typ: partial | type[NDFrame] axes: dict[str, Index] | None = None if isinstance(term.value, np.ndarray): @@ -49,7 +49,7 @@ def _align_core_single_unary_op( def _zip_axes_from_type( - typ: type[FrameOrSeries], new_axes: Sequence[Index] + typ: type[NDFrame], new_axes: Sequence[Index] ) -> dict[str, Index]: return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d495f89970348..51af2cd732d09 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -265,7 +265,10 @@ def f(self, *args, **kwargs): return f -_T = TypeVar("_T", bound="BaseExprVisitor") +# should be bound by BaseExprVisitor but that creates a circular dependency: +# _T is used in disallow, but disallow is used to define BaseExprVisitor +# https://github.com/microsoft/pyright/issues/2315 +_T = TypeVar("_T") def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: @@ -279,11 +282,13 @@ def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: """ def disallowed(cls: type[_T]) -> type[_T]: - cls.unsupported_nodes = () + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes = () # type: ignore[attr-defined] for node in nodes: new_method = _node_not_implemented(node) name = f"visit_{node}" - cls.unsupported_nodes += (name,) + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes += (name,) # type: ignore[attr-defined] setattr(cls, name, new_method) return cls @@ -702,7 +707,8 @@ def visit_Call(self, node, side=None, **kwargs): if key.arg: kwargs[key.arg] = self.visit(key.value).value - return self.const_type(res(*new_args, **kwargs), self.env) + name = self.env.add_tmp(res(*new_args, **kwargs)) + return self.term_type(name=name, env=self.env) def translate_In(self, op): return op diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 90349ee5f942f..8d3fd0c520a6d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -49,7 +49,10 @@ is_object_dtype, 
is_timedelta64_ns_dtype, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + PandasDtype, +) from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndex, @@ -494,6 +497,10 @@ def sanitize_array( if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) + if isinstance(dtype, PandasDtype): + # Avoid ending up with a PandasArray + dtype = dtype.numpy_dtype + # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index fd45da4a3ccc7..2c4a340e8c8ea 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -22,7 +22,7 @@ import numpy as np from pandas._libs.tslibs import Timestamp -from pandas._typing import FrameOrSeries +from pandas._typing import NDFrameT from pandas.util._validators import validate_percentile from pandas.core.dtypes.common import ( @@ -45,12 +45,12 @@ def describe_ndframe( *, - obj: FrameOrSeries, + obj: NDFrameT, include: str | Sequence[str] | None, exclude: str | Sequence[str] | None, datetime_is_numeric: bool, percentiles: Sequence[float] | np.ndarray | None, -) -> FrameOrSeries: +) -> NDFrameT: """Describe series or dataframe. Called from pandas.core.generic.NDFrame.describe() @@ -91,7 +91,7 @@ def describe_ndframe( ) result = describer.describe(percentiles=percentiles) - return cast(FrameOrSeries, result) + return cast(NDFrameT, result) class NDFrameDescriberAbstract(ABC): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d6a8790afd998..2d4d783e57d9a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -77,6 +77,7 @@ DatetimeTZDtype, ExtensionDtype, IntervalDtype, + PandasDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( @@ -1305,6 +1306,9 @@ def astype_array_safe( raise TypeError(msg) dtype = pandas_dtype(dtype) + if isinstance(dtype, PandasDtype): + # Ensure we don't end up with a PandasArray + dtype = dtype.numpy_dtype try: new_values = astype_array(values, dtype, copy=copy) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bfdfeabbd389c..5f4207d0985ef 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8201,7 +8201,7 @@ def stack(self, level: Level = -1, dropna: bool = True): def explode( self, - column: str | tuple | list[str | tuple], + column: IndexLabel, ignore_index: bool = False, ) -> DataFrame: """ @@ -8211,7 +8211,7 @@ def explode( Parameters ---------- - column : str or tuple or list thereof + column : IndexLabel Column(s) to explode. 
For multiple columns, specify a non-empty list with each element be str or tuple, and all specified columns their list-like data @@ -8293,9 +8293,8 @@ def explode( if not self.columns.is_unique: raise ValueError("columns must be unique") - columns: list[str | tuple] + columns: list[Hashable] if is_scalar(column) or isinstance(column, tuple): - assert isinstance(column, (str, tuple)) columns = [column] elif isinstance(column, list) and all( map(lambda c: is_scalar(c) or isinstance(c, tuple), column) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ae5b580c8070d..91a446dd99334 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -42,12 +42,12 @@ DtypeArg, DtypeObj, FilePathOrBuffer, - FrameOrSeries, IndexKeyFunc, IndexLabel, JSONSerializable, Level, Manager, + NDFrameT, RandomState, Renamer, StorageOptions, @@ -297,9 +297,7 @@ def _from_mgr(cls, mgr: Manager): object.__setattr__(obj, "_attrs", {}) return obj - def _as_manager( - self: FrameOrSeries, typ: str, copy: bool_t = True - ) -> FrameOrSeries: + def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT: """ Private helper function to create a DataFrame with specific manager. @@ -388,11 +386,11 @@ def flags(self) -> Flags: @final def set_flags( - self: FrameOrSeries, + self: NDFrameT, *, copy: bool_t = False, allows_duplicate_labels: bool_t | None = None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Return a new object with updated flags. @@ -457,7 +455,7 @@ def _validate_dtype(cls, dtype) -> DtypeObj | None: # Construction @property - def _constructor(self: FrameOrSeries) -> type[FrameOrSeries]: + def _constructor(self: NDFrameT) -> type[NDFrameT]: """ Used when a manipulation result has the same dimensions as the original. @@ -704,24 +702,22 @@ def size(self) -> int: @overload def set_axis( - self: FrameOrSeries, labels, axis: Axis = ..., inplace: Literal[False] = ... - ) -> FrameOrSeries: + self: NDFrameT, labels, axis: Axis = ..., inplace: Literal[False] = ... + ) -> NDFrameT: ... @overload - def set_axis( - self: FrameOrSeries, labels, axis: Axis, inplace: Literal[True] - ) -> None: + def set_axis(self, labels, axis: Axis, inplace: Literal[True]) -> None: ... @overload - def set_axis(self: FrameOrSeries, labels, *, inplace: Literal[True]) -> None: + def set_axis(self, labels, *, inplace: Literal[True]) -> None: ... @overload def set_axis( - self: FrameOrSeries, labels, axis: Axis = ..., inplace: bool_t = ... - ) -> FrameOrSeries | None: + self: NDFrameT, labels, axis: Axis = ..., inplace: bool_t = ... + ) -> NDFrameT | None: ... def set_axis(self, labels, axis: Axis = 0, inplace: bool_t = False): @@ -770,7 +766,7 @@ def _set_axis(self, axis: int, labels: Index) -> None: self._clear_item_cache() @final - def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: + def swapaxes(self: NDFrameT, axis1, axis2, copy=True) -> NDFrameT: """ Interchange axes and swap values axes appropriately. @@ -808,7 +804,7 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: @final @doc(klass=_shared_doc_kwargs["klass"]) - def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: + def droplevel(self: NDFrameT, level, axis=0) -> NDFrameT: """ Return {klass} with requested index / column level(s) removed. 
@@ -991,7 +987,7 @@ def squeeze(self, axis=None): # Rename def rename( - self: FrameOrSeries, + self: NDFrameT, mapper: Renamer | None = None, *, index: Renamer | None = None, @@ -1001,7 +997,7 @@ def rename( inplace: bool_t = False, level: Level | None = None, errors: str = "ignore", - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left @@ -1147,7 +1143,11 @@ def rename( # GH 13473 if not callable(replacements): - indexer = ax.get_indexer_for(replacements) + if ax._is_multi and level is not None: + indexer = ax.get_level_values(level).get_indexer_for(replacements) + else: + indexer = ax.get_indexer_for(replacements) + if errors == "raise" and len(indexer[indexer == -1]): missing_labels = [ label @@ -1590,11 +1590,11 @@ def bool(self): self.__nonzero__() @final - def __abs__(self: FrameOrSeries) -> FrameOrSeries: + def __abs__(self: NDFrameT) -> NDFrameT: return self.abs() @final - def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: + def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT: return self.round(decimals) # ------------------------------------------------------------------------- @@ -2703,10 +2703,12 @@ def to_hdf( """ from pandas.io import pytables + # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected + # "Union[DataFrame, Series]" [arg-type] pytables.to_hdf( path_or_buf, key, - self, + self, # type: ignore[arg-type] mode=mode, complevel=complevel, complib=complib, @@ -3534,8 +3536,8 @@ def _clear_item_cache(self) -> None: # Indexing Methods def take( - self: FrameOrSeries, indices, axis=0, is_copy: bool_t | None = None, **kwargs - ) -> FrameOrSeries: + self: NDFrameT, indices, axis=0, is_copy: bool_t | None = None, **kwargs + ) -> NDFrameT: """ Return the elements in the given *positional* indices along an axis. @@ -3632,7 +3634,7 @@ class max_speed ) return self._constructor(new_data).__finalize__(self, method="take") - def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries: + def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT: """ Internal version of the `take` method that sets the `_is_copy` attribute to keep track of the parent dataframe (using in indexing @@ -3832,7 +3834,7 @@ class animal locomotion def __getitem__(self, item): raise AbstractMethodError(self) - def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: + def _slice(self: NDFrameT, slobj: slice, axis=0) -> NDFrameT: """ Construct a slice of this container. @@ -3850,7 +3852,7 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: return result @final - def _set_is_copy(self, ref: FrameOrSeries, copy: bool_t = True) -> None: + def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None: if not copy: self._is_copy = None else: @@ -3953,6 +3955,8 @@ def __delitem__(self, key) -> None: maybe_shortcut = False if self.ndim == 2 and isinstance(self.columns, MultiIndex): try: + # By using engine's __contains__ we effectively + # restrict to same-length tuples maybe_shortcut = key not in self.columns._engine except TypeError: pass @@ -4050,13 +4054,13 @@ def _is_view(self) -> bool_t: @final def reindex_like( - self: FrameOrSeries, + self: NDFrameT, other, method: str | None = None, copy: bool_t = True, limit=None, tolerance=None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Return an object with matching indices as other object. 
@@ -4198,8 +4202,14 @@ def drop( @final def _drop_axis( - self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" - ) -> FrameOrSeries: + self: NDFrameT, + labels, + axis, + level=None, + errors: str = "raise", + consolidate: bool_t = True, + only_slice: bool_t = False, + ) -> NDFrameT: """ Drop labels from specified axis. Used in the ``drop`` method internally. @@ -4212,10 +4222,13 @@ def _drop_axis( For MultiIndex errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. + consolidate : bool, default True + Whether to call consolidate_inplace in the reindex_indexer call. + only_slice : bool, default False + Whether indexing along columns should be view-only. """ - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) + axis_num = self._get_axis_number(axis) axis = self._get_axis(axis) if axis.is_unique: @@ -4225,7 +4238,7 @@ def _drop_axis( new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) - result = self.reindex(**{axis_name: new_axis}) + indexer = axis.get_indexer(new_axis) # Case for non-unique axis else: @@ -4234,10 +4247,10 @@ def _drop_axis( if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError("axis must be a MultiIndex") - indexer = ~axis.get_level_values(level).isin(labels) + mask = ~axis.get_level_values(level).isin(labels) # GH 18561 MultiIndex.drop should raise if label is absent - if errors == "raise" and indexer.all(): + if errors == "raise" and mask.all(): raise KeyError(f"{labels} not found in axis") elif ( isinstance(axis, MultiIndex) @@ -4247,20 +4260,31 @@ def _drop_axis( # Set level to zero in case of MultiIndex and label is string, # because isin can't handle strings for MultiIndexes GH#36293 # In case of tuples we get dtype object but have to use isin GH#42771 - indexer = ~axis.get_level_values(0).isin(labels) + mask = ~axis.get_level_values(0).isin(labels) else: - indexer = ~axis.isin(labels) + mask = ~axis.isin(labels) # Check if label doesn't exist along axis labels_missing = (axis.get_indexer_for(labels) == -1).any() if errors == "raise" and labels_missing: raise KeyError(f"{labels} not found in axis") - slicer = [slice(None)] * self.ndim - slicer[self._get_axis_number(axis_name)] = indexer + indexer = mask.nonzero()[0] + new_axis = axis.take(indexer) - result = self.loc[tuple(slicer)] + bm_axis = self.ndim - axis_num - 1 + new_mgr = self._mgr.reindex_indexer( + new_axis, + indexer, + axis=bm_axis, + allow_dups=True, + consolidate=consolidate, + only_slice=only_slice, + ) + result = self._constructor(new_mgr) + if self.ndim == 1: + result.name = self.name - return result + return result.__finalize__(self) @final def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: @@ -4281,7 +4305,7 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: self._maybe_update_cacher(verify_is_copy=verify_is_copy) @final - def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: + def add_prefix(self: NDFrameT, prefix: str) -> NDFrameT: """ Prefix labels with string `prefix`. 
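A hedged sketch, outside the diff: for a non-unique axis, the rewritten ``_drop_axis`` builds a boolean keep-mask, converts it to a positional indexer, and passes the taken axis plus indexer to the manager's ``reindex_indexer`` instead of slicing through ``.loc``. The mask-to-indexer step on a plain ``Index`` looks roughly like this (the manager call itself is internal and omitted):

    import pandas as pd

    idx = pd.Index(["a", "b", "a", "c"])   # non-unique axis
    labels = ["a"]

    mask = ~idx.isin(labels)               # keep everything not being dropped
    indexer = mask.nonzero()[0]            # positional indexer: array([1, 3])
    new_axis = idx.take(indexer)           # Index(['b', 'c'], dtype='object')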
@@ -4338,14 +4362,14 @@ def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - # error: Incompatible return value type (got "Optional[FrameOrSeries]", - # expected "FrameOrSeries") + # error: Incompatible return value type (got "Optional[NDFrameT]", + # expected "NDFrameT") # error: Argument 1 to "rename" of "NDFrame" has incompatible type # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" return self.rename(**mapper) # type: ignore[return-value, arg-type] @final - def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: + def add_suffix(self: NDFrameT, suffix: str) -> NDFrameT: """ Suffix labels with string `suffix`. @@ -4402,8 +4426,8 @@ def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - # error: Incompatible return value type (got "Optional[FrameOrSeries]", - # expected "FrameOrSeries") + # error: Incompatible return value type (got "Optional[NDFrameT]", + # expected "NDFrameT") # error: Argument 1 to "rename" of "NDFrame" has incompatible type # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" return self.rename(**mapper) # type: ignore[return-value, arg-type] @@ -4616,7 +4640,7 @@ def sort_index( optional_labels="", optional_axis="", ) - def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: + def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: """ Conform {klass} to new index with optional filling logic. @@ -4862,8 +4886,8 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: ).__finalize__(self, method="reindex") def _reindex_axes( - self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy - ) -> FrameOrSeries: + self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy + ) -> NDFrameT: """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4902,12 +4926,12 @@ def _reindex_multi(self, axes, copy, fill_value): @final def _reindex_with_indexers( - self: FrameOrSeries, + self: NDFrameT, reindexers, fill_value=None, copy: bool_t = False, allow_dups: bool_t = False, - ) -> FrameOrSeries: + ) -> NDFrameT: """allow_dups indicates an internal call here""" # reindex doing multiple operations on different axes if indicated new_data = self._mgr @@ -4940,12 +4964,12 @@ def _reindex_with_indexers( return self._constructor(new_data).__finalize__(self) def filter( - self: FrameOrSeries, + self: NDFrameT, items=None, like: str | None = None, regex: str | None = None, axis=None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Subset the dataframe rows or columns according to the specified index labels. @@ -5043,7 +5067,7 @@ def f(x) -> bool_t: raise TypeError("Must pass either `items`, `like`, or `regex`") @final - def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: + def head(self: NDFrameT, n: int = 5) -> NDFrameT: """ Return the first `n` rows. @@ -5116,7 +5140,7 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: return self.iloc[:n] @final - def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: + def tail(self: NDFrameT, n: int = 5) -> NDFrameT: """ Return the last `n` rows. 
@@ -5192,7 +5216,7 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: @final def sample( - self: FrameOrSeries, + self: NDFrameT, n: int | None = None, frac: float | None = None, replace: bool_t = False, @@ -5200,7 +5224,7 @@ def sample( random_state: RandomState | None = None, axis: Axis | None = None, ignore_index: bool_t = False, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Return a random sample of items from an axis of object. @@ -5411,8 +5435,8 @@ def pipe( @final def __finalize__( - self: FrameOrSeries, other, method: str | None = None, **kwargs - ) -> FrameOrSeries: + self: NDFrameT, other, method: str | None = None, **kwargs + ) -> NDFrameT: """ Propagate metadata from other to self. @@ -5636,8 +5660,8 @@ def dtypes(self): return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) def astype( - self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" - ) -> FrameOrSeries: + self: NDFrameT, dtype, copy: bool_t = True, errors: str = "raise" + ) -> NDFrameT: """ Cast a pandas object to a specified dtype ``dtype``. @@ -5800,10 +5824,10 @@ def astype( result = concat(results, axis=1, copy=False) result.columns = self.columns # https://github.com/python/mypy/issues/8354 - return cast(FrameOrSeries, result) + return cast(NDFrameT, result) @final - def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: + def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT: """ Make a copy of this object's indices and data. @@ -5913,11 +5937,11 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: return self._constructor(data).__finalize__(self, method="copy") @final - def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: + def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT: return self.copy(deep=deep) @final - def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: + def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT: """ Parameters ---------- @@ -5928,11 +5952,11 @@ def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: @final def _convert( - self: FrameOrSeries, + self: NDFrameT, datetime: bool_t = False, numeric: bool_t = False, timedelta: bool_t = False, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Attempt to infer better dtype for object columns @@ -5963,7 +5987,7 @@ def _convert( ).__finalize__(self) @final - def infer_objects(self: FrameOrSeries) -> FrameOrSeries: + def infer_objects(self: NDFrameT) -> NDFrameT: """ Attempt to infer better dtypes for object columns. @@ -6010,13 +6034,13 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: @final def convert_dtypes( - self: FrameOrSeries, + self: NDFrameT, infer_objects: bool_t = True, convert_string: bool_t = True, convert_integer: bool_t = True, convert_boolean: bool_t = True, convert_floating: bool_t = True, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. 
@@ -6164,7 +6188,7 @@ def convert_dtypes( ] if len(results) > 0: # https://github.com/python/mypy/issues/8354 - return cast(FrameOrSeries, concat(results, axis=1, copy=False)) + return cast(NDFrameT, concat(results, axis=1, copy=False)) else: return self.copy() @@ -6173,14 +6197,14 @@ def convert_dtypes( @doc(**_shared_doc_kwargs) def fillna( - self: FrameOrSeries, + self: NDFrameT, value=None, method=None, axis=None, inplace: bool_t = False, limit=None, downcast=None, - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Fill NA/NaN values using the specified method. @@ -6386,12 +6410,12 @@ def fillna( @doc(klass=_shared_doc_kwargs["klass"]) def ffill( - self: FrameOrSeries, + self: NDFrameT, axis: None | Axis = None, inplace: bool_t = False, limit: None | int = None, downcast=None, - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. @@ -6408,12 +6432,12 @@ def ffill( @doc(klass=_shared_doc_kwargs["klass"]) def bfill( - self: FrameOrSeries, + self: NDFrameT, axis: None | Axis = None, inplace: bool_t = False, limit: None | int = None, downcast=None, - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. @@ -6612,7 +6636,7 @@ def replace( return result.__finalize__(self, method="replace") def interpolate( - self: FrameOrSeries, + self: NDFrameT, method: str = "linear", axis: Axis = 0, limit: int | None = None, @@ -6621,7 +6645,7 @@ def interpolate( limit_area: str | None = None, downcast: str | None = None, **kwargs, - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Fill NaN values using an interpolation method. @@ -7082,7 +7106,7 @@ def asof(self, where, subset=None): # Action Methods @doc(klass=_shared_doc_kwargs["klass"]) - def isna(self: FrameOrSeries) -> FrameOrSeries: + def isna(self: NDFrameT) -> NDFrameT: """ Detect missing values. @@ -7145,11 +7169,11 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self, method="isna") @doc(isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self: FrameOrSeries) -> FrameOrSeries: + def isnull(self: NDFrameT) -> NDFrameT: return isna(self).__finalize__(self, method="isnull") @doc(klass=_shared_doc_kwargs["klass"]) - def notna(self: FrameOrSeries) -> FrameOrSeries: + def notna(self: NDFrameT) -> NDFrameT: """ Detect existing (non-missing) values. @@ -7212,7 +7236,7 @@ def notna(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self, method="notna") @doc(notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self: FrameOrSeries) -> FrameOrSeries: + def notnull(self: NDFrameT) -> NDFrameT: return notna(self).__finalize__(self, method="notnull") @final @@ -7276,14 +7300,14 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): return self.where(subset, threshold, axis=axis, inplace=inplace) def clip( - self: FrameOrSeries, + self: NDFrameT, lower=None, upper=None, axis: Axis | None = None, inplace: bool_t = False, *args, **kwargs, - ) -> FrameOrSeries | None: + ) -> NDFrameT | None: """ Trim values at input threshold(s). @@ -7436,13 +7460,13 @@ def clip( @doc(**_shared_doc_kwargs) def asfreq( - self: FrameOrSeries, + self: NDFrameT, freq, method=None, how: str | None = None, normalize: bool_t = False, fill_value=None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Convert time series to specified frequency. 
@@ -7560,9 +7584,7 @@ def asfreq( ) @final - def at_time( - self: FrameOrSeries, time, asof: bool_t = False, axis=None - ) -> FrameOrSeries: + def at_time(self: NDFrameT, time, asof: bool_t = False, axis=None) -> NDFrameT: """ Select values at particular time of day (e.g., 9:30AM). @@ -7618,14 +7640,14 @@ def at_time( @final def between_time( - self: FrameOrSeries, + self: NDFrameT, start_time, end_time, include_start: bool_t | lib.NoDefault = lib.no_default, include_end: bool_t | lib.NoDefault = lib.no_default, inclusive: str | None = None, axis=None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -8179,7 +8201,7 @@ def resample( ) @final - def first(self: FrameOrSeries, offset) -> FrameOrSeries: + def first(self: NDFrameT, offset) -> NDFrameT: """ Select initial periods of time series data based on a date offset. @@ -8252,7 +8274,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: return self.loc[:end] @final - def last(self: FrameOrSeries, offset) -> FrameOrSeries: + def last(self: NDFrameT, offset) -> NDFrameT: """ Select final periods of time series data based on a date offset. @@ -8317,14 +8339,14 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: @final def rank( - self: FrameOrSeries, + self: NDFrameT, axis=0, method: str = "average", numeric_only: bool_t | None = None, na_option: str = "keep", ascending: bool_t = True, pct: bool_t = False, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Compute numerical data ranks (1 through n) along axis. @@ -9179,8 +9201,8 @@ def mask( @doc(klass=_shared_doc_kwargs["klass"]) def shift( - self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None - ) -> FrameOrSeries: + self: NDFrameT, periods=1, freq=None, axis=0, fill_value=None + ) -> NDFrameT: """ Shift index by desired number of periods with an optional time `freq`. @@ -9324,7 +9346,7 @@ def shift( return result.__finalize__(self, method="shift") @final - def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: + def slice_shift(self: NDFrameT, periods: int = 1, axis=0) -> NDFrameT: """ Equivalent to `shift` without copying data. The shifted data will not include the dropped periods and the @@ -9373,9 +9395,7 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: return new_obj.__finalize__(self, method="slice_shift") @final - def tshift( - self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0 - ) -> FrameOrSeries: + def tshift(self: NDFrameT, periods: int = 1, freq=None, axis: Axis = 0) -> NDFrameT: """ Shift the time index, using the index's frequency if available. @@ -9417,8 +9437,8 @@ def tshift( return self.shift(periods, freq, axis) def truncate( - self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True - ) -> FrameOrSeries: + self: NDFrameT, before=None, after=None, axis=None, copy: bool_t = True + ) -> NDFrameT: """ Truncate a Series or DataFrame before and after some index value. @@ -9574,8 +9594,8 @@ def truncate( @final def tz_convert( - self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True - ) -> FrameOrSeries: + self: NDFrameT, tz, axis=0, level=None, copy: bool_t = True + ) -> NDFrameT: """ Convert tz-aware axis to target time zone. 
@@ -9632,14 +9652,14 @@ def _tz_convert(ax, tz): @final def tz_localize( - self: FrameOrSeries, + self: NDFrameT, tz, axis=0, level=None, copy: bool_t = True, ambiguous="raise", nonexistent: str = "raise", - ) -> FrameOrSeries: + ) -> NDFrameT: """ Localize tz-naive index of a Series or DataFrame to target time zone. @@ -9803,7 +9823,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): # Numeric Methods @final - def abs(self: FrameOrSeries) -> FrameOrSeries: + def abs(self: NDFrameT) -> NDFrameT: """ Return a Series/DataFrame with absolute numeric value of each element. @@ -9871,17 +9891,17 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: 3 7 40 -50 """ # error: Incompatible return value type (got "ndarray[Any, dtype[Any]]", - # expected "FrameOrSeries") + # expected "NDFrameT") return np.abs(self) # type: ignore[return-value] @final def describe( - self: FrameOrSeries, + self: NDFrameT, percentiles=None, include=None, exclude=None, datetime_is_numeric=False, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Generate descriptive statistics. @@ -10133,13 +10153,13 @@ def describe( @final def pct_change( - self: FrameOrSeries, + self: NDFrameT, periods=1, fill_method="pad", limit=None, freq=None, **kwargs, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Percentage change between the current and a prior element. @@ -10263,7 +10283,7 @@ def pct_change( data = _data shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs) - # Unsupported left operand type for / ("FrameOrSeries") + # Unsupported left operand type for / ("NDFrameT") rs = data / shifted - 1 # type: ignore[operator] if freq is not None: # Shift method is implemented differently when freq is not None @@ -11017,13 +11037,11 @@ def ewm( adjust: bool_t = True, ignore_na: bool_t = False, axis: Axis = 0, - times: str | np.ndarray | FrameOrSeries | None = None, + times: str | np.ndarray | DataFrame | Series | None = None, method: str = "single", ) -> ExponentialMovingWindow: axis = self._get_axis_number(axis) - # error: Value of type variable "FrameOrSeries" of "ExponentialMovingWindow" - # cannot be "object" - return ExponentialMovingWindow( # type: ignore[type-var] + return ExponentialMovingWindow( self, com=com, span=span, @@ -11805,8 +11823,8 @@ def _doc_params(cls): def _align_as_utc( - left: FrameOrSeries, right: FrameOrSeries, join_index: Index | None -) -> tuple[FrameOrSeries, FrameOrSeries]: + left: NDFrameT, right: NDFrameT, join_index: Index | None +) -> tuple[NDFrameT, NDFrameT]: """ If we are aligning timezone-aware DatetimeIndexes and the timezones do not match, convert both to UTC. 
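Aside for context, not part of the hunk: the ``pct_change`` change above only renames the typing comment, but the annotated expression is the whole computation. Roughly, for the default arguments:

    import pandas as pd

    ser = pd.Series([100.0, 110.0, 99.0])

    # pct_change is essentially data / data.shift(periods) - 1, applied after
    # the optional fill_method handling.
    manual = ser / ser.shift(1) - 1
    print(manual)             # [NaN, 0.1, -0.1]
    print(ser.pct_change())   # matches the manual computation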
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a672f8fc96ba0..8a893db95dc22 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -7,10 +7,7 @@ """ from __future__ import annotations -from collections import ( - abc, - namedtuple, -) +from collections import abc from functools import partial from textwrap import dedent from typing import ( @@ -19,8 +16,10 @@ Hashable, Iterable, Mapping, + NamedTuple, TypeVar, Union, + cast, ) import warnings @@ -29,8 +28,9 @@ from pandas._libs import reduction as libreduction from pandas._typing import ( ArrayLike, - FrameOrSeries, + Manager, Manager2D, + SingleManager, ) from pandas.util._decorators import ( Appender, @@ -45,7 +45,6 @@ is_dict_like, is_integer_dtype, is_interval_dtype, - is_numeric_dtype, is_scalar, ) from pandas.core.dtypes.missing import ( @@ -74,20 +73,18 @@ _agg_template, _apply_docs, _transform_template, - group_selection_context, + warn_dropping_nuisance_columns_deprecated, ) from pandas.core.indexes.api import ( Index, MultiIndex, all_indexes_same, - default_index, ) from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby -NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. AggScalar = Union[str, Callable[..., Any]] # TODO: validate types on ScalarResult and move to _typing @@ -96,7 +93,12 @@ ScalarResult = TypeVar("ScalarResult") -def generate_property(name: str, klass: type[FrameOrSeries]): +class NamedAgg(NamedTuple): + column: Hashable + aggfunc: AggScalar + + +def generate_property(name: str, klass: type[DataFrame | Series]): """ Create a property for a GroupBy subclass to dispatch to DataFrame/Series. @@ -119,7 +121,9 @@ def prop(self): return property(prop) -def pin_allowlisted_properties(klass: type[FrameOrSeries], allowlist: frozenset[str]): +def pin_allowlisted_properties( + klass: type[DataFrame | Series], allowlist: frozenset[str] +): """ Create GroupBy member defs for DataFrame/Series names in a allowlist. 
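Example for context (not part of the diff): ``NamedAgg`` moves from a plain ``collections.namedtuple`` to a typed ``NamedTuple`` with ``column: Hashable`` and ``aggfunc: AggScalar``; user-facing named aggregation is unchanged:

    import pandas as pd

    df = pd.DataFrame({"kind": ["a", "a", "b"], "height": [9.1, 6.0, 9.5]})

    result = df.groupby("kind").agg(
        max_height=pd.NamedAgg(column="height", aggfunc="max"),
        mean_height=pd.NamedAgg(column="height", aggfunc="mean"),
    )
    print(result)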
@@ -159,19 +163,21 @@ def pinner(cls): class SeriesGroupBy(GroupBy[Series]): _apply_allowlist = base.series_apply_allowlist - def _wrap_agged_manager(self, mgr: Manager2D) -> Series: - single = mgr.iget(0) + def _wrap_agged_manager(self, mgr: Manager) -> Series: + if mgr.ndim == 1: + mgr = cast(SingleManager, mgr) + single = mgr + else: + mgr = cast(Manager2D, mgr) + single = mgr.iget(0) ser = self.obj._constructor(single, name=self.obj.name) # NB: caller is responsible for setting ser.index return ser - def _get_data_to_aggregate(self) -> Manager2D: + def _get_data_to_aggregate(self) -> SingleManager: ser = self._obj_with_exclusions single = ser._mgr - columns = default_index(1) - # Much faster than using ser.to_frame() since we avoid inferring columns - # from scalar - return single.to_2d_mgr(columns) + return single def _iterate_slices(self) -> Iterable[Series]: yield self._selected_obj @@ -238,7 +244,7 @@ def apply(self, func, *args, **kwargs): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj result = self._aggregate_with_numba( data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs @@ -330,43 +336,6 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: output = self._reindex_output(output) return output - def _cython_agg_general( - self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 - ): - - obj = self._selected_obj - objvals = obj._values - data = obj._mgr - - if numeric_only and not is_numeric_dtype(obj.dtype): - # GH#41291 match Series behavior - raise NotImplementedError( - f"{type(self).__name__}.{how} does not implement numeric_only." - ) - - # This is overkill because it is only called once, but is here to - # mirror the array_func used in DataFrameGroupBy._cython_agg_general - def array_func(values: ArrayLike) -> ArrayLike: - try: - result = self.grouper._cython_operation( - "aggregate", values, how, axis=data.ndim - 1, min_count=min_count - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - # TODO: shouldn't min_count matter? 
- result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) - - return result - - result = array_func(objvals) - - ser = self.obj._constructor( - result, index=self.grouper.result_index, name=obj.name - ) - return self._reindex_output(ser) - def _indexed_output_to_ndframe( self, output: Mapping[base.OutputKey, ArrayLike] ) -> Series: @@ -877,7 +846,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj result = self._aggregate_with_numba( data, func, *args, engine_kwargs=engine_kwargs, **kwargs @@ -966,46 +935,6 @@ def _iterate_slices(self) -> Iterable[Series]: yield values - def _cython_agg_general( - self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 - ) -> DataFrame: - # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy - - data: Manager2D = self._get_data_to_aggregate() - - if numeric_only: - data = data.get_numeric_data(copy=False) - - def array_func(values: ArrayLike) -> ArrayLike: - try: - result = self.grouper._cython_operation( - "aggregate", values, how, axis=data.ndim - 1, min_count=min_count - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - # TODO: shouldn't min_count matter? - result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) - - return result - - # TypeError -> we may have an exception in trying to aggregate - # continue and exclude the block - new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - - if len(new_mgr) < len(data): - warnings.warn( - f"Dropping invalid columns in {type(self).__name__}.{how} " - "is deprecated. In a future version, a TypeError will be raised. " - f"Before calling .{how}, select only columns which should be " - "valid for the function.", - FutureWarning, - stacklevel=4, - ) - - return self._wrap_agged_manager(new_mgr) - def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") @@ -1191,14 +1120,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: res_mgr.set_axis(1, mgr.axes[1]) if len(res_mgr) < len(mgr): - warnings.warn( - f"Dropping invalid columns in {type(self).__name__}.{how} " - "is deprecated. In a future version, a TypeError will be raised. " - f"Before calling .{how}, select only columns which should be " - "valid for the transforming function.", - FutureWarning, - stacklevel=4, - ) + warn_dropping_nuisance_columns_deprecated(type(self), how) res_df = self.obj._constructor(res_mgr) if self.axis == 1: @@ -1310,14 +1232,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - warnings.warn( - f"Dropping invalid columns in {type(self).__name__}.transform " - "is deprecated. In a future version, a TypeError will be raised. 
" - "Before calling .transform, select only columns which should be " - "valid for the transforming function.", - FutureWarning, - stacklevel=5, - ) + warn_dropping_nuisance_columns_deprecated(type(self), "transform") else: inds.append(i) @@ -1519,7 +1434,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: # values converted return self._reindex_output(result)._convert(datetime=True) - def _iterate_column_groupbys(self, obj: FrameOrSeries): + def _iterate_column_groupbys(self, obj: DataFrame | Series): for i, colname in enumerate(obj.columns): yield colname, SeriesGroupBy( obj.iloc[:, i], @@ -1528,7 +1443,7 @@ def _iterate_column_groupbys(self, obj: FrameOrSeries): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func, obj: FrameOrSeries) -> DataFrame: + def _apply_to_column_groupbys(self, func, obj: DataFrame | Series) -> DataFrame: from pandas.core.reshape.concat import concat columns = obj.columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7ff672429afcf..52bf44a0bb4ec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -44,9 +44,8 @@ class providing the base-class of operations. import pandas._libs.groupby as libgroupby from pandas._typing import ( ArrayLike, - F, - FrameOrSeries, IndexLabel, + NDFrameT, RandomState, Scalar, T, @@ -547,18 +546,6 @@ def f(self): return attr -@contextmanager -def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]: - """ - Set / reset the group_selection_context. - """ - groupby._set_group_selection() - try: - yield groupby - finally: - groupby._reset_group_selection() - - _KeysArgType = Union[ Hashable, List[Hashable], @@ -568,7 +555,7 @@ def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]: ] -class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries]): +class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT]): _group_selection: IndexLabel | None = None _apply_allowlist: frozenset[str] = frozenset() _hidden_attrs = PandasObject._hidden_attrs | { @@ -756,7 +743,7 @@ def get_group(self, name, obj=None) -> DataFrame | Series: return obj._take_with_is_copy(inds, axis=self.axis) @final - def __iter__(self) -> Iterator[tuple[Hashable, FrameOrSeries]]: + def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator. @@ -772,7 +759,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, FrameOrSeries]]: OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) -class GroupBy(BaseGroupBy[FrameOrSeries]): +class GroupBy(BaseGroupBy[NDFrameT]): """ Class for grouping and aggregating relational data. @@ -846,7 +833,7 @@ class GroupBy(BaseGroupBy[FrameOrSeries]): @final def __init__( self, - obj: FrameOrSeries, + obj: NDFrameT, keys: _KeysArgType | None = None, axis: int = 0, level: IndexLabel | None = None, @@ -916,7 +903,7 @@ def __getattr__(self, attr: str): def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist - with group_selection_context(self): + with self._group_selection_context(): # need to setup the selection # as are not passed directly but in the grouper f = getattr(self._obj_with_exclusions, name) @@ -993,6 +980,17 @@ def _reset_group_selection(self) -> None: self._group_selection = None self._reset_cache("_selected_obj") + @contextmanager + def _group_selection_context(self) -> Iterator[GroupBy]: + """ + Set / reset the _group_selection_context. 
+ """ + self._set_group_selection() + try: + yield self + finally: + self._reset_group_selection() + def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) @@ -1214,7 +1212,10 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: numeric_only = True # GH#42395 GH#43108 GH#43154 # Regression from 1.2.5 to 1.3 caused object columns to be dropped - obj = self._obj_with_exclusions + if self.axis: + obj = self._obj_with_exclusions.T + else: + obj = self._obj_with_exclusions check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False @@ -1366,14 +1367,17 @@ def f(g): # fails on *some* columns, e.g. a numeric operation # on a string grouper column - with group_selection_context(self): + with self._group_selection_context(): return self._python_apply_general(f, self._selected_obj) return result @final def _python_apply_general( - self, f: F, data: DataFrame | Series, not_indexed_same: bool | None = None + self, + f: Callable, + data: DataFrame | Series, + not_indexed_same: bool | None = None, ) -> DataFrame | Series: """ Apply function f in python space @@ -1422,14 +1426,7 @@ def _python_agg_general(self, func, *args, **kwargs): # if this function is invalid for this dtype, we will ignore it. result = self.grouper.agg_series(obj, f) except TypeError: - warnings.warn( - f"Dropping invalid columns in {type(self).__name__}.agg " - "is deprecated. In a future version, a TypeError will be raised. " - "Before calling .agg, select only columns which should be " - "valid for the aggregating function.", - FutureWarning, - stacklevel=3, - ) + warn_dropping_nuisance_columns_deprecated(type(self), "agg") continue key = base.OutputKey(label=name, position=idx) @@ -1450,7 +1447,7 @@ def _agg_general( npfunc: Callable, ): - with group_selection_context(self): + with self._group_selection_context(): # try a cython aggregation if we can result = self._cython_agg_general( how=alias, @@ -1500,10 +1497,52 @@ def _agg_py_fallback( # test_groupby_duplicate_columns with object dtype values return ensure_block_shape(res_values, ndim=ndim) + @final def _cython_agg_general( self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): - raise AbstractMethodError(self) + # Note: we never get here with how="ohlc" for DataFrameGroupBy; + # that goes through SeriesGroupBy + + data = self._get_data_to_aggregate() + is_ser = data.ndim == 1 + + if numeric_only: + if is_ser and not is_numeric_dtype(self._selected_obj.dtype): + # GH#41291 match Series behavior + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only." + ) + elif not is_ser: + data = data.get_numeric_data(copy=False) + + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self.grouper._cython_operation( + "aggregate", values, how, axis=data.ndim - 1, min_count=min_count + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? 
+ result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) + + return result + + # TypeError -> we may have an exception in trying to aggregate + # continue and exclude the block + new_mgr = data.grouped_reduce(array_func, ignore_failures=True) + + if not is_ser and len(new_mgr) < len(data): + warn_dropping_nuisance_columns_deprecated(type(self), how) + + res = self._wrap_agged_manager(new_mgr) + if is_ser: + res.index = self.grouper.result_index + return self._reindex_output(res) + else: + return res def _cython_transform( self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs @@ -1515,7 +1554,7 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj df = data if data.ndim == 2 else data.to_frame() result = self._transform_with_numba( @@ -1561,7 +1600,7 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): return self._transform_general(func, *args, **kwargs) @final - def _wrap_transform_fast_result(self, result: FrameOrSeries) -> FrameOrSeries: + def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: """ Fast transform path for aggregations. """ @@ -1745,6 +1784,8 @@ def count(self) -> Series | DataFrame: ids, _, ngroups = self.grouper.group_info mask = ids != -1 + is_series = data.ndim == 1 + def hfunc(bvalues: ArrayLike) -> ArrayLike: # TODO(2DEA): reshape would not be necessary with 2D EAs if bvalues.ndim == 1: @@ -1754,6 +1795,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: masked = mask & ~isna(bvalues) counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + if is_series: + assert counted.ndim == 2 + assert counted.shape[0] == 1 + return counted[0] return counted new_mgr = data.grouped_reduce(hfunc) @@ -1763,8 +1808,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: # _wrap_agged_manager() returns. 
GH 35028 with com.temp_setattr(self, "observed", True): result = self._wrap_agged_manager(new_mgr) - if result.ndim == 1: - result.index = self.grouper.result_index + + if result.ndim == 1: + result.index = self.grouper.result_index return self._reindex_output(result, fill_value=0) @@ -1910,7 +1956,7 @@ def var(self, ddof: int = 1): ) else: func = lambda x: x.var(ddof=ddof) - with group_selection_context(self): + with self._group_selection_context(): return self._python_agg_general(func) @final @@ -2017,7 +2063,7 @@ def max(self, numeric_only: bool = False, min_count: int = -1): @final @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): - def first_compat(obj: FrameOrSeries, axis: int = 0): + def first_compat(obj: NDFrameT, axis: int = 0): def first(x: Series): """Helper function for first item that isn't NA.""" arr = x.array[notna(x.array)] @@ -2042,7 +2088,7 @@ def first(x: Series): @final @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): - def last_compat(obj: FrameOrSeries, axis: int = 0): + def last_compat(obj: NDFrameT, axis: int = 0): def last(x: Series): """Helper function for last item that isn't NA.""" arr = x.array[notna(x.array)] @@ -2102,7 +2148,7 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): - with group_selection_context(self): + with self._group_selection_context(): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T @@ -2486,7 +2532,7 @@ def nth( nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - with group_selection_context(self): + with self._group_selection_context(): mask_left = np.in1d(self._cumcount_array(), nth_array) mask_right = np.in1d( @@ -2647,10 +2693,9 @@ def post_processor(vals: np.ndarray, inference: np.dtype | None) -> np.ndarray: return vals - if is_scalar(q): - res = self.quantile([q], interpolation=interpolation) - nlevels = res.index.nlevels - return res.droplevel(nlevels - 1, axis=0) + orig_scalar = is_scalar(q) + if orig_scalar: + q = [q] qs = np.array(q, dtype=np.float64) ids, _, ngroups = self.grouper.group_info @@ -2702,25 +2747,25 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = self._get_data_to_aggregate() res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) - if len(res_mgr.items) != len(mgr.items): - warnings.warn( - "Dropping invalid columns in " - f"{type(self).__name__}.quantile is deprecated. " - "In a future version, a TypeError will be raised. 
" - "Before calling .quantile, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if not is_ser and len(res_mgr.items) != len(mgr.items): + warn_dropping_nuisance_columns_deprecated(type(self), "quantile") + if len(res_mgr.items) == 0: # re-call grouped_reduce to get the desired exception message mgr.grouped_reduce(blk_func, ignore_failures=False) + # grouped_reduce _should_ raise, so this should not be reached + raise TypeError( # pragma: no cover + "All columns were dropped in grouped_reduce" + ) if is_ser: res = self._wrap_agged_manager(res_mgr) else: res = obj._constructor(res_mgr) + if orig_scalar: + # Avoid expensive MultiIndex construction + return self._wrap_aggregated_output(res) return self._wrap_aggregated_output(res, qs=qs) @final @@ -2784,7 +2829,7 @@ def ngroup(self, ascending: bool = True): 5 0 dtype: int64 """ - with group_selection_context(self): + with self._group_selection_context(): index = self._selected_obj.index result = self._obj_1d_constructor( self.grouper.group_info[0], index, dtype=np.int64 @@ -2848,7 +2893,7 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with group_selection_context(self): + with self._group_selection_context(): index = self._selected_obj._get_axis(self.axis) cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) @@ -3134,39 +3179,30 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions # Operate block-wise instead of column-by-column - orig_ndim = obj.ndim + is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() if numeric_only: mgr = mgr.get_numeric_data() res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) - if len(res_mgr.items) != len(mgr.items): + + if not is_ser and len(res_mgr.items) != len(mgr.items): howstr = how.replace("group_", "") - warnings.warn( - "Dropping invalid columns in " - f"{type(self).__name__}.{howstr} is deprecated. " - "In a future version, a TypeError will be raised. " - f"Before calling .{howstr}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=3, - ) + warn_dropping_nuisance_columns_deprecated(type(self), howstr) + if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message - try: - mgr.grouped_reduce(blk_func, ignore_failures=False) - except Exception as err: - error_msg = str(err) - raise TypeError(error_msg) - # We should never get here - raise TypeError("All columns were dropped in grouped_reduce") - - if orig_ndim == 1: + mgr.grouped_reduce(blk_func, ignore_failures=False) + # grouped_reduce _should_ raise, so this should not be reached + raise TypeError( # pragma: no cover + "All columns were dropped in grouped_reduce" + ) + + if is_ser: out = self._wrap_agged_manager(res_mgr) - out.index = self.grouper.result_index else: - out = type(obj)(res_mgr) + out = obj._constructor(res_mgr) return self._wrap_aggregated_output(out) @@ -3620,3 +3656,15 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde else: mi = MultiIndex.from_product([idx, qs]) return mi + + +def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None: + warnings.warn( + "Dropping invalid columns in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, a TypeError will be raised. 
" + f"Before calling .{how}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d02df6a65d359..7577b1e671d60 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -5,6 +5,8 @@ from __future__ import annotations from typing import ( + TYPE_CHECKING, + Any, Hashable, final, ) @@ -14,7 +16,8 @@ from pandas._typing import ( ArrayLike, - FrameOrSeries, + NDFrameT, + npt, ) from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -47,6 +50,9 @@ from pandas.io.formats.printing import pprint_thing +if TYPE_CHECKING: + from pandas.core.generic import NDFrame + class Grouper: """ @@ -299,7 +305,9 @@ def ax(self) -> Index: raise ValueError("_set_grouper must be called before ax is accessed") return index - def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): + def _get_grouper( + self, obj: NDFrameT, validate: bool = True + ) -> tuple[Any, ops.BaseGrouper, NDFrameT]: """ Parameters ---------- @@ -312,7 +320,7 @@ def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): a tuple of binner, grouper, obj (possibly sorted) """ self._set_grouper(obj) - # error: Value of type variable "FrameOrSeries" of "get_grouper" cannot be + # error: Value of type variable "NDFrameT" of "get_grouper" cannot be # "Optional[Any]" # error: Incompatible types in assignment (expression has type "BaseGrouper", # variable has type "None") @@ -326,10 +334,12 @@ def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): dropna=self.dropna, ) - return self.binner, self.grouper, self.obj + # error: Incompatible return value type (got "Tuple[None, None, None]", + # expected "Tuple[Any, BaseGrouper, NDFrameT]") + return self.binner, self.grouper, self.obj # type: ignore[return-value] @final - def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): + def _set_grouper(self, obj: NDFrame, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -400,7 +410,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): obj = obj.take(indexer, axis=self.axis) # error: Incompatible types in assignment (expression has type - # "FrameOrSeries", variable has type "None") + # "NDFrameT", variable has type "None") self.obj = obj # type: ignore[assignment] self._gpr_index = ax return self._gpr_index @@ -459,7 +469,7 @@ def __init__( self, index: Index, grouper=None, - obj: FrameOrSeries | None = None, + obj: NDFrame | None = None, level=None, sort: bool = True, observed: bool = False, @@ -501,11 +511,9 @@ def __init__( # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) + assert self.obj is not None # for mypy _, newgrouper, newobj = self.grouping_vector._get_grouper( - # error: Value of type variable "FrameOrSeries" of "_get_grouper" - # of "Grouper" cannot be "Optional[FrameOrSeries]" - self.obj, # type: ignore[type-var] - validate=False, + self.obj, validate=False ) self.obj = newobj @@ -597,7 +605,7 @@ def ngroups(self) -> int: return len(self.group_index) @cache_readonly - def indices(self): + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # we have a list of groupers if isinstance(self.grouping_vector, ops.BaseGrouper): return self.grouping_vector.indices @@ -690,7 +698,7 @@ def groups(self) -> 
dict[Hashable, np.ndarray]: def get_grouper( - obj: FrameOrSeries, + obj: NDFrameT, key=None, axis: int = 0, level=None, @@ -699,7 +707,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> tuple[ops.BaseGrouper, frozenset[Hashable], FrameOrSeries]: +) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a6436b4c4334b..46e4465667e7e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -10,6 +10,7 @@ import collections import functools from typing import ( + Callable, Generic, Hashable, Iterator, @@ -29,8 +30,7 @@ from pandas._typing import ( ArrayLike, DtypeObj, - F, - FrameOrSeries, + NDFrameT, Shape, npt, ) @@ -82,6 +82,7 @@ BaseMaskedArray, BaseMaskedDtype, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import grouper @@ -306,7 +307,7 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: elif how in ["mean", "median", "var"]: if isinstance(dtype, (BooleanDtype, _IntegerDtype)): return Float64Dtype() - elif is_float_dtype(dtype): + elif is_float_dtype(dtype) or is_complex_dtype(dtype): return dtype elif is_numeric_dtype(dtype): return np.dtype(np.float64) @@ -348,6 +349,9 @@ def _ea_wrap_cython_operation( elif isinstance(values.dtype, FloatingDtype): # FloatingArray npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) + elif isinstance(values.dtype, StringDtype): + # StringArray + npvalues = values.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" @@ -375,7 +379,9 @@ def _reconstruct_ea_result(self, values, res_values): """ # TODO: allow EAs to override this logic - if isinstance(values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype)): + if isinstance( + values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype, StringDtype) + ): dtype = self._get_result_dtype(values.dtype) cls = dtype.construct_array_type() return cls._from_sequence(res_values, dtype=dtype) @@ -678,8 +684,8 @@ def nkeys(self) -> int: return len(self.groupings) def get_iterator( - self, data: FrameOrSeries, axis: int = 0 - ) -> Iterator[tuple[Hashable, FrameOrSeries]]: + self, data: NDFrameT, axis: int = 0 + ) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator @@ -694,7 +700,7 @@ def get_iterator( yield key, group.__finalize__(data, method="groupby") @final - def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> DataSplitter: + def _get_splitter(self, data: NDFrame, axis: int = 0) -> DataSplitter: """ Returns ------- @@ -726,7 +732,9 @@ def group_keys_seq(self): return get_flattened_list(ids, ngroups, self.levels, self.codes) @final - def apply(self, f: F, data: FrameOrSeries, axis: int = 0) -> tuple[list, bool]: + def apply( + self, f: Callable, data: DataFrame | Series, axis: int = 0 + ) -> tuple[list, bool]: mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self.group_keys_seq @@ -741,14 +749,14 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0) -> tuple[list, bool]: # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes, axis): + if not mutated and not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) return 
result_values, mutated @cache_readonly - def indices(self): + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 @@ -799,7 +807,7 @@ def is_monotonic(self) -> bool: return Index(self.group_info[0]).is_monotonic @cache_readonly - def group_info(self): + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) @@ -809,22 +817,26 @@ def group_info(self): @final @cache_readonly - def codes_info(self) -> np.ndarray: + def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis ids, _, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] + ids = ensure_platform_int(ids) + # TODO: if numpy annotates np.lexsort, this ensure_platform_int + # may become unnecessary return ids @final - def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: + def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]: + # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index)) + return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) @final @cache_readonly @@ -912,7 +924,7 @@ def _cython_operation( @final def agg_series( - self, obj: Series, func: F, preserve_dtype: bool = False + self, obj: Series, func: Callable, preserve_dtype: bool = False ) -> ArrayLike: """ Parameters @@ -954,7 +966,7 @@ def agg_series( @final def _aggregate_series_pure_python( - self, obj: Series, func: F + self, obj: Series, func: Callable ) -> npt.NDArray[np.object_]: ids, _, ngroups = self.group_info @@ -1009,7 +1021,7 @@ class BinGrouper(BaseGrouper): """ - bins: np.ndarray # np.ndarray[np.int64] + bins: npt.NDArray[np.int64] binlabels: Index mutated: bool @@ -1055,7 +1067,7 @@ def _get_grouper(self): """ return self - def get_iterator(self, data: FrameOrSeries, axis: int = 0): + def get_iterator(self, data: NDFrame, axis: int = 0): """ Groupby iterator @@ -1093,9 +1105,9 @@ def indices(self): return indices @cache_readonly - def group_info(self): + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.int64) + obs_group_ids = np.arange(ngroups, dtype=np.intp) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) @@ -1136,7 +1148,7 @@ def groupings(self) -> list[grouper.Grouping]: ping = grouper.Grouping(lev, lev, in_axis=False, level=None) return [ping] - def _aggregate_series_fast(self, obj: Series, func: F) -> np.ndarray: + def _aggregate_series_fast(self, obj: Series, func: Callable) -> np.ndarray: # -> np.ndarray[object] raise NotImplementedError( "This should not be reached; use _aggregate_series_pure_python" @@ -1158,10 +1170,10 @@ def _is_indexed_like(obj, axes, axis: int) -> bool: # Splitting / application -class DataSplitter(Generic[FrameOrSeries]): +class DataSplitter(Generic[NDFrameT]): def __init__( self, - data: FrameOrSeries, + data: NDFrameT, labels: npt.NDArray[np.intp], ngroups: int, axis: int = 0, @@ -1197,7 +1209,7 @@ def __iter__(self): yield self._chop(sdata, 
slice(start, end)) @cache_readonly - def sorted_data(self) -> FrameOrSeries: + def sorted_data(self) -> NDFrameT: return self.data.take(self._sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj: slice) -> NDFrame: @@ -1235,7 +1247,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: def get_splitter( - data: FrameOrSeries, labels: np.ndarray, ngroups: int, axis: int = 0 + data: NDFrame, labels: np.ndarray, ngroups: int, axis: int = 0 ) -> DataSplitter: if isinstance(data, Series): klass: type[DataSplitter] = SeriesSplitter diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a8de95cfa72e0..c9e128ffc4289 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -315,8 +315,8 @@ class Index(IndexOpsMixin, PandasObject): @final def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_join_target() - ov = other._get_join_target() + sv = self._get_engine_target() + ov = other._get_engine_target() return libjoin.left_join_indexer_unique(sv, ov) @final @@ -324,8 +324,8 @@ def _left_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_join_target() - ov = other._get_join_target() + sv = self._get_engine_target() + ov = other._get_engine_target() joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -335,8 +335,8 @@ def _inner_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_join_target() - ov = other._get_join_target() + sv = self._get_engine_target() + ov = other._get_engine_target() joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -346,8 +346,8 @@ def _outer_indexer( self: _IndexT, other: _IndexT ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: # Caller is responsible for ensuring other.dtype == self.dtype - sv = self._get_join_target() - ov = other._get_join_target() + sv = self._get_engine_target() + ov = other._get_engine_target() joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -894,6 +894,8 @@ def ravel(self, order="C"): # Item "ndarray[Any, Any]" of "Union[ExtensionArray, ndarray[Any, Any]]" # has no attribute "_ndarray" values = self._data._ndarray # type: ignore[union-attr] + elif is_interval_dtype(self.dtype): + values = np.asarray(self._data) else: values = self._get_engine_target() return values.ravel(order=order) @@ -3703,13 +3705,14 @@ def _get_fill_indexer( ) if self.is_monotonic_increasing and target.is_monotonic_increasing: - engine_method = ( - self._engine.get_pad_indexer - if method == "pad" - else self._engine.get_backfill_indexer - ) target_values = target._get_engine_target() - indexer = engine_method(target_values, limit) + own_values = self._get_engine_target() + + if method == "pad": + indexer = libalgos.pad(own_values, target_values, limit=limit) + else: + # i.e. 
"backfill" + indexer = libalgos.backfill(own_values, target_values, limit=limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None and len(self): @@ -4320,12 +4323,8 @@ def _join_non_unique( ) mask = left_idx == -1 - # error: Argument 1 to "take" of "ExtensionArray" has incompatible - # type "ndarray[Any, dtype[signedinteger[Any]]]"; expected "Sequence[int]" - join_array = self._values.take(left_idx) # type: ignore[arg-type] - # error: Argument 1 to "take" of "ExtensionArray" has incompatible type - # "ndarray[Any, dtype[signedinteger[Any]]]"; expected "Sequence[int]" - right = other._values.take(right_idx) # type: ignore[arg-type] + join_array = self._values.take(left_idx) + right = other._values.take(right_idx) if isinstance(join_array, np.ndarray): np.putmask(join_array, mask, right) @@ -4621,12 +4620,6 @@ def _get_engine_target(self) -> np.ndarray: # ndarray]", expected "ndarray") return self._values # type: ignore[return-value] - def _get_join_target(self) -> np.ndarray: - """ - Get the ndarray that we will pass to libjoin functions. - """ - return self._get_engine_target() - def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ Cast the ndarray returned from one of the libjoin.foo_indexer functions diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d29cb519df1b0..063bb4aafeb75 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -41,7 +41,6 @@ is_dtype_equal, is_integer, is_list_like, - is_period_dtype, ) from pandas.core.dtypes.concat import concat_compat @@ -101,23 +100,6 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): def _is_all_dates(self) -> bool: return True - # ------------------------------------------------------------------------ - # Abstract data attributes - - @property - def values(self) -> np.ndarray: - # Note: PeriodArray overrides this to return an ndarray of objects. - return self._data._ndarray - - def __array_wrap__(self, result, context=None): - """ - Gets called after a ufunc and other functions. - """ - out = super().__array_wrap__(result, context=context) - if isinstance(out, DatetimeTimedeltaMixin) and self.freq is not None: - out = out._with_freq("infer") - return out - # ------------------------------------------------------------------------ def equals(self, other: Any) -> bool: @@ -165,21 +147,6 @@ def __contains__(self, key: Any) -> bool: return False return True - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take((), kwargs) - indices = np.asarray(indices, dtype=np.intp) - - maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) - - result = NDArrayBackedExtensionIndex.take( - self, indices, axis, allow_fill, fill_value, **kwargs - ) - if isinstance(maybe_slice, slice): - freq = self._data._get_getitem_freq(maybe_slice) - result._data._freq = freq - return result - _can_hold_na = True _na_value: NaTType = NaT @@ -189,12 +156,6 @@ def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) return super()._convert_tolerance(tolerance, target) - def tolist(self) -> list: - """ - Return a list of the underlying data. 
- """ - return list(self.astype(object)) - # -------------------------------------------------------------------- # Rendering Methods @@ -415,102 +376,7 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: arr = self._data.view() arr._freq = self.freq result = arr._time_shift(periods, freq=freq) - return type(self)(result, name=self.name) - - # -------------------------------------------------------------------- - # List-like Methods - - def _get_delete_freq(self, loc: int | slice | Sequence[int]): - """ - Find the `freq` for self.delete(loc). - """ - freq = None - if is_period_dtype(self.dtype): - freq = self.freq - elif self.freq is not None: - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - # error: Incompatible types in assignment (expression has - # type "Union[slice, ndarray]", variable has type - # "Union[int, slice, Sequence[int]]") - loc = lib.maybe_indices_to_slice( # type: ignore[assignment] - np.asarray(loc, dtype=np.intp), len(self) - ) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - return freq - - def _get_insert_freq(self, loc: int, item): - """ - Find the `freq` for self.insert(loc, item). - """ - value = self._data._validate_scalar(item) - item = self._data._box_func(value) - - freq = None - if is_period_dtype(self.dtype): - freq = self.freq - elif self.freq is not None: - # freq can be preserved on edge cases - if self.size: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - else: - # Adding a single item to an empty index may preserve freq - if self.freq.is_on_offset(item): - freq = self.freq - return freq - - @doc(NDArrayBackedExtensionIndex.delete) - def delete(self: _T, loc) -> _T: - result = super().delete(loc) - result._data._freq = self._get_delete_freq(loc) - return result - - @doc(NDArrayBackedExtensionIndex.insert) - def insert(self, loc: int, item): - result = super().insert(loc, item) - if isinstance(result, type(self)): - # i.e. parent class method did not cast - result._data._freq = self._get_insert_freq(loc, item) - return result - - # -------------------------------------------------------------------- - # Join/Set Methods - - def _get_join_freq(self, other): - """ - Get the freq to attach to the result of a join operation. - """ - if is_period_dtype(self.dtype): - freq = self.freq - else: - self = cast(DatetimeTimedeltaMixin, self) - freq = self.freq if self._can_fast_union(other) else None - return freq - - def _wrap_joined_index(self, joined, other): - assert other.dtype == self.dtype, (other.dtype, self.dtype) - result = super()._wrap_joined_index(joined, other) - result._data._freq = self._get_join_freq(other) - return result - - def _get_engine_target(self) -> np.ndarray: - # engine methods and libjoin methods need dt64/td64 values cast to i8 - return self._data._ndarray.view("i8") - - def _from_join_target(self, result: np.ndarray): - # view e.g. 
i8 back to M8[ns] - result = result.view(self._data._ndarray.dtype) - return self._data._from_backing_data(result) + return type(self)._simple_new(result, name=self.name) # -------------------------------------------------------------------- @@ -558,6 +424,11 @@ def is_type_compatible(self, kind: str) -> bool: ) return kind in self._data._infer_matches + @property + def values(self) -> np.ndarray: + # NB: For Datetime64TZ this is lossy + return self._data._ndarray + # -------------------------------------------------------------------- # Set Operation Methods @@ -708,3 +579,119 @@ def _union(self, other, sort): return result else: return super()._union(other, sort)._with_freq("infer") + + # -------------------------------------------------------------------- + # Join Methods + + def _get_join_freq(self, other): + """ + Get the freq to attach to the result of a join operation. + """ + freq = None + if self._can_fast_union(other): + freq = self.freq + return freq + + def _wrap_joined_index(self, joined, other): + assert other.dtype == self.dtype, (other.dtype, self.dtype) + result = super()._wrap_joined_index(joined, other) + result._data._freq = self._get_join_freq(other) + return result + + def _get_engine_target(self) -> np.ndarray: + # engine methods and libjoin methods need dt64/td64 values cast to i8 + return self._data._ndarray.view("i8") + + def _from_join_target(self, result: np.ndarray): + # view e.g. i8 back to M8[ns] + result = result.view(self._data._ndarray.dtype) + return self._data._from_backing_data(result) + + # -------------------------------------------------------------------- + # List-like Methods + + def _get_delete_freq(self, loc: int | slice | Sequence[int]): + """ + Find the `freq` for self.delete(loc). + """ + freq = None + if self.freq is not None: + if is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + # error: Incompatible types in assignment (expression has + # type "Union[slice, ndarray]", variable has type + # "Union[int, slice, Sequence[int]]") + loc = lib.maybe_indices_to_slice( # type: ignore[assignment] + np.asarray(loc, dtype=np.intp), len(self) + ) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq + return freq + + def _get_insert_freq(self, loc: int, item): + """ + Find the `freq` for self.insert(loc, item). + """ + value = self._data._validate_scalar(item) + item = self._data._box_func(value) + + freq = None + if self.freq is not None: + # freq can be preserved on edge cases + if self.size: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + else: + # Adding a single item to an empty index may preserve freq + if self.freq.is_on_offset(item): + freq = self.freq + return freq + + @doc(NDArrayBackedExtensionIndex.delete) + def delete(self, loc): + result = super().delete(loc) + result._data._freq = self._get_delete_freq(loc) + return result + + @doc(NDArrayBackedExtensionIndex.insert) + def insert(self, loc: int, item): + result = super().insert(loc, item) + if isinstance(result, type(self)): + # i.e. 
parent class method did not cast + result._data._freq = self._get_insert_freq(loc, item) + return result + + # -------------------------------------------------------------------- + # NDArray-Like Methods + + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc and other functions. + """ + out = super().__array_wrap__(result, context=context) + if isinstance(out, DatetimeTimedeltaMixin) and self.freq is not None: + out = out._with_freq("infer") + return out + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take((), kwargs) + indices = np.asarray(indices, dtype=np.intp) + + result = NDArrayBackedExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs + ) + + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + if isinstance(maybe_slice, slice): + freq = self._data._get_getitem_freq(maybe_slice) + result._data._freq = freq + return result diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index a7930ac83a96a..0baa671162c56 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -4,11 +4,8 @@ from __future__ import annotations from typing import ( - TYPE_CHECKING, Hashable, - Literal, TypeVar, - overload, ) import numpy as np @@ -38,17 +35,9 @@ TimedeltaArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.base import Index -if TYPE_CHECKING: - - from pandas._typing import ( - NumpySorter, - NumpyValueArrayLike, - ) - _T = TypeVar("_T", bound="NDArrayBackedExtensionIndex") @@ -207,45 +196,8 @@ def __getitem__(self, key): deprecate_ndim_indexing(result) return result - # This overload is needed so that the call to searchsorted in - # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result - - @overload - # The following ignore is also present in numpy/__init__.pyi - # Possibly a mypy bug?? - # error: Overloaded function signatures 1 and 2 overlap with incompatible - # return types [misc] - def searchsorted( # type: ignore[misc] - self, - value: npt._ScalarLike_co, - side: Literal["left", "right"] = "left", - sorter: NumpySorter = None, - ) -> np.intp: - ... - - @overload - def searchsorted( - self, - value: npt.ArrayLike | ExtensionArray, - side: Literal["left", "right"] = "left", - sorter: NumpySorter = None, - ) -> npt.NDArray[np.intp]: - ... 
- - def searchsorted( - self, - value: NumpyValueArrayLike | ExtensionArray, - side: Literal["left", "right"] = "left", - sorter: NumpySorter = None, - ) -> npt.NDArray[np.intp] | np.intp: - # overriding IndexOpsMixin improves performance GH#38083 - return self._data.searchsorted(value, side=side, sorter=sorter) - # --------------------------------------------------------------------- - def _get_engine_target(self) -> np.ndarray: - return np.asarray(self._data) - def delete(self, loc): """ Make new Index with passed location(-s) deleted diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c74a1b1ea60fd..f494638ba1aa4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -892,11 +892,14 @@ def _is_all_dates(self) -> bool: """ return False - def _get_join_target(self) -> np.ndarray: + def _get_engine_target(self) -> np.ndarray: # Note: we _could_ use libjoin functions by either casting to object # dtype or constructing tuples (faster than constructing Intervals) # but the libjoin fastpaths are no longer fast in these cases. - raise NotImplementedError("IntervalIndex does not use libjoin fastpaths") + raise NotImplementedError( + "IntervalIndex does not use libjoin fastpaths or pass values to " + "IndexEngine objects" + ) def _from_join_target(self, result): raise NotImplementedError("IntervalIndex does not use libjoin fastpaths") diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1d265402b040f..4c4902d3ce89f 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -308,7 +308,16 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ if not isinstance(dtype, PeriodDtype): return False - return dtype.freq == self.freq + # For the subset of DateOffsets that can be a dtype.freq, it + # suffices (and is much faster) to compare the dtype_code rather than + # the freq itself. + # See also: PeriodDtype.__eq__ + freq = dtype.freq + own_freq = self.freq + return ( + freq._period_dtype_code == own_freq._period_dtype_code + and freq.n == own_freq.n + ) # ------------------------------------------------------------------------ # Index Methods diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 4d3dcb9c4732e..080796e7957a3 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -10,6 +10,7 @@ ) from pandas._typing import ( + ArrayLike, DtypeObj, Shape, ) @@ -18,7 +19,10 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.base import PandasObject -from pandas.core.indexes.api import Index +from pandas.core.indexes.api import ( + Index, + default_index, +) T = TypeVar("T", bound="DataManager") @@ -171,6 +175,23 @@ def setitem_inplace(self, indexer, value) -> None: """ self.array[indexer] = value + def grouped_reduce(self, func, ignore_failures: bool = False): + """ + ignore_failures : bool, default False + Not used; for compatibility with ArrayManager/BlockManager. 
+ """ + + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + + @classmethod + def from_array(cls, arr: ArrayLike, index: Index): + raise AbstractMethodError(cls) + def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da7ffbf08c34b..2279dbd283905 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -28,6 +28,7 @@ DtypeObj, F, Shape, + npt, ) from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg @@ -294,7 +295,7 @@ def __repr__(self) -> str: def __len__(self) -> int: return len(self.values) - def _slice(self, slicer): + def _slice(self, slicer) -> ArrayLike: """return a slice of my values""" return self.values[slicer] @@ -343,7 +344,7 @@ def dtype(self) -> DtypeObj: def iget(self, i): return self.values[i] - def set_inplace(self, locs, values): + def set_inplace(self, locs, values) -> None: """ Modify block values in-place with new item value. @@ -562,13 +563,13 @@ def _downcast_2d(self) -> list[Block]: return [self.make_block(new_values)] @final - def astype(self, dtype, copy: bool = False, errors: str = "raise"): + def astype(self, dtype: DtypeObj, copy: bool = False, errors: str = "raise"): """ Coerce to the new dtype. Parameters ---------- - dtype : str, dtype convertible + dtype : np.dtype or ExtensionDtype copy : bool, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'raise' @@ -1128,7 +1129,6 @@ def interpolate( **kwargs, ) - interp_values = maybe_coerce_values(interp_values) nbs = [self.make_block_same_class(interp_values)] return self._maybe_downcast(nbs, downcast) @@ -1279,7 +1279,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return result_blocks - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): """ Return a list of unstacked blocks of self @@ -1435,7 +1441,7 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values): + def set_inplace(self, locs, values) -> None: # NB: This is a misnomer, is supposed to be inplace but is not, # see GH#33457 assert locs.tolist() == [0] @@ -1503,7 +1509,7 @@ def setitem(self, indexer, value): # https://github.com/pandas-dev/pandas/issues/24020 # Need a dedicated setitem until GH#24020 (type promotion in setitem # for extension arrays) is designed and implemented. - return self.astype(object).setitem(indexer, value) + return self.astype(_dtype_obj).setitem(indexer, value) if isinstance(indexer, tuple): # TODO(EA2D): not needed with 2D EAs @@ -1541,7 +1547,7 @@ def take_nd( return self.make_block_same_class(new_values, new_mgr_locs) - def _slice(self, slicer): + def _slice(self, slicer) -> ExtensionArray: """ Return a slice of my values. 
@@ -1552,7 +1558,7 @@ def _slice(self, slicer): Returns ------- - np.ndarray or ExtensionArray + ExtensionArray """ # return same dims as we currently have if not isinstance(slicer, tuple) and self.ndim == 2: @@ -1669,7 +1675,13 @@ def where(self, other, cond, errors="raise") -> list[Block]: return [self.make_block_same_class(result)] - def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + allow_fill: bool, + ): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require @@ -1724,7 +1736,7 @@ def is_view(self) -> bool: def setitem(self, indexer, value): if not self._can_hold_element(value): # TODO: general case needs casting logic. - return self.astype(object).setitem(indexer, value) + return self.astype(_dtype_obj).setitem(indexer, value) values = self.values if self.ndim > 1: @@ -1738,7 +1750,7 @@ def putmask(self, mask, new) -> list[Block]: mask = extract_bool_array(mask) if not self._can_hold_element(new): - return self.astype(object).putmask(mask, new) + return self.astype(_dtype_obj).putmask(mask, new) arr = self.values arr.T.putmask(mask, new) @@ -1796,7 +1808,7 @@ def fillna( # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. # TODO: don't special-case td64 - return self.astype(object).fillna(value, limit, inplace, downcast) + return self.astype(_dtype_obj).fillna(value, limit, inplace, downcast) values = self.values values = values if inplace else values.copy() @@ -1903,10 +1915,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: ------- values : np.ndarray or ExtensionArray """ - - # Note: the only test that needs extract_array here is one where we - # pass PandasDtype to Series.astype, then need to extract PandasArray here. - values = extract_array(values, extract_numpy=True) + # Caller is responsible for ensuring PandasArray is already extracted. if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6afb071f76f10..f34926f727c3f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -71,7 +71,6 @@ ensure_block_shape, extend_blocks, get_block_type, - maybe_coerce_values, new_block, ) from pandas.core.internals.ops import ( @@ -989,7 +988,6 @@ def iget(self, i: int) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) - values = maybe_coerce_values(values) nb = type(block)(values, placement=bp, ndim=1) return SingleBlockManager(nb, self.axes[1]) @@ -1229,7 +1227,7 @@ def idelete(self, indexer) -> BlockManager: nbs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - return type(self)(tuple(nbs), axes) + return type(self)(tuple(nbs), axes, verify_integrity=False) # ---------------------------------------------------------------- # Block-wise Operation @@ -2045,7 +2043,7 @@ def _merge_blocks( def _fast_count_smallints(arr: npt.NDArray[np.intp]): """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr.astype(np.int_, copy=False)) + counts = np.bincount(arr) nz = counts.nonzero()[0] # Note: list(zip(...) 
outperforms list(np.c_[nz, counts[nz]]) here,
     #  in one benchmark by a factor of 11
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py
index 35caeea9b9067..c938a018574f9 100644
--- a/pandas/core/internals/ops.py
+++ b/pandas/core/internals/ops.py
@@ -1,21 +1,27 @@
 from __future__ import annotations
 
-from collections import namedtuple
 from typing import (
     TYPE_CHECKING,
     Iterator,
+    NamedTuple,
 )
 
 from pandas._typing import ArrayLike
 
 if TYPE_CHECKING:
+    from pandas._libs.internals import BlockPlacement
+
     from pandas.core.internals.blocks import Block
     from pandas.core.internals.managers import BlockManager
 
-BlockPairInfo = namedtuple(
-    "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"]
-)
+class BlockPairInfo(NamedTuple):
+    lvals: ArrayLike
+    rvals: ArrayLike
+    locs: BlockPlacement
+    left_ea: bool
+    right_ea: bool
+    rblk: Block
 
 
 def _iter_block_pairs(
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 2d0aa45f0fad2..a95592c96d411 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -178,6 +178,8 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
 def _has_infs(result) -> bool:
     if isinstance(result, np.ndarray):
         if result.dtype == "f8" or result.dtype == "f4":
+            # Note: outside of a nanops-specific test, we always have
+            #  result.ndim == 1, so there is no risk of this ravel making a copy.
             return lib.has_infs(result.ravel("K"))
     try:
         return np.isinf(result).any()
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 303d0494db803..f132dd88d5147 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -4,6 +4,7 @@
 from datetime import timedelta
 from textwrap import dedent
 from typing import (
+    TYPE_CHECKING,
     Callable,
     Hashable,
     Literal,
@@ -24,8 +25,8 @@
     to_offset,
 )
 from pandas._typing import (
-    FrameOrSeries,
     IndexLabel,
+    NDFrameT,
     T,
     TimedeltaConvertibleTypes,
     TimestampConvertibleTypes,
@@ -90,6 +91,12 @@
     Tick,
 )
 
+if TYPE_CHECKING:
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
 _shared_docs_kwargs: dict[str, str] = {}
 
 
@@ -135,7 +142,7 @@ class Resampler(BaseGroupBy, PandasObject):
 
     def __init__(
         self,
-        obj: FrameOrSeries,
+        obj: DataFrame | Series,
         groupby: TimeGrouper,
         axis: int = 0,
         kind=None,
@@ -191,9 +198,9 @@ def __getattr__(self, attr: str):
 
     # error: Signature of "obj" incompatible with supertype "BaseGroupBy"
     @property
-    def obj(self) -> FrameOrSeries:  # type: ignore[override]
+    def obj(self) -> NDFrameT:  # type: ignore[override]
         # error: Incompatible return value type (got "Optional[Any]",
-        # expected "FrameOrSeries")
+        # expected "NDFrameT")
         return self.groupby.obj  # type: ignore[return-value]
 
     @property
@@ -213,7 +220,7 @@ def _from_selection(self) -> bool:
             self.groupby.key is not None or self.groupby.level is not None
         )
 
-    def _convert_obj(self, obj: FrameOrSeries) -> FrameOrSeries:
+    def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
         """
         Provide any conversions for the object in order to correctly handle.
@@ -1253,7 +1260,7 @@ def _get_binner_for_time(self): return super()._get_binner_for_time() return self.groupby._get_period_bins(self.ax) - def _convert_obj(self, obj: FrameOrSeries) -> FrameOrSeries: + def _convert_obj(self, obj: NDFrameT) -> NDFrameT: obj = super()._convert_obj(obj) if self._from_selection: @@ -1788,12 +1795,12 @@ def _get_period_bins(self, ax: PeriodIndex): def _take_new_index( - obj: FrameOrSeries, indexer: npt.NDArray[np.intp], new_index: Index, axis: int = 0 -) -> FrameOrSeries: + obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: int = 0 +) -> NDFrameT: if isinstance(obj, ABCSeries): new_values = algos.take_nd(obj._values, indexer) - # error: Incompatible return value type (got "Series", expected "FrameOrSeries") + # error: Incompatible return value type (got "Series", expected "NDFrameT") return obj._constructor( # type: ignore[return-value] new_values, index=new_index, name=obj.name ) @@ -1802,7 +1809,7 @@ def _take_new_index( raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) # error: Incompatible return value type - # (got "DataFrame", expected "FrameOrSeries") + # (got "DataFrame", expected "NDFrameT") return obj._constructor(new_mgr) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") @@ -1950,8 +1957,13 @@ def _insert_nat_bin( assert nat_count > 0 bins += nat_count bins = np.insert(bins, 0, nat_count) - binner = binner.insert(0, NaT) - labels = labels.insert(0, NaT) + + # Incompatible types in assignment (expression has type "Index", variable + # has type "PeriodIndex") + binner = binner.insert(0, NaT) # type: ignore[assignment] + # Incompatible types in assignment (expression has type "Index", variable + # has type "PeriodIndex") + labels = labels.insert(0, NaT) # type: ignore[assignment] return binner, bins, labels @@ -2032,13 +2044,13 @@ def _adjust_dates_anchored( def asfreq( - obj: FrameOrSeries, + obj: NDFrameT, freq, method=None, how=None, normalize: bool = False, fill_value=None, -) -> FrameOrSeries: +) -> NDFrameT: """ Utility frequency conversion method for Series/DataFrame. 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6dc95a19d8d53..fa09f003bc7b8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -26,7 +26,6 @@ from pandas._typing import ( ArrayLike, DtypeObj, - FrameOrSeries, IndexLabel, Suffixes, npt, @@ -2173,10 +2172,16 @@ def _factorize_keys( rizer = klass(max(len(lk), len(rk))) - llab = rizer.factorize(lk) - rlab = rizer.factorize(rk) - assert llab.dtype == np.intp, llab.dtype - assert rlab.dtype == np.intp, rlab.dtype + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + llab = rizer.factorize(lk) # type: ignore[arg-type] + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + rlab = rizer.factorize(rk) # type: ignore[arg-type] + assert llab.dtype == np.dtype(np.intp), llab.dtype + assert rlab.dtype == np.dtype(np.intp), rlab.dtype count = rizer.get_count() @@ -2259,7 +2264,7 @@ def _any(x) -> bool: return x is not None and com.any_not_none(*x) -def _validate_operand(obj: FrameOrSeries) -> DataFrame: +def _validate_operand(obj: DataFrame | Series) -> DataFrame: if isinstance(obj, ABCDataFrame): return obj elif isinstance(obj, ABCSeries): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b1ecd75c84f4b..4ea4c055c12b0 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -1,6 +1,8 @@ """ Quantilization functions and related stuff """ +from __future__ import annotations + from typing import ( Any, Callable, diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 63b8789f3f551..16fca2d0ff1b4 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -3,18 +3,22 @@ """ from __future__ import annotations +from typing import TYPE_CHECKING + import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ) +if TYPE_CHECKING: + from pandas.core.generic import NDFrame + -def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: +def preprocess_weights(obj: NDFrame, weights, axis: int) -> np.ndarray: """ Process and validate the `weights` argument to `NDFrame.sample` and `.GroupBy.sample`. diff --git a/pandas/core/series.py b/pandas/core/series.py index 3163133bf4255..a446df335f3f6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -449,7 +449,9 @@ def __init__( self.name = name self._set_axis(0, index, fastpath=True) - def _init_dict(self, data, index=None, dtype: Dtype | None = None): + def _init_dict( + self, data, index: Index | None = None, dtype: DtypeObj | None = None + ): """ Derive the "_mgr" and "index" attributes of a new Series from a dictionary input. @@ -458,9 +460,9 @@ def _init_dict(self, data, index=None, dtype: Dtype | None = None): ---------- data : dict or dict-like Data used to populate the new Series. - index : Index or index-like, default None + index : Index or None, default None Index for the new Series: if None, use dict keys. - dtype : dtype, default None + dtype : np.dtype, ExtensionDtype, or None, default None The dtype for the new Series: if None, infer from data. 
Returns @@ -468,6 +470,8 @@ def _init_dict(self, data, index=None, dtype: Dtype | None = None): _data : BlockManager for the new Series index : index for the new Series """ + keys: Index | tuple + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: @@ -2790,13 +2794,14 @@ def __rmatmul__(self, other): return self.dot(np.transpose(other)) @doc(base.IndexOpsMixin.searchsorted, klass="Series") - def searchsorted( + # Signature of "searchsorted" incompatible with supertype "IndexOpsMixin" + def searchsorted( # type: ignore[override] self, - value: NumpyValueArrayLike, + value: NumpyValueArrayLike | ExtensionArray, side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: - return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) + return base.IndexOpsMixin.searchsorted(self, value, side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index befa67350e182..ccb51a0ea2132 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Callable, DefaultDict, + Hashable, Iterable, Sequence, ) @@ -576,7 +577,7 @@ def get_flattened_list( def get_indexer_dict( label_list: list[np.ndarray], keys: list[Index] -) -> dict[str | tuple, np.ndarray]: +) -> dict[Hashable, npt.NDArray[np.intp]]: """ Returns ------- diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index cbdd02aad1dd0..ca100a60a81b6 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -1,6 +1,7 @@ """ timedelta support tools """ +from __future__ import annotations import numpy as np diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 7b58af87fb1d8..79102c2bc82ee 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -12,12 +12,12 @@ import pandas._libs.window.aggregations as window_aggregations from pandas._typing import ( Axis, - FrameOrSeries, TimedeltaConvertibleTypes, ) if TYPE_CHECKING: from pandas import DataFrame, Series + from pandas.core.generic import NDFrame from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -93,7 +93,7 @@ def get_center_of_mass( def _calculate_deltas( - times: str | np.ndarray | FrameOrSeries | None, + times: str | np.ndarray | NDFrame | None, halflife: float | TimedeltaConvertibleTypes | None, ) -> np.ndarray: """ @@ -113,9 +113,9 @@ def _calculate_deltas( np.ndarray Diff of the times divided by the half-life """ - # error: Item "str" of "Union[str, ndarray, FrameOrSeries, None]" has no + # error: Item "str" of "Union[str, ndarray, NDFrameT, None]" has no # attribute "view" - # error: Item "None" of "Union[str, ndarray, FrameOrSeries, None]" has no + # error: Item "None" of "Union[str, ndarray, NDFrameT, None]" has no # attribute "view" _times = np.asarray( times.view(np.int64), dtype=np.float64 # type: ignore[union-attr] @@ -281,7 +281,7 @@ class ExponentialMovingWindow(BaseWindow): def __init__( self, - obj: FrameOrSeries, + obj: NDFrame, com: float | None = None, span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, @@ -290,7 +290,7 @@ def __init__( adjust: bool = True, ignore_na: bool = False, axis: Axis = 0, - times: str | np.ndarray | FrameOrSeries | None = None, + times: str | np.ndarray | NDFrame | None = None, method: str = "single", *, 
selection=None, @@ -329,7 +329,7 @@ def __init__( if not is_datetime64_ns_dtype(self.times): raise ValueError("times must be datetime64[ns] dtype.") # error: Argument 1 to "len" has incompatible type "Union[str, ndarray, - # FrameOrSeries, None]"; expected "Sized" + # NDFrameT, None]"; expected "Sized" if len(self.times) != len(obj): # type: ignore[arg-type] raise ValueError("times must be the same length as the object.") if not isinstance(self.halflife, (str, datetime.timedelta)): @@ -744,7 +744,7 @@ def _get_window_indexer(self) -> GroupbyIndexer: class OnlineExponentialMovingWindow(ExponentialMovingWindow): def __init__( self, - obj: FrameOrSeries, + obj: NDFrame, com: float | None = None, span: float | None = None, halflife: float | TimedeltaConvertibleTypes | None = None, @@ -753,7 +753,7 @@ def __init__( adjust: bool = True, ignore_na: bool = False, axis: Axis = 0, - times: str | np.ndarray | FrameOrSeries | None = None, + times: str | np.ndarray | NDFrame | None = None, engine: str = "numba", engine_kwargs: dict[str, bool] | None = None, *, diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 2f460267bfec4..03f16259c66a2 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -9,12 +9,12 @@ from pandas._typing import ( Axis, - FrameOrSeries, WindowingRankType, ) if TYPE_CHECKING: from pandas import DataFrame, Series + from pandas.core.generic import NDFrame from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -101,7 +101,7 @@ class Expanding(RollingAndExpandingMixin): def __init__( self, - obj: FrameOrSeries, + obj: NDFrame, min_periods: int = 1, center=None, axis: Axis = 0, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 503a884578e8b..ea40e8d816f45 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -27,7 +27,7 @@ from pandas._typing import ( ArrayLike, Axis, - FrameOrSeries, + NDFrameT, WindowingRankType, ) from pandas.compat._optional import import_optional_dependency @@ -49,6 +49,7 @@ ) from pandas.core.dtypes.missing import notna +from pandas.core._numba import executor from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply from pandas.core.arrays import ExtensionArray @@ -103,6 +104,7 @@ DataFrame, Series, ) + from pandas.core.generic import NDFrame from pandas.core.groupby.ops import BaseGrouper from pandas.core.internals import Block # noqa:F401 @@ -116,7 +118,7 @@ class BaseWindow(SelectionMixin): def __init__( self, - obj: FrameOrSeries, + obj: NDFrame, window=None, min_periods: int | None = None, center: bool = False, @@ -225,7 +227,7 @@ def _validate(self) -> None: if self.method not in ["table", "single"]: raise ValueError("method must be 'table' or 'single") - def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: + def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. 
""" @@ -576,6 +578,44 @@ def calc(x): else: return self._apply_tablewise(homogeneous_func, name) + def _numba_apply( + self, + func: Callable[..., Any], + numba_cache_key_str: str, + engine_kwargs: dict[str, bool] | None = None, + ): + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + obj = self._create_data(self._selected_obj) + if self.axis == 1: + obj = obj.T + values = self._prep_values(obj.to_numpy()) + if values.ndim == 1: + values = values.reshape(-1, 1) + start, end = window_indexer.get_window_bounds( + num_values=len(values), + min_periods=min_periods, + center=self.center, + closed=self.closed, + ) + aggregator = executor.generate_shared_aggregator( + func, engine_kwargs, numba_cache_key_str + ) + result = aggregator(values, start, end, min_periods) + NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator + result = result.T if self.axis == 1 else result + if obj.ndim == 1: + result = result.squeeze() + out = obj._constructor(result, index=obj.index, name=obj.name) + return out + else: + out = obj._constructor(result, index=obj.index, columns=obj.columns) + return self._resolve_output(out, obj) + def aggregate(self, func, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: @@ -596,7 +636,7 @@ class BaseWindowGroupby(BaseWindow): def __init__( self, - obj: FrameOrSeries, + obj: DataFrame | Series, *args, _grouper: BaseGrouper, _as_index: bool = True, @@ -621,7 +661,7 @@ def _apply( numba_cache_key: tuple[Callable, str] | None = None, numba_args: tuple[Any, ...] = (), **kwargs, - ) -> FrameOrSeries: + ) -> DataFrame | Series: result = super()._apply( func, name, @@ -756,7 +796,7 @@ def _apply_pairwise( result.index = result_index return result - def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: + def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. 
""" @@ -1331,15 +1371,16 @@ def mean( if maybe_use_numba(engine): if self.method == "table": func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) else: - func = np.nanmean + from pandas.core._numba.kernels import sliding_mean - return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, - ) + return self._numba_apply(sliding_mean, "rolling_mean", engine_kwargs) window_func = window_aggregations.roll_mean return self._apply(window_func, name="mean", **kwargs) diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py index e69de29bb2d1d..bd3ddc09393d8 100644 --- a/pandas/io/__init__.py +++ b/pandas/io/__init__.py @@ -0,0 +1,12 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # import modules that have public classes/functions + from pandas.io import ( + formats, + json, + stata, + ) + + # and mark only those modules as public + __all__ = ["formats", "json", "stata"] diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 947deb39ae064..0417529999890 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -116,7 +116,7 @@ def copy_osx_pbcopy(text): def paste_osx_pbcopy(): p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) - stdout, stderr = p.communicate() + stdout = p.communicate()[0] return stdout.decode(ENCODING) return copy_osx_pbcopy, paste_osx_pbcopy @@ -194,7 +194,7 @@ def paste_xclip(primary=False): stderr=subprocess.PIPE, close_fds=True, ) - stdout, stderr = p.communicate() + stdout = p.communicate()[0] # Intentionally ignore extraneous output on stderr when clipboard is empty return stdout.decode(ENCODING) @@ -222,7 +222,7 @@ def paste_xsel(primary=False): p = subprocess.Popen( ["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True ) - stdout, stderr = p.communicate() + stdout = p.communicate()[0] return stdout.decode(ENCODING) return copy_xsel, paste_xsel @@ -250,7 +250,7 @@ def paste_klipper(): stdout=subprocess.PIPE, close_fds=True, ) - stdout, stderr = p.communicate() + stdout = p.communicate()[0] # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 # TODO: https://github.com/asweigart/pyperclip/issues/43 @@ -493,7 +493,7 @@ def paste_wsl(): stderr=subprocess.PIPE, close_fds=True, ) - stdout, stderr = p.communicate() + stdout = p.communicate()[0] # WSL appends "\r\n" to the contents. 
return stdout[:-2].decode(ENCODING) diff --git a/pandas/io/formats/__init__.py b/pandas/io/formats/__init__.py index e69de29bb2d1d..8a3486a4d71fe 100644 --- a/pandas/io/formats/__init__.py +++ b/pandas/io/formats/__init__.py @@ -0,0 +1,8 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # import modules that have public classes/functions + from pandas.io.formats import style + + # and mark only those modules as public + __all__ = ["style"] diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d636838d21d0e..cfda2911db73f 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -876,7 +876,7 @@ def space_format(x, y): need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) str_columns = [ [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] - for i, (col, x) in enumerate(zip(columns, fmt_columns)) + for i, x in enumerate(fmt_columns) ] # self.str_columns = str_columns return str_columns diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index a454980138369..6e7a7593e56e0 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -22,7 +22,6 @@ from pandas._typing import ( Axis, FilePathOrBuffer, - FrameOrSeries, IndexLabel, Level, Scalar, @@ -3190,10 +3189,10 @@ def pipe(self, func: Callable, *args, **kwargs): def _validate_apply_axis_arg( - arg: FrameOrSeries | Sequence | np.ndarray, + arg: NDFrame | Sequence | np.ndarray, arg_name: str, dtype: Any | None, - data: FrameOrSeries, + data: NDFrame, ) -> np.ndarray: """ For the apply-type methods, ``axis=None`` creates ``data`` as DataFrame, and for @@ -3250,7 +3249,7 @@ def _background_gradient( text_color_threshold: float = 0.408, vmin: float | None = None, vmax: float | None = None, - gmap: Sequence | np.ndarray | FrameOrSeries | None = None, + gmap: Sequence | np.ndarray | DataFrame | Series | None = None, text_only: bool = False, ): """ @@ -3310,10 +3309,10 @@ def css(rgba, text_only) -> str: def _highlight_between( - data: FrameOrSeries, + data: NDFrame, props: str, - left: Scalar | Sequence | np.ndarray | FrameOrSeries | None = None, - right: Scalar | Sequence | np.ndarray | FrameOrSeries | None = None, + left: Scalar | Sequence | np.ndarray | NDFrame | None = None, + right: Scalar | Sequence | np.ndarray | NDFrame | None = None, inclusive: bool | str = True, ) -> np.ndarray: """ @@ -3357,7 +3356,7 @@ def _highlight_between( return np.where(g_left & l_right, props, "") -def _highlight_value(data: FrameOrSeries, op: str, props: str) -> np.ndarray: +def _highlight_value(data: DataFrame | Series, op: str, props: str) -> np.ndarray: """ Return an array of css strings based on the condition of values matching an op. 
""" @@ -3368,7 +3367,7 @@ def _highlight_value(data: FrameOrSeries, op: str, props: str) -> np.ndarray: def _bar( - data: FrameOrSeries, + data: NDFrame, align: str | float | int | Callable, colors: list[str], width: float, diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index b0e729bb926bf..cfe5b2c2bdfab 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -294,7 +294,7 @@ def _translate( d.update({"table_attributes": table_attr}) if self.tooltips: - d = self.tooltips._translate(self.data, self.uuid, d) + d = self.tooltips._translate(self, d) return d @@ -1508,7 +1508,7 @@ def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): }, ] - def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict): + def _translate(self, styler: StylerRenderer, d: dict): """ Mutate the render dictionary to allow for tooltips: @@ -1529,21 +1529,23 @@ def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict): ------- render_dict : Dict """ - self.tt_data = self.tt_data.reindex_like(styler_data) - + self.tt_data = self.tt_data.reindex_like(styler.data) if self.tt_data.empty: return d name = self.class_name - mask = (self.tt_data.isna()) | (self.tt_data.eq("")) # empty string = no ttip self.table_styles = [ style for sublist in [ - self._pseudo_css(uuid, name, i, j, str(self.tt_data.iloc[i, j])) + self._pseudo_css(styler.uuid, name, i, j, str(self.tt_data.iloc[i, j])) for i in range(len(self.tt_data.index)) for j in range(len(self.tt_data.columns)) - if not mask.iloc[i, j] + if not ( + mask.iloc[i, j] + or i in styler.hidden_rows + or j in styler.hidden_columns + ) ] for style in sublist ] diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index f5ba8c6b53335..ea7d1dfa1645e 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -264,8 +264,6 @@ def build_elems(self) -> None: def write_output(self) -> str | None: xml_doc = self.build_tree() - out_str: str | None - if self.path_or_buffer is not None: with get_handle( self.path_or_buffer, diff --git a/pandas/io/html.py b/pandas/io/html.py index 2947b22f85d61..cbf10798a538a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -632,7 +632,6 @@ def _build_xpath_expr(attrs) -> str: _re_namespace = {"re": "http://exslt.org/regular-expressions"} -_valid_schemes = "http", "file", "ftp" class _LxmlFrameParser(_HtmlFrameParser): diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index e7a24fcee2b34..75fd950cd6076 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -15,7 +15,6 @@ import pandas._libs.json as json from pandas._typing import ( DtypeObj, - FrameOrSeries, JSONSerializable, ) @@ -36,6 +35,7 @@ import pandas.core.common as com if TYPE_CHECKING: + from pandas import Series from pandas.core.indexes.multi import MultiIndex loads = json.loads @@ -206,7 +206,7 @@ def convert_json_field_to_pandas_type(field): def build_table_schema( - data: FrameOrSeries, + data: DataFrame | Series, index: bool = True, primary_key: bool | None = None, version: bool = True, diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 033cd88da9687..5b1b178c4f610 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -91,12 +91,20 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: The processed DataFrame. 
""" num_cols = len(frame.columns) + multi_index_named = True if self.header is None: if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] elif self.header is None: self.names = range(num_cols) + if len(self.names) != num_cols: + # usecols is passed through to pyarrow, we only handle index col here + # The only way self.names is not the same length as number of cols is + # if we have int index_col. We should just pad the names(they will get + # removed anyways) to expected length then. + self.names = list(range(num_cols - len(self.names))) + self.names + multi_index_named = False frame.columns = self.names # we only need the frame not the names frame.columns, frame = self._do_date_conversions(frame.columns, frame) @@ -104,7 +112,14 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: for i, item in enumerate(self.index_col): if is_integer(item): self.index_col[i] = frame.columns[item] + else: + # String case + if item not in frame.columns: + raise ValueError(f"Index {item} invalid") frame.set_index(self.index_col, drop=True, inplace=True) + # Clear names if headerless and no name given + if self.header is None and not multi_index_named: + frame.index.names = [None] * len(frame.index.names) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d7a90511e3c73..f85128ea0ca4a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -39,7 +39,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - FrameOrSeries, Shape, ) from pandas.compat._optional import import_optional_dependency @@ -262,7 +261,7 @@ def _tables(): def to_hdf( path_or_buf, key: str, - value: FrameOrSeries, + value: DataFrame | Series, mode: str = "a", complevel: int | None = None, complib: str | None = None, @@ -275,7 +274,7 @@ def to_hdf( data_columns: bool | list[str] | None = None, errors: str = "strict", encoding: str = "UTF-8", -): +) -> None: """store this object, close it if we opened it""" if append: f = lambda store: store.append( @@ -1070,7 +1069,7 @@ def func(_start, _stop, _where): def put( self, key: str, - value: FrameOrSeries, + value: DataFrame | Series, format=None, index=True, append=False, @@ -1195,7 +1194,7 @@ def remove(self, key: str, where=None, start=None, stop=None): def append( self, key: str, - value: FrameOrSeries, + value: DataFrame | Series, format=None, axes=None, index=True, @@ -1637,7 +1636,7 @@ def _create_storer( self, group, format=None, - value: FrameOrSeries | None = None, + value: DataFrame | Series | None = None, encoding: str = "UTF-8", errors: str = "strict", ) -> GenericFixed | Table: @@ -1728,7 +1727,7 @@ def error(t): def _write_to_group( self, key: str, - value: FrameOrSeries, + value: DataFrame | Series, format, axes=None, index=True, @@ -1745,7 +1744,7 @@ def _write_to_group( encoding=None, errors: str = "strict", track_times: bool = True, - ): + ) -> None: # we don't want to store a table node at all if our object is 0-len # as there are not dtypes if getattr(value, "empty", None) and (format == "table" or append): @@ -3016,7 +3015,9 @@ def write_array_empty(self, key: str, value: ArrayLike): node._v_attrs.value_type = str(value.dtype) node._v_attrs.shape = value.shape - def write_array(self, key: str, obj: FrameOrSeries, items: Index | None = None): + def write_array( + self, key: str, obj: DataFrame | Series, items: Index | None = None + ) -> None: # TODO: we only have a few tests that get here, the only EA # 
that gets passed is DatetimeArray, and we never have # both self._filters and EA @@ -3469,14 +3470,9 @@ def write_metadata(self, key: str, values: np.ndarray): key : str values : ndarray """ - # error: Incompatible types in assignment (expression has type - # "Series", variable has type "ndarray") - values = Series(values) # type: ignore[assignment] - # error: Value of type variable "FrameOrSeries" of "put" of "HDFStore" - # cannot be "ndarray" - self.parent.put( # type: ignore[type-var] + self.parent.put( self._get_metadata_path(key), - values, + Series(values), format="table", encoding=self.encoding, errors=self.errors, diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 8b0055a522e25..bc3436861f1a8 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -297,9 +297,7 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] if self.names: - dicts = [ - {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts - ] + dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] return dicts @@ -478,9 +476,7 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] if self.names: - dicts = [ - {nm: v for nm, (k, v) in zip(self.names, d.items())} for d in dicts - ] + dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] return dicts diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 8b4cf158ac827..1308a83f61443 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -1,7 +1,9 @@ from __future__ import annotations -from collections import namedtuple -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + NamedTuple, +) import warnings from matplotlib.artist import setp @@ -28,6 +30,7 @@ if TYPE_CHECKING: from matplotlib.axes import Axes + from matplotlib.lines import Line2D class BoxPlot(LinePlot): @@ -35,8 +38,11 @@ class BoxPlot(LinePlot): _layout_type = "horizontal" _valid_return_types = (None, "axes", "dict", "both") - # namedtuple to hold results - BP = namedtuple("BP", ["ax", "lines"]) + + class BP(NamedTuple): + # namedtuple to hold results + ax: Axes + lines: dict[str, list[Line2D]] def __init__(self, data, return_type="axes", **kwargs): # Do not call LinePlot.__init__ which may fill nan diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ef0a9337234ce..ec20bc49c8a4b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -30,21 +30,18 @@ class TestPDApi(Base): ignored = ["tests", "locale", "conftest"] # top-level sub-packages - lib = [ + public_lib = [ "api", "arrays", - "compat", - "core", - "errors", - "pandas", - "plotting", + "options", "test", "testing", - "tseries", - "util", - "options", + "errors", + "plotting", "io", + "tseries", ] + private_lib = ["compat", "core", "pandas", "util"] # these are already deprecated; awaiting removal deprecated_modules: list[str] = ["np", "datetime"] @@ -204,7 +201,8 @@ class TestPDApi(Base): def test_api(self): checkthese = ( - self.lib + self.public_lib + + self.private_lib + self.misc + self.modules + self.classes @@ -217,6 +215,26 @@ def test_api(self): ) self.check(namespace=pd, expected=checkthese, ignored=self.ignored) + def test_api_all(self): + expected = set( + self.public_lib + + self.misc + + self.modules + + self.classes + + self.funcs + + self.funcs_option + + self.funcs_read 
+ + self.funcs_json + + self.funcs_to + ) - set(self.deprecated_classes) + actual = set(pd.__all__) + + extraneous = actual - expected + assert not extraneous + + missing = expected - actual + assert not missing + def test_depr(self): deprecated_list = ( self.deprecated_modules diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 62983b5327a26..f8c945bb496a8 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1087,12 +1087,16 @@ def test_agg_multiple_mixed_no_warning(): index=["min", "sum"], ) # sorted index - with tm.assert_produces_warning(None): + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted @@ -1201,7 +1205,10 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - result = df.agg(["sum"]) + with tm.assert_produces_warning( + FutureWarning, match=r"\['D'\] did not aggregate successfully" + ): + result = df.agg(["sum"]) expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) @@ -1433,7 +1440,10 @@ def foo(s): return s.sum() / 2 aggs = ["sum", foo, "count", "min"] - result = df.agg(aggs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['item'\] did not aggregate successfully" + ): + result = df.agg(aggs) expected = DataFrame( { "item": ["123456", np.nan, 6, "1"], @@ -1452,3 +1462,20 @@ def test_apply_getitem_axis_1(): result = df[["a", "a"]].apply(lambda x: x[0] + x[1], axis=1) expected = Series([0, 2, 4]) tm.assert_series_equal(result, expected) + + +def test_nuisance_depr_passes_through_warnings(): + # GH 43740 + # DataFrame.agg with list-likes may emit warnings for both individual + # args and for entire columns, but we only want to emit once. We + # catch and suppress the warnings for individual args, but need to make + # sure if some other warnings were raised, they get passed through to + # the user. 
+ + def foo(x): + warnings.warn("Hello, World!") + return x.sum() + + df = DataFrame({"a": [1, 2, 3]}) + with tm.assert_produces_warning(UserWarning, match="Hello, World!"): + df.agg([foo]) diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 47173d14c543d..ade8df6fbfc0e 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -160,7 +160,6 @@ def test_transform_bad_dtype(op, frame_or_series, request): @pytest.mark.parametrize("op", frame_kernels_raise) def test_transform_partial_failure_typeerror(op): # GH 35964 - if op == "rank": pytest.skip("GH 40418: rank does not raise a TypeError") @@ -168,25 +167,33 @@ def test_transform_partial_failure_typeerror(op): df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]}) expected = df[["B"]].transform([op]) - result = df.transform([op]) + match = r"\['A'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = df.transform([op]) tm.assert_equal(result, expected) expected = df[["B"]].transform({"B": op}) - result = df.transform({"A": op, "B": op}) + match = r"\['A'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = df.transform({"A": op, "B": op}) tm.assert_equal(result, expected) expected = df[["B"]].transform({"B": [op]}) - result = df.transform({"A": [op], "B": [op]}) + match = r"\['A'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = df.transform({"A": [op], "B": [op]}) tm.assert_equal(result, expected) expected = df.transform({"A": ["shift"], "B": [op]}) - result = df.transform({"A": [op, "shift"], "B": [op]}) + match = rf"\['{op}'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = df.transform({"A": [op, "shift"], "B": [op]}) tm.assert_equal(result, expected) def test_transform_partial_failure_valueerror(): # GH 40211 - match = ".*did not transform successfully and did not raise a TypeError" + match = ".*did not transform successfully" def op(x): if np.sum(np.sum(x)) < 10: @@ -211,7 +218,7 @@ def op(x): tm.assert_equal(result, expected) expected = df.transform({"A": ["shift"], "B": [op]}) - with tm.assert_produces_warning(FutureWarning, match=match, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, match=match): result = df.transform({"A": [op, "shift"], "B": [op]}) tm.assert_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 2af340f0c1bb9..18f96f9c61ab8 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -92,6 +92,21 @@ def test_apply_args(): assert isinstance(result[0], list) +@pytest.mark.parametrize( + "args, kwargs, increment", + [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)], +) +def test_agg_args(args, kwargs, increment): + # GH 43357 + def f(x, a=0, b=0, c=0): + return x + a + 10 * b + 100 * c + + s = Series([1, 2]) + result = s.agg(f, 0, *args, **kwargs) + expected = s + increment + tm.assert_series_equal(result, expected) + + def test_series_map_box_timestamps(): # GH#2689, GH#2627 ser = Series(pd.date_range("1/1/2000", periods=10)) @@ -276,25 +291,35 @@ def test_transform_partial_failure(op, request): ser = Series(3 * [object]) expected = ser.transform(["shift"]) - result = ser.transform([op, "shift"]) + match = rf"\['{op}'\] did not 
transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = ser.transform([op, "shift"]) tm.assert_equal(result, expected) expected = ser.transform({"B": "shift"}) - result = ser.transform({"A": op, "B": "shift"}) + match = r"\['A'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = ser.transform({"A": op, "B": "shift"}) tm.assert_equal(result, expected) expected = ser.transform({"B": ["shift"]}) - result = ser.transform({"A": [op], "B": ["shift"]}) + match = r"\['A'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = ser.transform({"A": [op], "B": ["shift"]}) tm.assert_equal(result, expected) - expected = ser.transform({"A": ["shift"], "B": [op]}) - result = ser.transform({"A": [op, "shift"], "B": [op]}) + match = r"\['B'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + expected = ser.transform({"A": ["shift"], "B": [op]}) + match = rf"\['{op}'\] did not transform successfully" + with tm.assert_produces_warning(FutureWarning, match=match): + result = ser.transform({"A": [op, "shift"], "B": [op]}) tm.assert_equal(result, expected) def test_transform_partial_failure_valueerror(): # GH 40211 - match = ".*did not transform successfully and did not raise a TypeError" + match = ".*did not transform successfully" def noop(x): return x @@ -320,7 +345,7 @@ def raising_op(_): tm.assert_equal(result, expected) expected = ser.transform({"A": [noop], "B": [noop]}) - with tm.assert_produces_warning(FutureWarning, match=match, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, match=match): result = ser.transform({"A": [noop, raising_op], "B": [noop]}) tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 6b8dc8821d4fa..e45dbb393a8de 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -85,6 +85,22 @@ def test_to_coo(self, colnames): expected = scipy.sparse.coo_matrix(np.asarray(df)) assert (result != expected).nnz == 0 + @pytest.mark.parametrize("fill_value", [1, np.nan]) + @td.skip_if_no_scipy + def test_to_coo_nonzero_fill_val_raises(self, fill_value): + df = pd.DataFrame( + { + "A": SparseArray( + [fill_value, fill_value, fill_value, 2], fill_value=fill_value + ), + "B": SparseArray( + [fill_value, 2, fill_value, fill_value], fill_value=fill_value + ), + } + ) + with pytest.raises(ValueError, match="fill value must be 0"): + df.sparse.to_coo() + def test_to_dense(self): df = pd.DataFrame( { diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 34ee68dbbbf18..0b00ff2dbd861 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self): expected = SparseArray([0, 2]) tm.assert_sp_array_equal(result, expected) - def test_getslice(self): - result = self.arr[:-3] - exp = SparseArray(self.arr.to_dense()[:-3]) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[-4:] - exp = SparseArray(self.arr.to_dense()[-4:]) - tm.assert_sp_array_equal(result, exp) - - # two corner cases from Series - result = self.arr[-12:] - exp = SparseArray(self.arr) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[:-12] - exp = SparseArray(self.arr.to_dense()[:0]) - tm.assert_sp_array_equal(result, exp) 
+ @pytest.mark.parametrize( + "slc", + [ + np.s_[:], + np.s_[1:10], + np.s_[1:100], + np.s_[10:1], + np.s_[:-3], + np.s_[-5:-4], + np.s_[:-12], + np.s_[-12:], + np.s_[2:], + np.s_[2::3], + np.s_[::2], + np.s_[::-1], + np.s_[::-2], + np.s_[1:6:2], + np.s_[:-6:-2], + ], + ) + @pytest.mark.parametrize( + "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] + ) + def test_getslice(self, slc, as_dense): + as_dense = np.array(as_dense) + arr = SparseArray(as_dense) + + result = arr[slc] + expected = SparseArray(as_dense[slc]) + + tm.assert_sp_array_equal(result, expected) def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e963cfec71b5b..7be776819e399 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -336,6 +336,19 @@ def test_merge(self, data, na_value): # Fails creating expected (key column becomes a PandasDtype because) super().test_merge(data, na_value) + @pytest.mark.parametrize( + "in_frame", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail(reason="PandasArray inconsistently extracted"), + ), + ], + ) + def test_concat(self, data, in_frame): + super().test_concat(data, in_frame) + class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): @skip_nested diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 6fdf5d806ac6b..8716a181120f6 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -53,14 +53,18 @@ def test_error_multi_columns(input_subset, error_message): df.explode(input_subset) -def test_basic(): +@pytest.mark.parametrize( + "scalar", + ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")], +) +def test_basic(scalar): df = pd.DataFrame( - {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) - result = df.explode("A") + result = df.explode(scalar) expected = pd.DataFrame( { - "A": pd.Series( + scalar: pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object ), "B": 1, diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4bda0e6ef9872..2c798e543bf6b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -225,6 +225,56 @@ def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func): gb.agg(reduction_func, axis=1) +@pytest.mark.parametrize( + "func, expected, dtype, result_dtype_dict", + [ + ("sum", [5, 7, 9], "int64", {}), + ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), + ("sum", [5, 7, 9], "Int64", {"j": "int64"}), + ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), + ], +) +def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): + # GH#43209 + df = DataFrame( + [[1, 2, 3, 4, 5, 6]] * 3, + columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), + ).astype({("a", "j"): dtype, ("b", "j"): dtype}) + result = df.groupby(level=1, axis=1).agg(func) + expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( + result_dtype_dict + ) + tm.assert_frame_equal(result, 
expected) + + +@pytest.mark.parametrize( + "func, expected_data, result_dtype_dict", + [ + ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), + # std should ideally return Int64 / Float64 #43330 + ("std", [[2 ** 0.5] * 2] * 3, "float64"), + ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), + ], +) +def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): + # GH#43209 + df = DataFrame( + np.arange(12).reshape(3, 4), + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20, 10, 20], name="x"), + dtype="int64", + ).astype({10: "Int64"}) + result = df.groupby("x", axis=1).agg(func) + expected = DataFrame( + data=expected_data, + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20], name="x"), + ).astype(result_dtype_dict) + tm.assert_frame_equal(result, expected) + + def test_aggregate_item_by_item(df): grouped = df.groupby("A") @@ -339,8 +389,14 @@ def test_multiple_functions_tuples_and_non_tuples(df): expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) - result = df.groupby("A").agg(funcs) - expected = df.groupby("A").agg(ex_funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['B'\] did not aggregate successfully" + ): + result = df.groupby("A").agg(funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['B'\] did not aggregate successfully" + ): + expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) @@ -861,6 +917,16 @@ def test_groupby_aggregate_empty_key_empty_return(): tm.assert_frame_equal(result, expected) +def test_groupby_aggregate_empty_with_multiindex_frame(): + # GH 39178 + df = DataFrame(columns=["a", "b", "c"]) + result = df.groupby(["a", "b"]).agg(d=("c", list)) + expected = DataFrame( + columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"]) + ) + tm.assert_frame_equal(result, expected) + + def test_grouby_agg_loses_results_with_as_index_false_relabel(): # GH 32240: When the aggregate function relabels column names and # as_index=False is specified, the results are dropped. 
@@ -1291,3 +1357,23 @@ def test_group_mean_datetime64_nat(input_data, expected_output): result = data.groupby([0, 0, 0]).mean() tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])] +) +def test_groupby_complex(func, output): + # GH#43701 + data = Series(np.arange(20).reshape(10, 2).dot([1, 2j])) + result = data.groupby(data.index % 2).agg(func) + expected = Series(output) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max", "var"]) +def test_groupby_complex_raises(func): + # GH#43701 + data = Series(np.arange(20).reshape(10, 2).dot([1, 2j])) + msg = "No matching signature found" + with pytest.raises(TypeError, match=msg): + data.groupby(data.index % 2).agg(func) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 79990deed261d..66b968e01eef1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -45,13 +45,15 @@ def peak_to_peak(arr): return arr.max() - arr.min() with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False + FutureWarning, + match=r"\['key2'\] did not aggregate successfully", ): expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False + FutureWarning, + match=r"\['key2'\] did not aggregate successfully", ): result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b9a6730996a02..3de6af8eef694 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -583,7 +583,10 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - agged = grouped.agg(funcs) + with tm.assert_produces_warning( + FutureWarning, match=r"\['C'\] did not aggregate successfully" + ): + agged = grouped.agg(funcs) expected = pd.concat( [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], keys=["D", "E", "F"], diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 527b93a28359c..efb0b82f58e97 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -688,7 +688,7 @@ def test_groupby_empty(self): ) tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")) + gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) ) assert gr.grouper.group_info[2] == 0 diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py index 654f5a89f1828..9cd582925ff79 100644 --- a/pandas/tests/indexes/base_class/test_indexing.py +++ b/pandas/tests/indexes/base_class/test_indexing.py @@ -1,7 +1,11 @@ import numpy as np import pytest -from pandas import Index +import pandas as pd +from pandas import ( + Index, + NaT, +) import pandas._testing as tm @@ -36,3 +40,39 @@ def test_get_indexer_non_unique_dtype_mismatch(self): indexes, missing = Index(["A", "B"]).get_indexer_non_unique(Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) + + +class TestGetLoc: + @pytest.mark.slow # to_flat_index takes a while + def 
test_get_loc_tuple_monotonic_above_size_cutoff(self): + # Go through the libindex path for which using + # _bin_search vs ndarray.searchsorted makes a difference + + lev = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + dti = pd.date_range("2016-01-01", periods=100) + + mi = pd.MultiIndex.from_product([lev, range(10 ** 3), dti]) + oidx = mi.to_flat_index() + + loc = len(oidx) // 2 + tup = oidx[loc] + + res = oidx.get_loc(tup) + assert res == loc + + def test_get_loc_nan_object_dtype_nonmonotonic_nonunique(self): + # case that goes through _maybe_get_bool_indexer + idx = Index(["foo", np.nan, None, "foo", 1.0, None], dtype=object) + + # we dont raise KeyError on nan + res = idx.get_loc(np.nan) + assert res == 1 + + # we only match on None, not on np.nan + res = idx.get_loc(None) + expected = np.array([False, False, True, False, False, True]) + tm.assert_numpy_array_equal(res, expected) + + # we don't match at all on mismatched NA + with pytest.raises(KeyError, match="NaT"): + idx.get_loc(NaT) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index edcde3d7cae1d..9cc1205310ea7 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -83,7 +83,7 @@ def test_numpy_ufuncs_other(index, func, request): request.node.add_marker(mark) if func in (np.isfinite, np.isinf, np.isnan): - # numpy 1.18 changed isinf and isnan to not raise on dt64/tfd64 + # numpy 1.18 changed isinf and isnan to not raise on dt64/td64 result = func(index) assert isinstance(result, np.ndarray) else: diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 479d048d35fbd..9fa873a212cbd 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -15,7 +15,6 @@ class TestMultiIndexBasic: def test_multiindex_perf_warn(self): - df = DataFrame( { "jim": [0, 0, 1, 1], @@ -47,7 +46,6 @@ def test_indexing_over_hashtable_size_cutoff(self): _index._SIZE_CUTOFF = old_cutoff def test_multi_nan_indexing(self): - # GH 3588 df = DataFrame( { @@ -70,6 +68,28 @@ def test_multi_nan_indexing(self): ) tm.assert_frame_equal(result, expected) + def test_exclusive_nat_column_indexing(self): + # GH 38025 + # test multi indexing when one column exclusively contains NaT values + df = DataFrame( + { + "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + df = df.set_index(["a", "b"]) + expected = DataFrame( + { + "c": [10, 15, np.nan, 20], + }, + index=[ + Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) + tm.assert_frame_equal(df, expected) + def test_nested_tuples_duplicates(self): # GH#30892 diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 23d2bee612243..d6be817ab6f77 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -26,6 +26,27 @@ def test_at_timezone(): tm.assert_frame_equal(result, expected) +def test_selection_methods_of_assigned_col(): + # GH 29282 + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = DataFrame(data={"c": [7, 8, 9]}, index=[2, 1, 0]) + df["c"] = df2["c"] + df.at[1, "c"] = 11 + result = df + expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [9, 11, 7]}) + tm.assert_frame_equal(result, expected) + result = df.at[1, "c"] + assert result == 11 + + result = df["c"] + expected = Series([9, 11, 7], name="c") + 
tm.assert_series_equal(result, expected) + + result = df[["c"]] + expected = DataFrame({"c": [9, 11, 7]}) + tm.assert_frame_equal(result, expected) + + class TestAtSetItem: def test_at_setitem_mixed_index_assignment(self): # GH#19860 diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 362252e1a6b72..00635671e459b 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -408,7 +408,9 @@ def test_iget(self): cols = Index(list("abc")) values = np.random.rand(3, 3) block = new_block( - values=values.copy(), placement=np.arange(3), ndim=values.ndim + values=values.copy(), + placement=np.arange(3, dtype=np.intp), + ndim=values.ndim, ) mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))]) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 9db1df345404b..69afb9fe56472 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -84,7 +84,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ae5ddb83f7052..726cd64c6dc23 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -204,6 +204,11 @@ def test_1000_sep_decimal_float_precision( # test decimal and thousand sep handling in across 'float_precision' # parsers decimal_number_check(c_parser_only, numeric_decimal, thousands, float_precision) + text, value = numeric_decimal + text = " " + text + " " + if isinstance(value, str): # the negative cases (parse as text) + value = " " + value + " " + decimal_number_check(c_parser_only, (text, value), thousands, float_precision) def decimal_number_check(parser, numeric_decimal, thousands, float_precision): @@ -222,6 +227,24 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision): assert val == numeric_decimal[1] +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) +def test_skip_whitespace(c_parser_only, float_precision): + DATA = """id\tnum\t +1\t1.2 \t +1\t 2.1\t +2\t 1\t +2\t 1.2 \t +""" + df = c_parser_only.read_csv( + StringIO(DATA), + float_precision=float_precision, + sep="\t", + header=0, + dtype={1: np.float64}, + ) + tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num")) + + def test_true_values_cast_to_bool(all_parsers): # GH#34655 text = """a,b diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6be82af5349ed..646cb2029919d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -19,7 +19,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -228,7 +227,6 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 diff --git a/pandas/tests/io/test_parquet.py 
b/pandas/tests/io/test_parquet.py index 01715ee133e96..ec724602c5249 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -637,6 +637,29 @@ def test_use_nullable_dtypes(self, engine): expected = expected.drop("c", axis=1) tm.assert_frame_equal(result2, expected) + @pytest.mark.parametrize( + "dtype", + [ + "Int64", + "UInt8", + "boolean", + "object", + "datetime64[ns, UTC]", + "float", + "period[D]", + "Float64", + "string", + ], + ) + def test_read_empty_array(self, pa, dtype): + # GH #41241 + df = pd.DataFrame( + { + "value": pd.array([], dtype=dtype), + } + ) + check_round_trip(df, pa, read_kwargs={"use_nullable_dtypes": True}) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 3b3bd402e4cc7..10fabe234d218 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -352,7 +352,8 @@ def test_agg(): for t in cases: warn = FutureWarning if t in cases[1:3] else None with tm.assert_produces_warning( - warn, match="Dropping invalid columns", check_stacklevel=False + warn, + match=r"\['date'\] did not aggregate successfully", ): # .var on dt64 column raises and is dropped result = t.aggregate([np.mean, np.std]) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index eacafa9310384..e00e9a894d340 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -4,6 +4,7 @@ from pandas import ( Index, + MultiIndex, Series, ) import pandas._testing as tm @@ -101,3 +102,26 @@ def test_rename_callable(self): tm.assert_series_equal(result, expected) assert result.name == expected.name + + def test_rename_series_with_multiindex(self): + # issue #43659 + arrays = [ + ["bar", "baz", "baz", "foo", "qux"], + ["one", "one", "two", "two", "one"], + ] + + index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + s = Series(np.ones(5), index=index) + result = s.rename(index={"one": "yes"}, level="second", errors="raise") + + arrays_expected = [ + ["bar", "baz", "baz", "foo", "qux"], + ["yes", "yes", "two", "two", "yes"], + ] + + index_expected = MultiIndex.from_arrays( + arrays_expected, names=["first", "second"] + ) + series_expected = Series(np.ones(5), index=index_expected) + + tm.assert_series_equal(result, series_expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a38affbc7f723..850ce6df21b7f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -177,6 +177,28 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): result = to_datetime(input_s, format="%Y%m%d", errors="coerce") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data, format, expected", + [ + ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), + ([pd.NA], None, DatetimeIndex(["NaT"])), + ( + [pd.NA, "20210202202020"], + "%Y%m%d%H%M%S", + DatetimeIndex(["NaT", "2021-02-02 20:20:20"]), + ), + (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), + (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), + (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), + ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), + ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), + ], + 
) + def test_to_datetime_with_NA(self, data, format, expected): + # GH#42957 + result = to_datetime(data, format=format) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_integer(self, cache): # GH 10178 diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 1086857f38b62..af2ca7270c982 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -43,44 +43,52 @@ def f(x, *args): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")] + ) def test_numba_vs_cython_rolling_methods( - self, nogil, parallel, nopython, arithmetic_numba_supported_operators + self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators ): method = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - df = DataFrame(np.eye(5)) - roll = df.rolling(2) + roll = data.rolling(2) result = getattr(roll, method)(engine="numba", engine_kwargs=engine_kwargs) expected = getattr(roll, method)(engine="cython") # Check the cache - assert (getattr(np, f"nan{method}"), "Rolling_apply_single") in NUMBA_FUNC_CACHE + if method != "mean": + assert ( + getattr(np, f"nan{method}"), + "Rolling_apply_single", + ) in NUMBA_FUNC_CACHE - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) + @pytest.mark.parametrize("data", [DataFrame(np.eye(5)), Series(range(5))]) def test_numba_vs_cython_expanding_methods( - self, nogil, parallel, nopython, arithmetic_numba_supported_operators + self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators ): method = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - df = DataFrame(np.eye(5)) - expand = df.expanding() + data = DataFrame(np.eye(5)) + expand = data.expanding() result = getattr(expand, method)(engine="numba", engine_kwargs=engine_kwargs) expected = getattr(expand, method)(engine="cython") # Check the cache - assert ( - getattr(np, f"nan{method}"), - "Expanding_apply_single", - ) in NUMBA_FUNC_CACHE + if method != "mean": + assert ( + getattr(np, f"nan{method}"), + "Expanding_apply_single", + ) in NUMBA_FUNC_CACHE - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) @pytest.mark.parametrize("jit", [True, False]) def test_cache_apply(self, jit, nogil, parallel, nopython): diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py index e69de29bb2d1d..dd4ce02b19453 100644 --- a/pandas/tseries/__init__.py +++ b/pandas/tseries/__init__.py @@ -0,0 +1,11 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # import modules that have public classes/functions: + from pandas.tseries import ( + frequencies, + offsets, + ) + + # and mark only those modules as public + __all__ = ["frequencies", "offsets"] diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py deleted file mode 100644 index 5694ca24aab57..0000000000000 --- a/pandas/util/_depr_module.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -This module houses a utility class for mocking deprecated modules. -It is for internal use only and should not be used beyond this purpose. -""" - -import importlib -from typing import Iterable -import warnings - - -class _DeprecatedModule: - """ - Class for mocking deprecated modules. - - Parameters - ---------- - deprmod : name of module to be deprecated. 
- deprmodto : name of module as a replacement, optional. - If not given, the __module__ attribute will - be used when needed. - removals : objects or methods in module that will no longer be - accessible once module is removed. - moved : dict, optional - dictionary of function name -> new location for moved - objects - """ - - def __init__(self, deprmod, deprmodto=None, removals=None, moved=None): - self.deprmod = deprmod - self.deprmodto = deprmodto - self.removals = removals - if self.removals is not None: - self.removals = frozenset(self.removals) - self.moved = moved - - # For introspection purposes. - self.self_dir = frozenset(dir(type(self))) - - def __dir__(self) -> Iterable[str]: - deprmodule = self._import_deprmod() - return dir(deprmodule) - - def __repr__(self) -> str: - deprmodule = self._import_deprmod() - return repr(deprmodule) - - __str__ = __repr__ - - def __getattr__(self, name: str): - if name in self.self_dir: - return object.__getattribute__(self, name) - - try: - deprmodule = self._import_deprmod(self.deprmod) - except ImportError: - if self.deprmodto is None: - raise - - # a rename - deprmodule = self._import_deprmod(self.deprmodto) - - obj = getattr(deprmodule, name) - - if self.removals is not None and name in self.removals: - warnings.warn( - f"{self.deprmod}.{name} is deprecated and will be removed in " - "a future version.", - FutureWarning, - stacklevel=2, - ) - elif self.moved is not None and name in self.moved: - warnings.warn( - f"{self.deprmod} is deprecated and will be removed in " - f"a future version.\nYou can access {name} as {self.moved[name]}", - FutureWarning, - stacklevel=2, - ) - else: - deprmodto = self.deprmodto - if deprmodto is False: - warnings.warn( - f"{self.deprmod}.{name} is deprecated and will be removed in " - "a future version.", - FutureWarning, - stacklevel=2, - ) - else: - if deprmodto is None: - deprmodto = obj.__module__ - # The object is actually located in another module. - warnings.warn( - f"{self.deprmod}.{name} is deprecated. 
Please use " - f"{deprmodto}.{name} instead.", - FutureWarning, - stacklevel=2, - ) - - return obj - - def _import_deprmod(self, mod=None): - if mod is None: - mod = self.deprmod - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - deprmodule = importlib.import_module(mod) - return deprmodule diff --git a/pyproject.toml b/pyproject.toml index 03c1485bd4e35..0223a1c035cbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ requires = [ # Copied from https://github.com/scipy/scipy/blob/master/pyproject.toml (which is also licensed under BSD) "numpy==1.17.3; python_version=='3.7' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", "numpy==1.18.3; python_version=='3.8' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", - "numpy==1.19.3; python_version>='3.9' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", + "numpy==1.19.3; python_version>='3.9' and (platform_machine!='arm64' or platform_system!='Darwin')", # Aarch64(Python 3.9 requirements are the same as AMD64) "numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'", "numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'", @@ -69,13 +69,51 @@ markers = [ ] [tool.mypy] -platform = "linux-64" +# Import discovery +namespace_packages = false +explicit_package_bases = false ignore_missing_imports = true -no_implicit_optional = true +follow_imports = "normal" +follow_imports_for_stubs = false +no_site_packages = false +no_silence_site_packages = false +# Platform configuration +python_version = "3.8" +platform = "linux-64" +# Disallow dynamic typing +disallow_any_unimported = false # TODO +disallow_any_expr = false # TODO +disallow_any_decorated = false # TODO +disallow_any_explicit = false # TODO +disallow_any_generics = false # TODO +disallow_subclassing_any = false # TODO +# Untyped definitions and calls +disallow_untyped_calls = false # TODO +disallow_untyped_defs = false # TODO +disallow_incomplete_defs = false # TODO check_untyped_defs = true -strict_equality = true +disallow_untyped_decorators = false # GH#33455 +# None and Optional handling +no_implicit_optional = true +strict_optional = true +# Configuring warnings warn_redundant_casts = true warn_unused_ignores = true +warn_no_return = true +warn_return_any = false # TODO +warn_unreachable = false # GH#27396 +# Suppressing errors +show_none_errors = true +ignore_errors = false +# Miscellaneous strictness flags +allow_untyped_globals = false +allow_redefinition = false +local_partial_types = false +implicit_reexport = true +strict_equality = true +# Configuring error messages +show_error_context = false +show_column_numbers = false show_error_codes = true [[tool.mypy.overrides]] @@ -116,3 +154,42 @@ force_grid_wrap = 2 force_sort_within_sections = true skip_glob = "env" skip = "pandas/__init__.py" + +[tool.pyright] +pythonVersion = "3.8" +typeCheckingMode = "strict" +include = ["pandas"] +exclude = ["pandas/tests", "pandas/util/version"] +reportGeneralTypeIssues = false +reportConstantRedefinition = false +reportFunctionMemberAccess = false +reportImportCycles = false +reportIncompatibleMethodOverride = false +reportIncompatibleVariableOverride = false +reportMissingImports = false +reportMissingModuleSource = false +reportMissingTypeArgument = false +reportMissingTypeStubs = false +reportOptionalCall = false +reportOptionalIterable = false 
+reportOptionalMemberAccess = false +reportOptionalOperand = false +reportOptionalSubscript = false +reportPrivateImportUsage = false +reportPrivateUsage = false +reportPropertyTypeMismatch = false +reportUnboundVariable = false +reportUnknownArgumentType = false +reportUnknownLambdaType = false +reportUnknownMemberType = false +reportUnknownParameterType = false +reportUnknownVariableType = false +reportUnnecessaryComparison = false +reportUnnecessaryIsInstance = false +reportUnsupportedDunderAll = false +reportUntypedBaseClass = false +reportUntypedFunctionDecorator = false +reportUnusedClass = false +reportUnusedFunction = false +reportUnusedImport = false +reportUnusedVariable = false diff --git a/requirements-dev.txt b/requirements-dev.txt index 9b35de4bccb48..da2006a8b5c05 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -85,4 +85,5 @@ types-python-dateutil types-PyMySQL types-pytz types-setuptools +pytest-cython setuptools>=51.0.0 From c8e69d838a8df5edd3d7c08b09e93dd7798827cc Mon Sep 17 00:00:00 2001 From: "JHM Darbyshire (iMac)" Date: Thu, 4 Nov 2021 22:07:13 +0100 Subject: [PATCH 13/13] fix tests --- pandas/tests/series/methods/test_argsort.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 47af71b01da91..2776120dad83c 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -79,9 +79,9 @@ def test_na_pos_raises(self): @pytest.mark.parametrize( "na_position, expected", [ - (None, Series([2, 0, -1, -1], index=["c", "a", "b", "d"])), - ("first", Series([1, 3, 2, 0], index=["b", "d", "c", "a"])), - ("last", Series([2, 0, 1, 3], index=["c", "a", "b", "d"])), + (None, Series([2, 0, -1, -1], index=["c", "a", "b", "d"], dtype=np.intp)), + ("first", Series([1, 3, 2, 0], index=["b", "d", "c", "a"], dtype=np.intp)), + ("last", Series([2, 0, 1, 3], index=["c", "a", "b", "d"], dtype=np.intp)), ], ) def test_na_position(self, na_position, expected):
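
For reference, a minimal sketch of the behaviour the updated test_na_position expectations encode. This is hypothetical: it assumes the na_position keyword that this series adds to Series.argsort, and it invents an input Series (the real fixture is not shown in this hunk) chosen so the outputs line up with the parametrized expectations above; all results carry dtype np.intp per the new expected values.

    import numpy as np
    import pandas as pd

    # Hypothetical input; the actual test fixture is not visible in this hunk.
    # Non-NA values sort as "c" (0.5) then "a" (1.0); "b" and "d" hold NaN.
    ser = pd.Series([1.0, np.nan, 0.5, np.nan], index=list("abcd"))

    # na_position=None (default): NA slots are reported as -1 and the index
    # labels are reordered so each label sits next to its sort position.
    ser.argsort()                     # values [2, 0, -1, -1], index ["c", "a", "b", "d"]

    # na_position="first"/"last": NA labels move to the front/back instead of -1.
    ser.argsort(na_position="first")  # values [1, 3, 2, 0], index ["b", "d", "c", "a"]
    ser.argsort(na_position="last")   # values [2, 0, 1, 3], index ["c", "a", "b", "d"]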