From 28514640dab89eadc88003cd7b5aefa13eba2c40 Mon Sep 17 00:00:00 2001
From: Omar Elbaz
Date: Thu, 10 Aug 2023 22:38:21 -0400
Subject: [PATCH 1/8] groupby tests

---
 pandas/tests/groupby/test_groupby.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 9b260d5757767..087528a2853c1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3159,3 +3159,17 @@ def test_groupby_series_with_datetimeindex_month_name():
     expected = Series([2, 1], name="jan")
     expected.index.name = "jan"
     tm.assert_series_equal(result, expected)
+
+
+def test_rolling_corr_case2():
+    # GH 44078
+    df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]})
+    gb = df.groupby(["a"])
+    result = gb.rolling(2).corr(other=df)
+    index = MultiIndex.from_tuples(
+        [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None]
+    )
+    expected = DataFrame(
+        {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
+    )
+    tm.assert_frame_equal(result, expected)
\ No newline at end of file

From 46c310f445b115fb93141fdd6ff9b489e59c23b9 Mon Sep 17 00:00:00 2001
From: Omar Elbaz
Date: Thu, 10 Aug 2023 22:40:54 -0400
Subject: [PATCH 2/8] revert commit

---
 pandas/tests/groupby/test_groupby.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 087528a2853c1..2800151c3294d 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3158,18 +3158,4 @@ def test_groupby_series_with_datetimeindex_month_name():
     result = s.groupby(s).count()
     expected = Series([2, 1], name="jan")
     expected.index.name = "jan"
-    tm.assert_series_equal(result, expected)
-
-
-def test_rolling_corr_case2():
-    # GH 44078
-    df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]})
-    gb = df.groupby(["a"])
-    result = gb.rolling(2).corr(other=df)
-    index = MultiIndex.from_tuples(
-        [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None]
-    )
-    expected = DataFrame(
-        {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
-    )
-    tm.assert_frame_equal(result, expected)
\ No newline at end of file
+    tm.assert_series_equal(result, expected)
\ No newline at end of file

From c10224f7f60e97e1fac5c355a5bcba391f070e2d Mon Sep 17 00:00:00 2001
From: Omar Elbaz
Date: Thu, 10 Aug 2023 22:47:30 -0400
Subject: [PATCH 3/8] reverting

---
 pandas/tests/groupby/test_groupby.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 2800151c3294d..f5a828aa1fefa 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3158,4 +3158,5 @@ def test_groupby_series_with_datetimeindex_month_name():
     result = s.groupby(s).count()
     expected = Series([2, 1], name="jan")
     expected.index.name = "jan"
-    tm.assert_series_equal(result, expected)
\ No newline at end of file
+    tm.assert_series_equal(result, expected)
+
\ No newline at end of file

From 37085a35a5250c138d9a43475e8fb9bf981e8ad1 Mon Sep 17 00:00:00 2001
From: Omar Elbaz
Date: Thu, 10 Aug 2023 22:48:42 -0400
Subject: [PATCH 4/8] reverting again

---
 pandas/tests/groupby/test_groupby.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index f5a828aa1fefa..9b260d5757767 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ 
b/pandas/tests/groupby/test_groupby.py @@ -3159,4 +3159,3 @@ def test_groupby_series_with_datetimeindex_month_name(): expected = Series([2, 1], name="jan") expected.index.name = "jan" tm.assert_series_equal(result, expected) - \ No newline at end of file From ee5ed1abd43a5ea3e9227d5a9b5d2c6db370dddd Mon Sep 17 00:00:00 2001 From: Omar Elbaz Date: Fri, 11 Aug 2023 14:16:34 -0400 Subject: [PATCH 5/8] move unit tests --- asv_bench/asv.conf.json | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- doc/source/whatsnew/index.rst | 1 - doc/source/whatsnew/v2.0.4.rst | 41 ---- doc/source/whatsnew/v2.1.0.rst | 44 +--- environment.yml | 2 +- pandas/_libs/algos.pyx | 117 +++++++++- pandas/_libs/algos_take_helper.pxi.in | 40 ++-- pandas/_libs/arrays.pyi | 2 +- pandas/_libs/arrays.pyx | 3 +- pandas/_libs/groupby.pyi | 3 +- pandas/_libs/groupby.pyx | 203 +++++++++++------- pandas/_libs/hashtable.pyi | 9 +- pandas/_libs/hashtable_class_helper.pxi.in | 6 +- pandas/_libs/internals.pyx | 6 +- pandas/_libs/interval.pyx | 15 ++ pandas/_libs/lib.pyi | 22 +- pandas/_libs/lib.pyx | 3 +- pandas/_libs/ops.pyi | 4 +- pandas/_libs/parsers.pyx | 7 +- pandas/_libs/sparse.pyi | 4 - pandas/_libs/tslibs/conversion.pyi | 1 - pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/dtypes.pyi | 4 +- pandas/_libs/tslibs/nattype.pyx | 30 ++- pandas/_libs/tslibs/np_datetime.pxd | 21 +- pandas/_libs/tslibs/np_datetime.pyi | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 3 - pandas/_libs/tslibs/offsets.pyi | 5 +- pandas/_libs/tslibs/offsets.pyx | 24 ++- pandas/_libs/tslibs/parsing.pyx | 3 +- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 10 +- pandas/_libs/tslibs/timedeltas.pyi | 8 +- pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 13 ++ pandas/_libs/tslibs/tzconversion.pyi | 2 +- pandas/_libs/tslibs/vectorized.pyi | 2 +- pandas/_libs/window/aggregations.pyi | 4 +- pandas/core/arrays/datetimelike.py | 3 +- pandas/core/generic.py | 55 +++++ pandas/core/internals/blocks.py | 21 +- pandas/core/internals/managers.py | 33 ++- pandas/tests/copy_view/test_clip.py | 13 ++ .../copy_view/test_core_functionalities.py | 18 +- pandas/tests/copy_view/test_indexing.py | 11 +- pandas/tests/copy_view/test_interp_fillna.py | 14 ++ pandas/tests/copy_view/test_methods.py | 36 +++- pandas/tests/copy_view/test_replace.py | 38 +++- .../tests/frame/methods/test_interpolate.py | 21 +- pandas/tests/groupby/test_groupby.py | 26 +++ pandas/tests/indexing/test_loc.py | 7 + pyproject.toml | 2 +- requirements-dev.txt | 2 +- scripts/run_stubtest.py | 2 - setup.py | 2 +- 64 files changed, 705 insertions(+), 296 deletions(-) delete mode 100644 doc/source/whatsnew/v2.0.4.rst diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index b7dce4f63f94c..810764754b7e1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,7 +41,7 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). 
"matrix": { - "Cython": ["3.0.0"], + "Cython": ["0.29.33"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d8186d09bb9d4..ffa7732c604a0 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index a40640e99265a..5a6a26c2e1ad8 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 3d5e2e99feb9c..f24e866af0439 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.0.1 - - cython>=3.0.0 + - cython>=0.29.33 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 67203201ea637..9d60d734db5b3 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index c1c7b986fe8a4..0e2fcf87c2d6e 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 658dfe032a42b..6ea0d41b947dc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 292565e9640d9..035395d55eb3a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 60bdc2b828dae..df4e8e285bd02 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index f22fd2a79b50e..fc225d0f44497 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,7 +24,6 @@ Version 2.0 .. toctree:: :maxdepth: 2 - v2.0.4 v2.0.3 v2.0.2 v2.0.1 diff --git a/doc/source/whatsnew/v2.0.4.rst b/doc/source/whatsnew/v2.0.4.rst deleted file mode 100644 index 36d798ea68c30..0000000000000 --- a/doc/source/whatsnew/v2.0.4.rst +++ /dev/null @@ -1,41 +0,0 @@ -.. _whatsnew_204: - -What's new in 2.0.4 (August ??, 2023) ---------------------------------------- - -These are the changes in pandas 2.0.4. 
See :ref:`release` for a full changelog -including other versions of pandas. - -{{ header }} - -.. --------------------------------------------------------------------------- -.. _whatsnew_204.regressions: - -Fixed regressions -~~~~~~~~~~~~~~~~~ -- -- - -.. --------------------------------------------------------------------------- -.. _whatsnew_204.bug_fixes: - -Bug fixes -~~~~~~~~~ -- -- - -.. --------------------------------------------------------------------------- -.. _whatsnew_204.other: - -Other -~~~~~ -- -- - -.. --------------------------------------------------------------------------- -.. _whatsnew_204.contributors: - -Contributors -~~~~~~~~~~~~ - -.. contributors:: v2.0.3..v2.0.4|HEAD diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e8e8a6f0a92d5..313bf61ecabf9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_210: -What's new in 2.1.0 (Month XX, 2023) +What's new in 2.1.0 (Aug 11, 2023) -------------------------------------- These are the changes in pandas 2.1.0. See :ref:`release` for a full changelog @@ -121,6 +121,12 @@ Copy-on-Write improvements - DataFrame.update / Series.update - DataFrame.fillna / Series.fillna - DataFrame.replace / Series.replace + - DataFrame.clip / Series.clip + - DataFrame.where / Series.where + - DataFrame.mask / Series.mask + - DataFrame.interpolate / Series.interpolate + - DataFrame.ffill / Series.ffill + - DataFrame.bfill / Series.bfill .. _whatsnew_210.enhancements.map_na_action: @@ -260,7 +266,6 @@ Other enhancements - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) - Performance improvement in :meth:`.GroupBy.quantile` (:issue:`51722`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: @@ -270,16 +275,6 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_210.notable_bug_fixes.notable_bug_fix1: - -notable_bug_fix1 -^^^^^^^^^^^^^^^^ - -.. _whatsnew_210.notable_bug_fixes.notable_bug_fix2: - -notable_bug_fix2 -^^^^^^^^^^^^^^^^ - .. --------------------------------------------------------------------------- .. _whatsnew_210.api_breaking: @@ -361,14 +356,6 @@ If installed, we now require: +----------------------+-----------------+----------+---------+ For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| | | X | -+-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -377,7 +364,6 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - :class:`arrays.PandasArray` has been renamed ``NumpyExtensionArray`` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_210.deprecations: @@ -602,7 +588,6 @@ Other Deprecations - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) - Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and skipna=True or any-NAs and skipna=False returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name``. (:issue:`54229`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: @@ -667,7 +652,6 @@ Categorical - Bug in :meth:`CategoricalIndex.remove_categories` where ordered categories would not be maintained (:issue:`53935`). - Bug in :meth:`Series.astype` with ``dtype="category"`` for nullable arrays with read-only null value masks (:issue:`53658`) - Bug in :meth:`Series.map` , where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:`22527`). -- Datetimelike ^^^^^^^^^^^^ @@ -694,13 +678,11 @@ Timedelta - Bug in :meth:`Timedelta.__hash__`, raising an ``OutOfBoundsTimedelta`` on certain large values of second resolution (:issue:`54037`) - Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) - Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) -- Timezones ^^^^^^^^^ - Bug in :func:`infer_freq` that raises ``TypeError`` for ``Series`` of timezone-aware timestamps (:issue:`52456`) - Bug in :meth:`DatetimeTZDtype.base` that always returns a NumPy dtype with nanosecond resolution (:issue:`52705`) -- Numeric ^^^^^^^ @@ -732,7 +714,6 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str` that did not raise a ``TypeError`` when iterated (:issue:`54173`) -- Interval ^^^^^^^^ @@ -753,7 +734,6 @@ Missing - Bug in :meth:`Series.idxmin`, :meth:`Series.idxmax`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax` with a :class:`DatetimeIndex` index containing ``NaT`` incorrectly returning ``NaN`` instead of ``NaT`` (:issue:`43587`) - Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or "infer" (:issue:`53103`) - Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` with complex dtype incorrectly failing to fill ``NaN`` entries (:issue:`53635`) -- MultiIndex ^^^^^^^^^^ @@ -797,7 +777,6 @@ Plotting ^^^^^^^^ - Bug in :meth:`Series.plot` when invoked with ``color=None`` (:issue:`51953`) - Fixed UserWarning in :meth:`DataFrame.plot.scatter` when invoked with ``c="b"`` (:issue:`53908`) -- Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -843,13 +822,11 @@ Reshaping - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - Bug when joining empty :class:`DataFrame` objects, where the joined index would be a :class:`RangeIndex` instead of the joined index type (:issue:`52777`) -- Sparse ^^^^^^ - Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible 
``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`) - Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`) -- ExtensionArray ^^^^^^^^^^^^^^ @@ -865,14 +842,12 @@ ExtensionArray Styler ^^^^^^ - Bug in :meth:`Styler._copy` calling overridden methods in subclasses of :class:`Styler` (:issue:`52728`) -- Metadata ^^^^^^^^ - Fixed metadata propagation in :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`Series.mode`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` (:issue:`28283`) - Fixed metadata propagation in :meth:`DataFrame.squeeze`, and :meth:`DataFrame.describe` (:issue:`28283`) - Fixed metadata propagation in :meth:`DataFrame.std` (:issue:`28283`) -- Other ^^^^^ @@ -898,11 +873,6 @@ Other - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- The minimum version of Cython needed to compile pandas is now ``3.0.0`` (:issue:`54335`) - -.. ***DO NOT USE THIS SECTION*** - -- .. --------------------------------------------------------------------------- .. _whatsnew_210.contributors: diff --git a/environment.yml b/environment.yml index 44c0ce37c2957..3a0da0bfc703d 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython=3.0.0 + - cython=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ed251c401c277..0b6ea58f987d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -998,7 +998,8 @@ def rank_1d( N = len(values) if labels is not None: - assert len(labels) == N + # TODO(cython3): cast won't be necessary (#2992) + assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N, dtype=np.int64) @@ -1087,7 +1088,8 @@ cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, - const numeric_object_t[:] masked_vals, + # TODO(cython3): make const (https://github.com/cython/cython/issues/3222) + numeric_object_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1142,7 +1144,108 @@ cdef void rank_sorted_1d( # array that we sorted previously, which gives us the location of # that sorted value for retrieval back from the original # values / masked_vals arrays - with gil(numeric_object_t is object): + # TODO(cython3): de-duplicate once cython supports conditional nogil + if numeric_object_t is object: + with gil: + for i in range(N): + at_end = i == N - 1 + + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. 
Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and check_mask and mask[sort_indexer[i]]: + grp_na_count = dups + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). 
Also be + # sure to reset any of the items helping to calculate dups + if group_changed: + + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + else: for i in range(N): at_end = i == N - 1 @@ -1371,9 +1474,8 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - # TODO: cython bug (post Cython 3) prevents update to "const diff_t[:, :] arr" - ndarray[diff_t, ndim=2] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, bint datetimelike=False, @@ -1381,8 +1483,7 @@ def diff_2d( cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous - # TODO: change to this when arr becomes a memoryview - # bint f_contig = arr.is_f_contig() + # bint f_contig = arr.is_f_contig() # TODO(cython3) diff_t left, right # Disable for unsupported dtype combinations, diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 2a3858674af9e..88c3abba506a3 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -121,21 +121,30 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{if c_type_in == c_type_out != "object"}} # GH#3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof({{c_type_out}}) and - sizeof({{c_type_out}}) * n >= 256): - - for i in range(n): - idx = indexer[i] - if idx == -1: - for j in range(k): - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof({{c_type_out}}) * k)) - return - {{endif}} + with nogil: + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof({{c_type_out}}) and + sizeof({{c_type_out}}) * n >= 256): + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof({{c_type_out}}) * k)) + else: + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + for j in range(k): + out[i, j] = values[idx, j] + {{else}} for i in range(n): idx = indexer[i] @@ -149,6 +158,7 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[idx, j] {{endif}} + {{endif}} @cython.wraparound(False) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 86f69c3cdfc75..78fee8f01319c 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -26,7 +26,7 @@ class NDArrayBacked: def size(self) -> int: ... @property def nbytes(self) -> int: ... - def copy(self, order=...): ... + def copy(self): ... def delete(self, loc, axis=...): ... def swapaxes(self, axis1, axis2): ... def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... 
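For orientation, a minimal pure-Python sketch of what the take_2d_axis0 template above computes; the function name and loop are illustrative only, not pandas API. Rows are gathered by `indexer`, positions marked -1 are filled with `fill_value`, and the `memmove` branch in the Cython version is simply a faster way to perform the row copy (the caller is assumed to have chosen an output dtype that can hold `fill_value`):

import numpy as np


def take_2d_axis0_sketch(values, indexer, fill_value):
    # gather rows of `values` by position; -1 marks a missing row
    out = np.empty((len(indexer), values.shape[1]), dtype=values.dtype)
    for i, idx in enumerate(indexer):
        if idx == -1:
            out[i, :] = fill_value      # matches the `out[i, j] = fv` branch
        else:
            out[i, :] = values[idx, :]  # the memmove fast path copies this row
    return out


# e.g. take_2d_axis0_sketch(np.arange(6.0).reshape(3, 2), np.array([2, -1]), np.nan)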
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..718fb358e26bc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -126,7 +126,8 @@ cdef class NDArrayBacked: @property def size(self) -> int: - return self._ndarray.size + # TODO(cython3): use self._ndarray.size + return cnp.PyArray_SIZE(self._ndarray) @property def nbytes(self) -> int: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 019b30900547d..d165ddd6c8afa 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -44,6 +44,7 @@ def group_fillna_indexer( labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], mask: npt.NDArray[np.uint8], + direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, ) -> None: ... @@ -54,7 +55,7 @@ def group_any_all( mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, - result_mask: np.ndarray | None, + nullable: bool, ) -> None: ... def group_sum( out: np.ndarray, # complexfloatingintuint_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3384060f74c20..20499016f951e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -695,8 +695,6 @@ def group_sum( N, K = (values).shape - # TODO: Port this to use conditional nogil - # Note: There are some test failures since the object/non-object paths have diverged if sum_t is object: # NB: this does not use 'compensation' like the non-object track does. for i in range(N): @@ -757,9 +755,9 @@ def group_sum( compensation[lab, j] = 0 sumx[lab, j] = t - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx + ) @cython.wraparound(False) @@ -811,9 +809,9 @@ def group_prod( nobs[lab, j] += 1 prodx[lab, j] *= val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx + ) @cython.wraparound(False) @@ -1371,7 +1369,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): ctypedef fused mincount_t: - numeric_object_t + numeric_t complex64_t complex128_t @@ -1387,7 +1385,7 @@ cdef inline void _check_below_mincount( int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, -) noexcept: +) noexcept nogil: """ Check if the number of observations for a group is below min_count, and if so set the result for that group to the appropriate NA-like value. @@ -1395,49 +1393,48 @@ cdef inline void _check_below_mincount( cdef: Py_ssize_t i, j - with nogil(mincount_t is not object): - for i in range(ncounts): - for j in range(K): + for i in range(ncounts): + for j in range(K): - if nobs[i, j] < min_count: - # if we are integer dtype, not is_datetimelike, and - # not uses_mask, then getting here implies that - # counts[i] < min_count, which means we will - # be cast to float64 and masked at the end - # of WrappedCythonOp._call_cython_op. So we can safely - # set a placeholder value in out[i, j]. - if uses_mask: - result_mask[i, j] = True - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. 
- out[i, j] = 0 - elif ( - mincount_t is float32_t - or mincount_t is float64_t - or mincount_t is complex64_t - or mincount_t is complex128_t - ): - out[i, j] = NAN - elif mincount_t is int64_t: - # Per above, this is a placeholder in - # non-is_datetimelike cases. - out[i, j] = NPY_NAT - elif mincount_t is object: - out[i, j] = None - else: - # placeholder, see above - out[i, j] = 0 + if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. + if uses_mask: + result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 + elif ( + mincount_t is float32_t + or mincount_t is float64_t + or mincount_t is complex64_t + or mincount_t is complex128_t + ): + out[i, j] = NAN + elif mincount_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. + out[i, j] = NPY_NAT else: - out[i, j] = resx[i, j] + # placeholder, see above + out[i, j] = 0 + else: + out[i, j] = resx[i, j] +# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can +# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_last( numeric_object_t[:, ::1] out, int64_t[::1] counts, - const numeric_object_t[:, :] values, + ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, @@ -1455,7 +1452,9 @@ def group_last( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1467,7 +1466,8 @@ def group_last( N, K = (values).shape - with nogil(numeric_object_t is not object): + if numeric_object_t is object: + # TODO(cython3): De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -1480,28 +1480,53 @@ def group_last( if uses_mask: isna_entry = mask[i, j] else: - # TODO: just make _treat_as_na support this? - # remove notimplemented for object dtype there - if numeric_object_t is object: - isna_entry = checknull(val) - else: - isna_entry = _treat_as_na(val, is_datetimelike) + isna_entry = checknull(val) if not isna_entry: + # TODO(cython3): use _treat_as_na here once + # conditional-nogil is available. 
nobs[lab, j] += 1 resx[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = None + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + nobs[lab, j] += 1 + resx[lab, j] = val + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) + +# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can +# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_nth( numeric_object_t[:, ::1] out, int64_t[::1] counts, - const numeric_object_t[:, :] values, + ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, @@ -1520,7 +1545,9 @@ def group_nth( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1532,7 +1559,8 @@ def group_nth( N, K = (values).shape - with nogil(numeric_object_t is not object): + if numeric_object_t is object: + # TODO(cython3): De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -1545,21 +1573,46 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] else: - # TODO: just make _treat_as_na support this? - # remove notimplemented for object dtype there - if numeric_object_t is object: - isna_entry = checknull(val) - else: - isna_entry = _treat_as_na(val, is_datetimelike) + isna_entry = checknull(val) if not isna_entry: + # TODO(cython3): use _treat_as_na here once + # conditional-nogil is available. 
nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = None + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) @cython.boundscheck(False) @@ -1651,7 +1704,7 @@ def group_rank( cdef group_min_max( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1699,7 +1752,9 @@ cdef group_min_max( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1734,9 +1789,9 @@ cdef group_min_max( if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max - ) + _check_below_mincount( + out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max + ) @cython.wraparound(False) @@ -1744,7 +1799,7 @@ cdef group_min_max( def group_max( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1770,7 +1825,7 @@ def group_max( def group_min( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 8069637a9bff4..2bc6d74fe6aee 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -20,6 +20,7 @@ class Factorizer: def factorize( self, values: np.ndarray, + sort: bool = ..., na_sentinel=..., na_value=..., mask=..., @@ -156,9 +157,9 @@ class HashTable: def __contains__(self, key: Hashable) -> bool: ... def sizeof(self, deep: bool = ...) -> int: ... def get_state(self) -> dict[str, int]: ... - # TODO: `val/key` type is subclass-specific - def get_item(self, val): ... # TODO: return type? - def set_item(self, key, val) -> None: ... + # TODO: `item` type is subclass-specific + def get_item(self, item): ... # TODO: return type? + def set_item(self, item, val) -> None: ... def get_na(self): ... # TODO: return type? def set_na(self, val) -> None: ... def map_locations( @@ -184,7 +185,6 @@ class HashTable: self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., - mask=..., ) -> ( tuple[ np.ndarray, # np.ndarray[subclass-specific] @@ -198,7 +198,6 @@ class HashTable: na_sentinel: int = ..., na_value: object = ..., mask=..., - ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... 
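The restored group_last / group_nth / group_min_max paths above all enforce the same min_count rule: a group with fewer than ``min_count`` non-NA observations yields NA. A small illustration of that released behavior through the public pandas API (not the Cython internals):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, None, 2.0]})
# each group has only one non-NA observation, below min_count=2,
# so both grouped sums come back as NaN
print(df.groupby("key")["val"].sum(min_count=2))
# key
# a   NaN
# b   NaN
# Name: val, dtype: float64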
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..1cf5d734705af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1239,10 +1239,9 @@ cdef class StringHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) - # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, @@ -1497,10 +1496,9 @@ cdef class PyObjectHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) - # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 83ea99c13b153..adf4e8c926fa3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -2,10 +2,14 @@ from collections import defaultdict import weakref cimport cython -from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx from cython cimport Py_ssize_t + +cdef extern from "Python.h": + # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX + Py_ssize_t PY_SSIZE_T_MAX + import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 44f54bb451283..e07d80dd04b31 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -511,6 +511,17 @@ cdef class Interval(IntervalMixin): or is_timedelta64_object(y) ): return Interval(self.left + y, self.right + y, closed=self.closed) + elif ( + # __radd__ pattern + # TODO(cython3): remove this + isinstance(y, Interval) + and ( + isinstance(self, numbers.Number) + or PyDelta_Check(self) + or is_timedelta64_object(self) + ) + ): + return Interval(y.left + self, y.right + self, closed=y.closed) return NotImplemented def __radd__(self, other): @@ -534,6 +545,10 @@ cdef class Interval(IntervalMixin): def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y, closed=self.closed) + elif isinstance(y, Interval) and isinstance(self, numbers.Number): + # __radd__ semantics + # TODO(cython3): remove this + return Interval(y.left * self, y.right * self, closed=y.closed) return NotImplemented def __rmul__(self, other): diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 7e92032a73325..32641319a6b96 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -45,24 +45,22 @@ def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... -def is_interval(obj: object) -> TypeGuard[Interval]: ... -def is_decimal(obj: object) -> TypeGuard[Decimal]: ... -def is_complex(obj: object) -> TypeGuard[complex]: ... -def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... -def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... 
+def is_interval(val: object) -> TypeGuard[Interval]: ... +def is_decimal(val: object) -> TypeGuard[Decimal]: ... +def is_complex(val: object) -> TypeGuard[complex]: ... +def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(val: object) -> TypeGuard[int | np.integer]: ... def is_int_or_none(obj) -> bool: ... -def is_float(obj: object) -> TypeGuard[float]: ... +def is_float(val: object) -> TypeGuard[float]: ... def is_interval_array(values: np.ndarray) -> bool: ... -def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... -def is_timedelta_or_timedelta64_array( - values: np.ndarray, skipna: bool = True -) -> bool: ... +def is_datetime64_array(values: np.ndarray) -> bool: ... +def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... def is_string_array(values: np.ndarray, skipna: bool = ...): ... -def is_float_array(values: np.ndarray): ... +def is_float_array(values: np.ndarray, skipna: bool = ...): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... @@ -183,7 +181,7 @@ def count_level_2d( max_bin: int, ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] def get_level_sorter( - codes: np.ndarray, # const int64_t[:] + label: np.ndarray, # const int64_t[:] starts: np.ndarray, # const intp_t[:] ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def generate_bins_dt64( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 38695fbb8222b..55819ebd1f15e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -512,7 +512,8 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: @cython.wraparound(False) @cython.boundscheck(False) -def has_infs(const floating[:] arr) -> bool: +# TODO(cython3): Can add const once cython#1772 is resolved +def has_infs(floating[:] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) floating inf, neginf, val diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 6738a1dff4a9e..515f7aa53ba15 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -37,8 +37,8 @@ def vec_binop( @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values: Iterable | None = None, - false_values: Iterable | None = None, + true_values: Iterable = ..., + false_values: Iterable = ..., convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... 
@overload diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e447d3b0f5920..6d66e21ce49f5 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -35,7 +35,6 @@ from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, PyUnicode_DecodeUTF8, - PyUnicode_FromString, ) from cython cimport Py_ssize_t from libc.stdlib cimport free @@ -45,6 +44,12 @@ from libc.string cimport ( strncpy, ) + +cdef extern from "Python.h": + # TODO(cython3): get this from cpython.unicode + object PyUnicode_FromString(char *v) + + import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index 536265b25425e..9e5cecc61e5ca 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -39,10 +39,6 @@ class BlockIndex(SparseIndex): self, length: int, blocs: np.ndarray, blengths: np.ndarray ) -> None: ... - # Override to have correct parameters - def intersect(self, other: SparseIndex) -> Self: ... - def make_union(self, y: SparseIndex) -> Self: ... - def make_mask_object_ndarray( arr: npt.NDArray[np.object_], fill_value ) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index 6426e32c52304..d564d767f7f05 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -10,6 +10,5 @@ TD64NS_DTYPE: np.dtype def precision_from_unit( unit: str, - out_reso: int = ..., ) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2a2a0f347ce12..45c4d7809fe7a 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -5,7 +5,6 @@ from libc.math cimport log10 from numpy cimport ( int32_t, int64_t, - npy_datetime, ) cnp.import_array() @@ -44,6 +43,7 @@ from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_value, get_implementation_bounds, import_pandas_datetime, + npy_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, pandas_datetime_to_datetimestruct, diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index b0293d2e0fcf2..bea3e18273318 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -5,10 +5,10 @@ from enum import Enum _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] -def periods_per_day(reso: int = ...) -> int: ... +def periods_per_day(reso: int) -> int: ... def periods_per_second(reso: int) -> int: ... def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(unit: int) -> str: ... +def npy_unit_to_abbrev(reso: int) -> str: ... def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... 
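Most of the tslibs changes that follow reinstate the same Cython 0.29 workaround: cdef classes there get no separate reflected dunders, so both ``a + b`` and ``b + a`` land in ``__add__`` with the operands in call order. A runnable toy of the guard pattern; the class name is made up, and in plain Python the swapped branch is dead code, which is exactly why Cython 3 let these blocks move into real ``__radd__`` methods:

class FakeOffset:
    # stand-in for a cdef class such as BaseOffset or Tick
    def __init__(self, n):
        self.n = n

    def __add__(self, other):
        if not isinstance(self, FakeOffset):
            # Under Cython 0.29 this is how "other + self" arrives;
            # plain Python would have dispatched to __radd__ instead.
            self, other = other, self
        if isinstance(other, int):
            return FakeOffset(self.n + other)
        return NotImplemented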
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 04a6858297aee..7d75fa3114d2b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -128,6 +128,11 @@ cdef class _NaT(datetime): return NotImplemented def __add__(self, other): + if self is not c_NaT: + # TODO(cython3): remove this it moved to __radd__ + # cython __radd__ semantics + self, other = other, self + if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): @@ -157,6 +162,15 @@ cdef class _NaT(datetime): def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) + cdef: + bint is_rsub = False + + if self is not c_NaT: + # cython __rsub__ semantics + # TODO(cython3): remove __rsub__ logic from here + self, other = other, self + is_rsub = True + if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): @@ -170,9 +184,19 @@ cdef class _NaT(datetime): elif util.is_array(other): if other.dtype.kind == "m": - # NaT - timedelta64 we treat NaT as datetime64, so result - # is datetime64 - result = np.empty(other.shape, dtype="datetime64[ns]") + if not is_rsub: + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + # __rsub__ logic here + # TODO(cython3): remove this, move above code out of + # ``if not is_rsub`` block + # timedelta64 - NaT we have to treat NaT as timedelta64 + # for this to be meaningful, and the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") result.fill("NaT") return result diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index bf29184d7a94b..60532174e8bdc 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -6,22 +6,35 @@ from cpython.datetime cimport ( from numpy cimport ( int32_t, int64_t, - npy_datetime, - npy_timedelta, ) +# TODO(cython3): most of these can be cimported directly from numpy +cdef extern from "numpy/ndarrayobject.h": + ctypedef int64_t npy_timedelta + ctypedef int64_t npy_datetime + cdef extern from "numpy/ndarraytypes.h": ctypedef struct PyArray_DatetimeMetaData: NPY_DATETIMEUNIT base int64_t num +cdef extern from "numpy/arrayscalars.h": + ctypedef struct PyDatetimeScalarObject: + # PyObject_HEAD + npy_datetime obval + PyArray_DatetimeMetaData obmeta + + ctypedef struct PyTimedeltaScalarObject: + # PyObject_HEAD + npy_timedelta obval + PyArray_DatetimeMetaData obmeta + cdef extern from "numpy/ndarraytypes.h": ctypedef struct npy_datetimestruct: int64_t year int32_t month, day, hour, min, sec, us, ps, as - # TODO: Can remove this once NPY_FR_GENERIC is added to - # the Cython __init__.pxd for numpy + ctypedef enum NPY_DATETIMEUNIT: NPY_FR_Y NPY_FR_M diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index c42bc43ac9d89..0cb0e3b0237d7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ... def py_get_unit_from_dtype(dtype: np.dtype): ... def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... 
def astype_overflowsafe( - values: np.ndarray, + arr: np.ndarray, dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 8873695c23381..7b2ee68c73ad2 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -28,11 +28,8 @@ cimport numpy as cnp cnp.import_array() from numpy cimport ( - PyDatetimeScalarObject, - PyTimedeltaScalarObject, int64_t, ndarray, - npy_datetime, uint8_t, ) diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 1d37477573023..1a4742111db89 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -277,10 +277,7 @@ def roll_qtrday( INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"] def shift_months( - dtindex: npt.NDArray[np.int64], - months: int, - day_opt: str | None = ..., - reso: int = ..., + dtindex: npt.NDArray[np.int64], months: int, day_opt: str | None = ... ) -> npt.NDArray[np.int64]: ... _offset_map: dict[str, BaseOffset] diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f330a0cea1917..958fe1181d309 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -479,7 +479,12 @@ cdef class BaseOffset: return type(self)(n=1, normalize=self.normalize, **self.kwds) def __add__(self, other): - if util.is_array(other) and other.dtype == object: + if not isinstance(self, BaseOffset): + # cython semantics; this is __radd__ + # TODO(cython3): remove this, this moved to __radd__ + return other.__add__(self) + + elif util.is_array(other) and other.dtype == object: return np.array([self + x for x in other]) try: @@ -496,6 +501,10 @@ cdef class BaseOffset: elif type(other) is type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # TODO(cython3): remove, this moved to __rsub__ + # cython semantics, this is __rsub__ + return (-other).__add__(self) else: # e.g. 
PeriodIndex return NotImplemented @@ -509,6 +518,10 @@ cdef class BaseOffset: elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # TODO(cython3): remove this, this moved to __rmul__ + # cython semantics, this is __rmul__ + return other.__mul__(self) return NotImplemented def __rmul__(self, other): @@ -997,6 +1010,10 @@ cdef class Tick(SingleConstructorOffset): return self.delta.__gt__(other) def __mul__(self, other): + if not isinstance(self, Tick): + # TODO(cython3), remove this, this moved to __rmul__ + # cython semantics, this is __rmul__ + return other.__mul__(self) if is_float_object(other): n = other * self.n # If the new `n` is an integer, we can represent it using the @@ -1024,6 +1041,11 @@ cdef class Tick(SingleConstructorOffset): return _wrap_timedelta_result(result) def __add__(self, other): + if not isinstance(self, Tick): + # cython semantics; this is __radd__ + # TODO(cython3): remove this, this moved to __radd__ + return other.__add__(self) + if isinstance(other, Tick): if type(self) is type(other): return type(self)(self.n + other.n) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5e3ed8d99c659..3643c840a50a6 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -774,7 +774,8 @@ def try_parse_year_month_day( object[::1] result n = len(years) - if len(months) != n or len(days) != n: + # TODO(cython3): Use len instead of `shape[0]` + if months.shape[0] != n or days.shape[0] != n: raise ValueError("Length of years/months/days must all be equal") result = np.empty(n, dtype="O") diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index b3aa6c34e323f..8826757e31c32 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,7 +89,7 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod - def now(cls, freq: BaseOffset) -> Period: ... + def now(cls, freq: BaseOffset = ...) -> Period: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index eadb23e0a94ca..c37e9cd7ef1f3 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1838,6 +1838,10 @@ cdef class _Period(PeriodMixin): def __add__(self, other): if not is_period_object(self): + # cython semantics; this is analogous to a call to __radd__ + # TODO(cython3): remove this + if self is NaT: + return NaT return other.__add__(self) if is_any_td_scalar(other): @@ -1872,6 +1876,10 @@ cdef class _Period(PeriodMixin): def __sub__(self, other): if not is_period_object(self): + # cython semantics; this is like a call to __rsub__ + # TODO(cython3): remove this + if self is NaT: + return NaT return NotImplemented elif ( @@ -2503,7 +2511,7 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt: str | None) -> str: + def strftime(self, fmt: str) -> str: r""" Returns a formatted string representation of the :class:`Period`. 
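The NaT guards restored in period.pyx above preserve behavior like the following (released pandas, shown here only to motivate the ``if self is NaT`` branch):

import pandas as pd

p = pd.Period("2023-01", freq="M")
print(p + 1)       # Period('2023-02', 'M')
# Under Cython 0.29 the reflected calls below enter _Period.__add__ /
# __sub__ with self=NaT, which is what the restored guards short-circuit.
print(pd.NaT + p)  # NaT
print(pd.NaT - p)  # NaT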
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index da88ad32d625b..aba9b25b23154 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -68,7 +68,7 @@ UnitChoices: TypeAlias = Literal[ _S = TypeVar("_S", bound=timedelta) def ints_to_pytimedelta( - m8values: npt.NDArray[np.timedelta64], + arr: npt.NDArray[np.timedelta64], box: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_to_timedelta64( @@ -162,10 +162,8 @@ class Timedelta(timedelta): def __gt__(self, other: timedelta) -> bool: ... def __hash__(self) -> int: ... def isoformat(self) -> str: ... - def to_numpy( - self, dtype: npt.DTypeLike = ..., copy: bool = False - ) -> np.timedelta64: ... - def view(self, dtype: npt.DTypeLike) -> object: ... + def to_numpy(self) -> np.timedelta64: ... + def view(self, dtype: npt.DTypeLike = ...) -> object: ... @property def unit(self) -> str: ... def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d2b57f447c350..ffa9a67542e21 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1043,9 +1043,8 @@ cdef class _Timedelta(timedelta): """ return npy_unit_to_abbrev(self._creso) - # TODO: make cdef property once this works in Cython @property - def days(self) -> int: + def days(self) -> int: # TODO(cython3): make cdef property """ Returns the days of the timedelta. @@ -1068,9 +1067,8 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._d - # TODO: make cdef property once this works in Cython @property - def seconds(self) -> int: + def seconds(self) -> int: # TODO(cython3): make cdef property """ Return the total hours, minutes, and seconds of the timedelta as seconds. @@ -1107,9 +1105,8 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._h * 3600 + self._m * 60 + self._s - # TODO: make cdef property once this works in Cython @property - def microseconds(self) -> int: + def microseconds(self) -> int: # TODO(cython3): make cdef property # NB: using the python C-API PyDateTime_DELTA_GET_MICROSECONDS will fail # (or be incorrect) self._ensure_components() diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 24c0a07eb7985..36ae2d6d892f1 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -180,7 +180,7 @@ class Timestamp(datetime): def is_year_end(self) -> bool: ... def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... - def to_period(self, freq: BaseOffset | str | None = None) -> Period: ... + def to_period(self, freq: BaseOffset | str = ...) -> Period: ... def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... 
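The timestamps.pyx hunk that follows restores the Cython 0.29 swapped-operand handling inside _Timestamp.__sub__ (GH#28286). For orientation, a hedged example of the un-reflected case that code path supports, against released pandas:

import numpy as np
import pandas as pd

ts = pd.Timestamp("2023-08-11")
# Timestamp.__sub__ upcasts the datetime64 operand to a Timestamp first,
# so mixed subtraction yields a pandas Timedelta
print(ts - np.datetime64("2023-08-10"))  # Timedelta('1 days 00:00:00')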
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 536a8372c64a8..844fc8f0ed187 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -476,6 +476,11 @@ cdef class _Timestamp(ABCTimestamp): dtype=object, ) + elif not isinstance(self, _Timestamp): + # cython semantics, args have been switched and this is __radd__ + # TODO(cython3): remove this it moved to __radd__ + return other.__add__(self) + return NotImplemented def __radd__(self, other): @@ -509,10 +514,13 @@ cdef class _Timestamp(ABCTimestamp): and (PyDateTime_Check(other) or is_datetime64_object(other))): # both_timestamps is to determine whether Timedelta(self - other) # should raise the OOB error, or fall back returning a timedelta. + # TODO(cython3): clean out the bits that moved to __rsub__ both_timestamps = (isinstance(other, _Timestamp) and isinstance(self, _Timestamp)) if isinstance(self, _Timestamp): other = type(self)(other) + else: + self = type(other)(self) if (self.tzinfo is None) ^ (other.tzinfo is None): raise TypeError( @@ -542,6 +550,11 @@ cdef class _Timestamp(ABCTimestamp): # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass + elif is_datetime64_object(self): + # GH#28286 cython semantics for __rsub__, `other` is actually + # the Timestamp + # TODO(cython3): remove this, this moved to __rsub__ + return type(other)(self) - other return NotImplemented diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 2108fa0f35547..a354765a348ec 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -10,7 +10,7 @@ from pandas._typing import npt # tz_convert_from_utc_single exposed for testing def tz_convert_from_utc_single( - utc_val: np.int64, tz: tzinfo, creso: int = ... + val: np.int64, tz: tzinfo, creso: int = ... ) -> np.int64: ... def tz_localize_to_utc( vals: npt.NDArray[np.int64], diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index de19f592da62b..3fd9e2501e611 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -31,7 +31,7 @@ def get_resolution( reso: int = ..., # NPY_DATETIMEUNIT ) -> Resolution: ... def ints_to_pydatetime( - stamps: npt.NDArray[np.int64], + arr: npt.NDArray[np.int64], tz: tzinfo | None = ..., box: str = ..., reso: int = ..., # NPY_DATETIMEUNIT diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..b926a7cb73425 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -111,8 +111,8 @@ def ewm( com: float, # float64_t adjust: bool, ignore_na: bool, - deltas: np.ndarray | None = None, # const float64_t[:] - normalize: bool = True, + deltas: np.ndarray, # const float64_t[:] + normalize: bool, ) -> np.ndarray: ... 
# np.ndarray[np.float64] def ewmcov( input_x: np.ndarray, # const float64_t[:] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2b43b090a43e0..c3b8d1c0e79e8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2260,7 +2260,8 @@ def _concat_same_type( return new_obj def copy(self, order: str = "C") -> Self: - new_obj = super().copy(order=order) + # error: Unexpected keyword argument "order" for "copy" + new_obj = super().copy(order=order) # type: ignore[call-arg] new_obj._freq = self.freq return new_obj diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f3af21e839fdb..be0d046697ba9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7385,6 +7385,15 @@ def ffill( dtype: float64 """ downcast = self._deprecate_downcast(downcast, "ffill") + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) return self._pad_or_backfill( "ffill", @@ -7523,6 +7532,15 @@ def bfill( 3 4.0 7.0 """ downcast = self._deprecate_downcast(downcast, "bfill") + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) return self._pad_or_backfill( "bfill", axis=axis, @@ -8047,6 +8065,16 @@ def interpolate( raise ValueError("downcast must be either None or 'infer'") inplace = validate_bool_kwarg(inplace, "inplace") + + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + axis = self._get_axis_number(axis) if self.empty: @@ -8619,6 +8647,15 @@ def clip( """ inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + axis = nv.validate_clip_with_axis(axis, (), kwargs) if axis is not None: axis = self._get_axis_number(axis) @@ -10500,6 +10537,15 @@ def where( 3 True True 4 True True """ + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) other = common.apply_if_callable(other, self) return self._where(cond, other, inplace, axis, level) @@ -10558,6 +10604,15 @@ def mask( level: Level | None = None, ) -> Self | None: inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + cond = common.apply_if_callable(cond, self) # see gh-21891 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 69fa711d9e375..ecb9cd47d7995 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,6 +11,7 @@ final, ) import warnings +import weakref import numpy as np @@ -317,7 +318,7 @@ def take_block_columns(self, indices: npt.NDArray[np.intp]) -> Self: @final def getitem_block_columns( - self, slicer: slice, 
new_mgr_locs: BlockPlacement + self, slicer: slice, new_mgr_locs: BlockPlacement, ref_inplace_op: bool = False ) -> Self: """ Perform __getitem__-like, return result as block. @@ -325,7 +326,8 @@ def getitem_block_columns( Only supports slices that preserve dimensionality. """ new_values = self._slice(slicer) - return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) + refs = self.refs if not ref_inplace_op or self.refs.has_reference() else None + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final def _can_hold_element(self, element: Any) -> bool: @@ -839,7 +841,7 @@ def replace_list( if inplace: masks = list(masks) - if using_cow and inplace: + if using_cow: # Don't set up refs here, otherwise we will think that we have # references when we check again later rb = [self] @@ -872,6 +874,17 @@ def replace_list( regex=regex, using_cow=using_cow, ) + + if using_cow and i != src_len: + # This is ugly, but we have to get rid of intermediate refs + # that did not go out of scope yet, otherwise we will trigger + # many unnecessary copies + for b in result: + ref = weakref.ref(b) + b.refs.referenced_blocks.pop( + b.refs.referenced_blocks.index(ref) + ) + if convert and blk.is_object and not all(x is None for x in dest_list): # GH#44498 avoid unwanted cast-back result = extend_blocks( @@ -936,7 +949,7 @@ def _replace_coerce( putmask_inplace(nb.values, mask, value) return [nb] if using_cow: - return [self.copy(deep=False)] + return [self] return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1533dc77321cb..7bb579b22aeed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -370,8 +370,30 @@ def setitem(self, indexer, value) -> Self: raise ValueError(f"Cannot set values with ndim > {self.ndim}") if using_copy_on_write() and not self._has_no_reference(0): - # if being referenced -> perform Copy-on-Write and clear the reference # this method is only called if there is a single block -> hardcoded 0 + # Split blocks to only copy the columns we want to modify + if self.ndim == 2 and isinstance(indexer, tuple): + blk_loc = self.blklocs[indexer[1]] + if is_list_like(blk_loc) and blk_loc.ndim == 2: + blk_loc = np.squeeze(blk_loc, axis=0) + elif not is_list_like(blk_loc): + # Keep dimension and copy data later + blk_loc = [blk_loc] # type: ignore[assignment] + if len(blk_loc) == 0: + return self.copy(deep=False) + + values = self.blocks[0].values + if values.ndim == 2: + values = values[blk_loc] + # "T" has no attribute "_iset_split_block" + self._iset_split_block( # type: ignore[attr-defined] + 0, blk_loc, values + ) + # first block equals values + self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + return self + # No need to split if we either set all columns or on a single block + # manager self = self.copy() return self.apply("setitem", indexer=indexer, value=value) @@ -673,6 +695,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, + ref_inplace_op: bool = False, ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -688,6 +711,8 @@ def _slice_take_blocks_ax0( This is used when called from ops.blockwise.operate_blockwise. use_na_proxy : bool, default False Whether to use a np.void ndarray for newly introduced columns. 
+ ref_inplace_op: bool, default False + Don't track refs if True because we operate inplace Returns ------- @@ -718,7 +743,9 @@ def _slice_take_blocks_ax0( # views instead of copies blocks = [ blk.getitem_block_columns( - slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i) + slice(ml, ml + 1), + new_mgr_locs=BlockPlacement(i), + ref_inplace_op=ref_inplace_op, ) for i, ml in enumerate(slobj) ] @@ -1380,7 +1407,7 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] return type(self)(tuple(nbs), axes, verify_integrity=False) diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 30140ed4ddb6d..6a27a0633a4aa 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -68,3 +68,16 @@ def test_clip_no_op(using_copy_on_write): assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_clip_chained_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 4, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 08e49aa3813d8..5c177465d2fa4 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -80,11 +80,21 @@ def test_delete(using_copy_on_write): ) del df["b"] if using_copy_on_write: - # TODO: This should not have references, delete makes a shallow copy - # but keeps the blocks alive - assert df._mgr.blocks[0].refs.has_reference() - assert df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[0].refs.has_reference() + assert not df._mgr.blocks[1].refs.has_reference() df = df[["a"]] if using_copy_on_write: assert not df._mgr.blocks[0].refs.has_reference() + + +def test_delete_reference(using_copy_on_write): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] + ) + x = df[:] + del df["b"] + if using_copy_on_write: + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert x._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index c58aeecad22e3..ebb25bd5c57d3 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1028,16 +1028,15 @@ def test_dataframe_add_column_from_series(backend, using_copy_on_write): (tm.iloc, (slice(None), 0)), ], ) +@pytest.mark.parametrize( + "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"] +) def test_set_value_copy_only_necessary_column( - using_copy_on_write, - indexer_func, - indexer, - val, + using_copy_on_write, indexer_func, indexer, val, col ): # When setting inplace, only copy column that is modified instead of the whole # block (by splitting the block) - # TODO multi-block only for now - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + 
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col}) df_orig = df.copy() view = df[:] diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index cfaae644af39a..5507e81d04e2a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -361,3 +361,17 @@ def test_fillna_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].fillna(100, inplace=True) tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"]) +def test_interpolate_chained_assignment(using_copy_on_write, func): + df = DataFrame({"a": [1, np.nan, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index dbdd832f34aa4..fe1be2d8b6a0a 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -489,7 +489,8 @@ def test_shift_no_op(using_copy_on_write): df.iloc[0, 0] = 0 if using_copy_on_write: - assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) tm.assert_frame_equal(df2, df_orig) @@ -532,16 +533,16 @@ def test_shift_columns(using_copy_on_write): df2 = df.shift(periods=1, axis=1) assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")) - df.iloc[0, 1] = 0 + df.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory( get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") ) - expected = DataFrame( - [[np.nan, 1], [np.nan, 3], [np.nan, 5]], - columns=date_range("2020-01-01", "2020-01-02"), - ) - tm.assert_frame_equal(df2, expected) + expected = DataFrame( + [[np.nan, 1], [np.nan, 3], [np.nan, 5]], + columns=date_range("2020-01-01", "2020-01-02"), + ) + tm.assert_frame_equal(df2, expected) def test_pop(using_copy_on_write): @@ -1335,13 +1336,18 @@ def test_droplevel(using_copy_on_write): if using_copy_on_write: assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 0] = 0 - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) @@ -1518,6 +1524,20 @@ def test_where_mask_noop_on_single_column(using_copy_on_write, dtype, val, func) tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("func", ["mask", "where"]) +def test_chained_where_mask(using_copy_on_write, func): + df = DataFrame({"a": [1, 4, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], 
func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 72e15281e5986..085f355dc4377 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -333,8 +333,7 @@ def test_replace_list_multiple_elements_inplace(using_copy_on_write): arr = get_array(df, "a") df.replace([1, 2], 4, inplace=True) if using_copy_on_write: - # TODO(CoW): This should share memory - assert not np.shares_memory(arr, get_array(df, "a")) + assert np.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) else: assert np.shares_memory(arr, get_array(df, "a")) @@ -396,3 +395,38 @@ def test_replace_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].replace(1, 100, inplace=True) tm.assert_frame_equal(df, df_orig) + + +def test_replace_listlike(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + df_orig = df.copy() + + result = df.replace([200, 201], [11, 11]) + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + tm.assert_frame_equal(df, df) + + result = df.replace([200, 2], [10, 10]) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_listlike_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + arr = get_array(df, "a") + df.replace([200, 2], [10, 11], inplace=True) + assert np.shares_memory(get_array(df, "a"), arr) + + view = df[:] + df_orig = df.copy() + df.replace([200, 3], [10, 11], inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr) + tm.assert_frame_equal(df, view) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index fa7a49c164913..291a79815a81c 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td from pandas import ( @@ -370,21 +371,31 @@ def test_interp_inplace(self, using_copy_on_write): expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) expected_cow = df.copy() result = df.copy() - return_value = result["a"].interpolate(inplace=True) - assert return_value is None + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected_cow) else: + return_value = result["a"].interpolate(inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) result = df.copy() msg = "The 'downcast' keyword in Series.interpolate is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = result["a"].interpolate(inplace=True, downcast="infer") - assert return_value is None + if using_copy_on_write: + with tm.assert_produces_warning( + (FutureWarning, ChainedAssignmentError), match=msg + ): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert 
return_value is None tm.assert_frame_equal(result, expected_cow) else: + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None tm.assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b260d5757767..24a550b4602a6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3159,3 +3159,29 @@ def test_groupby_series_with_datetimeindex_month_name(): expected = Series([2, 1], name="jan") expected.index.name = "jan" tm.assert_series_equal(result, expected) + + +def test_rolling_corr_grouping_column_contains_tuples_1(df): + # GH 44078 + df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_rolling_corr_grouping_column_contains_tuples_2(df): + # GH 44078 + df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]}) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples( + [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] + ) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b314c20568f64..abbf22a7fc70a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2646,6 +2646,13 @@ def test_loc_indexer_all_false_broadcast(self): df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"] tm.assert_frame_equal(df, expected) + def test_loc_indexer_length_one(self): + # GH#51435 + df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object) + expected = DataFrame({"a": ["y"], "b": ["y"]}, dtype=object) + df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"] + tm.assert_frame_equal(df, expected) + class TestLocListlike: @pytest.mark.parametrize("box", [lambda x: x, np.asarray, list]) diff --git a/pyproject.toml b/pyproject.toml index ae658329f42ee..1034196baa15e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "meson-python==0.13.1", "meson==1.0.1", "wheel", - "Cython>=3.0.0", # Note: sync with setup.py, environment.yml and asv.conf.json + "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4af4351413a5b..0944acbc36c9b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ pip versioneer[toml] -cython==3.0.0 +cython==0.29.33 meson[ninja]==1.0.1 meson-python==0.13.1 pytest>=7.3.2 diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index 35cbbef08124e..dedcdb5532593 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -47,8 +47,6 @@ # stubtest might be too sensitive "pandas._libs.lib.NoDefault", "pandas._libs.lib._NoDefault.no_default", - # stubtest/Cython is not
recognizing the default value for the dtype parameter - "pandas._libs.lib.map_infer_mask", # internal type alias (should probably be private) "pandas._libs.lib.ndarray_obj_2d", # runtime argument "owner" has a default value but stub argument does not diff --git a/setup.py b/setup.py index 1ea7a502505b5..663bbd3952eab 100755 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def is_platform_mac(): # note: sync with pyproject.toml, environment.yml and asv.conf.json -min_cython_ver = "3.0.0" +min_cython_ver = "0.29.33" try: from Cython import ( From c759525cca82efb6b047160cf9e77b8dcd94b061 Mon Sep 17 00:00:00 2001 From: Omar Elbaz Date: Fri, 11 Aug 2023 14:48:15 -0400 Subject: [PATCH 6/8] move test to tests/window --- pandas/tests/groupby/test_groupby.py | 28 +--------------------------- pandas/tests/window/test_groupby.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 24a550b4602a6..2800151c3294d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3158,30 +3158,4 @@ def test_groupby_series_with_datetimeindex_month_name(): result = s.groupby(s).count() expected = Series([2, 1], name="jan") expected.index.name = "jan" - tm.assert_series_equal(result, expected) - - -def test_rolling_corr_grouping_column_contains_tuples_1(df): - # GH 44078 - df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_rolling_corr_grouping_column_contains_tuples_2(df): - # GH 44078 - df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples( - [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] - ) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index eb4fc06aa523e..57df666554abb 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1230,3 +1230,22 @@ def test_dont_mutate_obj_after_slicing(self): # This is the key test result = grp.count() tm.assert_frame_equal(result, expected_df) + + +def test_rolling_corr_with_single_integer_in_index(): + # GH 44078 + df = DataFrame({'a': [(1,), (1,), (1,)], 'b': [4, 5, 6]}) + gb = df.groupby(['a']) + result = (gb.rolling(2).corr(other=df)) + index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=['a',None]) + expected = DataFrame({'a': [np.nan, np.nan, np.nan], 'b': [np.nan, 1.0, 1.0]}, index=index) + tm.assert_frame_equal(result,expected) + +def test_rolling_corr_with_tuples_in_index(): + # GH 44078 + df = DataFrame({'a': [(1,2,), (1,2,), (1,2,)], 'b': [4, 5, 6]}) + gb = df.groupby(['a']) + result = (gb.rolling(2).corr(other=df)) + index = MultiIndex.from_tuples([((1,2), 0), ((1,2), 1), ((1,2), 2)], names=['a',None]) + expected = DataFrame({'a': [np.nan, np.nan, np.nan], 'b': [np.nan, 1.0, 1.0]}, index=index) + tm.assert_frame_equal(result,expected) \ No newline at end of file From
af03fddef9df43c4678e6de382f944e42bc6c68d Mon Sep 17 00:00:00 2001 From: Omar Elbaz Date: Fri, 11 Aug 2023 15:05:21 -0400 Subject: [PATCH 7/8] move unit tests --- pandas/tests/groupby/test_groupby.py | 30 ---------------------------- 1 file changed, 30 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 835b32e9084f4..9b260d5757767 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3158,34 +3158,4 @@ def test_groupby_series_with_datetimeindex_month_name(): result = s.groupby(s).count() expected = Series([2, 1], name="jan") expected.index.name = "jan" -<<<<<<< HEAD tm.assert_series_equal(result, expected) -======= - tm.assert_series_equal(result, expected) - - -def test_rolling_corr_grouping_column_contains_tuples_1(df): - # GH 44078 - df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_rolling_corr_grouping_column_contains_tuples_2(df): - # GH 44078 - df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples( - [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] - ) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) ->>>>>>> 49ca01ba9023b677f2b2d1c42e99f45595258b74 From 215d57361260ea41527ca528e720e365984eea89 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Aug 2023 19:31:51 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/window/test_groupby.py | 51 +++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 57df666554abb..ab00e18fc4812 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1234,18 +1234,43 @@ def test_dont_mutate_obj_after_slicing(self): def test_rolling_corr_with_single_integer_in_index(): # GH 44078 - df = DataFrame({'a': [(1,), (1,), (1,)], 'b': [4, 5, 6]}) - gb = df.groupby(['a']) - result = (gb.rolling(2).corr(other=df)) - index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=['a',None]) - expected = DataFrame({'a': [np.nan, np.nan, np.nan], 'b': [np.nan, 1.0, 1.0]}, index=index) - tm.assert_frame_equal(result,expected) - + df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + def test_rolling_corr_with_tuples_in_index(): # GH 44078 - df = DataFrame({'a': [(1,2,), (1,2,), (1,2,)], 'b': [4, 5, 6]}) - gb = df.groupby(['a']) - result = (gb.rolling(2).corr(other=df)) - index = MultiIndex.from_tuples([((1,2), 0), ((1,2), 1), ((1,2), 2)], names=['a',None]) - expected = DataFrame({'a': [np.nan, np.nan, np.nan], 'b': [np.nan, 1.0, 1.0]},
index=index) - tm.assert_frame_equal(result,expected) \ No newline at end of file + df = DataFrame( + { + "a": [ + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ], + "b": [4, 5, 6], + } + ) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples( + [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] + ) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected)
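For reference, the behavior these two tests pin down can be exercised directly; a short usage sketch (run against a build containing the GH#44078 fix that this series tests):

import pandas as pd

# The grouping column holds tuples; per GH#44078 this combination used to
# fail inside groupby(...).rolling(...).corr instead of returning the
# MultiIndexed frame asserted in the tests above.
df = pd.DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]})
result = df.groupby(["a"]).rolling(2).corr(other=df)
print(result)
# rows indexed by MultiIndex ((1, 2), 0..2); column "a" is all NaN and
# column "b" is [NaN, 1.0, 1.0], matching the `expected` frames above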