From 017dafeca580b9d9bc9789f908af1889cd6de49e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 09:32:40 -0800 Subject: [PATCH 1/8] PERF: Return RangeIndex from RangeIndex.reindex when possible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/range.py | 4 ++++ pandas/tests/indexes/ranges/test_range.py | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8bb051b6228ce..664b63b660ca1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -252,6 +252,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) +- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8a2e3fbf500a4..38222e602ff84 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -487,6 +487,10 @@ def _view(self) -> Self: result._cache = self._cache return result + def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + target = self._shallow_copy(target._values) + return super()._wrap_reindex_result(target, indexer, preserve_names) + @doc(Index.copy) def copy(self, name: Hashable | None = None, deep: bool = False) -> Self: name = self._validate_names(name=name, deep=deep)[0] diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index d500687763a1e..e1c7c7924cf3c 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -608,6 +608,16 @@ def test_range_index_rsub_by_const(self): tm.assert_index_equal(result, expected) +def test_reindex_returns_rangeindex(): + ri = RangeIndex(2, name="foo") + result, result_indexer = ri.reindex([1, 2, 3]) + expected = RangeIndex(1, 4, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1, -1, -1]) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + def test_take_return_rangeindex(): ri = RangeIndex(5, name="foo") result = ri.take([]) From ab72067ff8bf3c72eb7483d1f200a18d345368cc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:15:19 -0800 Subject: [PATCH 2/8] Add whatsnew number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 664b63b660ca1..3416cdd107db5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -252,7 +252,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index 
(:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) -- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) +- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) From 7d6fc289d144760f31b59bd5673a68e5bbefbf0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:24:29 -0800 Subject: [PATCH 3/8] Only if index --- pandas/core/indexes/range.py | 6 ++++-- pandas/tests/indexing/test_loc.py | 5 +---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 38222e602ff84..1cd5f79cf70ea 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,6 +29,7 @@ doc, ) +from pandas.core.dtypes import missing from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, @@ -472,7 +473,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype diff = values[1] - values[0] - if diff != 0: + if not missing.isna(diff) and diff != 0: maybe_range_indexer, remainder = np.divmod(values - values[0], diff) if ( lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) @@ -488,7 +489,8 @@ def _view(self) -> Self: return result def _wrap_reindex_result(self, target, indexer, preserve_names: bool): - target = self._shallow_copy(target._values) + if target.dtype.kind == "i": + target = self._shallow_copy(target._values) return super()._wrap_reindex_result(target, indexer, preserve_names) @doc(Index.copy) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7208c688bd217..9c33d15c01cd6 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1204,10 +1204,7 @@ def test_loc_setitem_empty_append_raises(self): data = [1, 2] df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) - msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " - r"are in the \[index\]" - ) + msg = r"None of .*Index.* are in the \[index\]" with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data From b791bbfa743b00b4e2f64e4faff826ed1c76fff4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:19:32 -0800 Subject: [PATCH 4/8] add name --- pandas/core/indexes/range.py | 2 +- pandas/tests/frame/methods/test_reindex.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 1cd5f79cf70ea..01f4d710c3451 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -490,7 +490,7 @@ def _view(self) -> Self: def _wrap_reindex_result(self, target, indexer, 
preserve_names: bool): if target.dtype.kind == "i": - target = self._shallow_copy(target._values) + target = self._shallow_copy(target._values, name=target.name) return super()._wrap_reindex_result(target, indexer, preserve_names) @doc(Index.copy) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 45109991c4553..013b3290a6a22 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -658,15 +658,15 @@ def test_reindex_nan(self): tm.assert_frame_equal(left, right) def test_reindex_name_remains(self): - s = Series(np.random.default_rng(2).random(10)) - df = DataFrame(s, index=np.arange(len(s))) - i = Series(np.arange(10), name="iname") + # s = Series(np.random.default_rng(2).random(10)) + # df = DataFrame(s, index=np.arange(len(s))) + # i = Series(np.arange(10), name="iname") - df = df.reindex(i) - assert df.index.name == "iname" + # df = df.reindex(i) + # assert df.index.name == "iname" - df = df.reindex(Index(np.arange(10), name="tmpname")) - assert df.index.name == "tmpname" + # df = df.reindex(Index(np.arange(10), name="tmpname")) + # assert df.index.name == "tmpname" s = Series(np.random.default_rng(2).random(10)) df = DataFrame(s.T, index=np.arange(len(s))) From 5beaa225f488dcfa4b1ac661660a63d57dd08cc2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:22:33 -0800 Subject: [PATCH 5/8] Skip for type self, undo test --- pandas/core/indexes/range.py | 2 +- pandas/tests/frame/methods/test_reindex.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 01f4d710c3451..42b8e6d1cb066 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -489,7 +489,7 @@ def _view(self) -> Self: return result def _wrap_reindex_result(self, target, indexer, preserve_names: bool): - if target.dtype.kind == "i": + if not isinstance(target, type(self)) and target.dtype.kind == "i": target = self._shallow_copy(target._values, name=target.name) return super()._wrap_reindex_result(target, indexer, preserve_names) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 013b3290a6a22..45109991c4553 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -658,15 +658,15 @@ def test_reindex_nan(self): tm.assert_frame_equal(left, right) def test_reindex_name_remains(self): - # s = Series(np.random.default_rng(2).random(10)) - # df = DataFrame(s, index=np.arange(len(s))) - # i = Series(np.arange(10), name="iname") + s = Series(np.random.default_rng(2).random(10)) + df = DataFrame(s, index=np.arange(len(s))) + i = Series(np.arange(10), name="iname") - # df = df.reindex(i) - # assert df.index.name == "iname" + df = df.reindex(i) + assert df.index.name == "iname" - # df = df.reindex(Index(np.arange(10), name="tmpname")) - # assert df.index.name == "tmpname" + df = df.reindex(Index(np.arange(10), name="tmpname")) + assert df.index.name == "tmpname" s = Series(np.random.default_rng(2).random(10)) df = DataFrame(s.T, index=np.arange(len(s))) From ea3c613e681e7b2270338902e8ae50c5d31984c9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 18:13:02 -0800 Subject: [PATCH 6/8] Use intp --- pandas/tests/indexes/ranges/test_range.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index e1c7c7924cf3c..35ad581bfd967 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -614,7 +614,7 @@ def test_reindex_returns_rangeindex(): expected = RangeIndex(1, 4, name="foo") tm.assert_index_equal(result, expected, exact=True) - expected_indexer = np.array([1, -1, -1]) + expected_indexer = np.array([1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result_indexer, expected_indexer) From 370d20cb0ca46009c439e1dc9d6e805196df5271 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Mar 2024 17:03:05 -0800 Subject: [PATCH 7/8] merge --- .circleci/config.yml | 4 +- .pre-commit-config.yaml | 13 ++- ci/code_checks.sh | 35 -------- doc/source/whatsnew/v3.0.0.rst | 3 +- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable.pyx | 4 - pandas/_libs/hashtable_class_helper.pxi.in | 36 +++----- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- .../src/vendored/ujson/python/objToJSON.c | 4 +- pandas/core/frame.py | 5 ++ pandas/core/groupby/generic.py | 52 +---------- pandas/core/indexes/base.py | 1 - pandas/core/indexes/range.py | 36 ++++++++ pandas/core/internals/construction.py | 10 ++- pandas/io/excel/_base.py | 30 ++++--- pandas/tests/groupby/test_groupby.py | 20 +++++ pandas/tests/groupby/test_reductions.py | 10 --- pandas/tests/indexes/ranges/test_join.py | 52 +++++++++++ pandas/tests/io/pytables/test_timezones.py | 2 +- pandas/tests/scalar/timestamp/test_formats.py | 2 +- pyproject.toml | 3 + scripts/tests/test_use_pd_array_in_core.py | 26 ------ scripts/use_pd_array_in_core.py | 80 ----------------- web/pandas/community/ecosystem.md | 87 ------------------- 24 files changed, 167 insertions(+), 352 deletions(-) delete mode 100644 scripts/tests/test_use_pd_array_in_core.py delete mode 100644 scripts/use_pd_array_in_core.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 90afb1ce29684..ea93575ac9430 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,7 +3,7 @@ version: 2.1 jobs: test-arm: machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: ENV_FILE: ci/deps/circle-310-arm64.yaml @@ -46,7 +46,7 @@ jobs: cibw-build: type: string machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: TRIGGER_SOURCE: << pipeline.trigger_source >> diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e683fc50c1c5d..201820c6a8b28 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,12 @@ repos: files: ^pandas exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] + - id: ruff + name: ruff-use-pd_array-in-core + alias: ruff-use-pd_array-in-core + files: ^pandas/core/ + exclude: ^pandas/core/api\.py$ + args: [--select, "ICN001", --exit-non-zero-on-fix] - id: ruff-format exclude: ^scripts - repo: https://github.com/jendrikseipp/vulture @@ -272,13 +278,6 @@ repos: language: python entry: python scripts/validate_unwanted_patterns.py --validation-type="nodefault_used_not_only_for_typing" types: [python] - - id: use-pd_array-in-core - name: Import pandas.array as pd_array in core - language: python - entry: python scripts/use_pd_array_in_core.py - files: ^pandas/core/ - exclude: ^pandas/core/api\.py$ - types: [python] - id: no-return-exception name: Use raise instead of return for exceptions language: 
pygrep diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 998e48d96d6b3..5bbad800b7aa9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -173,14 +173,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.Timestamp.tzinfo\ pandas.Timestamp.value\ pandas.Timestamp.year\ - pandas.tseries.offsets.BQuarterBegin.is_anchored\ pandas.tseries.offsets.BQuarterBegin.is_on_offset\ pandas.tseries.offsets.BQuarterBegin.n\ pandas.tseries.offsets.BQuarterBegin.nanos\ pandas.tseries.offsets.BQuarterBegin.normalize\ pandas.tseries.offsets.BQuarterBegin.rule_code\ pandas.tseries.offsets.BQuarterBegin.startingMonth\ - pandas.tseries.offsets.BQuarterEnd.is_anchored\ pandas.tseries.offsets.BQuarterEnd.is_on_offset\ pandas.tseries.offsets.BQuarterEnd.n\ pandas.tseries.offsets.BQuarterEnd.nanos\ @@ -278,7 +276,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.Easter.rule_code\ pandas.tseries.offsets.FY5253.get_rule_code_suffix\ pandas.tseries.offsets.FY5253.get_year_end\ - pandas.tseries.offsets.FY5253.is_anchored\ pandas.tseries.offsets.FY5253.is_on_offset\ pandas.tseries.offsets.FY5253.n\ pandas.tseries.offsets.FY5253.nanos\ @@ -289,7 +286,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.FY5253.weekday\ pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix\ pandas.tseries.offsets.FY5253Quarter.get_weeks\ - pandas.tseries.offsets.FY5253Quarter.is_anchored\ pandas.tseries.offsets.FY5253Quarter.is_on_offset\ pandas.tseries.offsets.FY5253Quarter.n\ pandas.tseries.offsets.FY5253Quarter.nanos\ @@ -342,14 +338,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.Nano.n\ pandas.tseries.offsets.Nano.normalize\ pandas.tseries.offsets.Nano.rule_code\ - pandas.tseries.offsets.QuarterBegin.is_anchored\ pandas.tseries.offsets.QuarterBegin.is_on_offset\ pandas.tseries.offsets.QuarterBegin.n\ pandas.tseries.offsets.QuarterBegin.nanos\ pandas.tseries.offsets.QuarterBegin.normalize\ pandas.tseries.offsets.QuarterBegin.rule_code\ pandas.tseries.offsets.QuarterBegin.startingMonth\ - pandas.tseries.offsets.QuarterEnd.is_anchored\ pandas.tseries.offsets.QuarterEnd.is_on_offset\ pandas.tseries.offsets.QuarterEnd.n\ pandas.tseries.offsets.QuarterEnd.nanos\ @@ -379,7 +373,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.Tick.n\ pandas.tseries.offsets.Tick.normalize\ pandas.tseries.offsets.Tick.rule_code\ - pandas.tseries.offsets.Week.is_anchored\ pandas.tseries.offsets.Week.is_on_offset\ pandas.tseries.offsets.Week.n\ pandas.tseries.offsets.Week.nanos\ @@ -1500,75 +1493,62 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.BQuarterEnd.name\ pandas.tseries.offsets.BYearBegin.copy\ pandas.tseries.offsets.BYearBegin.freqstr\ - pandas.tseries.offsets.BYearBegin.is_anchored\ pandas.tseries.offsets.BYearBegin.kwds\ pandas.tseries.offsets.BYearBegin.name\ pandas.tseries.offsets.BYearEnd.copy\ pandas.tseries.offsets.BYearEnd.freqstr\ - pandas.tseries.offsets.BYearEnd.is_anchored\ pandas.tseries.offsets.BYearEnd.kwds\ pandas.tseries.offsets.BYearEnd.name\ pandas.tseries.offsets.BusinessDay\ pandas.tseries.offsets.BusinessDay.copy\ pandas.tseries.offsets.BusinessDay.freqstr\ - pandas.tseries.offsets.BusinessDay.is_anchored\ pandas.tseries.offsets.BusinessDay.kwds\ pandas.tseries.offsets.BusinessDay.name\ pandas.tseries.offsets.BusinessHour\ pandas.tseries.offsets.BusinessHour.copy\ pandas.tseries.offsets.BusinessHour.freqstr\ - 
pandas.tseries.offsets.BusinessHour.is_anchored\ pandas.tseries.offsets.BusinessHour.kwds\ pandas.tseries.offsets.BusinessHour.name\ pandas.tseries.offsets.BusinessMonthBegin.copy\ pandas.tseries.offsets.BusinessMonthBegin.freqstr\ - pandas.tseries.offsets.BusinessMonthBegin.is_anchored\ pandas.tseries.offsets.BusinessMonthBegin.kwds\ pandas.tseries.offsets.BusinessMonthBegin.name\ pandas.tseries.offsets.BusinessMonthEnd.copy\ pandas.tseries.offsets.BusinessMonthEnd.freqstr\ - pandas.tseries.offsets.BusinessMonthEnd.is_anchored\ pandas.tseries.offsets.BusinessMonthEnd.kwds\ pandas.tseries.offsets.BusinessMonthEnd.name\ pandas.tseries.offsets.CDay\ pandas.tseries.offsets.CustomBusinessDay\ pandas.tseries.offsets.CustomBusinessDay.copy\ pandas.tseries.offsets.CustomBusinessDay.freqstr\ - pandas.tseries.offsets.CustomBusinessDay.is_anchored\ pandas.tseries.offsets.CustomBusinessDay.kwds\ pandas.tseries.offsets.CustomBusinessDay.name\ pandas.tseries.offsets.CustomBusinessHour\ pandas.tseries.offsets.CustomBusinessHour.copy\ pandas.tseries.offsets.CustomBusinessHour.freqstr\ - pandas.tseries.offsets.CustomBusinessHour.is_anchored\ pandas.tseries.offsets.CustomBusinessHour.kwds\ pandas.tseries.offsets.CustomBusinessHour.name\ pandas.tseries.offsets.CustomBusinessMonthBegin.copy\ pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr\ - pandas.tseries.offsets.CustomBusinessMonthBegin.is_anchored\ pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset\ pandas.tseries.offsets.CustomBusinessMonthBegin.kwds\ pandas.tseries.offsets.CustomBusinessMonthBegin.name\ pandas.tseries.offsets.CustomBusinessMonthEnd.copy\ pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr\ - pandas.tseries.offsets.CustomBusinessMonthEnd.is_anchored\ pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset\ pandas.tseries.offsets.CustomBusinessMonthEnd.kwds\ pandas.tseries.offsets.CustomBusinessMonthEnd.name\ pandas.tseries.offsets.DateOffset.copy\ pandas.tseries.offsets.DateOffset.freqstr\ - pandas.tseries.offsets.DateOffset.is_anchored\ pandas.tseries.offsets.DateOffset.kwds\ pandas.tseries.offsets.DateOffset.name\ pandas.tseries.offsets.Day.copy\ pandas.tseries.offsets.Day.freqstr\ - pandas.tseries.offsets.Day.is_anchored\ pandas.tseries.offsets.Day.kwds\ pandas.tseries.offsets.Day.name\ pandas.tseries.offsets.Day.nanos\ pandas.tseries.offsets.Easter.copy\ pandas.tseries.offsets.Easter.freqstr\ - pandas.tseries.offsets.Easter.is_anchored\ pandas.tseries.offsets.Easter.kwds\ pandas.tseries.offsets.Easter.name\ pandas.tseries.offsets.FY5253.copy\ @@ -1581,47 +1561,39 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.FY5253Quarter.name\ pandas.tseries.offsets.Hour.copy\ pandas.tseries.offsets.Hour.freqstr\ - pandas.tseries.offsets.Hour.is_anchored\ pandas.tseries.offsets.Hour.kwds\ pandas.tseries.offsets.Hour.name\ pandas.tseries.offsets.Hour.nanos\ pandas.tseries.offsets.LastWeekOfMonth\ pandas.tseries.offsets.LastWeekOfMonth.copy\ pandas.tseries.offsets.LastWeekOfMonth.freqstr\ - pandas.tseries.offsets.LastWeekOfMonth.is_anchored\ pandas.tseries.offsets.LastWeekOfMonth.kwds\ pandas.tseries.offsets.LastWeekOfMonth.name\ pandas.tseries.offsets.Micro.copy\ pandas.tseries.offsets.Micro.freqstr\ - pandas.tseries.offsets.Micro.is_anchored\ pandas.tseries.offsets.Micro.kwds\ pandas.tseries.offsets.Micro.name\ pandas.tseries.offsets.Micro.nanos\ pandas.tseries.offsets.Milli.copy\ pandas.tseries.offsets.Milli.freqstr\ - pandas.tseries.offsets.Milli.is_anchored\ 
pandas.tseries.offsets.Milli.kwds\ pandas.tseries.offsets.Milli.name\ pandas.tseries.offsets.Milli.nanos\ pandas.tseries.offsets.Minute.copy\ pandas.tseries.offsets.Minute.freqstr\ - pandas.tseries.offsets.Minute.is_anchored\ pandas.tseries.offsets.Minute.kwds\ pandas.tseries.offsets.Minute.name\ pandas.tseries.offsets.Minute.nanos\ pandas.tseries.offsets.MonthBegin.copy\ pandas.tseries.offsets.MonthBegin.freqstr\ - pandas.tseries.offsets.MonthBegin.is_anchored\ pandas.tseries.offsets.MonthBegin.kwds\ pandas.tseries.offsets.MonthBegin.name\ pandas.tseries.offsets.MonthEnd.copy\ pandas.tseries.offsets.MonthEnd.freqstr\ - pandas.tseries.offsets.MonthEnd.is_anchored\ pandas.tseries.offsets.MonthEnd.kwds\ pandas.tseries.offsets.MonthEnd.name\ pandas.tseries.offsets.Nano.copy\ pandas.tseries.offsets.Nano.freqstr\ - pandas.tseries.offsets.Nano.is_anchored\ pandas.tseries.offsets.Nano.kwds\ pandas.tseries.offsets.Nano.name\ pandas.tseries.offsets.Nano.nanos\ @@ -1635,25 +1607,21 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.QuarterEnd.name\ pandas.tseries.offsets.Second.copy\ pandas.tseries.offsets.Second.freqstr\ - pandas.tseries.offsets.Second.is_anchored\ pandas.tseries.offsets.Second.kwds\ pandas.tseries.offsets.Second.name\ pandas.tseries.offsets.Second.nanos\ pandas.tseries.offsets.SemiMonthBegin\ pandas.tseries.offsets.SemiMonthBegin.copy\ pandas.tseries.offsets.SemiMonthBegin.freqstr\ - pandas.tseries.offsets.SemiMonthBegin.is_anchored\ pandas.tseries.offsets.SemiMonthBegin.kwds\ pandas.tseries.offsets.SemiMonthBegin.name\ pandas.tseries.offsets.SemiMonthEnd\ pandas.tseries.offsets.SemiMonthEnd.copy\ pandas.tseries.offsets.SemiMonthEnd.freqstr\ - pandas.tseries.offsets.SemiMonthEnd.is_anchored\ pandas.tseries.offsets.SemiMonthEnd.kwds\ pandas.tseries.offsets.SemiMonthEnd.name\ pandas.tseries.offsets.Tick.copy\ pandas.tseries.offsets.Tick.freqstr\ - pandas.tseries.offsets.Tick.is_anchored\ pandas.tseries.offsets.Tick.kwds\ pandas.tseries.offsets.Tick.name\ pandas.tseries.offsets.Tick.nanos\ @@ -1664,17 +1632,14 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.tseries.offsets.WeekOfMonth\ pandas.tseries.offsets.WeekOfMonth.copy\ pandas.tseries.offsets.WeekOfMonth.freqstr\ - pandas.tseries.offsets.WeekOfMonth.is_anchored\ pandas.tseries.offsets.WeekOfMonth.kwds\ pandas.tseries.offsets.WeekOfMonth.name\ pandas.tseries.offsets.YearBegin.copy\ pandas.tseries.offsets.YearBegin.freqstr\ - pandas.tseries.offsets.YearBegin.is_anchored\ pandas.tseries.offsets.YearBegin.kwds\ pandas.tseries.offsets.YearBegin.name\ pandas.tseries.offsets.YearEnd.copy\ pandas.tseries.offsets.YearEnd.freqstr\ - pandas.tseries.offsets.YearEnd.is_anchored\ pandas.tseries.offsets.YearEnd.kwds\ pandas.tseries.offsets.YearEnd.name\ pandas.util.hash_array\ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 130a6b6c834c1..23ae0f3ea1bc7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -191,6 +191,7 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. 
That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) +- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`) - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`) @@ -238,7 +239,6 @@ Removal of prior version deprecations/changes - Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`) - Unrecognized timezones when parsing strings to datetimes now raises a ``ValueError`` (:issue:`51477`) - .. --------------------------------------------------------------------------- .. _whatsnew_300.performance: @@ -255,6 +255,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index eaec9e8462450..22b923580c491 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -180,7 +180,7 @@ cdef class Vector: cdef bint external_view_exists cdef class Int64Vector(Vector): - cdef Int64VectorData *data + cdef Int64VectorData data cdef ndarray ao cdef resize(self) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..8250d0242c31f 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,8 +1,4 @@ cimport cython -from cpython.mem cimport ( - PyMem_Free, - PyMem_Malloc, -) from cpython.ref cimport ( Py_INCREF, PyObject, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 26dcf0b6c4ce3..629b6b42db852 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -204,15 +204,11 @@ cdef class {{name}}Vector(Vector): # Int64Vector is the only one we need exposed for other cython files. 
{{if dtype != 'int64'}} cdef: - {{name}}VectorData *data + {{name}}VectorData data ndarray ao {{endif}} def __cinit__(self): - self.data = <{{name}}VectorData *>PyMem_Malloc( - sizeof({{name}}VectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) @@ -223,11 +219,6 @@ cdef class {{name}}Vector(Vector): self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{c_type}}*>self.ao.data - def __dealloc__(self): - if self.data is not NULL: - PyMem_Free(self.data) - self.data = NULL - def __len__(self) -> int: return self.data.n @@ -243,13 +234,13 @@ cdef class {{name}}Vector(Vector): cdef void append(self, {{c_type}} x) noexcept: - if needs_resize(self.data): + if needs_resize(&self.data): if self.external_view_exists: raise ValueError("external reference but " "Vector.resize() needed") self.resize() - append_data_{{dtype}}(self.data, x) + append_data_{{dtype}}(&self.data, x) cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): @@ -260,12 +251,9 @@ cdef class {{name}}Vector(Vector): cdef class StringVector(Vector): cdef: - StringVectorData *data + StringVectorData data def __cinit__(self): - self.data = PyMem_Malloc(sizeof(StringVectorData)) - if not self.data: - raise MemoryError() self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) @@ -288,11 +276,7 @@ cdef class StringVector(Vector): self.data.data[i] = orig_data[i] def __dealloc__(self): - if self.data is not NULL: - if self.data.data is not NULL: - free(self.data.data) - PyMem_Free(self.data) - self.data = NULL + free(self.data.data) def __len__(self) -> int: return self.data.n @@ -313,10 +297,10 @@ cdef class StringVector(Vector): cdef void append(self, char *x) noexcept: - if needs_resize(self.data): + if needs_resize(&self.data): self.resize() - append_data_string(self.data, x) + append_data_string(&self.data, x) cdef extend(self, ndarray[object] x): for i in range(len(x)): @@ -652,7 +636,7 @@ cdef class {{name}}HashTable(HashTable): if return_inverse: labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data use_na_value = na_value is not None use_mask = mask is not None if not use_mask and use_result_mask: @@ -662,7 +646,7 @@ cdef class {{name}}HashTable(HashTable): raise NotImplementedError # pragma: no cover result_mask = UInt8Vector() - rmd = result_mask.data + rmd = &result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -846,7 +830,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud labels = np.empty(n, dtype=np.intp) - ud = uniques.data + ud = &uniques.data with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 336af306d410f..ca1b28b9442ca 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -472,7 +472,7 @@ def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: kh_{{ttype}}_t *table = kh_init_{{ttype}}() {{name}}Vector idx = {{name}}Vector() ndarray[{{c_type}}, ndim=1] arr - {{name}}VectorData *ud = idx.data + {{name}}VectorData *ud = &idx.data kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 74ca8ead3d936..fa91db5fe34e3 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ 
b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -74,7 +74,6 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; - PyArray_GetItemFunc *getitem; char **rowLabels; char **columnLabels; @@ -405,7 +404,6 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; @@ -492,7 +490,7 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); } npyarr->dataptr += npyarr->stride; diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 54cefabb6097a..d1d35506ad3a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6506,6 +6506,11 @@ def drop_duplicates( DataFrame or None DataFrame with duplicates removed or None if ``inplace=True``. + Notes + ------- + This method requires columns specified by ``subset`` to be of hashable type. + Passing unhashable columns will raise a ``TypeError``. + See Also -------- DataFrame.value_counts: Count unique combinations of columns. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ab5e8bbd4528c..9449e6d7abdec 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -62,10 +62,7 @@ ) import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.groupby import ( - base, - ops, -) +from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, @@ -373,32 +370,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) index=self._grouper.result_index, dtype=obj.dtype, ) - - if self._grouper.nkeys > 1: - return self._python_agg_general(func, *args, **kwargs) - - try: - return self._python_agg_general(func, *args, **kwargs) - except KeyError: - # KeyError raised in test_groupby.test_basic is bc the func does - # a dictionary lookup on group.name, but group name is not - # pinned in _python_agg_general, only in _aggregate_named - result = self._aggregate_named(func, *args, **kwargs) - - warnings.warn( - "Pinning the groupby key to each group in " - f"{type(self).__name__}.agg is deprecated, and cases that " - "relied on it will raise in a future version. 
" - "If your operation requires utilizing the groupby keys, " - "iterate over the groupby object instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # result is a dict whose keys are the elements of result_index - result = Series(result, index=self._grouper.result_index) - result = self._wrap_aggregated_output(result) - return result + return self._python_agg_general(func, *args, **kwargs) agg = aggregate @@ -527,26 +499,6 @@ def _wrap_applied_output( result.index = default_index(len(result)) return result - def _aggregate_named(self, func, *args, **kwargs): - # Note: this is very similar to _aggregate_series_pure_python, - # but that does not pin group.name - result = {} - initialized = False - - for name, group in self._grouper.get_iterator(self._obj_with_exclusions): - # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations - object.__setattr__(group, "name", name) - - output = func(group, *args, **kwargs) - output = ops.extract_result(output) - if not initialized: - # We only do this validation on the first iteration - ops.check_result_array(output, group.dtype) - initialized = True - result[name] = output - - return result - __examples_series_doc = dedent( """ >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c17e01b85fa84..0701bed7cd9a4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4588,7 +4588,6 @@ def _get_leaf_sorter( ) return join_index, left_indexer, right_indexer - @final def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d2289c9e046ec..09d635b53c482 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -56,6 +56,7 @@ from pandas._typing import ( Axis, Dtype, + JoinHow, NaPosition, Self, npt, @@ -896,6 +897,41 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_monotonic( + self, other: Index, how: JoinHow = "left" + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # This currently only gets called for the monotonic increasing case + if not isinstance(other, type(self)): + maybe_ri = self._shallow_copy(other._values) + if not isinstance(maybe_ri, type(self)): + return super()._join_monotonic(other, how=how) + other = maybe_ri + + if self.equals(other): + ret_index = other if how == "right" else self + return ret_index, None, None + + if how == "left": + join_index = self + lidx = None + ridx = other.get_indexer(join_index) + elif how == "right": + join_index = other + lidx = self.get_indexer(join_index) + ridx = None + elif how == "inner": + join_index = self.intersection(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + elif how == "outer": + join_index = self.union(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + # -------------------------------------------------------------------- # error: Return type "Index" of "delete" incompatible with return type diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index aab0f1c6dac3c..6bc3556902e80 100644 --- a/pandas/core/internals/construction.py +++ 
b/pandas/core/internals/construction.py @@ -250,10 +250,12 @@ def ndarray_to_mgr( elif isinstance(values, (np.ndarray, ExtensionArray)): # drop subclass info - _copy = ( - copy if (dtype is None or astype_is_view(values.dtype, dtype)) else False - ) - values = np.array(values, copy=_copy) + if copy and (dtype is None or astype_is_view(values.dtype, dtype)): + # only force a copy now if copy=True was requested + # and a subsequent `astype` will not already result in a copy + values = np.array(values, copy=True, order="F") + else: + values = np.array(values, copy=False) values = _ensure_2d(values) else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8a287beac7afd..c38ced573531e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -43,6 +43,7 @@ from pandas.core.dtypes.common import ( is_bool, + is_file_like, is_float, is_integer, is_list_like, @@ -1523,20 +1524,25 @@ def __init__( # Always a string self._io = stringify_path(path_or_buffer) - # Determine xlrd version if installed - if import_optional_dependency("xlrd", errors="ignore") is None: - xlrd_version = None - else: - import xlrd - - xlrd_version = Version(get_version(xlrd)) - if engine is None: # Only determine ext if it is needed - ext: str | None - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: + ext: str | None = None + + if not isinstance( + path_or_buffer, (str, os.PathLike, ExcelFile) + ) and not is_file_like(path_or_buffer): + # GH#56692 - avoid importing xlrd if possible + if import_optional_dependency("xlrd", errors="ignore") is None: + xlrd_version = None + else: + import xlrd + + xlrd_version = Version(get_version(xlrd)) + + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + + if ext is None: ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d02e22c29159f..686279f25939a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2945,3 +2945,23 @@ def test_decimal_na_sort(test_series): result = gb._grouper.result_index expected = Index([Decimal(1), None], name="key") tm.assert_index_equal(result, expected) + + +def test_groupby_dropna_with_nunique_unique(): + # GH#42016 + df = [[1, 1, 1, "A"], [1, None, 1, "A"], [1, None, 2, "A"], [1, None, 3, "A"]] + df_dropna = DataFrame(df, columns=["a", "b", "c", "partner"]) + result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg( + {"partner": ["nunique", "unique"]} + ) + + index = MultiIndex.from_tuples( + [(1, 1.0, 1), (1, np.nan, 1), (1, np.nan, 2), (1, np.nan, 3)], + names=["a", "b", "c"], + ) + columns = MultiIndex.from_tuples([("partner", "nunique"), ("partner", "unique")]) + expected = DataFrame( + [(1, ["A"]), (1, ["A"]), (1, ["A"]), (1, ["A"])], index=index, columns=columns + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index e304a5ae467d8..2037ded9f20e6 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -65,16 +65,6 @@ def test_basic_aggregations(dtype): with pytest.raises(pd.errors.SpecificationError, match=msg): grouped.aggregate({"one": np.mean, "two": np.std}) - group_constants = {0: 10, 1: 20, 2: 30} - msg = ( - "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, " - "and cases that relied on it will 
raise in a future version" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#41090 - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - assert agged[1] == 21 - # corner cases msg = "Must produce aggregated value" # exception raised is type Exception diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index 682b5c8def9ff..ca3af607c0a38 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Index, @@ -175,3 +176,54 @@ def test_join_self(self, join_type): index = RangeIndex(start=0, stop=20, step=2) joined = index.join(index, how=join_type) assert index is joined + + +@pytest.mark.parametrize( + "left, right, expected, expected_lidx, expected_ridx, how", + [ + [RangeIndex(2), RangeIndex(3), RangeIndex(2), None, [0, 1], "left"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "left"], + [RangeIndex(2), RangeIndex(20, 22), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], None, "right"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "right"], + [ + RangeIndex(2), + RangeIndex(20, 22), + RangeIndex(20, 22), + [-1, -1], + None, + "right", + ], + [RangeIndex(2), RangeIndex(3), RangeIndex(2), [0, 1], [0, 1], "inner"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "inner"], + [RangeIndex(2), RangeIndex(1, 3), RangeIndex(1, 2), [1], [0], "inner"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], [0, 1, 2], "outer"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "outer"], + [ + RangeIndex(2), + RangeIndex(2, 4), + RangeIndex(4), + [0, 1, -1, -1], + [-1, -1, 0, 1], + "outer", + ], + ], +) +@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))]) +def test_join_preserves_rangeindex( + left, right, expected, expected_lidx, expected_ridx, how, right_type +): + result, lidx, ridx = left.join(right_type(right), how=how, return_indexers=True) + tm.assert_index_equal(result, expected, exact=True) + + if expected_lidx is None: + assert lidx is expected_lidx + else: + exp_lidx = np.array(expected_lidx, dtype=np.intp) + tm.assert_numpy_array_equal(lidx, exp_lidx) + + if expected_ridx is None: + assert ridx is expected_ridx + else: + exp_ridx = np.array(expected_ridx, dtype=np.intp) + tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index b455235669636..9192804e49bd1 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -104,7 +104,7 @@ def test_append_with_timezones(setup_path, gettz): msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] " r"conflicts with new value \[(dateutil/.*)?EET\]" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 6a578b0a9eb09..b4493088acb31 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -89,7 +89,7 @@ def test_isoformat(ts, timespec, expected_iso): class TestTimestampRendering: @pytest.mark.parametrize( - "tz", ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + "tz", ["UTC", "Asia/Tokyo", "US/Eastern", 
"dateutil/America/Los_Angeles"] ) @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) @pytest.mark.parametrize( diff --git a/pyproject.toml b/pyproject.toml index 82a9d72c8cc74..5a06e22f4be9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -342,6 +342,9 @@ exclude = [ +[tool.ruff.lint.flake8-import-conventions.aliases] +"pandas.core.construction.array" = "pd_array" + [tool.ruff.per-file-ignores] # relative imports allowed for asv_bench "asv_bench/*" = ["TID", "NPY002"] diff --git a/scripts/tests/test_use_pd_array_in_core.py b/scripts/tests/test_use_pd_array_in_core.py deleted file mode 100644 index f58c92722caad..0000000000000 --- a/scripts/tests/test_use_pd_array_in_core.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest - -from scripts.use_pd_array_in_core import use_pd_array - -BAD_FILE_0 = "import pandas as pd\npd.array" -BAD_FILE_1 = "\nfrom pandas import array" -GOOD_FILE_0 = "from pandas import array as pd_array" -GOOD_FILE_1 = "from pandas.core.construction import array as pd_array" -PATH = "t.py" - - -@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1]) -def test_inconsistent_usage(content, capsys) -> None: - result_msg = ( - "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n" - ) - with pytest.raises(SystemExit, match=None): - use_pd_array(content, PATH) - expected_msg, _ = capsys.readouterr() - assert result_msg == expected_msg - - -@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1]) -def test_consistent_usage(content) -> None: - # should not raise - use_pd_array(content, PATH) diff --git a/scripts/use_pd_array_in_core.py b/scripts/use_pd_array_in_core.py deleted file mode 100644 index c9e14dece44e4..0000000000000 --- a/scripts/use_pd_array_in_core.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Check that pandas/core imports pandas.array as pd_array. - -This makes it easier to grep for usage of pandas array. - -This is meant to be run as a pre-commit hook - to run it manually, you can do: - - pre-commit run use-pd_array-in-core --all-files - -""" - -from __future__ import annotations - -import argparse -import ast -import sys -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Sequence - - -ERROR_MESSAGE = ( - "{path}:{lineno}:{col_offset}: " - "Don't use pd.array in core, import array as pd_array instead\n" -) - - -class Visitor(ast.NodeVisitor): - def __init__(self, path: str) -> None: - self.path = path - - def visit_ImportFrom(self, node: ast.ImportFrom) -> None: - # If array has been imported from somewhere in pandas, - # check it's aliased as pd_array. 
- if ( - node.module is not None - and node.module.startswith("pandas") - and any(i.name == "array" and i.asname != "pd_array" for i in node.names) - ): - msg = ERROR_MESSAGE.format( - path=self.path, lineno=node.lineno, col_offset=node.col_offset - ) - sys.stdout.write(msg) - sys.exit(1) - super().generic_visit(node) - - def visit_Attribute(self, node: ast.Attribute) -> None: - if ( - isinstance(node.value, ast.Name) - and node.value.id == "pd" - and node.attr == "array" - ): - msg = ERROR_MESSAGE.format( - path=self.path, lineno=node.lineno, col_offset=node.col_offset - ) - sys.stdout.write(msg) - sys.exit(1) - super().generic_visit(node) - - -def use_pd_array(content: str, path: str) -> None: - tree = ast.parse(content) - visitor = Visitor(path) - visitor.visit(tree) - - -def main(argv: Sequence[str] | None = None) -> None: - parser = argparse.ArgumentParser() - parser.add_argument("paths", nargs="*") - args = parser.parse_args(argv) - - for path in args.paths: - with open(path, encoding="utf-8") as fd: - content = fd.read() - use_pd_array(content, path) - - -if __name__ == "__main__": - main() diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 58c5da67bcd74..715a2fafbe87a 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -21,10 +21,6 @@ please let us know. ## Statistics and machine learning -### [pandas-tfrecords](https://pypi.org/project/pandas-tfrecords/) - -Easy saving pandas dataframe to tensorflow tfrecords format and reading tfrecords to pandas. - ### [Statsmodels](https://www.statsmodels.org/) Statsmodels is the prominent Python "statistics and econometrics @@ -34,11 +30,6 @@ modeling functionality that is out of pandas' scope. Statsmodels leverages pandas objects as the underlying data container for computation. -### [sklearn-pandas](https://github.com/scikit-learn-contrib/sklearn-pandas) - -Use pandas DataFrames in your [scikit-learn](https://scikit-learn.org/) -ML pipeline. - ### [Featuretools](https://github.com/alteryx/featuretools/) Featuretools is a Python library for automated feature engineering built @@ -150,13 +141,6 @@ df # discover interesting insights! By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html>) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code. -### [QtPandas](https://github.com/draperjames/qtpandas) - -Spun off from the main pandas library, the -[qtpandas](https://github.com/draperjames/qtpandas) library enables -DataFrame visualization and manipulation in PyQt4 and PySide -applications. - ### [D-Tale](https://github.com/man-group/dtale) D-Tale is a lightweight web client for visualizing pandas data structures. It @@ -210,12 +194,6 @@ or may not be compatible with non-HTML Jupyter output formats.) See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html) for pandas `display.` settings. 
-### [modin-project/modin-spreadsheet](https://github.com/modin-project/modin-spreadsheet) - -modin-spreadsheet is an interactive grid for sorting and filtering DataFrames in IPython Notebook. -It is a fork of qgrid and is actively maintained by the modin project. -modin-spreadsheet provides similar functionality to qgrid and allows for easy data exploration and manipulation in a tabular format. - ### [Spyder](https://www.spyder-ide.org/) Spyder is a cross-platform PyQt-based IDE combining the editing, @@ -271,18 +249,6 @@ The following data feeds are available: - Stooq Index Data - MOEX Data -### [quandl/Python](https://github.com/quandl/Python) - -Quandl API for Python wraps the Quandl REST API to return Pandas -DataFrames with timeseries indexes. - -### [pydatastream](https://github.com/vfilimonov/pydatastream) - -PyDatastream is a Python interface to the [Thomson Dataworks Enterprise -(DWE/Datastream)](http://dataworks.thomson.com/Dataworks/Enterprise/1.0/) -SOAP API to return indexed Pandas DataFrames with financial data. This -package requires valid credentials for this API (non free). - ### [pandaSDMX](https://pandasdmx.readthedocs.io) pandaSDMX is a library to retrieve and acquire statistical data and @@ -305,13 +271,6 @@ point-in-time data from ALFRED. fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that you can obtain for free on the FRED website. -### [dataframe_sql](https://github.com/zbrookle/dataframe_sql) - -``dataframe_sql`` is a Python package that translates SQL syntax directly into -operations on pandas DataFrames. This is useful when migrating from a database to -using pandas or for users more comfortable with SQL looking for a way to interface -with pandas. - ## Domain specific ### [Geopandas](https://github.com/geopandas/geopandas) @@ -384,12 +343,6 @@ any Delta table into Pandas dataframe. ## Out-of-core -### [Blaze](https://blaze.pydata.org/) - -Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, -PyTables, PySpark. - ### [Cylon](https://cylondata.org/) Cylon is a fast, scalable, distributed memory parallel runtime with a pandas @@ -457,14 +410,6 @@ import modin.pandas as pd df = pd.read_csv("big.csv") # use all your cores! ``` -### [Odo](http://odo.pydata.org) - -Odo provides a uniform API for moving data between different formats. It -uses pandas own `read_csv` for CSV IO and leverages many existing -packages such as PyTables, h5py, and pymongo to move data between non -pandas formats. Its graph based approach is also extensible by end users -for custom formats that may be too specific for the core of odo. - ### [Pandarallel](https://github.com/nalepae/pandarallel) Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. @@ -479,23 +424,6 @@ pandarallel.initialize(progress_bar=True) df.parallel_apply(func) ``` -### [Ray](https://docs.ray.io/en/latest/data/modin/index.html) - -Pandas on Ray is an early stage DataFrame library that wraps Pandas and -transparently distributes the data and computation. The user does not -need to know how many cores their system has, nor do they need to -specify how to distribute the data. In fact, users can continue using -their previous Pandas notebooks while experiencing a considerable -speedup from Pandas on Ray, even on a single machine. 
Only a -modification of the import statement is needed, as we demonstrate below. -Once you've changed your import statement, you're ready to use Pandas on -Ray just like you would Pandas. - -``` -# import pandas as pd -import ray.dataframe as pd -``` - ### [Vaex](https://vaex.io/docs/) Increasingly, packages are being built on top of pandas to address @@ -540,11 +468,6 @@ to make data processing pipelines more readable and robust. Dataframes contain information that pandera explicitly validates at runtime. This is useful in production-critical data pipelines or reproducible research settings. -### [Engarde](https://engarde.readthedocs.io/en/latest/) - -Engarde is a lightweight library used to explicitly state your -assumptions about your datasets and check that they're *actually* true. - ## Extension data types Pandas provides an interface for defining @@ -559,12 +482,6 @@ Arrays](https://awkward-array.org/) inside pandas' Series and DataFrame. It also provides an accessor for using awkward functions on Series that are of awkward type. -### [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) - -Cyberpandas provides an extension type for storing arrays of IP -Addresses. These arrays can be stored inside pandas' Series and -DataFrame. - ### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/) Pandas-Genomics provides an extension type and extension array for working @@ -599,15 +516,11 @@ authors to coordinate on the namespace. | Library | Accessor | Classes | | -------------------------------------------------------------------- | ---------- | --------------------- | | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/) | `ak` | `Series` | - | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | - | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` | | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | | [physipandas](https://github.com/mocquin/physipandas) | `physipy` | `Series`, `DataFrame` | | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | - | [datatest](https://datatest.readthedocs.io/en/stable/) | `validate` | `Series`, `DataFrame` | - | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | | [gurobipy-pandas](https://github.com/Gurobi/gurobipy-pandas) | `gppd` | `Series`, `DataFrame` | | [staircase](https://www.staircase.dev/) | `sc` | `Series`, `DataFrame` | | [woodwork](https://github.com/alteryx/woodwork) | `slice` | `Series`, `DataFrame` | From 3be40203b91f84a032b9ae2f630e74fc83453c28 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Mar 2024 12:37:03 -0800 Subject: [PATCH 8/8] Add test for Index return --- pandas/tests/indexes/ranges/test_range.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 2176395242e33..898548d1cc4dc 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -618,6 +618,16 @@ def test_reindex_returns_rangeindex(): tm.assert_numpy_array_equal(result_indexer, expected_indexer) +def 
test_reindex_returns_index(): + ri = RangeIndex(4, name="foo") + result, result_indexer = ri.reindex([0, 1, 3]) + expected = Index([0, 1, 3], name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + def test_take_return_rangeindex(): ri = RangeIndex(5, name="foo") result = ri.take([])
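
Taken together, these patches make `RangeIndex.reindex` and monotonic `RangeIndex.join` return a `RangeIndex` whenever the result is equally spaced, falling back to a plain integer `Index` otherwise. A minimal sketch of the resulting behavior, reconstructed from the tests added above (the commented results mirror those tests; exact reprs assume the patched 3.0 dev build):

```python
import pandas as pd

# Reindexing with an equally spaced, monotonic target: the result can be
# represented as a range, so a RangeIndex comes back
# (see test_reindex_returns_rangeindex).
ri = pd.RangeIndex(2, name="foo")
result, indexer = ri.reindex([1, 2, 3])
# result:  RangeIndex(start=1, stop=4, step=1, name='foo')
# indexer: array([ 1, -1, -1]) -- -1 marks labels absent from the original

# A target with uneven spacing cannot be expressed as start/stop/step,
# so a plain integer Index is returned instead
# (see test_reindex_returns_index).
result, indexer = pd.RangeIndex(4, name="foo").reindex([0, 1, 3])
# result:  Index([0, 1, 3], dtype='int64', name='foo')

# Monotonic joins of two ranges likewise stay ranges
# (see test_join_preserves_rangeindex).
joined, lidx, ridx = pd.RangeIndex(2).join(
    pd.RangeIndex(3), how="outer", return_indexers=True
)
# joined: RangeIndex(start=0, stop=3, step=1)
# lidx:   array([ 0,  1, -1]); ridx: array([0, 1, 2])
assert isinstance(joined, pd.RangeIndex)
```

The design keeps the memory-compact range representation in the common aligned case while `_wrap_reindex_result` and `_join_monotonic` quietly defer to the base `Index` paths whenever the target cannot be described by a start and step.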