From 7f251755d3238a8ff9f460bcd7f7aa77a6490435 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 12 Aug 2023 12:55:17 -0700 Subject: [PATCH 001/174] Backport PR #54498 on branch 2.1.x (Speed up string inference in maybe_convert_objects) (#54512) Backport PR #54498: Speed up string inference in maybe_convert_objects Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/lib.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 55819ebd1f15e..a96152ccdf3cc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2681,14 +2681,13 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if is_string_array(objects, skipna=True): - if using_pyarrow_string_dtype(): - import pyarrow as pa + if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + import pyarrow as pa - from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.dtypes.dtypes import ArrowDtype - dtype = ArrowDtype(pa.string()) - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + dtype = ArrowDtype(pa.string()) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True elif seen.interval_: From 0fb9149a1123842d36c63c2895b205fd11f5191b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 12 Aug 2023 12:55:27 -0700 Subject: [PATCH 002/174] Backport PR #54504 on branch 2.1.x (Move Unit Tests for groupby.rolling.corr) (#54513) Backport PR #54504: Move Unit Tests for groupby.rolling.corr Co-authored-by: omar-elbaz <114315316+omar-elbaz@users.noreply.github.com> --- pandas/tests/groupby/test_groupby.py | 26 ---------------- pandas/tests/window/test_groupby.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 24a550b4602a6..9b260d5757767 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3159,29 +3159,3 @@ def test_groupby_series_with_datetimeindex_month_name(): expected = Series([2, 1], name="jan") expected.index.name = "jan" tm.assert_series_equal(result, expected) - - -def test_rolling_corr_grouping_column_contains_tuples_1(df): - # GH 44078 - df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_rolling_corr_case_grrouping_column_contains_tuples_2(df): - # GH 44078 - df = DataFrame({"a": [(1, 2), (1, 2), (1, 2)], "b": [4, 5, 6]}) - gb = df.groupby(["a"]) - result = gb.rolling(2).corr(other=df) - index = MultiIndex.from_tuples( - [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] - ) - expected = DataFrame( - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index eb4fc06aa523e..ab00e18fc4812 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1230,3 +1230,47 @@ def 
test_dont_mutate_obj_after_slicing(self): # This is the key test result = grp.count() tm.assert_frame_equal(result, expected_df) + + +def test_rolling_corr_with_single_integer_in_index(): + # GH 44078 + df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]}) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None]) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_rolling_corr_with_tuples_in_index(): + # GH 44078 + df = DataFrame( + { + "a": [ + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ( + 1, + 2, + ), + ], + "b": [4, 5, 6], + } + ) + gb = df.groupby(["a"]) + result = gb.rolling(2).corr(other=df) + index = MultiIndex.from_tuples( + [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None] + ) + expected = DataFrame( + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index + ) + tm.assert_frame_equal(result, expected) From 06e04b03a2fc9508a6f05b48782e74575c48b93e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 12 Aug 2023 12:55:41 -0700 Subject: [PATCH 003/174] Backport PR #54509 on branch 2.1.x (PERF: Dataframe reductions with EA dtypes) (#54514) Backport PR #54509: PERF: Dataframe reductions with EA dtypes Co-authored-by: Luke Manley --- pandas/core/frame.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4908b535bcb1c..0e998898ffd1a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11109,14 +11109,20 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) + dtype_has_keepdims: dict[ExtensionDtype, bool] = {} + def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype) and not isinstance( self._mgr, ArrayManager ): return values._reduce(name, axis=1, skipna=skipna, **kwds) - sign = signature(values._reduce) - if "keepdims" in sign.parameters: + has_keepdims = dtype_has_keepdims.get(values.dtype) + if has_keepdims is None: + sign = signature(values._reduce) + has_keepdims = "keepdims" in sign.parameters + dtype_has_keepdims[values.dtype] = has_keepdims + if has_keepdims: return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: warnings.warn( From b3f122c09e6b20df2fde05c07161c79cf43ad4a8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 13 Aug 2023 02:11:07 -0700 Subject: [PATCH 004/174] Backport PR #54517 on branch 2.1.x (Run CI on 2.1.x) (#54522) Backport PR #54517: Run CI on 2.1.x Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0954bf7d9231c..9daa1a67bc2c1 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml 
b/.github/workflows/docbuild-and-upload.yml index 9840596357d26..e05f12ac6416a 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.0.x + - 2.1.x tags: - '*' pull_request: branches: - main - - 2.0.x + - 2.1.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6c9ba89d82794..04abcf4ce8816 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1770d18d4eb41..66d8320206429 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.0.x + - 2.1.x pull_request: branches: - main - - 2.0.x + - 2.1.x paths-ignore: - "doc/**" - "web/**" From b215522e070c0871170c5424ca343acda4983cec Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 13 Aug 2023 05:38:58 -0700 Subject: [PATCH 005/174] Backport PR #54515 on branch 2.1.x (ENH: ArrowExtensionArray(duration) workarounds for pyarrow versions >= 11.0) (#54524) Backport PR #54515: ENH: ArrowExtensionArray(duration) workarounds for pyarrow versions >= 11.0 Co-authored-by: Luke Manley Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88695f11fba59..07436cdf24e8d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1011,12 +1011,11 @@ def factorize( ) -> tuple[np.ndarray, ExtensionArray]: null_encoding = "mask" if use_na_sentinel else "encode" - pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + data = self._pa_array + pa_type = data.type + if pa_version_under11p0 and pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array + data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): encoded = data @@ -1034,7 +1033,7 @@ def factorize( ) uniques = type(self)(encoded.chunk(0).dictionary) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) return indices, uniques @@ -1273,7 +1272,7 @@ def unique(self) -> Self: """ pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = self._pa_array.cast(pa.int64()) else: @@ -1281,7 +1280,7 @@ def unique(self) -> Self: pa_result = pc.unique(data) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): pa_result = pa_result.cast(pa_type) return type(self)(pa_result) @@ -1304,7 +1303,7 @@ def value_counts(self, dropna: bool = True) -> Series: Series.value_counts """ pa_type = self._pa_array.type - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): # 
https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 data = self._pa_array.cast(pa.int64()) else: @@ -1324,7 +1323,7 @@ def value_counts(self, dropna: bool = True) -> Series: values = values.filter(mask) counts = counts.filter(mask) - if pa.types.is_duration(pa_type): + if pa_version_under11p0 and pa.types.is_duration(pa_type): values = values.cast(pa_type) counts = ArrowExtensionArray(counts) From 08edd6462cf13853d8ce356c410de966d20108d9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 13 Aug 2023 09:20:25 -0700 Subject: [PATCH 006/174] Backport PR #54341 on branch 2.1.x (PERF: axis=1 reductions with EA dtypes) (#54528) Backport PR #54341: PERF: axis=1 reductions with EA dtypes Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/frame.py | 26 +++++++++ pandas/core/groupby/groupby.py | 6 ++- pandas/tests/frame/test_reductions.py | 77 +++++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 313bf61ecabf9..d1a689dc60830 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -614,6 +614,7 @@ Performance improvements - :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`) - Performance improvement accessing :attr:`arrays.IntegerArrays.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`) - Performance improvement for :class:`DataFrameGroupBy`/:class:`SeriesGroupBy` aggregations (e.g. :meth:`DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`) +- Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`) - Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`) - Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`) - Performance improvement in :class:`Series` reductions (:issue:`52341`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0e998898ffd1a..9939daadd9237 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11172,6 +11172,32 @@ def _get_data() -> DataFrame: ).iloc[:0] result.index = df.index return result + + # kurtosis excluded since groupby does not implement it + if df.shape[1] and name != "kurt": + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype): + # GH 54341: fastpath for EA-backed axis=1 reductions + # This flattens the frame into a single 1D array while keeping + # track of the row and column indices of the original frame. Once + # flattened, grouping by the row indices and aggregating should + # be equivalent to transposing the original frame and aggregating + # with axis=0. 
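# A minimal standalone sketch (not taken from this diff) of the equivalence
# described in the comment above, using toy data and illustrative names
# (toy, flat, rows are assumptions of this sketch, not names from the patch):
import numpy as np
import pandas as pd

toy = pd.DataFrame(
    {
        "a": pd.array([0, 1, 2], dtype="Int64"),
        "b": pd.array([3, None, 5], dtype="Int64"),
    }
)
nrows, ncols = toy.shape
# flatten column-major and remember which original row each element came from
flat = np.concatenate(
    [toy[col].to_numpy("float64", na_value=np.nan) for col in toy.columns]
)
rows = np.tile(np.arange(nrows), ncols)
fast = pd.Series(flat).groupby(rows).sum()
# same result as reducing across the columns directly
assert fast.tolist() == toy.sum(axis=1).astype("float64").tolist()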
+ name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + nrows, ncols = df.shape + row_index = np.tile(np.arange(nrows), ncols) + col_index = np.repeat(np.arange(ncols), nrows) + ser = Series(arr, index=col_index, copy=False) + result = ser.groupby(row_index).agg(name, **kwds) + result.index = df.index + if not skipna and name not in ("any", "all"): + mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) + other = -1 if name in ("idxmax", "idxmin") else lib.no_default + result = result.mask(mask, other) + return result + df = df.T # After possibly _get_data and transposing, we are now in the diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 69a99c1bc867c..e327dd9d6c5ff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -105,6 +105,7 @@ class providing the base-class of operations. ExtensionArray, FloatingArray, IntegerArray, + SparseArray, ) from pandas.core.base import ( PandasObject, @@ -1909,7 +1910,10 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? - if how in ["any", "all", "std", "sem"]: + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3768298156550..ab36934533beb 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1938,3 +1938,80 @@ def test_fails_on_non_numeric(kernel): msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) + + +@pytest.mark.parametrize( + "method", + [ + "all", + "any", + "count", + "idxmax", + "idxmin", + "kurt", + "kurtosis", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + "product", + "sem", + "skew", + "std", + "sum", + "var", + ], +) +@pytest.mark.parametrize("min_count", [0, 2]) +def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): + # GH 54341 + df = DataFrame( + { + "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype), + "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype), + }, + ) + expected_df = DataFrame( + { + "a": [0.0, 1.0, 2.0, 3.0], + "b": [0.0, 1.0, np.nan, 3.0], + }, + ) + if method in ("count", "nunique"): + expected_dtype = "int64" + elif method in ("all", "any"): + expected_dtype = "boolean" + elif method in ( + "kurt", + "kurtosis", + "mean", + "median", + "sem", + "skew", + "std", + "var", + ) and not any_numeric_ea_dtype.startswith("Float"): + expected_dtype = "Float64" + else: + expected_dtype = any_numeric_ea_dtype + + kwargs = {} + if method not in ("count", "nunique", "quantile"): + kwargs["skipna"] = skipna + if method in ("prod", "product", "sum"): + kwargs["min_count"] = min_count + + warn = None + msg = None + if not skipna and method in ("idxmax", "idxmin"): + warn = FutureWarning + msg = f"The behavior of DataFrame.{method} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(df, method)(axis=1, **kwargs) + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(expected_df, method)(axis=1, **kwargs) + if method not in ("idxmax", "idxmin"): + expected = expected.astype(expected_dtype) + 
tm.assert_series_equal(result, expected) From 723b2c6dac0a7f9f3d282faed87ff1add1c683f7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 14 Aug 2023 02:56:49 -0700 Subject: [PATCH 007/174] Backport PR #54536 on branch 2.1.x (REF: Move checks to object into a variable) (#54538) Backport PR #54536: REF: Move checks to object into a variable Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/strings/__init__.py | 2 ++ pandas/tests/strings/test_find_replace.py | 31 ++++++++++++----------- pandas/tests/strings/test_strings.py | 17 +++++++------ 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e69de29bb2d1d..9a7622b4f1cd8 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -0,0 +1,2 @@ +# Needed for new arrow string dtype +object_pyarrow_numpy = ("object",) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c3cc8b3643ed2..b2d7e3e7507f8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,6 +11,7 @@ Series, _testing as tm, ) +from pandas.tests.strings import object_pyarrow_numpy # -------------------------------------------------------------------------------------- # str.contains @@ -25,7 +26,7 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -44,7 +45,7 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -71,14 +72,14 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -163,7 +164,7 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -204,7 +205,7 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + 
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -220,7 +221,7 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -648,7 +649,7 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -703,12 +704,12 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -716,7 +717,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -732,7 +733,7 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -742,14 +743,14 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) @@ -877,7 +878,7 @@ def 
test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1e573bdfe8fb5..4315835b70a40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,6 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods +from pandas.tests.strings import object_pyarrow_numpy @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -40,7 +41,7 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -91,7 +92,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -205,7 +206,7 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -230,7 +231,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -280,7 +281,7 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -309,7 +310,7 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, 
expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -350,7 +351,7 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From 64e452718ee33a668e88c7435841eeb27102a990 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 14 Aug 2023 02:56:59 -0700 Subject: [PATCH 008/174] Backport PR #54534 on branch 2.1.x (REF: Move methods that can be shared with new string dtype) (#54539) Backport PR #54534: REF: Move methods that can be shared with new string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/_arrow_string_mixins.py | 84 ++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 68 ++---------------- 2 files changed, 89 insertions(+), 63 deletions(-) create mode 100644 pandas/core/arrays/_arrow_string_mixins.py diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000..63db03340683b --- /dev/null +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Literal + +import numpy as np + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + _pa_array = None + + def __init__(self, *args, **kwargs) -> None: + raise NotImplementedError + + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. 
Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar( + None, type=self._pa_array.type # type: ignore[attr-defined] + ) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + if stop is None: + stop = np.iinfo(np.int64).max + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 07436cdf24e8d..0f46e5a4e7482 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core import roperator from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -184,7 +185,10 @@ def to_pyarrow_type( class ArrowExtensionArray( - OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, ): """ Pandas ExtensionArray backed by a PyArrow ChunkedArray. @@ -1986,24 +1990,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_pad( - self, - width: int, - side: Literal["left", "right", "both"] = "left", - fillchar: str = " ", - ): - if side == "left": - pa_pad = pc.utf8_lpad - elif side == "right": - pa_pad = pc.utf8_rpad - elif side == "both": - pa_pad = pc.utf8_center - else: - raise ValueError( - f"Invalid side: {side}. 
Side must be one of 'left', 'right', 'both'" - ) - return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2088,26 +2074,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): ) return type(self)(result) - def _str_get(self, i: int): - lengths = pc.utf8_length(self._pa_array) - if i >= 0: - out_of_bounds = pc.greater_equal(i, lengths) - start = i - stop = i + 1 - step = 1 - else: - out_of_bounds = pc.greater(-i, lengths) - start = i - stop = i - 1 - step = -1 - not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) - selected = pc.utf8_slice_codeunits( - self._pa_array, start=start, stop=stop, step=step - ) - null_value = pa.scalar(None, type=self._pa_array.type) - result = pc.if_else(not_out_of_bounds, selected, null_value) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type): result = self._apply_elementwise(list) @@ -2137,15 +2103,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_slice_replace( - self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): - if repl is None: - repl = "" - if start is None: - start = 0 - return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) @@ -2170,18 +2127,9 @@ def _str_isspace(self): def _str_istitle(self): return type(self)(pc.utf8_is_title(self._pa_array)) - def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._pa_array)) - - def _str_title(self): - return type(self)(pc.utf8_title(self._pa_array)) - def _str_isupper(self): return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) @@ -2222,12 +2170,6 @@ def _str_removeprefix(self, prefix: str): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) From f59950d820c3e126f36b1a5e73f53f0d6caede3b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 15 Aug 2023 14:18:05 -0700 Subject: [PATCH 009/174] Backport PR #54537 on branch 2.1.x (REF: Refactor using_pyarrow check for string tests) (#54559) Backport PR #54537: REF: Refactor using_pyarrow check for string tests Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/strings/test_find_replace.py | 62 ++++++++--------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index b2d7e3e7507f8..bcb8db96b37fa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -18,6 +18,10 @@ # -------------------------------------------------------------------------------------- +def using_pyarrow(dtype): + return dtype in ("string[pyarrow]",) + + def test_contains(any_string_dtype): 
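# Note on the expected-dtype pattern exercised throughout this file: the
# NumPy-backed object dtype returns plain np.bool_/np.float64/object results
# from .str ops, while the nullable string dtypes return masked
# "boolean"/"Int64" results, so each expectation is gated on the backend.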
values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -379,9 +383,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -402,9 +404,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -424,7 +424,7 @@ def test_replace_callable_raises(any_string_dtype, repl): ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, using_pyarrow(any_string_dtype) ): values.str.replace("a", repl, regex=True) @@ -434,9 +434,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -448,16 +446,12 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -477,9 +471,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -507,9 +499,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() 
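# A callable replacement cannot be pushed down to pyarrow compute, so
# pyarrow-backed string dtypes fall back to an elementwise Python path and
# emit a PerformanceWarning; that is what the using_pyarrow(...) gate on
# tm.maybe_produces_warning below accounts for.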
pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -558,9 +548,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ @@ -579,9 +567,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ @@ -605,16 +591,12 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -762,9 +744,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -945,16 +925,16 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - using_pyarrow = any_string_dtype == "string[pyarrow]" + use_pyarrow = using_pyarrow(any_string_dtype) result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] @@ -966,7 +946,7 @@ def test_flags_kwarg(any_string_dtype): msg = "has match groups" with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not using_pyarrow + UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow ): result = 
data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] From e69fab9ec3b0bf0bad03dd002efb0fd58072c693 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 15 Aug 2023 14:18:44 -0700 Subject: [PATCH 010/174] Backport PR #54525 on branch 2.1.x (BLD: Fix version script) (#54560) Backport PR #54525: BLD: Fix version script Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .circleci/config.yml | 1 + generate_version.py | 28 ++++++++++++++++++++++----- scripts/validate_unwanted_patterns.py | 1 + 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ac9db5f451bf3..50f6a116a6630 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,6 +46,7 @@ jobs: fi - run: name: Build aarch64 wheels + no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.14.1 cibuildwheel --prerelease-pythons --output-dir wheelhouse diff --git a/generate_version.py b/generate_version.py index 5534f49c0ea58..46e9f52bfc5de 100644 --- a/generate_version.py +++ b/generate_version.py @@ -1,18 +1,30 @@ # Note: This file has to live next to setup.py or versioneer will not work import argparse import os +import sys import versioneer +sys.path.insert(0, "") + def write_version_info(path): + version = None + git_version = None + + try: + import _version_meson + + version = _version_meson.__version__ + git_version = _version_meson.__git_version__ + except ImportError: + version = versioneer.get_version() + git_version = versioneer.get_versions()["full-revisionid"] if os.environ.get("MESON_DIST_ROOT"): path = os.path.join(os.environ.get("MESON_DIST_ROOT"), path) with open(path, "w", encoding="utf-8") as file: - file.write(f'__version__="{versioneer.get_version()}"\n') - file.write( - f'__git_version__="{versioneer.get_versions()["full-revisionid"]}"\n' - ) + file.write(f'__version__="{version}"\n') + file.write(f'__git_version__="{git_version}"\n') def main(): @@ -43,7 +55,13 @@ def main(): write_version_info(args.outfile) if args.print: - print(versioneer.get_version()) + try: + import _version_meson + + version = _version_meson.__version__ + except ImportError: + version = versioneer.get_version() + print(version) main() diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index f8b49923cf466..47534226f972f 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -49,6 +49,7 @@ "_global_config", "_chained_assignment_msg", "_chained_assignment_method_msg", + "_version_meson", } From 91969c67c3304ea0e0de6eedc4c8371fea10c846 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 15 Aug 2023 14:32:32 -0700 Subject: [PATCH 011/174] Backport PR #54541 on branch 2.1.x (Bump pypa/cibuildwheel from 2.14.1 to 2.15.0) (#54558) Backport PR #54541: Bump pypa/cibuildwheel from 2.14.1 to 2.15.0 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f486a57bdb67d..5f541f1bae1fd 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,7 +139,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: 
Build wheels - uses: pypa/cibuildwheel@v2.14.1 + uses: pypa/cibuildwheel@v2.15.0 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 4160d6964fc7257fb1adb40e8a2b83fc300944a6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 15 Aug 2023 14:32:51 -0700 Subject: [PATCH 012/174] Backport PR #54493 on branch 2.1.x (DOC: updated required dependencies list) (#54562) Backport PR #54493: DOC: updated required dependencies list Co-authored-by: Rajat Subhra Mukherjee --- doc/source/getting_started/install.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1896dffa9a105..0ab0391ac78a9 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -206,6 +206,7 @@ Package Minimum support `NumPy `__ 1.22.4 `python-dateutil `__ 2.8.2 `pytz `__ 2020.1 +`tzdata `__ 2022.1 ================================================================ ========================== .. _install.optional_dependencies: From 4fbe0782fdcce7041fc507136f7e76ba58533884 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 16 Aug 2023 15:01:17 -0700 Subject: [PATCH 013/174] Backport PR #54579 on branch 2.1.x (ENH: Reflect changes from `numpy` namespace refactor Part 3) (#54583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #54579: ENH: Reflect changes from `numpy` namespace refactor Part 3 Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- asv_bench/benchmarks/algos/isin.py | 4 +- .../comparison/comparison_with_sql.rst | 2 +- doc/source/user_guide/enhancingperf.rst | 4 +- doc/source/user_guide/gotchas.rst | 2 +- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/_libs/algos.pyx | 2 +- pandas/_libs/groupby.pyx | 2 +- pandas/_libs/lib.pyx | 2 +- pandas/_libs/tslibs/util.pxd | 4 +- pandas/_libs/window/aggregations.pyx | 2 +- pandas/conftest.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/computation/ops.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/common.py | 2 +- pandas/core/generic.py | 16 +++---- pandas/core/groupby/groupby.py | 4 +- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/interval.py | 4 +- pandas/core/series.py | 2 +- pandas/core/strings/accessor.py | 6 +-- pandas/io/formats/format.py | 2 +- pandas/tests/apply/test_frame_apply.py | 6 +-- pandas/tests/apply/test_series_apply.py | 4 +- .../arrays/categorical/test_analytics.py | 4 +- pandas/tests/arrays/interval/test_interval.py | 2 +- pandas/tests/computation/test_eval.py | 2 +- pandas/tests/dtypes/cast/test_infer_dtype.py | 8 ++-- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/dtypes/test_inference.py | 7 ++- pandas/tests/dtypes/test_missing.py | 6 +-- .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/methods/test_astype.py | 2 +- pandas/tests/frame/methods/test_clip.py | 2 +- pandas/tests/frame/methods/test_describe.py | 2 +- pandas/tests/frame/methods/test_dropna.py | 4 +- pandas/tests/frame/methods/test_dtypes.py | 8 ++-- pandas/tests/frame/methods/test_replace.py | 2 +- .../tests/frame/methods/test_select_dtypes.py | 2 +- pandas/tests/frame/methods/test_shift.py | 8 ++-- pandas/tests/frame/test_block_internals.py | 16 +++---- 
pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/frame/test_reductions.py | 16 +++---- pandas/tests/frame/test_stack_unstack.py | 22 ++++----- pandas/tests/groupby/test_categorical.py | 46 +++++++++---------- pandas/tests/groupby/test_counting.py | 2 +- pandas/tests/groupby/test_rank.py | 2 +- .../indexes/datetimes/methods/test_astype.py | 10 ++-- .../tests/indexes/datetimes/test_timezones.py | 4 +- .../tests/indexes/interval/test_interval.py | 4 +- pandas/tests/indexes/multi/test_join.py | 2 +- .../indexes/period/methods/test_astype.py | 4 +- pandas/tests/indexes/period/test_pickle.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- .../indexes/timedeltas/methods/test_astype.py | 6 +-- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/interchange/test_impl.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/formats/test_to_csv.py | 4 +- pandas/tests/io/parser/test_parse_dates.py | 4 +- pandas/tests/io/test_stata.py | 6 +-- pandas/tests/reductions/test_reductions.py | 4 +- .../tests/reductions/test_stat_reductions.py | 2 +- pandas/tests/resample/test_datetime_index.py | 4 +- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 8 ++-- pandas/tests/reshape/test_pivot.py | 4 +- pandas/tests/scalar/test_na_scalar.py | 10 ++-- .../series/accessors/test_dt_accessor.py | 4 +- pandas/tests/series/indexing/test_indexing.py | 8 ++-- pandas/tests/series/methods/test_argsort.py | 2 +- pandas/tests/series/methods/test_asof.py | 4 +- .../series/methods/test_combine_first.py | 10 ++-- pandas/tests/series/methods/test_copy.py | 2 +- pandas/tests/series/methods/test_count.py | 2 +- .../series/methods/test_drop_duplicates.py | 2 +- pandas/tests/series/methods/test_fillna.py | 6 +-- .../tests/series/methods/test_interpolate.py | 6 +-- pandas/tests/series/methods/test_map.py | 8 ++-- .../tests/series/methods/test_pct_change.py | 2 +- pandas/tests/series/methods/test_rank.py | 10 ++-- pandas/tests/series/methods/test_reindex.py | 6 +-- .../tests/series/methods/test_sort_values.py | 2 +- pandas/tests/series/test_constructors.py | 6 +-- pandas/tests/series/test_cumulative.py | 4 +- pandas/tests/series/test_logical_ops.py | 6 +-- pandas/tests/series/test_missing.py | 2 +- pandas/tests/series/test_repr.py | 2 +- pandas/tests/test_nanops.py | 2 +- pandas/tests/tools/test_to_datetime.py | 4 +- pandas/tests/util/test_assert_almost_equal.py | 2 +- pandas/tests/window/conftest.py | 2 +- pandas/tests/window/test_api.py | 6 +-- pandas/tests/window/test_apply.py | 10 ++-- pandas/tests/window/test_ewm.py | 24 +++++----- pandas/tests/window/test_pairwise.py | 2 +- pandas/tests/window/test_rolling.py | 22 ++++----- pandas/tests/window/test_rolling_functions.py | 14 +++--- pandas/tests/window/test_rolling_quantile.py | 10 ++-- pandas/tests/window/test_rolling_skew_kurt.py | 22 ++++----- pandas/tests/window/test_win_type.py | 2 +- 102 files changed, 280 insertions(+), 283 deletions(-) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index ac79ab65cea81..92797425b2c30 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -247,7 +247,7 @@ def setup(self, series_type, vals_type): elif series_type == "long": ser_vals = np.arange(N_many) elif series_type == "long_floats": - ser_vals = np.arange(N_many, dtype=np.float_) + ser_vals = np.arange(N_many, dtype=np.float64) self.series = Series(ser_vals).astype(object) @@ -258,7 +258,7 @@ def setup(self, series_type, vals_type): elif 
vals_type == "long": values = np.arange(N_many) elif vals_type == "long_floats": - values = np.arange(N_many, dtype=np.float_) + values = np.arange(N_many, dtype=np.float64) self.values = values.astype(object) diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 7a83d50416186..f0eaa7362c52c 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -107,7 +107,7 @@ methods. .. ipython:: python frame = pd.DataFrame( - {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]} + {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]} ) frame diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 2ddc3e709be85..bc2f4420da784 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -183,8 +183,8 @@ can be improved by passing an ``np.ndarray``. ...: return s * dx ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): - ...: assert (col_a.dtype == np.float_ - ...: and col_b.dtype == np.float_ and col_N.dtype == np.int_) + ...: assert (col_a.dtype == np.float64 + ...: and col_b.dtype == np.float64 and col_N.dtype == np.int_) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 67106df328361..c00a236ff4e9d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -327,7 +327,7 @@ present in the more domain-specific statistical programming language `R ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` ``numpy.object_`` | ``object_`` ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``string_, unicode_`` + ``numpy.character`` | ``bytes_, str_`` The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6e352c52cd60e..df2f1bccc3cff 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4881,7 +4881,7 @@ unspecified columns of the given DataFrame. The argument ``selector`` defines which table is the selector table (which you can make queries from). The argument ``dropna`` will drop rows from the input ``DataFrame`` to ensure tables are synchronized. This means that if a row for one of the tables -being written to is entirely ``np.NaN``, that row will be dropped from all tables. +being written to is entirely ``np.nan``, that row will be dropped from all tables. If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. Remember that entirely ``np.Nan`` rows are not written to the HDFStore, so if diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 73a523b14f9f7..38c6e1123aaae 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -556,7 +556,7 @@ You must pass in the ``line_terminator`` explicitly, even in this case. .. 
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 73a523b14f9f7..38c6e1123aaae 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -556,7 +556,7 @@ You must pass in the ``line_terminator`` explicitly, even in this case.

 .. _whatsnew_0240.bug_fixes.nan_with_str_dtype:

-Proper handling of ``np.NaN`` in a string data-typed column with the Python engine
+Proper handling of ``np.nan`` in a string data-typed column with the Python engine
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 0b6ea58f987d4..9eed70a23c9dd 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -59,7 +59,7 @@ from pandas._libs.util cimport get_nat

 cdef:
     float64_t FP_ERR = 1e-13
-    float64_t NaN = np.NaN
+    float64_t NaN = np.nan

     int64_t NPY_NAT = get_nat()

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 20499016f951e..7635b261d4149 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -52,7 +52,7 @@ from pandas._libs.missing cimport checknull

 cdef int64_t NPY_NAT = util.get_nat()

-cdef float64_t NaN = np.NaN
+cdef float64_t NaN = np.nan

 cdef enum InterpolationEnumType:
     INTERPOLATION_LINEAR,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a96152ccdf3cc..2681115bbdcfb 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -144,7 +144,7 @@ cdef:
     object oINT64_MIN = INT64_MIN
     object oUINT64_MAX = UINT64_MAX

-    float64_t NaN = np.NaN
+    float64_t NaN = np.nan

 # python-visible
 i8max = INT64_MAX
diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
index e25e7e8b94e1d..519d3fc939efa 100644
--- a/pandas/_libs/tslibs/util.pxd
+++ b/pandas/_libs/tslibs/util.pxd
@@ -75,7 +75,7 @@ cdef inline bint is_integer_object(object obj) noexcept nogil:

 cdef inline bint is_float_object(object obj) noexcept nogil:
     """
-    Cython equivalent of `isinstance(val, (float, np.float_))`
+    Cython equivalent of `isinstance(val, (float, np.float64))`

     Parameters
     ----------
@@ -91,7 +91,7 @@ cdef inline bint is_float_object(object obj) noexcept nogil:

 cdef inline bint is_complex_object(object obj) noexcept nogil:
     """
-    Cython equivalent of `isinstance(val, (complex, np.complex_))`
+    Cython equivalent of `isinstance(val, (complex, np.complex128))`

     Parameters
     ----------
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 425c5ade2e2d4..9c151b8269a52 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -57,7 +57,7 @@ cdef:
     float32_t MAXfloat32 = np.inf
     float64_t MAXfloat64 = np.inf

-    float64_t NaN = np.NaN
+    float64_t NaN = np.nan

 cdef bint is_monotonic_increasing_start_end_bounds(
     ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end
diff --git a/pandas/conftest.py b/pandas/conftest.py
index f756da82157b8..757ca817d1b85 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -777,7 +777,7 @@ def series_with_multilevel_index() -> Series:
     index = MultiIndex.from_tuples(tuples)
     data = np.random.default_rng(2).standard_normal(8)
     ser = Series(data, index=index)
-    ser.iloc[3] = np.NaN
+    ser.iloc[3] = np.nan
     return ser


diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 6107388bfe78b..aefc94ebd665c 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2109,7 +2109,7 @@ def _codes(self) -> np.ndarray:

     def _box_func(self, i: int):
         if i == -1:
-            return np.NaN
+            return np.nan
         return self.categories[i]

     def _unbox_scalar(self, key) -> int:
diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py
index 9050fb6e76b9c..852bfae1cc79a 100644
---
a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -537,8 +537,8 @@ def __init__(self, lhs, rhs) -> None: ) # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float_] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) + acceptable_dtypes = [np.float32, np.float64] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) UNARY_OPS_SYMS = ("+", "-", "~", "not") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f7c0b3e36032..657cbce40087a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -850,7 +850,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = np.dtype(np.float64) elif is_complex(val): - dtype = np.dtype(np.complex_) + dtype = np.dtype(np.complex128) if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a0feb49f47c4e..c2e498e75b7d3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1351,7 +1351,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: False >>> is_complex_dtype(int) False - >>> is_complex_dtype(np.complex_) + >>> is_complex_dtype(np.complex128) True >>> is_complex_dtype(np.array(['a', 'b'])) False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be0d046697ba9..954573febed41 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5307,7 +5307,7 @@ def reindex( level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. - fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. limit : int, default None @@ -7376,7 +7376,7 @@ def ffill( 2 3.0 4.0 NaN 1.0 3 3.0 3.0 NaN 4.0 - >>> ser = pd.Series([1, np.NaN, 2, 3]) + >>> ser = pd.Series([1, np.nan, 2, 3]) >>> ser.ffill() 0 1.0 1 1.0 @@ -8375,7 +8375,7 @@ def isna(self) -> Self: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], @@ -8394,7 +8394,7 @@ def isna(self) -> Self: Show which entries in a Series are NA. - >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser = pd.Series([5, 6, np.nan]) >>> ser 0 5.0 1 6.0 @@ -8442,7 +8442,7 @@ def notna(self) -> Self: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + >>> df = pd.DataFrame(dict(age=[5, 6, np.nan], ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... name=['Alfred', 'Batman', ''], @@ -8461,7 +8461,7 @@ def notna(self) -> Self: Show which entries in a Series are not NA. - >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser = pd.Series([5, 6, np.nan]) >>> ser 0 5.0 1 6.0 @@ -8628,7 +8628,7 @@ def clip( Clips using specific lower threshold per column element, with missing values: - >>> t = pd.Series([2, -4, np.NaN, 6, 3]) + >>> t = pd.Series([2, -4, np.nan, 6, 3]) >>> t 0 2.0 1 -4.0 @@ -9828,7 +9828,7 @@ def align( copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. - fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. 
method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e327dd9d6c5ff..5a7f42a535951 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5418,7 +5418,7 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: def _reindex_output( self, output: OutputFrameOrSeries, - fill_value: Scalar = np.NaN, + fill_value: Scalar = np.nan, qs: npt.NDArray[np.float64] | None = None, ) -> OutputFrameOrSeries: """ @@ -5436,7 +5436,7 @@ def _reindex_output( ---------- output : Series or DataFrame Object resulting from grouping and applying an operation. - fill_value : scalar, default np.NaN + fill_value : scalar, default np.nan Value to use for unobserved categories if self.observed is False. qs : np.ndarray[float64] or None, default None quantile values, only relevant for quantile. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 288fd35892fd0..241b2de513a04 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2848,7 +2848,7 @@ def isna(self) -> npt.NDArray[np.bool_]: Show which entries in a pandas.Index are NA. The result is an array. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx = pd.Index([5.2, 6.0, np.nan]) >>> idx Index([5.2, 6.0, nan], dtype='float64') >>> idx.isna() @@ -2904,7 +2904,7 @@ def notna(self) -> npt.NDArray[np.bool_]: Show which entries in an Index are not NA. The result is an array. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx = pd.Index([5.2, 6.0, np.nan]) >>> idx Index([5.2, 6.0, nan], dtype='float64') >>> idx.notna() diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f915c08bb8294..e8b3676e71ae0 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -124,7 +124,7 @@ def _get_next_label(label): elif is_integer_dtype(dtype): return label + 1 elif is_float_dtype(dtype): - return np.nextafter(label, np.infty) + return np.nextafter(label, np.inf) else: raise TypeError(f"cannot determine next label for type {repr(type(label))}") @@ -141,7 +141,7 @@ def _get_prev_label(label): elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): - return np.nextafter(label, -np.infty) + return np.nextafter(label, -np.inf) else: raise TypeError(f"cannot determine next label for type {repr(type(label))}") diff --git a/pandas/core/series.py b/pandas/core/series.py index 814a770b192bf..885675e5caa5a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5586,7 +5586,7 @@ def dropna( Empty strings are not considered NA values. ``None`` is considered an NA value. - >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay']) + >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay']) >>> ser 0 NaN 1 2 diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index e59369db776da..becf9b47b3af1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1215,7 +1215,7 @@ def contains( -------- Returning a Series of booleans using only a literal pattern. - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) >>> s1.str.contains('og', regex=False) 0 False 1 True @@ -1226,7 +1226,7 @@ def contains( Returning an Index of booleans using only a literal pattern. 
- >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) >>> ind.str.contains('23', regex=False) Index([False, False, False, True, nan], dtype='object') @@ -3500,7 +3500,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: for match_i, match_tuple in enumerate(regex.findall(subject)): if isinstance(match_tuple, str): match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] + na_tuple = [np.nan if group == "" else group for group in match_tuple] match_list.append(na_tuple) result_key = tuple(subject_key + (match_i,)) index_list.append(result_key) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9fe8cbfa159c6..ff26abd5cc26c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1715,7 +1715,7 @@ def format_percentiles( """ percentiles = np.asarray(percentiles) - # It checks for np.NaN as well + # It checks for np.nan as well if ( not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index ba052c6936dd9..3a3f73a68374b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -637,15 +637,15 @@ def test_apply_with_byte_string(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) +@pytest.mark.parametrize("val", ["asd", 12, None, np.nan]) def test_apply_category_equalness(val): # Check if categorical comparisons on apply, GH 21239 - df_values = ["asd", None, 12, "asd", "cde", np.NaN] + df_values = ["asd", None, 12, "asd", "cde", np.nan] df = DataFrame({"a": df_values}, dtype="category") result = df.a.apply(lambda x: x == val) expected = Series( - [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" + [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index aea1e03dfe0ee..d3e5ac1b4ca7a 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -242,7 +242,7 @@ def test_apply_categorical(by_row): assert result.dtype == object -@pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) +@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) def test_apply_categorical_with_nan_values(series, by_row): # GH 20714 bug fixed in: GH 24275 s = Series(series, dtype="category") @@ -254,7 +254,7 @@ def test_apply_categorical_with_nan_values(series, by_row): result = s.apply(lambda x: x.split("-")[0], by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.NaN], dtype="category") + expected = Series(["1", "1", np.nan], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c42364d4d4377..c2c53fbc4637e 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -73,8 +73,8 @@ def test_min_max_reduce(self): @pytest.mark.parametrize( "categories,expected", [ - (list("ABC"), np.NaN), - ([1, 2, 3], np.NaN), + (list("ABC"), np.nan), + ([1, 2, 3], np.nan), pytest.param( Series(date_range("2020-01-01", periods=3), 
dtype="category"), NaT, diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index e16ef37e8799d..761b85287764f 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -129,7 +129,7 @@ def test_set_na(self, left_right_dtypes): # GH#45484 TypeError, not ValueError, matches what we get with # non-NA un-holdable value. with pytest.raises(TypeError, match=msg): - result[0] = np.NaN + result[0] = np.nan return result[0] = np.nan diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index f958d25e51103..9c630e29ea8e6 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -725,7 +725,7 @@ def test_and_logic_string_match(self): class TestTypeCasting: @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... numexpr has too many upcasting rules now - # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) + # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) def test_binop_typecasting(self, engine, parser, op, dt, left_right): diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index b5d761b3549fa..ed08df74461ef 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -42,7 +42,7 @@ def test_infer_dtype_from_float_scalar(float_numpy_dtype): @pytest.mark.parametrize( - "data,exp_dtype", [(12, np.int64), (np.float_(12), np.float64)] + "data,exp_dtype", [(12, np.int64), (np.float64(12), np.float64)] ) def test_infer_dtype_from_python_scalar(data, exp_dtype): dtype, val = infer_dtype_from_scalar(data) @@ -58,7 +58,7 @@ def test_infer_dtype_from_boolean(bool_val): def test_infer_dtype_from_complex(complex_dtype): data = np.dtype(complex_dtype).type(1) dtype, val = infer_dtype_from_scalar(data) - assert dtype == np.complex_ + assert dtype == np.complex128 def test_infer_dtype_from_datetime(): @@ -153,7 +153,7 @@ def test_infer_dtype_from_scalar_errors(): ("foo", np.object_), (b"foo", np.object_), (1, np.int64), - (1.5, np.float_), + (1.5, np.float64), (np.datetime64("2016-01-01"), np.dtype("M8[s]")), (Timestamp("20160101"), np.dtype("M8[s]")), (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"), @@ -173,7 +173,7 @@ def test_infer_dtype_from_scalar(value, expected): ([1], np.int_), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), - (np.array([[1.0, 2.0]]), np.float_), + (np.array([[1.0, 2.0]]), np.float64), (Categorical(list("aabc")), "category"), (Categorical([1, 2, 3]), "category"), (date_range("20160101", periods=3), np.dtype("=M8[ns]")), diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 0043ace1b9590..471e456146178 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -652,7 +652,7 @@ def test_is_complex_dtype(): assert not com.is_complex_dtype(pd.Series([1, 2])) assert not com.is_complex_dtype(np.array(["a", "b"])) - assert com.is_complex_dtype(np.complex_) + assert com.is_complex_dtype(np.complex128) assert com.is_complex_dtype(complex) assert com.is_complex_dtype(np.array([1 + 1j, 5])) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 375003e58c21a..df7c787d2b9bf 100644 --- 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 375003e58c21a..df7c787d2b9bf 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -536,7 +536,7 @@ def test_isneginf_scalar(self, value, expected):
     )
     def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp):
         # GH 40687
-        arr = np.array([True, np.NaN], dtype=object)
+        arr = np.array([True, np.nan], dtype=object)
         result = libops.maybe_convert_bool(
             arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
         )
@@ -862,7 +862,7 @@ def test_maybe_convert_objects_timedelta64_nat(self):
     )
     def test_maybe_convert_objects_nullable_integer(self, exp):
         # GH27335
-        arr = np.array([2, np.NaN], dtype=object)
+        arr = np.array([2, np.nan], dtype=object)
         result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)

         tm.assert_extension_array_equal(result, exp)
@@ -890,7 +890,7 @@ def test_maybe_convert_numeric_nullable_integer(
         self, convert_to_masked_nullable, exp
     ):
         # GH 40687
-        arr = np.array([2, np.NaN], dtype=object)
+        arr = np.array([2, np.nan], dtype=object)
         result = lib.maybe_convert_numeric(
             arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
         )
@@ -1889,7 +1889,6 @@ def test_is_scalar_numpy_array_scalars(self):
         assert is_scalar(np.complex64(2))
         assert is_scalar(np.object_("foobar"))
         assert is_scalar(np.str_("foobar"))
-        assert is_scalar(np.unicode_("foobar"))
         assert is_scalar(np.bytes_(b"foobar"))
         assert is_scalar(np.datetime64("2014-01-01"))
         assert is_scalar(np.timedelta64(1, "h"))
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index 170f4f49ba377..451ac2afd1d91 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -51,7 +51,7 @@ def test_notna_notnull(notna_f):
     assert notna_f(1.0)
     assert not notna_f(None)
-    assert not notna_f(np.NaN)
+    assert not notna_f(np.nan)

     msg = "use_inf_as_na option is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
@@ -112,7 +112,7 @@ def test_empty_object(self, shape):
     def test_isna_isnull(self, isna_f):
         assert not isna_f(1.0)
         assert isna_f(None)
-        assert isna_f(np.NaN)
+        assert isna_f(np.nan)
         assert isna_f(float("nan"))
         assert not isna_f(np.inf)
         assert not isna_f(-np.inf)
@@ -156,7 +156,7 @@ def test_isna_lists(self):
         tm.assert_numpy_array_equal(result, exp)

         # GH20675
-        result = isna([np.NaN, "world"])
+        result = isna([np.nan, "world"])
         exp = np.array([True, False])
         tm.assert_numpy_array_equal(result, exp)
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
index 95f9f2ba4051e..59dca5055f170 100644
--- a/pandas/tests/frame/constructors/test_from_records.py
+++ b/pandas/tests/frame/constructors/test_from_records.py
@@ -269,7 +269,7 @@ def test_from_records_series_categorical_index(self):
         series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index)
         frame = DataFrame.from_records(series_of_dicts, index=index)
         expected = DataFrame(
-            {"a": [1, 2, np.NaN], "b": [np.NaN, np.NaN, 3]}, index=index
+            {"a": [1, 2, np.nan], "b": [np.nan, np.nan, 3]}, index=index
         )
         tm.assert_frame_equal(frame, expected)

diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 34cbebe1b3d3f..6590f10c6b967 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -164,7 +164,7 @@ def test_astype_str(self):

     def test_astype_str_float(self):
         # see GH#11302
-        result = DataFrame([np.NaN]).astype(str)
+        result = DataFrame([np.nan]).astype(str)
         expected = DataFrame(["nan"])
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 710978057460a..9bd032a0aefc4 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -166,7 +166,7 @@ def test_clip_with_na_args(self, float_frame): # GH#40420 data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} df = DataFrame(data) - t = Series([2, -4, np.NaN, 6, 3]) + t = Series([2, -4, np.nan, 6, 3]) result = df.clip(lower=t, axis=0) expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e2f92a1e04cb5..f56a7896c753e 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -332,7 +332,7 @@ def test_describe_percentiles_integer_idx(self): result = df.describe(percentiles=pct) expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *(1.0 for _ in pct), 1.0]}, + {"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]}, index=[ "count", "mean", diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 11edf665b5494..7899b4aeac3fd 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -231,7 +231,7 @@ def test_dropna_with_duplicate_columns(self): def test_set_single_column_subset(self): # GH 41021 - df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.NaN, 5]}) + df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame( {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] ) @@ -248,7 +248,7 @@ def test_single_column_not_present_in_axis(self): def test_subset_is_nparray(self): # GH 41021 - df = DataFrame({"A": [1, 2, np.NaN], "B": list("abc"), "C": [4, np.NaN, 5]}) + df = DataFrame({"A": [1, 2, np.nan], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]}) result = df.dropna(subset=np.array(["A", "C"])) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 6f21bd4c4b438..4bdf16977dae6 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -62,15 +62,15 @@ def test_datetime_with_tz_dtypes(self): def test_dtypes_are_correct_after_column_slice(self): # GH6525 - df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64) tm.assert_series_equal( df.dtypes, - Series({"a": np.float_, "b": np.float_, "c": np.float_}), + Series({"a": np.float64, "b": np.float64, "c": np.float64}), ) - tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float_})) + tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64})) tm.assert_series_equal( df.dtypes, - Series({"a": np.float_, "b": np.float_, "c": np.float_}), + Series({"a": np.float64, "b": np.float64, "c": np.float64}), ) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3203482ddf724..61e44b4e24c08 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -666,7 +666,7 @@ def test_replace_NA_with_None(self): def test_replace_NAT_with_None(self): # gh-45836 df = DataFrame([pd.NaT, 
pd.NaT]) - result = df.replace({pd.NaT: None, np.NaN: None}) + result = df.replace({pd.NaT: None, np.nan: None}) expected = DataFrame([None, None]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 3bfb1af423bdd..67dd5b6217187 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -340,7 +340,7 @@ def test_select_dtypes_datetime_with_tz(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "dtype", [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"] + "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"] ) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 35941e9f24a4e..808f0cff2485c 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -681,10 +681,10 @@ def test_shift_with_iterable_basic_functionality(self): { "a_0": [1, 2, 3], "b_0": [4, 5, 6], - "a_1": [np.NaN, 1.0, 2.0], - "b_1": [np.NaN, 4.0, 5.0], - "a_2": [np.NaN, np.NaN, 1.0], - "b_2": [np.NaN, np.NaN, 4.0], + "a_1": [np.nan, 1.0, 2.0], + "b_1": [np.nan, 4.0, 5.0], + "a_2": [np.nan, np.nan, 1.0], + "b_2": [np.nan, np.nan, 4.0], } ) tm.assert_frame_equal(expected, shifted) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 008d7a023576a..9e8d92e832d01 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -131,22 +131,22 @@ def test_constructor_with_convert(self): df = DataFrame({"A": [None, 1]}) result = df["A"] - expected = Series(np.asarray([np.nan, 1], np.float_), name="A") + expected = Series(np.asarray([np.nan, 1], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0, 2]}) result = df["A"] - expected = Series(np.asarray([1.0, 2], np.float_), name="A") + expected = Series(np.asarray([1.0, 2], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, 3]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, 3.0]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, True]}) @@ -156,12 +156,12 @@ def test_constructor_with_convert(self): df = DataFrame({"A": [1.0, None]}) result = df["A"] - expected = Series(np.asarray([1.0, np.nan], np.float_), name="A") + expected = Series(np.asarray([1.0, np.nan], np.float64), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [1.0 + 2.0j, None]}) result = df["A"] - expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A") + expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex128), name="A") tm.assert_series_equal(result, expected) df = DataFrame({"A": [2.0, 1, True, None]}) @@ -343,9 +343,9 @@ def test_stale_cached_series_bug_473(self, using_copy_on_write): Y["e"] = Y["e"].astype("object") if using_copy_on_write: with tm.raises_chained_assignment_error(): - Y["g"]["c"] = np.NaN 
+ Y["g"]["c"] = np.nan else: - Y["g"]["c"] = np.NaN + Y["g"]["c"] = np.nan repr(Y) Y.sum() Y["g"].sum() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a493084142f7b..c170704150383 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1781,8 +1781,6 @@ def test_constructor_empty_with_string_dtype(self): tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) - df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_) - tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) @@ -1826,7 +1824,7 @@ def test_constructor_single_value(self): def test_constructor_with_datetimes(self): intname = np.dtype(np.int_).name - floatname = np.dtype(np.float_).name + floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name # single item diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index ab36934533beb..e7b6a0c0b39b0 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -134,7 +134,7 @@ def wrapper(x): # all NA case if has_skipna: - all_na = frame * np.NaN + all_na = frame * np.nan r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname in ["sum", "prod"]: @@ -834,9 +834,9 @@ def test_sum_nanops_min_count(self): @pytest.mark.parametrize( "kwargs, expected_result", [ - ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.NaN]), - ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]), - ({"axis": 1, "skipna": False}, [3.2, 5.3, np.NaN]), + ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.nan]), + ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]), + ({"axis": 1, "skipna": False}, [3.2, 5.3, np.nan]), ], ) def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @@ -850,9 +850,9 @@ def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @pytest.mark.parametrize( "kwargs, expected_result", [ - ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.NaN]), - ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]), - ({"axis": 1, "skipna": False}, [2.0, 4.0, np.NaN]), + ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.nan]), + ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]), + ({"axis": 1, "skipna": False}, [2.0, 4.0, np.nan]), ], ) def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result): @@ -1189,7 +1189,7 @@ def wrapper(x): f(axis=2) # all NA case - all_na = frame * np.NaN + all_na = frame * np.nan r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname == "any": diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index cb8e8c5025e3b..c90b871d5d66f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -72,7 +72,7 @@ def test_stack_mixed_level(self, future_stack): def test_unstack_not_consolidated(self, using_array_manager): # Gh#34708 - df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]}) df2 = df[["x"]] df2["y"] = df["y"] if not using_array_manager: @@ -584,7 +584,7 @@ def test_unstack_to_series(self, float_frame): tm.assert_frame_equal(undo, float_frame) # check NA handling - data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + data = DataFrame({"x": [1, 2, np.nan], 
"y": [3.0, 4, np.nan]}) data.index = Index(["a", "b", "c"]) result = data.unstack() @@ -592,7 +592,7 @@ def test_unstack_to_series(self, float_frame): levels=[["x", "y"], ["a", "b", "c"]], codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], ) - expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) + expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx) tm.assert_series_equal(result, expected) @@ -902,9 +902,9 @@ def cast(val): def test_unstack_nan_index2(self): # GH7403 df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [ @@ -921,9 +921,9 @@ def test_unstack_nan_index2(self): tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[2, 1] = np.NaN + df.iloc[2, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] @@ -935,9 +935,9 @@ def test_unstack_nan_index2(self): tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) - # Explicit cast to avoid implicit cast when setting to np.NaN + # Explicit cast to avoid implicit cast when setting to np.nan df = df.astype({"B": "float"}) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] @@ -958,7 +958,7 @@ def test_unstack_nan_index3(self, using_array_manager): } ) - df.iloc[3, 1] = np.NaN + df.iloc[3, 1] = np.nan left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) @@ -1754,7 +1754,7 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack) result = df["foo"].stack(future_stack=future_stack).sort_index() tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None - assert stacked["bar"].dtype == np.float_ + assert stacked["bar"].dtype == np.float64 def test_unstack_bug(self, future_stack): df = DataFrame( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d0ae9eeed394f..68ce58ad23690 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -18,7 +18,7 @@ from pandas.tests.groupby import get_groupby_method_args -def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): +def cartesian_product_for_groupers(result, args, names, fill_value=np.nan): """Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -42,28 +42,28 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. 
- "all": np.NaN, - "any": np.NaN, + "all": np.nan, + "any": np.nan, "count": 0, - "corrwith": np.NaN, - "first": np.NaN, - "idxmax": np.NaN, - "idxmin": np.NaN, - "last": np.NaN, - "max": np.NaN, - "mean": np.NaN, - "median": np.NaN, - "min": np.NaN, - "nth": np.NaN, + "corrwith": np.nan, + "first": np.nan, + "idxmax": np.nan, + "idxmin": np.nan, + "last": np.nan, + "max": np.nan, + "mean": np.nan, + "median": np.nan, + "min": np.nan, + "nth": np.nan, "nunique": 0, - "prod": np.NaN, - "quantile": np.NaN, - "sem": np.NaN, + "prod": np.nan, + "quantile": np.nan, + "sem": np.nan, "size": 0, - "skew": np.NaN, - "std": np.NaN, + "skew": np.nan, + "std": np.nan, "sum": 0, - "var": np.NaN, + "var": np.nan, } @@ -1750,8 +1750,8 @@ def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( cat2 = Categorical([0, 1]) idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"]) expected_dict = { - "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.nan, np.nan, 1], idx, name="c"), + "last": Series([1, np.nan, np.nan, 0], idx, name="c"), } expected = expected_dict[func] @@ -1775,8 +1775,8 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( cat2 = Categorical([0, 1]) idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"]) expected_dict = { - "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"), - "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"), + "first": Series([0, np.nan, np.nan, 1], idx, name="c"), + "last": Series([1, np.nan, np.nan, 0], idx, name="c"), } expected = expected_dict[func].to_frame() diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index fd5018d05380c..6c27344ce3110 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -232,7 +232,7 @@ def test_count_with_only_nans_in_first_group(self): def test_count_groupby_column_with_nan_in_groupby_column(self): # https://github.com/pandas-dev/pandas/issues/32841 - df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]}) res = df.groupby(["B"]).count() expected = DataFrame( index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 26881bdd18274..5d85a0783e024 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -578,7 +578,7 @@ def test_rank_min_int(): result = df.groupby("grp").rank() expected = DataFrame( - {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.NaN, 1.0, np.NaN]} + {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 94cf86b7fb9c5..d339639dc5def 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -20,7 +20,7 @@ class TestDatetimeIndex: def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") result = idx.astype(object) expected = Index( @@ -84,7 +84,7 @@ def test_astype_str_nat(self): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) - idx = DatetimeIndex(["2016-05-16", 
"NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) @@ -141,7 +141,7 @@ def test_astype_str_freq_and_tz(self): def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) @@ -151,7 +151,7 @@ def test_astype_datetime64(self): tm.assert_index_equal(result, idx) assert result is idx - idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST", name="idx") + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], tz="EST", name="idx") msg = "Cannot use .astype to convert from timezone-aware" with pytest.raises(TypeError, match=msg): # dt64tz->dt64 deprecated @@ -202,7 +202,7 @@ def test_astype_object_with_nat(self): ) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) msg = "Cannot cast DatetimeIndex to dtype" if dtype == "datetime64": msg = "Casting to unit-less dtype 'datetime64' is not supported" diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 6f3c83b999e94..09b06ecd5630d 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -538,8 +538,8 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", - np.NaN, - np.NaN, + np.nan, + np.nan, "11/06/2011 02:00", "11/06/2011 03:00", ] diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 49e8df2b71f22..aff4944e7bd55 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -247,12 +247,12 @@ def test_is_unique_interval(self, closed): assert idx.is_unique is True # unique NaN - idx = IntervalIndex.from_tuples([(np.NaN, np.NaN)], closed=closed) + idx = IntervalIndex.from_tuples([(np.nan, np.nan)], closed=closed) assert idx.is_unique is True # non-unique NaN idx = IntervalIndex.from_tuples( - [(np.NaN, np.NaN), (np.NaN, np.NaN)], closed=closed + [(np.nan, np.nan), (np.nan, np.nan)], closed=closed ) assert idx.is_unique is False diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index c5a3512113655..700af142958b3 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -217,7 +217,7 @@ def test_join_multi_with_nan(): ) df2 = DataFrame( data={"col2": [2.1, 2.2]}, - index=MultiIndex.from_product([["A"], [np.NaN, 2.0]], names=["id1", "id2"]), + index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]), ) result = df1.join(df2) expected = DataFrame( diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index 2a605d136175e..e54cd73a35f59 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -17,14 +17,14 @@ class TestPeriodIndexAsType: @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) def test_astype_raises(self, dtype): # GH#13149, GH#13209 - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + idx = 
PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D") msg = "Cannot cast PeriodIndex to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) def test_astype_conversion(self): # GH#13149, GH#13209 - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D", name="idx") + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") result = idx.astype(object) expected = Index( diff --git a/pandas/tests/indexes/period/test_pickle.py b/pandas/tests/indexes/period/test_pickle.py index 82f906d1e361f..cb981ab10064f 100644 --- a/pandas/tests/indexes/period/test_pickle.py +++ b/pandas/tests/indexes/period/test_pickle.py @@ -14,7 +14,7 @@ class TestPickle: @pytest.mark.parametrize("freq", ["D", "M", "A"]) def test_pickle_round_trip(self, freq): - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq=freq) result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b3fb5a26ca63f..ffa0b115e34fb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -473,7 +473,7 @@ def test_empty_fancy(self, index, dtype): def test_empty_fancy_raises(self, index): # DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. - empty_farr = np.array([], dtype=np.float_) + empty_farr = np.array([], dtype=np.float64) empty_index = type(index)([], dtype=index.dtype) assert index[[]].identical(empty_index) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 9b17a8af59ac5..f69f0fd3d78e2 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -45,7 +45,7 @@ def test_astype_object_with_nat(self): def test_astype(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN], name="idx") + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") result = idx.astype(object) expected = Index( @@ -78,7 +78,7 @@ def test_astype_uint(self): def test_astype_timedelta64(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) msg = ( r"Cannot convert from timedelta64\[ns\] to timedelta64. 
" @@ -98,7 +98,7 @@ def test_astype_timedelta64(self): @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan]) msg = "Cannot cast TimedeltaIndex to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abbf22a7fc70a..d0b6adfda0241 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -201,7 +201,7 @@ def test_column_types_consistent(self): df = DataFrame( data={ "channel": [1, 2, 3], - "A": ["String 1", np.NaN, "String 2"], + "A": ["String 1", np.nan, "String 2"], "B": [ Timestamp("2019-06-11 11:00:00"), pd.NaT, diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 2a65937a82200..8a25a2c1889f3 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -32,7 +32,7 @@ def string_data(): "234,3245.67", "gSaf,qWer|Gre", "asd3,4sad|", - np.NaN, + np.nan, ] } diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 97d9f13bd9e9e..b7108896f01ed 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -473,7 +473,7 @@ def test_set_change_dtype(self, mgr): mgr2.iset( mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N) ) - assert mgr2.iget(idx).dtype == np.float_ + assert mgr2.iget(idx).dtype == np.float64 def test_copy(self, mgr): cp = mgr.copy(deep=False) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 32509a799fa69..c8e984a92f418 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -181,7 +181,7 @@ def test_to_csv_na_rep(self): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
- df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) + df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) @@ -189,7 +189,7 @@ def test_to_csv_na_rep(self): assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs - df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) + df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index eecacf29de872..c79fdd9145a6a 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -46,7 +46,7 @@ def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): - time = time.astype(np.float_) + time = time.astype(np.float64) time = time.astype(np.int_) # convert float seconds to int type return pd.to_timedelta(time, unit="s") @@ -86,7 +86,7 @@ def __custom_date_parser(time): def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): - time = time.astype(np.float_) + time = time.astype(np.float64) time = time.astype(np.int_) # convert float seconds to int type return pd.to_timedelta(time, unit="s") diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3cfd86049588b..7459aa1df8f3e 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1475,9 +1475,9 @@ def test_stata_111(self, datapath): df = read_stata(datapath("io", "data", "stata", "stata7_111.dta")) original = DataFrame( { - "y": [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0], - "x": [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6], - "w": [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3], + "y": [1, 1, 1, 1, 1, 0, 0, np.nan, 0, 0], + "x": [1, 2, 1, 3, np.nan, 4, 3, 5, 1, 6], + "w": [2, np.nan, 5, 2, 4, 4, 3, 1, 2, 3], "z": ["a", "b", "c", "d", "e", "", "g", "h", "i", "j"], } ) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index afe5b3c66a611..87892a81cef3d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -867,7 +867,7 @@ def test_idxmin(self): string_series = tm.makeStringSeries().rename("series") # add some NaNs - string_series[5:15] = np.NaN + string_series[5:15] = np.nan # skipna or no assert string_series[string_series.idxmin()] == string_series.min() @@ -900,7 +900,7 @@ def test_idxmax(self): string_series = tm.makeStringSeries().rename("series") # add some NaNs - string_series[5:15] = np.NaN + string_series[5:15] = np.nan # skipna or no assert string_series[string_series.idxmax()] == string_series.max() diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 58c5fc7269aee..55d78c516b6f3 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -99,7 +99,7 @@ def _check_stat_op( f = getattr(Series, name) # add some NaNs - string_series_[5:15] = np.NaN + string_series_[5:15] = np.nan # mean, idxmax, idxmin, min, and max are valid for dates if name not in ["max", "min", "mean", "median", "std"]: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 1d72e6d3970ca..dbda751e82113 100644 --- 
a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -478,7 +478,7 @@ def test_resample_how_method(unit): ) s.index = s.index.as_unit(unit) expected = Series( - [11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], + [11, np.nan, np.nan, np.nan, np.nan, np.nan, 22], index=DatetimeIndex( [ Timestamp("2015-03-31 21:48:50"), @@ -1356,7 +1356,7 @@ def test_resample_consistency(unit): i30 = date_range("2002-02-02", periods=4, freq="30T").as_unit(unit) s = Series(np.arange(4.0), index=i30) - s.iloc[2] = np.NaN + s.iloc[2] = np.nan # Upsample by factor 3 with reindex() and resample() methods: i10 = date_range(i30[0], i30[-1], freq="10T").as_unit(unit) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 0ded7d7e6bfc5..7559a85de7a6b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -793,7 +793,7 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): @pytest.mark.parametrize( "freq, expected_values", [ - ("1s", [3, np.NaN, 7, 11]), + ("1s", [3, np.nan, 7, 11]), ("2s", [3, (7 + 11) / 2]), ("3s", [(3 + 7) / 2, 11]), ], diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 869bf3ace9492..3efcd930af581 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -507,10 +507,10 @@ def test_concat_duplicate_indices_raise(self): concat([df1, df2], axis=1) -@pytest.mark.parametrize("dt", np.sctypes["float"]) -def test_concat_no_unnecessary_upcast(dt, frame_or_series): +def test_concat_no_unnecessary_upcast(float_numpy_dtype, frame_or_series): # GH 13247 dims = frame_or_series(dtype=object).ndim + dt = float_numpy_dtype dfs = [ frame_or_series(np.array([1], dtype=dt, ndmin=dims)), @@ -522,8 +522,8 @@ def test_concat_no_unnecessary_upcast(dt, frame_or_series): @pytest.mark.parametrize("pdt", [Series, DataFrame]) -@pytest.mark.parametrize("dt", np.sctypes["int"]) -def test_concat_will_upcast(dt, pdt): +def test_concat_will_upcast(pdt, any_signed_int_numpy_dtype): + dt = any_signed_int_numpy_dtype dims = pdt().ndim dfs = [ pdt(np.array([1], dtype=dt, ndmin=dims)), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 43786ee15d138..46da18445e135 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1950,7 +1950,7 @@ def test_pivot_table_not_series(self): result = df.pivot_table("col1", index="col3", columns="col2", aggfunc="sum") expected = DataFrame( - [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], + [[3, np.nan, np.nan], [np.nan, 4, np.nan], [np.nan, np.nan, 5]], index=Index([1, 3, 9], name="col3"), columns=Index(["C", "D", "E"], name="col2"), ) @@ -2424,7 +2424,7 @@ def test_pivot_table_aggfunc_nunique_with_different_values(self): ], names=(None, None, "b"), ) - nparr = np.full((10, 10), np.NaN) + nparr = np.full((10, 10), np.nan) np.fill_diagonal(nparr, 1.0) expected = DataFrame(nparr, index=Index(range(10), name="a"), columns=columnval) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 213fa1791838d..287b7557f50f9 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -103,9 +103,9 @@ def test_comparison_ops(comparison_op, other): False, np.bool_(False), np.int_(0), - np.float_(0), + np.float64(0), np.int_(-0), - np.float_(-0), + np.float64(-0), ], ) 
@pytest.mark.parametrize("asarray", [True, False]) @@ -123,7 +123,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -133,14 +133,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float_, np.bool_, np.int_)): + elif not isinstance(value, (np.float64, np.bool_, np.int_)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float_(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index e7fea9aa597b8..dd810a31c25af 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -739,9 +739,9 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): "input_series, expected_output", [ [["2020-01-01"], [[2020, 1, 3]]], - [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], + [[pd.NaT], [[np.nan, np.nan, np.nan]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], - [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.nan, np.nan, np.nan]]], # see GH#36032 [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 20f8dd1fc5b2a..7b857a487db78 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -196,9 +196,9 @@ def test_setitem_ambiguous_keyerror(indexer_sl): def test_setitem(datetime_series): - datetime_series[datetime_series.index[5]] = np.NaN - datetime_series.iloc[[1, 2, 17]] = np.NaN - datetime_series.iloc[6] = np.NaN + datetime_series[datetime_series.index[5]] = np.nan + datetime_series.iloc[[1, 2, 17]] = np.nan + datetime_series.iloc[6] = np.nan assert np.isnan(datetime_series.iloc[6]) assert np.isnan(datetime_series.iloc[2]) datetime_series[np.isnan(datetime_series)] = 5 @@ -304,7 +304,7 @@ def test_underlying_data_conversion(using_copy_on_write): def test_preserve_refs(datetime_series): seq = datetime_series.iloc[[5, 10, 15]] - seq.iloc[1] = np.NaN + seq.iloc[1] = np.nan assert not np.isnan(datetime_series.iloc[10]) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index bd8b7b34bd402..5bcf42aad1db4 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -27,7 +27,7 @@ def test_argsort_numpy(self, datetime_series): # with missing values ts = ser.copy() - ts[::2] = np.NaN + ts[::2] = np.nan msg = "The behavior of Series.argsort in the presence of NA values" with tm.assert_produces_warning( diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index d5f99f721d323..31c264d74d063 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -65,8 +65,8 @@ def test_scalar(self): rng 
= date_range("1/1/1990", periods=N, freq="53s") # Explicit cast to float avoid implicit cast when setting nan ts = Series(np.arange(N), index=rng, dtype="float") - ts.iloc[5:10] = np.NaN - ts.iloc[15:20] = np.NaN + ts.iloc[5:10] = np.nan + ts.iloc[15:20] = np.nan val1 = ts.asof(ts.index[7]) val2 = ts.asof(ts.index[19]) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index c7ca73da9ae66..d2d8eab1cb38b 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -36,7 +36,7 @@ def test_combine_first(self): series = Series(values, index=tm.makeIntIndex(20)) series_copy = series * 2 - series_copy[::2] = np.NaN + series_copy[::2] = np.nan # nothing used from the input combined = series.combine_first(series_copy) @@ -70,14 +70,14 @@ def test_combine_first(self): tm.assert_series_equal(ser, result) def test_combine_first_dt64(self): - s0 = to_datetime(Series(["2010", np.NaN])) - s1 = to_datetime(Series([np.NaN, "2011"])) + s0 = to_datetime(Series(["2010", np.nan])) + s1 = to_datetime(Series([np.nan, "2011"])) rs = s0.combine_first(s1) xp = to_datetime(Series(["2010", "2011"])) tm.assert_series_equal(rs, xp) - s0 = to_datetime(Series(["2010", np.NaN])) - s1 = Series([np.NaN, "2011"]) + s0 = to_datetime(Series(["2010", np.nan])) + s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 5ebf45090d7b8..77600e0e7d293 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -27,7 +27,7 @@ def test_copy(self, deep, using_copy_on_write): else: assert not np.may_share_memory(ser.values, ser2.values) - ser2[::2] = np.NaN + ser2[::2] = np.nan if deep is not False or using_copy_on_write: # Did not modify original Series diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 90984a2e65cba..9ba163f347198 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -12,7 +12,7 @@ class TestSeriesCount: def test_count(self, datetime_series): assert datetime_series.count() == len(datetime_series) - datetime_series[::2] = np.NaN + datetime_series[::2] = np.nan assert datetime_series.count() == np.isfinite(datetime_series).sum() diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 7e4503be2ec47..96c2e1ba6d9bb 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -71,7 +71,7 @@ def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): class TestSeriesDropDuplicates: @pytest.fixture( - params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"] + params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] ) def dtype(self, request): return request.param diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 96c3674541e6b..46bc14da59eb0 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -75,7 +75,7 @@ def test_fillna(self): tm.assert_series_equal(ts, ts.fillna(method="ffill")) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) 
tm.assert_series_equal(ts.fillna(method="ffill"), exp) @@ -881,7 +881,7 @@ def test_fillna_bug(self): def test_ffill(self): ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) def test_ffill_mixed_dtypes_without_missing_data(self): @@ -892,7 +892,7 @@ def test_ffill_mixed_dtypes_without_missing_data(self): def test_bfill(self): ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts.iloc[2] = np.NaN + ts.iloc[2] = np.nan tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) def test_pad_nan(self): diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index a984cd16997aa..619690f400d98 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,7 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.NaN + ts_copy[5:10] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -104,7 +104,7 @@ def test_interpolate(self, datetime_series): ).astype(float) ord_ts_copy = ord_ts.copy() - ord_ts_copy[5:10] = np.NaN + ord_ts_copy[5:10] = np.nan time_interp = ord_ts_copy.interpolate(method="time") tm.assert_series_equal(time_interp, ord_ts) @@ -112,7 +112,7 @@ def test_interpolate(self, datetime_series): def test_interpolate_time_raises_for_non_timeseries(self): # When method='time' is used on a non-TimeSeries that contains a null # value, a ValueError should be raised. - non_ts = Series([0, 1, 2, np.NaN]) + non_ts = Series([0, 1, 2, np.nan]) msg = "time-weighted interpolation only works on Series.* with a DatetimeIndex" with pytest.raises(ValueError, match=msg): non_ts.interpolate(method="time") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 00d1ad99332e9..783e18e541ad8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -104,7 +104,7 @@ def test_map_series_stringdtype(any_string_dtype): @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.NaN], "category"), (["1-1", "1-2", np.NaN], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 @@ -114,7 +114,7 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - expected = Series(["1", "1", np.NaN], dtype=expected_dtype) + expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -229,11 +229,11 @@ def test_map_int(): left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) right = Series({1: 11, 2: 22, 3: 33}) - assert left.dtype == np.float_ + assert left.dtype == np.float64 assert issubclass(right.dtype.type, np.integer) merged = left.map(right) - assert merged.dtype == np.float_ + assert merged.dtype == np.float64 assert isna(merged["d"]) assert not isna(merged["c"]) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 38a42062b275e..4dabf7b87e2cd 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -40,7 +40,7 @@ def test_pct_change_with_duplicate_axis(self): result = 
Series(range(5), common_idx).pct_change(freq="B") # the reason that the expected should be like this is documented at PR 28681 - expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + expected = Series([np.nan, np.inf, np.nan, np.nan, 3.0], common_idx) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 766a2415d89fb..24cf97c05c0a8 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -185,7 +185,7 @@ def test_rank_categorical(self): # Test na_option for rank data na_ser = Series( - ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN] + ["first", "second", "third", "fourth", "fifth", "sixth", np.nan] ).astype( CategoricalDtype( ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], @@ -195,7 +195,7 @@ def test_rank_categorical(self): exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) - exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN]) + exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top) tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) @@ -204,7 +204,7 @@ def test_rank_categorical(self): # Test na_option for rank data with ascending False exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) - exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN]) + exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) tm.assert_series_equal( @@ -223,12 +223,12 @@ def test_rank_categorical(self): na_ser.rank(na_option=True, ascending=False) # Test with pct=True - na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype( + na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype( CategoricalDtype(["first", "second", "third", "fourth"], True) ) exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) - exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN]) + exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan]) tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 52446f96009d5..2ab1cd13a31d8 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -194,7 +194,7 @@ def test_reindex_int(datetime_series): reindexed_int = int_ts.reindex(datetime_series.index) # if NaNs introduced - assert reindexed_int.dtype == np.float_ + assert reindexed_int.dtype == np.float64 # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) @@ -425,11 +425,11 @@ def test_reindexing_with_float64_NA_log(): s = Series([1.0, NA], dtype=Float64Dtype()) s_reindex = s.reindex(range(3)) result = s_reindex.values._data - expected = np.array([1, np.NaN, np.NaN]) + expected = np.array([1, np.nan, np.nan]) tm.assert_numpy_array_equal(result, expected) with tm.assert_produces_warning(None): result_log = np.log(s_reindex) - expected_log = Series([0, np.NaN, np.NaN], dtype=Float64Dtype()) + expected_log = Series([0, np.nan, np.nan], dtype=Float64Dtype()) tm.assert_series_equal(result_log, expected_log) diff --git a/pandas/tests/series/methods/test_sort_values.py 
b/pandas/tests/series/methods/test_sort_values.py index c3e074dc68c82..4808272879071 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -18,7 +18,7 @@ def test_sort_values(self, datetime_series, using_copy_on_write): tm.assert_series_equal(expected, result) ts = datetime_series.copy() - ts[:5] = np.NaN + ts[:5] = np.nan vals = ts.values result = ts.sort_values() diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 331afc4345616..611f4a7f790a6 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -165,7 +165,7 @@ def test_constructor(self, datetime_series): assert id(datetime_series.index) == id(derived.index) # Mixed type Series - mixed = Series(["hello", np.NaN], index=[0, 1]) + mixed = Series(["hello", np.nan], index=[0, 1]) assert mixed.dtype == np.object_ assert np.isnan(mixed[1]) @@ -1464,8 +1464,8 @@ def test_fromDict(self): assert series.dtype == np.float64 def test_fromValue(self, datetime_series): - nans = Series(np.NaN, index=datetime_series.index, dtype=np.float64) - assert nans.dtype == np.float_ + nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) + assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 4c5fd2d44e4f4..e6f7b2a5e69e0 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -31,7 +31,7 @@ def test_datetime_series(self, datetime_series, func): # with missing values ts = datetime_series.copy() - ts[::2] = np.NaN + ts[::2] = np.nan result = func(ts)[1::2] expected = func(np.array(ts.dropna())) @@ -47,7 +47,7 @@ def test_cummin_cummax(self, datetime_series, method): tm.assert_numpy_array_equal(result, expected) ts = datetime_series.copy() - ts[::2] = np.NaN + ts[::2] = np.nan result = getattr(ts, method)()[1::2] expected = ufunc(ts.dropna()) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 4dab3e8f62598..26046ef9ba295 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -93,7 +93,7 @@ def test_logical_operators_int_dtype_with_float(self): msg = "Cannot perform.+with a dtyped.+array and scalar of type" with pytest.raises(TypeError, match=msg): - s_0123 & np.NaN + s_0123 & np.nan with pytest.raises(TypeError, match=msg): s_0123 & 3.14 msg = "unsupported operand type.+for &:" @@ -149,11 +149,11 @@ def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") - result = s_0123 & Series([False, np.NaN, False, False]) + result = s_0123 & Series([False, np.nan, False, False]) expected = Series([False] * 4) tm.assert_series_equal(result, expected) - s_abNd = Series(["a", "b", np.NaN, "d"]) + s_abNd = Series(["a", "b", np.nan, "d"]) with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): s_0123 & s_abNd diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 9f17f6d86cf93..cafc69c4d0f20 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -84,7 +84,7 @@ def test_logical_range_select(self, datetime_series): def test_valid(self, datetime_series): ts = datetime_series.copy() ts.index = ts.index._with_freq(None) - 
ts[::2] = np.NaN + ts[::2] = np.nan result = ts.dropna() assert len(result) == ts.count() diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 4c92b5694c43b..f294885fb8f4d 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -83,7 +83,7 @@ def test_string(self, string_series): str(string_series.astype(int)) # with NaNs - string_series[5:7] = np.NaN + string_series[5:7] = np.nan str(string_series) def test_object(self, object_series): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 76784ec726afe..a0062d2b6dd44 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -323,7 +323,7 @@ def check_fun_data( res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) if ( - isinstance(targ, np.complex_) + isinstance(targ, np.complex128) and isinstance(res, float) and np.isnan(targ) and np.isnan(res) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5e51edfee17f1..93fe9b05adb4f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1570,8 +1570,8 @@ def test_convert_object_to_datetime_with_cache( (Series([""] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), (Series([pd.NA] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), (Series([pd.NA] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([np.NaN] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([np.NaN] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), + (Series([np.nan] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), + (Series([np.nan] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), ), ) def test_to_datetime_converts_null_like_to_nat(self, cache, input, expected): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index a86302f158005..8527efdbf7867 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -293,7 +293,7 @@ def test_assert_almost_equal_null(): _assert_almost_equal_both(None, None) -@pytest.mark.parametrize("a,b", [(None, np.NaN), (None, 0), (np.NaN, 0)]) +@pytest.mark.parametrize("a,b", [(None, np.nan), (None, 0), (np.nan, 0)]) def test_assert_not_almost_equal_null(a, b): _assert_not_almost_equal(a, b) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 2dd4458172593..73ab470ab97a7 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -126,7 +126,7 @@ def series(): """Make mocked series as fixture.""" arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan series = Series(arr, index=bdate_range(datetime(2009, 1, 1), periods=100)) return series diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index d901fe58950e3..33858e10afd75 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -223,9 +223,9 @@ def test_count_nonnumeric_types(step): Period("2012-02"), Period("2012-03"), ], - "fl_inf": [1.0, 2.0, np.Inf], - "fl_nan": [1.0, 2.0, np.NaN], - "str_nan": ["aa", "bb", np.NaN], + "fl_inf": [1.0, 2.0, np.inf], + "fl_nan": [1.0, 2.0, np.nan], + "str_nan": ["aa", "bb", np.nan], "dt_nat": dt_nat_col, "periods_nat": [ Period("2012-01"), diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 6af5a41e96e0a..4e4eca6e772e7 100644 --- 
a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -184,8 +184,8 @@ def numpysum(x, par): def test_nans(raw): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).apply(f, raw=raw) tm.assert_almost_equal(result.iloc[-1], np.mean(obj[10:-10])) @@ -210,12 +210,12 @@ def test_nans(raw): def test_center(raw): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(20, min_periods=15, center=True).apply(f, raw=raw) expected = ( - concat([obj, Series([np.NaN] * 9)]) + concat([obj, Series([np.nan] * 9)]) .rolling(20, min_periods=15) .apply(f, raw=raw) .iloc[9:] diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 45d481fdd2e44..c5c395414b450 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -417,7 +417,7 @@ def test_ewm_alpha(): # GH 10789 arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan s = Series(arr) a = s.ewm(alpha=0.61722699889169674).mean() @@ -433,7 +433,7 @@ def test_ewm_domain_checks(): # GH 12492 arr = np.random.default_rng(2).standard_normal(100) locs = np.arange(20, 40) - arr[locs] = np.NaN + arr[locs] = np.nan s = Series(arr) msg = "comass must satisfy: comass >= 0" @@ -484,8 +484,8 @@ def test_ew_empty_series(method): def test_ew_min_periods(min_periods, name): # excluding NaNs correctly arr = np.random.default_rng(2).standard_normal(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN + arr[:10] = np.nan + arr[-10:] = np.nan s = Series(arr) # check min_periods @@ -515,11 +515,11 @@ def test_ew_min_periods(min_periods, name): else: # ewm.std, ewm.var with bias=False require at least # two values - tm.assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.nan])) # pass in ints result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ + assert result2.dtype == np.float64 @pytest.mark.parametrize("name", ["cov", "corr"]) @@ -527,8 +527,8 @@ def test_ewm_corr_cov(name): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) B = A[2:] + np.random.default_rng(2).standard_normal(48) - A[:10] = np.NaN - B.iloc[-10:] = np.NaN + A[:10] = np.nan + B.iloc[-10:] = np.nan result = getattr(A.ewm(com=20, min_periods=5), name)(B) assert np.isnan(result.values[:14]).all() @@ -542,8 +542,8 @@ def test_ewm_corr_cov_min_periods(name, min_periods): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) B = A[2:] + np.random.default_rng(2).standard_normal(48) - A[:10] = np.NaN - B.iloc[-10:] = np.NaN + A[:10] = np.nan + B.iloc[-10:] = np.nan result = getattr(A.ewm(com=20, min_periods=min_periods), name)(B) # binary functions (ewmcov, ewmcorr) with bias=False require at @@ -560,13 +560,13 @@ def test_ewm_corr_cov_min_periods(name, min_periods): result = getattr(Series([1.0]).ewm(com=50, min_periods=min_periods), name)( Series([1.0]) ) - tm.assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.nan])) @pytest.mark.parametrize("name", ["cov", "corr"]) def test_different_input_array_raise_exception(name): A = Series(np.random.default_rng(2).standard_normal(50), index=range(50)) - A[:10] = np.NaN + A[:10] = np.nan msg = "other must be a DataFrame or Series" # exception raised is 
Exception diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 8dac6d271510a..b6f2365afb457 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -410,7 +410,7 @@ def test_cov_mulittindex(self): expected = DataFrame( np.vstack( ( - np.full((8, 8), np.NaN), + np.full((8, 8), np.nan), np.full((8, 8), 32.000000), np.full((8, 8), 63.881919), ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4df20282bbfa6..70b7534b296f3 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -142,7 +142,7 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): index=date_range("2017-08-08", periods=n, freq="D"), ) expected = DataFrame( - {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + {"value": np.append([np.nan, 1.0], np.arange(3.0, 27.0, 3))}, index=date_range("2017-08-08", periods=n, freq="D"), ) result_roll_sum = df.rolling(window=window, min_periods=2).sum() @@ -1461,15 +1461,15 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.03, 0.03, 0.001, - np.NaN, + np.nan, 0.002, 0.008, - np.NaN, - np.NaN, - np.NaN, - np.NaN, - np.NaN, - np.NaN, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, 0.005, 0.2, ] @@ -1480,8 +1480,8 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.005, 0.005, 0.008, - np.NaN, - np.NaN, + np.nan, + np.nan, 0.005, 0.102500, ] @@ -1495,7 +1495,7 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): def test_rolling_sum_all_nan_window_floating_artifacts(): # GH#41053 - df = DataFrame([0.002, 0.008, 0.005, np.NaN, np.NaN, np.NaN]) + df = DataFrame([0.002, 0.008, 0.005, np.nan, np.nan, np.nan]) result = df.rolling(3, min_periods=0).sum() expected = DataFrame([0.002, 0.010, 0.015, 0.013, 0.005, 0.0]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index bc0b3e496038c..940f0845befa2 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -150,8 +150,8 @@ def test_time_rule_frame(raw, frame, compare_func, roll_func, kwargs, minp): ) def test_nans(compare_func, roll_func, kwargs): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(50, min_periods=30), roll_func)(**kwargs) tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -177,8 +177,8 @@ def test_nans(compare_func, roll_func, kwargs): def test_nans_count(): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).count() tm.assert_almost_equal( result.iloc[-1], np.isfinite(obj[10:-10]).astype(float).sum() @@ -241,15 +241,15 @@ def test_min_periods_count(series, step): ) def test_center(roll_func, kwargs, minp): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(20, min_periods=minp, center=True), roll_func)( **kwargs ) expected = ( getattr( - concat([obj, Series([np.NaN] * 9)]).rolling(20, min_periods=minp), roll_func + concat([obj, Series([np.nan] * 9)]).rolling(20, min_periods=minp), roll_func )(**kwargs) .iloc[9:] 
.reset_index(drop=True) diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index 32296ae3f2470..d5a7010923563 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -89,8 +89,8 @@ def test_time_rule_frame(raw, frame, q): def test_nans(q): compare_func = partial(scoreatpercentile, per=q) obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(50, min_periods=30).quantile(q) tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -128,12 +128,12 @@ def test_min_periods(series, minp, q, step): @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) def test_center(q): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = obj.rolling(20, center=True).quantile(q) expected = ( - concat([obj, Series([np.NaN] * 9)]) + concat([obj, Series([np.nan] * 9)]) .rolling(20) .quantile(q) .iloc[9:] diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index ada726401c4a0..79c14f243e7cc 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -79,8 +79,8 @@ def test_nans(sp_func, roll_func): compare_func = partial(getattr(sp_stats, sp_func), bias=False) obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(50, min_periods=30), roll_func)() tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) @@ -122,12 +122,12 @@ def test_min_periods(series, minp, roll_func, step): @pytest.mark.parametrize("roll_func", ["kurt", "skew"]) def test_center(roll_func): obj = Series(np.random.default_rng(2).standard_normal(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN + obj[:10] = np.nan + obj[-10:] = np.nan result = getattr(obj.rolling(20, center=True), roll_func)() expected = ( - getattr(concat([obj, Series([np.NaN] * 9)]).rolling(20), roll_func)() + getattr(concat([obj, Series([np.nan] * 9)]).rolling(20), roll_func)() .iloc[9:] .reset_index(drop=True) ) @@ -170,14 +170,14 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - expected = Series([np.NaN] * 4 + [0.0])[::step] + expected = Series([np.nan] * 4 + [0.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() # index 4 should be 0 as it contains 5 same obs tm.assert_series_equal(expected, x) - expected = Series([np.NaN] * 5)[::step] + expected = Series([np.nan] * 5)[::step] # yields all NaN (window too small) d = Series(np.random.default_rng(2).standard_normal(5)) x = d.rolling(window=2, step=step).skew() @@ -185,13 +185,13 @@ def test_rolling_skew_edge_cases(step): # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824])[::step] + expected = Series([np.nan, np.nan, np.nan, 0.177994, 1.548824])[::step] x = d.rolling(window=4, step=step).skew() tm.assert_series_equal(expected, x) def test_rolling_kurt_edge_cases(step): - expected = Series([np.NaN] * 4 + [-3.0])[::step] + expected = Series([np.nan] * 4 + [-3.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) @@ -199,14 +199,14 @@ def 
test_rolling_kurt_edge_cases(step): tm.assert_series_equal(expected, x) # yields all NaN (window too small) - expected = Series([np.NaN] * 5)[::step] + expected = Series([np.nan] * 5)[::step] d = Series(np.random.default_rng(2).standard_normal(5)) x = d.rolling(window=3, step=step).kurt() tm.assert_series_equal(expected, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) - expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499])[::step] + expected = Series([np.nan, np.nan, np.nan, 1.224307, 2.671499])[::step] x = d.rolling(window=4, step=step).kurt() tm.assert_series_equal(expected, x) diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 2ca02fef796ed..5052019ddb726 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -666,7 +666,7 @@ def test_weighted_var_big_window_no_segfault(win_types, center): pytest.importorskip("scipy") x = Series(0) result = x.rolling(window=16, center=center, win_type=win_types).var() - expected = Series(np.NaN) + expected = Series(np.nan) tm.assert_series_equal(result, expected) From 49c2f33450f331967c0242717ce0f09fa92f17e7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:24:52 -0700 Subject: [PATCH 014/174] Backport PR #54545 on branch 2.1.x (DOC: whatsnew 2.1.0 refinements) (#54588) Backport PR #54545: DOC: whatsnew 2.1.0 refinements Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/reference/series.rst | 2 + doc/source/whatsnew/v2.1.0.rst | 265 ++++++++++++++++---------------- 2 files changed, 131 insertions(+), 136 deletions(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 5a43e5796d1d9..41705620d4bc7 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -314,6 +314,7 @@ Datetime properties Series.dt.weekday Series.dt.dayofyear Series.dt.day_of_year + Series.dt.days_in_month Series.dt.quarter Series.dt.is_month_start Series.dt.is_month_end @@ -327,6 +328,7 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit + Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d1a689dc60830..a8004dfd506b0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -44,7 +44,7 @@ This release introduces an option ``future.infer_string`` that infers all strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead. This option only works if PyArrow is installed. PyArrow backed strings have a significantly reduced memory footprint and provide a big performance improvement -compared to NumPy object. +compared to NumPy object (:issue:`54430`). The option can be enabled with: @@ -60,8 +60,8 @@ DataFrame reductions preserve extension dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions of pandas, the results of DataFrame reductions -(:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) had numpy dtypes, even when the DataFrames -were of extension dtypes. Pandas can now keep the dtypes when doing reductions over Dataframe +(:meth:`DataFrame.sum` :meth:`DataFrame.mean` etc.) had NumPy dtypes, even when the DataFrames +were of extension dtypes. Pandas can now keep the dtypes when doing reductions over DataFrame columns with a common dtype (:issue:`52788`). 
*Old Behavior* @@ -90,9 +90,9 @@ columns with a common dtype (:issue:`52788`). df = df.astype("int64[pyarrow]") df.sum() -Notice that the dtype is now a masked dtype and pyarrow dtype, respectively, while previously it was a numpy integer dtype. +Notice that the dtype is now a masked dtype and PyArrow dtype, respectively, while previously it was a NumPy integer dtype. -To allow Dataframe reductions to preserve extension dtypes, :meth:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but will it become required in the future. If the parameter is not found in the signature, DataFrame reductions can not preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`ExtensionArray._reduce`. +To allow DataFrame reductions to preserve extension dtypes, :meth:`.ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`.ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but will it become required in the future. If the parameter is not found in the signature, DataFrame reductions can not preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`.ExtensionArray._reduce`. .. _whatsnew_210.enhancements.cow: @@ -106,7 +106,7 @@ Copy-on-Write improvements of Index objects and specifying ``copy=False``, will now use a lazy copy of those Index objects for the columns of the DataFrame (:issue:`52947`) - A shallow copy of a Series or DataFrame (``df.copy(deep=False)``) will now also return - a shallow copy of the rows/columns ``Index`` objects instead of only a shallow copy of + a shallow copy of the rows/columns :class:`Index` objects instead of only a shallow copy of the data, i.e. the index of the result is no longer identical (``df.copy(deep=False).index is df.index`` is no longer True) (:issue:`53721`) - :meth:`DataFrame.head` and :meth:`DataFrame.tail` will now return deep copies (:issue:`54011`) @@ -130,8 +130,10 @@ Copy-on-Write improvements .. _whatsnew_210.enhancements.map_na_action: -``map(func, na_action="ignore")`` now works for all array types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +New :meth:`DataFrame.map` method and support for ExtensionArrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.map` been added and :meth:`DataFrame.applymap` has been deprecated. :meth:`DataFrame.map` has the same functionality as :meth:`DataFrame.applymap`, but the new name better communicates that this is the :class:`DataFrame` version of :meth:`Series.map` (:issue:`52353`). When given a callable, :meth:`Series.map` applies the callable to all elements of the :class:`Series`. 
Similarly, :meth:`DataFrame.map` applies the callable to all elements of the :class:`DataFrame`, @@ -139,8 +141,8 @@ while :meth:`Index.map` applies the callable to all elements of the :class:`Inde Frequently, it is not desirable to apply the callable to nan-like values of the array and to avoid doing that, the ``map`` method could be called with ``na_action="ignore"``, i.e. ``ser.map(func, na_action="ignore")``. -However, ``na_action="ignore"`` was not implemented for many ``ExtensionArray`` and ``Index`` types -and ``na_action="ignore"`` did not work correctly for any ``ExtensionArray`` subclass except the nullable numeric ones (i.e. with dtype :class:`Int64` etc.). +However, ``na_action="ignore"`` was not implemented for many :class:`.ExtensionArray` and ``Index`` types +and ``na_action="ignore"`` did not work correctly for any :class:`.ExtensionArray` subclass except the nullable numeric ones (i.e. with dtype :class:`Int64` etc.). ``na_action="ignore"`` now works for all array types (:issue:`52219`, :issue:`51645`, :issue:`51809`, :issue:`51936`, :issue:`52033`; :issue:`52096`). @@ -172,11 +174,9 @@ and ``na_action="ignore"`` did not work correctly for any ``ExtensionArray`` sub idx = pd.Index(ser) idx.map(str.upper, na_action="ignore") -Notice also that in this version, :meth:`DataFrame.map` been added and :meth:`DataFrame.applymap` has been deprecated. :meth:`DataFrame.map` has the same functionality as :meth:`DataFrame.applymap`, but the new name better communicate that this is the :class:`DataFrame` version of :meth:`Series.map` (:issue:`52353`). - Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set to ``"ignore"`` by default. -This has been deprecated and will :meth:`Categorical.map` in the future change the default -to ``na_action=None``, like for all the other array types. +This has been deprecated and the default for :meth:`Categorical.map` will change +to ``na_action=None``, consistent with all the other array types. .. _whatsnew_210.enhancements.new_stack: @@ -222,8 +222,9 @@ If the input contains NA values, the previous version would drop those as well w Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Series.ffill` and :meth:`Series.bfill` are now supported for objects with :class:`IntervalDtype` (:issue:`54247`) -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. - :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. +- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`) +- :meth:`.Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. + :meth:`.Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. 
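A minimal sketch of the new ``na_action`` keyword on :meth:`.Categorical.map` (the data here is made up for illustration; passing ``na_action="ignore"`` reproduces the old implicit behavior):

.. ipython:: python

    import pandas as pd

    cat = pd.Categorical(["a", "b", None])
    # the mapper is not called for the missing value
    cat.map(str.upper, na_action="ignore")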
Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) @@ -231,49 +232,41 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). -- :meth:`pandas.read_html` now supports the ``storage_options`` keyword when used with a URL, allowing users to add headers the outbound HTTP request (:issue:`49944`) -- Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`) +- :meth:`pandas.read_html` now supports the ``storage_options`` keyword when used with a URL, allowing users to add headers to the outbound HTTP request (:issue:`49944`) +- Add :meth:`Index.diff` and :meth:`Index.round` (:issue:`19708`) +- Add ``"latex-math"`` as an option to the ``escape`` argument of :class:`.Styler` which will not escape all characters between ``"\("`` and ``"\)"`` during formatting (:issue:`51903`) - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) -- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) - Adding ``engine_kwargs`` parameter to :func:`read_excel` (:issue:`52214`) - Classes that are useful for type-hinting have been added to the public API in the new submodule ``pandas.api.typing`` (:issue:`48577`) -- Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.is_days_in_month`, :attr:`Series.dt.unit`, :attr:`Series.dt.is_normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`) -- :meth:`.DataFrameGroupby.agg` and :meth:`.DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) -- :meth:`.SeriesGroupby.agg` and :meth:`.DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) -- :meth:`.SeriesGroupby.transform` and :meth:`.DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) -- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) +- Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.days_in_month`, :attr:`Series.dt.unit`, :attr:`Series.dt.normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`) +- 
:meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) +- :meth:`.SeriesGroupBy.agg` and :meth:`.DataFrameGroupBy.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) +- :meth:`.SeriesGroupBy.transform` and :meth:`.DataFrameGroupBy.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) - :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) -- :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) +- :meth:`Series.explode` now supports PyArrow-backed list types (:issue:`53602`) - :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) -- Added :meth:`ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`) +- Add ``validate`` parameter to :meth:`Categorical.from_codes` (:issue:`50975`) +- Added :meth:`.ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Implemented :func:`api.interchange.from_dataframe` for :class:`DatetimeTZDtype` (:issue:`54239`) -- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`) -- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) +- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype` (:issue:`52201`) +- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`.ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) - Improved error handling when using :meth:`DataFrame.to_json` with incompatible ``index`` and ``orient`` arguments (:issue:`52143`) -- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) -- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`pandas.api.indexers.VariableOffsetWindowIndexer` (:issue:`54379`) +- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns (:issue:`52084`) +- Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`.VariableOffsetWindowIndexer` (:issue:`54379`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`). 
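A short sketch of the new ``by_row`` keyword from the bullet above (example data is illustrative):

.. ipython:: python

    import pandas as pd

    ser = pd.Series([1, 2, 3])
    # by_row=False: the callable receives the whole Series in a single call
    # instead of being applied element by element
    ser.apply(lambda s: s / s.max(), by_row=False)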
- :meth:`DataFrame.shift` and :meth:`Series.shift` now allow shifting by multiple periods by supplying a list of periods (:issue:`44424`) -- Groupby aggregations (such as :meth:`.DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) +- Groupby aggregations with ``numba`` (such as :meth:`.DataFrameGroupBy.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) - Improved error message when :meth:`.DataFrameGroupBy.agg` failed (:issue:`52930`) -- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) -- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) +- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`) +- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) -- Performance improvement in :meth:`.GroupBy.quantile` (:issue:`51722`) - -.. --------------------------------------------------------------------------- -.. _whatsnew_210.notable_bug_fixes: - -Notable bug fixes -~~~~~~~~~~~~~~~~~ - -These are bug fixes that might have notable behavior changes. +- Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) .. --------------------------------------------------------------------------- .. _whatsnew_210.api_breaking: @@ -363,7 +356,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- :class:`arrays.PandasArray` has been renamed ``NumpyExtensionArray`` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`) +- :class:`arrays.PandasArray` has been renamed :class:`.NumpyExtensionArray` and the attached dtype name changed from ``PandasDtype`` to ``NumpyEADtype``; importing ``PandasArray`` still works until the next major version (:issue:`53694`) .. --------------------------------------------------------------------------- .. _whatsnew_210.deprecations: @@ -374,6 +367,8 @@ Deprecations Deprecated silent upcasting in setitem-like Series operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +PDEP-6: https://pandas.pydata.org/pdeps/0006-ban-upcasting.html + Setitem-like operations on Series (or DataFrame columns) which silently upcast the dtype are deprecated and show a warning. 
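For instance, assigning a float into an ``int64`` :class:`Series` now warns before upcasting (a minimal sketch of the deprecated pattern; the data is illustrative):

.. ipython:: python
    :okwarning:

    import pandas as pd

    ser = pd.Series([1, 2, 3])  # int64
    # emits a FutureWarning instead of silently upcasting to float64
    ser[0] = 1.5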
Examples of affected operations are: @@ -506,34 +501,33 @@ and ``datetime.datetime.strptime``: Other Deprecations ^^^^^^^^^^^^^^^^^^ -- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) -- Deprecated 'downcast' keyword in :meth:`Index.fillna` (:issue:`53956`) -- Deprecated 'fill_method' and 'limit' keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call ``ffill`` or ``bfill`` before calling ``pct_change`` instead (:issue:`53491`) -- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) -- Deprecated 'quantile' keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed as 'q' instead (:issue:`52550`) - Deprecated :attr:`.DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`) - Deprecated :attr:`DataFrame._data` and :attr:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :func:`concat` behavior when any of the objects being concatenated have length 0; in the past the dtypes of empty objects were ignored when determining the resulting dtype, in a future version they will not (:issue:`39122`) +- Deprecated :meth:`.Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Deprecated :meth:`.DataFrameGroupBy.all` and :meth:`.DataFrameGroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) -- Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`) -- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) +- Deprecated ``broadcast_axis`` keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) +- Deprecated ``downcast`` keyword in :meth:`Index.fillna` (:issue:`53956`) +- Deprecated ``fill_method`` and ``limit`` keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`.DataFrameGroupBy.pct_change`, and :meth:`.SeriesGroupBy.pct_change`, explicitly call e.g. 
:meth:`DataFrame.ffill` or :meth:`DataFrame.bfill` before calling ``pct_change`` instead (:issue:`53491`) +- Deprecated ``method``, ``limit``, and ``fill_axis`` keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call :meth:`DataFrame.fillna` or :meth:`Series.fillna` on the alignment results instead (:issue:`51856`) +- Deprecated ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead (:issue:`52550`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) - Deprecated behavior of :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin` with all-NA entries or any-NA and ``skipna=False``; in a future version these will raise ``ValueError`` (:issue:`51276`) - Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) - Deprecated making functions given to :meth:`Series.agg` attempt to operate on each element in the :class:`Series` and only operate on the whole :class:`Series` if the elementwise operations failed. In the future, functions given to :meth:`Series.agg` will always operate on the whole :class:`Series` only. To keep the current behavior, use :meth:`Series.transform` instead (:issue:`53325`) - Deprecated making the functions in a list of functions given to :meth:`DataFrame.agg` attempt to operate on each element in the :class:`DataFrame` and only operate on the columns of the :class:`DataFrame` if the elementwise operations failed.
To keep the current behavior, use :meth:`DataFrame.transform` instead (:issue:`53325`) - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) -- Deprecated the "downcast" keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`) +- Deprecated the ``downcast`` keyword in :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`Series.ffill`, :meth:`DataFrame.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.bfill` (:issue:`40988`) - Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) - Deprecated the behavior of :meth:`Series.argsort` in the presence of NA values; in a future version these will be sorted at the end instead of giving -1 (:issue:`54219`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) -- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) -- Deprecated the 'axis' keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) +- Deprecating pinning ``group.name`` to each group in :meth:`.SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) +- Deprecated the ``axis`` keyword in :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.fillna`, :meth:`.DataFrameGroupBy.take`, :meth:`.DataFrameGroupBy.skew`, :meth:`.DataFrameGroupBy.rank`, :meth:`.DataFrameGroupBy.cumprod`, :meth:`.DataFrameGroupBy.cumsum`, :meth:`.DataFrameGroupBy.cummax`, :meth:`.DataFrameGroupBy.cummin`, :meth:`.DataFrameGroupBy.pct_change`, :meth:`.DataFrameGroupBy.diff`, :meth:`.DataFrameGroupBy.shift`, and :meth:`.DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) - Deprecated :class:`.DataFrameGroupBy` with ``as_index=False`` not including groupings in the result when they are not columns of the DataFrame (:issue:`49519`) - Deprecated 
 - Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`)
@@ -545,49 +539,49 @@ Other Deprecations
 - Deprecated :meth:`.Styler.applymap`. Use the new :meth:`.Styler.map` method instead (:issue:`52708`)
 - Deprecated :meth:`DataFrame.applymap`. Use the new :meth:`DataFrame.map` method instead (:issue:`52353`)
 - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`)
-- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
-- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
-- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, case to one of ``numpy.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
+- Deprecated ``freq`` parameter in :class:`.PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`)
+- Deprecated allowing non-standard inputs in :func:`take`, pass either a ``numpy.ndarray``, :class:`.ExtensionArray`, :class:`Index`, or :class:`Series` (:issue:`52981`)
+- Deprecated allowing non-standard sequences for :func:`isin`, :func:`value_counts`, :func:`unique`, :func:`factorize`, cast to one of ``numpy.ndarray``, :class:`Index`, :class:`.ExtensionArray`, or :class:`Series` before calling (:issue:`52986`)
 - Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`)
 - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`)
-- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`)
-- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`)
+- Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :attr:`Series.dt` properties (:issue:`20306`)
+- Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or NumPy array before operating instead (:issue:`51521`)
 - Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`.
This pattern was very slow and it's recommended to use alternative methods to achieve the same goal (:issue:`52116`)
 - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`)
 - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`)
-- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`)
+- Deprecated the ``fastpath`` keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`)
 - Deprecated the behavior of :func:`is_bool_dtype` returning ``True`` for object-dtype :class:`Index` of bool objects (:issue:`52680`)
 - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`)
-- Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`)
-- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
-- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
+- Deprecated unused ``closed`` and ``normalize`` keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`)
+- Deprecated unused ``closed`` keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
+- Deprecated logical operation between two non-boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs (:issue:`52500`, :issue:`52538`)
 - Deprecated :class:`Period` and :class:`PeriodDtype` with ``BDay`` freq, use a :class:`DatetimeIndex` with ``BDay`` freq instead (:issue:`53446`)
 - Deprecated :func:`value_counts`, use ``pd.Series(obj).value_counts()`` instead (:issue:`47862`)
-- Deprecated :meth:`Series.first` and :meth:`DataFrame.first` (please create a mask and filter using ``.loc`` instead) (:issue:`45908`)
+- Deprecated :meth:`Series.first` and :meth:`DataFrame.first`; create a mask and filter using ``.loc`` instead (:issue:`45908`)
 - Deprecated :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`53631`)
-- Deprecated :meth:`Series.last` and :meth:`DataFrame.last` (please create a mask and filter using ``.loc`` instead) (:issue:`53692`)
+- Deprecated :meth:`Series.last` and :meth:`DataFrame.last`; create a mask and filter using ``.loc`` instead (:issue:`53692`)
 - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
 - Deprecated allowing bool dtype in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile`, consistent with the :meth:`Series.quantile` and :meth:`DataFrame.quantile` behavior (:issue:`51424`)
 - Deprecated behavior of :func:`.testing.assert_series_equal` and :func:`.testing.assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`)
-- Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object. (:issue:`53767`)
-- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
+- Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object (:issue:`53767`)
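A rough sketch of the :meth:`Series.first` / :meth:`Series.last` migration noted above (hypothetical dates; approximately equivalent, see :issue:`45908`):

.. code-block:: python

   import pandas as pd

   ser = pd.Series(range(5), index=pd.date_range("2023-01-01", periods=5))

   # Deprecated: ser.first("3D")
   ser.loc[ser.index < ser.index[0] + pd.Timedelta("3D")]

   # Deprecated: ser.last("3D")
   ser.loc[ser.index > ser.index[-1] - pd.Timedelta("3D")]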
+- Deprecated constructing :class:`.SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`)
-- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`)
-- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
-- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
-- Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
+- Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead (:issue:`53409`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`)
+- Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead (:issue:`53767`)
+- Deprecated option ``mode.use_inf_as_na``, convert inf entries to ``NaN`` beforehand instead (:issue:`51684`)
 - Deprecated parameter ``obj`` in :meth:`.DataFrameGroupBy.get_group` (:issue:`53545`)
 - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
 - Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`)
 - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`)
-- Deprecated the "method" and "limit" keywords in ``ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`)
-- Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
+- Deprecated the ``method`` and ``limit`` keywords in ``ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`)
 - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`)
+- Deprecated the ``method`` and ``limit`` keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
 - Deprecated the behavior of :meth:`Series.__getitem__`, :meth:`Series.__setitem__`, :meth:`DataFrame.__getitem__`, :meth:`DataFrame.__setitem__` with an integer slice on objects with a floating-dtype index, in a future version this will be treated as *positional* indexing (:issue:`49612`)
 - Deprecated the use of non-supported datetime64 and timedelta64 resolutions with :func:`pandas.array`. Supported resolutions are "s", "ms", "us", and "ns" (:issue:`53058`)
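A minimal sketch of the ``fillna(method=...)`` migration above (hypothetical data):

.. code-block:: python

   import numpy as np
   import pandas as pd

   ser = pd.Series([1.0, np.nan, 3.0])

   # Deprecated: ser.fillna(method="ffill") / ser.fillna(method="bfill")
   ser.ffill()  # forward fill  -> [1.0, 1.0, 3.0]
   ser.bfill()  # backward fill -> [1.0, 3.0, 3.0]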
-- Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
-- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and skipna=True or any-NAs and skipna=False returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`)
-- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name``. (:issue:`54229`)
+- Deprecated values ``"pad"``, ``"ffill"``, ``"bfill"``, ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
+- Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and ``skipna=True`` or any-NAs and ``skipna=False`` returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`)
+- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` (:issue:`54229`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.performance:

@@ -596,7 +590,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
-- Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`)
+- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`)
 - Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`)
 - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
 - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
@@ -611,9 +605,9 @@ Performance improvements
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
 - Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`)
-- :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`)
+- :class:`Period`'s default formatter (``period_format``) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`.Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``.
``to_csv`` operations involving :class:`.PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated (:issue:`51459`)
 - Performance improvement accessing :attr:`arrays.IntegerArray.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
-- Performance improvement for :class:`DataFrameGroupBy`/:class:`SeriesGroupBy` aggregations (e.g. :meth:`DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`)
+- Performance improvement for :class:`.DataFrameGroupBy`/:class:`.SeriesGroupBy` aggregations (e.g. :meth:`.DataFrameGroupBy.sum`) with ``engine="numba"`` (:issue:`53731`)
 - Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`)
 - Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`)
 - Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`)
@@ -622,24 +616,24 @@ Performance improvements
 - Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
 - Performance improvement in :func:`merge` for PyArrow backed strings (:issue:`54443`)
 - Performance improvement in :func:`read_csv` with ``engine="c"`` (:issue:`52632`)
+- Performance improvement in :meth:`.ArrowExtensionArray.to_numpy` (:issue:`52525`)
 - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
 - Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`)
 - Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
 - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
+- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single PyArrow dtype (:issue:`54224`)
 - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g.
:class:`Int64` (:issue:`52836`) -- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single pyarrow dtype (:issue:`54224`) -- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) +- Performance improvement in :meth:`Series.add` for PyArrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) -- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) -- Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) -- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) -- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) -- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with PyArrow dtypes (:issue:`53950`) +- Performance improvement in :meth:`Series.str.get_dummies` for PyArrow-backed strings (:issue:`53655`) +- Performance improvement in :meth:`Series.str.get` for PyArrow-backed strings (:issue:`53152`) +- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for PyArrow-backed strings (:issue:`53585`) +- Performance improvement in :meth:`Series.to_numpy` when dtype is a NumPy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a PyArrow timestamp or duration dtype to NumPy (:issue:`53326`) - Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`) -- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) -- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) +- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) +- Performance improvement when indexing with PyArrow timestamp and duration dtypes (:issue:`53368`) - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`) .. --------------------------------------------------------------------------- @@ -656,15 +650,15 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. (:issue:`51644`) -- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index, this behaviour was previously deprecated but inconsistently handled. 
(:issue:`53983`) +- :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) +- :meth:`DatetimeIndex.slice_indexer` now raises ``KeyError`` for non-monotonic indexes if either of the slice bounds is not in the index; this behaviour was previously deprecated but inconsistently handled (:issue:`53983`) - Bug in :class:`DateOffset` which had inconsistent behavior when multiplying a :class:`DateOffset` object by a constant (:issue:`47953`) - Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`) -- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of ``pyarrow`` timestamps to numpy datetimes (:issue:`52545`) -- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for pyarrow-backed date like dtypes (:issue:`53854`) +- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of PyArrow timestamps to numpy datetimes (:issue:`52545`) +- Bug in :meth:`.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) +- Bug in :meth:`DataFrame.to_sql` raising ``ValueError`` for PyArrow-backed date like dtypes (:issue:`53854`) - Bug in :meth:`Timestamp.date`, :meth:`Timestamp.isocalendar`, :meth:`Timestamp.timetuple`, and :meth:`Timestamp.toordinal` were returning incorrect results for inputs outside those supported by the Python standard library's datetime module (:issue:`53668`) - Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) -- Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`) - Bug in constructing a :class:`Timestamp` from a string representing a time without a date inferring an incorrect unit (:issue:`54097`) - Bug in constructing a :class:`Timestamp` with ``ts_input=pd.NA`` raising ``TypeError`` (:issue:`45481`) @@ -672,12 +666,12 @@ Datetimelike Timedelta ^^^^^^^^^ -- :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) - Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`) -- Bug in :class:`Timedelta` with Numpy timedelta64 objects not properly raising ``ValueError`` (:issue:`52806`) -- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to numpy ``timedelta64`` (:issue:`54298`) +- Bug in :class:`Timedelta` with NumPy ``timedelta64`` objects not properly raising ``ValueError`` (:issue:`52806`) +- Bug in :func:`to_timedelta` converting :class:`Series` or :class:`DataFrame` containing :class:`ArrowDtype` of ``pyarrow.duration`` to NumPy ``timedelta64`` (:issue:`54298`) - Bug in :meth:`Timedelta.__hash__`, raising an ``OutOfBoundsTimedelta`` on certain large values of second resolution (:issue:`54037`) - Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) +- Bug in 
:meth:`TimedeltaIndex.map` with ``na_action="ignore"`` (:issue:`51644`)
 - Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)

 Timezones
 ^^^^^^^^^
@@ -689,10 +683,10 @@ Numeric
 ^^^^^^^
 - Bug in :class:`RangeIndex` setting ``step`` incorrectly when being the subtrahend with minuend a numeric value (:issue:`53255`)
 - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
-- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on numpy data of all zero returning a python type instead of a numpy type (:issue:`53482`)
+- Bug when calling :meth:`Series.kurt` and :meth:`Series.skew` on NumPy data of all zero returning a Python type instead of a NumPy type (:issue:`53482`)
 - Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
-- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
-- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of int (:issue:`52897`)
+- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for PyArrow-backed dtypes (:issue:`52314`)
+- Bug in :meth:`DataFrame.size` and :meth:`Series.size` returning 64-bit integer instead of a Python int (:issue:`52897`)
 - Bug in :meth:`DataFrame.dot` returning ``object`` dtype for :class:`ArrowDtype` data (:issue:`53979`)
 - Bug in :meth:`Series.any`, :meth:`Series.all`, :meth:`DataFrame.any`, and :meth:`DataFrame.all` had the default value of ``bool_only`` set to ``None`` instead of ``False``; this change should have no impact on users (:issue:`53258`)
@@ -703,9 +697,9 @@ Numeric
 Conversion
 ^^^^^^^^^^
 - Bug in :func:`DataFrame.style.to_latex` and :func:`DataFrame.style.to_html` if the DataFrame contains integers with more digits than can be represented by floating point double precision (:issue:`52272`)
-- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`NumpyExtensionArray` instead of :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`52859`)
-- Bug in :func:`array` when given an empty list and no dtype returning :class:`NumpyExtensionArray` instead of :class:`FloatingArray` (:issue:`54371`)
-- Bug in :meth:`ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`)
+- Bug in :func:`array` when given a ``datetime64`` or ``timedelta64`` dtype with unit of "s", "us", or "ms" returning :class:`.NumpyExtensionArray` instead of :class:`.DatetimeArray` or :class:`.TimedeltaArray` (:issue:`52859`)
+- Bug in :func:`array` when given an empty list and no dtype returning :class:`.NumpyExtensionArray` instead of :class:`.FloatingArray` (:issue:`54371`)
+- Bug in :meth:`.ArrowDtype.numpy_dtype` returning nanosecond units for non-nanosecond ``pyarrow.timestamp`` and ``pyarrow.duration`` types (:issue:`51800`)
 - Bug in :meth:`DataFrame.__repr__` incorrectly raising a ``TypeError`` when the dtype of a column is ``np.record`` (:issue:`48526`)
 - Bug in :meth:`DataFrame.info` raising ``ValueError`` when ``use_numba`` is set
(:issue:`51922`) - Bug in :meth:`DataFrame.insert` raising ``TypeError`` if ``loc`` is ``np.int64`` (:issue:`53193`) @@ -730,10 +724,10 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`DataFrame.interpolate` failing to fill across multiblock data when ``method`` is "pad", "ffill", "bfill", or "backfill" (:issue:`53898`) +- Bug in :meth:`DataFrame.interpolate` failing to fill across data when ``method`` is ``"pad"``, ``"ffill"``, ``"bfill"``, or ``"backfill"`` (:issue:`53898`) - Bug in :meth:`DataFrame.interpolate` ignoring ``inplace`` when :class:`DataFrame` is empty (:issue:`53199`) - Bug in :meth:`Series.idxmin`, :meth:`Series.idxmax`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax` with a :class:`DatetimeIndex` index containing ``NaT`` incorrectly returning ``NaN`` instead of ``NaT`` (:issue:`43587`) -- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or "infer" (:issue:`53103`) +- Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` failing to raise on invalid ``downcast`` keyword, which can be only ``None`` or ``"infer"`` (:issue:`53103`) - Bug in :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` with complex dtype incorrectly failing to fill ``NaN`` entries (:issue:`53635`) MultiIndex @@ -745,25 +739,23 @@ I/O ^^^ - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`) -- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`) -- Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) +- Bug in :func:`json_normalize` could not parse metadata fields list type (:issue:`37782`) - Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) -- Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) -- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) -- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) -- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` raising when specifying a ``dtype`` with ``index_col`` (:issue:`53229`) +- Bug in :func:`read_hdf` not properly closing store after an ``IndexError`` is raised (:issue:`52781`) +- Bug in :func:`read_html` where style elements were read into DataFrames (:issue:`52197`) +- Bug in :func:`read_html` where tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Bug in :func:`read_sql_table` raising an exception when reading a view (:issue:`52969`) - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`) - Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`) - Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`) - Bug in :meth:`DataFrame.to_html` where conversion for an empty :class:`DataFrame` with complex dtype raised a ``ValueError`` (:issue:`54167`) -- Bug in 
:meth:`DataFrame.to_json` where :class:`DateTimeArray`/:class:`DateTimeIndex` with non nanosecond precision could not be serialized correctly (:issue:`53686`)
+- Bug in :meth:`DataFrame.to_json` where :class:`.DatetimeArray`/:class:`.DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`)
 - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
 - Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)

 Period
 ^^^^^^
-- :meth:`PeriodIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`)
 - Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`)
 - Bug in :class:`PeriodDtype` constructor incorrectly returning the same ``normalize`` for different :class:`DateOffset` ``freq`` inputs (:issue:`24121`)
 - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`)
@@ -771,6 +763,7 @@ Period
 - Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`)
 - Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns for columns that are all null with ``engine="pyarrow"`` (:issue:`52087`)
 - Bug in :meth:`Period.now` not accepting the ``freq`` parameter as a keyword argument (:issue:`53369`)
+- Bug in :meth:`PeriodIndex.map` with ``na_action="ignore"`` (:issue:`51644`)
 - Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
 - Bug in incorrectly allowing construction of :class:`Period` or :class:`PeriodDtype` with :class:`CustomBusinessDay` freq; use :class:`BusinessDay` instead (:issue:`52534`)

 Plotting
@@ -781,29 +774,29 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
-- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` :class:`Datetimelike` ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`)
+- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` returning the wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`)
 - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`)
 - Bug in weighted rolling aggregations when specifying ``min_periods=0`` (:issue:`51449`)
-- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby`, where, when the index of the
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where, when the index of the
   grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex`
   or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument,
-  the function
operated on the whole index rather than each element of the index. (:issue:`51979`) + the function operated on the whole index rather than each element of the index (:issue:`51979`) - Bug in :meth:`.DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) -- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) +- Bug in :meth:`.DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same (:issue:`52444`) - Bug in :meth:`.DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) -- Bug in :meth:`.GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) -- Bug in :meth:`.GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) +- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` with a datetime key in conjunction with another key produced an incorrect number of group keys (:issue:`51158`) +- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) - Bug in :meth:`.SeriesGroupBy.size` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`) -- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list of a single element. (:issue:`53500`) -- Bug in :meth:`.GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) -- Bug in :meth:`.DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) +- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list consisting of a single element (:issue:`53500`) +- Bug in :meth:`.DataFrameGroupBy.var` and :meth:`.SeriesGroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) +- Bug in :meth:`.DataFrameGroupBy.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) - Bug in :meth:`.Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`) - Bug in :meth:`.SeriesGroupBy.count` and :meth:`.DataFrameGroupBy.count` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. 
``Int64``) (:issue:`53831`)
 - Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`)
 - Bug in :meth:`.SeriesGroupBy.nth` and :meth:`.DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` resulted in rows being dropped (:issue:`53518`)
-- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupby.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` (:issue:`53606`)
+- Bug in :meth:`.SeriesGroupBy.sum` and :meth:`.DataFrameGroupBy.sum` summing ``np.inf + np.inf`` and ``(-np.inf) + (-np.inf)`` to ``np.nan`` instead of ``np.inf`` and ``-np.inf`` respectively (:issue:`53606`)
 - Bug in :meth:`Series.groupby` raising an error when grouped :class:`Series` has a :class:`DatetimeIndex` index and a :class:`Series` with a name that is a month is given to the ``by`` argument (:issue:`48509`)

 Reshaping
@@ -826,23 +819,23 @@ Reshaping
 Sparse
 ^^^^^^
-- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`)
+- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a NumPy dtype (:issue:`53160`)
 - Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`)

 ExtensionArray
 ^^^^^^^^^^^^^^
-- Bug in :class:`ArrowStringArray` constructor raises ``ValueError`` with dictionary types of strings (:issue:`54074`)
+- Bug in :class:`.ArrowStringArray` constructor raising ``ValueError`` with dictionary types of strings (:issue:`54074`)
 - Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`)
 - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`)
-- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)
+- Bug in :meth:`Series.quantile` for PyArrow temporal types raising ``ArrowInvalid`` (:issue:`52678`)
 - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`)
 - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning Python datetime and timedelta objects for non-nano dtypes (:issue:`53326`)
-- Bug where the :class:`DataFrame` repr would not work when a column would have an :class:`ArrowDtype` with an ``pyarrow.ExtensionDtype`` (:issue:`54063`)
-- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)
+- Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`)
+- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g.
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`)

 Styler
 ^^^^^^
-- Bug in :meth:`Styler._copy` calling overridden methods in subclasses of :class:`Styler` (:issue:`52728`)
+- Bug in :meth:`.Styler._copy` calling overridden methods in subclasses of :class:`.Styler` (:issue:`52728`)

 Metadata
 ^^^^^^^^
@@ -852,21 +845,21 @@ Metadata
 Other
 ^^^^^
+- Bug in :meth:`.FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`)
 - Bug in :class:`DataFrame` and :class:`Series` raising for data of complex dtype when ``NaN`` values are present (:issue:`53627`)
 - Bug in :class:`DatetimeIndex` where the ``repr`` of an index passed with time does not print the time when it is midnight and the freq is non-day based (:issue:`53470`)
-- Bug in :class:`FloatingArray.__contains__` with ``NaN`` item incorrectly returning ``False`` when ``NaN`` values are present (:issue:`52840`)
-- Bug in :func:`.testing.assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
+- Bug in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` now throw an assertion error for two unequal sets (:issue:`51727`)
 - Bug in :func:`.testing.assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
 - Bug in :func:`api.interchange.from_dataframe` was not respecting ``allow_copy`` argument (:issue:`54322`)
 - Bug in :func:`api.interchange.from_dataframe` was raising during interchanging from non-pandas tz-aware data containing null values (:issue:`54287`)
 - Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`)
 - Bug in :func:`from_dummies` where the resulting :class:`Index` did not match the original :class:`Index` (:issue:`54300`)
 - Bug in :func:`from_dummies` where the resulting data would always be ``object`` dtype instead of the dtype of the columns (:issue:`54300`)
+- Bug in :meth:`.DataFrameGroupBy.first`, :meth:`.DataFrameGroupBy.last`, :meth:`.SeriesGroupBy.first`, and :meth:`.SeriesGroupBy.last` where an empty group would return ``np.nan`` instead of the corresponding :class:`.ExtensionArray` NA value (:issue:`39098`)
 - Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`)
 - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
-- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`)
+- Bug in :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`.DataFrameGroupBy.shift` when passing both ``freq`` and ``fill_value`` silently ignoring ``fill_value`` instead of raising ``ValueError`` (:issue:`53832`)
 - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`)
-- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where an empty group would return ``np.nan`` instead of a an ExtensionArray's NA value (:issue:`39098`)
 - Bug in :meth:`Index.sort_values` when a ``key`` is passed (:issue:`52764`)
 - Bug in :meth:`Series.align`, :meth:`DataFrame.align`, :meth:`Series.reindex`, :meth:`DataFrame.reindex`, :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, incorrectly
failing to raise with method="asfreq" (:issue:`53620`) - Bug in :meth:`Series.argsort` failing to raise when an invalid ``axis`` is passed (:issue:`54257`) From 005d876911dab5b04187da9dcb85432eb7eed177 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 17 Aug 2023 08:05:04 -0700 Subject: [PATCH 015/174] Backport PR #54535 on branch 2.1.x (REF: Replace "pyarrow" string storage checks with variable) (#54594) Backport PR #54535: REF: Replace "pyarrow" string storage checks with variable Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/conftest.py | 5 +++++ pandas/tests/arrays/string_/test_string.py | 20 +++++++++---------- .../tests/arrays/string_/test_string_arrow.py | 4 ++-- pandas/tests/extension/test_string.py | 12 +++++------ 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 757ca817d1b85..5210e727aeb3c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1996,3 +1996,8 @@ def warsaw(request) -> str: tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo. """ return request.param + + +@pytest.fixture() +def arrow_string_storage(): + return ("pyarrow",) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cfd3314eb5944..de93e89ecacd5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -115,8 +115,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": +def test_add_2d(dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage: reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -144,8 +144,8 @@ def test_add_sequence(dtype): tm.assert_extension_array_equal(result, expected) -def test_mul(dtype, request): - if dtype.storage == "pyarrow": +def test_mul(dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage: reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) request.node.add_marker(mark) @@ -369,8 +369,8 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request): - if dtype.storage == "pyarrow" and box is pd.array: +def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): + if dtype.storage in arrow_string_storage and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -384,7 +384,7 @@ def test_min_max_numpy(method, box, dtype, request): assert result == expected -def test_fillna_args(dtype, request): +def test_fillna_args(dtype, request, arrow_string_storage): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -397,7 +397,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": + if dtype.storage in arrow_string_storage: msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." 
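# A hedged sketch of the pattern this refactor enables (hypothetical test,
# not part of the backport itself): storage checks become membership tests,
# so adding a second arrow-backed storage later only touches the fixture.
#
#     def test_example(dtype, arrow_string_storage):
#         if dtype.storage in arrow_string_storage:
#             pytest.skip("not applicable for arrow-backed storage")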
@@ -503,10 +503,10 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype): +def test_memory_usage(dtype, arrow_string_storage): # GH 33963 - if dtype.storage == "pyarrow": + if dtype.storage in arrow_string_storage: pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 6912d5038ae0d..1ab628f186b47 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -49,10 +49,10 @@ def test_config_bad_storage_raises(): @skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked): +def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): import pyarrow as pa - array = pa if array == "pyarrow" else np + array = pa if array in arrow_string_storage else np arr = array.array([1, 2, 3]) if chunked: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6597ff84e3ca4..4e142eb6e14b8 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -103,8 +103,8 @@ def test_is_not_string_type(self, dtype): class TestInterface(base.BaseInterfaceTests): - def test_view(self, data, request): - if data.dtype.storage == "pyarrow": + def test_view(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -116,8 +116,8 @@ def test_from_dtype(self, data): class TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, request): - if data.dtype.storage == "pyarrow": + def test_transpose(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) @@ -127,8 +127,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, request): - if data.dtype.storage == "pyarrow": + def test_setitem_preserves_views(self, data, request, arrow_string_storage): + if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) From 0335d39c0236af28809f04b1531e00ed2d52bab8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 17 Aug 2023 10:16:58 -0700 Subject: [PATCH 016/174] Backport PR #54574 on branch 2.1.x (ENH: add cummax/cummin/cumprod support for arrow dtypes) (#54603) Backport PR #54574: ENH: add cummax/cummin/cumprod support for arrow dtypes Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/arrow/array.py | 17 +++++++++++--- pandas/tests/extension/test_arrow.py | 33 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a8004dfd506b0..43a64a79e691b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -265,6 +265,7 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding 
compression arguments to ``lzma.LZMAFile`` (:issue:`52979`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) +- :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) - Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0f46e5a4e7482..3c65e6b4879e2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1389,6 +1389,9 @@ def _accumulate( NotImplementedError : subclass does not define accumulations """ pyarrow_name = { + "cummax": "cumulative_max", + "cummin": "cumulative_min", + "cumprod": "cumulative_prod_checked", "cumsum": "cumulative_sum_checked", }.get(name, name) pyarrow_meth = getattr(pc, pyarrow_name, None) @@ -1398,12 +1401,20 @@ def _accumulate( data_to_accum = self._pa_array pa_dtype = data_to_accum.type - if pa.types.is_duration(pa_dtype): - data_to_accum = data_to_accum.cast(pa.int64()) + + convert_to_int = ( + pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"] + ) or (pa.types.is_duration(pa_dtype) and name == "cumsum") + + if convert_to_int: + if pa_dtype.bit_width == 32: + data_to_accum = data_to_accum.cast(pa.int32()) + else: + data_to_accum = data_to_accum.cast(pa.int64()) result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) - if pa.types.is_duration(pa_dtype): + if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dd1ff925adf5f..e748f320b3f09 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -347,10 +347,15 @@ class TestBaseAccumulateTests(base.BaseAccumulateTests): def check_accumulate(self, ser, op_name, skipna): result = getattr(ser, op_name)(skipna=skipna) - if ser.dtype.kind == "m": + pa_type = ser.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_type): # Just check that we match the integer behavior. 
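            # For example (hypothetical dtypes, not from the patch): date32 has
            # bit_width == 32 and must round-trip through int32, while
            # timestamp/duration types are 64-bit and round-trip through int64
            # before the comparison against the Float64 reference below.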
- ser = ser.astype("int64[pyarrow]") - result = result.astype("int64[pyarrow]") + if pa_type.bit_width == 32: + int_type = "int32[pyarrow]" + else: + int_type = "int64[pyarrow]" + ser = ser.astype(int_type) + result = result.astype(int_type) result = result.astype("Float64") expected = getattr(ser.astype("Float64"), op_name)(skipna=skipna) @@ -361,14 +366,20 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): - if op_name in ["cumsum", "cumprod"]: + if ( + pa.types.is_string(pa_type) + or pa.types.is_binary(pa_type) + or pa.types.is_decimal(pa_type) + ): + if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False - elif pa.types.is_temporal(pa_type) and not pa.types.is_duration(pa_type): - if op_name in ["cumsum", "cumprod"]: + elif pa.types.is_boolean(pa_type): + if op_name in ["cumprod", "cummax", "cummin"]: return False - elif pa.types.is_duration(pa_type): - if op_name == "cumprod": + elif pa.types.is_temporal(pa_type): + if op_name == "cumsum" and not pa.types.is_duration(pa_type): + return False + elif op_name == "cumprod": return False return True @@ -384,7 +395,9 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques data, all_numeric_accumulations, skipna ) - if all_numeric_accumulations != "cumsum" or pa_version_under9p0: + if pa_version_under9p0 or ( + pa_version_under13p0 and all_numeric_accumulations != "cumsum" + ): # xfailing takes a long time to run because pytest # renders the exception messages even when not showing them opt = request.config.option From 802e5eeaa1cd39253607a79c4a44002eb1b503fe Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 18 Aug 2023 05:42:26 -0700 Subject: [PATCH 017/174] Backport PR #54615 on branch 2.1.x (DOC: Update build instructions in the README) (#54619) Backport PR #54615: DOC: Update build instructions in the README Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8ea473beb107e..6fa20d237babe 100644 --- a/README.md +++ b/README.md @@ -130,23 +130,17 @@ In the `pandas` directory (same one where you found this file after cloning the git repo), execute: ```sh -python setup.py install +pip install . ``` or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): ```sh -python -m pip install -e . --no-build-isolation --no-use-pep517 +python -m pip install -ve . --no-build-isolation --config-settings=editable-verbose=true ``` -or alternatively - -```sh -python setup.py develop -``` - -See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source). +See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html). 
## License [BSD 3](LICENSE) From f7723bb18fbde4e60c91db0e3ed19ce90f5bb6eb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 18 Aug 2023 05:48:44 -0700 Subject: [PATCH 018/174] Backport PR #54587 on branch 2.1.x (CI: Enable MacOS Python Dev tests) (#54614) Backport PR #54587: CI: Enable MacOS Python Dev tests Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 66d8320206429..030c9546fecca 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -77,16 +77,12 @@ jobs: env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W error::DeprecationWarning -W error::FutureWarning" - # TODO(cython3): Re-enable once next-beta(after beta 1) comes out - # There are some warnings failing the build with -werror - pandas_ci: "0" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: - ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -150,14 +146,13 @@ jobs: - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests - run: | - sudo locale-gen ${{ matrix.extra_loc }} + run: sudo locale-gen ${{ matrix.extra_loc }} if: ${{ matrix.extra_loc }} - name: Set up Conda uses: ./.github/actions/setup-conda with: - environment-file: ${{ env.ENV_FILE }} + environment-file: ci/deps/${{ matrix.env_file }} - name: Build Pandas id: build @@ -312,15 +307,14 @@ jobs: # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. #if: false # Uncomment this to freeze the workflow, comment it to unfreeze + defaults: + run: + shell: bash -eou pipefail {0} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - # TODO: Disable macOS for now, Github Actions bug where python is not - # symlinked correctly to 3.12 - # xref https://github.com/actions/setup-python/issues/701 - #os: [ubuntu-22.04, macOS-latest, windows-latest] - os: [ubuntu-22.04, windows-latest] + os: [ubuntu-22.04, macOS-latest, windows-latest] timeout-minutes: 180 @@ -345,22 +339,15 @@ jobs: with: python-version: '3.12-dev' - - name: Install dependencies + - name: Build Environment run: | python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip list - - - name: Build Pandas - run: | python -m pip install -ve . 
--no-build-isolation --no-index + python -m pip list - - name: Build Version - run: | - python -c "import pandas; pandas.show_versions();" - - - name: Test + - name: Run Tests uses: ./.github/actions/run-tests From 72282be13f1986106ae8343f2042f9d9e8a1432e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:57:50 -0700 Subject: [PATCH 019/174] Backport PR #54586 on branch 2.1.x (REF: Refactor conversion of na value) (#54658) Backport PR #54586: REF: Refactor conversion of na value Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/strings/__init__.py | 10 ++++++ pandas/tests/strings/test_find_replace.py | 9 ++--- pandas/tests/strings/test_split_partition.py | 37 ++++++-------------- 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 9a7622b4f1cd8..496a2d095d85b 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -1,2 +1,12 @@ # Needed for new arrow string dtype + +import pandas as pd + object_pyarrow_numpy = ("object",) + + +def _convert_na_value(ser, expected): + if ser.dtype != object: + # GH#18463 + expected = expected.fillna(pd.NA) + return expected diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bcb8db96b37fa..d5017b1c47d85 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,7 +11,10 @@ Series, _testing as tm, ) -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) # -------------------------------------------------------------------------------------- # str.contains @@ -758,9 +761,7 @@ def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) - if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(ser, expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0298694ccaf71..7fabe238d2b86 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,6 +12,7 @@ Series, _testing as tm, ) +from pandas.tests.strings import _convert_na_value @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -20,9 +21,7 @@ def test_split(any_string_dtype, method): result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -32,9 +31,7 @@ def test_split_more_than_one_char(any_string_dtype, method): values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = getattr(values.str, method)("__") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) result = getattr(values.str, method)("__", expand=False) @@ -46,9 +43,7 @@ def test_split_more_regex_split(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], 
dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -128,9 +123,7 @@ def test_rsplit(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -139,9 +132,7 @@ def test_rsplit_max_number(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -455,9 +446,7 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype): s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -480,9 +469,7 @@ def test_partition_series_none(any_string_dtype, method, exp): s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) result = getattr(s.str, method)(expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -505,9 +492,7 @@ def test_partition_series_not_split(any_string_dtype, method, exp): s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype) result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -531,9 +516,7 @@ def test_partition_series_unicode(any_string_dtype, method, exp): result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) From dc04d00fe438d3997e62934b3a345000158f492a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 21 Aug 2023 11:26:27 -0700 Subject: [PATCH 020/174] Backport PR #54633 on branch 2.1.x (DEP: remove python-snappy and brotli as optional dependencies (no longer used)) (#54637) Backport PR #54633: DEP: remove python-snappy and brotli as optional dependencies (no longer used) Co-authored-by: Joris Van den Bossche --- ci/deps/actions-310.yaml | 2 -- ci/deps/actions-311-downstream_compat.yaml | 2 -- ci/deps/actions-311.yaml | 2 -- ci/deps/actions-39-minimum_versions.yaml | 2 -- ci/deps/actions-39.yaml | 2 -- ci/deps/circle-310-arm64.yaml | 2 -- doc/source/getting_started/install.rst | 2 -- environment.yml | 2 -- pandas/compat/_optional.py | 11 ----------- pandas/core/frame.py | 5 +---- pandas/io/parquet.py | 5 +---- pandas/tests/io/test_parquet.py | 6 ------ pyproject.toml | 4 +--- requirements-dev.txt | 2 -- 
scripts/tests/data/deps_expected_random.yaml | 2 -- scripts/tests/data/deps_minimum.toml | 4 +--- scripts/tests/data/deps_unmodified_random.yaml | 2 -- 17 files changed, 4 insertions(+), 53 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ffa7732c604a0..638aaedecf2b5 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -27,7 +27,6 @@ dependencies: - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -47,7 +46,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 5a6a26c2e1ad8..4d925786c22d6 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -28,7 +28,6 @@ dependencies: - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -48,7 +47,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9d60d734db5b3..698107341c26d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -27,7 +27,6 @@ dependencies: - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -47,7 +46,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 0e2fcf87c2d6e..a0aafb1d772a7 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -29,7 +29,6 @@ dependencies: - beautifulsoup4=4.11.1 - blosc=1.21.0 - bottleneck=1.3.4 - - brotlipy=0.7.0 - fastparquet=0.8.1 - fsspec=2022.05.0 - html5lib=1.1 @@ -49,7 +48,6 @@ dependencies: - pymysql=1.0.2 - pyreadstat=1.1.5 - pytables=3.7.0 - - python-snappy=0.6.1 - pyxlsb=1.0.9 - s3fs=2022.05.0 - scipy=1.8.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 6ea0d41b947dc..6c6b1de165496 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -27,7 +27,6 @@ dependencies: - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -47,7 +46,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index df4e8e285bd02..d436e90fa6186 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -27,7 +27,6 @@ dependencies: - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - fastparquet>=0.8.1 - fsspec>=2022.05.0 - html5lib>=1.1 @@ -48,7 +47,6 @@ dependencies: - pymysql>=1.0.2 # - pyreadstat>=1.1.5 not available on ARM - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 0ab0391ac78a9..ae7c9d4ea9c62 100644 --- 
a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -412,8 +412,6 @@ Installable with ``pip install "pandas[compression]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -brotli 0.7.0 compression Brotli compression -python-snappy 0.6.1 compression Snappy compression Zstandard 0.17.0 compression Zstandard compression ========================= ================== =============== ============================================================= diff --git a/environment.yml b/environment.yml index 3a0da0bfc703d..1a9dffb55bca7 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.1 - blosc - - brotlipy>=0.7.0 - bottleneck>=1.3.4 - fastparquet>=0.8.1 - fsspec>=2022.05.0 @@ -48,7 +47,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 - - python-snappy>=0.6.1 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index fe4e6457ff08c..c5792fa1379fe 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -18,7 +18,6 @@ "bs4": "4.11.1", "blosc": "1.21.0", "bottleneck": "1.3.4", - "brotli": "0.7.0", "dataframe-api-compat": "0.1.7", "fastparquet": "0.8.1", "fsspec": "2022.05.0", @@ -41,7 +40,6 @@ "pyxlsb": "1.0.9", "s3fs": "2022.05.0", "scipy": "1.8.1", - "snappy": "0.6.1", "sqlalchemy": "1.4.36", "tables": "3.7.0", "tabulate": "0.8.10", @@ -60,12 +58,10 @@ INSTALL_MAPPING = { "bs4": "beautifulsoup4", "bottleneck": "Bottleneck", - "brotli": "brotlipy", "jinja2": "Jinja2", "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", - "snappy": "python-snappy", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } @@ -75,13 +71,6 @@ def get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) if version is None: - if module.__name__ == "brotli": - # brotli doesn't contain attributes to confirm it's version - return "" - if module.__name__ == "snappy": - # snappy doesn't contain attributes to confirm it's version - # See https://github.com/andrix/python-snappy/pull/119 - return "" raise ImportError(f"Can't determine version for {module.__name__}") if module.__name__ == "psycopg2": # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9939daadd9237..56e1b82c992c3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2909,10 +2909,7 @@ def to_parquet( 'pyarrow' is unavailable. compression : str or None, default 'snappy' Name of the compression to use. Use ``None`` for no compression. - The supported compression methods actually depend on which engine - is used. For 'pyarrow', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd' - are all supported. For 'fastparquet', only 'gzip' and 'snappy' are - supported. + Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 91987e6531261..f51b98a929440 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -444,10 +444,7 @@ def to_parquet( if you wish to use its implementation. 
compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, default 'snappy'. Name of the compression to use. Use ``None`` - for no compression. The supported compression methods actually - depend on which engine is used. For 'pyarrow', 'snappy', 'gzip', - 'brotli', 'lz4', 'zstd' are all supported. For 'fastparquet', - only 'gzip' and 'snappy' are supported. + for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fcc1c218a149d..a4c6cfcf9fe0e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -404,12 +404,6 @@ def test_columns_dtypes(self, engine): @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): - if compression == "snappy": - pytest.importorskip("snappy") - - elif compression == "brotli": - pytest.importorskip("brotli") - df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine, write_kwargs={"compression": compression}) diff --git a/pyproject.toml b/pyproject.toml index 1034196baa15e..d2fb6a8d854d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,13 +81,12 @@ xml = ['lxml>=4.8.0'] plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] -compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.1', 'zstandard>=0.17.0'] +compression = ['zstandard>=0.17.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] all = ['beautifulsoup4>=4.11.1', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', 'bottleneck>=1.3.4', - 'brotlipy>=0.7.0', 'dataframe-api-compat>=0.1.7', 'fastparquet>=0.8.1', 'fsspec>=2022.05.0', @@ -110,7 +109,6 @@ all = ['beautifulsoup4>=4.11.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-snappy>=0.6.1', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', diff --git a/requirements-dev.txt b/requirements-dev.txt index 0944acbc36c9b..be02007a36333 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -16,7 +16,6 @@ numpy pytz beautifulsoup4>=4.11.1 blosc -brotlipy>=0.7.0 bottleneck>=1.3.4 fastparquet>=0.8.1 fsspec>=2022.05.0 @@ -37,7 +36,6 @@ pyarrow>=7.0.0 pymysql>=1.0.2 pyreadstat>=1.1.5 tables>=3.7.0 -python-snappy>=0.6.1 pyxlsb>=1.0.9 s3fs>=2022.05.0 scipy>=1.8.1 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index 35d7fe74806a9..c70025f8f019d 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -26,7 +26,6 @@ dependencies: - beautifulsoup4>=5.9.3 - blosc - bottleneck>=1.3.2 - - brotlipy>=0.7.0 - fastparquet>=0.6.3 - fsspec>=2021.07.0 - html5lib>=1.1 @@ -45,7 +44,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 - - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 6f56ca498794b..b43815a982139 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -77,12 +77,11 @@ xml = ['lxml>=4.6.3'] plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9'] clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.2.0'] -compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.0', 'zstandard>=0.15.2'] 
+compression = ['zstandard>=0.15.2'] all = ['beautifulsoup4>=5.9.3', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', 'bottleneck>=1.3.2', - 'brotlipy>=0.7.0', 'fastparquet>=0.6.3', 'fsspec>=2021.07.0', 'gcsfs>=2021.07.0', @@ -104,7 +103,6 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', - 'python-snappy>=0.6.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 405762d33f53e..503eb3c7c7734 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -26,7 +26,6 @@ dependencies: - beautifulsoup4 - blosc - bottleneck>=1.3.2 - - brotlipy - fastparquet>=0.6.3 - fsspec>=2021.07.0 - html5lib>=1.1 @@ -45,7 +44,6 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 - - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 From 23cb2503170bf925f3ee4b7bdc103d832f2d3351 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:25:02 -0700 Subject: [PATCH 021/174] Backport PR #54641 on branch 2.1.x (BUG: getitem indexing wrong axis) (#54669) Backport PR #54641: BUG: getitem indexing wrong axis Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/indexing.py | 3 ++- pandas/tests/frame/indexing/test_getitem.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ca0b1705b5c38..a2871f364f092 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -985,8 +985,9 @@ def _getitem_tuple_same_dim(self, tup: tuple): """ retval = self.obj # Selecting columns before rows is significantly faster + start_val = (self.ndim - len(tup)) + 1 for i, key in enumerate(reversed(tup)): - i = self.ndim - i - 1 + i = self.ndim - i - start_val if com.is_null_slice(key): continue diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 9fed2116b2896..9d9324f557c8d 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -458,6 +458,14 @@ def test_getitem_datetime_slice(self): ): df["2011-01-01":"2011-11-01"] + def test_getitem_slice_same_dim_only_one_axis(self): + # GH#54622 + df = DataFrame(np.random.default_rng(2).standard_normal((10, 8))) + result = df.iloc[(slice(None, None, 2),)] + assert result.shape == (5, 8) + expected = df.iloc[slice(None, None, 2), slice(None)] + tm.assert_frame_equal(result, expected) + class TestGetitemDeprecatedIndexers: @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}]) From eb367106ca89dd6d22974c220c7ba5f7ece5ff7f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:06:13 -0700 Subject: [PATCH 022/174] Backport PR #54496 on branch 2.1.x (Fix inference for fixed width numpy strings with arrow string option) (#54672) Backport PR #54496: Fix inference for fixed width numpy strings with arrow string option Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/construction.py | 12 +++++++++++- pandas/core/internals/construction.py | 22 ++++++++++++++++++++- pandas/tests/frame/test_constructors.py | 25 ++++++++++++++++++++++++ pandas/tests/series/test_constructors.py | 8 
++++++++ 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5757c69bb6ec7..f733ba3b445fd 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,6 +19,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -49,7 +51,10 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import NumpyEADtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + NumpyEADtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -589,6 +594,11 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + import pyarrow as pa + + dtype = ArrowDtype(pa.string()) + subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: subarr = subarr.copy() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2290cd86f35e6..8879d3318f7ca 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -30,7 +32,10 @@ is_named_tuple, is_object_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -65,6 +70,7 @@ from pandas.core.internals.blocks import ( BlockPlacement, ensure_block_shape, + new_block, new_block_2d, ) from pandas.core.internals.managers import ( @@ -372,6 +378,20 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] + elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + import pyarrow as pa + + obj_columns = list(values) + dtype = ArrowDtype(pa.string()) + block_values = [ + new_block( + dtype.construct_array_type()._from_sequence(data, dtype=dtype), + BlockPlacement(slice(i, i + 1)), + ndim=1, + ) + for i, data in enumerate(obj_columns) + ] + else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c170704150383..63cddb7f192e6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2718,6 +2718,31 @@ def test_frame_string_inference(self): df = DataFrame({"a": ["a", "b"]}, dtype="object") tm.assert_frame_equal(df, expected) + def test_frame_string_inference_array_string_dtype(self): + # GH#54496 + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.string()) + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": np.array(["a", "b"])}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]])) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"], "b": ["c", "d"]}, + 
dtype=dtype, + columns=Index(["a", "b"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 611f4a7f790a6..97bd8633954d8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2107,6 +2107,14 @@ def test_series_string_inference_scalar(self): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) + def test_series_string_inference_array_string_dtype(self): + # GH#54496 + pa = pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + with pd.option_context("future.infer_string", True): + ser = Series(np.array(["a", "b"])) + tm.assert_series_equal(ser, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 59a10cc6e72dea2a71a3f9a2dbd5829f29f9de40 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:11:13 -0700 Subject: [PATCH 023/174] Backport PR #54678 on branch 2.1.x (COMPAT: Workaround invalid PyArrow duration conversion) (#54688) Backport PR #54678: COMPAT: Workaround invalid PyArrow duration conversion Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3c65e6b4879e2..43320cf68cbec 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -952,6 +952,9 @@ def convert_fill_value(value, pa_type, dtype): return value if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): return value + if isinstance(value, Timedelta) and value.unit in ("s", "ms"): + # Workaround https://github.com/apache/arrow/issues/37291 + value = value.to_numpy() if is_array_like(value): pa_box = pa.array else: From 8b0302436470c9bcdbfef28cd664fdba3e40e4ce Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:11:27 -0700 Subject: [PATCH 024/174] Backport PR #54510 on branch 2.1.x (Speed up StringDtype arrow implementation for merge) (#54689) Backport PR #54510: Speed up StringDtype arrow implementation for merge Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/reshape/merge.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..c2cb9d643ca87 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -90,6 +90,7 @@ ExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -2399,21 +2400,9 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get cast to object - isinstance(lk.dtype, ArrowDtype) 
- and ( - is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort - ) + if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): - lk, _ = lk._values_for_factorize() - - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute - # "_values_for_factorize" - rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): import pyarrow as pa import pyarrow.compute as pc @@ -2436,6 +2425,21 @@ def _factorize_keys( return rlab, llab, count return llab, rlab, count + if not isinstance(lk, BaseMaskedArray) and not ( + # exclude arrow dtypes that would get cast to object + isinstance(lk.dtype, ArrowDtype) + and ( + is_numeric_dtype(lk.dtype.numpy_dtype) + or is_string_dtype(lk.dtype) + and not sort + ) + ): + lk, _ = lk._values_for_factorize() + + # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute + # "_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes # GH#23917 TODO: needs tests for case where lk is integer-dtype From 8c8ddc777ffa6759ae23549c02fd5d84a0178f23 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 13:41:47 -0700 Subject: [PATCH 025/174] Backport PR #54685 on branch 2.1.x (ENH: support integer bitwise ops in ArrowExtensionArray) (#54691) Backport PR #54685: ENH: support integer bitwise ops in ArrowExtensionArray Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/arrow/array.py | 22 ++++++++++++++++++++-- pandas/tests/extension/test_arrow.py | 27 ++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 43a64a79e691b..9341237acfaa1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -268,6 +268,7 @@ Other enhancements - :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) - Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) +- PyArrow-backed integer dtypes now support bitwise operations (:issue:`54495`) .. --------------------------------------------------------------------------- .. 
_whatsnew_210.api_breaking: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 43320cf68cbec..48ff769f6c737 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -84,6 +84,15 @@ "rxor": lambda x, y: pc.xor(y, x), } + ARROW_BIT_WISE_FUNCS = { + "and_": pc.bit_wise_and, + "rand_": lambda x, y: pc.bit_wise_and(y, x), + "or_": pc.bit_wise_or, + "ror_": lambda x, y: pc.bit_wise_or(y, x), + "xor": pc.bit_wise_xor, + "rxor": lambda x, y: pc.bit_wise_xor(y, x), + } + def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar ) -> pa.ChunkedArray: @@ -582,7 +591,11 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: return self.to_numpy(dtype=dtype) def __invert__(self) -> Self: - return type(self)(pc.invert(self._pa_array)) + # This is a bit wise op for integer types + if pa.types.is_integer(self._pa_array.type): + return type(self)(pc.bit_wise_not(self._pa_array)) + else: + return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: return type(self)(pc.negate_checked(self._pa_array)) @@ -657,7 +670,12 @@ def _evaluate_op_method(self, other, op, arrow_funcs): return type(self)(result) def _logical_method(self, other, op): - return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) + # For integer types `^`, `|`, `&` are bitwise operators and return + # integer types. Otherwise these are boolean ops. + if pa.types.is_integer(self._pa_array.type): + return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS) + else: + return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) def _arith_method(self, other, op): return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e748f320b3f09..a9b7a8c655032 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -753,7 +753,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): class TestBaseUnaryOps(base.BaseUnaryOpsTests): def test_invert(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if not pa.types.is_boolean(pa_dtype): + if not (pa.types.is_boolean(pa_dtype) or pa.types.is_integer(pa_dtype)): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -1339,6 +1339,31 @@ def test_logical_masked_numpy(self, op, exp): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_bitwise(pa_type): + # GH 54495 + dtype = ArrowDtype(pa_type) + left = pd.Series([1, None, 3, 4], dtype=dtype) + right = pd.Series([None, 3, 5, 4], dtype=dtype) + + result = left | right + expected = pd.Series([None, None, 3 | 5, 4 | 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left & right + expected = pd.Series([None, None, 3 & 5, 4 & 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left ^ right + expected = pd.Series([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = ~left + expected = ~(left.fillna(0).to_numpy()) + expected = pd.Series(expected, dtype=dtype).mask(left.isnull()) + tm.assert_series_equal(result, expected) + + def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): with pytest.raises(NotImplementedError, match="Passing pyarrow type"): ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]") From 6c5e79b161729c950d44a18ef3b9dd5bdaa191d6 Mon Sep 17 
00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 13:41:59 -0700 Subject: [PATCH 026/174] Backport PR #54670 on branch 2.1.x (BUG: drop_duplicates raising for boolean arrow dtype with missing values) (#54692) Backport PR #54670: BUG: drop_duplicates raising for boolean arrow dtype with missing values Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/core/algorithms.py | 11 ++++++++--- pandas/tests/series/methods/test_drop_duplicates.py | 7 +++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9341237acfaa1..d2b9705a72f75 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -627,6 +627,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`) - Performance improvement in :meth:`Series.add` for PyArrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.drop_duplicates` for ``ArrowDtype`` (:issue:`54667`). - Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with PyArrow dtypes (:issue:`53950`) - Performance improvement in :meth:`Series.str.get_dummies` for PyArrow-backed strings (:issue:`53655`) - Performance improvement in :meth:`Series.str.get` for PyArrow-backed strings (:issue:`53152`) @@ -831,6 +832,7 @@ ExtensionArray - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for PyArrow temporal types raising ``ArrowInvalid`` (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 14dee202a9d8d..06da747a450ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,6 +55,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( + ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -996,9 +997,13 @@ def duplicated( ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + if hasattr(values, "dtype"): + if isinstance(values.dtype, ArrowDtype): + values = values._to_masked() # type: ignore[union-attr] + + if isinstance(values.dtype, BaseMaskedDtype): + values = cast("BaseMaskedArray", values) + return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) return htable.duplicated(values, keep=keep) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 96c2e1ba6d9bb..324ab1204e16e 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -249,3 +249,10 @@ def test_drop_duplicates_ignore_index(self): result = ser.drop_duplicates(ignore_index=True) expected = Series([1, 2, 3]) tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(self): + pytest.importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) From 1dbd7929639a76104f29e479b17b4f30a4112f83 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 14:18:15 -0700 Subject: [PATCH 027/174] Backport PR #54566 on branch 2.1.x (ENH: support Index.any/all with float, timedelta64 dtypes) (#54693) Backport PR #54566: ENH: support Index.any/all with float, timedelta64 dtypes Co-authored-by: jbrockmendel --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/indexes/base.py | 28 +++++++++++--------- pandas/tests/indexes/numeric/test_numeric.py | 8 ++++++ pandas/tests/indexes/test_base.py | 7 ++++- pandas/tests/indexes/test_old_base.py | 26 +++++++++++------- 5 files changed, 47 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d2b9705a72f75..7dd2fb570c695 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -265,6 +265,7 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) +- :meth:`Index.all` and :meth:`Index.any` with floating dtypes and timedelta64 dtypes no longer raise ``TypeError``, matching the :meth:`Series.all` 
and :meth:`Series.any` behavior (:issue:`54566`) - :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`) - Added support for the DataFrame Consortium Standard (:issue:`54383`) - Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 241b2de513a04..1f15a4ad84755 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7215,11 +7215,12 @@ def any(self, *args, **kwargs): """ nv.validate_any(args, kwargs) self._maybe_disable_logical_methods("any") - # error: Argument 1 to "any" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.any(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "any" to get TypeError instead + # of AttributeError + return vals._reduce("any") + return np.any(vals) def all(self, *args, **kwargs): """ @@ -7262,11 +7263,12 @@ def all(self, *args, **kwargs): """ nv.validate_all(args, kwargs) self._maybe_disable_logical_methods("all") - # error: Argument 1 to "all" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.all(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "all" to get TypeError instead + # of AttributeError + return vals._reduce("all") + return np.all(vals) @final def _maybe_disable_logical_methods(self, opname: str_t) -> None: @@ -7275,9 +7277,9 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: """ if ( isinstance(self, ABCMultiIndex) - or needs_i8_conversion(self.dtype) - or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) - or is_float_dtype(self.dtype) + # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, + # so checking needs_i8_conversion will be unnecessary + or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") ): # This call will raise make_invalid_op(opname)(self) diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 977c7da7d866f..8cd295802a5d1 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -227,6 +227,14 @@ def test_fillna_float64(self): exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp, exact=True) + def test_logical_compat(self, simple_index): + idx = simple_index + assert idx.all() == idx.values.all() + assert idx.any() == idx.values.any() + + assert idx.all() == idx.to_series().all() + assert idx.any() == idx.to_series().any() + class TestNumericInt: @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8, np.uint64]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ffa0b115e34fb..bc04c1c6612f4 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -692,7 +692,12 @@ def test_format_missing(self, vals, nulls_fixture): @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op, 
simple_index): index = simple_index - assert getattr(index, op)() == getattr(index.values, op)() + left = getattr(index, op)() + assert left == getattr(index.values, op)() + right = getattr(index.to_series(), op)() + # left might not match right exactly in e.g. string cases + # because we use np.any/all instead of .any/all + assert bool(left) == bool(right) @pytest.mark.parametrize( "index", ["string", "int64", "int32", "float64", "float32"], indirect=True diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f8f5a543a9c19..79dc423f12a85 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -209,17 +209,25 @@ def test_numeric_compat(self, simple_index): 1 // idx def test_logical_compat(self, simple_index): - if ( - isinstance(simple_index, RangeIndex) - or is_numeric_dtype(simple_index.dtype) - or simple_index.dtype == object - ): + if simple_index.dtype == object: pytest.skip("Tested elsewhere.") idx = simple_index - with pytest.raises(TypeError, match="cannot perform all"): - idx.all() - with pytest.raises(TypeError, match="cannot perform any"): - idx.any() + if idx.dtype.kind in "iufcbm": + assert idx.all() == idx._values.all() + assert idx.all() == idx.to_series().all() + assert idx.any() == idx._values.any() + assert idx.any() == idx.to_series().any() + else: + msg = "cannot perform (any|all)" + if isinstance(idx, IntervalIndex): + msg = ( + r"'IntervalArray' with dtype interval\[.*\] does " + "not support reduction '(any|all)'" + ) + with pytest.raises(TypeError, match=msg): + idx.all() + with pytest.raises(TypeError, match=msg): + idx.any() def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): From 503cef4f81b17f6e0c5a9914eb3413ac63f31a23 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:15:40 -0700 Subject: [PATCH 028/174] Backport PR #54694 on branch 2.1.x (MAINT: Remove `np.in1d` function calls) (#54697) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #54694: MAINT: Remove `np.in1d` function calls Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- pandas/core/algorithms.py | 8 ++++---- pandas/core/arrays/interval.py | 4 ++-- pandas/tseries/holiday.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 06da747a450ee..0a9c1aad46f89 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -518,9 +518,9 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: return isin(np.asarray(comps_array), np.asarray(values)) # GH16012 - # Ensure np.in1d doesn't get object types or it *may* throw an exception + # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), - # in1d is faster for small sizes + # isin is faster for small sizes if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 @@ -531,10 +531,10 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: if isna(values).any(): def f(c, v): - return np.logical_or(np.in1d(c, v), np.isnan(c)) + return np.logical_or(np.isin(c, v).ravel(), np.isnan(c)) else: - f = np.in1d + f = lambda a, b: np.isin(a, b).ravel() else: common = np_find_common_type(values.dtype, comps_array.dtype) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3263dd73fe4dc..2fce6631afa9b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1844,14 +1844,14 @@ def isin(self, values) -> npt.NDArray[np.bool_]: # complex128 ndarray is much more performant. left = self._combined.view("complex128") right = values._combined.view("complex128") - # error: Argument 1 to "in1d" has incompatible type + # error: Argument 1 to "isin" has incompatible type # "Union[ExtensionArray, ndarray[Any, Any], # ndarray[Any, dtype[Any]]]"; expected # "Union[_SupportsArray[dtype[Any]], # _NestedSequence[_SupportsArray[dtype[Any]]], bool, # int, float, complex, str, bytes, _NestedSequence[ # Union[bool, int, float, complex, str, bytes]]]" - return np.in1d(left, right) # type: ignore[arg-type] + return np.isin(left, right).ravel() # type: ignore[arg-type] elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( values.left.dtype diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 44c21bc284121..75cb7f7850013 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -283,11 +283,11 @@ def dates( holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: holiday_dates = holiday_dates[ - np.in1d( + np.isin( # error: "DatetimeIndex" has no attribute "dayofweek" holiday_dates.dayofweek, # type: ignore[attr-defined] self.days_of_week, - ) + ).ravel() ] if self.start_date is not None: From dc27aeef5494f473f60423f1ad03271610a56671 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 19:42:29 -0700 Subject: [PATCH 029/174] Backport PR #54625 on branch 2.1.x (BUG: Fix error in printing timezone series) (#54698) Backport PR #54625: BUG: Fix error in printing timezone series Co-authored-by: Adrian D'Alessandro --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/formats/format.py | 2 +- pandas/tests/io/formats/test_format.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7dd2fb570c695..7b0c120c3bbdb 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -667,6 +667,7 @@ Datetimelike - Bug in constructing a :class:`Timestamp` from a string representing a time without a date inferring an incorrect unit (:issue:`54097`) - Bug in constructing a :class:`Timestamp` with ``ts_input=pd.NA`` raising ``TypeError`` (:issue:`45481`) - Bug in parsing datetime strings with weekday but no day e.g. 
"2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`) +- Bug in the repr for :class:`Series` when dtype is a timezone aware datetime with non-nanosecond resolution raising ``OutOfBoundsDatetime`` (:issue:`54623`) Timedelta ^^^^^^^^^ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ff26abd5cc26c..2297f7945a264 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1830,8 +1830,8 @@ def get_format_datetime64_from_values( class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" + ido = is_dates_only(self.values) values = self.values.astype(object) - ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format ) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 3b0dac21ef10c..7dfd35d5424b4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3320,6 +3320,14 @@ def format_func(x): result = formatter.get_result() assert result == ["10:10", "12:12"] + def test_datetime64formatter_tz_ms(self): + x = Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ).dt.tz_localize("US/Pacific") + result = fmt.Datetime64TZFormatter(x).get_result() + assert result[0].strip() == "2999-01-01 00:00:00-08:00" + assert result[1].strip() == "2999-01-02 00:00:00-08:00" + class TestNaTFormatting: def test_repr(self): From 968b517ab3388d39eb58acc6a35ff65316c87275 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 22 Aug 2023 19:42:44 -0700 Subject: [PATCH 030/174] Backport PR #54508 on branch 2.1.x (PERF: DataFrame.iloc[int] for EA dtypes) (#54700) Backport PR #54508: PERF: DataFrame.iloc[int] for EA dtypes Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/internals/managers.py | 21 ++++++--------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7b0c120c3bbdb..b79797fa86431 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -622,6 +622,7 @@ Performance improvements - Performance improvement in :meth:`.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`) +- Performance improvement in :meth:`DataFrame.iloc` when input is an single integer and dataframe is backed by extension dtypes (:issue:`54508`) - Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single PyArrow dtype (:issue:`54224`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7bb579b22aeed..4a6d3c333a6a5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -968,19 +968,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager: n = len(self) - # GH#46406 - immutable_ea = isinstance(dtype, ExtensionDtype) and dtype._is_immutable - - if isinstance(dtype, ExtensionDtype) and not immutable_ea: - cls = dtype.construct_array_type() - result 
= cls._empty((n,), dtype=dtype) + if isinstance(dtype, ExtensionDtype): + result = np.empty(n, dtype=object) else: - # error: Argument "dtype" to "empty" has incompatible type - # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected - # "None" - result = np.empty( - n, dtype=object if immutable_ea else dtype # type: ignore[arg-type] - ) + result = np.empty(n, dtype=dtype) result = ensure_wrapped_if_datetimelike(result) for blk in self.blocks: @@ -989,9 +980,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: for i, rl in enumerate(blk.mgr_locs): result[rl] = blk.iget((i, loc)) - if immutable_ea: - dtype = cast(ExtensionDtype, dtype) - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) bp = BlockPlacement(slice(0, len(result))) block = new_block(result, placement=bp, ndim=1) From 5c9b63c38255a57dfc3d2f2b2aa97d190a2b010d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 23 Aug 2023 11:39:10 -0700 Subject: [PATCH 031/174] Backport PR #54707 on branch 2.1.x (BUG: ArrowExtensionArray.fillna with duration types) (#54711) Backport PR #54707: BUG: ArrowExtensionArray.fillna with duration types Co-authored-by: Luke Manley --- pandas/core/arrays/arrow/array.py | 34 +++++++--------------------- pandas/tests/extension/test_arrow.py | 10 ++++++++ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 48ff769f6c737..4ba0e70393efb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -381,8 +381,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: elif isna(value): pa_scalar = pa.scalar(None, type=pa_type) else: - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 if isinstance(value, Timedelta): if pa_type is None: pa_type = pa.duration(value.unit) @@ -448,8 +447,7 @@ def _box_pa_array( and pa.types.is_duration(pa_type) and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") ): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) @@ -462,8 +460,7 @@ def _box_pa_array( pa_array = pa.array(value, from_pandas=True) if pa_type is None and pa.types.is_duration(pa_array.type): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value) @@ -965,26 +962,11 @@ def fillna( f" expected {len(self)}" ) - def convert_fill_value(value, pa_type, dtype): - if value is None: - return value - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): - return value - if isinstance(value, Timedelta) and value.unit in ("s", "ms"): - # Workaround https://github.com/apache/arrow/issues/37291 - value = value.to_numpy() - if is_array_like(value): - pa_box = pa.array - else: - pa_box = 
pa.scalar - try: - value = pa_box(value, type=pa_type, from_pandas=True) - except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {dtype}" - raise TypeError(msg) from err - return value - - fill_value = convert_fill_value(value, self._pa_array.type, self.dtype) + try: + fill_value = self._box_pa(value, pa_type=self._pa_array.type) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err try: if method is None: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a9b7a8c655032..86a7f20214a9a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3049,3 +3049,13 @@ def test_arrowextensiondtype_dataframe_repr(): # pyarrow.ExtensionType values are displayed expected = " col\n0 15340\n1 15341\n2 15342" assert result == expected + + +@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) +def test_duration_fillna_numpy(pa_type): + # GH 54707 + ser1 = pd.Series([None, 2], dtype=ArrowDtype(pa_type)) + ser2 = pd.Series(np.array([1, 3], dtype=f"m8[{pa_type.unit}]")) + result = ser1.fillna(ser2) + expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type)) + tm.assert_series_equal(result, expected) From 2ad36cccdbcfe68c9eac047dbaf1f198f8eb9796 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 23 Aug 2023 14:09:23 -0700 Subject: [PATCH 032/174] Backport PR #54533 on branch 2.1.x (Implement Arrow String Array that is compatible with NumPy semantics) (#54713) Backport PR #54533: Implement Arrow String Array that is compatible with NumPy semantics Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/conftest.py | 5 +- pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_.py | 21 ++- pandas/core/arrays/string_arrow.py | 149 +++++++++++++++++++-- pandas/core/config_init.py | 2 +- pandas/core/strings/accessor.py | 4 +- pandas/tests/arrays/string_/test_string.py | 99 ++++++++++---- pandas/tests/arrays/test_datetimelike.py | 7 +- pandas/tests/extension/base/methods.py | 3 + pandas/tests/extension/test_string.py | 2 + pandas/tests/io/conftest.py | 16 +++ pandas/tests/strings/__init__.py | 2 +- pandas/tests/strings/test_case_justify.py | 5 + pandas/tests/strings/test_find_replace.py | 6 +- 14 files changed, 273 insertions(+), 53 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5210e727aeb3c..10826f50d1fe1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1321,6 +1321,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1329,6 +1330,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'pyarrow_numpy' """ return request.param @@ -1380,6 +1382,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): @@ -2000,4 +2003,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4ba0e70393efb..528b085bd5236 100644 --- a/pandas/core/arrays/arrow/array.py +++ 
b/pandas/core/arrays/arrow/array.py @@ -512,7 +512,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..1e285f90e9fea 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -76,7 +76,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -108,11 +108,11 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." ) @@ -160,6 +160,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +178,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +200,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..bc1d7cb52e196 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from typing import ( TYPE_CHECKING, @@ -27,6 +28,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -113,10 +115,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _storage = "pyarrow" 
def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -144,7 +147,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -178,6 +184,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @classmethod + def _result_converter(cls, values, na=None): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): @@ -313,7 +323,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,7 +332,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -331,7 +341,7 @@ def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -369,39 +379,39 @@ def _str_fullmatch( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): 
result = pc.utf8_length(self._pa_array) @@ -433,3 +443,114 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _storage = "pyarrow_numpy" + + @classmethod + def _result_converter(cls, values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + + def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamond inheritance) + if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _convert_int_dtype(self, result): + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat).to_numpy() + return self._convert_int_dtype(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array).to_numpy() + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result.to_numpy()) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True): + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..745689ab1fcc8 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index becf9b47b3af1..124ca546c4583 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index de93e89ecacd5..e29a72e1a5338 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,12 +5,16 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.util.version import Version +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) @pytest.fixture @@ -33,7 +37,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + else: + arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert 
repr(df.A.array) == expected @@ -195,19 +204,34 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = ( + pd.array(expected, dtype="boolean") + .to_numpy(na_value=False) + .astype(np.bool_) + ) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +247,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +270,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - 
result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +340,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -412,7 +455,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -455,6 +498,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +515,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..20530e37116f2 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -324,7 +324,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + if string_storage == "python": + arr_type = "StringArray" + elif string_storage == "pyarrow": + arr_type = "ArrowStringArray" + else: + arr_type = "ArrowStringArrayNumpySemantics" with pd.option_context("string_storage", string_storage): with pytest.raises( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..16059155a7a8f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git 
a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4e142eb6e14b8..069d53aeb248f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -184,6 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. + + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 496a2d095d85b..bf119f2721ed4 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,7 +2,7 @@ import pandas as pd -object_pyarrow_numpy = ("object",) +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") def _convert_na_value(ser, expected): diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..1dee25e631648 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,11 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip( + "Arrow logic is different, " + "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index d5017b1c47d85..78f0730d730e8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]",) + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): @@ -223,6 +223,8 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) @@ -807,7 +809,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) From c5adf1ab5dd30fe132300eb5df4a8da3a0b6c95c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 24 Aug 2023 12:00:42 -0700 Subject: [PATCH 033/174] Backport PR #54721 on branch 2.1.x (BUG/WARN: Passing EA object to dtype instead of an instance) (#54726) Backport PR #54721: BUG/WARN: Passing EA object to dtype instead of an instance Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/dtypes/common.py | 9 +++++++++ pandas/tests/dtypes/test_common.py | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b79797fa86431..5d46f819b07f9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -837,6 +837,7 @@ ExtensionArray - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) +- Bug when passing an :class:`ExtensionArray` subclass to ``dtype`` keywords. This will now raise a ``UserWarning`` to encourage passing an instance instead (:issue:`31356`, :issue:`54592`) - Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c2e498e75b7d3..3db36fc50e343 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1614,6 +1614,15 @@ def pandas_dtype(dtype) -> DtypeObj: # registered extension types result = registry.find(dtype) if result is not None: + if isinstance(result, type): + # GH 31356, GH 54592 + warnings.warn( + f"Instantiating {result.__name__} without any arguments." 
+ f"Pass a {result.__name__} instance to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + result = result() return result # try a numpy dtype diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 471e456146178..165bf61302145 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -782,3 +782,9 @@ def test_pandas_dtype_numpy_warning(): match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", ): pandas_dtype(np.integer) + + +def test_pandas_dtype_ea_not_instance(): + # GH 31356 GH 54592 + with tm.assert_produces_warning(UserWarning): + assert pandas_dtype(CategoricalDtype) == CategoricalDtype() From 9cc7a2f75fd9ac02d41c3d1fb68548d5e0936844 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 24 Aug 2023 14:33:27 -0700 Subject: [PATCH 034/174] Backport PR #54720 on branch 2.1.x (Infer strings as pyarrow_numpy backed strings) (#54735) Backport PR #54720: Infer strings as pyarrow_numpy backed strings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 6 ++++-- pandas/_libs/lib.pyx | 6 ++---- pandas/core/construction.py | 9 +++------ pandas/core/dtypes/cast.py | 5 ++--- pandas/core/internals/construction.py | 8 ++------ pandas/io/_util.py | 2 +- pandas/io/pytables.py | 13 +++---------- pandas/tests/frame/test_constructors.py | 8 ++++---- .../indexes/base_class/test_constructors.py | 4 ++-- pandas/tests/io/json/test_pandas.py | 8 ++++---- .../tests/io/parser/dtypes/test_dtypes_basic.py | 4 ++-- pandas/tests/io/pytables/test_read.py | 6 +++--- pandas/tests/io/test_feather.py | 6 +----- pandas/tests/io/test_orc.py | 4 ++-- pandas/tests/io/test_parquet.py | 6 ++---- pandas/tests/io/test_sql.py | 4 ++-- pandas/tests/series/test_constructors.py | 16 ++++++++-------- 17 files changed, 47 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 5d46f819b07f9..fabe910261c3d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0 `PyArrow `_ will become a required dependency of pandas starting with pandas 3.0. This decision was made based on -`PDEP 12 `_. +`PDEP 10 `_. This will enable more changes that are hugely beneficial to pandas users, including but not limited to: @@ -41,7 +41,9 @@ Avoid NumPy object dtype for strings by default Previously, all strings were stored in columns with NumPy object dtype. This release introduces an option ``future.infer_string`` that infers all -strings as PyArrow backed strings with dtype ``pd.ArrowDtype(pa.string())`` instead. +strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead. +This is a new string dtype implementation that follows NumPy semantics in comparison +operations and will return ``np.nan`` as the missing value indicator. This option only works if PyArrow is installed. PyArrow backed strings have a significantly reduced memory footprint and provide a big performance improvement compared to NumPy object (:issue:`54430`). 
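
A minimal sketch of the behavior the whatsnew entry above describes (not part
of any patch in this series; it assumes a pandas 2.1 build with PyArrow
installed, and the option and dtype names are taken from the diffs above):

    import pandas as pd

    # Default behavior: string data lands in a NumPy object-dtype column.
    ser = pd.Series(["a", "b", None])
    assert ser.dtype == object

    # With the option enabled, strings are inferred as PyArrow-backed strings
    # that follow NumPy semantics: np.nan marks missing values and comparisons
    # return plain NumPy boolean arrays instead of masked boolean arrays.
    with pd.option_context("future.infer_string", True):
        ser = pd.Series(["a", "b", None])
        assert ser.dtype == "string[pyarrow_numpy]"
        assert (ser == "a").dtype == bool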
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2681115bbdcfb..859cb8e5ebead 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2682,11 +2682,9 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.str_: if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - from pandas.core.dtypes.dtypes import ArrowDtype - - dtype = ArrowDtype(pa.string()) + dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f733ba3b445fd..aaac0dc73486f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -51,10 +51,7 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - NumpyEADtype, -) +from pandas.core.dtypes.dtypes import NumpyEADtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -595,9 +592,9 @@ def sanitize_array( if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - dtype = ArrowDtype(pa.string()) + dtype = StringDtype(storage="pyarrow_numpy") subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 657cbce40087a..8ed57e9bf5532 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -799,10 +799,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = _dtype_obj if using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - pa_dtype = pa.string() - dtype = ArrowDtype(pa_dtype) + dtype = StringDtype(storage="pyarrow_numpy") elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 8879d3318f7ca..8aafb9dd623c5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -32,10 +32,7 @@ is_named_tuple, is_object_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -379,10 +376,9 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): - import pyarrow as pa + dtype = StringDtype(storage="pyarrow_numpy") obj_columns = list(values) - dtype = ArrowDtype(pa.string()) block_values = [ new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 27316b3ab0af0..915595833468d 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,4 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.ArrowDtype(pa.string())}.get + return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f26411f65d91f..d3055f0ad2a38 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -68,7 +68,6 @@ ) from pandas.core.dtypes.missing import 
array_equivalent -import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3224,9 +3223,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - result = result.astype(pd.ArrowDtype(pa.string())) + result = result.astype("string[pyarrow_numpy]") return result # error: Signature of "write" incompatible with supertype "Fixed" @@ -3296,9 +3293,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") dfs.append(df) if len(dfs) > 0: @@ -4686,9 +4681,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") frames.append(df) if len(frames) == 1: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 63cddb7f192e6..a8ab3ce1ba014 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2720,8 +2720,8 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 638124ac20e06..60abbfc441e8e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Index(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 637d62b98a831..437012d836941 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2099,7 +2099,7 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) @@ -2108,8 +2108,8 @@ def test_json_roundtrip_string_inference(orient): result = read_json(StringIO(out)) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), - columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + 
dtype="string[pyarrow_numpy]", + index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1c0f0939029ff..8494c5d58171b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" data = """a,b x,1 diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 425828cb881a7..bc5ed4d208e0f 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -392,7 +392,7 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -400,7 +400,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype=pd.ArrowDtype(pa.string()), - columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a0fee6751bf53..cf43203466ef4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self): def test_string_inference(self, tmp_path): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame( - data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) - ) + expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index c2d791ba24c87..d90f803f1e607 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -426,7 +426,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a4c6cfcf9fe0e..ccba44b7f8fe4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1099,8 +1099,6 @@ def test_df_attrs_persistence(self, tmp_path, pa): def test_string_inference(self, tmp_path, pa): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) df.to_parquet(path, engine="pyarrow") @@ -1108,8 +1106,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, 
engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 02736406e109b..2f446e6b8c81d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2946,7 +2946,7 @@ def test_read_sql_dtype_backend_table(self, string_storage, func): def test_read_sql_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=self.conn, index=False, if_exists="replace") @@ -2954,7 +2954,7 @@ def test_read_sql_string_inference(self): with pd.option_context("future.infer_string", True): result = read_sql_table(table, self.conn) - dtype = pd.ArrowDtype(pa.string()) + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 97bd8633954d8..ef734e9664844 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) @@ -2092,8 +2092,8 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) @@ -2101,16 +2101,16 @@ def test_series_string_with_na_inference(self, na_value): def test_series_string_inference_scalar(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pa = pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) tm.assert_series_equal(ser, expected) From 4f66163376a9139f65063a92c9f9e81ce1e45f93 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 24 Aug 2023 15:44:39 -0700 Subject: [PATCH 035/174] Backport PR #54728 on branch 2.1.x (Docs: Update doc string to not reference deprecated function fillna for ffill) (#54740) Backport PR #54728: Docs: Update doc string to not reference deprecated function fillna for ffill Co-authored-by: Jessica 
Greene
---
 pandas/core/generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 954573febed41..09ebd9387220f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7348,7 +7348,7 @@ def ffill(
         downcast: dict | None | lib.NoDefault = lib.no_default,
     ) -> Self | None:
         """
-        Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
+        Fill NA/NaN values by propagating the last valid observation to next valid.
 
         Returns
         -------

From b7618443e82b5309011cd739eec61e79893f6118 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Fri, 25 Aug 2023 13:58:27 -0700
Subject: [PATCH 036/174] Backport PR #54699 on branch 2.1.x (BUG: setitem with object type inserting Series maintains Series) (#54760)

Backport PR #54699: BUG: setitem with object type inserting Series maintains Series

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.0.rst         |  1 +
 pandas/core/internals/blocks.py        |  4 +++-
 pandas/tests/indexing/test_indexing.py | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index fabe910261c3d..e3f18067e354d 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -729,6 +729,7 @@ Indexing
 - Bug in :meth:`DataFrame.__setitem__` losing dtype when setting a :class:`DataFrame` into duplicated columns (:issue:`53143`)
 - Bug in :meth:`DataFrame.__setitem__` with a boolean mask and :meth:`DataFrame.putmask` with mixed non-numeric dtypes and a value other than ``NaN`` incorrectly raising ``TypeError`` (:issue:`53291`)
 - Bug in :meth:`DataFrame.iloc` when using ``nan`` as the only element (:issue:`52234`)
+- Bug in :meth:`Series.loc` casting :class:`Series` to ``np.ndarray`` when assigning :class:`Series` at predefined index of ``object`` dtype :class:`Series` (:issue:`48933`)
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index ecb9cd47d7995..6df90eece3cc8 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1133,7 +1133,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
 
         # length checking
         check_setitem_lengths(indexer, value, values)
-        value = extract_array(value, extract_numpy=True)
+        if self.dtype != _dtype_obj:
+            # GH48933: extract_array would convert a pd.Series value to np.ndarray
+            value = extract_array(value, extract_numpy=True)
         try:
             casted = np_can_hold_element(values.dtype, value)
         except LossySetitemError:
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index d6d8a63797bb6..224555399ff8f 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -1132,3 +1132,19 @@ def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli):
         assert (ser.loc[0] == value).all()
     else:
         assert ser.loc[0] == value
+
+
+def test_object_dtype_series_set_series_element():
+    # GH 48933
+    s1 = Series(dtype="O", index=["a", "b"])
+
+    s1["a"] = Series()
+    s1.loc["b"] = Series()
+
+    tm.assert_series_equal(s1.loc["a"], Series())
+    tm.assert_series_equal(s1.loc["b"], Series())
+
+    s2 = Series(dtype="O", index=["a", "b"])
+
+    s2.iloc[1] = Series()
+    tm.assert_series_equal(s2.iloc[1], Series())

From 2451d2e5cad3fa51f7fd9a4a7f8382989ba4f957 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)"
<39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 25 Aug 2023 13:59:10 -0700 Subject: [PATCH 037/174] Backport PR #54752 on branch 2.1.x (REGR: groupby.count returning string dtype instead of numeric for string input) (#54762) Backport PR #54752: REGR: groupby.count returning string dtype instead of numeric for string input Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 5 ++++- pandas/tests/groupby/test_counting.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5a7f42a535951..dd0c995e6acca 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -107,6 +107,7 @@ class providing the base-class of operations. IntegerArray, SparseArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import ( PandasObject, SelectionMixin, @@ -2261,7 +2262,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: return IntegerArray( counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) ) - elif isinstance(bvalues, ArrowExtensionArray): + elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( + bvalues.dtype, StringDtype + ): return type(bvalues)._from_sequence(counted[0]) if is_series: assert counted.ndim == 2 diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 6c27344ce3110..885e7848b76cb 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -379,3 +379,14 @@ def __eq__(self, other): result = df.groupby("grp").count() expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp")) tm.assert_frame_equal(result, expected) + + +def test_count_arrow_string_array(any_string_dtype): + # GH#54751 + pytest.importorskip("pyarrow") + df = DataFrame( + {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)} + ) + result = df.groupby("a").count() + expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a")) + tm.assert_frame_equal(result, expected) From d407b03a8109002d1b867c1fe89fd795278cdeeb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 25 Aug 2023 17:45:15 -0700 Subject: [PATCH 038/174] Backport PR #54757 on branch 2.1.x (CI: Pin Cython on CI) (#54758) Backport PR #54757: CI: Pin Cython on CI Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 638aaedecf2b5..2190136220c6c 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 4d925786c22d6..cf85345cb0cc2 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - 
- cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 2cd4d5f3528f8..1766721459ed2 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -25,7 +25,7 @@ dependencies: - pip - pip: - - "cython" + - "cython<=3.0.1" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index f24e866af0439..e850f69f945c5 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.0.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 698107341c26d..3c1630714a041 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index a0aafb1d772a7..b1cea49e22d15 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 6c6b1de165496..b8a119ece4b03 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 035395d55eb3a..23c3706f43dad 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index d436e90fa6186..71686837451b4 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.1 - meson[ninja]=1.0.1 - meson-python=0.13.1 From 0c7a8b41a8c5256359b0f4820ae1786fc7e6efba Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Aug 2023 00:57:19 -0700 Subject: [PATCH 039/174] Backport PR #54759 on branch 2.1.x (CI: Pin more cython) (#54764) Backport PR #54759: CI: Pin more cython Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 6 +++--- ci/deps/actions-311-numpydev.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 030c9546fecca..6410f2edd6175 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -234,7 +234,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -272,7 +272,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -345,7 +345,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata "cython<3.0.1" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index python -m pip list diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 1766721459ed2..acf94fce5288b 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -25,7 +25,7 @@ dependencies: - pip - pip: - - "cython<=3.0.1" + - "cython<3.0.1" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" From 9eb6f5d4e2a7cca80f9304f8f0875ca711e1ee1e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Aug 2023 03:40:15 -0700 Subject: [PATCH 040/174] Backport PR #54750 on branch 2.1.x (Revert deprecation of con as keyword only arg) (#54766) Backport PR #54750: Revert deprecation of con as keyword only arg Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/generic.py | 2 +- pandas/tests/io/test_sql.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e3f18067e354d..f5758a079b1b5 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -586,7 +586,7 @@ Other Deprecations - Deprecated the use of non-supported datetime64 and timedelta64 resolutions with :func:`pandas.array`. 
Supported resolutions are: "s", "ms", "us", "ns" resolutions (:issue:`53058`) - Deprecated values ``"pad"``, ``"ffill"``, ``"bfill"``, ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) - Deprecated the behavior of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and ``skipna=True`` or any-NAs and ``skipna=False`` returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` and ``con`` (:issue:`54229`) .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 09ebd9387220f..b4281f6959997 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2796,7 +2796,7 @@ def to_hdf( @final @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "name"], name="to_sql" + version="3.0", allowed_args=["self", "name", "con"], name="to_sql" ) def to_sql( self, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 2f446e6b8c81d..9ec0ba0b12a76 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2849,13 +2849,14 @@ def setup_driver(cls): def test_keyword_deprecation(self): # GH 54397 msg = ( - "tarting with pandas version 3.0 all arguments of to_sql except for the " - "argument 'name' will be keyword-only." + "Starting with pandas version 3.0 all arguments of to_sql except for the " + "arguments 'name' and 'con' will be keyword-only." ) df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + df.to_sql("example", self.conn) with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", self.conn) + df.to_sql("example", self.conn, None, if_exists="replace") def test_default_type_conversion(self): df = sql.read_sql_table("types", self.conn) From 605aea2d3bae5ab938a3da389841c9804bb34d15 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Aug 2023 06:14:49 -0700 Subject: [PATCH 041/174] Backport PR #54585 on branch 2.1.x (Use NaN as na_value for new pyarrow_numpy StringDtype) (#54767) Backport PR #54585: Use NaN as na_value for new pyarrow_numpy StringDtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_.py | 10 +++-- pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++------- pandas/tests/strings/__init__.py | 9 +++-- pandas/tests/strings/test_split_partition.py | 9 +++-- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1e285f90e9fea..e3656b6ff4bcc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -98,10 +98,14 @@ class StringDtype(StorageExtensionDtype): name = "string" - #: StringDtype().na_value uses pandas.NA + #: StringDtype().na_value uses pandas.NA except the implementation that + # follows NumPy semantics, which uses nan. 
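Illustration (not part of the patch): the storage-dependent na_value introduced below changes which missing-value sentinel users see. A minimal sketch of the resulting behavior, assuming pandas 2.1 with pyarrow installed:

import numpy as np
import pandas as pd

# NA-semantics storages keep pd.NA as the missing-value sentinel
ser_na = pd.Series(["a", None], dtype="string[pyarrow]")
assert ser_na[1] is pd.NA

# the new NumPy-semantics storage surfaces float nan instead
ser_nan = pd.Series(["a", None], dtype="string[pyarrow_numpy]")
assert np.isnan(ser_nan[1])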
@property - def na_value(self) -> libmissing.NAType: - return libmissing.NA + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA _metadata = ("storage",) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e29a72e1a5338..68b2b01ddcaf3 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -17,6 +17,13 @@ ) +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA + + @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -31,26 +38,34 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - expected = " A\n0 a\n1 <NA>\n2 b" + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 <NA>\n2 b" assert repr(df) == expected - expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" assert repr(df.A) == expected if dtype.storage == "pyarrow": arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow_numpy": arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" - expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected def test_none_to_nan(cls): a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None - assert a[1] is pd.NA + assert a[1] is na_val(a.dtype) def test_setitem_validates(cls): @@ -205,13 +220,9 @@ def test_comparison_methods_scalar(comparison_op, dtype): other = "a" result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = ( - pd.array(expected, dtype="boolean") - .to_numpy(na_value=False) - .astype(np.bool_) - ) - tm.assert_numpy_array_equal(result, expected) + expected = np.array([getattr(item, op_name)(other) for item in a]) + expected[1] = False + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) @@ -407,7 +418,7 @@ def test_min_max(method, skipna, dtype, request): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is pd.NA + assert result is na_val(arr.dtype) @pytest.mark.parametrize("method", ["min", "max"]) @@ -475,7 +486,7 @@ def test_arrow_roundtrip(dtype, string_storage2): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is pd.NA + assert result.loc[2, "a"] is na_val(result["a"].dtype) def test_arrow_load_from_zero_chunks(dtype, string_storage2): @@ -573,7 +584,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"],
dtype=object) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -613,7 +624,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is pd.NA + assert ser.array[1] is na_val(ser.dtype) # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index bf119f2721ed4..01b49b5e5b633 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -1,4 +1,4 @@ -# Needed for new arrow string dtype +import numpy as np import pandas as pd @@ -7,6 +7,9 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + if ser.dtype.storage == "pyarrow_numpy": + expected = expected.fillna(np.nan) + else: + # GH#18463 + expected = expected.fillna(pd.NA) return expected diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 7fabe238d2b86..0a7d409773dd6 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,7 +12,10 @@ Series, _testing as tm, ) -from pandas.tests.strings import _convert_na_value +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -113,8 +116,8 @@ def test_split_object_mixed(expand, method): def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=n) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -381,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) From d42fbed6313e0441d0eafc87fa044ea2ae338e56 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:45:08 -0700 Subject: [PATCH 042/174] Backport PR #54768 on branch 2.1.x (Fix roundtripping with pyarrow schema) (#54773) Backport PR #54768: Fix roundtripping with pyarrow schema Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_.py | 3 ++- pandas/tests/io/test_parquet.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e3656b6ff4bcc..3af63c781d499 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,6 +14,7 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked +from pandas._libs.lib import ensure_string_array from pandas.compat import pa_version_under7p0 from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -221,7 +222,7 @@ def __from_arrow__( arr = np.array([], dtype=object) else: arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = lib.convert_nans_to_NA(arr) + arr = ensure_string_array(arr, na_value=libmissing.NA) # Bypass validation inside StringArray constructor, see GH#47781 
new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ccba44b7f8fe4..b2d6e958020be 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,5 +1,6 @@ """ test parquet compat """ import datetime +from decimal import Decimal from io import BytesIO import os import pathlib @@ -16,6 +17,7 @@ from pandas.compat.pyarrow import ( pa_version_under7p0, pa_version_under8p0, + pa_version_under11p0, pa_version_under13p0, ) @@ -1111,6 +1113,18 @@ def test_string_inference(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") + def test_roundtrip_decimal(self, tmp_path, pa): + # GH#54768 + import pyarrow as pa + + path = tmp_path / "decimal.p" + df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") + df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) + result = read_parquet(path) + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From f7eb2ccea2beed48517b2a9ec6b19a92db2ef422 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 27 Aug 2023 23:45:14 +0200 Subject: [PATCH 043/174] Backport PR #54755 on branch 2.1.x (BUG: merge raising for ea int and numpy float) (#54779) BUG: merge raising for ea int and numpy float (#54755) * BUG: merge raising for ea int and numpy float * Fix up mypy and add check (cherry picked from commit 9939c323e636d580e8ca1690aef40b6822ff02f7) --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/reshape/merge.py | 16 ++++++++++++++++ pandas/tests/reshape/merge/test_merge.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f5758a079b1b5..19a8500928ab7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -816,6 +816,7 @@ Reshaping - Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`) - Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`) - Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`) +- Bug in :func:`merge` when merging on integer ``ExtensionDtype`` and float NumPy dtype raising ``TypeError`` (:issue:`46178`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) - Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`) - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c2cb9d643ca87..d811c53909fde 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -53,6 +53,7 @@ ensure_object, is_bool, is_bool_dtype, + is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -1385,6 +1386,21 @@ def _maybe_coerce_merge_keys(self) -> None: if lk.dtype.kind == rk.dtype.kind: continue + if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype( + rk.dtype + ): + 
ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr] # noqa: E501 + else: + rk = rk.astype(ct) # type: ignore[arg-type] + elif is_extension_array_dtype(rk.dtype): + ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr] # noqa: E501 + else: + lk = lk.astype(ct) # type: ignore[arg-type] + # check whether ints and floats if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 91510fae0a9b1..98db24530833b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2847,3 +2847,26 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +def test_merge_ea_int_and_float_numpy(): + # GH#46178 + df1 = DataFrame([1.0, np.nan], dtype=pd.Int64Dtype()) + df2 = DataFrame([1.5]) + expected = DataFrame(columns=[0], dtype="Int64") + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) + + df2 = DataFrame([1.0]) + expected = DataFrame([1], columns=[0], dtype="Int64") + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) From d27c4675a90c3565cbe674d178685fe7d7f4bf13 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:01:15 -0700 Subject: [PATCH 044/174] Backport PR #54783 on branch 2.1.x (CI: Remove strict option) (#54785) Backport PR #54783: CI: Remove strict option Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 9daa1a67bc2c1..f87aef5385898 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --strict --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same build_docker_dev_environment: name: Build Docker Dev Environment From 9ffc438a8137c5c5c8e8effab1d53e9d1a20775e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 05:30:58 -0700 Subject: [PATCH 045/174] Backport PR #54789 on branch 2.1.x (WARN: Remove non-actionable warning in value_counts) (#54795) Backport PR #54789: WARN: Remove non-actionable warning in value_counts Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/frame.py | 2 +- pandas/tests/frame/methods/test_value_counts.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 56e1b82c992c3..c34df3e0e865c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7260,7 +7260,7 @@ def value_counts( subset 
= self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna).grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size() counts.name = name if sort: diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 355f05cd5156c..c05a929360478 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -175,3 +175,17 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns): ) tm.assert_series_equal(result, expected) + + +def test_value_counts_categorical_future_warning(): + # GH#54775 + df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category") + result = df.value_counts() + expected = pd.Series( + 1, + index=pd.MultiIndex.from_arrays( + [pd.Index([1, 2, 3], name="a", dtype="category")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) From 901b5e67a81aad01f17ee81fbd56192ccc73377b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 05:31:28 -0700 Subject: [PATCH 046/174] Backport PR #54527 on branch 2.1.x (BUG: Special-case setting nan into integer series) (#54791) Backport PR #54527: BUG: Special-case setting nan into integer series Co-authored-by: Marco Edward Gorelli --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 16 ++++++++++ pandas/core/internals/blocks.py | 23 ++++++++++++++ pandas/tests/frame/indexing/test_indexing.py | 30 ++++++++----------- pandas/tests/indexing/test_indexing.py | 16 +++------- pandas/tests/series/indexing/test_indexing.py | 21 ++++++++----- pandas/tests/series/indexing/test_setitem.py | 29 ++++++++---------- .../series/methods/test_convert_dtypes.py | 12 +------- 8 files changed, 83 insertions(+), 65 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 32641319a6b96..15bd5a7379105 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -195,6 +195,7 @@ def array_equivalent_object( right: npt.NDArray[np.object_], ) -> bool: ... def has_infs(arr: np.ndarray) -> bool: ... # const floating[:] +def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... 
# const floating[:] def get_reverse_indexer( indexer: np.ndarray, # const intp_t[:] length: int, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 859cb8e5ebead..0c0610f72044e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -530,6 +530,22 @@ def has_infs(floating[:] arr) -> bool: return ret +@cython.boundscheck(False) +@cython.wraparound(False) +def has_only_ints_or_nan(floating[:] arr) -> bool: + cdef: + floating val + intp_t i + + for i in range(len(arr)): + val = arr[i] + if (val != val) or (val == <int64_t>val): + continue + else: + return False + return True + + def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6df90eece3cc8..7e647426e9726 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,6 +18,7 @@ from pandas._config import using_copy_on_write from pandas._libs import ( + NaT, internals as libinternals, lib, writers, @@ -60,7 +61,10 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_float_dtype, + is_integer_dtype, is_list_like, + is_scalar, is_string_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -454,6 +458,25 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and will receive the same block """ new_dtype = find_result_type(self.values.dtype, other) + + # In a future version of pandas, the default will be that + # setting `nan` into an integer series won't raise. + if ( + is_scalar(other) + and is_integer_dtype(self.values.dtype) + and isna(other) + and other is not NaT + ): + warn_on_upcast = False + elif ( + isinstance(other, np.ndarray) + and other.ndim == 1 + and is_integer_dtype(self.values.dtype) + and is_float_dtype(other.dtype) + and lib.has_only_ints_or_nan(other) + ): + warn_on_upcast = False + if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 288aa1af746b6..b324291bab31e 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write): def test_setitem2(self): # dtype changing GH4204 df = DataFrame([[0, 0]]) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - df.iloc[0] = np.nan + df.iloc[0] = np.nan expected = DataFrame([[np.nan, np.nan]]) tm.assert_frame_equal(df, expected) df = DataFrame([[0, 0]]) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - df.loc[0] = np.nan + df.loc[0] = np.nan tm.assert_frame_equal(df, expected) def test_setitem_boolean(self, float_frame): @@ -1579,9 +1573,7 @@ def test_setitem(self, uint64_frame): # With NaN: because uint64 has no NaN element, # the column should be cast to object. df2 = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT result = df2["B"] @@ -1901,19 +1893,19 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors.
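Illustration (not part of the patch): the net effect of the new has_only_ints_or_nan helper and the coerce_to_target_dtype special cases above is that NaN can now be set into integer columns without the incompatible-dtype FutureWarning. A minimal sketch, assuming pandas 2.1:

import numpy as np
import pandas as pd

ser = pd.Series([1, 2, 3], dtype="int64")
ser[0] = np.nan  # scalar NaN: silently upcasts instead of warning
assert ser.dtype == "float64"

ser = pd.Series([1, 2, 3], dtype="int64")
ser[:] = np.array([1.0, np.nan, 3.0])  # floats that are all whole numbers or NaN: same
assert ser.dtype == "float64"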
- def _check_setitem_invalid(self, df, invalid, indexer): + def _check_setitem_invalid(self, df, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_df = df.copy() # iloc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.loc[indexer, "a"] = invalid df = orig_df.copy() @@ -1934,16 +1926,20 @@ def _check_setitem_invalid(self, df, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + if isna(invalid) and invalid is not pd.NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(df, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 224555399ff8f..54e204c43dadd 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[0, ["foo"]] = None + start_dataframe.loc[0, ["foo"]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[ - start_dataframe["foo"] == start_dataframe["foo"][0] - ] = None + start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self): "d": ["a", 
"b", "c"], } ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - start_dataframe.iloc[0] = None + start_dataframe.iloc[0] = None exp = DataFrame( { diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 7b857a487db78..0fa28920d41bd 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -19,6 +19,7 @@ Timestamp, concat, date_range, + isna, period_range, timedelta_range, ) @@ -456,25 +457,25 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. - def _check_setitem_invalid(self, ser, invalid, indexer): + def _check_setitem_invalid(self, ser, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_ser = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[:] = invalid _invalid_scalars = [ @@ -494,16 +495,20 @@ def _check_setitem_invalid(self, ser, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + if isna(invalid) and invalid is not NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(ser, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index bc596fb0a3abe..d42e884c1ac78 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -191,14 +191,11 @@ def test_setitem_series_object_dtype(self, indexer, ser_index): expected = Series([Series([42], index=[ser_index]), 0], dtype="object") tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize( - "index, exp_value, warn", [(0, 42, None), (1, np.nan, FutureWarning)] - ) - def test_setitem_series(self, index, exp_value, warn): + @pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)]) + def test_setitem_series(self, index, exp_value): # 
GH#38303 ser = Series([0, 0]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): - ser.loc[0] = Series([42], index=[index]) + ser.loc[0] = Series([42], index=[index]) expected = Series([exp_value, 0]) tm.assert_series_equal(ser, expected) @@ -575,7 +572,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): [ (NA, NA, "Int64", "Int64", 1, None), (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, FutureWarning), + (NA, np.nan, "int64", "float64", 1, None), (NA, np.nan, "int64", "float64", 2, None), (NaT, NaT, "int64", "object", 1, FutureWarning), (NaT, NaT, "int64", "object", 2, None), @@ -583,7 +580,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): (np.nan, NA, "Int64", "Int64", 2, None), (np.nan, NA, "Float64", "Float64", 1, None), (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, FutureWarning), + (np.nan, np.nan, "int64", "float64", 1, None), (np.nan, np.nan, "int64", "float64", 2, None), ], ) @@ -592,7 +589,7 @@ def test_setitem_enlarge_with_na( ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + with tm.assert_produces_warning(warn, match="incompatible dtype"): ser[indexer] = na expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] expected = Series(expected_values, dtype=target_dtype) @@ -884,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - FutureWarning, + None, id="int_series_slice_key_step", ), pytest.param( @@ -899,7 +896,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - FutureWarning, + None, id="int_series_slice_key", ), pytest.param( @@ -907,7 +904,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - FutureWarning, + None, id="int_series_int_key", ), pytest.param( @@ -1134,7 +1131,7 @@ def warn(self): "obj,expected,warn", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), FutureWarning), + (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), # For datetime series, we should coerce to NaT. 
( @@ -1584,13 +1581,11 @@ def test_20643_comment(): expected = Series([np.nan, 1, 2], index=["a", "b", "c"]) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iat[0] = None + ser.iat[0] = None tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iloc[0] = None + ser.iloc[0] = None tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9cd0ce250b5df..d1c79d0f00365 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -206,17 +206,7 @@ def test_convert_dtypes( # Test that it is a copy copy = series.copy(deep=True) - if result.notna().sum() > 0 and result.dtype in [ - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - "interval[int64, right]", - ]: + if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): result[result.notna()] = np.nan else: From 72cb1011366d1d6c661219b70a176948c0aaa675 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 28 Aug 2023 17:05:31 +0200 Subject: [PATCH 047/174] Backport PR #54591 on branch 2.1.x (Implement any and all for pyarrow numpy strings) (#54796) --- pandas/core/arrays/string_arrow.py | 13 +++++++++++++ pandas/tests/extension/base/reduce.py | 2 +- pandas/tests/extension/test_string.py | 6 ++++++ pandas/tests/reductions/test_reductions.py | 19 +++++++++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bc1d7cb52e196..87dd48b3569d8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -554,3 +554,16 @@ def value_counts(self, dropna: bool = True): return Series( result._values.to_numpy(), index=result.index, name=result.name, copy=False ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + arr = pc.and_kleene( + pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") + ) + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index a6532a6190467..43b2df4290eed 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -25,7 +25,7 @@ def check_reduce(self, s, op_name, skipna): try: alt = s.astype("float64") - except TypeError: + except (TypeError, ValueError): # e.g. 
Interval can't cast, so let's cast to object and do # the reduction pointwise alt = s.astype(object) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 069d53aeb248f..5176289994033 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -157,6 +157,12 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + return ( + ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + and op_name in ("any", "all") + ) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 87892a81cef3d..021252500e814 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1078,6 +1078,25 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() + def test_any_all_pyarrow_string(self): + # GH#54591 + pytest.importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + def test_timedelta64_analytics(self): # index min/max dti = date_range("2012-1-1", periods=3, freq="D") From 4daed68d100df6f3ae85815dbee50dcb01ccd69a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 10:25:36 -0700 Subject: [PATCH 048/174] Backport PR #54800 on branch 2.1.x (ExtensionArray.fillna: start with DeprecationWarning about added copy keyword) (#54812) Backport PR #54800: ExtensionArray.fillna: start with DeprecationWarning about added copy keyword Co-authored-by: Joris Van den Bossche --- pandas/core/internals/blocks.py | 2 +- pandas/tests/extension/decimal/test_decimal.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7e647426e9726..a33c5d9303421 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2015,7 +2015,7 @@ def fillna( "need to implement this keyword or an exception will be " "raised. 
In the interim, the keyword is ignored by " f"{type(self.values).__name__}.", - FutureWarning, + DeprecationWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index baa056550624f..17dfc4e801ca0 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -133,7 +133,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_fillna_frame(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_frame(data_missing) @@ -166,7 +166,7 @@ def test_fillna_no_op_returns_copy(self, data): def test_fillna_series(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_series(data_missing) @@ -178,13 +178,13 @@ def test_fillna_series_method(self, data_missing, fillna_method): super().test_fillna_series_method(data_missing, fillna_method) def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_frame(data_missing) def test_fillna_copy_series(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_series(data_missing) From e5d508ec50fcf269fe0b0f5d28412b0a9f3c828f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 10:25:59 -0700 Subject: [PATCH 049/174] Backport PR #54778 on branch 2.1.x (REGR: Index.union loses python string dtype) (#54813) Backport PR #54778: REGR: Index.union loses python string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_setops.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1f15a4ad84755..0db0a3608a1f0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5204,7 +5204,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) - elif isinstance(self.values, ArrowExtensionArray): + elif isinstance(self.values, (ArrowExtensionArray, StringArray)): return type(self.values)._from_sequence(result) return result diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 2fd203dbc77ed..a64994efec85a 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -899,3 +899,10 @@ def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): result = idx.union(idx2) expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) 
tm.assert_index_equal(result, expected) + + def test_union_string_array(self, any_string_dtype): + idx1 = Index(["a"], dtype=any_string_dtype) + idx2 = Index(["b"], dtype=any_string_dtype) + result = idx1.union(idx2) + expected = Index(["a", "b"], dtype=any_string_dtype) + tm.assert_index_equal(result, expected) From 891ea9be86715312986639d0cbf80314d6d59774 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 10:26:21 -0700 Subject: [PATCH 050/174] Backport PR #54803 on branch 2.1.x (DOC: update docstrings for ffill and bfill (no longer aliases for fillna with method)) (#54814) Backport PR #54803: DOC: update docstrings for ffill and bfill (no longer aliases for fillna with method) Co-authored-by: Joris Van den Bossche --- pandas/core/generic.py | 60 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b4281f6959997..fce9b35aa5c49 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7079,6 +7079,8 @@ def fillna( See Also -------- + ffill : Fill values by propagating the last valid observation to next valid. + bfill : Fill values by using the next valid observation to fill the gap. interpolate : Fill NaN values using interpolation. reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. @@ -7338,7 +7340,10 @@ def ffill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def ffill( self, *, @@ -7350,6 +7355,27 @@ def ffill( """ Fill NA/NaN values by propagating the last valid observation to next valid. + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + Returns ------- {klass} or None @@ -7417,7 +7443,7 @@ def pad( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + Fill NA/NaN values by propagating the last valid observation to next valid. .. deprecated:: 2.0 @@ -7474,7 +7500,10 @@ def bfill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def bfill( self, *, @@ -7484,7 +7513,28 @@ def bfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. 
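Illustration (not part of the patch): a short usage sketch of the two methods these docstrings describe, with behavior as in pandas 2.x:

import numpy as np
import pandas as pd

ser = pd.Series([np.nan, 2.0, np.nan, 4.0])
print(ser.ffill().tolist())  # [nan, 2.0, 2.0, 4.0] -- last valid observation carried forward
print(ser.bfill().tolist())  # [2.0, 2.0, 4.0, 4.0] -- next valid observation pulled backward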
+ + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). Returns ------- @@ -7563,7 +7613,7 @@ def backfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. .. deprecated:: 2.0 From 7961b1c403827bf2672e4f1ed6c29a6e2c901b2e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 12:36:40 -0700 Subject: [PATCH 051/174] Backport PR #54801 on branch 2.1.x (BUG: repr aligning left for string dtype columns) (#54819) Backport PR #54801: BUG: repr aligning left for string dtype columns Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/indexes/base.py | 4 ++-- pandas/tests/frame/test_repr_info.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 19a8500928ab7..8f2667d69a322 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -717,6 +717,7 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str` that did not raise a ``TypeError`` when iterated (:issue:`54173`) +- Bug in ``repr`` for :class:`DataFrame`` with string-dtype columns (:issue:`54797`) Interval ^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0db0a3608a1f0..aa306da017d34 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1419,8 +1419,8 @@ def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t] values = self._values - if is_object_dtype(values.dtype): - values = cast(np.ndarray, values) + if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): + values = np.asarray(values) values = lib.maybe_convert_objects(values, safe=True) result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 49375658abfee..64d516e484991 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -455,3 +455,14 @@ def test_masked_ea_with_formatter(self): 0 0.12 1.00 1 1.12 2.00""" assert result == expected + + def test_repr_ea_columns(self, any_string_dtype): + # GH#54797 + pytest.importorskip("pyarrow") + df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) + df.columns = df.columns.astype(any_string_dtype) + expected = """ long_column_name col2 +0 1 4 +1 2 5 +2 3 6""" + assert repr(df) == 
expected From 4855f54921a4b423908c06e4fd6a48894089e7b2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 12:43:06 -0700 Subject: [PATCH 052/174] Backport PR #54737 on branch 2.1.x (MAINT: small simplification of meson.build following best practices) (#54799) * Backport PR #54737: MAINT: small simplification of meson.build following best practices * Update meson.build --------- Co-authored-by: Daniele Nicolodi Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- meson.build | 7 +------ pandas/_libs/window/meson.build | 2 -- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/meson.build b/meson.build index a927b59abeaf9..09a1494135af4 100644 --- a/meson.build +++ b/meson.build @@ -6,9 +6,6 @@ project( license: 'BSD-3', meson_version: '>=1.0.1', default_options: [ - # TODO: investigate, does meson try to compile against debug Python - # when buildtype = debug, this seems to be causing problems on CI - # where provided Python is not compiled in debug mode 'buildtype=release', # TODO: Reactivate werror, some warnings on Windows #'werror=true', ] ) -py_mod = import('python') fs = import('fs') -py = py_mod.find_installation('python') -py_dep = py.dependency() +py = import('python').find_installation() tempita = files('generate_pxi.py') versioneer = files('generate_version.py') diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index b83d8730f9447..85aa060a26406 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -3,7 +3,6 @@ py.extension_module( ['aggregations.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', override_options : ['cython_language=cpp'], install: true @@ -14,7 +13,6 @@ py.extension_module( ['indexers.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', install: true ) From 68b64b8b6b24c7cb9d5a2ec74ffb95bcba938a09 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:29:32 -0700 Subject: [PATCH 053/174] Backport PR #54820 on branch 2.1.x (REF: Dispatch ArrowExtensionArray.fillna methods to pad_or_backfill) (#54827) Backport PR #54820: REF: Dispatch ArrowExtensionArray.fillna methods to pad_or_backfill Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 528b085bd5236..70c59e94f6d27 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -40,7 +40,10 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna -from pandas.core import roperator +from pandas.core import ( + missing, + roperator, ) from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( @@ -933,6 +936,20 @@ def pad_or_backfill( # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() + if limit is None and limit_area is None: + method = missing.clean_fill_method(method) + try: + if method == "pad": + return
type(self)(pc.fill_null_forward(self._pa_array)) + elif method == "backfill": + return type(self)(pc.fill_null_backward(self._pa_array)) + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'coalesce' has no kernel + # matching input types (duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. return super().pad_or_backfill( @@ -953,9 +970,12 @@ def fillna( # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is not None or method is not None: + if limit is not None: return super().fillna(value=value, method=method, limit=limit, copy=copy) + if method is not None: + return super().pad_or_backfill(method=method, limit=limit, copy=copy) + if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may # end up passing it to the super() method. @@ -972,12 +992,7 @@ def fillna( raise TypeError(msg) from err try: - if method is None: - return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) - elif method == "pad": - return type(self)(pc.fill_null_forward(self._pa_array)) - elif method == "backfill": - return type(self)(pc.fill_null_backward(self._pa_array)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) From 0084f7799e17c8781474e77c48569bc5c95ba7ec Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Aug 2023 15:39:48 -0700 Subject: [PATCH 054/174] Backport PR #54823 on branch 2.1.x (REGR: clip raising for list-bound) (#54829) Backport PR #54823: REGR: clip raising for list-bound Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/frame.py | 1 + pandas/tests/frame/methods/test_clip.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c34df3e0e865c..cdd0545f0a3ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7929,6 +7929,7 @@ def to_series(right): ) elif isinstance(right, Series): # axis=1 is default for DataFrame-with-Series op + axis = axis if axis is not None else 1 if not flex: if not left.axes[axis].equals(right.index): raise ValueError( diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 9bd032a0aefc4..f7b221d84ab78 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -177,3 +177,14 @@ def test_clip_int_data_with_float_bound(self): result = df.clip(lower=1.5) expected = DataFrame({"a": [1.5, 2.0, 3.0]}) tm.assert_frame_equal(result, expected) + + def test_clip_with_list_bound(self): + # GH#54817 + df = DataFrame([1, 5]) + expected = DataFrame([3, 5]) + result = df.clip([3]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([1, 3]) + result = df.clip(upper=[3]) + tm.assert_frame_equal(result, expected) From 3b7f41102746421824342e4284aa68017865ecf6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 04:14:20 -0700 Subject: [PATCH 055/174] Backport PR #54794 on branch 2.1.x (Infer string storage based on infer_string option) (#54839) Backport 
PR #54794: Infer string storage based on infer_string option Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 6 +++++- pandas/core/arrays/string_.py | 6 +++++- pandas/core/config_init.py | 3 ++- pandas/tests/series/test_constructors.py | 8 ++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 8f2667d69a322..7aea1fa99f655 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -39,11 +39,15 @@ We are collecting feedback on this decision `here libmissing.NAType | float: # type: ignore[override] def __init__(self, storage=None) -> None: if storage is None: - storage = get_option("mode.string_storage") + infer_string = get_option("future.infer_string") + if infer_string: + storage = "pyarrow_numpy" + else: + storage = get_option("mode.string_storage") if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 745689ab1fcc8..765b24fbc7868 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -492,7 +492,8 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. + The default storage for StringDtype. This option is ignored if + ``future.infer_string`` is set to True. """ with cf.config_prefix("mode"): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ef734e9664844..2c3fdf627788a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self): ser = Series(np.array(["a", "b"])) tm.assert_series_equal(ser, expected) + def test_series_string_inference_storage_definition(self): + # GH#54793 + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="string") + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 772beaa50bf5bc15b2c46f1bfe00d51d01ffd086 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:49:53 -0700 Subject: [PATCH 056/174] Backport PR #54818 on branch 2.1.x (DEP: Deprecate passing fill_value and freq to shift) (#54849) Backport PR #54818: DEP: Deprecate passing fill_value and freq to shift Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 10 +++++++--- pandas/core/generic.py | 10 +++++++--- pandas/tests/frame/methods/test_shift.py | 19 +++++++++++++------ .../tests/groupby/test_groupby_shift_diff.py | 17 +++++++++++------ 5 files changed, 39 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7aea1fa99f655..ebb31733fe370 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -591,6 +591,7 @@ Other Deprecations - Deprecated values ``"pad"``, ``"ffill"``, ``"bfill"``, ``"backfill"`` for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) - Deprecated the behavior 
of :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`Series.argmax`, :meth:`Series.argmin` with either all-NAs and ``skipna=True`` or any-NAs and ``skipna=False`` returning -1; in a future version this will raise ``ValueError`` (:issue:`33941`, :issue:`33942`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_sql` except ``name`` and ``con`` (:issue:`54229`) +- Deprecated silently ignoring ``fill_value`` when passing both ``freq`` and ``fill_value`` to :meth:`DataFrame.shift`, :meth:`Series.shift` and :meth:`.DataFrameGroupBy.shift`; in a future version this will raise ``ValueError`` (:issue:`53832`) .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: @@ -875,7 +876,6 @@ Other - Bug in :meth:`.DataFrameGroupBy.first`, :meth:`.DataFrameGroupBy.last`, :meth:`.SeriesGroupBy.first`, and :meth:`.SeriesGroupBy.last` where an empty group would return ``np.nan`` instead of the corresponding :class:`.ExtensionArray` NA value (:issue:`39098`) - Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) -- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`.DataFrameGroupBy.shift` when passing both ``freq`` and ``fill_value`` silently ignoring ``fill_value`` instead of raising ``ValueError`` (:issue:`53832`) - Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`) - Bug in :meth:`Index.sort_values` when a ``key`` is passed (:issue:`52764`) - Bug in :meth:`Series.align`, :meth:`DataFrame.align`, :meth:`Series.reindex`, :meth:`DataFrame.reindex`, :meth:`Series.interpolate`, :meth:`DataFrame.interpolate`, incorrectly failing to raise with method="asfreq" (:issue:`53620`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cdd0545f0a3ec..471db115da8df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5617,10 +5617,14 @@ def shift( ) -> DataFrame: if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fce9b35aa5c49..f6bf51f65049a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10802,10 +10802,14 @@ def shift( if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. 
This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default if periods == 0: return self.copy(deep=None) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 808f0cff2485c..1e88152137b1f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -32,19 +32,23 @@ def test_shift_axis1_with_valid_fill_value_one_array(self): expected2 = DataFrame([12345] * 5, dtype="Float64") tm.assert_frame_equal(res2, expected2) - def test_shift_disallow_freq_and_fill_value(self, frame_or_series): + def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): # Can't pass both! obj = frame_or_series( np.random.default_rng(2).standard_normal(5), index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, fill_value=1, freq="H") if frame_or_series is DataFrame: - with pytest.raises(ValueError, match=msg): + obj.columns = date_range("1/1/2000", periods=1, freq="H") + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, axis=1, fill_value=1, freq="H") @pytest.mark.parametrize( @@ -716,8 +720,11 @@ def test_shift_with_iterable_freq_and_fill_value(self): df.shift(1, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.shift([1, 2], fill_value=1, freq="H") def test_shift_with_iterable_check_other_arguments(self): diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index 495f3fcd359c7..bb4b9aa866ac9 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -166,12 +166,14 @@ def test_shift_periods_freq(): tm.assert_frame_equal(result, expected) -def test_shift_disallow_freq_and_fill_value(): +def test_shift_deprecate_freq_and_fill_value(): # GH 53832 data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} df = DataFrame(data, index=date_range(start="20100101", periods=6)) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") @@ -238,12 +240,15 @@ def test_group_shift_with_multiple_periods_and_fill_value(): tm.assert_frame_equal(shifted_df, expected_df) -def test_group_shift_with_multiple_periods_and_both_fill_and_freq_fails(): +def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): 
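        # Editorial sketch, not part of the original hunk: the call below is the
        # deprecated pattern this test pins down. Under 2.1.x a call such as
        #     df.groupby("b")[["a"]].shift(1, fill_value=0, freq="H")
        # emits a FutureWarning and silently ignores fill_value, returning the
        # freq-shifted result as if no fill_value had been passed.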
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") From 11424f82f5e1c65c088fe3c9dd2e5da38290c840 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:54:11 -0700 Subject: [PATCH 057/174] Backport PR #54838 on branch 2.1.x (Don't expose EA.pad_or_backfill to users + switch to DeprecationWarning) (#54851) Backport PR #54838: Don't expose EA.pad_or_backfill to users + switch to DeprecationWarning Co-authored-by: Joris Van den Bossche --- doc/source/reference/extensions.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/arrays/_mixins.py | 9 +-- pandas/core/arrays/arrow/array.py | 17 ++--- pandas/core/arrays/base.py | 32 ++++------ pandas/core/arrays/interval.py | 13 +--- pandas/core/arrays/masked.py | 9 +-- pandas/core/arrays/numpy_.py | 5 +- pandas/core/arrays/period.py | 13 +--- pandas/core/arrays/sparse/array.py | 13 +--- pandas/core/internals/blocks.py | 6 +- pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/arrays/test_datetimes.py | 10 +-- pandas/tests/extension/base/dim2.py | 8 +-- pandas/tests/extension/base/missing.py | 2 +- pandas/tests/extension/decimal/array.py | 2 +- .../tests/extension/decimal/test_decimal.py | 62 +++++++++++++++---- 17 files changed, 102 insertions(+), 105 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index eb2529716b55e..e177e2b1d87d5 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -39,6 +39,7 @@ objects. api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object + api.extensions.ExtensionArray._pad_or_backfill api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize @@ -54,7 +55,6 @@ objects. api.extensions.ExtensionArray.interpolate api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna - api.extensions.ExtensionArray.pad_or_backfill api.extensions.ExtensionArray.ravel api.extensions.ExtensionArray.repeat api.extensions.ExtensionArray.searchsorted diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ebb31733fe370..0928569641b5b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -583,7 +583,7 @@ Other Deprecations - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) - Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. 
``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`) - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`) -- Deprecated the "method" and "limit" keywords in ``.ExtensionArray.fillna``, implement and use ``pad_or_backfill`` instead (:issue:`53621`) +- Deprecated the "method" and "limit" keywords in ``.ExtensionArray.fillna``, implement ``_pad_or_backfill`` instead (:issue:`53621`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated the ``method`` and ``limit`` keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`.SeriesGroupBy.fillna`, :meth:`.DataFrameGroupBy.fillna`, and :meth:`.Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the behavior of :meth:`Series.__getitem__`, :meth:`Series.__setitem__`, :meth:`DataFrame.__getitem__`, :meth:`DataFrame.__setitem__` with an integer slice on objects with a floating-dtype index, in a future version this will be treated as *positional* indexing (:issue:`49612`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 359e0161e763c..6d21f4a1ac8a2 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -296,13 +296,8 @@ def _fill_mask_inplace( func = missing.get_fill_func(method, ndim=self.ndim) func(self._ndarray.T, limit=limit, mask=mask.T) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: mask = self.isna() if mask.any(): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 70c59e94f6d27..19443e2069270 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -924,19 +924,14 @@ def dropna(self) -> Self: """ return type(self)(pc.drop_null(self._pa_array)) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is not None and limit_area is not None: + if limit is None: method = missing.clean_fill_method(method) try: if method == "pad": @@ -952,9 +947,7 @@ def pad_or_backfill( # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. 
- return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) @doc(ExtensionArray.fillna) def fillna( @@ -974,7 +967,7 @@ def fillna( return super().fillna(value=value, method=method, limit=limit, copy=copy) if method is not None: - return super().pad_or_backfill(method=method, limit=limit, copy=copy) + return super().fillna(method=method, limit=limit, copy=copy) if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7babce46a3977..a21253db59323 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -132,7 +132,6 @@ class ExtensionArray: interpolate isin isna - pad_or_backfill ravel repeat searchsorted @@ -148,6 +147,7 @@ class ExtensionArray: _from_sequence _from_sequence_of_strings _hash_pandas_object + _pad_or_backfill _reduce _values_for_argsort _values_for_factorize @@ -183,7 +183,7 @@ class ExtensionArray: methods: * fillna - * pad_or_backfill + * _pad_or_backfill * dropna * unique * factorize / _values_for_factorize @@ -917,13 +917,8 @@ def interpolate( f"{type(self).__name__} does not implement interpolate" ) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -959,26 +954,26 @@ def pad_or_backfill( Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) - >>> arr.pad_or_backfill(method="backfill", limit=1) + >>> arr._pad_or_backfill(method="backfill", limit=1) [, 2, 2, 3, , ] Length: 6, dtype: Int64 """ # If a 3rd-party EA has implemented this functionality in fillna, - # we warn that they need to implement pad_or_backfill instead. + # we warn that they need to implement _pad_or_backfill instead. if ( type(self).fillna is not ExtensionArray.fillna - and type(self).pad_or_backfill is ExtensionArray.pad_or_backfill + and type(self)._pad_or_backfill is ExtensionArray._pad_or_backfill ): - # Check for pad_or_backfill here allows us to call - # super().pad_or_backfill without getting this warning + # Check for _pad_or_backfill here allows us to call + # super()._pad_or_backfill without getting this warning warnings.warn( "ExtensionArray.fillna 'method' keyword is deprecated. " - "In a future version. arr.pad_or_backfill will be called " + "In a future version. arr._pad_or_backfill will be called " "instead. 3rd-party ExtensionArray authors need to implement " - "pad_or_backfill.", - FutureWarning, + "_pad_or_backfill.", + DeprecationWarning, stacklevel=find_stack_level(), ) return self.fillna(method=method, limit=limit) @@ -1062,8 +1057,7 @@ def fillna( if method is not None: warnings.warn( f"The 'method' keyword in {type(self).__name__}.fillna is " - "deprecated and will be removed in a future version. 
" - "Use pad_or_backfill instead.", + "deprecated and will be removed in a future version.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2fce6631afa9b..d0510ede5a366 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -890,19 +890,12 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bec875f2bbfa1..2cf28c28427ab 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -190,13 +190,8 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: mask = self._mask diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 99a3586871d10..efe0c0df45e00 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -240,7 +240,10 @@ def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: fv = np.nan return self._ndarray, fv - def pad_or_backfill( + # Base EA class (and all other EA classes) don't have limit_area keyword + # This can be removed here as well when the interpolate ffill/bfill method + # deprecation is enforced + def _pad_or_backfill( self, *, method: FillnaOptions, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 024cd6d4aad14..a2e4b595c42aa 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -791,20 +791,13 @@ def searchsorted( m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(npvalue, side=side, sorter=sorter) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta.pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) if copy: return cast("Self", result.view(self.dtype)) else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d32c98535d7cb..e38fa0a3bdae5 100644 --- a/pandas/core/arrays/sparse/array.py +++ 
b/pandas/core/arrays/sparse/array.py @@ -712,19 +712,12 @@ def isna(self): mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a33c5d9303421..c0b78e734b4b4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1452,7 +1452,7 @@ def pad_or_backfill( vals = cast(NumpyExtensionArray, self.array_values) if axis == 1: vals = vals.T - new_values = vals.pad_or_backfill( + new_values = vals._pad_or_backfill( method=method, limit=limit, limit_area=limit_area, @@ -1955,9 +1955,9 @@ def pad_or_backfill( if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T.pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(method=method, limit=limit).T else: - new_values = values.pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(method=method, limit=limit) return [self.make_block_same_class(new_values)] diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 20530e37116f2..96aab94b24ddd 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -258,7 +258,7 @@ def test_fillna_method_doesnt_change_orig(self, method): fill_value = arr[3] if method == "pad" else arr[5] - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) assert result[4] == fill_value # check that the original was not changed diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e38a8c741b8d..c2d68a79f32d4 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -497,7 +497,7 @@ def test_fillna_preserves_tz(self, method): dtype=DatetimeTZDtype(tz="US/Central"), ) - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) tm.assert_extension_array_equal(result, expected) # assert that arr and dti were not modified in-place @@ -510,12 +510,12 @@ def test_fillna_2d(self): dta[0, 1] = pd.NaT dta[1, 0] = pd.NaT - res1 = dta.pad_or_backfill(method="pad") + res1 = dta._pad_or_backfill(method="pad") expected1 = dta.copy() expected1[1, 0] = dta[0, 0] tm.assert_extension_array_equal(res1, expected1) - res2 = dta.pad_or_backfill(method="backfill") + res2 = dta._pad_or_backfill(method="backfill") expected2 = dta.copy() expected2 = dta.copy() expected2[1, 0] = dta[2, 0] @@ -529,10 +529,10 @@ def test_fillna_2d(self): assert not dta2._ndarray.flags["C_CONTIGUOUS"] tm.assert_extension_array_equal(dta, dta2) - res3 = dta2.pad_or_backfill(method="pad") + res3 = dta2._pad_or_backfill(method="pad") tm.assert_extension_array_equal(res3, expected1) - res4 = 
dta2.pad_or_backfill(method="backfill") + res4 = dta2._pad_or_backfill(method="backfill") tm.assert_extension_array_equal(res4, expected2) # test the DataFrame method while we're here diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index a0c24ee068e81..f1d787041d10b 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -160,9 +160,9 @@ def test_fillna_2d_method(self, data_missing, method): assert arr[0].isna().all() assert not arr[1].isna().any() - result = arr.pad_or_backfill(method=method, limit=None) + result = arr._pad_or_backfill(method=method, limit=None) - expected = data_missing.pad_or_backfill(method=method).repeat(2).reshape(2, 2) + expected = data_missing._pad_or_backfill(method=method).repeat(2).reshape(2, 2) tm.assert_extension_array_equal(result, expected) # Reverse so that backfill is not a no-op. @@ -170,10 +170,10 @@ def test_fillna_2d_method(self, data_missing, method): assert not arr2[0].isna().any() assert arr2[1].isna().all() - result2 = arr2.pad_or_backfill(method=method, limit=None) + result2 = arr2._pad_or_backfill(method=method, limit=None) expected2 = ( - data_missing[::-1].pad_or_backfill(method=method).repeat(2).reshape(2, 2) + data_missing[::-1]._pad_or_backfill(method=method).repeat(2).reshape(2, 2) ) tm.assert_extension_array_equal(result2, expected2) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 2a0bd0770dc07..40cc952d44200 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -94,7 +94,7 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data tm.assert_extension_array_equal(result, data) - result = data.pad_or_backfill(method="backfill") + result = data._pad_or_backfill(method="backfill") assert result is not data tm.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d2c3cb395ed44..9ce7ac309b6d3 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -285,7 +285,7 @@ def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) # We override fillna here to simulate a 3rd party EA that has done so. This - # lets us test the deprecation telling authors to implement pad_or_backfill + # lets us test the deprecation telling authors to implement _pad_or_backfill # Simulate a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. 
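    # Editorial sketch, not part of the original hunk: a 3rd-party EA that has
    # migrated would implement the private hook instead, for example
    #     def _pad_or_backfill(self, *, method, limit=None, copy=True):
    #         return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
    # and drop the 'method' keyword from its fillna signature entirely.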
# error: Signature of "fillna" incompatible with supertype "ExtensionArray" diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 17dfc4e801ca0..e61a2894b9aa7 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -140,26 +140,59 @@ def test_fillna_frame(self, data_missing): def test_fillna_limit_pad(self, data_missing): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_pad(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_pad(data_missing) def test_fillna_limit_backfill(self, data_missing): - msg = "|".join( - [ - "ExtensionArray.fillna added a 'copy' keyword", - "Series.fillna with 'method' is deprecated", - ] - ) + msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_backfill(data_missing) - def test_fillna_no_op_returns_copy(self, data): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + def test_fillna_no_op_returns_copy(self, data): + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_no_op_returns_copy(data) @@ -171,9 +204,14 @@ def test_fillna_series(self, data_missing): super().test_fillna_series(data_missing) def test_fillna_series_method(self, data_missing, fillna_method): - msg = "ExtensionArray.fillna 'method' keyword is deprecated" + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_series_method(data_missing, fillna_method) From f7f2057dddad297e900f6f6440976003f7dc5c51 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:54:40 -0700 Subject: [PATCH 058/174] Backport PR #54815 on branch 2.1.x (DOC: added docstring for `storage_options` in `read_html`) (#54852) Backport PR #54815: DOC: added docstring for `storage_options` in `read_html` Co-authored-by: Rajat Subhra Mukherjee --- pandas/io/html.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git 
a/pandas/io/html.py b/pandas/io/html.py index 8a73c786825e3..10701be4f7e0b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -23,6 +23,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -32,6 +33,7 @@ from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( file_exists, @@ -363,13 +365,13 @@ def _parse_tfoot_tr(self, table): """ raise AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): """ Return all tables from the parsed DOM. Parameters ---------- - doc : the DOM from which to parse the table element. + document : the DOM from which to parse the table element. match : str or regular expression The text to search for in the DOM tree. @@ -594,9 +596,9 @@ def __init__(self, *args, **kwargs) -> None: self._strainer = SoupStrainer("table") - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): element_name = self._strainer.name - tables = doc.find_all(element_name, attrs=attrs) + tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -726,7 +728,7 @@ def _parse_td(self, row): # or (see _parse_thead_tr). return row.xpath("./td|./th") - def _parse_tables(self, doc, match, kwargs): + def _parse_tables(self, document, match, kwargs): pattern = match.pattern # 1. check all descendants for the given pattern and only search tables @@ -738,7 +740,7 @@ def _parse_tables(self, doc, match, kwargs): if kwargs: xpath_expr += _build_xpath_expr(kwargs) - tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = document.xpath(xpath_expr, namespaces=_re_namespace) tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: @@ -1026,6 +1028,7 @@ def _parse( return ret +@doc(storage_options=_shared_docs["storage_options"]) def read_html( io: FilePath | ReadBuffer[str], *, @@ -1096,13 +1099,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {'id': 'table'} + attrs = {{'id': 'table'}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - attrs = {'asdf': 'table'} + attrs = {{'asdf': 'table'}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 @@ -1144,13 +1147,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_links : {None, "all", "header", "body", "footer"} + extract_links : {{None, "all", "header", "body", "footer"}} Table elements in the specified section(s) with tags will have their href extracted. .. versionadded:: 1.5.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -1161,6 +1164,10 @@ def read_html( .. versionadded:: 2.0 + {storage_options} + + .. 
versionadded:: 2.1.0 + Returns ------- dfs From 7f469dc0acac76c72ace0a5e6e8656feec2f1e72 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:06:15 -0700 Subject: [PATCH 059/174] Backport PR #54850 on branch 2.1.x (DOC: Add date for v2.1.0) (#54864) Backport PR #54850: DOC: Add date for v2.1.0 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0928569641b5b..1f627af02dc75 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_210: -What's new in 2.1.0 (Aug 11, 2023) +What's new in 2.1.0 (Aug 30, 2023) -------------------------------------- These are the changes in pandas 2.1.0. See :ref:`release` for a full changelog @@ -890,3 +890,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.0.3..v2.1.0|HEAD From bf59a956626cb39dbd68f4256d295d49e70016c1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:25:48 -0700 Subject: [PATCH 060/174] Backport PR #54856 on branch 2.1.x (BUG: ArrowExtensionArray.factorize with chunked dictionary array) (#54863) Backport PR #54856: BUG: ArrowExtensionArray.factorize with chunked dictionary array Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/arrow/array.py | 6 ++++-- pandas/tests/extension/test_arrow.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 1f627af02dc75..040ca048d1224 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -847,6 +847,7 @@ ExtensionArray - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) +- Bug in :meth:`~arrays.ArrowExtensionArray.factorize` returning incorrect uniques for a ``pyarrow.dictionary`` type ``pyarrow.chunked_array`` with more than one chunk (:issue:`54844`) - Bug when passing an :class:`ExtensionArray` subclass to ``dtype`` keywords. This will now raise a ``UserWarning`` to encourage passing an instance instead (:issue:`31356`, :issue:`54592`) - Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 19443e2069270..61ee157376656 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1043,13 +1043,15 @@ def factorize( indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) else: - pa_indices = encoded.combine_chunks().indices + # GH 54844 + combined = encoded.combine_chunks() + pa_indices = combined.indices if pa_indices.null_count > 0: pa_indices = pc.fill_null(pa_indices, -1) indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( np.intp, copy=False ) - uniques = type(self)(encoded.chunk(0).dictionary) + uniques = type(self)(combined.dictionary) if pa_version_under11p0 and pa.types.is_duration(pa_type): uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 86a7f20214a9a..a2fee31d9f01b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3059,3 +3059,16 @@ def test_duration_fillna_numpy(pa_type): result = ser1.fillna(ser2) expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type)) tm.assert_series_equal(result, expected) + + +def test_factorize_chunked_dictionary(): + # GH 54844 + pa_array = pa.chunked_array( + [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()] + ) + ser = pd.Series(ArrowExtensionArray(pa_array)) + res_indices, res_uniques = ser.factorize() + exp_indicies = np.array([0, 1], dtype=np.intp) + exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks())) + tm.assert_numpy_array_equal(res_indices, exp_indicies) + tm.assert_index_equal(res_uniques, exp_uniques) From ba1cccd19da778f0c3a7d6a885685da16a072870 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 30 Aug 2023 07:23:59 -0400 Subject: [PATCH 061/174] RLS: 2.1.0 (#54866) Co-authored-by: Pandas Development Team From 9dc8da88005c6e24cfeee0dfde4acb2134751800 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Aug 2023 03:46:28 -0700 Subject: [PATCH 062/174] Backport PR #54879 on branch 2.1.x (DOC: Add release notes for 2.1.1) (#54891) Backport PR #54879: DOC: Add release notes for 2.1.1 Co-authored-by: Luke Manley --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.1.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.1.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index fc225d0f44497..c21fbc28ae68f 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.1 v2.1.0 Version 2.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst new file mode 100644 index 0000000000000..66f6d59d08cb2 --- /dev/null +++ b/doc/source/whatsnew/v2.1.1.rst @@ -0,0 +1,36 @@ +.. _whatsnew_211: + +What's new in 2.1.1 (September XX, 2023) +---------------------------------------- + +These are the changes in pandas 2.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_211.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_211.contributors: + +Contributors +~~~~~~~~~~~~ From c85679c9e796dd8af19dda63969223aa2aa9ef97 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Aug 2023 08:58:32 -0700 Subject: [PATCH 063/174] Backport PR #54885 on branch 2.1.x (REGR: setitem with part of a MultiIndex raises) (#54899) Backport PR #54885: REGR: setitem with part of a MultiIndex raises Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.1.rst | 2 +- pandas/core/indexes/multi.py | 6 +++--- .../tests/indexing/multiindex/test_setitem.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 66f6d59d08cb2..f5aa0968ae362 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) .. --------------------------------------------------------------------------- .. _whatsnew_211.bug_fixes: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 33eb411374e67..bdc9e05a38d1c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2460,12 +2460,12 @@ def _reorder_ilevels(self, order) -> MultiIndex: def _recode_for_new_levels( self, new_levels, copy: bool = True ) -> Generator[np.ndarray, None, None]: - if len(new_levels) != self.nlevels: + if len(new_levels) > self.nlevels: raise AssertionError( f"Length of new_levels ({len(new_levels)}) " - f"must be same as self.nlevels ({self.nlevels})" + f"must be <= self.nlevels ({self.nlevels})" ) - for i in range(self.nlevels): + for i in range(len(new_levels)): yield recode_for_categories( self.codes[i], self.levels[i], new_levels[i], copy=copy ) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 960a8a59ba5a7..51b94c3beeb6f 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -556,3 +556,21 @@ def test_frame_setitem_copy_no_write( result = df tm.assert_frame_equal(result, expected) + + +def test_frame_setitem_partial_multiindex(): + # GH 54875 + df = DataFrame( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": 6, + "d": 7, + } + ).set_index(["a", "b", "c"]) + ser = Series(8, index=df.index.droplevel("c")) + result = df.copy() + result["d"] = ser + expected = df.copy() + expected["d"] = 8 + tm.assert_frame_equal(result, expected) From c1353e9df2022a1e52eb2e7e68145bf9bad8bc1c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Aug 2023 11:59:03 -0700 Subject: [PATCH 064/174] Backport PR #54894 on branch 2.1.x (MAINT: Reflect changes from `numpy` namespace refactor Part 5) (#54905) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR 
#54894: MAINT: Reflect changes from `numpy` namespace refactor Part 5 Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- pandas/core/dtypes/missing.py | 6 ++++-- pandas/core/frame.py | 6 +++--- pandas/core/internals/construction.py | 2 +- pandas/io/stata.py | 4 ++-- pandas/tests/frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/methods/test_to_records.py | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index de99f828d604f..7117e34b23ca4 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -285,7 +285,7 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has # type "ndarray[Any, dtype[bool_]]") result = values.isna() # type: ignore[assignment] - elif isinstance(values, np.recarray): + elif isinstance(values, np.rec.recarray): # GH 48526 result = _isna_recarray_dtype(values, inf_as_na=inf_as_na) elif is_string_or_object_np_dtype(values.dtype): @@ -332,7 +332,9 @@ def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_: return np.any(is_inf_in_record) -def _isna_recarray_dtype(values: np.recarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: +def _isna_recarray_dtype( + values: np.rec.recarray, inf_as_na: bool +) -> npt.NDArray[np.bool_]: result = np.zeros(values.shape, dtype=bool) for i, record in enumerate(values): record_as_array = np.array(record.tolist()) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 471db115da8df..1614ecf1d7037 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2402,7 +2402,7 @@ def maybe_reorder( def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None - ) -> np.recarray: + ) -> np.rec.recarray: """ Convert DataFrame to a NumPy record array. @@ -2427,7 +2427,7 @@ def to_records( Returns ------- - numpy.recarray + numpy.rec.recarray NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries. @@ -2435,7 +2435,7 @@ def to_records( -------- DataFrame.from_records: Convert structured or record ndarray to DataFrame. - numpy.recarray: An ndarray that allows field access using + numpy.rec.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. 
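Editorial aside, not part of the patch: both spellings currently name the same
class; the hunk above simply standardizes on the ``np.rec`` namespace ahead of
NumPy's namespace cleanup. A minimal check, assuming a current NumPy:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]})
    rec = df.to_records(index=False)
    # np.rec.recarray is the canonical spelling; np.recarray is the same object
    assert isinstance(rec, np.rec.recarray)
    assert np.recarray is np.rec.recarray
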
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 8aafb9dd623c5..05aa9f38bb2b0 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -159,7 +159,7 @@ def arrays_to_mgr( def rec_array_to_mgr( - data: np.recarray | np.ndarray, + data: np.rec.recarray | np.ndarray, index, columns, dtype: DtypeObj | None, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 698a2882ada39..9fea451940930 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2961,7 +2961,7 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: """No-op, future compatibility""" return data - def _prepare_data(self) -> np.recarray: + def _prepare_data(self) -> np.rec.recarray: data = self.data typlist = self.typlist convert_dates = self._convert_dates @@ -2993,7 +2993,7 @@ def _prepare_data(self) -> np.recarray: return data.to_records(index=False, column_dtypes=dtypes) - def _write_data(self, records: np.recarray) -> None: + def _write_data(self, records: np.rec.recarray) -> None: self._write_bytes(records.tobytes()) @staticmethod diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 59dca5055f170..7dffa7bb242d5 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -93,7 +93,7 @@ def test_from_records_sequencelike(self): tup.extend(b.iloc[i].values) tuples.append(tuple(tup)) - recarray = np.array(tuples, dtype=dtypes).view(np.recarray) + recarray = np.array(tuples, dtype=dtypes).view(np.rec.recarray) recarray2 = df.to_records() lists = [list(x) for x in tuples] diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 8853d718270f4..7df52a1fb903f 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -391,7 +391,7 @@ def test_to_records_dtype(self, kwargs, expected): # see GH#18146 df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) - if not isinstance(expected, np.recarray): + if not isinstance(expected, np.rec.recarray): with pytest.raises(expected[0], match=expected[1]): df.to_records(**kwargs) else: From cbff8f19747431b725dd132989b6bf987bb4b80c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:58:13 -0700 Subject: [PATCH 065/174] Backport PR #54896 on branch 2.1.x (DOC: updated docstring for deprecation of axis=1 in groupby) (#54912) Backport PR #54896: DOC: updated docstring for deprecation of axis=1 in groupby Co-authored-by: Rajat Subhra Mukherjee --- pandas/core/shared_docs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 7579f816d0ace..ba793b9c11c27 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -113,6 +113,12 @@ axis : {0 or 'index', 1 or 'columns'}, default 0 Split along rows (0) or columns (1). For `Series` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + + Will be removed and behave like axis=0 in a future version. + For ``axis=1``, do ``frame.T.groupby(...)`` instead. + level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both ``by`` and ``level``. 
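Editorial illustration, not part of the patch series: the ``axis=1``
deprecation documented in the docstring above, together with the replacement
it recommends. A minimal sketch assuming pandas 2.1:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2], "y": [3, 4]})

    # Deprecated in 2.1: grouping along columns
    # grouped = df.groupby(df.dtypes, axis=1).sum()

    # Recommended spelling: transpose, group along rows, transpose back
    grouped = df.T.groupby(df.dtypes).sum().T

Both spellings group the two integer columns together; only the transposed
form survives the deprecation.
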
From 9c8800dc4ccb98a1c3a94f45f4c4598b6141a03a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Sep 2023 09:17:12 -0700 Subject: [PATCH 066/174] Backport PR #54880 on branch 2.1.x (REGR: comparing datetime vs None raises) (#54925) Backport PR #54880: REGR: comparing datetime vs None raises Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/datetimelike.py | 9 +++++++-- pandas/tests/series/indexing/test_datetime.py | 9 +++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index f5aa0968ae362..f5e4446c927f9 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) .. --------------------------------------------------------------------------- .. _whatsnew_211.bug_fixes: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c3b8d1c0e79e8..1a6438b2445c4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -107,6 +107,7 @@ algorithms, missing, nanops, + ops, ) from pandas.core.algorithms import ( checked_add_with_arr, @@ -943,8 +944,12 @@ def _cmp_method(self, other, op): dtype = getattr(other, "dtype", None) if is_object_dtype(dtype): - return op(np.asarray(self, dtype=object), other) - + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would raise when comparing to None + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result if other is NaT: if op is operator.ne: result = np.ones(self.shape, dtype=bool) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 8daaf087085c6..11d66d4820fe6 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -486,3 +486,12 @@ def test_getitem_str_second_with_datetimeindex(): msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] + + +def test_compare_datetime_with_all_none(): + # GH#54870 + ser = Series(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]") + ser2 = Series([None, None]) + result = ser > ser2 + expected = Series([False, False]) + tm.assert_series_equal(result, expected) From cf32a23aed2f494073ebe5baa72b075ed21b2713 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:17:23 +0200 Subject: [PATCH 067/174] Backport PR #54881 on branch 2.1.x (REGR: read_csv raising when dtypes is specified with usecols) (#54926) --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/io/parsers/python_parser.py | 8 ++++---- .../tests/io/parser/dtypes/test_dtypes_basic.py | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index f5e4446c927f9..b7e45d043d869 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,6 +13,7 @@ including other 
versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 520d2193e1c04..6846ea2b196b8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1176,17 +1176,17 @@ def _set_no_thousand_columns(self) -> set[int]: ) if self.columns and self.dtype: assert self._col_indices is not None - for i in self._col_indices: + for i, col in zip(self._col_indices, self.columns): if not isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype ): no_thousands_columns.add(i) if ( isinstance(self.dtype, dict) - and self.columns[i] in self.dtype + and col in self.dtype and ( - not is_numeric_dtype(self.dtype[self.columns[i]]) - or is_bool_dtype(self.dtype[self.columns[i]]) + not is_numeric_dtype(self.dtype[col]) + or is_bool_dtype(self.dtype[col]) ) ): no_thousands_columns.add(i) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 8494c5d58171b..fbb026f98df6f 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -558,3 +558,20 @@ def test_string_inference(all_parsers): columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) + + +def test_dtypes_with_usecols(all_parsers): + # GH#54868 + + parser = all_parsers + data = """a,b,c +1,2,3 +4,5,6""" + + result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object}) + if parser.engine == "pyarrow": + values = [1, 4] + else: + values = ["1", "4"] + expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) + tm.assert_frame_equal(result, expected) From 6fcd7bd4ce49f9eeebf78a9a1acd578951387eca Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Sep 2023 10:48:58 -0700 Subject: [PATCH 068/174] Backport PR #54884 on branch 2.1.x (REGR: value_counts raises with bins) (#54932) Backport PR #54884: REGR: value_counts raises with bins Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/algorithms.py | 4 +++- pandas/tests/test_algos.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index b7e45d043d869..a624b888cd789 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0a9c1aad46f89..5e22b774fb880 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -878,7 +878,9 @@ def value_counts_internal( if bins is not None: from pandas.core.reshape.tile import cut - values = Series(values, copy=False) + if isinstance(values, Series): + values = values._values + try: ii = cut(values, bins, include_lowest=True) except TypeError as err: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 856c31b9ccb06..cb703d3439d44 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1412,6 +1412,19 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) + def test_value_counts_series(self): + # GH#54857 + values = np.array([3, 1, 2, 3, 4, np.nan]) + result = Series(values).value_counts(bins=3) + expected = Series( + [2, 2, 1], + index=IntervalIndex.from_tuples( + [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]" + ), + name="count", + ) + tm.assert_series_equal(result, expected) + class TestDuplicated: def test_duplicated_with_nas(self): From eac54834948cd7137f65dda9e0dbd87f32312ccb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Sep 2023 10:49:09 -0700 Subject: [PATCH 069/174] Backport PR #54895 on branch 2.1.x (REGR: Merge raising when left merging on arrow string index) (#54933) Backport PR #54895: REGR: Merge raising when left merging on arrow string index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/reshape/merge.py | 8 ++++++-- pandas/tests/reshape/merge/test_merge.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index a624b888cd789..fa83c8ee400c2 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
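
(Illustration, not part of any patch: the ``value_counts`` regression fixed by the backport above, reproduced from its test; assumes pandas 2.1.x.)

    import numpy as np
    import pandas as pd

    values = np.array([3, 1, 2, 3, 4, np.nan])
    # under 2.1.0 this raised; with the fix it returns interval-binned
    # counts [2, 2, 1] over (0.996, 2.0], (2.0, 3.0], (3.0, 4.0]
    print(pd.Series(values).value_counts(bins=3))
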
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d811c53909fde..4eda8d2d75408 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2433,8 +2433,12 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), len(dc.dictionary), ) if how == "right": diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 98db24530833b..2c97e8773b0d6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2870,3 +2870,15 @@ def test_merge_ea_int_and_float_numpy(): result = df2.merge(df1) tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(): + # GH#54894 + pytest.importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected) From 06a1581d7d5d192c686041c254d70b2802f1247d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 10:41:22 -0700 Subject: [PATCH 070/174] Backport PR #54913 on branch 2.1.x (REGR: drop_duplicates raising for arrow strings) (#54955) Backport PR #54913: REGR: drop_duplicates raising for arrow strings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/algorithms.py | 2 +- pandas/tests/series/methods/test_drop_duplicates.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index fa83c8ee400c2..8570a4eacfcc9 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype 
with ``None`` (:issue:`54870`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 5e22b774fb880..1d74bb8b83e4e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1000,7 +1000,7 @@ def duplicated(
     duplicated : ndarray[bool]
     """
     if hasattr(values, "dtype"):
-        if isinstance(values.dtype, ArrowDtype):
+        if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
             values = values._to_masked()  # type: ignore[union-attr]

         if isinstance(values.dtype, BaseMaskedDtype):
diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py
index 324ab1204e16e..10b2e98586365 100644
--- a/pandas/tests/series/methods/test_drop_duplicates.py
+++ b/pandas/tests/series/methods/test_drop_duplicates.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest

+import pandas as pd
 from pandas import (
     Categorical,
     Series,
@@ -256,3 +257,11 @@ def test_duplicated_arrow_dtype(self):
         result = ser.drop_duplicates()
         expected = Series([True, False, None], dtype="bool[pyarrow]")
         tm.assert_series_equal(result, expected)
+
+    def test_drop_duplicates_arrow_strings(self):
+        # GH#54904
+        pa = pytest.importorskip("pyarrow")
+        ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string()))
+        result = ser.drop_duplicates()
+        expected = Series(["a"], dtype=pd.ArrowDtype(pa.string()))
+        tm.assert_series_equal(result, expected)

From ed1f04434d8f6af0b7135ecfb8ad29358c08143f Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sat, 2 Sep 2023 19:42:49 +0200
Subject: [PATCH 071/174] Backport PR #54882 on branch 2.1.x (REGR: get_group
 raising with axis=1) (#54956)

REGR: get_group raising with axis=1 (#54882)

(cherry picked from commit ab34dd68d5cda8476298c2422b98f34090e2e7e4)
---
 doc/source/whatsnew/v2.1.1.rst       |  1 +
 pandas/core/groupby/groupby.py       |  3 ++-
 pandas/tests/groupby/test_groupby.py | 23 +++++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
index 8570a4eacfcc9..e7bfda82494a3 100644
--- a/doc/source/whatsnew/v2.1.1.rst
+++ b/doc/source/whatsnew/v2.1.1.rst
@@ -15,6 +15,7 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
 - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
+- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
 - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
 - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
 - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index dd0c995e6acca..7fa69a70431e4 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1059,7 +1059,8 @@ def get_group(self, name, obj=None) -> DataFrame | Series:
             raise KeyError(name)

         if obj is None:
-            return self._selected_obj.iloc[inds]
+            indexer = inds if self.axis == 0 else (slice(None), inds)
+            return self._selected_obj.iloc[indexer]
         else:
             warnings.warn(
                 "obj is deprecated and will be removed in a future version. 
" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b260d5757767..bcd8323a26767 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3159,3 +3159,26 @@ def test_groupby_series_with_datetimeindex_month_name(): expected = Series([2, 1], name="jan") expected.index.name = "jan" tm.assert_series_equal(result, expected) + + +def test_get_group_axis_1(): + # GH#54858 + df = DataFrame( + { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + "col3": [3, 8, 2, 10], + "col4": [1, 13, 6, 15], + "col5": [-4, 5, 6, -7], + } + ) + with tm.assert_produces_warning(FutureWarning, match="deprecated"): + grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) + result = grouped.get_group(1) + expected = DataFrame( + { + "col1": [0, 3, 2, 3], + "col5": [-4, 5, 6, -7], + } + ) + tm.assert_frame_equal(result, expected) From ca89e78f469e839d0bab13721fd988924a883ead Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 14:30:42 -0700 Subject: [PATCH 072/174] Backport PR #54954 on branch 2.1.x (REGR: read_csv splitting on comma with delim_whitespace) (#54967) Backport PR #54954: REGR: read_csv splitting on comma with delim_whitespace Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 3 ++- pandas/tests/io/parser/test_header.py | 26 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index e7bfda82494a3..d0882bdf094ad 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) +- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index abd3fb9e1fef3..ce8a38df172ef 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -664,7 +664,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c))) +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 5cb54bb4e2916..d72174c40478e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -658,3 +658,29 @@ def test_header_missing_rows(all_parsers): msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=[0, 1, 2]) + 
+ +@skip_pyarrow +def test_header_multiple_whitespaces(all_parsers): + # GH#54931 + parser = all_parsers + data = """aa bb(1,1) cc(1,1) + 0 2 3.5""" + + result = parser.read_csv(StringIO(data), sep=r"\s+") + expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_header_delim_whitespace(all_parsers): + # GH#54918 + parser = all_parsers + data = """a,b +1,2 +3,4 + """ + + result = parser.read_csv(StringIO(data), delim_whitespace=True) + expected = DataFrame({"a,b": ["1,2", "3,4"]}) + tm.assert_frame_equal(result, expected) From 84e8609cfb3f839673747e914d40e6a19d3d3aee Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 14:30:50 -0700 Subject: [PATCH 073/174] Backport PR #54826 on branch 2.1.x (Infer large_string type as pyarrow_numpy strings) (#54969) Backport PR #54826: Infer large_string type as pyarrow_numpy strings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_arrow.py | 9 +++++++++ pandas/io/_util.py | 5 ++++- .../tests/arrays/string_/test_string_arrow.py | 8 +++++++- pandas/tests/io/test_parquet.py | 19 +++++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 87dd48b3569d8..cae975c96d61f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -448,6 +448,15 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" + def __init__(self, values) -> None: + _chk_pyarrow_available() + + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( + values.type + ): + values = pc.cast(values, pa.string()) + super().__init__(values) + @classmethod def _result_converter(cls, values, na=None): if not isna(na): diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 915595833468d..3b2ae5daffdba 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,7 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get + return { + pa.string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + }.get diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 1ab628f186b47..09f9f788dc3e4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -12,7 +12,10 @@ StringArray, StringDtype, ) -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under7p0, @@ -166,6 +169,9 @@ def test_pyarrow_not_installed_raises(): with pytest.raises(ImportError, match=msg): ArrowStringArray([]) + with pytest.raises(ImportError, match=msg): + ArrowStringArrayNumpySemantics([]) + with pytest.raises(ImportError, match=msg): ArrowStringArray._from_sequence(["a", None, "b"]) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b2d6e958020be..41ae4042e9ae0 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1125,6 +1125,25 @@ def 
test_roundtrip_decimal(self, tmp_path, pa): expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") tm.assert_frame_equal(result, expected) + def test_infer_string_large_string_type(self, tmp_path, pa): + # GH#54798 + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "large_string.p" + + table = pa.table({"a": pa.array([None, "b", "c"], pa.large_string())}) + pq.write_table(table, path) + + with pd.option_context("future.infer_string", True): + result = read_parquet(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From 44285e1af21daa7290388b944a1bbb9bf73ac12d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 14:31:01 -0700 Subject: [PATCH 074/174] Backport PR #54945 on branch 2.1.x (REGR: MultiIndex.append raising for overlapping IntervalIndex levels) (#54968) Backport PR #54945: REGR: MultiIndex.append raising for overlapping IntervalIndex levels Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/categorical.py | 2 +- pandas/tests/indexes/multi/test_reshape.py | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index d0882bdf094ad..3848353187cde 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index aefc94ebd665c..53c942f615abe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2950,7 +2950,7 @@ def recode_for_categories( return codes indexer = coerce_indexer_dtype( - new_categories.get_indexer(old_categories), new_categories + new_categories.get_indexer_for(old_categories), new_categories ) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index da9838d4a2ed3..06dbb33aadf97 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -169,6 +169,28 @@ def test_append_names_dont_match(): tm.assert_index_equal(result, expected) +def test_append_overlapping_interval_levels(): + # GH 54934 + ivl1 = pd.IntervalIndex.from_breaks([0.0, 1.0, 2.0]) + ivl2 = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5]) + mi1 = MultiIndex.from_product([ivl1, ivl1]) + mi2 = MultiIndex.from_product([ivl2, 
ivl2]) + result = mi1.append(mi2) + expected = MultiIndex.from_tuples( + [ + (pd.Interval(0.0, 1.0), pd.Interval(0.0, 1.0)), + (pd.Interval(0.0, 1.0), pd.Interval(1.0, 2.0)), + (pd.Interval(1.0, 2.0), pd.Interval(0.0, 1.0)), + (pd.Interval(1.0, 2.0), pd.Interval(1.0, 2.0)), + (pd.Interval(0.5, 1.5), pd.Interval(0.5, 1.5)), + (pd.Interval(0.5, 1.5), pd.Interval(1.5, 2.5)), + (pd.Interval(1.5, 2.5), pd.Interval(0.5, 1.5)), + (pd.Interval(1.5, 2.5), pd.Interval(1.5, 2.5)), + ] + ) + tm.assert_index_equal(result, expected) + + def test_repeat(): reps = 2 numbers = [1, 2, 3] From c22f2a251f17a0f3e1a8825abbcfc89cd1606e6c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 14:31:14 -0700 Subject: [PATCH 075/174] Backport PR #54958 on branch 2.1.x (BLD: Fix race condition) (#54965) Backport PR #54958: BLD: Fix race condition Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_libs/meson.build | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index f302c649bc7bd..c0a9d1ad8ee4a 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -69,7 +69,8 @@ libs_sources = { 'index': {'sources': ['index.pyx', _index_class_helper]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, 'join': {'sources': ['join.pyx', _khash_primitive_helper], 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, From 0c03254caf5586c8d00763ef5a8ba4b7ee1111d6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Sep 2023 16:43:05 -0700 Subject: [PATCH 076/174] Backport PR #54962 on branch 2.1.x (BUG: DataFrame.stack with future_stack=True failing when columns are tuples) (#54971) Backport PR #54962: BUG: DataFrame.stack with future_stack=True failing when columns are tuples Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 2 +- pandas/core/reshape/reshape.py | 2 +- pandas/tests/frame/test_stack_unstack.py | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 3848353187cde..a6848dad6e3cd 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) .. --------------------------------------------------------------------------- .. 
_whatsnew_211.other:

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index fc8d827cd31bb..bf7c7a1ee4dc7 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -908,7 +908,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
         data = frame.copy()
     else:
         # Take the data from frame corresponding to this idx value
-        if not isinstance(idx, tuple):
+        if len(level) == 1:
             idx = (idx,)
         gen = iter(idx)
         column_indexer = tuple(
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
index c90b871d5d66f..dbd1f96fc17c9 100644
--- a/pandas/tests/frame/test_stack_unstack.py
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -2508,3 +2508,19 @@ def test_unstack_mixed_level_names(self):
             index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
         )
         tm.assert_frame_equal(result, expected)
+
+
+def test_stack_tuple_columns(future_stack):
+    # GH#54948 - test stack when the input has a non-MultiIndex with tuples
+    df = DataFrame(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)]
+    )
+    result = df.stack(future_stack=future_stack)
+    expected = Series(
+        [1, 2, 3, 4, 5, 6, 7, 8, 9],
+        index=MultiIndex(
+            levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]],
+            codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+        ),
+    )
+    tm.assert_series_equal(result, expected)

From 874a329a7026667bc295d451e2ea089850d41b4c Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Tue, 5 Sep 2023 11:07:51 -0700
Subject: [PATCH 077/174] Backport PR #54985 on branch 2.1.x (REGR:
 roundtripping datetime through sqlite doesn't work) (#54993)

Backport PR #54985: REGR: roundtripping datetime through sqlite doesn't work

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.1.rst | 1 +
 pandas/io/sql.py               | 2 --
 pandas/tests/io/test_sql.py    | 7 +++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
index a6848dad6e3cd..11b19b1508a71 100644
--- a/doc/source/whatsnew/v2.1.1.rst
+++ b/doc/source/whatsnew/v2.1.1.rst
@@ -18,6 +18,7 @@ Fixed regressions
 - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
 - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
 - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
 - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
 - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
 - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 7669d5aa4cea5..2b139f8ca527c 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str:

         adapt_date_iso = lambda val: val.isoformat()
         adapt_datetime_iso = lambda val: val.isoformat()
-        adapt_datetime_epoch = lambda val: int(val.timestamp())

         sqlite3.register_adapter(time, _adapt_time)
         sqlite3.register_adapter(date, adapt_date_iso)
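
(Illustration, not part of the patch: the epoch adapter removed in this hunk was registered after the ISO adapter and therefore overrode it, so datetimes were written to sqlite as epoch integers. A minimal roundtrip check mirroring the test added further down:)

    import sqlite3
    from datetime import datetime

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    df = pd.DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
    df.to_sql("test", conn, if_exists="replace", index=False)
    # with the fix this prints the ISO string "2020-12-31 12:00:00.000000"
    print(pd.read_sql("select * from test", conn).iloc[0, 0])
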
sqlite3.register_adapter(datetime, adapt_datetime_iso) - sqlite3.register_adapter(datetime, adapt_datetime_epoch) convert_date = lambda val: date.fromisoformat(val.decode()) convert_datetime = lambda val: datetime.fromisoformat(val.decode()) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9ec0ba0b12a76..bfa93a4ff910e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2962,6 +2962,13 @@ def test_read_sql_string_inference(self): tm.assert_frame_equal(result, expected) + def test_roundtripping_datetimes(self): + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", self.conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", self.conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): From e0534b329b71d4de1aa9a392b39bea80b0b42fae Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 14:57:43 -0700 Subject: [PATCH 078/174] Backport PR #55013 on branch 2.1.x (CI: Ignore hypothesis differing executors) (#55019) Backport PR #55013: CI: Ignore hypothesis differing executors Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 10826f50d1fe1..b1b35448af134 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -71,6 +71,7 @@ Index, MultiIndex, ) +from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -190,6 +191,10 @@ def pytest_collection_modifyitems(items, config) -> None: item.add_marker(pytest.mark.arraymanager) +hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] +if Version(hypothesis.__version__) >= Version("6.83.2"): + hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) + # Hypothesis hypothesis.settings.register_profile( "ci", @@ -201,7 +206,7 @@ def pytest_collection_modifyitems(items, config) -> None: # 2022-02-09: Changed deadline from 500 -> None. 
Deadline leads to # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969) deadline=None, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), + suppress_health_check=tuple(hypothesis_health_checks), ) hypothesis.settings.load_profile("ci") From 3216c4f2690b51a9af7f06d5fa42ba0248732936 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:49:56 -0700 Subject: [PATCH 079/174] Backport PR #55000 on branch 2.1.x (BUG: ArrowDtype raising for fixed size list) (#55016) Backport PR #55000: BUG: ArrowDtype raising for fixed size list Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/dtypes/dtypes.py | 2 ++ pandas/tests/extension/test_arrow.py | 9 +++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 11b19b1508a71..64d7481117e8e 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -29,6 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 53f0fb2843653..272e9928b96cb 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2109,6 +2109,8 @@ def type(self): return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list + elif pa.types.is_fixed_size_list(pa_type): + return list elif pa.types.is_map(pa_type): return list elif pa.types.is_struct(pa_type): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a2fee31d9f01b..ec2ca494b2aa1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3037,6 +3037,15 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +def test_fixed_size_list(): + # GH#55000 + ser = pd.Series( + [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2)) + ) + result = ser.dtype.type + assert result == list + + def test_arrowextensiondtype_dataframe_repr(): # GH 54062 df = pd.DataFrame( From 7c17bb6aee4d08a644a7e0fde8869434a78f7bc0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:50:28 -0700 Subject: [PATCH 080/174] Backport PR #54927 on branch 2.1.x (REGR: interpolate raising if fill_value is given) (#55017) Backport PR #54927: REGR: interpolate raising if fill_value is given Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 5 +++-- pandas/tests/series/methods/test_interpolate.py | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 64d7481117e8e..6f431ca7eea5f 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) - Fixed regression in :meth:`MultiIndex.append` raising 
when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) +- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f6bf51f65049a..23c37fb34ec0d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8160,10 +8160,11 @@ def interpolate( stacklevel=find_stack_level(), ) - if "fill_value" in kwargs: + if method in fillna_methods and "fill_value" in kwargs: raise ValueError( "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate" + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" ) if isinstance(obj.index, MultiIndex) and method != "linear": diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 619690f400d98..549f429f09d35 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self): with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg2): ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) From eab54eef3731ec6df00992ec2fcd98dd7bcb5bd2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:50:40 -0700 Subject: [PATCH 081/174] Backport PR #54914 on branch 2.1.x (REGR: concat raising for 2 different ea dtypes) (#55018) Backport PR #54914: REGR: concat raising for 2 different ea dtypes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/internals/concat.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 6f431ca7eea5f..258f05d4277bd 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
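
(Illustration, not part of any patch: the interpolate regression fixed by the backport just above, reproduced from its test; method="nearest" requires scipy.)

    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, 0, 1, np.nan, 3, np.nan])
    # 2.1.0 rejected fill_value for every method; the fix only rejects it for
    # the fillna-style methods, so this now returns [NaN, 0, 1, 1, 3, 0]
    print(ser.interpolate(method="nearest", fill_value=0))
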
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`concat` when :class:`DataFrame` 's have two different extension dtypes (:issue:`54848`) - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4d33f0137d3c4..b2d463a8c6c26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -177,7 +177,7 @@ def concatenate_managers( values = np.concatenate(vals, axis=1) # type: ignore[arg-type] elif is_1d_only_ea_dtype(blk.dtype): # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals, axis=1, ea_compat_axis=True) + values = concat_compat(vals, axis=0, ea_compat_axis=True) values = ensure_block_shape(values, ndim=2) else: values = concat_compat(vals, axis=1) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 3efcd930af581..5dde863f246d1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category(): ) expected = expected.set_index(["c1", "c2"]) tm.assert_frame_equal(result, expected) + + +def test_concat_ea_upcast(): + # GH#54848 + df1 = DataFrame(["a"], dtype="string") + df2 = DataFrame([1], dtype="Int64") + result = concat([df1, df2]) + expected = DataFrame(["a", 1], index=[0, 0]) + tm.assert_frame_equal(result, expected) From b7369124fb3f7e58eb58a0a3be1d69a128321815 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:01:36 -0700 Subject: [PATCH 082/174] Backport PR #54974 on branch 2.1.x (Include pyarrow_numpy string in efficient merge implementation) (#55021) Backport PR #54974: Include pyarrow_numpy string in efficient merge implementation Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4eda8d2d75408..d36ceff800c56 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2417,7 +2417,8 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2c97e8773b0d6..c4067363d934e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2872,13 +2872,13 @@ def test_merge_ea_int_and_float_numpy(): tm.assert_frame_equal(result, expected.astype("float64")) -def test_merge_arrow_string_index(): +def test_merge_arrow_string_index(any_string_dtype): # GH#54894 pytest.importorskip("pyarrow") - left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") - right = DataFrame({"b": 1}, index=Index(["a", "c"], 
dtype="string[pyarrow]")) + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) result = left.merge(right, left_on="a", right_index=True, how="left") expected = DataFrame( - {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected) From a6445df906e53072875d8c53507d99ff8ed9df18 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:01:48 -0700 Subject: [PATCH 083/174] Backport PR #54982 on branch 2.1.x (REG: filter not respecting the order of labels) (#55022) Backport PR #54982: REG: filter not respecting the order of labels Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 8 +++++--- pandas/tests/frame/methods/test_filter.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 258f05d4277bd..b9bdb36fe0ed3 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 23c37fb34ec0d..f94e7e0bbab13 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5701,10 +5701,12 @@ def filter( if items is not None: name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) # error: Keywords must be strings - return self.reindex( # type: ignore[misc] - **{name: labels.intersection(items)} - ) + return self.reindex(**{name: items}) # type: ignore[misc] elif like: def f(x) -> bool_t: diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 1a2fbf8a65a55..9d5e6876bb08c 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -137,3 +137,17 @@ def test_filter_regex_non_string(self): result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) From 
6db7b0083fd7ac32b40119e98ba92dcb43d6203a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Sep 2023 17:20:29 -0700 Subject: [PATCH 084/174] Backport PR #54983 on branch 2.1.x (BUG: pct_change showing unnecessary FutureWarning) (#55040) Backport PR #54983: BUG: pct_change showing unnecessary FutureWarning Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 24 ++++++++++++------- pandas/tests/frame/methods/test_pct_change.py | 18 ++++++++++++++ .../tests/series/methods/test_pct_change.py | 8 +++++++ 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index b9bdb36fe0ed3..fe511b5cdec67 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -34,6 +34,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) .. --------------------------------------------------------------------------- .. _whatsnew_211.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f94e7e0bbab13..08d1bf6a0fabc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11692,15 +11692,21 @@ def pct_change( stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if self.isna().values.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will be " + "removed in a future version. 
Call ffill before calling " + "pct_change to retain current behavior and silence this " + "warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index d0153da038a75..ede212ae18ae9 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method): index=["a", "b"] * 3, ) tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54481 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 4dabf7b87e2cd..6740b8756853e 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method): expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) From 77d29db5141d052e5c671806fdd38271011d4aa7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:12:46 -0700 Subject: [PATCH 085/174] Backport PR #55052 on branch 2.1.x (Improve error message for StringDtype with invalid storage) (#55053) Backport PR #55052: Improve error message for StringDtype with invalid storage Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_.py | 3 ++- pandas/tests/arrays/string_/test_string_arrow.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 84a85b9e56aff..58c638e7f59b5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -119,7 +119,8 @@ def __init__(self, storage=None) -> None: storage = get_option("mode.string_storage") if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 09f9f788dc3e4..fb6c338b8f8ea 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -255,3 +255,11 @@ def test_pickle_roundtrip(): result_sliced = pickle.loads(sliced_pickled) tm.assert_series_equal(result_sliced, expected_sliced) + + +@skip_if_no_pyarrow +def test_string_dtype_error_message(): + # GH#55051 + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." 
+ with pytest.raises(ValueError, match=msg): + StringDtype("bla") From 610c0f4c7a9a3a4d7fc4d714c2a642a8cbd8e633 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:13:00 -0700 Subject: [PATCH 086/174] Backport PR #55051 on branch 2.1.x (Fix pickle roundtrip for new arrow string dtype) (#55054) Backport PR #55051: Fix pickle roundtrip for new arrow string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_arrow.py | 5 ++++- pandas/tests/arrays/string_/test_string_arrow.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cae975c96d61f..0b8d3d6b13f60 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -466,7 +466,10 @@ def _result_converter(cls, values, na=None): def __getattribute__(self, item): # ArrowStringArray and we both inherit from ArrowExtensionArray, which # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index fb6c338b8f8ea..c1d424f12bfc4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,9 +241,10 @@ def test_setitem_invalid_indexer_raises(): @skip_if_no_pyarrow -def test_pickle_roundtrip(): +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): # GH 42600 - expected = pd.Series(range(10), dtype="string[pyarrow]") + expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) sliced_pickled = pickle.dumps(expected_sliced) From 2f0ef8770b691a20b26aa587fe5a021dfe6b3c3f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:28:37 -0700 Subject: [PATCH 087/174] Backport PR #55042 on branch 2.1.x (REGR: DataFrameGroupBy.agg with duplicate column names and a dict) (#55056) Backport PR #55042: REGR: DataFrameGroupBy.agg with duplicate column names and a dict Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/apply.py | 8 +++++++- pandas/tests/groupby/aggregate/test_aggregate.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index fe511b5cdec67..42af61be26355 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`) 
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6ab2f958b8730..712294a0a4c96 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -434,7 +434,13 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) func = cast(AggFuncTypeDict, self.func) func = self.normalize_dictlike_arg(op_name, selected_obj, func) @@ -448,7 +454,7 @@ def compute_dict_like( colg = obj._gotitem(selection, ndim=1) results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] keys = list(func.keys()) - elif is_non_unique_col: + elif not is_groupby and is_non_unique_col: # key used for column selection and output # GH#51099 results = [] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index cdfa80c8c7cb5..406f4b9b127e7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -513,6 +513,18 @@ def test_groupby_agg_dict_with_getitem(): tm.assert_frame_equal(result, expected) +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "op", [ From 1b04dc4c4982b11c269839b0657b958734d34003 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:23:21 -0700 Subject: [PATCH 088/174] Backport PR #55010 on branch 2.1.x (BLD: Build wheels for Python 3.12) (#55055) Backport PR #55010: BLD: Build wheels for Python 3.12 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .circleci/config.yml | 5 ++--- .github/workflows/wheels.yml | 16 +++++++++------- pyproject.toml | 7 +++++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 50f6a116a6630..ba124533e953a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.14.1 + pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: CIBW_BUILD: << parameters.cibw-build >> @@ -92,5 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 5f541f1bae1fd..97d78a1a9afe3 100644 --- 
a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -97,8 +97,7 @@ jobs: - [macos-12, macosx_*] - [windows-2022, win_amd64] # TODO: support PyPy? - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -150,8 +149,10 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge create-args: >- - python=${{ matrix.python[1] }} + python=3.11 anaconda-client wheel cache-downloads: true @@ -167,12 +168,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; - python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ - docker pull python:${{ matrix.python[1] }}-windowsservercore - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + # add rc to the end of the image name if the Python version is unreleased + docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v3 with: diff --git a/pyproject.toml b/pyproject.toml index d2fb6a8d854d7..752ee846eed33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,8 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.22.4; python_version>='3.12'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "versioneer[toml]" ] @@ -30,7 +31,9 @@ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version>='3.11'", + "numpy>=1.23.2; python_version=='3.11'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" From 16dc9d824bbaaa97b0b1fcf4a849d6d349d38b45 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Sep 2023 10:01:50 -0700 Subject: [PATCH 089/174] Backport PR #54949 on branch 2.1.x (BLD: improvements to meson.build files) (#55065) Backport PR #54949: BLD: improvements to meson.build files Co-authored-by: Ralf Gommers --- generate_version.py | 2 ++ 
meson.build | 16 +++++++++------- pandas/_libs/meson.build | 7 ++++--- pandas/_libs/tslibs/meson.build | 7 ++++--- pandas/meson.build | 9 +++------ 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/generate_version.py b/generate_version.py index 46e9f52bfc5de..06e38ce0fd978 100644 --- a/generate_version.py +++ b/generate_version.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Note: This file has to live next to setup.py or versioneer will not work import argparse import os diff --git a/meson.build b/meson.build index 09a1494135af4..e0e533ffade97 100644 --- a/meson.build +++ b/meson.build @@ -2,19 +2,17 @@ project( 'pandas', 'c', 'cpp', 'cython', - version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(), + version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.0.1', default_options: [ 'buildtype=release', - # TODO: Reactivate werror, some warnings on Windows - #'werror=true', 'c_std=c99' ] ) fs = import('fs') -py = import('python').find_installation() +py = import('python').find_installation(pure: false) tempita = files('generate_pxi.py') versioneer = files('generate_version.py') @@ -30,7 +28,7 @@ add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c if fs.exists('_version_meson.py') - py.install_sources('_version_meson.py', pure: false, subdir: 'pandas') + py.install_sources('_version_meson.py', subdir: 'pandas') else custom_target('write_version_file', output: '_version_meson.py', @@ -40,11 +38,15 @@ else build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir(pure: false) / 'pandas' + install_dir: py.get_install_dir() / 'pandas' ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources('pyproject.toml', pure: false, subdir: 'pandas') +py.install_sources( + 'pyproject.toml', + subdir: 'pandas' +) + subdir('pandas') diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c0a9d1ad8ee4a..1cf2c4343d844 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -114,8 +114,9 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs') +py.install_sources( + '__init__.py', + subdir: 'pandas/_libs' +) subdir('window') diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 14d2eef46da20..167695b84514c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,6 +31,7 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs/tslibs') +py.install_sources( + '__init__.py', + subdir: 'pandas/_libs/tslibs' +) diff --git a/pandas/meson.build b/pandas/meson.build index 1dc9955aa4ff6..f02258c98d46a 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -40,8 +40,9 @@ subdirs_list = [ 'util' ] foreach subdir: subdirs_list - install_subdir(subdir, install_dir: py.get_install_dir(pure: false) / 'pandas') + install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach + top_level_py_list = [ '__init__.py', '_typing.py', @@ -49,8 +50,4 @@ top_level_py_list = [ 'conftest.py', 'testing.py' ] -foreach file: top_level_py_list - py.install_sources(file, - pure: false, - subdir: 'pandas') -endforeach +py.install_sources(top_level_py_list, subdir: 'pandas') From 
dac2a06a7333de5f93161967ef6700dc12a83b79 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Sep 2023 16:53:32 -0700 Subject: [PATCH 090/174] Backport PR #55048 on branch 2.1.x (COMPAT: bump pyarrow min version for div on duration) (#55073) Backport PR #55048: COMPAT: bump pyarrow min version for div on duration Co-authored-by: Ben Greiner --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/tests/extension/test_arrow.py | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index be0a762642e46..684e9dccdc0f9 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) if TYPE_CHECKING: @@ -186,6 +187,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under9p0", "pa_version_under11p0", "pa_version_under13p0", + "pa_version_under14p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 049ce50920e28..12f58be109d98 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,7 @@ pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") + pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -23,3 +24,4 @@ pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True + pa_version_under14p0 = True diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ec2ca494b2aa1..d6e75a57f71e7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -40,6 +40,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) from pandas.core.dtypes.dtypes import ( @@ -951,7 +952,7 @@ def _is_temporal_supported(self, opname, pa_dtype): or ( opname in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__") - and not pa_version_under13p0 + and not pa_version_under14p0 ) ) and pa.types.is_duration(pa_dtype) From f98d950948da386ee448a30fbe389236e535314a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:39:24 -0700 Subject: [PATCH 091/174] Backport PR #55082 on branch 2.1.x (Updated future warning msg in transform() for Series.groupby) (#55097) Backport PR #55082: Updated future warning msg in transform() for Series.groupby Co-authored-by: Rajat Subhra Mukherjee --- pandas/core/apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 712294a0a4c96..ad9933aff9eca 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1830,12 +1830,12 @@ def warn_alias_replacement( full_alias = alias else: full_alias = f"{type(obj).__name__}.{alias}" - alias = f"'{alias}'" + alias = f'"{alias}"' warnings.warn( f"The provided callable {func} is currently using " f"{full_alias}. In a future version of pandas, " f"the provided callable will be used directly. 
To keep current " - f"behavior pass {alias} instead.", + f"behavior pass the string {alias} instead.", category=FutureWarning, stacklevel=find_stack_level(), ) From 4f5cd87482b244220158ef387afd41a44a2ab7c7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 11 Sep 2023 15:39:36 -0700 Subject: [PATCH 092/174] Backport PR #55094 on branch 2.1.x (TST: Make test_hash_equality_invariance xfail more generic) (#55100) Backport PR #55094: TST: Make test_hash_equality_invariance xfail more generic Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/scalar/timedelta/test_timedelta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 701cfdf157d26..884c3a648f395 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -921,7 +921,6 @@ def test_timedelta_hash_equality(self): @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", - raises=AssertionError, ) @given( st.integers( From 0cc87ee92e118343c65bbe0ef4839f5314d7fa4f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 13 Sep 2023 11:59:11 -0700 Subject: [PATCH 093/174] Backport PR #55072 on branch 2.1.x (BUG: dt.tz with ArrowDtype returned string) (#55124) Backport PR #55072: BUG: dt.tz with ArrowDtype returned string Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_arrow.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 42af61be26355..6d5da7cdff3b3 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -35,6 +35,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.dt.tz` with :class:`ArrowDtype` where a string was returned instead of a ``tzinfo`` object (:issue:`55003`) - Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 61ee157376656..09c7c6c7706ad 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import ( Timedelta, Timestamp, + timezones, ) from pandas.compat import ( pa_version_under7p0, @@ -2421,7 +2422,7 @@ def _dt_time(self): @property def _dt_tz(self): - return self.dtype.pyarrow_dtype.tz + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) @property def _dt_unit(self): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d6e75a57f71e7..5017e827c3761 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ import pytest from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, is_ci_environment, @@ -2477,7 +2478,7 @@ def test_dt_tz(tz): dtype=ArrowDtype(pa.timestamp("ns", tz=tz)), ) result = ser.dt.tz - assert result == tz + assert result == timezones.maybe_get_tz(tz) def test_dt_isocalendar(): From 124a76d1e88210367d63fdea5330e43ee0171792 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 16 Sep 2023 06:46:48 -0700 Subject: [PATCH 094/174] Backport PR #54784 on branch 2.1.x (CI: Unpin Cython) (#55154) Backport PR #54784: CI: Unpin Cython Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 12 ++++++------ ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-39-minimum_versions.yaml | 4 ++-- ci/deps/actions-39.yaml | 4 ++-- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-310-arm64.yaml | 4 ++-- environment.yml | 2 +- meson.build | 2 +- pyproject.toml | 2 +- requirements-dev.txt | 2 +- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6410f2edd6175..1445e5e822017 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,8 +233,8 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -271,8 +271,8 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.1" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -342,10 +342,10 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata "cython<3.0.1" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index python -m pip list diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 2190136220c6c..edd48604dbfda 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index cf85345cb0cc2..2fb6072314b1e 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index acf94fce5288b..7fd3a65ec91f8 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,9 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 + - cython>=0.29.33 # test dependencies - pytest>=7.3.2 @@ -25,7 +26,6 @@ dependencies: - pip - pip: - - "cython<3.0.1" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index e850f69f945c5..893341350f4ef 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 - - cython>=0.29.33, <3.0.1 + - meson[ninja]=1.2.1 + - cython>=0.29.33 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 3c1630714a041..029312202bcaa 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - 
cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b1cea49e22d15..2a7312cefe619 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,8 +8,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b8a119ece4b03..12893698b5534 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 23c3706f43dad..4923c94ab08f3 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 71686837451b4..a945dbfe2ca20 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.1 - - meson[ninja]=1.0.1 + - cython>=0.29.33 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/environment.yml b/environment.yml index 1a9dffb55bca7..d194b11df7195 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - cython=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/meson.build b/meson.build index e0e533ffade97..68018046c081f 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( 'c', 'cpp', 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.0.1', + meson_version: '>=1.2.1', default_options: [ 'buildtype=release', 'c_std=c99' diff --git a/pyproject.toml b/pyproject.toml index 752ee846eed33..cfa767f808827 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. 
requires = [ "meson-python==0.13.1", - "meson==1.0.1", + "meson==1.2.1", "wheel", "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default diff --git a/requirements-dev.txt b/requirements-dev.txt index be02007a36333..0177016806b0e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ pip versioneer[toml] cython==0.29.33 -meson[ninja]==1.0.1 +meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov From 3b57c196bf042d6bd8f9c7abb924c2cdde56e3c0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 16 Sep 2023 09:47:14 -0700 Subject: [PATCH 095/174] Backport PR #55159 on branch 2.1.x (DOCS: temporary pin pydata-sphinx-theme to 0.13) (#55161) Backport PR #55159: DOCS: temporary pin pydata-sphinx-theme to 0.13 Co-authored-by: Joris Van den Bossche --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index d194b11df7195..447abd22e29b4 100644 --- a/environment.yml +++ b/environment.yml @@ -84,7 +84,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme + - pydata-sphinx-theme=0.13 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/requirements-dev.txt b/requirements-dev.txt index 0177016806b0e..15a8633d09039 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -59,7 +59,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme +pydata-sphinx-theme==0.13 pytest-cython sphinx sphinx-design From 42914f3badfb91ccf404f6aa426dc38f11cc4537 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:20:06 -0700 Subject: [PATCH 096/174] Backport PR #55180 on branch 2.1.x (MAINT: Cleanup expired ndarray methods) (#55184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55180: MAINT: Cleanup expired ndarray methods Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- doc/source/user_guide/gotchas.rst | 2 +- pandas/io/stata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index c00a236ff4e9d..99c85ac66623d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -379,7 +379,7 @@ constructors using something similar to the following: .. 
ipython:: python x = np.array(list(range(10)), ">i4") # big endian - newx = x.byteswap().newbyteorder() # force native byteorder + newx = x.byteswap().view(x.dtype.newbyteorder()) # force native byteorder s = pd.Series(newx) See `the NumPy documentation on byte order diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 9fea451940930..5b190619c4ac3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1771,7 +1771,7 @@ def read( self._data_read = True # if necessary, swap the byte order to native here if self._byteorder != self._native_byteorder: - raw_data = raw_data.byteswap().newbyteorder() + raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) if convert_categoricals: self._read_value_labels() From 00020bcf2763e7d5030c4d78d75095f17849b666 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:32:21 -0700 Subject: [PATCH 097/174] Backport PR #55185 on branch 2.1.x (CI: Fix no BLAS error in 32 bit build) (#55191) Backport PR #55185: CI: Fix no BLAS error in 32 bit build Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1445e5e822017..c1a0352b5f5c0 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -230,11 +230,13 @@ jobs: git -c user.email="you@example.com" merge --no-commit my_ref_name fi - name: Build environment and Run Tests + # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . 
python -m pip list --no-cache-dir export PANDAS_CI=1 From cb7ea726e76c48a7371e5127c95834f11b47211e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:16:39 -0700 Subject: [PATCH 098/174] Backport PR #55175 on branch 2.1.x (DEPS: Bump to 1.26 proper for Python 3.12) (#55203) Backport PR #55175: DEPS: Bump to 1.26 proper for Python 3.12 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pyproject.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cfa767f808827..477c9be6274b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,7 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - # TODO: This needs to be updated when the official numpy 1.26 comes out - "numpy>=1.26.0b1; python_version>='3.12'", + "numpy>=1.26.0; python_version>='3.12'", "versioneer[toml]" ] @@ -32,8 +31,7 @@ requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", "numpy>=1.23.2; python_version=='3.11'", - # TODO: This needs to be updated when the official numpy 1.26 comes out - "numpy>=1.26.0b1; python_version>='3.12'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" From 1ae73af093dcb8b58d77c6d46c763df3cdd3f6f5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:07:42 -0700 Subject: [PATCH 099/174] Backport PR #55210 on branch 2.1.x (CI: Pin matplotlib < 3.8) (#55216) Backport PR #55210: CI: Pin matplotlib < 3.8 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- requirements-dev.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index edd48604dbfda..5db145e31ebc5 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 2fb6072314b1e..2d50056098a0f 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -35,7 +35,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 029312202bcaa..d56fb9ac9fe90 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 12893698b5534..1843e6bd8005f 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - 
matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a945dbfe2ca20..c893beced8fb4 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 # test_numba_vs_cython segfaults with numba 0.57 - numba>=0.55.2, <0.57.0 - numexpr>=2.8.0 diff --git a/environment.yml b/environment.yml index 447abd22e29b4..6f80e9af58643 100644 --- a/environment.yml +++ b/environment.yml @@ -36,7 +36,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - openpyxl>=3.0.10 diff --git a/requirements-dev.txt b/requirements-dev.txt index 15a8633d09039..01e537f7ea28f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -25,7 +25,7 @@ gcsfs>=2022.05.0 ipython jinja2>=3.1.2 lxml>=4.8.0 -matplotlib>=3.6.1 +matplotlib>=3.6.1, <3.8 numba>=0.55.2 numexpr>=2.8.0 openpyxl>=3.0.10 From 7acebe4cc64043783628c3770c09bd1f59d6ea88 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:38:22 -0700 Subject: [PATCH 100/174] Backport PR #54952 on branch 2.1.x (REGR: Arrow backed objects not propagating exceptions) (#55209) Backport PR #54952: REGR: Arrow backed objects not propagating exceptions Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/arrow/array.py | 16 +++++++++------- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 6d5da7cdff3b3..13de1d5e2ea52 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) +- Fixed regression in comparison operations for PyArrow backed columns not propagating exceptions correctly (:issue:`54944`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 09c7c6c7706ad..49305128267be 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -627,20 +627,22 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - try: + if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): result = pc_func(self._pa_array, self._box_pa(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - if is_scalar(other): + elif is_scalar(other): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5017e827c3761..8702701f161ae 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3072,6 +3072,14 @@ def test_duration_fillna_numpy(pa_type): tm.assert_series_equal(result, expected) +def test_comparison_not_propagating_arrow_error(): + # GH#54944 + a = pd.Series([1 << 63], dtype="uint64[pyarrow]") + b = pd.Series([None], dtype="int64[pyarrow]") + with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"): + a < b + + def test_factorize_chunked_dictionary(): # GH 54844 pa_array = pa.chunked_array( From 652fd138968349a23cfc0bf28ee848114856df38 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:38:34 -0700 Subject: [PATCH 101/174] Backport PR #55211 on branch 2.1.x (DOC: Add release date for 2.1.1) (#55218) Backport PR #55211: DOC: Add release date for 2.1.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 13de1d5e2ea52..bb3fbd4ffc90f 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_211: -What's new in 2.1.1 (September XX, 2023) +What's new in 2.1.1 (September 20, 2023) ---------------------------------------- These are the changes in pandas 2.1.1. 
See :ref:`release` for a full changelog From 34cc5b7034e098fe19853a861f02142fc6cdbef3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 08:52:47 -0700 Subject: [PATCH 102/174] Backport PR #55050 on branch 2.1.x (Fix docstring of Index.join in base.py) (#55217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55050: Fix docstring of Index.join in base.py Co-authored-by: Kai Mühlbauer Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index aa306da017d34..b4ef5380d2b30 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4586,7 +4586,7 @@ def join( ------- join_index, (left_indexer, right_indexer) - Examples + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) From 21d8cdd92fcc7827ea71619671af2b5d6bab17a6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 08:53:48 -0700 Subject: [PATCH 103/174] Backport PR #55189 on branch 2.1.x (Revert "DEPR: Deprecate returning a DataFrame in SeriesApply.apply_standard) (#55205) Backport PR #55189: Revert "DEPR: Deprecate returning a DataFrame in SeriesApply.apply_standard Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/cookbook.rst | 10 +++---- doc/source/user_guide/groupby.rst | 13 +++++++++ doc/source/whatsnew/v0.10.0.rst | 29 +++++++------------- doc/source/whatsnew/v2.1.0.rst | 1 - doc/source/whatsnew/v2.1.1.rst | 2 +- pandas/core/apply.py | 8 ------ pandas/core/series.py | 5 ---- pandas/tests/apply/test_series_apply.py | 36 ++++++++++++------------- 8 files changed, 45 insertions(+), 59 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 66ee571d6b5a5..b7a59516ae671 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -794,12 +794,12 @@ Apply index=["I", "II", "III"], ) - def make_df(ser): - new_vals = [pd.Series(value, name=name) for name, value in ser.items()] - return pd.DataFrame(new_vals) - - df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()}) + def SeriesFromSubList(aList): + return pd.Series(aList) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 75c816f66d5e4..92fa38c9da5c2 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1207,6 +1207,19 @@ The dimension of the returned result can also change: grouped.apply(f) +``apply`` on a Series can operate on a returned value from the applied function +that is itself a series, and possibly upcast the result to a DataFrame: + +.. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) + Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the apply function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. 
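The upcasting behavior restored by this revert can be sketched with a minimal example (an illustration only, not part of the patch; the input values are made up, mirroring the ``test_series_apply_unpack_nested_data`` case added further down in this diff):

    import pandas as pd

    # Each element maps to a Series, so Series.apply collects the per-row
    # results into a DataFrame again (pandas >= 2.1.1, after this revert),
    # padding shorter rows with NaN.  Input values are illustrative only.
    ser = pd.Series([[1, 2, 3], [4, 5, 6, 7]])
    result = ser.apply(lambda x: pd.Series(x))
    assert type(result).__name__ == "DataFrame"
    assert result.shape == (2, 4)  # column 3 holds NaN in the first row
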
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 3425986a37743..3f1db2e3a9cd9 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -244,26 +244,15 @@ Convenience methods ``ffill`` and ``bfill`` have been added: function, that is itself a series, and possibly upcast the result to a DataFrame - .. code-block:: python - - >>> def f(x): - ... return pd.Series([x, x ** 2], index=["x", "x^2"]) - >>> - >>> s = pd.Series(np.random.rand(5)) - >>> s - 0 0.340445 - 1 0.984729 - 2 0.919540 - 3 0.037772 - 4 0.861549 - dtype: float64 - >>> s.apply(f) - x x^2 - 0 0.340445 0.115903 - 1 0.984729 0.969691 - 2 0.919540 0.845555 - 3 0.037772 0.001427 - 4 0.861549 0.742267 + .. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) - New API functions for working with pandas options (:issue:`2097`): diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 040ca048d1224..18054e0b01191 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -555,7 +555,6 @@ Other Deprecations - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :attr:`Series.dt` properties (:issue:`20306`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or NumPy array before operating instead (:issue:`51521`) -- Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) - Deprecated the ``fastpath`` keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index bb3fbd4ffc90f..42a7029b1fa77 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ -- +- Reverted the deprecation that disallowed :meth:`Series.apply` returning a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object (:issue:`52116`) .. --------------------------------------------------------------------------- .. 
_whatsnew_211.contributors: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ad9933aff9eca..e5683359c2fb9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1289,14 +1289,6 @@ def curried(x): ) if len(mapped) and isinstance(mapped[0], ABCSeries): - warnings.warn( - "Returning a DataFrame from Series.apply when the supplied function " - "returns a Series is deprecated and will be removed in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), - ) # GH52116 - # GH#43986 Need to do list(mapped) in order to get treated as nested # See also GH#25959 regarding EA support return obj._constructor_expanddim(list(mapped), index=obj.index) diff --git a/pandas/core/series.py b/pandas/core/series.py index 885675e5caa5a..1d83643722ece 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4634,11 +4634,6 @@ def apply( """ Invoke function on values of Series. - .. deprecated:: 2.1.0 - - If the result from ``func`` is a ``Series``, wrapping the output in a - ``DataFrame`` instead of a ``Series`` has been deprecated. - Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values. diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index d3e5ac1b4ca7a..aeb6a01eb587a 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -420,8 +420,9 @@ def test_agg_evaluate_lambdas(string_series): def test_with_nested_series(datetime_series, op_name): # GH 2316 # .agg with a reducer and a transform, what to do - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "cannot aggregate" + warning = FutureWarning if op_name == "agg" else None + with tm.assert_produces_warning(warning, match=msg): # GH52123 result = getattr(datetime_series, op_name)( lambda x: Series([x, x**2], index=["x", "x^2"]) @@ -429,6 +430,10 @@ def test_with_nested_series(datetime_series, op_name): expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + def test_replicate_describe(string_series): # this also tests a result set that is all scalars @@ -512,10 +517,7 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): index = dti.tz_localize("UTC").index else: index = dti.index - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = Series(index).apply(lambda x: Series([1, 2])) + result = Series(index).apply(lambda x: Series([1, 2])) tm.assert_frame_equal(result, exp) @@ -662,19 +664,7 @@ def test_apply_dictlike_lambda(ops, by_row, expected): def test_apply_retains_column_name(by_row): # GH 16380 df = DataFrame({"x": range(3)}, Index(range(3), name="x")) - func = lambda x: Series(range(x + 1), Index(range(x + 1), name="y")) - - if not by_row: - # GH53400 - msg = "'Series' object cannot be interpreted as an integer" - with pytest.raises(TypeError, match=msg): - df.x.apply(func, by_row=by_row) - return - - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = df.x.apply(func, by_row=by_row) + result 
= df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y"))) expected = DataFrame( [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], columns=Index(range(3), name="y"), @@ -689,3 +679,11 @@ def test_apply_type(): result = s.apply(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_series_apply_unpack_nested_data(): + # GH#55189 + ser = Series([[1, 2, 3], [4, 5, 6, 7]]) + result = ser.apply(lambda x: Series(x)) + expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}) + tm.assert_frame_equal(result, expected) From 7c93cc91c70db325f3a32f74fad133615993a400 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 20 Sep 2023 17:54:23 +0200 Subject: [PATCH 104/174] Backport PR #55008 on branch 2.1.x (CoW: Clear dead references every time we add a new one) (#55220) CoW: Clear dead references every time we add a new one (#55008) (cherry picked from commit 7134f2c14e614980bcf366f979a0f85aafacbde6) --- pandas/_libs/internals.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index adf4e8c926fa3..859ebce36eaa8 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -951,6 +951,11 @@ cdef class BlockValuesRefs: else: self.referenced_blocks = [] + def _clear_dead_references(self) -> None: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. @@ -959,6 +964,7 @@ cdef class BlockValuesRefs: blk: SharedBlock The block that the new references should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(blk)) def add_index_reference(self, index: object) -> None: @@ -969,6 +975,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(index)) def has_reference(self) -> bool: @@ -981,8 +988,6 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self._clear_dead_references() # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 From 81768de24c0b2c491e67e88952dfcb02acd0ff91 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:27:09 -0700 Subject: [PATCH 105/174] Backport PR #55207 on branch 2.1.x (DOC: Add whatsnew for 2.1.2) (#55221) Backport PR #55207: DOC: Add whatsnew for 2.1.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.0.rst | 2 +- doc/source/whatsnew/v2.1.1.rst | 2 ++ doc/source/whatsnew/v2.1.2.rst | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v2.1.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index c21fbc28ae68f..baab20845a2a2 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.1 .. 
toctree:: :maxdepth: 2 + v2.1.2 v2.1.1 v2.1.0 diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 18054e0b01191..d422fb1be9fdf 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -891,4 +891,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.0.3..v2.1.0|HEAD +.. contributors:: v2.0.3..v2.1.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 42a7029b1fa77..c52f7db8ec3ec 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -51,3 +51,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.0..v2.1.1|HEAD diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst new file mode 100644 index 0000000000000..2c9b10160d144 --- /dev/null +++ b/doc/source/whatsnew/v2.1.2.rst @@ -0,0 +1,39 @@ +.. _whatsnew_212: + +What's new in 2.1.2 (October ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.contributors: + +Contributors +~~~~~~~~~~~~ From 84850f382849a5ad49032c94fa3bd88af56e5936 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:16:57 -0700 Subject: [PATCH 106/174] Backport PR #55206 on branch 2.1.x (BUILD: Fix duplicate files warning) (#55222) Backport PR #55206: BUILD: Fix duplicate files warning Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_libs/meson.build | 37 ++++++++++++++++++++++++++++++--- pandas/_libs/tslibs/meson.build | 27 +++++++++++++++++++++--- pandas/_libs/window/meson.build | 13 ++++++++++++ pandas/meson.build | 1 - 4 files changed, 71 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 1cf2c4343d844..fd632790546f6 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -114,9 +114,40 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources( +# Basically just __init__.py and the .pyi files +sources_to_install = [ '__init__.py', - subdir: 'pandas/_libs' -) + 'algos.pyi', + 'arrays.pyi', + 'byteswap.pyi', + 'groupby.pyi', + 'hashing.pyi', + 'hashtable.pyi', + 'index.pyi', + 'indexing.pyi', + 'internals.pyi', + 'interval.pyi', + 'join.pyi', + 'json.pyi', + 'lib.pyi', + 'missing.pyi', + 'ops.pyi', + 'ops_dispatch.pyi', + 'parsers.pyi', + 'properties.pyi', + 'reshape.pyi', + 'sas.pyi', + 'sparse.pyi', + 'testing.pyi', + 'tslib.pyi', + 'writers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs' + ) +endforeach subdir('window') diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 167695b84514c..a1b0c54d1f48c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,7 +31,28 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources( 
+sources_to_install = [ '__init__.py', - subdir: 'pandas/_libs/tslibs' -) + 'ccalendar.pyi', + 'conversion.pyi', + 'dtypes.pyi', + 'fields.pyi', + 'nattype.pyi', + 'np_datetime.pyi', + 'offsets.pyi', + 'parsing.pyi', + 'period.pyi', + 'strptime.pyi', + 'timedeltas.pyi', + 'timestamps.pyi', + 'timezones.pyi', + 'tzconversion.pyi', + 'vectorized.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/tslibs' + ) +endforeach diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index 85aa060a26406..ad15644f73a0c 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -16,3 +16,16 @@ py.extension_module( subdir: 'pandas/_libs/window', install: true ) + +sources_to_install = [ + '__init__.py', + 'aggregations.pyi', + 'indexers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/window' + ) +endforeach diff --git a/pandas/meson.build b/pandas/meson.build index f02258c98d46a..435103a954d86 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -26,7 +26,6 @@ subdir('_libs') subdirs_list = [ '_config', - '_libs', '_testing', 'api', 'arrays', From e86ed377639948c64c429059127bcf5b359ab6be Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Wed, 20 Sep 2023 15:24:13 -0400 Subject: [PATCH 107/174] RLS: 2.1.1 From fd51bb60abb8605d2f9277da832bbc3dcc69c83f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 22 Sep 2023 16:47:52 -0700 Subject: [PATCH 108/174] Backport PR #55249 on branch 2.1.x (BUG: incompatible dtype when creating string column with loc) (#55251) Backport PR #55249: BUG: incompatible dtype when creating string column with loc Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/dtypes/missing.py | 3 +++ pandas/tests/frame/indexing/test_indexing.py | 15 +++++++++++++++ pandas/tests/frame/indexing/test_set_value.py | 5 +---- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 2c9b10160d144..97aeb56924e65 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7117e34b23ca4..8760c8eeca454 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,6 +624,9 @@ def infer_fill_value(val): return np.array("NaT", dtype=DT64NS_DTYPE) elif dtype in ["timedelta", "timedelta64"]: return np.array("NaT", dtype=TD64NS_DTYPE) + return np.array(np.nan, dtype=object) + elif val.dtype.kind == "U": + return np.array(np.nan, dtype=val.dtype) return np.nan diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index b324291bab31e..370cbf0f33174 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1890,6 +1890,21 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): df.loc[key] = 1 +def test_adding_new_conditional_column() -> None: + # https://github.com/pandas-dev/pandas/issues/55025 + df = DataFrame({"x": [1]}) + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame({"x": [1], "y": ["1"]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"x": [1]}) + # try inserting something which numpy would store as 'object' + value = lambda x: x + df.loc[df["x"] == 1, "y"] = value + expected = DataFrame({"x": [1], "y": [value]}) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index cd9ffa0f129a2..32312868adacb 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -26,10 +26,7 @@ def test_set_value_resize(self, float_frame): assert float_frame._get_value("foobar", "qux") == 0 res = float_frame.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - res._set_value("foobar", "baz", "sam") + res._set_value("foobar", "baz", "sam") assert res["baz"].dtype == np.object_ res = float_frame.copy() From 526c7e7cb1baa6822d0e42ece6956b69e745d88d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:23:53 -0700 Subject: [PATCH 109/174] Backport PR #55276 on branch 2.1.x (Bump pypa/cibuildwheel from 2.15.0 to 2.16.0) (#55288) Backport PR #55276: Bump pypa/cibuildwheel from 2.15.0 to 2.16.0 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 97d78a1a9afe3..ecb30fefb9ff2 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@v2.16.0 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From eec2023f409ed8ae0d526102b65343992558ae6a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:11:34 -0700 Subject: [PATCH 110/174] Backport PR #55275 on branch 2.1.x (Update pyproject.toml - replace `output_formatting` 
with `output-formatting`) (#55291) Backport PR #55275: Update pyproject.toml - replace `output_formatting` with `output-formatting` Co-authored-by: Artur Barseghyan --- .github/workflows/package-checks.yml | 2 +- doc/source/getting_started/install.rst | 6 +++--- doc/source/whatsnew/v2.1.2.rst | 2 +- pyproject.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04abcf4ce8816..7d20ef92587d9 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ae7c9d4ea9c62..7570f9f33d265 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -247,14 +247,14 @@ Dependency Minimum Version pip ext Visualization ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[plot, output_formatting]"``. +Installable with ``pip install "pandas[plot, output-formatting]"``. ========================= ================== ================== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================== ============================================================= matplotlib 3.6.1 plot Plotting library -Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style -tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_) +Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style +tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ================== ============================================================= Computation diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97aeb56924e65..6fec66ec8d556 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,7 +29,7 @@ Bug fixes Other ~~~~~ -- +- Fixed non-working installation of optional dependency group ``output_formatting``. Replacing underscore ``_`` with a dash ``-`` fixes broken dependency resolution. A correct way to use now is ``pip install pandas[output-formatting]``. - .. 
---------------------------------------------------------------------------

diff --git a/pyproject.toml b/pyproject.toml
index 477c9be6274b3..3cfdadc268160 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,7 +80,7 @@ sql-other = ['SQLAlchemy>=1.4.36']
 html = ['beautifulsoup4>=4.11.1', 'html5lib>=1.1', 'lxml>=4.8.0']
 xml = ['lxml>=4.8.0']
 plot = ['matplotlib>=3.6.1']
-output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10']
+output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10']
 clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0']
 compression = ['zstandard>=0.17.0']
 consortium-standard = ['dataframe-api-compat>=0.1.7']

From fd76235e29e8b63fe2b0bf7294b358562d471c4d Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Tue, 26 Sep 2023 16:08:48 -0700
Subject: [PATCH 111/174] Backport PR #55300 on branch 2.1.x (TST: xfail test
 due to new numexpr version) (#55303)

Backport PR #55300: TST: xfail test due to new numexpr version

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 pandas/tests/frame/test_arithmetic.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index 878e94c15e16b..670d4d4f554a6 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -22,14 +22,12 @@
 )
 import pandas._testing as tm
 from pandas.core.computation import expressions as expr
-from pandas.core.computation.expressions import (
-    _MIN_ELEMENTS,
-    NUMEXPR_INSTALLED,
-)
+from pandas.core.computation.expressions import _MIN_ELEMENTS
 from pandas.tests.frame.common import (
     _check_mixed_float,
     _check_mixed_int,
 )
+from pandas.util.version import Version


 @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
@@ -502,10 +500,19 @@ def test_floordiv_axis0(self):
         result2 = df.floordiv(ser.values, axis=0)
         tm.assert_frame_equal(result2, expected)

-    @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed")
     @pytest.mark.parametrize("opname", ["floordiv", "pow"])
-    def test_floordiv_axis0_numexpr_path(self, opname):
+    def test_floordiv_axis0_numexpr_path(self, opname, request):
         # case that goes through numexpr and has to fall back to masked_arith_op
+        ne = pytest.importorskip("numexpr")
+        if (
+            Version(ne.__version__) >= Version("2.8.7")
+            and opname == "pow"
+            and "python" in request.node.callspec.id
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454")
+            )
+
         op = getattr(operator, opname)
         arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100

From c46c3bf268e96c0c7c534fd9b6b475c5aea82636 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 2 Oct 2023 22:54:09 +0200
Subject: [PATCH 112/174] Backport PR #55348 on branch 2.1.x (REGR: join
 segfaulting for arrow string with nulls) (#55357)

Backport PR #55348: REGR: join segfaulting for arrow string with nulls

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
---
 doc/source/whatsnew/v2.1.2.rst          | 2 +-
 pandas/core/reshape/merge.py            | 2 ++
 pandas/tests/frame/methods/test_join.py | 7 ++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 6fec66ec8d556..158cb51f05316 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@
-14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) -- +- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d36ceff800c56..74181f8dc853c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2442,6 +2442,8 @@ def _factorize_keys( .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 if how == "right": return rlab, llab, count return llab, rlab, count diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 98f3926968ad0..3b6b86056f8c7 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -158,9 +158,14 @@ def test_join_invalid_validate(left_no_dup, right_no_dup): left_no_dup.merge(right_no_dup, on="a", validate="invalid") -def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups): +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): # GH 46622 # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) left_no_dup.join( right_w_dups, on="a", From 78a5500d5d63f029301500aac31ba3908d12dbbc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:54:24 +0200 Subject: [PATCH 113/174] Backport PR #55350 on branch 2.1.x (Bump pypa/cibuildwheel from 2.16.0 to 2.16.1) (#55358) Backport PR #55350: Bump pypa/cibuildwheel from 2.16.0 to 2.16.1 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ecb30fefb9ff2..b156c3d9dfae7 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.0 + uses: pypa/cibuildwheel@v2.16.1 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 7e8398c073953f4a08c3ef2e8fc38efb266a6667 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 22:00:58 +0200 Subject: [PATCH 114/174] Backport PR #55368 on branch 2.1.x (BUG: idxmin/max raising for arrow dtypes) (#55377) BUG: idxmin/max raising for arrow dtypes (#55368) (cherry picked from commit 59616c57d6abcf0fb4eb5f81928120c7b000ba88) --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 13 +++++++++++-- pandas/tests/frame/test_reductions.py | 13 +++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 158cb51f05316..090df1489e493 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -21,7 
+21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 49305128267be..190fd8fd54e02 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -30,6 +30,7 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -1595,13 +1596,21 @@ def _reduce( pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: - result = pa.array([pa_result.as_py()], type=pa_result.type) + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) return type(self)(result) if pc.is_null(pa_result).as_py(): return self.dtype.na_value - else: + elif isinstance(pa_result, pa.Scalar): return pa_result.as_py() + else: + return pa_result def _explode(self): """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e7b6a0c0b39b0..e9c69c9d2df52 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1056,6 +1056,19 @@ def test_idxmax_numeric_only(self, numeric_only): expected = Series([1, 0, 1], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_idxmax_arrow_types(self): + # GH#55368 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" From 850ceb700a05f51c6eaba8411a1012aa050841e2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 3 Oct 2023 22:01:08 +0200 Subject: [PATCH 115/174] Backport PR #55363 on branch 2.1.x (BUG: ndim of string block incorrect with string inference) (#55376) Backport PR #55363: BUG: ndim of string block incorrect with string inference Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/internals/construction.py | 2 +- pandas/tests/frame/test_constructors.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 05aa9f38bb2b0..8bb6c6b5de7ea 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -383,7 +383,7 @@ def ndarray_to_mgr( new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), BlockPlacement(slice(i, i + 1)), - ndim=1, + ndim=2, ) for i, data in enumerate(obj_columns) ] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a8ab3ce1ba014..e1aa682d763a9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2743,6 +2743,13 @@ def test_frame_string_inference_array_string_dtype(self): df = 
DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) tm.assert_frame_equal(df, expected) + def test_frame_string_inference_block_dim(self): + # GH#55363 + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): From 2201408de39ee1a76eb4839d81a0dbbc2a9b50a1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 23:50:55 +0200 Subject: [PATCH 116/174] Backport PR #55365 on branch 2.1.x (BUG: Index.insert raising when inserting None into new string dtype) (#55383) BUG: Index.insert raising when inserting None into new string dtype (#55365) (cherry picked from commit 8664572fe63dbcb865588d9d564b49a572c3351e) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_arrow.py | 5 +++++ pandas/tests/indexes/base_class/test_reshape.py | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 090df1489e493..9c81956d14d77 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0b8d3d6b13f60..7e200dc8af374 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -579,3 +579,8 @@ def _reduce( ) else: return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..6586f5f9de480 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH#55365 + pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "pos,expected", [ From 0191caf02fe7127a34e16ee6c90ad43446460bf4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 3 Oct 2023 23:51:31 +0200 Subject: [PATCH 117/174] Backport PR #55364 on branch 2.1.x (BUG: eq not implemented for categorical and arrow backed strings) (#55381) Backport PR #55364: BUG: eq not implemented for categorical and arrow backed strings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 5 ++++- 
pandas/tests/indexes/categorical/test_equals.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 9c81956d14d77..ff2df2d4256b6 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -21,6 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 190fd8fd54e02..2e3ad8bc13091 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -628,7 +629,9 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) elif is_scalar(other): try: diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..a8353f301a3c3 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) From ad5fe9c3bf7acc59b15fbd78549bb5aa138a8b5d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:26:35 -1000 Subject: [PATCH 118/174] Backport PR #55138: BUG: Silence `Period[B]` warnings in plotting code (#55378) * Backport PR #55138: BUG: Silence `Period[B]` warnings in plotting code * Don't build in parallel to see error * Remove period call * Another suppress? 
--------- Co-authored-by: torext --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/plotting/_matplotlib/converter.py | 70 +++++++++++++++++------ pandas/tests/plotting/frame/test_frame.py | 9 +++ pandas/tests/plotting/test_series.py | 7 +++ 4 files changed, 69 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ff2df2d4256b6..60b628c5d658c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) -- +- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index cd7823ba15e44..be0ded0ecdf57 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,6 +14,7 @@ Final, cast, ) +import warnings import matplotlib.dates as mdates from matplotlib.ticker import ( @@ -243,18 +244,29 @@ def _convert_1d(values, units, axis): if not hasattr(axis, "freq"): raise TypeError("Axis must have `freq` set to convert to Periods") valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) - if isinstance(values, valid_types) or is_integer(values) or is_float(values): - return get_datevalue(values, axis.freq) - elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 - elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == "period": - # https://github.com/pandas-dev/pandas/issues/24304 - # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 - elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] return values @@ -567,14 +579,30 @@ def _daily_finder(vmin, vmax, freq: BaseOffset): # save this for later usage vmin_orig = vmin - (vmin, vmax) = ( - Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq), - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + 
warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + (vmin, vmax) = ( + Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq), + ) assert isinstance(vmin, Period) assert isinstance(vmax, Period) span = vmax.ordinal - vmin.ordinal + 1 - dates_ = period_range(start=vmin, end=vmax, freq=freq) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range(start=vmin, end=vmax, freq=freq) + # Initialize the output info = np.zeros( span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] @@ -1072,7 +1100,13 @@ def __call__(self, x, pos: int = 0) -> str: fmt = self.formatdict.pop(x, "") if isinstance(fmt, np.bytes_): fmt = fmt.decode("utf-8") - period = Period(ordinal=int(x), freq=self.freq) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) assert isinstance(period, Period) return period.strftime(fmt) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 11008646f0ad4..b97f1d64d57fd 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2487,6 +2487,15 @@ def test_secondary_y(self, secondary_y): assert ax.get_ylim() == (0, 100) assert ax.get_yticks()[0] == 99 + @pytest.mark.slow + def test_plot_no_warning(self): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + df = tm.makeTimeDataFrame() + with tm.assert_produces_warning(False): + _ = df.plot() + _ = df.T.plot() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6f0afab53c267..768fce023e6e0 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -973,3 +973,10 @@ def test_series_none_color(self): ax = series.plot(color=None) expected = _unpack_cycler(mpl.pyplot.rcParams)[:1] _check_colors(ax.get_lines(), linecolors=expected) + + @pytest.mark.slow + def test_plot_no_warning(self, ts): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + with tm.assert_produces_warning(False): + _ = ts.plot() From f4359ecc3ae922b6ee7f8c3ebb942f55bb6a23f0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:50:24 +0200 Subject: [PATCH 119/174] Backport PR #55366 on branch 2.1.x (BUG: Inserting ndim=0 array does not infer string dtype) (#55396) Backport PR #55366: BUG: Inserting ndim=0 array does not infer string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/construction.py | 5 +++++ pandas/tests/frame/indexing/test_indexing.py | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 60b628c5d658c..0312ec3682f11 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in 
:meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..e661d590ab330 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -562,7 +562,12 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..6d4eedb49ff83 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1905,6 +1905,19 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. 
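The run of backports above (patches 112 through 119) all tighten pandas 2.1.2's handling of arrow-backed and inferred string dtypes. A quick way to see what they fix is to replay the scenarios from the regression tests added in those patches. This is a minimal smoke-test sketch, assuming pandas >= 2.1.2 with pyarrow installed; it exercises nothing beyond what the tests above already cover:

    import pandas as pd

    # DataFrame.idxmin/idxmax previously raised for arrow dtypes (GH 55368)
    df = pd.DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]")
    print(df.idxmax())  # a -> 1, b -> 0 after the fix

    # Index.insert previously raised when given None for
    # dtype="string[pyarrow_numpy]" (GH 55365)
    idx = pd.Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
    print(idx.insert(-1, None))  # Index(['a', 'b', None, 'c'], ...)

    # a new conditional column now picks up the inferred string dtype
    # when future.infer_string is enabled (GH 55366)
    with pd.option_context("future.infer_string", True):
        df2 = pd.DataFrame({"x": [1]})
        df2.loc[df2["x"] == 1, "y"] = "1"
        print(df2.dtypes)  # "y" becomes string[pyarrow_numpy]
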
From db8e3e8365715214fe84f8c68c573343b95897ab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:45:10 +0200 Subject: [PATCH 120/174] Backport PR #55369 on branch 2.1.x (MAINT: Remove `np.int_` and `np.uint`) (#55387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55369: MAINT: Remove `np.int_` and `np.uint` Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- doc/source/user_guide/enhancingperf.rst | 2 +- pandas/compat/numpy/__init__.py | 15 +++++++++++++++ pandas/core/algorithms.py | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 4 +++- pandas/tests/dtypes/cast/test_infer_dtype.py | 2 +- pandas/tests/extension/base/dim2.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 4 ++-- pandas/tests/frame/methods/test_shift.py | 9 +++++---- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_reductions.py | 8 ++++++-- pandas/tests/groupby/aggregate/test_cython.py | 2 +- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 4 ++-- pandas/tests/groupby/test_min_max.py | 2 +- pandas/tests/groupby/test_quantile.py | 2 +- pandas/tests/groupby/test_size.py | 2 +- pandas/tests/groupby/test_value_counts.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 4 +++- pandas/tests/indexes/datetimes/test_indexing.py | 4 +++- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexing/multiindex/test_partial.py | 2 +- pandas/tests/indexing/test_loc.py | 12 ++++++------ pandas/tests/indexing/test_partial.py | 4 ++-- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/plotting/test_series.py | 7 +++++-- pandas/tests/reshape/merge/test_merge.py | 4 ++-- pandas/tests/scalar/test_na_scalar.py | 11 ++++++----- pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/test_repr.py | 4 ++-- pyproject.toml | 2 ++ 31 files changed, 81 insertions(+), 49 deletions(-) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index bc2f4420da784..c4721f3a6b09c 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``. 
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): ...: assert (col_a.dtype == np.float64 - ...: and col_b.dtype == np.float64 and col_N.dtype == np.int_) + ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int)) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index d376fa4c1919e..1b974a92f8188 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -21,6 +21,21 @@ ) +np_long: type +np_ulong: type + +if _nlv >= Version("2.0.0.dev0"): + try: + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + __all__ = [ "np", "_np_version", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d74bb8b83e4e..5f9fced2e415c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1641,7 +1641,7 @@ def safe_sort( else: mask = None else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index dd8c3eda9ed05..71156a4d84ae5 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd @@ -51,7 +53,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int_) + assert isinstance(getattr(s, op)(), np_long) elif op == "count": # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index ed08df74461ef..50eaa1f4d8713 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -170,7 +170,7 @@ def test_infer_dtype_from_scalar(value, expected): @pytest.mark.parametrize( "arr, expected", [ - ([1], np.int_), + ([1], np.dtype(int)), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), (np.array([[1.0, 2.0]]), np.float64), diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index f1d787041d10b..bff4fbd73984d 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -237,7 +237,7 @@ def get_reduction_result_dtype(dtype): return NUMPY_INT_TO_DTYPE[np.dtype(int)] else: # i.e. 
dtype.kind == "u" - return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] + return NUMPY_INT_TO_DTYPE[np.dtype("uint")] if method in ["sum", "prod"]: # std and var are not dtype-preserving diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 6d4eedb49ff83..de8df15a9d747 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -108,11 +108,11 @@ def test_setitem_list(self, float_frame): data["A"] = newcolumndata def test_setitem_list2(self): - df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int) df.loc[1, ["tt1", "tt2"]] = [1, 2] result = df.loc[df.index[1], ["tt1", "tt2"]] - expected = Series([1, 2], df.columns, dtype=np.int_, name=1) + expected = Series([1, 2], df.columns, dtype=int, name=1) tm.assert_series_equal(result, expected) df["tt1"] = df["tt2"] = "0" diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 1e88152137b1f..7f31c3985d4df 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long import pandas.util._test_decorators as td import pandas as pd @@ -471,22 +472,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self): df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(2, axis=1, fill_value=np.int_(0)) + result = df3.shift(2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([-1, -1, 0, 1], axis=1) - expected.iloc[:, :2] = np.int_(0) + expected.iloc[:, :2] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) # Case with periods < 0 df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + result = df3.shift(-2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([2, 3, -1, -1], axis=1) - expected.iloc[:, -2:] = np.int_(0) + expected.iloc[:, -2:] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e1aa682d763a9..a291b906d6710 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1823,7 +1823,7 @@ def test_constructor_single_value(self): DataFrame("a", [1, 2], ["a", "c"], float) def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name + intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e9c69c9d2df52..4c83c810e8cec 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,10 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import ( + np_long, + np_ulong, +) import pandas.util._test_decorators as td import pandas as pd @@ -1712,11 +1716,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np.int_), + ("prod", np.int8, 1, np_long), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", 
np.uint8, 0, np.uint64), - ("prod", np.uint8, 1, np.uint), + ("prod", np.uint8, 1, np_ulong), ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index f917f567e1ce3..ff50628aad17d 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -229,7 +229,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) - grps = np.arange(0, 25, 5, dtype=np.int_) + grps = np.arange(0, 25, 5, dtype=int) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None, numeric_only=True diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..ac58701f5fa39 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -822,7 +822,7 @@ def test_nlargest_and_smallest_noop(data, groups, dtype, method): data = list(reversed(data)) ser = Series(data, name="a") result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bcd8323a26767..78a0497ef611e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2977,12 +2977,12 @@ def test_groupby_reduce_period(): res = gb.max() expected = ser[-10:] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) res = gb.min() expected = ser[:10] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..30c7e1df1e691 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -108,7 +108,7 @@ def test_max_inat_not_all_na(): # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_series_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 165d72bf3e878..5a12f9a8e0e35 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -468,7 +468,7 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 76b26c04f9f3a..c275db9d1788c 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -39,7 +39,7 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): if sort: expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 7c50124e57e29..2bac76967dab3 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -993,7 +993,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values): result = gp.value_counts(sort=True, normalize=normalize) expected = DataFrame( { - "level_0": np.array([4, 4, 5], dtype=np.int_), + "level_0": np.array([4, 4, 5], dtype=int), "A": [1, 1, 2], "level_2": [8, 8, 7], "B": [1, 3, 2], diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index aa4954ff0ba85..e5e6d99c13e94 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DataFrame, @@ -54,7 +56,7 @@ def test_time_overflow_for_32bit_machines(self): # (which has value 1e9) and since the max value for np.int32 is ~2e9, # and since those machines won't promote np.int32 to np.int64, we get # overflow. - periods = np.int_(1000) + periods = np_long(1000) idx1 = date_range(start="2000", periods=periods, freq="S") assert len(idx1) == periods diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 8e1b41095e056..cfbf1a75b25a8 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DatetimeIndex, @@ -91,7 +93,7 @@ def test_dti_business_getitem(self, freq): assert fancy_indexed.freq is None # 32-bit vs. 
64-bit platforms - assert rng[4] == rng[np.int_(4)] + assert rng[4] == rng[np_long(4)] @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem_matplotlib_hackaround(self, freq): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bc04c1c6612f4..8fd1e296fb79a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -457,7 +457,7 @@ def test_fancy(self, simple_index): ["string", "int64", "int32", "uint64", "uint32", "float64", "float32"], indirect=True, ) - @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + @pytest.mark.parametrize("dtype", [int, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) empty_index = type(index)([], dtype=index.dtype) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index de989ad550f2b..081da385ebcc3 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -166,7 +166,7 @@ def test_getitem_intkey_leading_level( mi = ser.index assert isinstance(mi, MultiIndex) if dtype is int: - assert mi.levels[0].dtype == np.int_ + assert mi.levels[0].dtype == np.dtype(int) else: assert mi.levels[0].dtype == np.float64 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d0b6adfda0241..8b2730b3ab082 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -442,7 +442,7 @@ def test_loc_to_fail(self): ) msg = ( - rf"\"None of \[Index\(\[1, 2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[1, 2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -460,7 +460,7 @@ def test_loc_to_fail2(self): s.loc[-1] msg = ( - rf"\"None of \[Index\(\[-1, -2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-1, -2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -476,7 +476,7 @@ def test_loc_to_fail2(self): s["a"] = 2 msg = ( - rf"\"None of \[Index\(\[-2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -493,7 +493,7 @@ def test_loc_to_fail3(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - rf"\"None of \[Index\(\[3\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[3\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -510,7 +510,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.int_().dtype}')] are in the [index]" + msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -1209,7 +1209,7 @@ def test_loc_setitem_empty_append_raises(self): df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.int_().dtype}'\)\] " + rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 8e5cde42ec91b..8f499644f1013 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -402,7 +402,7 @@ def test_series_partial_set(self): # raises 
as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}'\)\] " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -483,7 +483,7 @@ def test_series_partial_set_with_name(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}', " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}', " r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b7108896f01ed..4b23829a554aa 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -468,7 +468,7 @@ def test_set_change_dtype(self, mgr): np.random.default_rng(2).standard_normal(N).astype(int), ) idx = mgr2.items.get_loc("quux") - assert mgr2.iget(idx).dtype == np.int_ + assert mgr2.iget(idx).dtype == np.dtype(int) mgr2.iset( mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c79fdd9145a6a..9f7840588f89e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -47,7 +47,7 @@ def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( @@ -87,7 +87,7 @@ def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 768fce023e6e0..7c34567dd2ebe 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -6,7 +6,10 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import np_version_gte1p24 +from pandas.compat.numpy import ( + np_long, + np_version_gte1p24, +) import pandas.util._test_decorators as td import pandas as pd @@ -561,7 +564,7 @@ def test_plot_fails_with_dupe_color_and_style(self): [ ["scott", 20], [None, 20], - [None, np.int_(20)], + [None, np_long(20)], [0.5, np.linspace(-100, 100, 20)], ], ) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c4067363d934e..304ec37b1e453 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -366,7 +366,7 @@ def test_merge_join_key_dtype_cast(self): lkey = np.array([1]) rkey = np.array([2]) df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer") - assert df["key_0"].dtype == np.int_ + assert df["key_0"].dtype == np.dtype(int) def test_handle_join_key_pass_array(self): left = DataFrame( @@ -390,7 +390,7 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") - expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0") + expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], 
dtype=int, name="key_0") tm.assert_series_equal(merged["key_0"], expected) left = DataFrame({"value": np.arange(3)}) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 287b7557f50f9..44ce5c79db348 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.missing import NA +from pandas.compat.numpy import np_long from pandas.core.dtypes.common import is_scalar @@ -102,9 +103,9 @@ def test_comparison_ops(comparison_op, other): -0.0, False, np.bool_(False), - np.int_(0), + np_long(0), np.float64(0), - np.int_(-0), + np_long(-0), np.float64(-0), ], ) @@ -123,7 +124,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] + "value", [1, 1.0, True, np.bool_(True), np_long(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -133,14 +134,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float64, np.bool_, np.int_)): + elif not isinstance(value, (np.float64, np.bool_, np_long)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np_long(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 2ab1cd13a31d8..c29c3297eb63b 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -198,7 +198,7 @@ def test_reindex_int(datetime_series): # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - assert reindexed_int.dtype == np.int_ + assert reindexed_int.dtype == np.dtype(int) def test_reindex_bool(datetime_series): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index f294885fb8f4d..be68918d2a380 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -348,7 +348,7 @@ def test_categorical_series_repr(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" +Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -374,7 +374,7 @@ def test_categorical_series_repr_ordered(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" +Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 
6 < 7 < 8 < 9]"""

         assert repr(s) == exp

diff --git a/pyproject.toml b/pyproject.toml
index 3cfdadc268160..bfebb2762ff6e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -501,6 +501,8 @@ filterwarnings = [
   "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr",
   "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet",
   "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec",
+  # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged
+  "ignore:.*In the future `np.long` will be defined as.*:FutureWarning",
 ]
 junit_family = "xunit2"
 markers = [

From 25655e2929dd6d196ee61219af7da9909d60f4dc Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Wed, 4 Oct 2023 19:06:32 +0200
Subject: [PATCH 121/174] Backport PR #55347 on branch 2.1.x (BUG: interpolate
 raising wrong error for ea) (#55382)

BUG: interpolate raising wrong error for ea (#55347)

(cherry picked from commit f196319e52b5823e89648af0c9d26dfdbe821d0c)
---
 doc/source/whatsnew/v2.1.2.rst                 | 2 ++
 pandas/core/arrays/base.py                     | 1 -
 pandas/tests/frame/methods/test_interpolate.py | 6 ++++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index c699ff2df1f30..524d71a56a2e6 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -24,8 +24,10 @@ Bug fixes
 - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
 - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
 - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
+- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
 - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
 - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
+-

 .. ---------------------------------------------------------------------------
 ..
_whatsnew_212.other: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a21253db59323..bfd6ae361e1e8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -889,7 +889,6 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, copy: bool, **kwargs, ) -> Self: diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() From f07deeff8aa812f47acde00c0f7a9a35e106fe69 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 4 Oct 2023 20:20:06 +0200 Subject: [PATCH 122/174] Backport PR #55397 on branch 2.1.x (MAINT: Add warning filter for `np.long`) (#55402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55397: MAINT: Add warning filter for `np.long` Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- pandas/compat/numpy/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 1b974a92f8188..51c9892b64a08 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,6 @@ """ support numpy compatibility across versions """ +import warnings + import numpy as np from pandas.util.version import Version @@ -26,8 +28,14 @@ if _nlv >= Version("2.0.0.dev0"): try: - np_long = np.long # type: ignore[attr-defined] - np_ulong = np.ulong # type: ignore[attr-defined] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*In the future `np\.long` will be defined as.*", + FutureWarning, + ) + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] except AttributeError: np_long = np.int_ np_ulong = np.uint From 8fd8cf45723258dd69454c34f5687923f0d532ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 5 Oct 2023 00:01:18 +0200 Subject: [PATCH 123/174] Backport PR #55367 on branch 2.1.x (BUG: all not ignoring na when skipna=True) (#55407) BUG: all not ignoring na when skipna=True (#55367) * BUG: all not ignoring na when skipna=True * Update v2.1.2.rst (cherry picked from commit 364c9cb65acf47119dc251b25226ab415cc9a100) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_arrow.py | 8 +++++--- pandas/tests/reductions/test_reductions.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index c699ff2df1f30..524d71a56a2e6 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -26,6 +26,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with 
``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7e200dc8af374..7d282dc121332 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -571,9 +571,11 @@ def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): if name in ["any", "all"]: - arr = pc.and_kleene( - pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") - ) + if not skipna and name == "all": + nas = pc.invert(pc.is_null(self._pa_array)) + arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") return ArrowExtensionArray(arr)._reduce( name, skipna=skipna, keepdims=keepdims, **kwargs ) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 021252500e814..560b2377ada70 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1087,7 +1087,8 @@ def test_any_all_pyarrow_string(self): ser = Series([None, "a"], dtype="string[pyarrow_numpy]") assert ser.any() - assert not ser.all() + assert ser.all() + assert not ser.all(skipna=False) ser = Series([None, ""], dtype="string[pyarrow_numpy]") assert not ser.any() From c0f932b4cf5257cf5ddfb11c73df900c4d06e780 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 6 Oct 2023 15:51:27 +0200 Subject: [PATCH 124/174] Backport PR #55414 on branch 2.1.x (CI: Pin Cython) (#55418) Backport PR #55414: CI: Pin Cython Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 6 +++--- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index c1a0352b5f5c0..96210d878fa43 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index python -m pip list diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 5db145e31ebc5..fe8115fa8458b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 2d50056098a0f..36d6f6d223668 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 7fd3a65ec91f8..9795a1fb39c6f 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer[toml] - meson[ninja]=1.2.1 - meson-python=0.13.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 893341350f4ef..c259286a5359c 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.2.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d56fb9ac9fe90..d7ef86ac2aa6b 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 2a7312cefe619..91052f8805952 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 
1843e6bd8005f..c4d7418b0a5b5 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 4923c94ab08f3..db0723cd3b8fa 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index c893beced8fb4..2c5aebb794c46 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 From 81a192b7b2976241850413a49a661cbc0d79ac58 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 7 Oct 2023 13:03:50 +0200 Subject: [PATCH 125/174] Backport PR #55380 on branch 2.1.x (CI: Exclude benchmarks directory when publishing docs) (#55434) Backport PR #55380: CI: Exclude benchmarks directory when publishing docs Co-authored-by: Marc Garcia --- .github/workflows/docbuild-and-upload.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index e05f12ac6416a..6fa98ff352d38 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -67,7 +67,7 @@ jobs: run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs From 748655093fa478aa9b44b6a5857161d07d077549 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 9 Oct 2023 21:40:35 +0200 Subject: [PATCH 126/174] Backport PR #55450 on branch 2.1.x (Bump pypa/cibuildwheel from 2.16.1 to 2.16.2) (#55457) Backport PR #55450: Bump pypa/cibuildwheel from 2.16.1 to 2.16.2 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index b156c3d9dfae7..6759f7b82eacf 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.1 + uses: pypa/cibuildwheel@v2.16.2 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 423e7b9d6e7ca995154b88127a5623c04ca43892 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 10 Oct 2023 02:36:30 +0200 Subject: [PATCH 127/174] Backport PR #55460 on branch 2.1.x (COMPAT: Update old numpy aliases) (#55463) Backport PR #55460: COMPAT: Update old numpy aliases 
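
For context: the hunks in this patch swap NumPy dtype spellings that NumPy 2.0
deprecates or removes ("a10", "unicode") for their modern equivalents ("S10",
np.str_). A minimal sketch of the equivalences, assuming NumPy 1.x where both
spellings still resolve:

    import numpy as np

    # "a10" is the legacy alias for a 10-byte bytestring dtype; "S10" is the
    # modern spelling, and the two construct equal dtypes under NumPy 1.x.
    assert np.dtype("S10") == np.dtype("a10")

    # "unicode" is removed as a dtype name in NumPy 2.0; np.str_ (or "U")
    # builds the same flexible string dtype.
    assert np.dtype(np.str_) == np.dtype("U")

Because each pair constructs an identical dtype, the expected values in the
updated tests are unchanged.
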
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/stata.py | 2 +- pandas/tests/frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/methods/test_select_dtypes.py | 4 +--- pandas/tests/frame/methods/test_to_records.py | 2 +- pandas/tests/indexes/base_class/test_setops.py | 10 +++++----- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/reshape/concat/test_append.py | 4 ++-- pandas/tests/series/methods/test_astype.py | 4 ++-- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5b190619c4ac3..0a02da09c9e15 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -977,7 +977,7 @@ def __init__(self) -> None: # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [(i, np.dtype(f"S{i}")) for i in range(1, 245)] + [ (251, np.dtype(np.int8)), (252, np.dtype(np.int16)), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 7dffa7bb242d5..55d4a6c3b39fa 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -281,7 +281,7 @@ def test_frame_from_records_utc(self): def test_from_records_to_records(self): # from numpy documentation - arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr = np.zeros((2,), dtype=("i4,f4,S10")) arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] DataFrame.from_records(arr) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 67dd5b6217187..a38d2c6fd016a 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self): expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"] - ) + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 7df52a1fb903f..27939856503e5 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -253,7 +253,7 @@ def test_to_records_with_categorical(self): ), # Pass in a dtype instance. 
( - {"column_dtypes": np.dtype("unicode")}, + {"column_dtypes": np.dtype(np.str_)}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 21d1630af9de2..488f79eea0d11 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -182,7 +182,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), False, ), @@ -190,7 +190,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (1, "B"), (2, "A"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -198,7 +198,7 @@ def test_symmetric_difference(self): "union", np.array( [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort): index1 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) index2 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 437012d836941..663d0a195fa55 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -599,7 +599,7 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype("unicode") + df_mixed.columns = df_mixed.columns.astype(np.str_) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 386f363c81557..81ca227fb7afb 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort): tm.assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1 = np.zeros((2,), dtype=("i4,f4,S10")) arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2 = np.zeros((3,), dtype=("i4,f4,S10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b6c409397c9fb..03fc6cba2902a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -403,12 +403,12 @@ def test_astype_unicode(self): # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" ser = Series([item.encode()]) - result = ser.astype("unicode") + result = ser.astype(np.str_) expected = Series([item]) tm.assert_series_equal(result, expected) for ser in test_series: - res = ser.astype("unicode") + res = ser.astype(np.str_) expec = ser.map(str) tm.assert_series_equal(res, expec) From 0ee6fc9b1a793cbfc087c4ced8f183dd4a4986f5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 13 Oct 2023 
02:57:33 +0200 Subject: [PATCH 128/174] Backport PR #55488 on branch 2.1.x (BLD: Pin numpy to < 2) (#55500) Backport PR #55488: BLD: Pin numpy to < 2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/actions/build_pandas/action.yml | 4 ++-- .github/workflows/unit-tests.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- pyproject.toml | 8 ++++---- requirements-dev.txt | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 73d7723e2fb49..3ee10efaaf96f 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -25,8 +25,8 @@ runs: - name: Build Pandas run: | if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v + pip install -e . --no-build-isolation -v --no-deps else - pip install . --no-build-isolation -v + pip install . --no-build-isolation -v --no-deps fi shell: bash -el {0} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 96210d878fa43..a7eead7aafbf7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -348,7 +348,7 @@ jobs: python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip install -ve . --no-build-isolation --no-index + python -m pip install -ve . 
--no-build-isolation --no-index --no-deps python -m pip list - name: Run Tests diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index fe8115fa8458b..1cbd6c24debb2 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 36d6f6d223668..9ace8e10e0e28 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index c259286a5359c..1a770d74043bf 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d7ef86ac2aa6b..57fcc2c859a5f 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 91052f8805952..f340d17395d34 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4 + - numpy=1.22.4, <2 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index c4d7418b0a5b5..0e234a0de6969 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index db0723cd3b8fa..a2f4d6395783a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -21,7 +21,7 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy + - numpy<2 - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 2c5aebb794c46..6b091df00fae7 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/environment.yml b/environment.yml index 6f80e9af58643..98c39378d1ae1 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/pyproject.toml b/pyproject.toml index bfebb2762ff6e..e9e3e2f67e2f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.26.0,<2; python_version>='3.12'", "versioneer[toml]" ] @@ -29,9 +29,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; 
python_version<'3.11'",
-    "numpy>=1.23.2; python_version=='3.11'",
-    "numpy>=1.26.0; python_version>='3.12'",
+    "numpy>=1.22.4,<2; python_version<'3.11'",
+    "numpy>=1.23.2,<2; python_version=='3.11'",
+    "numpy>=1.26.0,<2; python_version>='3.12'",
     "python-dateutil>=2.8.2",
     "pytz>=2020.1",
     "tzdata>=2022.1"
 ]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 01e537f7ea28f..855eaccfd4f5f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -12,7 +12,7 @@ pytest-xdist>=2.2.0
 pytest-asyncio>=0.17.0
 coverage
 python-dateutil
-numpy
+numpy<2
 pytz
 beautifulsoup4>=4.11.1
 blosc

From 2a32088dd28440a4b5ccbca4977d168ef8226de0 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Sun, 15 Oct 2023 18:05:32 +0200
Subject: [PATCH 129/174] Backport PR #55337 on branch 2.1.x (DOC: Adjust user
 guide for CoW docs) (#55532)

Backport PR #55337: DOC: Adjust user guide for CoW docs

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
---
 doc/source/user_guide/copy_on_write.rst | 130 ++++++++++++++----------
 1 file changed, 75 insertions(+), 55 deletions(-)

diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
index 59bdb1926895f..d0c57b56585db 100644
--- a/doc/source/user_guide/copy_on_write.rst
+++ b/doc/source/user_guide/copy_on_write.rst
@@ -7,8 +7,8 @@ Copy-on-Write (CoW)
 *******************

 Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
-optimizations that become possible through CoW are implemented and supported. A complete list
-can be found at :ref:`Copy-on-Write optimizations <copy_on_write.optimizations>`.
+optimizations that become possible through CoW are implemented and supported. All possible
+optimizations are supported starting from pandas 2.1.

 We expect that CoW will be enabled by default in version 3.0.

@@ -154,6 +154,77 @@ With copy on write this can be done by using ``loc``.

     df.loc[df["bar"] > 5, "foo"] = 100

+Read-only NumPy arrays
+----------------------
+
+Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array
+shares data with the initial DataFrame:
+
+The array is a copy if the initial DataFrame consists of more than one array:
+
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
+    df.to_numpy()
+
+The array shares data with the DataFrame if the DataFrame consists of only one NumPy array:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    df.to_numpy()
+
+This array is read-only, which means that it can't be modified inplace:
+
+.. ipython:: python
+    :okexcept:
+
+    arr = df.to_numpy()
+    arr[0, 0] = 100
+
+The same holds true for a Series, since a Series always consists of a single array.
+
+There are two potential solutions to this:
+
+- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array.
+- Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so
+  it should be used with caution.
+
+.. ipython:: python
+
+    arr = df.to_numpy()
+    arr.flags.writeable = True
+    arr[0, 0] = 100
+    arr
+
+Patterns to avoid
+-----------------
+
+No defensive copy will be performed if two objects share the same data while
+you are modifying one object inplace.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = df.reset_index()
+    df2.iloc[0, 0] = 100
+
+This creates two objects that share data and thus the setitem operation will trigger a
+copy. 
This is not necessary if the initial object ``df`` isn't needed anymore. +Simply reassigning to the same variable will invalidate the reference that is +held by the object. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = df.reset_index() + df.iloc[0, 0] = 100 + +No copy is necessary in this example. +Creating multiple references keeps unnecessary references alive +and thus will hurt performance with Copy-on-Write. + .. _copy_on_write.optimizations: Copy-on-Write optimizations @@ -161,59 +232,8 @@ Copy-on-Write optimizations A new lazy copy mechanism that defers the copy until the object in question is modified and only if this object shares data with another object. This mechanism was added to -following methods: - - - :meth:`DataFrame.reset_index` / :meth:`Series.reset_index` - - :meth:`DataFrame.set_index` - - :meth:`DataFrame.set_axis` / :meth:`Series.set_axis` - - :meth:`DataFrame.set_flags` / :meth:`Series.set_flags` - - :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis` - - :meth:`DataFrame.reindex` / :meth:`Series.reindex` - - :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` - - :meth:`DataFrame.assign` - - :meth:`DataFrame.drop` - - :meth:`DataFrame.dropna` / :meth:`Series.dropna` - - :meth:`DataFrame.select_dtypes` - - :meth:`DataFrame.align` / :meth:`Series.align` - - :meth:`Series.to_frame` - - :meth:`DataFrame.rename` / :meth:`Series.rename` - - :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix` - - :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix` - - :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates` - - :meth:`DataFrame.droplevel` / :meth:`Series.droplevel` - - :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels` - - :meth:`DataFrame.between_time` / :meth:`Series.between_time` - - :meth:`DataFrame.filter` / :meth:`Series.filter` - - :meth:`DataFrame.head` / :meth:`Series.head` - - :meth:`DataFrame.tail` / :meth:`Series.tail` - - :meth:`DataFrame.isetitem` - - :meth:`DataFrame.pipe` / :meth:`Series.pipe` - - :meth:`DataFrame.pop` / :meth:`Series.pop` - - :meth:`DataFrame.replace` / :meth:`Series.replace` - - :meth:`DataFrame.shift` / :meth:`Series.shift` - - :meth:`DataFrame.sort_index` / :meth:`Series.sort_index` - - :meth:`DataFrame.sort_values` / :meth:`Series.sort_values` - - :meth:`DataFrame.squeeze` / :meth:`Series.squeeze` - - :meth:`DataFrame.swapaxes` - - :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel` - - :meth:`DataFrame.take` / :meth:`Series.take` - - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp` - - :meth:`DataFrame.to_period` / :meth:`Series.to_period` - - :meth:`DataFrame.truncate` - - :meth:`DataFrame.iterrows` - - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - - :meth:`DataFrame.fillna` / :meth:`Series.fillna` - - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate` - - :meth:`DataFrame.ffill` / :meth:`Series.ffill` - - :meth:`DataFrame.bfill` / :meth:`Series.bfill` - - :meth:`DataFrame.where` / :meth:`Series.where` - - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` - - :meth:`DataFrame.astype` / :meth:`Series.astype` - - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes` - - :meth:`DataFrame.join` - - :meth:`DataFrame.eval` - - :func:`concat` - - :func:`merge` +methods that don't require a copy of the underlying data. Popular examples are :meth:`DataFrame.drop` for ``axis=1`` +and :meth:`DataFrame.rename`. 
These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. From 5933c60cf75c79e4a5fd739d9af9c8a25e7b7f9b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 19:03:42 +0200 Subject: [PATCH 130/174] Backport PR #55362 on branch 2.1.x (BUG: rank raising for arrow string dtypes) (#55406) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 31 ++++++++++++++++++++----- pandas/core/arrays/string_arrow.py | 30 ++++++++++++++++++++++++ pandas/tests/frame/methods/test_rank.py | 12 ++++++++++ 4 files changed, 68 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 524d71a56a2e6..ddd1f95c56aea 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -27,6 +27,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2e3ad8bc13091..85351945cf29c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1712,7 +1712,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1721,9 +1721,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. - """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1738,7 +1735,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1777,7 +1774,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. 
+ """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7d282dc121332..9b3245183bd57 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -48,6 +48,7 @@ if TYPE_CHECKING: from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -444,6 +445,31 @@ def _str_rstrip(self, to_strip=None): result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" @@ -527,6 +553,10 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + elif not isinstance(result, np.ndarray): + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) From 0c17c961b18da868fe14a096f046be8f55cd72b3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 20:54:22 +0200 Subject: [PATCH 131/174] Backport PR #55384 on branch 2.1.x (BUG: idxmax raising for arrow strings) (#55531) BUG: idxmax raising for arrow strings (#55384) (cherry picked from commit 68e3c4b2f855e6e9a8469aeca6eb73ae60327160) --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/core/arrays/string_arrow.py | 11 +++++++++++ pandas/tests/frame/test_reductions.py | 9 +++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85351945cf29c..636a22fcffe3d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1596,6 +1596,15 @@ def _reduce( ------ TypeError : subclass does not define reductions """ + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _reduce_calc( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): pa_result = 
self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: @@ -1606,7 +1615,7 @@ def _reduce( [pa_result], type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), ) - return type(self)(result) + return result if pc.is_null(pa_result).as_py(): return self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9b3245183bd57..2eef240af53f8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -445,6 +445,17 @@ def _str_rstrip(self, to_strip=None): result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("argmin", "argmax") and isinstance(result, pa.Array): + return self._convert_int_dtype(result) + elif isinstance(result, pa.Array): + return type(self)(result) + else: + return result + def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 4c83c810e8cec..0b501903ad71f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1073,6 +1073,15 @@ def test_idxmax_arrow_types(self): expected = Series([2, 1], index=["a", "b"]) tm.assert_series_equal(result, expected) + df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]") + result = df.idxmax(numeric_only=False) + expected = Series([1], index=["a"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin(numeric_only=False) + expected = Series([2], index=["a"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" From c9053cc4ca84a73581d32338503ed410b9ce41c0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 16 Oct 2023 20:53:00 +0200 Subject: [PATCH 132/174] Backport PR #55540 on branch 2.1.x (MAINT: Partially revert `np.int_` changes) (#55545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55540: MAINT: Partially revert `np.int_` changes Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com> --- pandas/tests/arrays/boolean/test_reduction.py | 4 +--- pandas/tests/frame/methods/test_shift.py | 9 ++++----- pandas/tests/frame/test_reductions.py | 8 ++------ pandas/tests/plotting/test_series.py | 7 ++----- pandas/tests/scalar/test_na_scalar.py | 11 +++++------ 5 files changed, 14 insertions(+), 25 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 71156a4d84ae5..dd8c3eda9ed05 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat.numpy import np_long - import pandas as pd @@ -53,7 +51,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np_long) + assert isinstance(getattr(s, op)(), np.int_) elif op == "count": # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py 
index 7f31c3985d4df..1e88152137b1f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,7 +1,6 @@ import numpy as np import pytest -from pandas.compat.numpy import np_long import pandas.util._test_decorators as td import pandas as pd @@ -472,22 +471,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self): df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(2, axis=1, fill_value=np_long(0)) + result = df3.shift(2, axis=1, fill_value=np.int_(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([-1, -1, 0, 1], axis=1) - expected.iloc[:, :2] = np_long(0) + expected.iloc[:, :2] = np.int_(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) # Case with periods < 0 df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(-2, axis=1, fill_value=np_long(0)) + result = df3.shift(-2, axis=1, fill_value=np.int_(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([2, 3, -1, -1], axis=1) - expected.iloc[:, -2:] = np_long(0) + expected.iloc[:, -2:] = np.int_(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0b501903ad71f..61d7c125ee5e7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,10 +10,6 @@ IS64, is_platform_windows, ) -from pandas.compat.numpy import ( - np_long, - np_ulong, -) import pandas.util._test_decorators as td import pandas as pd @@ -1725,11 +1721,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np_long), + ("prod", np.int8, 1, np.int_), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", np.uint8, 0, np.uint64), - ("prod", np.uint8, 1, np_ulong), + ("prod", np.uint8, 1, np.uint), ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 7c34567dd2ebe..768fce023e6e0 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -6,10 +6,7 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import ( - np_long, - np_version_gte1p24, -) +from pandas.compat.numpy import np_version_gte1p24 import pandas.util._test_decorators as td import pandas as pd @@ -564,7 +561,7 @@ def test_plot_fails_with_dupe_color_and_style(self): [ ["scott", 20], [None, 20], - [None, np_long(20)], + [None, np.int_(20)], [0.5, np.linspace(-100, 100, 20)], ], ) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 44ce5c79db348..287b7557f50f9 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -9,7 +9,6 @@ import pytest from pandas._libs.missing import NA -from pandas.compat.numpy import np_long from pandas.core.dtypes.common import is_scalar @@ -103,9 +102,9 @@ def test_comparison_ops(comparison_op, other): -0.0, False, np.bool_(False), - np_long(0), + np.int_(0), np.float64(0), - np_long(-0), + np.int_(-0), np.float64(-0), ], ) @@ -124,7 +123,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np_long(1), np.float64(1)] + 
"value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -134,14 +133,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float64, np.bool_, np_long)): + elif not isinstance(value, (np.float64, np.bool_, np.int_)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np_long(-1), np.float64(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: From fd53117551fc3349911563232adbef1486cc3811 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 18 Oct 2023 02:28:19 +0200 Subject: [PATCH 133/174] Backport PR #55565 on branch 2.1.x (TST: Remove np.core usage) (#55567) Backport PR #55565: TST: Remove np.core usage Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/frame/constructors/test_from_records.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 55d4a6c3b39fa..bb4aed2163dac 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -42,7 +42,7 @@ def test_from_records_with_datetimes(self): arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [("EXPIRY", " Date: Wed, 18 Oct 2023 12:45:59 -0400 Subject: [PATCH 134/174] REGR: sort index with sliced MultiIndex (#55474) * REGR: sort index with sliced MultiIndex * fixes --- pandas/core/sorting.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index bc6e29c3c7fbf..e6b54de9a8bfb 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -365,22 +365,15 @@ def lexsort_indexer( if codes_given: mask = k == -1 codes = k.copy() - n = len(codes) - mask_n = n - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray, - # ndarray[Any, Any]]" has no attribute "any" - if mask.any(): # type: ignore[union-attr] - n -= 1 + # error: Item "ExtensionArray" of "Series | ExtensionArray | + # ndarray[Any, Any]" has no attribute "max" + n = codes.max() + 1 if len(codes) else 0 # type: ignore[union-attr] else: cat = Categorical(k, ordered=True) n = len(cat.categories) codes = cat.codes.copy() mask = cat.codes == -1 - if mask.any(): - mask_n = n + 1 - else: - mask_n = n if order: # ascending if na_position == "last": @@ -391,12 +384,6 @@ def lexsort_indexer( # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" codes = np.where(mask, n, codes) # type: ignore[arg-type] - elif na_position == "first": - # error: Incompatible types in assignment (expression has type - # "Union[Any, int, ndarray[Any, dtype[signedinteger[Any]]]]", - # variable has type "Union[Series, ExtensionArray, ndarray[Any, Any]]") - # error: Unsupported operand types for + ("ExtensionArray" and "int") - codes += 1 # type: ignore[operator,assignment] else: # not order means descending if na_position == "last": # error: Unsupported operand types for - ("int" and "ExtensionArray") @@ -406,9 +393,7 @@ def lexsort_indexer( # 
_NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" - codes = np.where( - mask, n, n - codes - 1 # type: ignore[operator,arg-type] - ) + codes = np.where(mask, n, n - codes - 1) # type: ignore[arg-type] elif na_position == "first": # error: Unsupported operand types for - ("int" and "ExtensionArray") # error: Argument 1 to "where" has incompatible type "Union[Any, @@ -417,9 +402,9 @@ def lexsort_indexer( # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, # complex, str, bytes, _NestedSequence[Union[bool, int, float, # complex, str, bytes]]]" - codes = np.where(mask, 0, n - codes) # type: ignore[operator,arg-type] + codes = np.where(mask, -1, n - codes) # type: ignore[arg-type] - shape.append(mask_n) + shape.append(n + 1) labels.append(codes) return indexer_from_factorized(labels, tuple(shape)) From 1be0cd6aee4a8df9aa9c6765da37094893ca4da8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 19 Oct 2023 00:56:23 +0200 Subject: [PATCH 135/174] Backport PR #55562 on branch 2.1.x (Floordiv fix for pyarrow dtypes) (#55578) Backport PR #55562: Floordiv fix for pyarrow dtypes Co-authored-by: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_arrow.py | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ddd1f95c56aea..5df34f20bece3 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -27,9 +27,9 @@ Bug fixes - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) +- Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_212.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 636a22fcffe3d..875461d39a93e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -116,7 +116,8 @@ def floordiv_compat( ) -> pa.ChunkedArray: # Ensure int // int -> int mirroring Python/Numpy behavior # as pc.floor(pc.divide_checked(int, int)) -> float - result = pc.floor(pc.divide(left, right)) + converted_left = cast_for_truediv(left, right) + result = pc.floor(pc.divide(converted_left, right)) if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): result = result.cast(left.type) return result diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8702701f161ae..608cb023a3584 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3091,3 +3091,12 @@ def test_factorize_chunked_dictionary(): exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks())) tm.assert_numpy_array_equal(res_indices, exp_indicies) tm.assert_index_equal(res_uniques, exp_uniques) + + +def test_arrow_floordiv(): + # GH 55561 + a = pd.Series([-7], dtype="int64[pyarrow]") + b = pd.Series([4], dtype="int64[pyarrow]") + expected = pd.Series([-2], dtype="int64[pyarrow]") + result = a // b + tm.assert_series_equal(result, expected) From bd53d3755924202489c66c15c08d08c395a289fe Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 19 Oct 2023 14:42:56 +0200 Subject: [PATCH 136/174] Backport PR #55582 on branch 2.1.x (CI: Unpin Cython) (#55585) Backport PR #55582: CI: Unpin Cython Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 6 +++--- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a7eead7aafbf7..8ff7d290ea9d1 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 1cbd6c24debb2..f8fa04a60a378 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 9ace8e10e0e28..8bf5ea51c4bf3 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 9795a1fb39c6f..7fd3a65ec91f8 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer[toml] - meson[ninja]=1.2.1 - meson-python=0.13.1 - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 1a770d74043bf..d7e5523dd2545 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.2.1 - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 57fcc2c859a5f..7f8cca9e65fe1 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index f340d17395d34..5d34940aa27e1 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 
0e234a0de6969..f5ea116e51dd4 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index a2f4d6395783a..a489690776be3 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 6b091df00fae7..a71523e8c3579 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33, <3.0.3 + - cython>=0.29.33 - meson[ninja]=1.2.1 - meson-python=0.13.1 From 58d3f1bc19d0636106aa72e42476510e774ae684 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 19 Oct 2023 12:43:28 -0400 Subject: [PATCH 137/174] Backport PR #55473: TST: sort index with sliced MultiIndex (#55482) --- doc/source/whatsnew/v2.1.2.rst | 2 + pandas/tests/frame/methods/test_sort_index.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 5df34f20bece3..28b271f9cf446 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -15,6 +15,8 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) +- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) +- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.bug_fixes: diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 228b62a418813..f1465c9ba9df8 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): ) tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) From cc2f03a8ae7e93bde1167a7d90eee8f8b7e0976b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 19 Oct 2023 20:12:47 +0200 Subject: [PATCH 138/174] Backport PR #55534 on branch 2.1.x (BUG: Ensure "string[pyarrow]" type is preserved when calling extractall) (#55597) Backport PR #55534: BUG: Ensure "string[pyarrow]" type is preserved when calling extractall Co-authored-by: Amanda Bizzinotto --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/strings/accessor.py | 5 ++--- pandas/tests/strings/test_extract.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 28b271f9cf446..97a718dd496e9 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -31,6 +31,7 @@ Bug fixes - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) +- Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 124ca546c4583..b299f5d6deab3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3449,10 +3449,9 @@ def _result_dtype(arr): # when the list of values is empty. 
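    # A usage sketch, not part of the upstream hunk, mirroring the test added
    # below: with the widened isinstance check, a pyarrow-backed input such as
    #   Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)")
    # keeps its "string[pyarrow]" column dtype instead of decaying to object.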
from pandas.core.arrays.string_ import StringDtype - if isinstance(arr.dtype, StringDtype): + if isinstance(arr.dtype, (ArrowDtype, StringDtype)): return arr.dtype - else: - return object + return object def _get_single_group_name(regex: re.Pattern) -> Hashable: diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 4773c13aad376..b8319e90e09a8 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import ArrowDtype + from pandas import ( DataFrame, Index, @@ -706,3 +708,12 @@ def test_extractall_same_as_extract_subject_index(any_string_dtype): has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_match_index) + + +def test_extractall_preserves_dtype(): + # Ensure that when extractall is called on a series with specific dtypes set, that + # the dtype is preserved in the resulting DataFrame's column. + pa = pytest.importorskip("pyarrow") + + result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)") + assert result.dtypes[0] == "string[pyarrow]" From 9d39df008b3e03724cd22d7f4c3cabed0b009f2f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 20 Oct 2023 05:23:25 +0200 Subject: [PATCH 139/174] Backport PR #55596 on branch 2.1.x (CI: Have NumpyDev build only test nightly numpy) (#55602) Backport PR #55596: CI: Have NumpyDev build only test nightly numpy Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/deps/actions-311-numpydev.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 7fd3a65ec91f8..9a6c05a138f9b 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -29,5 +29,4 @@ dependencies: - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" - - "scipy" - "tzdata>=2022.1" From 3c1fe5cb7770d04e0ef6d524fc45a2294dcd4ed8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 21 Oct 2023 21:43:31 +0200 Subject: [PATCH 140/174] Backport PR #55611 on branch 2.1.x (DEPR: correct warning message while parsing datetimes with mixed time zones if utc=False) (#55613) Backport PR #55611: DEPR: correct warning message while parsing datetimes with mixed time zones if utc=False Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- pandas/_libs/tslib.pyx | 2 +- pandas/core/tools/datetimes.py | 8 ++++---- pandas/io/json/_json.py | 2 +- pandas/io/parsers/base_parser.py | 6 +++--- .../indexes/datetimes/test_constructors.py | 2 +- pandas/tests/tools/test_to_datetime.py | 18 +++++++++--------- pandas/tests/tslibs/test_array_to_datetime.py | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 20a18cf56779f..e61ad28c54afb 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -671,7 +671,7 @@ cdef _array_to_datetime_object( if len(unique_timezones) > 1: warnings.warn( "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise a warning unless `utc=True`. " + "zones will raise an error unless `utc=True`. 
" "Please specify `utc=True` to opt in to the new behaviour " "and silence this warning. To create a `Series` with mixed offsets and " "`object` dtype, please use `apply` and `datetime.datetime.strptime`", diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ac306229f3111..303136b7d3890 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -356,7 +356,7 @@ def _return_parsed_timezone_results( if len(non_na_timezones) > 1: warnings.warn( "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise a warning unless `utc=True`. Please specify `utc=True` " + "zones will raise an error unless `utc=True`. Please specify `utc=True` " "to opt in to the new behaviour and silence this warning. " "To create a `Series` with mixed offsets and `object` dtype, " "please use `apply` and `datetime.datetime.strptime`", @@ -788,7 +788,7 @@ def to_datetime( .. warning:: In a future version of pandas, parsing datetimes with mixed time - zones will raise a warning unless `utc=True`. + zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. @@ -1023,7 +1023,7 @@ def to_datetime( >>> pd.to_datetime(['2020-10-25 02:00 +0200', ... '2020-10-25 04:00 +0100']) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise a warning unless `utc=True`. Please specify `utc=True` + time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. @@ -1037,7 +1037,7 @@ def to_datetime( >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP FutureWarning: In a future version of pandas, parsing datetimes with mixed - time zones will raise a warning unless `utc=True`. Please specify `utc=True` + time zones will raise an error unless `utc=True`. Please specify `utc=True` to opt in to the new behaviour and silence this warning. To create a `Series` with mixed offsets and `object` dtype, please use `apply` and `datetime.datetime.strptime`. 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 833f4986b6da6..58979a29c97d3 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1316,7 +1316,7 @@ def _try_convert_to_date(self, data): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time " - "zones will raise a warning", + "zones will raise an error", category=FutureWarning, ) new_data = to_datetime(new_data, errors="raise", unit=date_unit) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 60996a7d42187..6b1daa96782a0 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1147,7 +1147,7 @@ def converter(*date_cols, col: Hashable): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - ".*parsing datetimes with mixed time zones will raise a warning", + ".*parsing datetimes with mixed time zones will raise an error", category=FutureWarning, ) result = tools.to_datetime( @@ -1169,7 +1169,7 @@ def converter(*date_cols, col: Hashable): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time zones " - "will raise a warning", + "will raise an error", category=FutureWarning, ) result = tools.to_datetime( @@ -1187,7 +1187,7 @@ def converter(*date_cols, col: Hashable): warnings.filterwarnings( "ignore", ".*parsing datetimes with mixed time zones " - "will raise a warning", + "will raise an error", category=FutureWarning, ) return tools.to_datetime( diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 733c14f33567a..fc7ce19c42b38 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -300,7 +300,7 @@ def test_construction_index_with_mixed_timezones(self): assert not isinstance(result, DatetimeIndex) msg = "DatetimeIndex has mixed timezones" - msg_depr = "parsing datetimes with mixed time zones will raise a warning" + msg_depr = "parsing datetimes with mixed time zones will raise an error" with pytest.raises(TypeError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg_depr): DatetimeIndex(["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 93fe9b05adb4f..580f0605efdbc 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -518,7 +518,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_utc_false_deprecated( self, fmt, dates, expected_dates ): # GH 13486, 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(dates, format=fmt) expected = Index(expected_dates) @@ -693,7 +693,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format_mixed_offsets_utc_fal args = ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"] ts1 = constructor(args[0]) ts2 = args[1] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" expected = Index( [ @@ -734,7 +734,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format_mixed_offsets_utc_fal ) def test_to_datetime_mixed_offsets_with_none_tz(self, fmt, expected): # https://github.com/pandas-dev/pandas/issues/50071 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = 
"parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime( @@ -1236,7 +1236,7 @@ def test_to_datetime_different_offsets(self, cache): ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 expected = Index([parse(x) for x in arr]) - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) @@ -1604,7 +1604,7 @@ def test_to_datetime_coerce(self): "March 1, 2018 12:00:00+0500", "20100240", ] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(ts_strings, errors="coerce") expected = Index( @@ -1689,7 +1689,7 @@ def test_iso_8601_strings_with_same_offset(self): def test_iso_8601_strings_with_different_offsets(self): # GH 17697, 11736, 50887 ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result = to_datetime(ts_strings) expected = np.array( @@ -1729,7 +1729,7 @@ def test_mixed_offsets_with_native_datetime_raises(self): now = Timestamp("now") today = Timestamp("today") - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): mixed = to_datetime(ser) expected = Series( @@ -1810,7 +1810,7 @@ def test_to_datetime_fixed_offset(self): ) def test_to_datetime_mixed_offsets_with_utc_false_deprecated(self, date): # GH 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): to_datetime(date, utc=False) @@ -3680,7 +3680,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): # GH 50887 - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): to_datetime( diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 435fe5f4b90d8..829bb140e6e96 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -85,7 +85,7 @@ def test_parsing_different_timezone_offsets(): data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"] data = np.array(data, dtype=object) - msg = "parsing datetimes with mixed time zones will raise a warning" + msg = "parsing datetimes with mixed time zones will raise an error" with tm.assert_produces_warning(FutureWarning, match=msg): result, result_tz = tslib.array_to_datetime(data) expected = np.array( From c26935cd9ffd1676403b43f88dfafee0935b979c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 22 Oct 2023 19:06:46 +0200 Subject: [PATCH 141/174] Backport PR #55620 on branch 
2.1.x (BUG: Groupby not keeping object dtype when infer string is set) (#55629) Backport PR #55620: BUG: Groupby not keeping object dtype when infer string is set Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/groupby.py | 3 +-- pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..175c534ce0d04 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7fa69a70431e4..d9fdf72eeda2b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1855,8 +1855,7 @@ def _agg_py_fallback( ser = Series(values, copy=False) else: # We only get here with values.dtype == object - # TODO: special case not needed with ArrayManager - df = DataFrame(values.T) + df = DataFrame(values.T, dtype=values.dtype) # bc we split object blocks in grouped_reduce, we have only 1 col # otherwise we'd have to worry about block-splitting GH#39329 assert df.shape[1] == 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 78a0497ef611e..7726dbe6994ef 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2513,13 +2514,24 @@ def test_groupby_column_index_name_lost(func): tm.assert_index_equal(result, expected) -def test_groupby_duplicate_columns(): +@pytest.mark.parametrize( + "infer_string", + [ + False, + pytest.param( + True, + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) +def test_groupby_duplicate_columns(infer_string): # GH: 31735 df = DataFrame( {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} ).astype(object) df.columns = ["A", "B", "B"] - result = df.groupby([0, 0, 0, 0]).min() + with pd.option_context("future.infer_string", infer_string): + result = df.groupby([0, 0, 0, 0]).min() expected = DataFrame( [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object ) From 66ec371ce3e255c08cbc7d7ee63e36c0347b26e2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 22 Oct 2023 21:18:02 +0200 Subject: [PATCH 142/174] Backport PR #55518 on branch 2.1.x (CoW: Use exponential backoff when clearing dead references) (#55625) Backport PR #55518: CoW: Use exponential backoff when clearing dead references Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/_libs/internals.pyx | 24 ++++++++++++++----- pandas/tests/copy_view/test_internals.py | 30 ++++++++++++++++++++++++ 3 files 
changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 175c534ce0d04..ed8010c2ea258 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) +- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 859ebce36eaa8..10b0f4776e0d1 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -944,17 +944,29 @@ cdef class BlockValuesRefs: """ cdef: public list referenced_blocks + public int clear_counter def __cinit__(self, blk: SharedBlock | None = None) -> None: if blk is not None: self.referenced_blocks = [weakref.ref(blk)] else: self.referenced_blocks = [] - - def _clear_dead_references(self) -> None: - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self.clear_counter = 500 # set reasonably high + + def _clear_dead_references(self, force=False) -> None: + # Use exponential backoff to decide when we want to clear references + # if force=False. Clearing for every insertion causes slowdowns if + # all these objects stay alive, e.g. df.items() for wide DataFrames + # see GH#55245 and GH#55008 + if force or len(self.referenced_blocks) > self.clear_counter: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + nr_of_refs = len(self.referenced_blocks) + if nr_of_refs < self.clear_counter // 2: + self.clear_counter = max(self.clear_counter // 2, 500) + elif nr_of_refs > self.clear_counter: + self.clear_counter = max(self.clear_counter * 2, nr_of_refs) def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. 
@@ -988,6 +1000,6 @@ cdef class BlockValuesRefs: ------- bool """ - self._clear_dead_references() + self._clear_dead_references(force=True) # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 9180bd5a3a426..a727331307d7e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -119,3 +119,33 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): else: for col in df.columns: assert not np.shares_memory(get_array(df, col), get_array(df2, col)) + + +def test_exponential_backoff(): + # GH#55518 + df = DataFrame({"a": [1, 2, 3]}) + for i in range(490): + df.copy(deep=False) + + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + + df = DataFrame({"a": [1, 2, 3]}) + dfs = [df.copy(deep=False) for i in range(510)] + + for i in range(20): + df.copy(deep=False) + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + for i in range(500): + df.copy(deep=False) + + # Don't reduce since we still have over 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + dfs = dfs[:300] + for i in range(500): + df.copy(deep=False) + + # Reduce since there are less than 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 500 From 782e685ce65ab5703aa76a7f7521d57a1ecc47bc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 23 Oct 2023 00:37:15 +0200 Subject: [PATCH 143/174] Backport PR #55537 on branch 2.1.x (BUG: Series inferring new string dtype even if dtype is given for scalar value) (#55635) Backport PR #55537: BUG: Series inferring new string dtype even if dtype is given for scalar value Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/construction.py | 7 ++++++- pandas/tests/series/test_constructors.py | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e661d590ab330..5903187769f08 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -540,6 +540,7 @@ def sanitize_array( ------- np.ndarray or ExtensionArray """ + original_dtype = dtype if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -562,7 +563,11 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if isinstance(data, str) and using_pyarrow_string_dtype(): + if ( + isinstance(data, str) + and using_pyarrow_string_dtype() + and original_dtype is None + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2c3fdf627788a..8cc891e1ee9c0 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2123,6 +2123,14 @@ def test_series_string_inference_storage_definition(self): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): + # GH#55537 + with pd.option_context("future.infer_string", True): + ser = Series("a", index=[1, 2], dtype="string[python]") + expected = Series(["a", "a"], index=[1, 2], dtype="string[python]") + 
tm.assert_series_equal(ser, expected) + assert ser.dtype.storage == "python" + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 89b37c91ea33f39f522e99e9918ff44fc9648408 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 23 Oct 2023 04:57:00 +0200 Subject: [PATCH 144/174] Backport PR #55640 on branch 2.1.x (DOC: Move whatsnew in 2.1.2) (#55641) Backport PR #55640: DOC: Move whatsnew in 2.1.2 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ed8010c2ea258..3dcd718bca6fd 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -13,7 +13,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) @@ -34,6 +33,7 @@ Bug fixes - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) +- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) .. --------------------------------------------------------------------------- From 8f003de3da49d2b7858c6722a91342671a93fa3d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 24 Oct 2023 20:21:54 +0200 Subject: [PATCH 145/174] Backport PR #55077 on branch 2.1.x (Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)") (#55459) * Backport PR #55077: Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)" * fixup doctest * noop * fixup doctests --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/core/generic.py | 16 +++-- pandas/core/groupby/grouper.py | 12 ++-- pandas/core/resample.py | 13 +--- pandas/tests/resample/test_datetime_index.py | 67 +++++-------------- .../tests/resample/test_resampler_grouper.py | 32 ++++----- 5 files changed, 47 insertions(+), 93 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 08d1bf6a0fabc..f377fc348156b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9097,6 +9097,10 @@ def resample( .. versionadded:: 1.3.0 + .. note:: + + Only takes effect for Tick-frequencies (i.e. fixed frequencies like + days, hours, and minutes, rather than months or quarters). 
offset : Timedelta or str, default is None An offset timedelta added to the origin. @@ -9367,12 +9371,12 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 - >>> ts.resample('17W', origin='2000-01-01').sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ea92fbae9566d..9877ddf0ea7a6 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -207,12 +207,12 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17T, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17T, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9b8d1c870091d..b75005ff202e3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2463,16 +2463,8 @@ def _get_timestamp_range_edges( """ if isinstance(freq, Tick): index_tz = first.tz - - if isinstance(origin, Timestamp) and origin.tz != index_tz: + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): raise ValueError("The origin must have the same timezone as the index.") - - elif isinstance(origin, Timestamp): - if origin <= first: - first = origin - elif origin >= last: - last = origin - if origin == "epoch": # set the epoch based on the timezone to have similar bins results when # resampling on the same kind of indexes on different timezones @@ -2494,9 +2486,6 @@ def _get_timestamp_range_edges( first = first.tz_localize(index_tz) last = last.tz_localize(index_tz) else: - if isinstance(origin, Timestamp): - first = origin - first = first.normalize() last = last.normalize() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index dbda751e82113..1b20b383c4eae 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -796,34 +796,24 @@ def test_resample_offset(unit): @pytest.mark.parametrize( - "kwargs, expected", + "kwargs", [ - ( - {"origin": "1999-12-31 23:57:00"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": Timestamp("1970-01-01 00:02:00")}, - ["1970-01-01 00:02:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": "epoch", "offset": "2m"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), + {"origin": "1999-12-31 23:57:00"}, + {"origin": Timestamp("1970-01-01 00:02:00")}, + {"origin": "epoch", "offset": "2m"}, # origin of '1999-31-12 12:02:00' should be equivalent for this case - ( - {"origin": "1999-12-31 12:02:00"}, - ["1999-12-31 12:02:00", "2000-01-01 01:57:00"], - ), - ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]), + {"origin": "1999-12-31 12:02:00"}, + {"offset": "-3m"}, ], ) -def test_resample_origin(kwargs, unit, expected): +def test_resample_origin(kwargs, unit): # GH 31809 rng = 
date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit) + exp_rng = date_range( + "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min" + ).as_unit(unit) resampled = ts.resample("5min", **kwargs).mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit): ts.resample("5min", offset=offset) -def test_resample_monthstart_origin(): - # GH 53662 - df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]}) - result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum() - excepted = Series( - [10.0], - index=DatetimeIndex( - ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS" - ), - ) - tm.assert_index_equal(result.index, excepted.index) - - df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]}) - result = df.resample( - "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1) - )["values"].sum() - expected = Series( - [0, 10.0], - index=DatetimeIndex( - ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS" - ), - ) - tm.assert_index_equal(result.index, expected.index) - - def test_resample_origin_prime_freq(unit): # GH 31809 start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" @@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit): tm.assert_index_equal(resampled.index, exp_rng) exp_rng = date_range( - "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min" + "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min" ).as_unit(unit) resampled = ts.resample("17min", origin="2000-01-01").mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit): exp_rng = date_range( "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz ).as_unit(unit) - resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() tm.assert_index_equal(resampled.index, exp_rng) - resampled = ts.resample( - "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz) - ).mean() + # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() tm.assert_index_equal(resampled.index, exp_rng) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 62b0bc2012af1..6abdde0f5ccff 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -141,19 +141,6 @@ def test_groupby_with_origin(): start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" middle = "1/15/2000 00:00:00" - # test origin on 1970-01-01 00:00:00 - rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] - - origin = Timestamp(0) - adjusted_grouper = pd.Grouper(freq=freq, origin=origin) - adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") - adjusted_count_ts = adjusted_count_ts[middle_ts:end] - adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") - 
tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end]) - rng = date_range(start, end, freq="1231min") # prime number ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ts2 = ts[middle:end] @@ -167,19 +154,26 @@ def test_groupby_with_origin(): with pytest.raises(AssertionError, match="Index are different"): tm.assert_index_equal(count_ts.index, count_ts2.index) - # test origin on 2049-10-18 20:00:00 + # test origin on 1970-01-01 00:00:00 + origin = Timestamp(0) + adjusted_grouper = pd.Grouper(freq=freq, origin=origin) + adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") + adjusted_count_ts = adjusted_count_ts[middle:end] + adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") + tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2) - rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] + # test origin on 2049-10-18 20:00:00 origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000 adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future) adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count") - adjusted2_count_ts = adjusted2_count_ts[middle_ts:end] + adjusted2_count_ts = adjusted2_count_ts[middle:end] adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count") tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2) + # both grouper use an adjusted timestamp that is a multiple of 1399 min + # they should be equals even if the adjusted_timestamp is in the future + tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2) + def test_nearest(): # GH 17496 From cdb45c6905951f1b7a9e7fbb4e9b2a2be9a20476 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 24 Oct 2023 21:50:18 +0200 Subject: [PATCH 146/174] Backport PR #55655 on branch 2.1.x (BUG: infer_string not inferring string dtype when NA is first value) (#55666) Backport PR #55655: BUG: infer_string not inferring string dtype when NA is first value Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/_libs/lib.pyx | 3 +++ pandas/tests/series/test_constructors.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 3dcd718bca6fd..7c04d1a03a6e4 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -35,6 +35,7 @@ Bug fixes - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) +- Fixed bug in :class:`Series` constructor not inferring string dtype when ``NA`` is the first value and ``infer_string`` is set (:issue:`55655`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.other: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0c0610f72044e..704a5f12204c4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2642,6 +2642,9 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif val is C_NA: + seen.object_ = True + continue else: seen.object_ = True break diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8cc891e1ee9c0..b74ee5cf8f2bc 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2131,6 +2131,14 @@ def test_series_constructor_infer_string_scalar(self): tm.assert_series_equal(ser, expected) assert ser.dtype.storage == "python" + def test_series_string_inference_na_first(self): + # GH#55655 + pytest.importorskip("pyarrow") + expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series([pd.NA, "b"]) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From f9a4befc47bc1ce5c87459eae7d2013617eeaf52 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 24 Oct 2023 21:50:39 +0200 Subject: [PATCH 147/174] Backport PR #55665 on branch 2.1.x (DOC: Update release date for 2.1.2) (#55667) Backport PR #55665: DOC: Update release date for 2.1.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 7c04d1a03a6e4..8298cc0607512 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_212: -What's new in 2.1.2 (October ??, 2023) +What's new in 2.1.2 (October 25, 2023) --------------------------------------- These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog From 96ecfeca6615d1f13a94b01aa319b353ccb9c260 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 25 Oct 2023 11:40:01 +0200 Subject: [PATCH 148/174] Backport PR #55670 on branch 2.1.x (REGR: merge_asof raising TypeError for "by" with datelike dtypes) (#55676) Backport PR #55670: REGR: merge_asof raising TypeError for "by" with datelike dtypes Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/reshape/merge.py | 4 +++ pandas/tests/reshape/merge/test_merge_asof.py | 28 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 8298cc0607512..cca508a4d04b9 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) +- Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) .. 
--------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 74181f8dc853c..0b343a1fd2d82 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2193,6 +2193,10 @@ def injection(obj: ArrayLike): # TODO: conversions for EAs that can be no-copy. lbv = np.asarray(lbv) rbv = np.asarray(rbv) + if needs_i8_conversion(lbv.dtype): + lbv = lbv.view("i8") + if needs_i8_conversion(rbv.dtype): + rbv = rbv.view("i8") else: # We get here with non-ndarrays in test_merge_by_col_tz_aware # and test_merge_groupby_multiple_column_with_categorical_column diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index d4cb89dadde9b..a4ae81f86513c 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1341,6 +1341,34 @@ def test_by_mixed_tz_aware(self): expected["value_y"] = np.array([np.nan], dtype=object) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[us]"]) + def test_by_datelike(self, dtype): + # GH 55453 + left = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [2], + "value": ["a"], + } + ) + right = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [1], + "value": ["b"], + } + ) + result = merge_asof(left, right, by="by_col", on="on_col") + expected = pd.DataFrame( + { + "by_col": np.array([1], dtype=dtype), + "on_col": [2], + "value_x": ["a"], + "value_y": ["b"], + } + ) + tm.assert_frame_equal(result, expected) + def test_timedelta_tolerance_nearest(self, unit): # GH 27642 if unit == "s": From b7aaaf9c8831f7da91c9e31fe1da273744dab5ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:02:19 +0200 Subject: [PATCH 149/174] Backport PR #55621 on branch 2.1.x (BUG: mode not sorting values for arrow backed strings) (#55680) BUG: mode not sorting values for arrow backed strings (#55621) * BUG: mode not sorting values for arrow backed strings * Fix tests * Change to pa_installed variable * Update pyarrow.py * Fix * Fix (cherry picked from commit bb2d2e00524fb31d08a43fc54706035f810d2489) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 1 + pandas/tests/extension/test_arrow.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 18 +++++++++++------- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index cca508a4d04b9..a608cd136c559 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -32,6 +32,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) +- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when 
creating a new conditional column (:issue:`55025`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 875461d39a93e..9c8f28d660450 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1886,6 +1886,7 @@ def _mode(self, dropna: bool = True) -> Self: if pa.types.is_temporal(pa_type): most_common = most_common.cast(pa_type) + most_common = most_common.take(pc.array_sort_indices(most_common)) return type(self)(most_common) def _maybe_convert_setitem_value(self, value): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 608cb023a3584..fef4fbea2e485 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1474,7 +1474,7 @@ def test_quantile(data, interpolation, quantile, request): @pytest.mark.parametrize( "take_idx, exp_idx", - [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]], + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], ids=["multi_mode", "single_mode"], ) def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx): @@ -1492,7 +1492,7 @@ def test_mode_dropna_false_mode_na(data): expected = pd.Series([None], dtype=data.dtype) tm.assert_series_equal(result, expected) - expected = pd.Series([None, data[0]], dtype=data.dtype) + expected = pd.Series([data[0], None], dtype=data.dtype) result = expected.mode(dropna=False) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7726dbe6994ef..49ae217513018 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,11 +5,11 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2518,10 +2518,7 @@ def test_groupby_column_index_name_lost(func): "infer_string", [ False, - pytest.param( - True, - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), - ), + pytest.param(True, marks=td.skip_if_no("pyarrow")), ], ) def test_groupby_duplicate_columns(infer_string): @@ -2751,13 +2748,20 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -def test_by_column_values_with_same_starting_value(): +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_by_column_values_with_same_starting_value(dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": ["sad", "happy", "happy"], + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} From 87cb14120af3e1d94298095add0ee060c57c921c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:03:03 +0200 Subject: [PATCH 150/174] Backport PR #55627 on branch 2.1.x (BUG: value_counts returning incorrect dtype for string dtype) (#55682) Backport PR #55627: BUG: value_counts returning incorrect dtype for string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/groupby.py | 15 ++++++++++-- pandas/tests/groupby/test_size.py | 24 ++++++++++++++++++ pandas/tests/groupby/test_value_counts.py | 
30 +++++++++++++++++++++++ 4 files changed, 68 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a608cd136c559..145c364728b40 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -25,6 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`) +- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d9fdf72eeda2b..2b83802944e36 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,6 +108,10 @@ class providing the base-class of operations. SparseArray, ) from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) from pandas.core.base import ( PandasObject, SelectionMixin, @@ -2803,7 +2807,9 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - result_frame.columns = columns + [name] + orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501 + cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) + result_frame.columns = cols result = result_frame return result.__finalize__(self.obj, method="value_counts") @@ -2955,7 +2961,12 @@ def size(self) -> DataFrame | Series: dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): - dtype_backend = "pyarrow" + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" elif isinstance(self.obj.array, BaseMaskedArray): dtype_backend = "numpy_nullable" # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? 
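A minimal sketch of what the backend mapping above means in practice, mirroring the regression test added below (assumes pandas 2.1.x with pyarrow installed; not part of the upstream patch):

import pandas as pd

df = pd.DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype="string[pyarrow]")

# ArrowStringArray maps to the numpy_nullable backend, so group sizes
# come back as the masked Int64 dtype rather than int64[pyarrow].
print(df.groupby("a")["b"].size().dtype)  # Int64

# ArrowStringArrayNumpySemantics keeps plain numpy semantics, so the
# same operation on string[pyarrow_numpy] stays int64.
df2 = df.astype("string[pyarrow_numpy]")
print(df2.groupby("a")["b"].size().dtype)  # int64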
diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index c275db9d1788c..93a4e743d0d71 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -104,3 +106,25 @@ def test_size_series_masked_type_returns_Int64(dtype): result = ser.groupby(level=0).size() expected = Series([2, 1], dtype="Int64", index=["a", "b"]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_size_strings(dtype): + # GH#55627 + df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) + result = df.groupby("a")["b"].size() + exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + expected = Series( + [2, 1], + index=Index(["a", "b"], name="a", dtype=dtype), + name="b", + dtype=exp_dtype, + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 2bac76967dab3..070bdda976dc4 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,6 +9,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( Categorical, CategoricalIndex, @@ -366,6 +368,14 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -383,7 +393,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, + dtype, ): + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) result = gp["education"].value_counts( @@ -392,11 +405,17 @@ def test_compound( expected = DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) if normalize: expected["proportion"] = expected_count expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() tm.assert_frame_equal(result, expected) @@ -1143,3 +1162,14 @@ def test_value_counts_time_grouper(utc): ) expected = Series(1, index=index, name="count") tm.assert_series_equal(result, expected) + + +def test_value_counts_integer_columns(): + # GH#55627 + df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) + gp = df.groupby([1, 2], as_index=False, sort=False) + result = gp[3].value_counts() + expected = DataFrame( + {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} + ) + tm.assert_frame_equal(result, expected) From 
0a443460763e7f252da2679fac282050e696401a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 25 Oct 2023 22:22:50 +0200 Subject: [PATCH 151/174] =?UTF-8?q?Backport=20PR=20#55586=20on=20branch=20?= =?UTF-8?q?2.1.x=20(REGR:=20Groupby=20methods=20not=20supporting=20numba?= =?UTF-8?q?=20raising=20TypeError=20when=20the=E2=80=A6)=20(#55686)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55586: REGR: Groupby methods not supporting numba raising TypeError when the… Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/generic.py | 7 +++++-- pandas/tests/groupby/test_numba.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 145c364728b40..8863bfa9f3f69 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8bef167b747e2..8ba644b588a08 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -236,10 +236,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) kwargs = {} if isinstance(func, str): - if maybe_use_numba(engine): + if maybe_use_numba(engine) and engine is not None: # Not all agg functions support numba, only propagate numba kwargs - # if user asks for numba + # if user asks for numba, and engine is not None + # (if engine is None, the called function will handle the case where + # numba is requested via the global option) kwargs["engine"] = engine + if engine_kwargs is not None: kwargs["engine_kwargs"] = engine_kwargs return getattr(self, func)(*args, **kwargs) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index e36c248c99ad6..ee7d342472493 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, Series, + option_context, ) import pandas._testing as tm @@ -66,3 +67,14 @@ def test_axis_1_unsupported(self, numba_supported_reductions): gb = df.groupby("a", axis=1) with pytest.raises(NotImplementedError, match="axis=1"): getattr(gb, func)(engine="numba", **kwargs) + + def test_no_engine_doesnt_raise(self): + # GH55520 + df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)}) + gb = df.groupby("a") + # Make sure behavior of functions 
w/out engine argument don't raise + # when the global use_numba option is set + with option_context("compute.use_numba", True): + res = gb.agg({"b": "first"}) + expected = gb.agg({"b": "first"}) + tm.assert_frame_equal(res, expected) From bceb84b1a8093a17778ad0f3497a9c14f02ad7d4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:59:18 +0200 Subject: [PATCH 152/174] Backport PR #55173 on branch 2.1.x (BUG: .rolling() returns incorrect values when ts index is not nano seconds) (#55697) Backport PR #55173: BUG: .rolling() returns incorrect values when ts index is not nano seconds Co-authored-by: Hadi Abdi Khojasteh --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/window/rolling.py | 10 ++++++++- pandas/tests/window/test_rolling.py | 32 +++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 8863bfa9f3f69..16f8e8d264268 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) +- Fixed regression in :meth:`~DataFrame.rolling` where non-nanosecond index or ``on`` column would produce incorrect results (:issue:`55026`, :issue:`55106`, :issue:`55299`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) - Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index becbba703f92c..ddd6caaa7f783 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -21,6 +21,7 @@ from pandas._libs.tslibs import ( BaseOffset, + Timedelta, to_offset, ) import pandas._libs.window.aggregations as window_aggregations @@ -112,6 +113,8 @@ from pandas.core.generic import NDFrame from pandas.core.groupby.ops import BaseGrouper +from pandas.core.arrays.datetimelike import dtype_to_unit + class BaseWindow(SelectionMixin): """Provides utilities for performing windowing operations.""" @@ -1882,7 +1885,12 @@ def _validate(self): self._on.freq.nanos / self._on.freq.n ) else: - self._win_freq_i8 = freq.nanos + try: + unit = dtype_to_unit(self._on.dtype) # type: ignore[arg-type] + except TypeError: + # if not a datetime dtype, eg for empty dataframes + unit = "ns" + self._win_freq_i8 = Timedelta(freq.nanos).as_unit(unit)._value # min_periods must be an integer if self.min_periods is None: diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 70b7534b296f3..1beab5340484f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1878,3 +1878,35 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): op2 = getattr(rolling2, kernel) expected = op2(*arg2, numeric_only=numeric_only) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) 
+@pytest.mark.parametrize("tz", [None, "UTC", "Europe/Prague"]) +def test_rolling_timedelta_window_non_nanoseconds(unit, tz): + # Test Sum, GH#55106 + df_time = DataFrame( + {"A": range(5)}, index=date_range("2013-01-01", freq="1s", periods=5, tz=tz) + ) + sum_in_nanosecs = df_time.rolling("1s").sum() + # microseconds / milliseconds should not break the correct rolling + df_time.index = df_time.index.as_unit(unit) + sum_in_microsecs = df_time.rolling("1s").sum() + sum_in_microsecs.index = sum_in_microsecs.index.as_unit("ns") + tm.assert_frame_equal(sum_in_nanosecs, sum_in_microsecs) + + # Test max, GH#55026 + ref_dates = date_range("2023-01-01", "2023-01-10", unit="ns", tz=tz) + ref_series = Series(0, index=ref_dates) + ref_series.iloc[0] = 1 + ref_max_series = ref_series.rolling(Timedelta(days=4)).max() + + dates = date_range("2023-01-01", "2023-01-10", unit=unit, tz=tz) + series = Series(0, index=dates) + series.iloc[0] = 1 + max_series = series.rolling(Timedelta(days=4)).max() + + ref_df = DataFrame(ref_max_series) + df = DataFrame(max_series) + df.index = df.index.as_unit("ns") + + tm.assert_frame_equal(ref_df, df) From 47a7cbfe7b1ecca7ccc6a8ae0481b3abd348d7c8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 26 Oct 2023 07:53:56 -0400 Subject: [PATCH 153/174] =?UTF-8?q?Backport=20PR=20#55527:=20BUG=20fix=20d?= =?UTF-8?q?eprecation=20of=20`limit`=20and=20`fill=5Fmethod`=20=E2=80=A6?= =?UTF-8?q?=20(#55701)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #55527: BUG fix deprecation of `limit` and `fill_method` in `pct_change` Co-authored-by: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 8 ++++ pandas/core/generic.py | 44 ++++++++++--------- pandas/core/groupby/groupby.py | 25 ++++++----- pandas/tests/frame/methods/test_pct_change.py | 16 +++---- .../tests/groupby/transform/test_transform.py | 2 +- .../tests/series/methods/test_pct_change.py | 15 ++++--- 6 files changed, 63 insertions(+), 47 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 16f8e8d264268..7c26eecfeeb92 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -8,6 +8,14 @@ including other versions of pandas. {{ header }} +.. --------------------------------------------------------------------------- +.. _whatsnew_212.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- Reverted deprecation of ``fill_method=None`` in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`DataFrameGroupBy.pct_change`, and :meth:`SeriesGroupBy.pct_change`; the values ``'backfill'``, ``'bfill'``, ``'pad'``, and ``'ffill'`` are still deprecated (:issue:`53491`) + .. --------------------------------------------------------------------------- .. _whatsnew_212.regressions: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f377fc348156b..3281d245dca56 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11579,6 +11579,7 @@ def pct_change( How to handle NAs **before** computing percent changes. .. deprecated:: 2.1 + All options of `fill_method` are deprecated except `fill_method=None`. limit : int, default None The number of consecutive NAs to fill before stopping. 
@@ -11685,32 +11686,33 @@ def pct_change( APPL -0.252395 -0.011860 NaN """ # GH#53491 - if fill_method is not lib.no_default or limit is not lib.no_default: + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. Call " - f"{'bfill' if fill_method in ('backfill', 'bfill') else 'ffill'} " - "before calling pct_change instead.", + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - cols = self.items() if self.ndim == 2 else [(None, self)] - for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this " - "warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if limit is lib.no_default: + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2b83802944e36..187c62bc0c9ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5217,7 +5217,7 @@ def diff( def pct_change( self, periods: int = 1, - fill_method: FillnaOptions | lib.NoDefault = lib.no_default, + fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, limit: int | None | lib.NoDefault = lib.no_default, freq=None, axis: Axis | lib.NoDefault = lib.no_default, @@ -5269,23 +5269,26 @@ def pct_change( goldfish 0.2 0.125 """ # GH#53491 - if fill_method is not lib.no_default or limit is not lib.no_default: + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. Call " - f"{'bfill' if fill_method in ('backfill', 'bfill') else 'ffill'} " - "before calling pct_change instead.", + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. 
Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", FutureWarning, stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if any(grp.isna().values.any() for _, grp in self): + if limit is lib.no_default and any( + grp.isna().values.any() for _, grp in self + ): warnings.warn( "The default fill_method='ffill' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index ede212ae18ae9..92b66e12d4356 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -29,7 +29,7 @@ def test_pct_change_with_nas( obj = frame_or_series(vals) msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " f"{type(obj).__name__}.pct_change are deprecated" ) with tm.assert_produces_warning(FutureWarning, match=msg): @@ -46,7 +46,7 @@ def test_pct_change_numeric(self): pnl.iat[2, 3] = 60 msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) @@ -59,12 +59,11 @@ def test_pct_change_numeric(self): def test_pct_change(self, datetime_frame): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_frame.pct_change(fill_method=None) + rs = datetime_frame.pct_change(fill_method=None) tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) rs = datetime_frame.pct_change(2) @@ -110,7 +109,7 @@ def test_pct_change_periods_freq( self, datetime_frame, freq, periods, fill_method, limit ): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) @@ -144,11 +143,12 @@ def test_pct_change_with_duplicated_indices(fill_method): {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 ) + warn = None if fill_method is None else FutureWarning msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "DataFrame.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): result = data.pct_change(fill_method=fill_method) if fill_method is None: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 062dfe3931423..d51c618b3c29d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1063,7 +1063,7 @@ def test_pct_change(frame_or_series, freq, periods, fill_method, limit): expected = expected.to_frame("vals") msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' 
keyword being not None and the 'limit' keyword in " f"{type(gb).__name__}.pct_change are deprecated" ) with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 6740b8756853e..9727ef3d5c27c 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -11,12 +11,11 @@ class TestSeriesPctChange: def test_pct_change(self, datetime_series): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "Series.pct_change are deprecated" ) - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = datetime_series.pct_change(fill_method=None) + rs = datetime_series.pct_change(fill_method=None) tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) rs = datetime_series.pct_change(2) @@ -69,7 +68,7 @@ def test_pct_change_periods_freq( self, freq, periods, fill_method, limit, datetime_series ): msg = ( - "The 'fill_method' and 'limit' keywords in " + "The 'fill_method' keyword being not None and the 'limit' keyword in " "Series.pct_change are deprecated" ) @@ -101,8 +100,12 @@ def test_pct_change_with_duplicated_indices(fill_method): # GH30463 s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) - msg = "The 'fill_method' and 'limit' keywords in Series.pct_change are deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = None if fill_method is None else FutureWarning + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "Series.pct_change are deprecated" + ) + with tm.assert_produces_warning(warn, match=msg): result = s.pct_change(fill_method=fill_method) expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) From 8059d861657a4d1a431b9907c99866009a81e05a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Oct 2023 14:10:07 +0200 Subject: [PATCH 154/174] Backport PR #55690 on branch 2.1.x (REGR: fix roundtripping datetimes with sqlite type detection) (#55702) REGR: fix roundtripping datetimes with sqlite type detection (#55690) (cherry picked from commit aeb3644567a10296cc09aed03bf7e0a3780e67ee) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/io/sql.py | 6 ++---- pandas/tests/io/test_sql.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 7c26eecfeeb92..1d2d89e7eadd1 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -28,6 +28,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) +- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.bug_fixes: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2b139f8ca527c..c1d68d71ac91c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2090,7 +2090,7 @@ def _adapt_time(t) -> str: # Python 3.12+ doesn't auto-register adapters for us anymore adapt_date_iso = lambda val: val.isoformat() - adapt_datetime_iso = lambda val: val.isoformat() + adapt_datetime_iso = lambda val: val.isoformat(" ") sqlite3.register_adapter(time, _adapt_time) @@ -2098,11 +2098,9 @@ def _adapt_time(t) -> str: sqlite3.register_adapter(datetime, adapt_datetime_iso) convert_date = lambda val: date.fromisoformat(val.decode()) - convert_datetime = lambda val: datetime.fromisoformat(val.decode()) - convert_timestamp = lambda val: datetime.fromtimestamp(int(val)) + convert_timestamp = lambda val: datetime.fromisoformat(val.decode()) sqlite3.register_converter("date", convert_date) - sqlite3.register_converter("datetime", convert_datetime) sqlite3.register_converter("timestamp", convert_timestamp) def sql_schema(self) -> str: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bfa93a4ff910e..5fd6a52031c52 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2970,6 +2970,24 @@ def test_roundtripping_datetimes(self): assert result == "2020-12-31 12:00:00.000000" +@pytest.fixture +def sqlite_builtin_detect_types(): + with contextlib.closing( + sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES) + ) as closing_conn: + with closing_conn as conn: + yield conn + + +def test_roundtripping_datetimes_detect_types(sqlite_builtin_detect_types): + # https://github.com/pandas-dev/pandas/issues/55554 + conn = sqlite_builtin_detect_types + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", conn).iloc[0, 0] + assert result == Timestamp("2020-12-31 12:00:00.000000") + + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): """ From 9c7b5677efe10bef70ed34f207f7ef996503afff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Oct 2023 17:38:40 +0200 Subject: [PATCH 155/174] Backport PR #54922 on branch 2.1.x (REGR: Restore _constructor_from_mgr to pass manager object to constructor) (#55704) Backport PR #54922: REGR: Restore _constructor_from_mgr to pass manager object to constructor --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/frame.py | 19 +++++------ pandas/core/series.py | 20 +++++------ pandas/tests/frame/test_arithmetic.py | 3 ++ pandas/tests/frame/test_subclass.py | 34 ++++++++++++++++++- pandas/tests/groupby/test_groupby_subclass.py | 4 +++ pandas/tests/reshape/concat/test_concat.py | 3 ++ pandas/tests/reshape/merge/test_merge.py | 3 ++ .../tests/reshape/merge/test_merge_ordered.py | 3 ++ pandas/tests/series/methods/test_to_frame.py | 5 +++ pandas/tests/series/test_subclass.py | 4 +++ 11 files changed, 77 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1d2d89e7eadd1..a59cadde422b9 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,6 +29,7 @@ Fixed regressions - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) - Fixed regression in 
:meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) +- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1614ecf1d7037..c109070ce461d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -639,14 +639,12 @@ def _constructor(self) -> Callable[..., DataFrame]: return DataFrame def _constructor_from_mgr(self, mgr, axes): - df = self._from_mgr(mgr, axes=axes) - - if type(self) is DataFrame: - # fastpath avoiding constructor call - return df + if self._constructor is DataFrame: + # we are pandas.DataFrame (or a subclass that doesn't override _constructor) + return self._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes - return self._constructor(df, copy=False) + return self._constructor(mgr) _constructor_sliced: Callable[..., Series] = Series @@ -654,13 +652,12 @@ def _sliced_from_mgr(self, mgr, axes) -> Series: return Series._from_mgr(mgr, axes) def _constructor_sliced_from_mgr(self, mgr, axes): - ser = self._sliced_from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is DataFrame: - # fastpath avoiding constructor call + if self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name return ser assert axes is mgr.axes - return self._constructor_sliced(ser, copy=False) + return self._constructor_sliced(mgr) # ---------------------------------------------------------------------- # Constructors diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d83643722ece..8ad049e173507 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -579,14 +579,14 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - ser = self._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is Series: - # fastpath avoiding constructor call + if self._constructor is Series: + # we are pandas.Series (or a subclass that doesn't override _constructor) + ser = self._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name return ser else: assert axes is mgr.axes - return self._constructor(ser, copy=False) + return self._constructor(mgr) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -610,12 +610,12 @@ def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: return DataFrame._from_mgr(mgr, axes=mgr.axes) def _constructor_expanddim_from_mgr(self, mgr, axes): - df = self._expanddim_from_mgr(mgr, axes) - if type(self) is Series: - # fastpath avoiding constructor - return df + from pandas.core.frame import DataFrame + + if self._constructor_expanddim is DataFrame: + return self._expanddim_from_mgr(mgr, axes) assert axes is mgr.axes - return self._constructor_expanddim(df, copy=False) + return self._constructor_expanddim(mgr) # types @property diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 670d4d4f554a6..e5a8feb7a89d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2078,6 +2078,9 @@ def test_frame_sub_nullable_int(any_int_ea_dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing 
a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 37ca52eba6451..8e657f197ca1e 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -10,6 +10,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.fixture() def gpd_style_subclass_df(): @@ -734,8 +738,36 @@ def test_replace_list_method(self): # https://github.com/pandas-dev/pandas/pull/46018 df = tm.SubclassedDataFrame({"A": [0, 1, 2]}) msg = "The 'method' keyword in SubclassedDataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=False + ): result = df.replace([1, 2], method="ffill") expected = tm.SubclassedDataFrame({"A": [0, 0, 0]}) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) + + +class MySubclassWithMetadata(DataFrame): + _metadata = ["my_metadata"] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + my_metadata = kwargs.pop("my_metadata", None) + if args and isinstance(args[0], MySubclassWithMetadata): + my_metadata = args[0].my_metadata # type: ignore[has-type] + self.my_metadata = my_metadata + + @property + def _constructor(self): + return MySubclassWithMetadata + + +def test_constructor_with_metadata(): + # https://github.com/pandas-dev/pandas/pull/54922 + # https://github.com/pandas-dev/pandas/issues/55120 + df = MySubclassWithMetadata( + np.random.default_rng(2).random((5, 3)), columns=["A", "B", "C"] + ) + subset = df[["A", "B"]] + assert isinstance(subset, MySubclassWithMetadata) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..678211ea4a053 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -11,6 +11,10 @@ import pandas._testing as tm from pandas.tests.groupby import get_groupby_method_args +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5dde863f246d1..7863aa55152f1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -591,6 +591,9 @@ def test_duplicate_keys_same_frame(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) @pytest.mark.parametrize( "obj", [ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 304ec37b1e453..6868f7344c153 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -689,6 +689,9 @@ def test_merge_nan_right2(self): )[["i1", "i2", "i1_", "i3"]] tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, df, df2): class NotADataFrame(DataFrame): @property 
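# --- illustrative sketch, not part of the backported patch ------------------
# What the restored ``_constructor_from_mgr`` path buys subclass authors:
# operations that rebuild a frame from an internal manager route through the
# subclass ``_constructor`` again, so custom ``__init__`` logic runs. The
# class below is modeled on the ``MySubclassWithMetadata`` test added above
# (``TaggedFrame`` is a made-up name); constructing from a manager emits the
# DeprecationWarning these tests filter. Assumes pandas 2.1.2:
import numpy as np
import pandas as pd

class TaggedFrame(pd.DataFrame):
    _metadata = ["tag"]

    def __init__(self, *args, **kwargs):
        tag = kwargs.pop("tag", None)
        super().__init__(*args, **kwargs)
        if args and isinstance(args[0], TaggedFrame):
            tag = args[0].tag
        self.tag = tag

    @property
    def _constructor(self):
        return TaggedFrame

df = TaggedFrame(np.arange(12).reshape(4, 3), columns=list("abc"), tag="demo")
subset = df[["a", "b"]]  # rebuilt via _constructor_from_mgr
assert isinstance(subset, TaggedFrame)
# -----------------------------------------------------------------------------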
diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 1d0d4e3eb554b..cfb4e92fb45cd 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -70,6 +70,9 @@ def test_multigroup(self, left, right): result = merge_ordered(left, right, on="key", left_by="group") assert result["group"].notna().all() + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, left, right): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/series/methods/test_to_frame.py b/pandas/tests/series/methods/test_to_frame.py index 01e547aa34b47..0eadf696b34cc 100644 --- a/pandas/tests/series/methods/test_to_frame.py +++ b/pandas/tests/series/methods/test_to_frame.py @@ -1,3 +1,5 @@ +import pytest + from pandas import ( DataFrame, Index, @@ -40,6 +42,9 @@ def test_to_frame(self, datetime_series): ) tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_to_frame_expanddim(self): # GH#9762 diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index a3550c6de6780..c2d5afcf884b1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -4,6 +4,10 @@ import pandas as pd import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + class TestSeriesSubclassing: @pytest.mark.parametrize( From 16a3b01647c5dca09c6767e880bf5754e01a9a76 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 26 Oct 2023 18:05:45 +0200 Subject: [PATCH 156/174] Backport PR #55691 on branch 2.1.x (REGR: fix read_parquet with column of large strings (avoid overflow from concat)) (#55706) Backport PR #55691: REGR: fix read_parquet with column of large strings (avoid overflow from concat) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_.py | 12 ++++++++++-- pandas/tests/io/test_parquet.py | 12 ++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a59cadde422b9..d2e10b4a42e68 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -28,6 +28,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) +- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) - Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) diff --git a/pandas/core/arrays/string_.py 
b/pandas/core/arrays/string_.py index 58c638e7f59b5..a41214ae17044 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -223,11 +223,19 @@ def __from_arrow__( # pyarrow.ChunkedArray chunks = array.chunks + results = [] + for arr in chunks: + # convert chunk by chunk to numpy and concatenate then, to avoid + # overflow for large string data when concatenating the pyarrow arrays + arr = arr.to_numpy(zero_copy_only=False) + arr = ensure_string_array(arr, na_value=libmissing.NA) + results.append(arr) + if len(chunks) == 0: arr = np.array([], dtype=object) else: - arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = np.concatenate(results) + # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 41ae4042e9ae0..1d68f12270b55 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1144,6 +1144,18 @@ def test_infer_string_large_string_type(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) + # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) + # @pytest.mark.slow + # def test_string_column_above_2GB(self, tmp_path, pa): + # # https://github.com/pandas-dev/pandas/issues/55606 + # # above 2GB of string data + # v1 = b"x" * 100000000 + # v2 = b"x" * 147483646 + # df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string") + # df.to_parquet(tmp_path / "test.parquet") + # result = read_parquet(tmp_path / "test.parquet") + # assert result["strings"].dtype == "string" + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From a60ad39b4a9febdea9a59d602dad44b1538b0ea5 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Thu, 26 Oct 2023 12:08:11 -0400 Subject: [PATCH 157/174] RLS: 2.1.2 From d11fa8d3db49d02e36e090d02b1be7eedc8eff56 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Oct 2023 17:04:29 +0200 Subject: [PATCH 158/174] Backport PR #55722 on branch 2.1.x (DOC: Add whatsnew for 2.1.3) (#55723) Backport PR #55722: DOC: Add whatsnew for 2.1.3 Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.1.rst | 2 +- doc/source/whatsnew/v2.1.2.rst | 2 ++ doc/source/whatsnew/v2.1.3.rst | 41 ++++++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v2.1.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index baab20845a2a2..eef65366a2762 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.3 v2.1.2 v2.1.1 v2.1.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index c52f7db8ec3ec..6101333ae760c 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -52,4 +52,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.0..v2.1.1|HEAD +.. contributors:: v2.1.0..v2.1.1 diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index d2e10b4a42e68..ade7f767815b1 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -66,3 +66,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. 
contributors:: v2.1.1..v2.1.2 diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst new file mode 100644 index 0000000000000..1359123ef153e --- /dev/null +++ b/doc/source/whatsnew/v2.1.3.rst @@ -0,0 +1,41 @@ +.. _whatsnew_213: + +What's new in 2.1.3 (November ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.2..v2.1.3|HEAD From c5da5ea15a17a2c0e95495619555b77488e2811b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Oct 2023 18:43:34 +0200 Subject: [PATCH 159/174] Backport PR #55707 on branch 2.1.x (REF: Avoid np.can_cast for scalar inference for NEP 50) (#55716) Backport PR #55707: REF: Avoid np.can_cast for scalar inference for NEP 50 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/run_tests.sh | 3 ++- pandas/core/dtypes/cast.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..6a70ea1df3e71 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE +PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8ed57e9bf5532..9a9a8ed22f282 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -700,7 +700,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): - if not np.can_cast(fill_value, dtype): + if not np_can_cast_scalar(fill_value, dtype): # type: ignore[arg-type] # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) @@ -1892,4 +1892,25 @@ def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ if not len(rng): return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) + return np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype) + + +def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool: + """ + np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar + inference + + Parameters + ---------- + element : Scalar + dtype : np.dtype + + Returns + ------- + bool + """ + try: + 
np_can_hold_element(dtype, element) + return True + except (LossySetitemError, NotImplementedError): + return False From 08f2bd4c5da3adae7ea1a82a22fbe7b0494a3d59 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 29 Oct 2023 02:43:00 +0100 Subject: [PATCH 160/174] Backport PR #55747 on branch 2.1.x (DOC: Fix release date for 2.1.2) (#55749) Backport PR #55747: DOC: Fix release date for 2.1.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ade7f767815b1..a13a273b01257 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_212: -What's new in 2.1.2 (October 25, 2023) +What's new in 2.1.2 (October 26, 2023) --------------------------------------- These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog From bf9168410cb81cdc87928f637fba6638002cb33a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 1 Nov 2023 23:52:02 +0100 Subject: [PATCH 161/174] Backport PR #55726 on branch 2.1.x (fix segfault with tz_localize_to_utc) (#55796) Backport PR #55726: fix segfault with tz_localize_to_utc Co-authored-by: William Ayd --- pandas/_libs/tslibs/tzconversion.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e17d264333264..af27acf16b771 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -332,13 +332,16 @@ timedelta-like} # Shift the delta_idx by if the UTC offset of # the target tz is greater than 0 and we're moving forward # or vice versa - first_delta = info.deltas[0] - if (shift_forward or shift_delta > 0) and first_delta > 0: - delta_idx_offset = 1 - elif (shift_backward or shift_delta < 0) and first_delta < 0: - delta_idx_offset = 1 - else: - delta_idx_offset = 0 + # TODO: delta_idx_offset and info.deltas are needed for zoneinfo timezones, + # but are not applicable for all timezones. 
Setting the former to 0 and + # length checking the latter avoids UB, but this could use a larger refactor + delta_idx_offset = 0 + if len(info.deltas): + first_delta = info.deltas[0] + if (shift_forward or shift_delta > 0) and first_delta > 0: + delta_idx_offset = 1 + elif (shift_backward or shift_delta < 0) and first_delta < 0: + delta_idx_offset = 1 for i in range(n): val = vals[i] From 8a616b901f1f2a3f8fdcbc8a6e4754330abafba1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 6 Nov 2023 23:05:51 +0100 Subject: [PATCH 162/174] Backport PR #55817 on branch 2.1.x (COMPAT: Numpy int64 Windows default for Numpy 2.0) (#55851) Backport PR #55817: COMPAT: Numpy int64 Windows default for Numpy 2.0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/numpy/__init__.py | 3 ++- pandas/tests/extension/base/reduce.py | 4 +-- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/extension/test_masked.py | 29 +++++++++++++++++----- pandas/tests/frame/methods/test_reindex.py | 3 ++- pandas/tests/frame/test_reductions.py | 14 +++++++---- 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 51c9892b64a08..0552301b59400 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -11,6 +11,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") +np_version_gt2 = _nlv >= Version("2.0.0.dev0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" @@ -26,7 +27,7 @@ np_long: type np_ulong: type -if _nlv >= Version("2.0.0.dev0"): +if np_version_gt2: try: with warnings.catch_warnings(): warnings.filterwarnings( diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 43b2df4290eed..91d4855dcefe5 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -39,7 +39,7 @@ def check_reduce(self, s, op_name, skipna): expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): # Find the expected dtype when the given reduction is done on a DataFrame # column with this array. The default assumes float64-like behavior, # i.e. retains the dtype. @@ -58,7 +58,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} - cmp_dtype = self._get_expected_reduction_dtype(arr, op_name) + cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) # The DataFrame method just calls arr._reduce with keepdims=True, # so this first check is perfunctory. 
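# --- illustrative sketch, not part of the backported patch ------------------
# Why the test changes in this patch thread ``np_version_gt2`` through:
# NumPy 2.0 makes the default integer 64-bit on Windows (previously 32-bit,
# via the C ``long``), so the dtype produced by reducing narrow masked
# integer columns depends on platform, pointer size and NumPy version.
# Roughly the rule the tests encode, ignoring the skipna quirk flagged in
# the TODO further below (``expected_masked_sum_dtype`` is a made-up helper;
# assumes the ``np_version_gt2`` compat flag this patch introduces):
import pandas as pd
from pandas.compat import IS64, is_platform_windows
from pandas.compat.numpy import np_version_gt2

def expected_masked_sum_dtype(signed: bool = True) -> str:
    narrow = (is_platform_windows() and not np_version_gt2) or not IS64
    if signed:
        return "Int32" if narrow else "Int64"
    return "UInt32" if narrow else "UInt64"

df = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int8")})
print(df.sum().dtype, expected_masked_sum_dtype())
# -----------------------------------------------------------------------------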
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fef4fbea2e485..61474aa94d1c8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -543,7 +543,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index c4195be8ea121..588a2fb58d9be 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -20,6 +20,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -40,7 +41,7 @@ ) from pandas.tests.extension import base -is_windows_or_32bit = is_platform_windows() or not IS64 +is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64 pytestmark = [ pytest.mark.filterwarnings( @@ -325,7 +326,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if tm.is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: @@ -335,16 +336,32 @@ def _get_expected_reduction_dtype(self, arr, op_name: str): elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + # TODO: Why does Window Numpy 2.0 dtype depend on skipna? 
+ cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + cmp_dtype = ( + "UInt32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "UInt64" + ) elif arr.dtype.kind == "b": if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) else: raise TypeError("not supposed to reach this") else: @@ -360,7 +377,7 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 length = 64 - if not IS64 or is_platform_windows(): + if is_windows_or_32bit: # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "itemsize" if not ser.dtype.itemsize == 8: # type: ignore[union-attr] diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..678fec835aa82 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -131,7 +132,7 @@ class TestDataFrameSelectReindex: # test_indexing @pytest.mark.xfail( - not IS64 or is_platform_windows(), + not IS64 or (is_platform_windows() and not np_version_gt2), reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux builds", ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 61d7c125ee5e7..bec1fcd1e7462 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -32,6 +33,7 @@ nanops, ) +is_windows_np2_or_is32 = (is_platform_windows() and not np_version_gt2) or not IS64 is_windows_or_is32 = is_platform_windows() or not IS64 @@ -1766,13 +1768,13 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), - ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"), @@ -1787,6 +1789,8 @@ def 
test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype expected = Series([exp_value, exp_value], dtype=exp_dtype) tm.assert_series_equal(result, expected) + # TODO: why does min_count=1 impact the resulting Windows dtype + # differently than min_count=0? @pytest.mark.parametrize( "opname, dtype, exp_dtype", [ From 0bf70c7cf6ff0cc3fd18f44f450bef7e527f984c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 6 Nov 2023 17:31:11 -0500 Subject: [PATCH 163/174] Backport PR #55761 on branch 2.1.x (BUG: DatetimeIndex.diff raising TypeError) (#55819) * Backport PR #55473: TST: sort index with sliced MultiIndex * BUG: DatetimeIndex.diff raising TypeError (#55761) * BUG: DatetimeIndex.diff raising TypeError * add test and whatsnew * typing * use Index (cherry picked from commit f04da3c1c9f5fd81ce7a04e55a965e4021de2ae9) * fix whatsnew --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/indexes/base.py | 5 +++-- pandas/tests/indexes/test_datetimelike.py | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 1359123ef153e..3b1cd1c152baa 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b4ef5380d2b30..85b68c1bc2ec7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7024,7 +7024,8 @@ def infer_objects(self, copy: bool = True) -> Index: result._references.add_index_reference(result) return result - def diff(self, periods: int = 1): + @final + def diff(self, periods: int = 1) -> Index: """ Computes the difference between consecutive values in the Index object. 
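# --- illustrative sketch, not part of the backported patch ------------------
# The hunk below swaps ``self._constructor`` for a plain ``Index``: for a
# ``DatetimeIndex`` the element-wise differences are timedeltas, so rebuilding
# the result through the datetime constructor raised ``TypeError``. A minimal
# reproduction of the fixed behavior, assuming pandas 2.1.3:
import pandas as pd

dti = pd.to_datetime([10, 20, 30], unit="s")
print(dti.diff(1))
# TimedeltaIndex(['NaT', '0 days 00:00:10', '0 days 00:00:10'], ...)
# -----------------------------------------------------------------------------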
@@ -7050,7 +7051,7 @@ def diff(self, periods: int = 1): Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') """ - return self._constructor(self.to_series().diff(periods)) + return Index(self.to_series().diff(periods)) def round(self, decimals: int = 0): """ diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 71cc7f29c62bc..5ad2e9b2f717e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -159,3 +159,11 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_diff(self, unit): + # GH 55080 + dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) + result = dti.diff(1) + expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + tm.assert_index_equal(result, expected) From 7702f416948134d268274dd33f720151e19f3f76 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 Nov 2023 05:46:11 -1000 Subject: [PATCH 164/174] Backport PR #55853: DEPS: Use ipython run_cell instead of run_code; remove pytest-asyncio (#55858) --- .github/workflows/unit-tests.yml | 6 +++--- .github/workflows/wheels.yml | 2 +- ci/deps/actions-310.yaml | 1 - ci/deps/actions-311-downstream_compat.yaml | 1 - ci/deps/actions-311-numpydev.yaml | 1 - ci/deps/actions-311-pyarrownightly.yaml | 1 - ci/deps/actions-311.yaml | 1 - ci/deps/actions-39-minimum_versions.yaml | 1 - ci/deps/actions-39.yaml | 1 - ci/deps/actions-pypy-39.yaml | 1 - ci/deps/circle-310-arm64.yaml | 1 - ci/meta.yaml | 1 - environment.yml | 1 - pandas/tests/arrays/categorical/test_warnings.py | 7 ++----- pandas/tests/frame/test_api.py | 8 ++------ pandas/tests/indexes/test_base.py | 6 ++---- pandas/tests/resample/test_resampler_grouper.py | 6 ++---- pandas/util/_test_decorators.py | 12 +----------- pyproject.toml | 6 ++---- requirements-dev.txt | 1 - scripts/tests/data/deps_expected_random.yaml | 1 - scripts/tests/data/deps_minimum.toml | 6 ++---- scripts/tests/data/deps_unmodified_random.yaml | 1 - 23 files changed, 17 insertions(+), 56 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8ff7d290ea9d1..14abcdbfdb310 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6759f7b82eacf..95b48ee4b7426 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -168,7 +168,7 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f8fa04a60a378..82f41811bf56e 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8bf5ea51c4bf3..8422aba5239f9 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -15,7 +15,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 9a6c05a138f9b..695bd483fc572 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -18,7 +18,6 @@ dependencies: # causes an InternalError within pytest - pytest-xdist>=2.2.0, <3 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # pandas dependencies - python-dateutil diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d7e5523dd2545..b5df68968ea46 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -15,7 +15,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # required dependencies - python-dateutil diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 7f8cca9e65fe1..cb1771c485297 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - 
pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 5d34940aa27e1..e4db8cb0a540b 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index f5ea116e51dd4..745781298b86a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index a489690776be3..ce02ea2b9a090 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -16,7 +16,6 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-asyncio>=0.17.0 - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a71523e8c3579..08a422c6d538a 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/meta.yaml b/ci/meta.yaml index 77f536a18a204..5b343f60b3dce 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,7 +64,6 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-asyncio >=0.17.0 - pytest-xdist >=2.2.0 - pytest-cov - hypothesis >=6.46.1 diff --git a/environment.yml b/environment.yml index 98c39378d1ae1..3f6f0de68748b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - coverage # required dependencies diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index c0623faccd325..68c59706a6c3b 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,19 +1,16 @@ import pytest -from pandas.util._test_decorators import async_mark - import pandas._testing as tm class TestCategoricalWarnings: - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = pd.Categorical([])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..aa7aa8964a059 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,8 +7,6 @@ from pandas._config.config import option_context -from pandas.util._test_decorators import async_mark - import pandas as pd from pandas import ( DataFrame, @@ -288,8 +286,7 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - @async_mark() - async def test_tab_complete_warning(self, ip, frame_or_series): + def test_tab_complete_warning(self, ip, frame_or_series): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from 
IPython.core.completer import provisionalcompleter @@ -299,8 +296,7 @@ async def test_tab_complete_warning(self, ip, frame_or_series): else: code = "from pandas import Series; obj = Series(dtype=object)" - await ip.run_code(code) - + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 with tm.assert_produces_warning(None, raise_on_extra_warnings=False): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8fd1e296fb79a..da4b44227bef3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -10,7 +10,6 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -1218,14 +1217,13 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 6abdde0f5ccff..b5288c793dafa 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -4,7 +4,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.util._test_decorators import async_mark import pandas as pd from pandas import ( @@ -26,8 +25,7 @@ def test_frame(): ) -@async_mark() -async def test_tab_complete_ipython6_warning(ip): +def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( @@ -37,7 +35,7 @@ async def test_tab_complete_ipython6_warning(ip): rs = s.resample("D") """ ) - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 03011a1ffe622..93c46274fdc1b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,11 +38,11 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + from pandas.compat import ( IS64, is_platform_windows, ) -from pandas.compat._optional import import_optional_dependency from pandas.core.computation.expressions import ( NUMEXPR_INSTALLED, @@ -214,16 +214,6 @@ def documented_fixture(fixture): return documented_fixture -def async_mark(): - try: - import_optional_dependency("pytest_asyncio") - async_mark = pytest.mark.asyncio - except ImportError: - async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") - - return async_mark - - def mark_array_manager_not_yet_implemented(request) -> None: mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") request.node.add_marker(mark) diff --git a/pyproject.toml b/pyproject.toml index e9e3e2f67e2f1..d7634894b4835 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" 
[project.optional-dependencies] -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.4', 'numba>=0.55.2', 'numexpr>=2.8.0'] computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] @@ -109,7 +109,6 @@ all = ['beautifulsoup4>=4.11.1', 'pyreadstat>=1.1.5', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', @@ -151,7 +150,7 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -514,7 +513,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/requirements-dev.txt b/requirements-dev.txt index 855eaccfd4f5f..af4f82b068d31 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,7 +9,6 @@ meson-python==0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 -pytest-asyncio>=0.17.0 coverage python-dateutil numpy<2 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..bebb25b9e1bb8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - psutil - - pytest-asyncio>=0.17.0 - boto3 # required dependencies diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..5fb51856e841e 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -55,7 +55,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] @@ -102,7 +102,6 @@ all = ['beautifulsoup4>=5.9.3', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', @@ -141,7 +140,7 @@ parentdir_prefix = "pandas-" [tool.cibuildwheel] skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*" build-verbosity = "3" -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = "python {project}/ci/test_wheels.py" [tool.cibuildwheel.macos] @@ -385,7 +384,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/scripts/tests/data/deps_unmodified_random.yaml 
b/scripts/tests/data/deps_unmodified_random.yaml
index 503eb3c7c7734..e54482282fcc8 100644
--- a/scripts/tests/data/deps_unmodified_random.yaml
+++ b/scripts/tests/data/deps_unmodified_random.yaml
@@ -14,7 +14,6 @@ dependencies:
   - pytest-cov
   - pytest-xdist>=2.2.0
   - psutil
-  - pytest-asyncio>=0.17
   - boto3
 
 # required dependencies
 
From c9854d9821b2b7e73f88a3c2d6055c8afcf5dff7 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 Nov 2023 10:07:46 -1000
Subject: [PATCH 165/174] TST: Make read_csv tests pyarrow 13 compatible on
 2.1.x (#55855)

* TST: Make read_csv tests pyarrow 13 compatible on 2.1.x

* Skip on Windows for ARROW_TIMEZONE_DATABASE
---
 pandas/tests/interchange/test_impl.py           | 16 +++++++++++++++-
 .../tests/io/parser/common/test_common_basic.py |  8 ++++++--
 pandas/tests/io/parser/test_parse_dates.py      |  8 +++++++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 8a25a2c1889f3..61d3edcbb2964 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -4,6 +4,10 @@
 import pytest
 
 from pandas._libs.tslibs import iNaT
+from pandas.compat import (
+    is_ci_environment,
+    is_platform_windows,
+)
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -309,11 +313,21 @@ def test_datetimetzdtype(tz, unit):
     tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
 
 
-def test_interchange_from_non_pandas_tz_aware():
+def test_interchange_from_non_pandas_tz_aware(request):
     # GH 54239, 54287
     pa = pytest.importorskip("pyarrow", "11.0.0")
     import pyarrow.compute as pc
 
+    if is_platform_windows() and is_ci_environment():
+        mark = pytest.mark.xfail(
+            raises=pa.ArrowInvalid,
+            reason=(
+                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
+                "on CI to path to the tzdata for pyarrow."
+ ), + ) + request.node.add_marker(mark) + arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)]) arr = pc.assume_timezone(arr, "Asia/Kathmandu") table = pa.table({"arr": arr}) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 00a26a755756f..442aa5ef87b10 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -86,7 +86,9 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -177,7 +179,9 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9f7840588f89e..c6afff56de16b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -979,12 +979,15 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -def test_parse_tz_aware(all_parsers, request): +def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -2231,6 +2234,9 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ From 569f904d45aeb5d0abb925e9875140afd61e3da4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 8 Nov 2023 01:08:46 +0100 Subject: [PATCH 166/174] Backport PR #55227 on branch 2.1.x (BUG: Interchange object data buffer has the wrong dtype / from_dataframe incorrect ) (#55863) Backport PR #55227: BUG: Interchange object data buffer has the wrong dtype / from_dataframe incorrect Co-authored-by: Marco Edward Gorelli Co-authored-by: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> --- pandas/core/interchange/from_dataframe.py | 16 ++++++++-------- pandas/tests/interchange/test_impl.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 214fbf9f36435..d45ae37890ba7 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,10 +266,9 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, protocol_data_dtype = 
buffers["data"] + data_buff, _ = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert protocol_data_dtype[1] == 8 - assert protocol_data_dtype[2] in ( + assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 @@ -377,15 +376,16 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any """ buffers = col.get_buffers() - _, _, format_str, _ = col.dtype - dbuf, dtype = buffers["data"] + _, col_bit_width, format_str, _ = col.dtype + dbuf, _ = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( dbuf, ( - DtypeKind.UINT, - dtype[1], - getattr(ArrowCTypes, f"UINT{dtype[1]}"), + DtypeKind.INT, + col_bit_width, + getattr(ArrowCTypes, f"INT{col_bit_width}"), Endianness.NATIVE, ), offset=col.offset, diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 61d3edcbb2964..97a388569e261 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -18,6 +18,7 @@ DtypeKind, ) from pandas.core.interchange.from_dataframe import from_dataframe +from pandas.core.interchange.utils import ArrowCTypes @pytest.fixture @@ -340,3 +341,24 @@ def test_interchange_from_non_pandas_tz_aware(request): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__() + interchange = df.__dataframe__() + column = interchange.get_column_by_name("a") + buffers = column.get_buffers() + buffers_data = buffers["data"] + buffer_dtype = buffers_data[1] + buffer_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + buffer_dtype[3], + ) + buffers["data"] = (buffers_data[0], buffer_dtype) + column.get_buffers = lambda: buffers + interchange.get_column_by_name = lambda _: column + monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) + pd.api.interchange.from_dataframe(df) From de8af3c3f4fe9552769b10e458678ff12937bcd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Nov 2023 18:02:19 +0100 Subject: [PATCH 167/174] Backport PR #55427 on branch 2.1.x (DOC: Remove outdated docs about NumPy's broadcasting) (#55896) Backport PR #55427: DOC: Remove outdated docs about NumPy's broadcasting Co-authored-by: cobalt <61329810+RedGuy12@users.noreply.github.com> --- doc/source/user_guide/basics.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2e299da5e5794..d3c9d83b943ce 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -408,20 +408,6 @@ raise a ValueError: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) -Note that this is different from the NumPy behavior where a comparison can -be broadcast: - -.. ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. 
ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 41586668d290d83910c9adcbade49a911025016b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Nov 2023 21:23:57 +0100 Subject: [PATCH 168/174] Backport PR #55764 on branch 2.1.x (REGR: fix return class in _constructor_from_mgr for simple subclasses) (#55892) Backport PR #55764: REGR: fix return class in _constructor_from_mgr for simple subclasses Co-authored-by: Isaac Virshup --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/frame/test_subclass.py | 41 +++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..264cd440907fd 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c109070ce461d..605cf49856e16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -641,7 +641,7 @@ def _constructor(self) -> Callable[..., DataFrame]: def _constructor_from_mgr(self, mgr, axes): if self._constructor is DataFrame: # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return self._from_mgr(mgr, axes=axes) + return DataFrame._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes return self._constructor(mgr) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8ad049e173507..7b22d89bfe22d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -581,7 +581,7 @@ def _constructor(self) -> Callable[..., Series]: def _constructor_from_mgr(self, mgr, axes): if self._constructor is Series: # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = self._from_mgr(mgr, axes=axes) + ser = Series._from_mgr(mgr, axes=axes) ser._name = None # caller is responsible for setting real name return ser else: diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 8e657f197ca1e..ef78ae62cb4d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -771,3 +771,44 @@ def test_constructor_with_metadata(): ) subset = df[["A", "B"]] assert isinstance(subset, MySubclassWithMetadata) + + +class SimpleDataFrameSubClass(DataFrame): + """A subclass of DataFrame that does not define a constructor.""" + + +class SimpleSeriesSubClass(Series): + """A subclass of Series that does not define a constructor.""" + + +class TestSubclassWithoutConstructor: + def test_copy_df(self): + expected = DataFrame({"a": [1, 2, 3]}) + result = SimpleDataFrameSubClass(expected).copy() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_copy_series(self): + expected = Series([1, 2, 3]) + result = SimpleSeriesSubClass(expected).copy() + + tm.assert_series_equal(result, expected) + + def test_series_to_frame(self): + orig = Series([1, 2, 3]) + expected = orig.to_frame() + result = SimpleSeriesSubClass(orig).to_frame() + + assert ( + 
type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_groupby(self): + df = SimpleDataFrameSubClass(DataFrame({"a": [1, 2, 3]})) + + for _, v in df.groupby("a"): + assert type(v) is DataFrame From 1691a51ce7d6028ec48ef4f41709f8edce346ab9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Nov 2023 00:24:32 +0100 Subject: [PATCH 169/174] Backport PR #55894 on 2.1.x: Parquet/Feather IO: disable PyExtensionType autoload (#55900) Parquet/Feather IO: disable PyExtensionType autoload (#55894) * Parquet/Feather IO: disable PyExtensionType autoload * don't install hotfix for pyarrow >= 14.0.1 * move patching to extension type definitions * expand error message * fix compat for pyarrow not installed * add whatsnew (cherry picked from commit 851fea0ea38985cd7d5e0a3b7a7a7539b5883307) --- doc/source/whatsnew/v2.1.3.rst | 3 +- pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/core/arrays/arrow/extension_types.py | 60 +++++++++++++++++++++ pandas/io/feather_format.py | 3 ++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 264cd440907fd..531559c259b44 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 684e9dccdc0f9..6896945344b24 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -188,6 +189,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 12f58be109d98..96501b47e07f6 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -25,3 +26,4 @@ pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 3249c1c829546..36d536bf86847 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': 
+storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..b018b5720d126 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + check_dtype_backend(dtype_backend) with get_handle( From 92ce245e5d84e05a2943966e328b4e6b632bc8e2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 10 Nov 2023 14:54:05 +0100 Subject: [PATCH 170/174] Backport PR #55907 on branch 2.1.x (DOC: Add release date for 2.1.3) (#55913) Backport PR #55907: DOC: Add release date for 2.1.3 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.3.rst | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 531559c259b44..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_213: -What's new in 2.1.3 (November ??, 2023) +What's new in 2.1.3 (November 10, 2023) --------------------------------------- These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog @@ -14,7 +14,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_213.bug_fixes:
@@ -25,14 +24,6 @@ Bug fixes
 - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)
 - Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`)
 
-.. ---------------------------------------------------------------------------
-.. _whatsnew_213.other:
-
-Other
-~~~~~
--
--
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_213.contributors:
 
From d9665ca37076d4dd6c18a22e0b3a767fdc097691 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Fri, 10 Nov 2023 16:41:43 +0100
Subject: [PATCH 171/174] Backport PR #55911 on branch 2.1.x (DOC: convert
 outdated example of NumPy's broadcasting to literal code block) (#55912)

Backport PR #55911: DOC: convert outdated example of NumPy's broadcasting to literal code block

Co-authored-by: Joris Van den Bossche
---
 doc/source/whatsnew/v0.17.0.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst
index abbda2ffc9be2..5a668e5fabd9c 100644
--- a/doc/source/whatsnew/v0.17.0.rst
+++ b/doc/source/whatsnew/v0.17.0.rst
@@ -726,10 +726,10 @@ be broadcast:
 
 or it can return False if broadcasting can not be done:
 
-.. ipython:: python
-   :okwarning:
+.. code-block:: ipython
 
-   np.array([1, 2, 3]) == np.array([1, 2])
+   In [11]: np.array([1, 2, 3]) == np.array([1, 2])
+   Out[11]: False
 
 Changes to boolean comparisons vs. None
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
From 2a953cf80b77e4348bf50ed724f8abc0d814d9dd Mon Sep 17 00:00:00 2001
From: Pandas Development Team
Date: Fri, 10 Nov 2023 18:14:03 +0100
Subject: [PATCH 172/174] RLS: 2.1.3

From 467bef2e6f2b2d2a0c4a7886a36424a5e6e3bcdc Mon Sep 17 00:00:00 2001
From: Anusha Udayakumar
Date: Tue, 15 Apr 2025 22:18:08 -0400
Subject: [PATCH 173/174] WIP: assign() validation for tuple keys

---
 pandas/core/frame.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 605cf49856e16..2d4fee2f7c227 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5008,6 +5008,13 @@ def assign(self, **kwargs) -> DataFrame:
         """
         data = self.copy(deep=None)
 
+        for key in kwargs:
+            if not isinstance(key, str):
+                raise TypeError(
+                    "assign() only supports string column names. "
+                    f"Use df[{key!r}] = ... to assign non-string column names like tuples."
+                )
+
         for k, v in kwargs.items():
             data[k] = com.apply_if_callable(v, data)
         return data

From 6846be4dc317a874dd45a086a6f72de097dbd55f Mon Sep 17 00:00:00 2001
From: Anusha Udayakumar
Date: Tue, 15 Apr 2025 22:59:27 -0400
Subject: [PATCH 174/174] BUG: Raise clear error when assign is used with
 non-string column keys

---
 pandas/core/frame.py                      |  6 ++++--
 pandas/tests/frame/methods/test_assign.py | 10 +++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2d4fee2f7c227..cb04905323af1 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4944,7 +4944,7 @@ def insert(
         value, refs = self._sanitize_column(value)
         self._mgr.insert(loc, column, value, refs=refs)
 
-    def assign(self, **kwargs) -> DataFrame:
+    def assign(self, __kwargs_dict__=None, **kwargs) -> DataFrame:
         r"""
         Assign new columns to a DataFrame.
 
@@ -5006,7 +5006,8 @@ def assign(self, **kwargs) -> DataFrame:
         Portland  17.0    62.6  290.15
         Berkeley  25.0    77.0  298.15
         """
-        data = self.copy(deep=None)
+        if __kwargs_dict__ is not None:
+            kwargs = __kwargs_dict__
 
         for key in kwargs:
             if not isinstance(key, str):
@@ -5015,10 +5016,11 @@ def assign(self, **kwargs) -> DataFrame:
                 f"Use df[{key!r}] = ... to assign non-string column names like tuples."
             )
 
+        data = self.copy(deep=None)
         for k, v in kwargs.items():
             data[k] = com.apply_if_callable(v, data)
         return data
 
     def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
         """
         Ensures new columns (which go into the BlockManager as new blocks) are
diff --git a/pandas/tests/frame/methods/test_assign.py b/pandas/tests/frame/methods/test_assign.py
index 0ae501d43e742..5c652be411987 100644
--- a/pandas/tests/frame/methods/test_assign.py
+++ b/pandas/tests/frame/methods/test_assign.py
@@ -1,5 +1,6 @@
 import pytest
 
+import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
@@ -65,7 +65,7 @@ def test_assign_bad(self):
         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
 
         # non-keyword argument
-        msg = r"assign\(\) takes 1 positional argument but 2 were given"
+        msg = r"'function' object is not iterable"
        with pytest.raises(TypeError, match=msg):
             df.assign(lambda x: x.A)
         msg = "'DataFrame' object has no attribute 'C'"
@@ -82,3 +82,10 @@ def test_assign_dependent(self):
         result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
         expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
         tm.assert_frame_equal(result, expected)
+
+    def test_assign_with_tuple_column_key_raises_typeerror(self):
+        df = DataFrame(columns=pd.MultiIndex.from_tuples([("A", "x"), ("B", "y")]))
+
+        msg = r"assign\(\) only supports string column names"
+        with pytest.raises(TypeError, match=msg):
+            df.assign(__kwargs_dict__={("C", "one"): [1, 2, 3]})
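
A minimal usage sketch of the behavior the two assign() patches above are
aiming for, assuming pandas is built from this branch with PATCH 173/174
applied. The frame contents and the tuple key ("c", "one") are illustrative
only and are not part of the patches.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})

    # Plain string keywords keep the existing assign() behavior.
    df = df.assign(b=lambda d: d["a"] * 2)

    # Tuple keys cannot be spelled as Python keywords, so they travel through
    # the new positional __kwargs_dict__ parameter and now raise a clear error.
    try:
        df.assign(__kwargs_dict__={("c", "one"): [1, 2, 3]})
    except TypeError as err:
        # assign() only supports string column names. Use df[('c', 'one')] = ...
        print(err)

    # The spelling the error message recommends instead:
    df[("c", "one")] = [1, 2, 3]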