From 3dfb779b85d3e5af870ddcd909c4558590537432 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 6 Sep 2021 11:16:42 -0400 Subject: [PATCH 01/41] ENH: new .agg for list-likes --- pandas/core/apply.py | 74 +++++++++- pandas/core/config_init.py | 17 +++ pandas/core/groupby/generic.py | 36 ++++- pandas/core/groupby/groupby.py | 2 +- pandas/tests/apply/test_frame_apply.py | 129 +++++++++++++----- .../tests/groupby/aggregate/test_aggregate.py | 20 ++- pandas/tests/groupby/aggregate/test_other.py | 23 +++- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_groupby.py | 29 ++-- pandas/tests/resample/test_deprecated.py | 11 +- pandas/tests/resample/test_resample_api.py | 32 ++++- pandas/tests/reshape/test_pivot.py | 12 +- pandas/tests/window/test_api.py | 14 +- 13 files changed, 333 insertions(+), 72 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7555fb50f16af..f9199ac35643c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -21,7 +21,10 @@ import numpy as np -from pandas._config import option_context +from pandas._config import ( + get_option, + option_context, +) from pandas._libs import lib from pandas._typing import ( @@ -167,7 +170,10 @@ def agg(self) -> DataFrame | Series | None: return self.agg_dict_like() elif is_list_like(arg): # we require a list, but not a 'str' - return self.agg_list_like() + if get_option("new_udf_methods"): + return self.new_list_like("agg") + else: + return self.agg_list_like() if callable(arg): f = com.get_cython_func(arg) @@ -408,6 +414,70 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) + def new_list_like(self, method: str) -> DataFrame | Series: + """ + Compute aggregation in the case of a list-like argument. + + Returns + ------- + Result of aggregation. + """ + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(List[AggFuncTypeBase], self.f) + + results = [] + keys = [] + result_dim = None + + for a in arg: + name = None + try: + if isinstance(a, (tuple, list)): + # Handle (name, value) pairs + name, a = a + new_res = getattr(obj, method)(a) + if result_dim is None: + result_dim = getattr(new_res, "ndim", 0) + elif getattr(new_res, "ndim", 0) != result_dim: + raise ValueError( + "cannot combine transform and aggregation operations" + ) + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + if name is None: + name = com.get_callable_name(a) or a + keys.append(name) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + concatenated = concat(results, keys=keys, axis=1, sort=False) + except TypeError: + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + return result + else: + # Concat uses the first index to determine the final indexing order. + # The union of a shorter first index with the other indices causes + # the index sorting to be different from the order of the aggregating + # functions. Reindex if this is the case. + index_size = concatenated.index.size + full_ordered_index = next( + result.index for result in results if result.index.size == index_size + ) + return concatenated.reindex(full_ordered_index, copy=False) + def agg_dict_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a dict-like argument. 
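For orientation, a minimal sketch of what the new_list_like path added above computes, assuming this patch is applied and the experimental mode.new_udf_methods option (registered in the config_init.py hunk that follows) is enabled. Each aggregator in the list is applied to the whole object via getattr(obj, method)(a), and the per-function results are concatenated along axis=1 with the function names as keys, so the functions land on the columns rather than on the row index; the frame below is hypothetical:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

    with pd.option_context("mode.new_udf_methods", True):
        result = df.agg(["sum", "min"])

    # Roughly concat([df.agg("sum"), df.agg("min")], keys=["sum", "min"], axis=1):
    #
    #        sum  min
    #    A     3    1
    #    B     7    3
    #
    # i.e. the transpose of the legacy layout, whose index is ["sum", "min"].
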
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index cf41bcff3d0c8..2df98a59cb184 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -511,6 +511,23 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["block", "array"]), ) +new_udf_methods = """ +: boolean + Whether to use the new UDF method implementations. Currently experimental. + Defaults to False. +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "new_udf_methods", + # Get the default from an environment variable, if set, otherwise defaults + # to "block". This environment variable can be set for testing. + os.environ.get("PANDAS_NEW_UDF_METHODS", "false").lower() == "true", + new_udf_methods, + validator=is_bool, + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 38f1d41494fd2..9d0bcf81f3e9c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -26,6 +26,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import reduction as libreduction from pandas._typing import ( ArrayLike, @@ -37,6 +39,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -886,8 +889,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) - op = GroupByApply(self, func, args, kwargs) - result = op.agg() + with group_selection_context(self): + op = GroupByApply(self, func, args, kwargs) + result = op.agg() if not is_dict_like(func) and result is not None: return result elif relabeling and result is not None: @@ -897,6 +901,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = columns if result is None: + if get_option("new_udf_methods"): + return self._new_agg(func, args, kwargs) # grouper specific aggregations if self.grouper.nkeys > 1: @@ -947,6 +953,28 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return result + def _new_agg(self, func, args, kwargs): + if args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1 and self.grouper.nkeys == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + else: + # test_groupby_as_index_series_scalar gets here + # with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = Index(range(len(result))) + + return result + agg = aggregate def _iterate_slices(self) -> Iterable[Series]: @@ -999,7 +1027,7 @@ def array_func(values: ArrayLike) -> ArrayLike: f"Before calling .{how}, select only columns which should be " "valid for the function.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return self._wrap_agged_manager(new_mgr) @@ -1195,7 +1223,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: f"Before calling .{how}, select only columns which should be " "valid for the transforming function.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) res_df = self.obj._constructor(res_mgr) diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index 1ea16939603f9..18d750dd27ed5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1428,7 +1428,7 @@ def _python_agg_general(self, func, *args, **kwargs): "Before calling .agg, select only columns which should be " "valid for the aggregating function.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) continue diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 62983b5327a26..2c2376d163002 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -13,6 +13,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames @@ -639,6 +640,8 @@ def test_apply_dup_names_multi_agg(): # GH 21063 df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + if get_option("mode.new_udf_methods"): + expected = expected.T result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -1010,25 +1013,46 @@ def test_agg_transform(axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + if get_option("mode.new_udf_methods"): + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product([["sqrt"], float_frame.index]) else: - expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) tm.assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) + if get_option("mode.new_udf_methods"): + expected = pd.concat([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.index] + ) else: - expected.index = MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) @@ -1040,6 +1064,8 @@ def test_demo(): expected = DataFrame( {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) @@ -1086,18 +1112,29 @@ def test_agg_multiple_mixed_no_warning(): }, index=["min", "sum"], ) + klass, match = None, None + if get_option("mode.new_udf_methods"): + expected = expected.T + klass, match = FutureWarning, "Dropping of nuisance columns" # sorted index - 
with tm.assert_produces_warning(None): + with tm.assert_produces_warning(klass, match=match, check_stacklevel=False): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + klass, match = None, None + if get_option("mode.new_udf_methods"): + klass, match = FutureWarning, "Dropping of nuisance columns" + + with tm.assert_produces_warning(klass, match=match, check_stacklevel=False): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. - expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) + if get_option("mode.new_udf_methods"): + expected = expected.loc[["D", "C", "B", "A"], ["sum", "min"]] + else: + expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) tm.assert_frame_equal(result, expected) @@ -1116,6 +1153,8 @@ def test_agg_reduce(axis, float_frame): ) expected.columns = ["mean", "max", "sum"] expected = expected.T if axis in {0, "index"} else expected + if get_option("mode.new_udf_methods"): + expected = expected.T result = float_frame.agg(["mean", "max", "sum"], axis=axis) tm.assert_frame_equal(result, expected) @@ -1192,6 +1231,8 @@ def test_nuiscance_columns(): index=["min"], columns=df.columns, ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) with tm.assert_produces_warning( @@ -1205,6 +1246,8 @@ def test_nuiscance_columns(): expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) + if get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1244,8 +1287,12 @@ def test_non_callable_aggregates(how): } ) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) + if get_option("new_udf_methods"): + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected.T) + else: + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() result = getattr(df, how)("count") @@ -1282,7 +1329,9 @@ def func(group_col): tm.assert_series_equal(result, expected) result = df.agg([func]) - expected = expected.to_frame("func").T + expected = expected.to_frame("func") + if not get_option("mode.new_udf_methods"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1395,14 +1444,20 @@ def test_apply_empty_list_reduce(): tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(request): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = DataFrame( - {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] - ) - + result = pdf.apply([np.square, lambda x: x, lambda x: x]) + if get_option("mode.new_udf_methods"): + columns = MultiIndex.from_product( + [["A", "B"], ["square", "", ""]] + ) + expected = DataFrame([[16, 4, 4, 81, 9, 9]], columns=columns) + else: + columns = MultiIndex.from_product( + [["A", "B"], ["square", "", ""]] + ) + expected = DataFrame(3 * [[16, 4, 4, 81, 9, 9]], columns=columns) tm.assert_frame_equal(result, expected) @@ -1434,15 +1489,25 @@ def foo(s): aggs = ["sum", foo, "count", "min"] result = df.agg(aggs) - expected = DataFrame( - { - "item": ["123456", np.nan, 6, "1"], - "att1": [21.0, 10.5, 6.0, 
1.0], - "att2": [18.0, 9.0, 6.0, 0.0], - "att3": [17.0, 8.5, 6.0, 0.0], - }, - index=["sum", "foo", "count", "min"], - ) + if get_option("mode.new_udf_methods"): + expected = DataFrame( + { + "sum": ["123456", 21, 18, 17], + "count": [6, 6, 6, 6], + "min": ["1", 1, 0, 0], + }, + index=["item", "att1", "att2", "att3"], + ) + else: + expected = DataFrame( + { + "item": ["123456", np.nan, 6, "1"], + "att1": [21.0, 10.5, 6.0, 1.0], + "att2": [18.0, 9.0, 6.0, 0.0], + "att3": [17.0, 8.5, 6.0, 0.0], + }, + index=["sum", "foo", "count", "min"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4bda0e6ef9872..78194a806f456 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -20,6 +20,7 @@ MultiIndex, Series, concat, + get_option, to_datetime, ) import pandas._testing as tm @@ -499,12 +500,18 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) - result = res.columns.levels[1] + if get_option("new_udf_methods"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + else: + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] - expected = Index(["sum", "max", "mean", "ohlc", "min"]) + expected = Index(["sum", "max", "mean", "ohlc", "min"]) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) @@ -1207,7 +1214,10 @@ def test_nonagg_agg(): g = df.groupby("a") result = g.agg(["cumsum"]) - result.columns = result.columns.droplevel(-1) + if get_option("new_udf_methods"): + result.columns = result.columns.droplevel(0) + else: + result.columns = result.columns.droplevel(-1) expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 79990deed261d..d34538a4f5935 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas.util._test_decorators as td import pandas as pd @@ -201,13 +203,21 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = pd.concat([c_sum, d_sum, c_mean, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["C", "D"]]) + else: + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[["D", "C"]].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = pd.concat([d_sum, c_sum, d_mean, c_mean], 
axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["D", "C"]]) + else: + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({"C": "mean", "D": "sum"}) @@ -393,7 +403,10 @@ def P1(a): g = df.groupby("date") expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] + if get_option("new_udf_methods"): + expected.columns = expected.columns.levels[1] + else: + expected.columns = expected.columns.levels[0] result = g.agg(P1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..caa04d7994223 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -14,6 +14,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm import pandas.core.nanops as nanops @@ -1138,7 +1139,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) + if get_option("new_udf_methods"): + expected.columns = MultiIndex.from_tuples([(function, "b")]) + else: + expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b9a6730996a02..1cf36ddbb1772 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -20,6 +20,7 @@ Timedelta, Timestamp, date_range, + get_option, read_csv, to_datetime, ) @@ -584,11 +585,18 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] agged = grouped.agg(funcs) - expected = pd.concat( - [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], - keys=["D", "E", "F"], - axis=1, - ) + if get_option("new_udf_methods"): + expected = pd.concat( + [grouped.agg(funcs[0]), grouped.agg(funcs[1])], + keys=["mean", "std"], + axis=1, + ) + else: + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) assert isinstance(agged.index, MultiIndex) assert isinstance(expected.index, MultiIndex) tm.assert_frame_equal(agged, expected) @@ -1985,9 +1993,14 @@ def test_groupby_agg_ohlc_non_first(): index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) - result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) - - tm.assert_frame_equal(result, expected) + if get_option("new_udf_methods"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + else: + result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + tm.assert_frame_equal(result, expected) def test_groupby_multiindex_nat(): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 359c3cea62f9c..ff3cb8d873bb9 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -97,7 +98,10 @@ def 
test_resample_loffset_arg_type(frame, create_index, arg): result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("new_udf_methods"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) @@ -216,7 +220,10 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): with tm.assert_produces_warning(FutureWarning): result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("new_udf_methods"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 3b3bd402e4cc7..3566ed42b7133 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -8,6 +8,7 @@ DataFrame, NamedAgg, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -347,10 +348,16 @@ def test_agg(): b_std = r["B"].std() b_sum = r["B"].sum() - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if get_option("new_udf_methods"): + expected = pd.concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: warn = FutureWarning if t in cases[1:3] else None + if get_option("new_udf_methods"): + warn = None with tm.assert_produces_warning( warn, match="Dropping invalid columns", check_stacklevel=False ): @@ -628,11 +635,22 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - expected = DataFrame( - [47.5, 143.5, 195.5], - index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), - columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), - ) + if get_option("new_udf_methods"): + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[["mean"], [col_name]], codes=[[0], [0]]), + ) + else: + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..4c13c9733cf68 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas as pd from pandas import ( Categorical, @@ -1905,8 +1907,14 @@ def test_pivot_margins_name_unicode(self): frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = Index([1, 2, 3, greek], dtype="object", name="foo") - expected = DataFrame(index=index) - 
tm.assert_frame_equal(table, expected) + + if get_option("new_udf_methods"): + expected = Series([1, 1, 1, 3], index=index) + expected.index.name = None + tm.assert_series_equal(table, expected) + else: + expected = DataFrame(index=index) + tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5cc22249c26f0..0089f092dd439 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -10,6 +10,7 @@ Timestamp, concat, date_range, + get_option, timedelta_range, ) import pandas._testing as tm @@ -90,8 +91,12 @@ def test_agg(): b_std = r["B"].std() result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if get_option("new_udf_methods"): + expected = concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) result = r.aggregate({"A": np.mean, "B": np.std}) @@ -147,7 +152,10 @@ def test_agg_consistency(): r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + if get_option("new_udf_methods"): + expected = MultiIndex.from_product([["sum", "mean"], list("AB")]) + else: + expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) result = r["A"].agg([np.sum, np.mean]).columns From 9ef1eb0ca453d9d886f01d1292ab9e828b7373d7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 24 Sep 2021 17:40:55 -0400 Subject: [PATCH 02/41] Refactor single arg computation, test fixup --- pandas/core/apply.py | 43 ++++++++++++++------------ pandas/tests/apply/test_frame_apply.py | 4 +-- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f9199ac35643c..495f944012ebb 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -414,6 +414,28 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) + def new_list_single_arg( + self, method: str, a: AggFuncTypeBase, result_dim: int | None + ) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]: + name = None + result = None + try: + if isinstance(a, (tuple, list)): + # Handle (name, value) pairs + name, a = a + result = getattr(self.obj, method)(a) + if result_dim is None: + result_dim = getattr(result, "ndim", 0) + elif getattr(result, "ndim", 0) != result_dim: + raise ValueError("cannot combine transform and aggregation operations") + except TypeError: + pass + else: + # make sure we find a good name + if name is None: + name = com.get_callable_name(a) or a + return result_dim, name, result + def new_list_like(self, method: str) -> DataFrame | Series: """ Compute aggregation in the case of a list-like argument. 
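The new_list_single_arg helper extracted above returns a (result_dim, name, result) triple: result is None when applying the function raised TypeError, in which case the caller skips it; result_dim enforces that every result has the same dimensionality, so mixing transforms with aggregations raises; and name is taken from a ("label", func) pair when one is given, falling back to com.get_callable_name otherwise. A rough sketch of the labeled-pair handling, again assuming the experimental mode.new_udf_methods option is on and using a hypothetical frame (the pair form is exercised mainly through Series in the existing tests, so treat this as illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [1.0, 4.0, 9.0]})

    with pd.option_context("mode.new_udf_methods", True):
        # A list element may be a plain function/name or a ("label", func)
        # pair; new_list_single_arg unpacks the pair, so the result columns
        # are labeled "total" and "smallest" rather than "sum" and "min".
        result = df.agg([("total", np.sum), ("smallest", "min")])
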
@@ -432,26 +454,9 @@ def new_list_like(self, method: str) -> DataFrame | Series: result_dim = None for a in arg: - name = None - try: - if isinstance(a, (tuple, list)): - # Handle (name, value) pairs - name, a = a - new_res = getattr(obj, method)(a) - if result_dim is None: - result_dim = getattr(new_res, "ndim", 0) - elif getattr(new_res, "ndim", 0) != result_dim: - raise ValueError( - "cannot combine transform and aggregation operations" - ) - except TypeError: - pass - else: + result_dim, name, new_res = self.new_list_single_arg(method, a, result_dim) + if new_res is not None: results.append(new_res) - - # make sure we find a good name - if name is None: - name = com.get_callable_name(a) or a keys.append(name) # if we are empty diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2c2376d163002..d79317e48bd5b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1450,9 +1450,9 @@ def test_apply_no_suffix_index(request): result = pdf.apply([np.square, lambda x: x, lambda x: x]) if get_option("mode.new_udf_methods"): columns = MultiIndex.from_product( - [["A", "B"], ["square", "", ""]] + [["square", "", ""], ["A", "B"]] ) - expected = DataFrame([[16, 4, 4, 81, 9, 9]], columns=columns) + expected = DataFrame(3 * [[16, 81, 4, 9, 4, 9]], columns=columns) else: columns = MultiIndex.from_product( [["A", "B"], ["square", "", ""]] From 1974e07853df015f92f1c9a5e103723bf3620ef6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 24 Sep 2021 20:30:36 -0400 Subject: [PATCH 03/41] Revert change to GroupBy.agg --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d0bcf81f3e9c..c6d9c45485ee2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -889,9 +889,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) - with group_selection_context(self): - op = GroupByApply(self, func, args, kwargs) - result = op.agg() + # with group_selection_context(self): + op = GroupByApply(self, func, args, kwargs) + result = op.agg() if not is_dict_like(func) and result is not None: return result elif relabeling and result is not None: From d7b6c7f886b864ec5fe613b643c2b7632131878f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Sep 2021 18:02:42 -0400 Subject: [PATCH 04/41] Rename option and methods --- pandas/core/apply.py | 12 ++++--- pandas/core/config_init.py | 12 +++---- pandas/core/groupby/generic.py | 7 ++-- pandas/tests/apply/test_frame_apply.py | 34 +++++++++++-------- .../tests/groupby/aggregate/test_aggregate.py | 4 +-- pandas/tests/groupby/aggregate/test_other.py | 6 ++-- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 4 +-- pandas/tests/resample/test_deprecated.py | 4 +-- pandas/tests/resample/test_resample_api.py | 4 +-- pandas/tests/reshape/test_pivot.py | 2 +- pandas/tests/window/test_api.py | 4 +-- 12 files changed, 50 insertions(+), 45 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f5f2bbf9bebbe..c9c813159568d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -170,8 +170,8 @@ def agg(self) -> DataFrame | Series | None: return self.agg_dict_like() elif is_list_like(arg): # we require a list, but not a 'str' - if 
get_option("new_udf_methods"): - return self.new_list_like("agg") + if get_option("future_udf_behavior"): + return self.future_list_like("agg") else: return self.agg_list_like() @@ -414,7 +414,7 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) - def new_list_single_arg( + def future_list_single_arg( self, method: str, a: AggFuncTypeBase, result_dim: int | None ) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]: name = None @@ -436,7 +436,7 @@ def new_list_single_arg( name = com.get_callable_name(a) or a return result_dim, name, result - def new_list_like(self, method: str) -> DataFrame | Series: + def future_list_like(self, method: str) -> DataFrame | Series: """ Compute aggregation in the case of a list-like argument. @@ -454,7 +454,9 @@ def new_list_like(self, method: str) -> DataFrame | Series: result_dim = None for a in arg: - result_dim, name, new_res = self.new_list_single_arg(method, a, result_dim) + result_dim, name, new_res = self.future_list_single_arg( + method, a, result_dim + ) if new_res is not None: results.append(new_res) keys.append(name) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 5d444fa35a46a..99aff44f75029 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -511,20 +511,20 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["block", "array"]), ) -new_udf_methods = """ +future_udf_behavior = """ : boolean - Whether to use the new UDF method implementations. Currently experimental. + Whether to use the future UDF method implementations. Currently experimental. Defaults to False. """ with cf.config_prefix("mode"): cf.register_option( - "new_udf_methods", + "future_udf_behavior", # Get the default from an environment variable, if set, otherwise defaults - # to "block". This environment variable can be set for testing. - os.environ.get("PANDAS_NEW_UDF_METHODS", "false").lower() == "true", - new_udf_methods, + # to False. This environment variable can be set for testing. 
+ os.environ.get("PANDAS_FUTURE_UDF_BEHAVIOR", "false").lower() == "true", + future_udf_behavior, validator=is_bool, ) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 821da94daaf5c..b5cfc01b4f2f9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -895,7 +895,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) - # with group_selection_context(self): op = GroupByApply(self, func, args, kwargs) result = op.agg() if not is_dict_like(func) and result is not None: @@ -907,8 +906,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = columns if result is None: - if get_option("new_udf_methods"): - return self._new_agg(func, args, kwargs) + if get_option("future_udf_behavior"): + return self._future_agg(func, args, kwargs) # grouper specific aggregations if self.grouper.nkeys > 1: @@ -959,7 +958,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return result - def _new_agg(self, func, args, kwargs): + def _future_agg(self, func, args, kwargs): if args or kwargs: # test_pass_args_kwargs gets here (with and without as_index) # can't return early diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index d79317e48bd5b..8978d78ed9e2e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -640,7 +640,7 @@ def test_apply_dup_names_multi_agg(): # GH 21063 df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T result = df.agg(["min"]) @@ -1013,7 +1013,7 @@ def test_agg_transform(axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): if axis in {0, "index"}: expected.columns = MultiIndex.from_product( [["sqrt"], float_frame.columns] @@ -1033,7 +1033,7 @@ def test_agg_transform(axis, float_frame): # these are in the order as if we are applying both # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = pd.concat([f_abs, f_sqrt], axis=other_axis) if axis in {0, "index"}: expected.columns = MultiIndex.from_product( @@ -1064,7 +1064,7 @@ def test_demo(): expected = DataFrame( {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] ) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T tm.assert_frame_equal(result, expected) @@ -1113,7 +1113,7 @@ def test_agg_multiple_mixed_no_warning(): index=["min", "sum"], ) klass, match = None, None - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T klass, match = FutureWarning, "Dropping of nuisance columns" # sorted index @@ -1123,7 +1123,7 @@ def test_agg_multiple_mixed_no_warning(): tm.assert_frame_equal(result, expected) klass, match = None, None - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): klass, match = FutureWarning, "Dropping of nuisance columns" with tm.assert_produces_warning(klass, match=match, check_stacklevel=False): 
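The renamed toggle these test hunks consult can be flipped in two ways; a sketch, assuming a build with this patch series applied. pandas option lookup falls back to a case-insensitive regex over registered keys, so any unambiguous fragment of "mode.future_udf_behavior" resolves; this is why the tests can write get_option("future_udf_behavior"), and why a later fixup's get_option("FUTURE_UDF_BEHAVIOR") still works:

    import os

    # Read once at import time by the register_option call above,
    # so this must be set before pandas is imported.
    os.environ["PANDAS_FUTURE_UDF_BEHAVIOR"] = "true"

    import pandas as pd

    # Or flip it at runtime:
    pd.set_option("mode.future_udf_behavior", True)
    assert pd.get_option("future_udf_behavior")
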
@@ -1131,7 +1131,7 @@ def test_agg_multiple_mixed_no_warning(): # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.loc[["D", "C", "B", "A"], ["sum", "min"]] else: expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) @@ -1153,7 +1153,7 @@ def test_agg_reduce(axis, float_frame): ) expected.columns = ["mean", "max", "sum"] expected = expected.T if axis in {0, "index"} else expected - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T result = float_frame.agg(["mean", "max", "sum"], axis=axis) @@ -1231,7 +1231,7 @@ def test_nuiscance_columns(): index=["min"], columns=df.columns, ) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T tm.assert_frame_equal(result, expected) @@ -1242,11 +1242,15 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - result = df.agg(["sum"]) + warn = FutureWarning if get_option("future_udf_behavior") else None + with tm.assert_produces_warning( + warn, match="Select only valid", check_stacklevel=False + ): + result = df.agg(["sum"]) expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = expected.T tm.assert_frame_equal(result, expected) @@ -1287,7 +1291,7 @@ def test_non_callable_aggregates(how): } ) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result1, expected.T) else: @@ -1330,7 +1334,7 @@ def func(group_col): result = df.agg([func]) expected = expected.to_frame("func") - if not get_option("mode.new_udf_methods"): + if not get_option("future_udf_behavior"): expected = expected.T tm.assert_frame_equal(result, expected) @@ -1448,7 +1452,7 @@ def test_apply_no_suffix_index(request): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) result = pdf.apply([np.square, lambda x: x, lambda x: x]) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): columns = MultiIndex.from_product( [["square", "", ""], ["A", "B"]] ) @@ -1489,7 +1493,7 @@ def foo(s): aggs = ["sum", foo, "count", "min"] result = df.agg(aggs) - if get_option("mode.new_udf_methods"): + if get_option("future_udf_behavior"): expected = DataFrame( { "sum": ["123456", 21, 18, 17], diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 78194a806f456..129fd3419743f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -500,7 +500,7 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): # TODO (GH 35725): This will not raise when agg-must-agg is implemented msg = "Cannot concat indices that do not have the same number of levels" with pytest.raises(AssertionError, match=msg): @@ -1214,7 +1214,7 @@ def test_nonagg_agg(): g = df.groupby("a") result = g.agg(["cumsum"]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): result.columns = result.columns.droplevel(0) else: result.columns = result.columns.droplevel(-1) diff --git 
a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index d34538a4f5935..1720c293cfff3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -203,7 +203,7 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = pd.concat([c_sum, d_sum, c_mean, d_mean], axis=1) expected.columns = MultiIndex.from_product([["sum", "mean"], ["C", "D"]]) else: @@ -212,7 +212,7 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped[["D", "C"]].agg([np.sum, np.mean]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) expected.columns = MultiIndex.from_product([["sum", "mean"], ["D", "C"]]) else: @@ -403,7 +403,7 @@ def P1(a): g = df.groupby("date") expected = g.agg([P1]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected.columns = expected.columns.levels[1] else: expected.columns = expected.columns.levels[0] diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index caa04d7994223..4539b2ef67e8d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1139,7 +1139,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) result = groups.agg([function]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected.columns = MultiIndex.from_tuples([(function, "b")]) else: expected.columns = MultiIndex.from_tuples([("b", function)]) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1cf36ddbb1772..45c10a53c9c0f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -585,7 +585,7 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] agged = grouped.agg(funcs) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = pd.concat( [grouped.agg(funcs[0]), grouped.agg(funcs[1])], keys=["mean", "std"], @@ -1993,7 +1993,7 @@ def test_groupby_agg_ohlc_non_first(): index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): # TODO (GH 35725): This will not raise when agg-must-agg is implemented msg = "Cannot concat indices that do not have the same number of levels" with pytest.raises(AssertionError, match=msg): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index ff3cb8d873bb9..3de3694f1eb52 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -98,7 +98,7 @@ def test_resample_loffset_arg_type(frame, create_index, arg): result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) else: expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) @@ -220,7 +220,7 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): with tm.assert_produces_warning(FutureWarning): result_agg = df.resample("2D", loffset="2H", 
kind=kind).agg(agg_arg) if isinstance(agg_arg, list): - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) else: expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index a8df3b9b81c12..d9e44ba6625ec 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -348,7 +348,7 @@ def test_agg(): b_std = r["B"].std() b_sum = r["B"].sum() - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = pd.concat([a_mean, b_mean, a_std, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["mean", "std"], ["A", "B"]]) else: @@ -632,7 +632,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = DataFrame( [47.5, 143.5, 195.5], index=date_range( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 4c13c9733cf68..080eb7dd9cd29 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1908,7 +1908,7 @@ def test_pivot_margins_name_unicode(self): ) index = Index([1, 2, 3, greek], dtype="object", name="foo") - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = Series([1, 1, 1, 3], index=index) expected.index.name = None tm.assert_series_equal(table, expected) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 0089f092dd439..d24399cb8a83b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -91,7 +91,7 @@ def test_agg(): b_std = r["B"].std() result = r.aggregate([np.mean, np.std]) - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = concat([a_mean, b_mean, a_std, b_std], axis=1) expected.columns = MultiIndex.from_product([["mean", "std"], ["A", "B"]]) else: @@ -152,7 +152,7 @@ def test_agg_consistency(): r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - if get_option("new_udf_methods"): + if get_option("future_udf_behavior"): expected = MultiIndex.from_product([["sum", "mean"], list("AB")]) else: expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) From d412b4f1158c660c65fc4ac86050d3b214e4200e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 9 Oct 2021 16:36:42 -0400 Subject: [PATCH 05/41] Merge fixups --- pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/test_groupby.py | 8 +++++--- pandas/tests/resample/test_resample_api.py | 5 ++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 885de13ae853d..3ec47870a4fcb 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -49,7 +49,7 @@ def peak_to_peak(arr): if get_option("future_udf_behavior"): match = "Dropping invalid columns in DataFrameGroupBy.agg" else: - match = (r"\['key2'\] did not aggregate successfully",) + match = r"\['key2'\] did not aggregate successfully" with tm.assert_produces_warning( FutureWarning, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 537220707f3f8..820a18d59d62e 100644 --- a/pandas/tests/groupby/test_groupby.py 
+++ b/pandas/tests/groupby/test_groupby.py @@ -584,9 +584,11 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - with tm.assert_produces_warning( - FutureWarning, match=r"\['C'\] did not aggregate successfully" - ): + if get_option("future_udf_behavior"): + klass, msg = None, None + else: + klass, msg = FutureWarning, r"\['C'\] did not aggregate successfully" + with tm.assert_produces_warning(klass, match=msg): agged = grouped.agg(funcs) if get_option("future_udf_behavior"): expected = pd.concat( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4be47eaa3c25d..e51396d380b6a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -355,7 +355,10 @@ def test_agg(): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - warn = FutureWarning if t in cases[1:3] else None + if t in cases[1:3] and not get_option("FUTURE_UDF_BEHAVIOR"): + warn = FutureWarning + else: + warn = None with tm.assert_produces_warning( warn, match=r"\['date'\] did not aggregate successfully", From 0cea15b4ab3c23e3ae215b90a66b4f059a42d0bd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Sun, 3 Oct 2021 20:09:53 -0400 Subject: [PATCH 06/41] BUG/ERR: sparse array cmp methods mismatched len (#43863) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/sparse/array.py | 7 +++++-- pandas/tests/arrays/sparse/test_arithmetics.py | 8 ++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 381b0f39ff849..0c841078fe9b4 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -511,6 +511,7 @@ Sparse - Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`) - Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`) - Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`) +- Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`) - ExtensionArray diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 7c5f0578bda27..87fcf54ed684b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1642,11 +1642,14 @@ def _cmp_method(self, other, op) -> SparseArray: if isinstance(other, np.ndarray): # TODO: make this more flexible than just ndarray... - if len(self) != len(other): - raise AssertionError(f"length mismatch: {len(self)} vs. 
{len(other)}") other = SparseArray(other, fill_value=self.fill_value) if isinstance(other, SparseArray): + if len(self) != len(other): + raise ValueError( + f"operands have mismatched length {len(self)} and {len(other)}" + ) + op_name = op.__name__.strip("_") return _sparse_array_op(self, other, op, op_name) else: diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 2ae60a90fee60..d7c39c0e0708e 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -529,3 +529,11 @@ def test_unary_op(op, fill_value): result = op(sparray) expected = SparseArray(op(arr), fill_value=op(fill_value)) tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("cons", [list, np.array, SparseArray]) +def test_mismatched_length_cmp_op(cons): + left = SparseArray([True, True]) + right = cons([True, True, True]) + with pytest.raises(ValueError, match="operands have mismatched length"): + left & right From 665b304f71608d7ca3abdce6de809a3ef11e79be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 4 Oct 2021 05:07:41 -0700 Subject: [PATCH 07/41] Add deprecation tag for passing a string for ewm(times=...) (#43873) --- pandas/core/window/ewm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 79102c2bc82ee..29a6704ae5092 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -205,6 +205,8 @@ class ExponentialMovingWindow(BaseWindow): If str, the name of the column in the DataFrame representing the times. + .. deprecated:: 1.4.0 + If 1-D array like, a sequence with the same shape as the observations. Only applicable to ``mean()``. From 214ba4aff27a474f161e52963eca17b48921b998 Mon Sep 17 00:00:00 2001 From: Julian Fleischer Date: Mon, 4 Oct 2021 14:10:46 +0200 Subject: [PATCH 08/41] Make components of Suffixes Optional (#42544) --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 9c20eb12dc7fc..68ec331c2781f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -111,7 +111,7 @@ IndexLabel = Union[Hashable, Sequence[Hashable]] Level = Union[Hashable, int] Shape = Tuple[int, ...] 
-Suffixes = Tuple[str, str] +Suffixes = Tuple[Optional[str], Optional[str]] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] From 9d6da6d4383ad91ae6eb9f23e200dc37ef14813a Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Mon, 4 Oct 2021 21:13:40 +0200 Subject: [PATCH 09/41] BUG: Fix dtypes for read_json (#42819) * Fix dtypes for read_json * Address comments * Add whatsnew entry * Update doc/source/whatsnew/v1.4.0.rst Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> * Linting Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/json/_json.py | 9 +-------- pandas/tests/io/json/test_pandas.py | 30 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0c841078fe9b4..8113ac97a3a37 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -465,6 +465,7 @@ I/O - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) +- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`) - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) - diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f92fc65f55df6..b9bdfb91ca154 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -876,11 +876,8 @@ def check_keys_split(self, decoded): def parse(self): - # try numpy - numpy = self.numpy - if numpy: + if self.numpy: self._parse_numpy() - else: self._parse_no_numpy() @@ -941,10 +938,6 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): ) if dtype is not None: try: - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; - # expected "Type[Any]" - dtype = np.dtype(dtype) # type: ignore[arg-type] return data.astype(dtype), True except (TypeError, ValueError): return data, False diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a856f031e20ba..747770ad78684 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1387,6 +1387,36 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) + def test_to_json_from_json_columns_dtypes(self, orient): + # GH21892 GH33205 + expected = DataFrame.from_dict( + { + "Integer": Series([1, 2, 3], dtype="int64"), + "Float": Series([None, 2.0, 3.0], dtype="float64"), + "Object": Series([None, "", "c"], dtype="object"), + "Bool": Series([True, False, True], dtype="bool"), + "Category": Series(["a", "b", None], dtype="category"), + "Datetime": Series( + ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]" + ), + } + 
) + dfjson = expected.to_json(orient=orient) + result = read_json( + dfjson, + orient=orient, + dtype={ + "Integer": "int64", + "Float": "float64", + "Object": "object", + "Bool": "bool", + "Category": "category", + "Datetime": "datetime64[ns]", + }, + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 From 005598c7b77e66b9bd1989b2d1197acacb232ab0 Mon Sep 17 00:00:00 2001 From: Horace Lai <44500643+horaceklai@users.noreply.github.com> Date: Tue, 5 Oct 2021 02:30:10 +0200 Subject: [PATCH 10/41] TST: dropping of nuisance columns for groupby ops #38815 (#43674) --- pandas/tests/groupby/test_groupby.py | 42 ++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 820a18d59d62e..c836ef0c6130c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -863,11 +863,6 @@ def test_groupby_multi_corner(df): def test_omit_nuisance(df): grouped = df.groupby("A") - - result = grouped.mean() - expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() - tm.assert_frame_equal(result, expected) - agged = grouped.agg(np.mean) exp = grouped.mean() tm.assert_frame_equal(agged, exp) @@ -886,14 +881,43 @@ def test_omit_nuisance(df): grouped.agg(lambda x: x.sum(0, numeric_only=False)) -def test_omit_nuisance_sem(df): - # GH 38774 - sem should work with nuisance columns +@pytest.mark.parametrize( + "agg_function", + ["max", "min"], +) +def test_keep_nuisance_agg(df, agg_function): + # GH 38815 + grouped = df.groupby("A") + result = getattr(grouped, agg_function)() + expected = result.copy() + expected.loc["bar", "B"] = getattr(df.loc[df["A"] == "bar", "B"], agg_function)() + expected.loc["foo", "B"] = getattr(df.loc[df["A"] == "foo", "B"], agg_function)() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "agg_function", + ["sum", "mean", "prod", "std", "var", "median"], +) +def test_omit_nuisance_agg(df, agg_function): + # GH 38774, GH 38815 grouped = df.groupby("A") - result = grouped.sem() - expected = df.loc[:, ["A", "C", "D"]].groupby("A").sem() + result = getattr(grouped, agg_function)() + expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)() tm.assert_frame_equal(result, expected) +def test_omit_nuisance_warnings(df): + # GH 38815 + with tm.assert_produces_warning( + FutureWarning, filter_level="always", check_stacklevel=False + ): + grouped = df.groupby("A") + result = grouped.skew() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew() + tm.assert_frame_equal(result, expected) + + def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) From 7afb062a217c018ee48f857fef256e16b59071d6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Oct 2021 17:32:09 -0700 Subject: [PATCH 11/41] BUG: retain EA dtypes in DataFrame __pos__, __neg__ (#43883) --- doc/source/whatsnew/v1.4.0.rst | 3 +- pandas/_libs/ops_dispatch.pyx | 41 ++++++++++++++---- pandas/core/arrays/numpy_.py | 9 ++++ pandas/core/generic.py | 47 +++++++++------------ pandas/tests/arithmetic/test_datetime64.py | 2 +- pandas/tests/arrays/test_numpy.py | 11 +++-- pandas/tests/frame/test_unary.py | 49 +++++++++++++++++++++- 7 files changed, 120 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8113ac97a3a37..dcd31abaa8857 100644 
--- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -126,7 +126,8 @@ Other enhancements - Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`) - :meth:`IntegerArray.all` , :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`) - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`) -- +- :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) + .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx index f6ecef2038cf3..c34504732ac32 100644 --- a/pandas/_libs/ops_dispatch.pyx +++ b/pandas/_libs/ops_dispatch.pyx @@ -18,6 +18,14 @@ DISPATCHED_UFUNCS = { "or", "xor", "and", + "neg", + "pos", + "abs", +} +UNARY_UFUNCS = { + "neg", + "pos", + "abs", } UFUNC_ALIASES = { "subtract": "sub", @@ -36,6 +44,9 @@ UFUNC_ALIASES = { "bitwise_or": "or", "bitwise_and": "and", "bitwise_xor": "xor", + "negative": "neg", + "absolute": "abs", + "positive": "pos", } # For op(., Array) -> Array.__r{op}__ @@ -80,15 +91,31 @@ def maybe_dispatch_ufunc_to_dunder_op( def not_implemented(*args, **kwargs): return NotImplemented - if (method == "__call__" - and op_name in DISPATCHED_UFUNCS - and kwargs.get("out") is None): - if isinstance(inputs[0], type(self)): + if kwargs or ufunc.nin > 2: + return NotImplemented + + if method == "__call__" and op_name in DISPATCHED_UFUNCS: + + if inputs[0] is self: name = f"__{op_name}__" - return getattr(self, name, not_implemented)(inputs[1]) - else: + meth = getattr(self, name, not_implemented) + + if op_name in UNARY_UFUNCS: + assert len(inputs) == 1 + return meth() + + return meth(inputs[1]) + + elif inputs[1] is self: name = REVERSED_NAMES.get(op_name, f"__r{op_name}__") - result = getattr(self, name, not_implemented)(inputs[0]) + + meth = getattr(self, name, not_implemented) + result = meth(inputs[0]) return result + + else: + # should not be reached, but covering our bases + return NotImplemented + else: return NotImplemented diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 410497d61c98b..8fe0c0114fb04 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -388,6 +388,15 @@ def to_numpy( def __invert__(self) -> PandasArray: return type(self)(~self._ndarray) + def __neg__(self) -> PandasArray: + return type(self)(-self._ndarray) + + def __pos__(self) -> PandasArray: + return type(self)(+self._ndarray) + + def __abs__(self) -> PandasArray: + return type(self)(abs(self._ndarray)) + def _cmp_method(self, other, op): if isinstance(other, PandasArray): other = other._ndarray diff --git a/pandas/core/generic.py b/pandas/core/generic.py index af8c64d5c0202..b235f120d98c8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -36,6 +36,7 @@ to_offset, ) from pandas._typing import ( + ArrayLike, Axis, CompressionOptions, Dtype, @@ -90,7 
+91,6 @@ is_list_like, is_number, is_numeric_dtype, - is_object_dtype, is_re_compilable, is_scalar, is_timedelta64_dtype, @@ -1495,36 +1495,27 @@ def equals(self, other: object) -> bool_t: @final def __neg__(self): - values = self._values - if is_bool_dtype(values): - arr = operator.inv(values) - elif ( - is_numeric_dtype(values) - or is_timedelta64_dtype(values) - or is_object_dtype(values) - ): - arr = operator.neg(values) - else: - raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}") - return self.__array_wrap__(arr) + def blk_func(values: ArrayLike): + if is_bool_dtype(values.dtype): + return operator.inv(values) + else: + return operator.neg(values) + + new_data = self._mgr.apply(blk_func) + res = self._constructor(new_data) + return res.__finalize__(self, method="__neg__") @final def __pos__(self): - values = self._values - if is_bool_dtype(values): - arr = values - elif ( - is_numeric_dtype(values) - or is_timedelta64_dtype(values) - or is_object_dtype(values) - ): - arr = operator.pos(values) - else: - raise TypeError( - "Unary plus expects bool, numeric, timedelta, " - f"or object dtype, not {values.dtype}" - ) - return self.__array_wrap__(arr) + def blk_func(values: ArrayLike): + if is_bool_dtype(values.dtype): + return values.copy() + else: + return operator.pos(values) + + new_data = self._mgr.apply(blk_func) + res = self._constructor(new_data) + return res.__finalize__(self, method="__pos__") @final def __invert__(self): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index c0f38a1181026..60a58b7bbea78 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1878,7 +1878,7 @@ def test_datetime64_ops_nat(self): # subtraction tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) - msg = "Unary negative expects" + msg = "bad operand type for unary -: 'DatetimeArray'" with pytest.raises(TypeError, match=msg): -single_nat_dtype_datetime + datetime_series diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 753ec99e683e6..e8e9ee86e77dd 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -198,12 +198,17 @@ def test_validate_reduction_keyword_args(): # Ops -def test_ufunc(): +@pytest.mark.parametrize("ufunc", [np.abs, np.negative, np.positive]) +def test_ufunc_unary(ufunc): arr = PandasArray(np.array([-1.0, 0.0, 1.0])) - result = np.abs(arr) - expected = PandasArray(np.abs(arr._ndarray)) + result = ufunc(arr) + expected = PandasArray(ufunc(arr._ndarray)) tm.assert_extension_array_equal(result, expected) + +def test_ufunc(): + arr = PandasArray(np.array([-1.0, 0.0, 1.0])) + r1, r2 = np.divmod(arr, np.add(arr, 2)) e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2)) e1 = PandasArray(e1) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index ea6243e2eae4a..2129586455333 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -49,7 +49,7 @@ def test_neg_object(self, df, expected): def test_neg_raises(self, df): msg = ( "bad operand type for unary -: 'str'|" - r"Unary negative expects numeric dtype, not datetime64\[ns\]" + r"bad operand type for unary -: 'DatetimeArray'" ) with pytest.raises(TypeError, match=msg): (-df) @@ -116,8 +116,53 @@ def test_pos_object(self, df): "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] ) def test_pos_raises(self, df): - msg = "Unary plus 
expects .* dtype, not datetime64\\[ns\\]" + msg = r"bad operand type for unary \+: 'DatetimeArray'" with pytest.raises(TypeError, match=msg): (+df) with pytest.raises(TypeError, match=msg): (+df["a"]) + + def test_unary_nullable(self): + df = pd.DataFrame( + { + "a": pd.array([1, -2, 3, pd.NA], dtype="Int64"), + "b": pd.array([4.0, -5.0, 6.0, pd.NA], dtype="Float32"), + "c": pd.array([True, False, False, pd.NA], dtype="boolean"), + # include numpy bool to make sure bool-vs-boolean behavior + # is consistent in non-NA locations + "d": np.array([True, False, False, True]), + } + ) + + result = +df + res_ufunc = np.positive(df) + expected = df + # TODO: assert that we have copies? + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(res_ufunc, expected) + + result = -df + res_ufunc = np.negative(df) + expected = pd.DataFrame( + { + "a": pd.array([-1, 2, -3, pd.NA], dtype="Int64"), + "b": pd.array([-4.0, 5.0, -6.0, pd.NA], dtype="Float32"), + "c": pd.array([False, True, True, pd.NA], dtype="boolean"), + "d": np.array([False, True, True, False]), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(res_ufunc, expected) + + result = abs(df) + res_ufunc = np.abs(df) + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, pd.NA], dtype="Int64"), + "b": pd.array([4.0, 5.0, 6.0, pd.NA], dtype="Float32"), + "c": pd.array([True, False, False, pd.NA], dtype="boolean"), + "d": np.array([True, False, False, True]), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(res_ufunc, expected) From 195f9cf098a91494be95cc82b2f2b1a8229b3192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Mon, 4 Oct 2021 17:32:36 -0700 Subject: [PATCH 12/41] TST: Test Series' settitem with Interval and NaN (#43844) --- pandas/tests/indexing/test_iloc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b04a2c86a79d7..b8c53c7b59239 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -931,6 +931,17 @@ def test_iloc_setitem_td64_values_cast_na(self, value): expected = Series([NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) + @pytest.mark.parametrize("not_na", [Interval(0, 1), "a", 1.0]) + def test_setitem_mix_of_nan_and_interval(self, not_na, nulls_fixture): + # GH#27937 + dtype = CategoricalDtype(categories=[not_na]) + ser = Series( + [nulls_fixture, nulls_fixture, nulls_fixture, nulls_fixture], dtype=dtype + ) + ser.iloc[:3] = [nulls_fixture, not_na, nulls_fixture] + exp = Series([nulls_fixture, not_na, nulls_fixture, nulls_fixture], dtype=dtype) + tm.assert_series_equal(ser, exp) + def test_iloc_setitem_empty_frame_raises_with_3d_ndarray(self): idx = Index([]) obj = DataFrame(np.random.randn(len(idx), len(idx)), index=idx, columns=idx) From 6021c0663f091b575bac1b1526cf801a6d73c854 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Oct 2021 17:33:02 -0700 Subject: [PATCH 13/41] PERF: tighter cython declarations, faster __iter__ (#43872) --- pandas/_libs/algos_common_helper.pxi.in | 6 ++--- pandas/_libs/algos_take_helper.pxi.in | 8 +++--- pandas/_libs/internals.pyx | 32 +++++++++++++++------- pandas/_libs/lib.pyx | 36 ++++++++++++++----------- pandas/_libs/testing.pyx | 5 ++-- 5 files changed, 51 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 87130906ef28b..4242a76dcc3b7 100644 --- 
a/pandas/_libs/algos_common_helper.pxi.in
+++ b/pandas/_libs/algos_common_helper.pxi.in
@@ -8,18 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 # ensure_dtype
 # ----------------------------------------------------------------------

-cdef int PLATFORM_INT = (<ndarray>np.arange(0, dtype=np.intp)).descr.type_num
-

 def ensure_platform_int(object arr):
     # GH3033, GH1392
     # platform int is the size of the int pointer, e.g. np.intp
     if util.is_array(arr):
-        if (<ndarray>arr).descr.type_num == PLATFORM_INT:
+        if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
             return arr
         else:
             # equiv: arr.astype(np.intp)
-            return cnp.PyArray_Cast(<ndarray>arr, PLATFORM_INT)
+            return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
     else:
         return np.array(arr, dtype=np.intp)

diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in
index ec041c03b05e1..2a3858674af9e 100644
--- a/pandas/_libs/algos_take_helper.pxi.in
+++ b/pandas/_libs/algos_take_helper.pxi.in
@@ -103,7 +103,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
 {{else}}
 def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
 {{endif}}
-                                    ndarray[intp_t] indexer,
+                                    ndarray[intp_t, ndim=1] indexer,
                                     {{c_type_out}}[:, :] out,
                                     fill_value=np.nan):
     cdef:
@@ -158,7 +158,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
 {{else}}
 def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
 {{endif}}
-                                    ndarray[intp_t] indexer,
+                                    ndarray[intp_t, ndim=1] indexer,
                                     {{c_type_out}}[:, :] out,
                                     fill_value=np.nan):

@@ -195,8 +195,8 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
                                     fill_value=np.nan):
     cdef:
         Py_ssize_t i, j, k, n, idx
-        ndarray[intp_t] idx0 = indexer[0]
-        ndarray[intp_t] idx1 = indexer[1]
+        ndarray[intp_t, ndim=1] idx0 = indexer[0]
+        ndarray[intp_t, ndim=1] idx1 = indexer[1]
         {{c_type_out}} fv

     n = len(idx0)
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index 87709ac6c33bf..2f0bcefefaaa1 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -227,7 +227,7 @@ cdef class BlockPlacement:
         cdef:
             slice nv, s = self._ensure_has_slice()
             Py_ssize_t other_int, start, stop, step, l
-            ndarray newarr
+            ndarray[intp_t, ndim=1] newarr

         if s is not None:
             # see if we are either all-above or all-below, each of which
@@ -260,7 +260,7 @@ cdef class BlockPlacement:
         cdef:
             slice slc = self._ensure_has_slice()
             slice new_slice
-            ndarray new_placement
+            ndarray[intp_t, ndim=1] new_placement

         if slc is not None and slc.step == 1:
             new_slc = slice(slc.start * factor, slc.stop * factor, 1)
@@ -345,7 +345,9 @@ cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -
     return length


-cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX):
+cdef (Py_ssize_t, Py_ssize_t, Py_ssize_t, Py_ssize_t) slice_get_indices_ex(
+    slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX
+):
     """
     Get (start, stop, step, length) tuple for a slice.

@@ -460,9 +462,11 @@ def get_blkno_indexers(
     # blockno handling.
cdef: int64_t cur_blkno - Py_ssize_t i, start, stop, n, diff, tot_len + Py_ssize_t i, start, stop, n, diff + cnp.npy_intp tot_len int64_t blkno object group_dict = defaultdict(list) + ndarray[int64_t, ndim=1] arr n = blknos.shape[0] result = list() @@ -495,7 +499,8 @@ def get_blkno_indexers( result.append((blkno, slice(slices[0][0], slices[0][1]))) else: tot_len = sum(stop - start for start, stop in slices) - arr = np.empty(tot_len, dtype=np.int64) + # equiv np.empty(tot_len, dtype=np.int64) + arr = cnp.PyArray_EMPTY(1, &tot_len, cnp.NPY_INT64, 0) i = 0 for start, stop in slices: @@ -526,8 +531,13 @@ def get_blkno_placements(blknos, group: bool = True): yield blkno, BlockPlacement(indexer) +@cython.boundscheck(False) +@cython.wraparound(False) cpdef update_blklocs_and_blknos( - ndarray[intp_t] blklocs, ndarray[intp_t] blknos, Py_ssize_t loc, intp_t nblocks + ndarray[intp_t, ndim=1] blklocs, + ndarray[intp_t, ndim=1] blknos, + Py_ssize_t loc, + intp_t nblocks, ): """ Update blklocs and blknos when a new column is inserted at 'loc'. @@ -535,7 +545,7 @@ cpdef update_blklocs_and_blknos( cdef: Py_ssize_t i cnp.npy_intp length = len(blklocs) + 1 - ndarray[intp_t] new_blklocs, new_blknos + ndarray[intp_t, ndim=1] new_blklocs, new_blknos # equiv: new_blklocs = np.empty(length, dtype=np.intp) new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) @@ -693,7 +703,7 @@ cdef class BlockManager: cnp.npy_intp length = self.shape[0] SharedBlock blk BlockPlacement bp - ndarray[intp_t] new_blknos, new_blklocs + ndarray[intp_t, ndim=1] new_blknos, new_blklocs # equiv: np.empty(length, dtype=np.intp) new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) @@ -711,7 +721,11 @@ cdef class BlockManager: new_blknos[j] = blkno new_blklocs[j] = i - for blkno in new_blknos: + for i in range(length): + # faster than `for blkno in new_blknos` + # https://github.com/cython/cython/issues/4393 + blkno = new_blknos[i] + # If there are any -1s remaining, this indicates that our mgr_locs # are invalid. if blkno == -1: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7e49c7f1952c4..2c7b052917463 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -448,7 +448,7 @@ def fast_zip(list ndarrays) -> ndarray[object]: """ cdef: Py_ssize_t i, j, k, n - ndarray[object] result + ndarray[object, ndim=1] result flatiter it object val, tup @@ -507,7 +507,7 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: """ cdef: Py_ssize_t i, n = len(indexer) - ndarray[intp_t] rev_indexer + ndarray[intp_t, ndim=1] rev_indexer intp_t idx rev_indexer = np.empty(length, dtype=np.intp) @@ -540,7 +540,7 @@ def has_infs(floating[:] arr) -> bool: return ret -def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) int k, vstart, vlast, v @@ -579,7 +579,7 @@ def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): @cython.wraparound(False) @cython.boundscheck(False) -def maybe_booleans_to_slice(ndarray[uint8_t] mask): +def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask): cdef: Py_ssize_t i, n = len(mask) Py_ssize_t start = 0, end = 0 @@ -775,14 +775,14 @@ def is_all_arraylike(obj: list) -> bool: # is a general, O(max(len(values), len(binner))) method. 
@cython.boundscheck(False) @cython.wraparound(False) -def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, +def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner, object closed='left', bint hasnans=False): """ Int64 (datetime64) version of generic python version in ``groupby.py``. """ cdef: Py_ssize_t lenidx, lenbin, i, j, bc, vc - ndarray[int64_t] bins + ndarray[int64_t, ndim=1] bins int64_t l_bin, r_bin, nat_count bint right_closed = closed == 'right' @@ -931,7 +931,7 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): return np.asarray(starts), np.asarray(ends) -def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys, +def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys, list sorted_labels) -> dict: """ Parameters @@ -2067,7 +2067,9 @@ cdef bint is_period_array(ndarray[object] values): if len(values) == 0: return False - for val in values: + for i in range(n): + val = values[i] + if is_period_object(val): if dtype_code == -10000: dtype_code = val._dtype._dtype_code @@ -2102,7 +2104,9 @@ cpdef bint is_interval_array(ndarray values): if len(values) == 0: return False - for val in values: + for i in range(n): + val = values[i] + if is_interval(val): if closed is None: closed = val.closed @@ -2144,7 +2148,7 @@ cpdef bint is_interval_array(ndarray values): @cython.boundscheck(False) @cython.wraparound(False) def maybe_convert_numeric( - ndarray[object] values, + ndarray[object, ndim=1] values, set na_values, bint convert_empty=True, bint coerce_numeric=False, @@ -2205,12 +2209,12 @@ def maybe_convert_numeric( int status, maybe_int Py_ssize_t i, n = values.size Seen seen = Seen(coerce_numeric) - ndarray[float64_t] floats = np.empty(n, dtype='f8') - ndarray[complex128_t] complexes = np.empty(n, dtype='c16') - ndarray[int64_t] ints = np.empty(n, dtype='i8') - ndarray[uint64_t] uints = np.empty(n, dtype='u8') - ndarray[uint8_t] bools = np.empty(n, dtype='u1') - ndarray[uint8_t] mask = np.zeros(n, dtype="u1") + ndarray[float64_t, ndim=1] floats = np.empty(n, dtype='f8') + ndarray[complex128_t, ndim=1] complexes = np.empty(n, dtype='c16') + ndarray[int64_t, ndim=1] ints = np.empty(n, dtype='i8') + ndarray[uint64_t, ndim=1] uints = np.empty(n, dtype='u8') + ndarray[uint8_t, ndim=1] bools = np.empty(n, dtype='u1') + ndarray[uint8_t, ndim=1] mask = np.zeros(n, dtype="u1") float64_t fval bint allow_null_in_int = convert_to_masked_nullable diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index ff15a2c720c2c..cfe9f40f12452 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -7,10 +7,9 @@ from numpy cimport import_array import_array() -from pandas._libs.lib import is_complex - from pandas._libs.util cimport ( is_array, + is_complex_object, is_real_number_object, ) @@ -196,7 +195,7 @@ cpdef assert_almost_equal(a, b, f"with rtol={rtol}, atol={atol}") return True - if is_complex(a) and is_complex(b): + if is_complex_object(a) and is_complex_object(b): if array_equivalent(a, b, strict_nan=True): # inf comparison return True From aa0a1d6e6b3062978dbc4c6c95ff2ddf564bf1f9 Mon Sep 17 00:00:00 2001 From: michal-gh Date: Wed, 6 Oct 2021 03:02:47 +0200 Subject: [PATCH 14/41] PERF: read_csv with memory_map=True when file encoding is UTF-8 (#43787) (#43787) --- asv_bench/benchmarks/io/csv.py | 29 ++++++++++++++++++++++++ doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/common.py | 2 +- pandas/tests/io/parser/test_encoding.py | 30 +++++++++++++++++++++++++ 4 files changed, 
61 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 35058ba03ade8..153cad403dcc3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ from pandas import ( Categorical, DataFrame, + concat, date_range, read_csv, to_datetime, @@ -459,6 +460,34 @@ def time_read_special_date(self, value, engine): ) +class ReadCSVMemMapUTF8: + + fname = "__test__.csv" + number = 5 + + def setup(self): + lines = [] + line_length = 128 + start_char = " " + end_char = "\U00010080" + # This for loop creates a list of 128-char strings + # consisting of consecutive Unicode chars + for lnum in range(ord(start_char), ord(end_char), line_length): + line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" + try: + line.encode("utf-8") + except UnicodeEncodeError: + # Some 16-bit words are not valid Unicode chars and must be skipped + continue + lines.append(line) + df = DataFrame(lines) + df = concat([df for n in range(100)], ignore_index=True) + df.to_csv(self.fname, index=False, header=False, encoding="utf-8") + + def time_read_memmapped_utf8(self): + read_csv(self.fname, header=None, memory_map=True, encoding="utf-8", engine="c") + + class ParseDateComparison(StringIORewind): params = ([False, True],) param_names = ["cache_dates"] diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index dcd31abaa8857..83820ac25491d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -364,6 +364,7 @@ Performance improvements - Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`) - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`) - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`) +- Improved performance of :meth:`pandas.read_csv` with ``memory_map=True`` when file encoding is UTF-8 (:issue:`43787`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 6dfddd571b88f..be6577e646ac3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -874,7 +874,7 @@ def __iter__(self) -> _MMapWrapper: def read(self, size: int = -1) -> str | bytes: # CSV c-engine uses read instead of iterating content: bytes = self.mmap.read(size) - if self.decode: + if self.decode and self.encoding != "utf-8": # memory mapping is applied before compression. Encoding should # be applied to the de-compressed data. 
final = size == -1 or len(content) < size diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 6ca3fdf9a6258..2573314f155cf 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -272,6 +272,36 @@ def test_chunk_splits_multibyte_char(all_parsers): tm.assert_frame_equal(dfr, df) +@skip_pyarrow +def test_readcsv_memmap_utf8(all_parsers): + """ + GH 43787 + + Test correct handling of UTF-8 chars when memory_map=True and encoding is UTF-8 + """ + lines = [] + line_length = 128 + start_char = " " + end_char = "\U00010080" + # This for loop creates a list of 128-char strings + # consisting of consecutive Unicode chars + for lnum in range(ord(start_char), ord(end_char), line_length): + line = "".join([chr(c) for c in range(lnum, lnum + 0x80)]) + "\n" + try: + line.encode("utf-8") + except UnicodeEncodeError: + continue + lines.append(line) + parser = all_parsers + df = DataFrame(lines) + with tm.ensure_clean("utf8test.csv") as fname: + df.to_csv(fname, index=False, header=False, encoding="utf-8") + dfr = parser.read_csv( + fname, header=None, memory_map=True, engine="c", encoding="utf-8" + ) + tm.assert_frame_equal(df, dfr) + + def test_not_readable(all_parsers): # GH43439 parser = all_parsers From ef35a190d18cd28ca6ba127fb2a88b64a588d8c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 5 Oct 2021 21:04:57 -0400 Subject: [PATCH 15/41] TYP: enable reportMissingImports (#43790) --- pandas/_libs/reshape.pyi | 2 +- pandas/_libs/tslibs/timedeltas.pyi | 2 +- pandas/io/excel/_pyxlsb.py | 1 + pyproject.toml | 3 +-- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/reshape.pyi b/pandas/_libs/reshape.pyi index 893826a35d41e..110687fcd0c31 100644 --- a/pandas/_libs/reshape.pyi +++ b/pandas/_libs/reshape.pyi @@ -1,6 +1,6 @@ import numpy as np -import pandas._tying as npt +from pandas._typing import npt def unstack( values: np.ndarray, # reshape_t[:, :] diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 8de02aa566456..7c0131cf28c9a 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -7,12 +7,12 @@ from typing import ( ) import numpy as np -from pands._typing import npt from pandas._libs.tslibs import ( NaTType, Tick, ) +from pandas._typing import npt _S = TypeVar("_S") diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 52a67336aaa82..4b2b9f7a3a678 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false from __future__ import annotations from pandas._typing import ( diff --git a/pyproject.toml b/pyproject.toml index 0223a1c035cbc..fe48a4d684cf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -159,14 +159,13 @@ skip = "pandas/__init__.py" pythonVersion = "3.8" typeCheckingMode = "strict" include = ["pandas"] -exclude = ["pandas/tests", "pandas/util/version"] +exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version"] reportGeneralTypeIssues = false reportConstantRedefinition = false reportFunctionMemberAccess = false reportImportCycles = false reportIncompatibleMethodOverride = false reportIncompatibleVariableOverride = false -reportMissingImports = false reportMissingModuleSource = false reportMissingTypeArgument = false reportMissingTypeStubs = false From eefd0f0da3e2957db8ed091d380b1c4a6da9cdb5 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Wed, 6 Oct 2021 02:07:56 
+0100 Subject: [PATCH 16/41] Don't suppress exception chaining for optional dependencies (#43882) --- doc/source/whatsnew/v1.4.0.rst | 3 ++- pandas/compat/_optional.py | 2 +- pandas/tests/test_optional_dependency.py | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 83820ac25491d..9ecd49ee31047 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -127,7 +127,8 @@ Other enhancements - :meth:`IntegerArray.all` , :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`) - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`) - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - +- The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 651729cd0ad44..adf20f3322a79 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -115,7 +115,7 @@ def import_optional_dependency( module = importlib.import_module(name) except ImportError: if errors == "raise": - raise ImportError(msg) from None + raise ImportError(msg) else: return None diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index f75ee0d0ddd95..c1d1948d6c31a 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -13,8 +13,10 @@ def test_import_optional(): match = "Missing .*notapackage.* pip .* conda .* notapackage" - with pytest.raises(ImportError, match=match): + with pytest.raises(ImportError, match=match) as exc_info: import_optional_dependency("notapackage") + # The original exception should be there as context: + assert isinstance(exc_info.value.__context__, ImportError) result = import_optional_dependency("notapackage", errors="ignore") assert result is None From d3f5a4473dd2ecb10125d70aa97a9792ac43aa04 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 5 Oct 2021 18:08:43 -0700 Subject: [PATCH 17/41] BUG: DataFrame arithmetic with subclass where constructor is not the subclass itself (#43897) --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/core/frame.py | 4 +-- pandas/tests/frame/test_arithmetic.py | 37 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9ecd49ee31047..2a3049895a390 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -408,6 +408,8 @@ Numeric - Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`) - Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top`` used (:issue:`41931`) - Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`) +- Bug in :class:`DataFrame` arithmetic ops with a subclass whose :meth:`_constructor` attribute is a callable other than the 
subclass itself (:issue:`43201`) +- Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69ba0904165f7..2eb66c7db0ba6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6955,7 +6955,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): # i.e. scalar, faster than checking np.ndim(right) == 0 with np.errstate(all="ignore"): bm = self._mgr.apply(array_op, right=right) - return type(self)(bm) + return self._constructor(bm) elif isinstance(right, DataFrame): assert self.index.equals(right.index) @@ -6976,7 +6976,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): right._mgr, # type: ignore[arg-type] array_op, ) - return type(self)(bm) + return self._constructor(bm) elif isinstance(right, Series) and axis == 1: # axis=1 means we want to operate row-by-row diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index afa9593807acc..1ddb18c218cc6 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,5 +1,6 @@ from collections import deque from datetime import datetime +import functools import operator import re @@ -1845,3 +1846,39 @@ def test_bool_frame_mult_float(): result = df * 1.0 expected = DataFrame(np.ones((2, 2)), list("ab"), list("cd")) tm.assert_frame_equal(result, expected) + + +def test_frame_op_subclass_nonclass_constructor(): + # GH#43201 subclass._constructor is a function, not the subclass itself + + class SubclassedSeries(Series): + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + class SubclassedDataFrame(DataFrame): + _metadata = ["my_extra_data"] + + def __init__(self, my_extra_data, *args, **kwargs): + self.my_extra_data = my_extra_data + super().__init__(*args, **kwargs) + + @property + def _constructor(self): + return functools.partial(type(self), self.my_extra_data) + + @property + def _constructor_sliced(self): + return SubclassedSeries + + sdf = SubclassedDataFrame("some_data", {"A": [1, 2, 3], "B": [4, 5, 6]}) + result = sdf * 2 + expected = SubclassedDataFrame("some_data", {"A": [2, 4, 6], "B": [8, 10, 12]}) + tm.assert_frame_equal(result, expected) + + result = sdf + sdf + tm.assert_frame_equal(result, expected) From 114621514a608a30130bf530fcef7df9d54f612f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 5 Oct 2021 18:09:29 -0700 Subject: [PATCH 18/41] REF: remove _get_attributes_dict (#43895) --- pandas/core/indexes/base.py | 30 ++++++++++-------------------- pandas/core/indexes/category.py | 2 -- pandas/core/indexes/datetimes.py | 7 +------ pandas/core/indexes/interval.py | 8 ++++++-- pandas/core/indexes/period.py | 1 - pandas/core/indexes/range.py | 5 ++--- 6 files changed, 19 insertions(+), 34 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c9e128ffc4289..2b49a88e27961 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -728,13 +728,6 @@ def _format_duplicate_message(self) -> DataFrame: # -------------------------------------------------------------------- # Index Internals Methods - @final - def _get_attributes_dict(self) -> dict[str_t, Any]: - """ - Return an attributes dict for my class. 
- """ - return {k: getattr(self, k, None) for k in self._attributes} - def _shallow_copy(self: _IndexT, values, name: Hashable = no_default) -> _IndexT: """ Create a new Index with the same class as the caller, don't copy the @@ -859,9 +852,7 @@ def __array_wrap__(self, result, context=None): if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: return result - attrs = self._get_attributes_dict() - attrs.pop("freq", None) # For DatetimeIndex/TimedeltaIndex - return Index(result, **attrs) + return Index(result, name=self.name) @cache_readonly def dtype(self) -> DtypeObj: @@ -2493,8 +2484,7 @@ def _is_multi(self) -> bool: # Pickle Methods def __reduce__(self): - d = {"data": self._data} - d.update(self._get_attributes_dict()) + d = {"data": self._data, "name": self.name} return _new_Index, (type(self), d), None # -------------------------------------------------------------------- @@ -5820,29 +5810,29 @@ def map(self, mapper, na_action=None): new_values = self._map_values(mapper, na_action=na_action) - attributes = self._get_attributes_dict() - # we can return a MultiIndex if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): names = self.names - elif attributes.get("name"): - names = [attributes.get("name")] * len(new_values[0]) + elif self.name: + names = [self.name] * len(new_values[0]) else: names = None return MultiIndex.from_tuples(new_values, names=names) - attributes["copy"] = False + dtype = None if not new_values.size: # empty - attributes["dtype"] = self.dtype + dtype = self.dtype if self._is_backward_compat_public_numeric_index and is_numeric_dtype( new_values.dtype ): - return self._constructor(new_values, **attributes) + return self._constructor( + new_values, dtype=dtype, copy=False, name=self.name + ) - return Index._with_infer(new_values, **attributes) + return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) # TODO: De-duplicate with map, xref GH#32349 @final diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c45543a9187bd..02bbfe69be1b8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -194,8 +194,6 @@ def _engine_type(self): np.int64: libindex.Int64Engine, }[self.codes.dtype.type] - _attributes = ["name"] - # -------------------------------------------------------------------- # Constructors diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d556466554ea4..6078da3bedd8c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -357,12 +357,7 @@ def _is_dates_only(self) -> bool: return self.tz is None and is_dates_only(self._values) # type: ignore[arg-type] def __reduce__(self): - - # we use a special reduce here because we need - # to simply set the .tz (and not reinterpret it) - - d = {"data": self._data} - d.update(self._get_attributes_dict()) + d = {"data": self._data, "name": self.name} return _new_DatetimeIndex, (type(self), d), None def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f494638ba1aa4..165048e2a591a 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -354,8 +354,12 @@ def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) def __reduce__(self): - d = {"left": self.left, "right": self.right, "closed": self.closed} - d.update(self._get_attributes_dict()) + d = { + 
"left": self.left, + "right": self.right, + "closed": self.closed, + "name": self.name, + } return _new_IntervalIndex, (type(self), d), None @property diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4c4902d3ce89f..e422f2bc3ff9a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -148,7 +148,6 @@ class PeriodIndex(DatetimeIndexOpsMixin): """ _typ = "periodindex" - _attributes = ["name"] _data: PeriodArray freq: BaseOffset diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 71bc4af78db6b..51d9f15390789 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -206,7 +206,7 @@ def _get_data_as_items(self): return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] def __reduce__(self): - d = self._get_attributes_dict() + d = {"name": self.name} d.update(dict(self._get_data_as_items())) return ibase._new_Index, (type(self), d), None @@ -913,7 +913,6 @@ def _arith_method(self, other, op): # TODO: if other is a RangeIndex we may have more efficient options other = extract_array(other, extract_numpy=True, extract_range=True) - attrs = self._get_attributes_dict() left, right = self, other @@ -935,7 +934,7 @@ def _arith_method(self, other, op): rstart = op(left.start, right) rstop = op(left.stop, right) - result = type(self)(rstart, rstop, rstep, **attrs) + result = type(self)(rstart, rstop, rstep, name=self.name) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return From 58ff02d373e8c9b46d0f3125835133dab700d705 Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Wed, 6 Oct 2021 04:10:03 +0300 Subject: [PATCH 19/41] Annotates `indexers/utils.py` functions that don't return anything with `None` (#43893) --- pandas/core/indexers/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py index eacc7960a82aa..23d83343c96a2 100644 --- a/pandas/core/indexers/utils.py +++ b/pandas/core/indexers/utils.py @@ -363,7 +363,7 @@ def length_of_indexer(indexer, target=None) -> int: raise AssertionError("cannot find the length of the indexer") -def deprecate_ndim_indexing(result, stacklevel: int = 3): +def deprecate_ndim_indexing(result, stacklevel: int = 3) -> None: """ Helper function to raise the deprecation warning for multi-dimensional indexing on 1D Series/Index. @@ -409,7 +409,7 @@ def unpack_1tuple(tup): return tup -def check_key_length(columns: Index, key, value: DataFrame): +def check_key_length(columns: Index, key, value: DataFrame) -> None: """ Checks if a key used as indexer has the same length as the columns it is associated with. 
From c9b0a6d1bcb3d2363a3867a8bc6e5c66a56c556b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 6 Oct 2021 05:27:58 -0700 Subject: [PATCH 20/41] CI: Test Python 3.10 on MacOS and Windows too (#43772) --- .github/workflows/python-dev.yml | 23 ++++++++++++++++++----- pandas/tests/frame/test_reductions.py | 8 +++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 596c3b6df9d49..3a139936fbd22 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -21,12 +21,20 @@ env: jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macOS-latest, windows-latest] + name: actions-310-dev timeout-minutes: 60 + env: + NUMPY_WHEELS_AVAILABLE: ${{ matrix.os == 'ubuntu-latest' }} + concurrency: - group: ${{ github.ref }}-dev + group: ${{ github.ref }}-${{ matrix.os }}-dev cancel-in-progress: ${{github.event_name == 'pull_request'}} steps: @@ -40,12 +48,16 @@ jobs: python-version: '3.10-dev' - name: Install dependencies + shell: bash run: | python -m pip install --upgrade pip setuptools wheel - pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy - pip install git+https://github.com/pytest-dev/pytest.git + if [[ "$NUMPY_WHEELS_AVAILABLE" == "true" ]]; then + pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy + else + pip install git+https://github.com/numpy/numpy.git + fi pip install git+https://github.com/nedbat/coveragepy.git - pip install cython python-dateutil pytz hypothesis pytest-xdist pytest-cov + pip install cython python-dateutil pytz hypothesis pytest>=6.2.5 pytest-xdist pytest-cov pip list - name: Build Pandas @@ -58,6 +70,7 @@ jobs: python -c "import pandas; pandas.show_versions();" - name: Test with pytest + shell: bash run: | ci/run_tests.sh # GH 41935 diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 886cdfb7d76b0..258e4e6eb0cc9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1366,11 +1366,9 @@ def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture if isinstance(tz, tzlocal) and is_platform_windows(): - request.node.add_marker( - pytest.mark.xfail( - reason="GH#37659 OSError raised within tzlocal bc Windows " - "chokes in times before 1970-01-01" - ) + pytest.skip( + "GH#37659 OSError raised within tzlocal bc Windows " + "chokes in times before 1970-01-01" ) df = DataFrame( From 28c28c76e19efb76297a536c800de6c1402919ff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Oct 2021 05:57:43 -0700 Subject: [PATCH 21/41] ENH: ExponentialMovingWindow.sum (#43871) --- doc/source/reference/window.rst | 1 + doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/_libs/window/aggregations.pyi | 3 +- pandas/_libs/window/aggregations.pyx | 57 +++++++------- pandas/core/window/ewm.py | 85 +++++++++++++++++--- pandas/core/window/numba_.py | 111 +++++++++++++++------------ pandas/tests/window/test_ewm.py | 19 +++++ pandas/tests/window/test_numba.py | 29 ++++--- 8 files changed, 207 insertions(+), 100 deletions(-) diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 5e230a533625f..0be3184a9356c 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -88,6 +88,7 @@ 
Exponentially-weighted window functions :toctree: api/ ExponentialMovingWindow.mean + ExponentialMovingWindow.sum ExponentialMovingWindow.std ExponentialMovingWindow.var ExponentialMovingWindow.corr diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2a3049895a390..daf0d0d000079 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -128,7 +128,7 @@ Other enhancements - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`) - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) -- +- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index 879809a259266..f3317ff5a60be 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -100,7 +100,7 @@ def roll_weighted_var( minp: int, # int64_t ddof: int, # unsigned int ) -> np.ndarray: ... # np.ndarray[np.float64] -def ewma( +def ewm( vals: np.ndarray, # const float64_t[:] start: np.ndarray, # const int64_t[:] end: np.ndarray, # const int64_t[:] @@ -109,6 +109,7 @@ def ewma( adjust: bool, ignore_na: bool, deltas: np.ndarray, # const float64_t[:] + normalize: bool, ) -> np.ndarray: ... # np.ndarray[np.float64] def ewmcov( input_x: np.ndarray, # const float64_t[:] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 29fe20090875b..1941a3c4a37f0 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1604,13 +1604,13 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, # ---------------------------------------------------------------------- -# Exponentially weighted moving average +# Exponentially weighted moving -def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, - int minp, float64_t com, bint adjust, bint ignore_na, - const float64_t[:] deltas=None) -> np.ndarray: +def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, + int minp, float64_t com, bint adjust, bint ignore_na, + const float64_t[:] deltas=None, bint normalize=True) -> np.ndarray: """ - Compute exponentially-weighted moving average using center-of-mass. + Compute exponentially-weighted moving average or sum using center-of-mass. Parameters ---------- @@ -1623,6 +1623,8 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, ignore_na : bool deltas : ndarray (float64 type), optional. If None, implicitly assumes equally spaced points (used when `times` is not passed) + normalize : bool, optional. + If True, calculate the mean. If False, calculate the sum. 
Returns ------- @@ -1634,7 +1636,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, const float64_t[:] sub_vals const float64_t[:] sub_deltas=None ndarray[float64_t] sub_output, output = np.empty(N, dtype=np.float64) - float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur + float64_t alpha, old_wt_factor, new_wt, weighted, old_wt, cur bint is_observation, use_deltas if N == 0: @@ -1657,10 +1659,10 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, win_size = len(sub_vals) sub_output = np.empty(win_size, dtype=np.float64) - weighted_avg = sub_vals[0] - is_observation = weighted_avg == weighted_avg + weighted = sub_vals[0] + is_observation = weighted == weighted nobs = int(is_observation) - sub_output[0] = weighted_avg if nobs >= minp else NaN + sub_output[0] = weighted if nobs >= minp else NaN old_wt = 1. with nogil: @@ -1668,37 +1670,38 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, cur = sub_vals[i] is_observation = cur == cur nobs += is_observation - if weighted_avg == weighted_avg: + if weighted == weighted: if is_observation or not ignore_na: - if use_deltas: - old_wt *= old_wt_factor ** sub_deltas[i - 1] + if normalize: + if use_deltas: + old_wt *= old_wt_factor ** sub_deltas[i - 1] + else: + old_wt *= old_wt_factor else: - old_wt *= old_wt_factor + weighted = old_wt_factor * weighted if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ((old_wt * weighted_avg) + - (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt + if normalize: + # avoid numerical errors on constant series + if weighted != cur: + weighted = old_wt * weighted + new_wt * cur + weighted /= (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. else: - old_wt = 1. 
+ weighted += cur elif is_observation: - weighted_avg = cur + weighted = cur - sub_output[i] = weighted_avg if nobs >= minp else NaN + sub_output[i] = weighted if nobs >= minp else NaN output[s:e] = sub_output return output -# ---------------------------------------------------------------------- -# Exponentially weighted moving covariance - - def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:] end, int minp, const float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias) -> np.ndarray: diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 29a6704ae5092..d769f846b3bdc 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -46,8 +46,8 @@ window_agg_numba_parameters, ) from pandas.core.window.numba_ import ( - generate_ewma_numba_table_func, - generate_numba_ewma_func, + generate_numba_ewm_func, + generate_numba_ewm_table_func, ) from pandas.core.window.online import ( EWMMeanState, @@ -469,17 +469,21 @@ def aggregate(self, func, *args, **kwargs): def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): if self.method == "single": - ewma_func = generate_numba_ewma_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - numba_cache_key = (lambda x: x, "ewma") + func = generate_numba_ewm_func + numba_cache_key = (lambda x: x, "ewm_mean") else: - ewma_func = generate_ewma_numba_table_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - numba_cache_key = (lambda x: x, "ewma_table") + func = generate_numba_ewm_table_func + numba_cache_key = (lambda x: x, "ewm_mean_table") + ewm_func = func( + engine_kwargs=engine_kwargs, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=self._deltas, + normalize=True, + ) return self._apply( - ewma_func, + ewm_func, numba_cache_key=numba_cache_key, ) elif engine in ("cython", None): @@ -489,11 +493,68 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): deltas = None if self.times is None else self._deltas window_func = partial( - window_aggregations.ewma, + window_aggregations.ewm, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=deltas, + normalize=True, + ) + return self._apply(window_func) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + @doc( + template_header, + create_section_header("Parameters"), + args_compat, + window_agg_numba_parameters, + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes.replace("\n", "", 1), + window_method="ewm", + aggregation_description="(exponential weighted moment) sum", + agg_method="sum", + ) + def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): + if not self.adjust: + raise NotImplementedError("sum is not implemented with adjust=False") + if maybe_use_numba(engine): + if self.method == "single": + func = generate_numba_ewm_func + numba_cache_key = (lambda x: x, "ewm_sum") + else: + func = generate_numba_ewm_table_func + numba_cache_key = (lambda x: x, "ewm_sum_table") + ewm_func = func( + engine_kwargs=engine_kwargs, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=self._deltas, + normalize=False, + ) + return self._apply( + ewm_func, + numba_cache_key=numba_cache_key, + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept 
engine_kwargs") + nv.validate_window_func("sum", args, kwargs) + + deltas = None if self.times is None else self._deltas + window_func = partial( + window_aggregations.ewm, com=self._com, adjust=self.adjust, ignore_na=self.ignore_na, deltas=deltas, + normalize=False, ) return self._apply(window_func) else: diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index ab1eb9d3a2688..f41711a4d1f19 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -80,15 +80,16 @@ def roll_apply( return roll_apply -def generate_numba_ewma_func( +def generate_numba_ewm_func( engine_kwargs: dict[str, bool] | None, com: float, adjust: bool, ignore_na: bool, deltas: np.ndarray, + normalize: bool, ): """ - Generate a numba jitted ewma function specified by values + Generate a numba jitted ewm mean or sum function specified by values from engine_kwargs. Parameters @@ -99,6 +100,7 @@ def generate_numba_ewma_func( adjust : bool ignore_na : bool deltas : numpy.ndarray + normalize : bool Returns ------- @@ -106,14 +108,15 @@ def generate_numba_ewma_func( """ nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (lambda x: x, "ewma") + str_key = "ewm_mean" if normalize else "ewm_sum" + cache_key = (lambda x: x, str_key) if cache_key in NUMBA_FUNC_CACHE: return NUMBA_FUNC_CACHE[cache_key] numba = import_optional_dependency("numba") @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def ewma( + def ewm( values: np.ndarray, begin: np.ndarray, end: np.ndarray, @@ -130,43 +133,47 @@ def ewma( window = values[start:stop] sub_result = np.empty(len(window)) - weighted_avg = window[0] - nobs = int(not np.isnan(weighted_avg)) - sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan + weighted = window[0] + nobs = int(not np.isnan(weighted)) + sub_result[0] = weighted if nobs >= minimum_periods else np.nan old_wt = 1.0 for j in range(1, len(window)): cur = window[j] is_observation = not np.isnan(cur) nobs += is_observation - if not np.isnan(weighted_avg): + if not np.isnan(weighted): if is_observation or not ignore_na: - - # note that len(deltas) = len(vals) - 1 and deltas[i] is to be - # used in conjunction with vals[i+1] - old_wt *= old_wt_factor ** deltas[start + j - 1] + if normalize: + # note that len(deltas) = len(vals) - 1 and deltas[i] + # is to be used in conjunction with vals[i+1] + old_wt *= old_wt_factor ** deltas[start + j - 1] + else: + weighted = old_wt_factor * weighted if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ( - (old_wt * weighted_avg) + (new_wt * cur) - ) / (old_wt + new_wt) - if adjust: - old_wt += new_wt + if normalize: + # avoid numerical errors on constant series + if weighted != cur: + weighted = old_wt * weighted + new_wt * cur + if normalize: + weighted = weighted / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1.0 else: - old_wt = 1.0 + weighted += cur elif is_observation: - weighted_avg = cur + weighted = cur - sub_result[j] = weighted_avg if nobs >= minimum_periods else np.nan + sub_result[j] = weighted if nobs >= minimum_periods else np.nan result[start:stop] = sub_result return result - return ewma + return ewm def generate_numba_table_func( @@ -252,15 +259,16 @@ def nan_agg_with_axis(table): return nan_agg_with_axis -def generate_ewma_numba_table_func( +def generate_numba_ewm_table_func( engine_kwargs: dict[str, bool] | None, com: float, adjust: bool, ignore_na: bool, deltas: np.ndarray, + normalize: bool, ): """ - 
Generate a numba jitted ewma function applied table wise specified + Generate a numba jitted ewm mean or sum function applied table wise specified by values from engine_kwargs. Parameters @@ -271,6 +279,7 @@ def generate_ewma_numba_table_func( adjust : bool ignore_na : bool deltas : numpy.ndarray + normalize: bool Returns ------- @@ -278,14 +287,15 @@ def generate_ewma_numba_table_func( """ nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (lambda x: x, "ewma_table") + str_key = "ewm_mean_table" if normalize else "ewm_sum_table" + cache_key = (lambda x: x, str_key) if cache_key in NUMBA_FUNC_CACHE: return NUMBA_FUNC_CACHE[cache_key] numba = import_optional_dependency("numba") @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def ewma_table( + def ewm_table( values: np.ndarray, begin: np.ndarray, end: np.ndarray, @@ -297,35 +307,42 @@ def ewma_table( old_wt = np.ones(values.shape[1]) result = np.empty(values.shape) - weighted_avg = values[0].copy() - nobs = (~np.isnan(weighted_avg)).astype(np.int64) - result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + weighted = values[0].copy() + nobs = (~np.isnan(weighted)).astype(np.int64) + result[0] = np.where(nobs >= minimum_periods, weighted, np.nan) for i in range(1, len(values)): cur = values[i] is_observations = ~np.isnan(cur) nobs += is_observations.astype(np.int64) for j in numba.prange(len(cur)): - if not np.isnan(weighted_avg[j]): + if not np.isnan(weighted[j]): if is_observations[j] or not ignore_na: - - # note that len(deltas) = len(vals) - 1 and deltas[i] is to be - # used in conjunction with vals[i+1] - old_wt[j] *= old_wt_factor ** deltas[i - 1] + if normalize: + # note that len(deltas) = len(vals) - 1 and deltas[i] + # is to be used in conjunction with vals[i+1] + old_wt[j] *= old_wt_factor ** deltas[i - 1] + else: + weighted[j] = old_wt_factor * weighted[j] if is_observations[j]: - # avoid numerical errors on constant series - if weighted_avg[j] != cur[j]: - weighted_avg[j] = ( - (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j]) - ) / (old_wt[j] + new_wt) - if adjust: - old_wt[j] += new_wt + if normalize: + # avoid numerical errors on constant series + if weighted[j] != cur[j]: + weighted[j] = ( + old_wt[j] * weighted[j] + new_wt * cur[j] + ) + if normalize: + weighted[j] = weighted[j] / (old_wt[j] + new_wt) + if adjust: + old_wt[j] += new_wt + else: + old_wt[j] = 1.0 else: - old_wt[j] = 1.0 + weighted[j] += cur[j] elif is_observations[j]: - weighted_avg[j] = cur[j] + weighted[j] = cur[j] - result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + result[i] = np.where(nobs >= minimum_periods, weighted, np.nan) return result - return ewma_table + return ewm_table diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 5579444f99bbb..4cb5d0342572b 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -241,3 +241,22 @@ def test_times_string_col_deprecated(): result = df.ewm(halflife="1 day", min_periods=0, times="time_col").mean() expected = df.ewm(halflife=1.0, min_periods=0).mean() tm.assert_frame_equal(result, expected) + + +def test_ewm_sum_adjust_false_notimplemented(): + data = Series(range(1)).ewm(com=1, adjust=False) + with pytest.raises(NotImplementedError, match="sum is not"): + data.sum() + + +@pytest.mark.parametrize( + "expected_data, ignore", + [[[10.0, 5.0, 2.5, 11.25], False], [[10.0, 5.0, 5.0, 12.5], True]], +) +def test_ewm_sum(expected_data, ignore): + # xref from Numbagg tests + # 
https://github.com/numbagg/numbagg/blob/v0.2.1/numbagg/test/test_moving.py#L50
+    data = Series([10, 0, np.nan, 10])
+    result = data.ewm(alpha=0.5, ignore_na=ignore).sum()
+    expected = Series(expected_data)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
index af2ca7270c982..d47b3e856cb25 100644
--- a/pandas/tests/window/test_numba.py
+++ b/pandas/tests/window/test_numba.py
@@ -159,28 +159,31 @@ def add(values, x):


 @td.skip_if_no("numba")
-class TestEWMMean:
+class TestEWM:
     @pytest.mark.parametrize(
         "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
     )
-    def test_invalid_engine(self, grouper):
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_invalid_engine(self, grouper, method):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="engine must be either"):
-            grouper(df).ewm(com=1.0).mean(engine="foo")
+            getattr(grouper(df).ewm(com=1.0), method)(engine="foo")

     @pytest.mark.parametrize(
         "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
     )
-    def test_invalid_engine_kwargs(self, grouper):
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_invalid_engine_kwargs(self, grouper, method):
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         with pytest.raises(ValueError, match="cython engine does not"):
-            grouper(df).ewm(com=1.0).mean(
+            getattr(grouper(df).ewm(com=1.0), method)(
                 engine="cython", engine_kwargs={"nopython": True}
             )

     @pytest.mark.parametrize("grouper", ["None", "groupby"])
+    @pytest.mark.parametrize("method", ["mean", "sum"])
     def test_cython_vs_numba(
-        self, grouper, nogil, parallel, nopython, ignore_na, adjust
+        self, grouper, method, nogil, parallel, nopython, ignore_na, adjust
     ):
         if grouper == "None":
             grouper = lambda x: x
@@ -188,15 +191,16 @@ def test_cython_vs_numba(
         else:
             grouper = lambda x: x.groupby("A")
         warn = None
-
+        if method == "sum":
+            adjust = True
         df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
         ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
         with tm.assert_produces_warning(warn, match="nuisance"):
             # GH#42738
-            result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
-            expected = ewm.mean(engine="cython")
+            result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
+            expected = getattr(ewm, method)(engine="cython")

         tm.assert_frame_equal(result, expected)

@@ -358,15 +362,16 @@ def test_table_method_expanding_methods(
         tm.assert_frame_equal(result, expected)

     @pytest.mark.parametrize("data", [np.eye(3), np.ones((2, 3)), np.ones((3, 2))])
-    def test_table_method_ewm(self, data, axis, nogil, parallel, nopython):
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_table_method_ewm(self, data, method, axis, nogil, parallel, nopython):
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

         df = DataFrame(data)

-        result = df.ewm(com=1, method="table", axis=axis).mean(
+        result = getattr(df.ewm(com=1, method="table", axis=axis), method)(
             engine_kwargs=engine_kwargs, engine="numba"
         )
-        expected = df.ewm(com=1, method="single", axis=axis).mean(
+        expected = getattr(df.ewm(com=1, method="single", axis=axis), method)(
             engine_kwargs=engine_kwargs, engine="numba"
         )
         tm.assert_frame_equal(result, expected)
From f157d4d77f35ac0a9296e0a5ef6b5132b6300eed Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin
<37011898+mzeitlin11@users.noreply.github.com>
Date: Wed, 6 Oct 2021 09:29:09 -0400
Subject: [PATCH 22/41] TST: slow collection in test_algos.py (#43898)

---
 pandas/tests/test_algos.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 5488c076554fd..9a9cd9fa4baaa 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1797,13 +1797,13 @@ def test_too_many_ndims(self):

     @pytest.mark.single
     @pytest.mark.high_memory
-    @pytest.mark.parametrize(
-        "values",
-        [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)],
-        ids=["1d", "2d"],
-    )
-    def test_pct_max_many_rows(self, values):
+    def test_pct_max_many_rows(self):
         # GH 18271
+        values = np.arange(2 ** 24 + 1)
+        result = algos.rank(values, pct=True).max()
+        assert result == 1
+
+        values = np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)
         result = algos.rank(values, pct=True).max()
         assert result == 1

From cdc7b4a86a1e63c03eb437eecd11c1396525e79a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 6 Oct 2021 11:32:42 -0700
Subject: [PATCH 23/41] ENH: implement ExtensionArray.__array_ufunc__ (#43899)

---
 doc/source/whatsnew/v1.4.0.rst            |  2 +-
 pandas/core/arraylike.py                  | 20 +++++++++++++-
 pandas/core/arrays/base.py                | 15 +++++++++++
 pandas/core/arrays/boolean.py             |  3 +++
 pandas/tests/arrays/boolean/test_ops.py   |  7 +++++
 pandas/tests/arrays/test_timedeltas.py    | 19 ++++++++++++++
 pandas/tests/extension/arrow/test_bool.py |  5 +++-
 pandas/tests/extension/base/ops.py        | 32 ++++++++++++++++++++---
 8 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index daf0d0d000079..22b49c35e0e68 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -522,7 +522,7 @@ Sparse

 ExtensionArray
 ^^^^^^^^^^^^^^
--
+- NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__``, ``__pos__``, and ``__neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`)
 -

 Styler
diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
index f114278caf3ee..3d209189d97d8 100644
--- a/pandas/core/arraylike.py
+++ b/pandas/core/arraylike.py
@@ -371,6 +371,8 @@ def reconstruct(result):
         # * len(inputs) > 1 is doable when we know that we have
         #   aligned blocks / dtypes.
         inputs = tuple(np.asarray(x) for x in inputs)
+        # Note: we can't use default_array_ufunc here bc reindexing means
+        # that `self` may not be among `inputs`
         result = getattr(ufunc, method)(*inputs, **kwargs)
     elif self.ndim == 1:
         # ufunc(series, ...)
@@ -387,7 +389,7 @@ def reconstruct(result):
     else:
         # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
         # Those can have an axis keyword and thus can't be called block-by-block
-        result = getattr(ufunc, method)(np.asarray(inputs[0]), **kwargs)
+        result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)

     result = reconstruct(result)
     return result
@@ -452,3 +454,19 @@ def _assign_where(out, result, where) -> None:
         out[:] = result
     else:
         np.putmask(out, where, result)
+
+
+def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
+    """
+    Fallback to the behavior we would get if we did not define __array_ufunc__.
+
+    Notes
+    -----
+    We are assuming that `self` is among `inputs`.
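+
+    For example, if an ExtensionArray ``arr`` appears among ``inputs``,
+    ``default_array_ufunc(arr, np.exp, "__call__", arr)`` behaves like
+    ``np.exp(np.asarray(arr))``.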
+ """ + if not any(x is self for x in inputs): + raise NotImplementedError + + new_inputs = [x if x is not self else np.asarray(x) for x in inputs] + + return getattr(ufunc, method)(*new_inputs, **kwargs) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8ee5a4a2d913a..b17f309e5f9fb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -65,6 +65,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + arraylike, missing, ops, ) @@ -1366,6 +1367,20 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype): ) return result + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + if any( + isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs + ): + return NotImplemented + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + class ExtensionOpsMixin: """ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 69896a389102f..1df7c191bdb68 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -604,3 +604,6 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): else: result[mask] = np.nan return result + + def __abs__(self): + return self.copy() diff --git a/pandas/tests/arrays/boolean/test_ops.py b/pandas/tests/arrays/boolean/test_ops.py index 52f602258a049..95ebe8528c2e5 100644 --- a/pandas/tests/arrays/boolean/test_ops.py +++ b/pandas/tests/arrays/boolean/test_ops.py @@ -18,3 +18,10 @@ def test_invert(self): {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] ) tm.assert_frame_equal(result, expected) + + def test_abs(self): + # matching numpy behavior, abs is the identity function + arr = pd.array([True, False, None], dtype="boolean") + result = abs(arr) + + tm.assert_extension_array_equal(result, arr) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 9e2b8e0f1603e..98329776242f1 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -90,6 +90,19 @@ def test_abs(self): result = abs(arr) tm.assert_timedelta_array_equal(result, expected) + result2 = np.abs(arr) + tm.assert_timedelta_array_equal(result2, expected) + + def test_pos(self): + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") + arr = TimedeltaArray(vals) + + result = +arr + tm.assert_timedelta_array_equal(result, arr) + + result2 = np.positive(arr) + tm.assert_timedelta_array_equal(result2, arr) + def test_neg(self): vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") arr = TimedeltaArray(vals) @@ -100,6 +113,9 @@ def test_neg(self): result = -arr tm.assert_timedelta_array_equal(result, expected) + result2 = np.negative(arr) + tm.assert_timedelta_array_equal(result2, expected) + def test_neg_freq(self): tdi = pd.timedelta_range("2 Days", periods=4, freq="H") arr = TimedeltaArray(tdi, freq=tdi.freq) @@ -108,3 +124,6 @@ def test_neg_freq(self): result = -arr tm.assert_timedelta_array_equal(result, expected) + + result2 = np.negative(arr) + tm.assert_timedelta_array_equal(result2, expected) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 6a16433aa0a32..d262f09182a9c 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -54,7 
+54,10 @@ def test_view(self, data): # __setitem__ does not work, so we only have a smoke-test data.view() - @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") + @pytest.mark.xfail( + raises=AttributeError, + reason="__eq__ incorrectly returns bool instead of ndarray[bool]", + ) def test_contains(self, data, data_missing): super().test_contains(data, data_missing) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index ca22973d0b4d3..e9ceec3a3d7e6 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pytest import pandas as pd @@ -128,11 +129,13 @@ class BaseComparisonOpsTests(BaseOpsUtil): """Various Series and DataFrame comparison ops methods.""" def _compare_other(self, s, data, op_name, other): + op = self.get_op_from_name(op_name) - if op_name == "__eq__": - assert not op(s, other).all() - elif op_name == "__ne__": - assert op(s, other).all() + if op_name in ["__eq__", "__ne__"]: + # comparison should match point-wise comparisons + result = op(s, other) + expected = s.combine(other, op) + self.assert_series_equal(result, expected) else: @@ -182,3 +185,24 @@ def test_invert(self, data): result = ~s expected = pd.Series(~data, name="name") self.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs]) + def test_unary_ufunc_dunder_equivalence(self, data, ufunc): + # the dunder __pos__ works if and only if np.positive works, + # same for __neg__/np.negative and __abs__/np.abs + attr = {np.positive: "__pos__", np.negative: "__neg__", np.abs: "__abs__"}[ + ufunc + ] + + exc = None + try: + result = getattr(data, attr)() + except Exception as err: + exc = err + + # if __pos__ raised, then so should the ufunc + with pytest.raises((type(exc), TypeError)): + ufunc(data) + else: + alt = ufunc(data) + self.assert_extension_array_equal(result, alt) From 2688ca8c226609a5d08f8d4e61e2570a3332bc04 Mon Sep 17 00:00:00 2001 From: realead Date: Thu, 7 Oct 2021 02:36:47 +0200 Subject: [PATCH 24/41] [ENH] introducing IntpHashMap and making unique_label_indices use intp (#40653) --- pandas/_libs/hashtable.pyi | 1 + pandas/_libs/hashtable.pyx | 47 ++++++---------------- pandas/_libs/hashtable_func_helper.pxi.in | 48 +++++++++++++++++++++++ pandas/core/sorting.py | 3 +- pandas/tests/libs/test_hashtable.py | 10 +++++ pandas/tests/test_algos.py | 2 +- 6 files changed, 73 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index bf7df5776896b..9c1de67a7ba2a 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ... class UInt8HashTable(HashTable): ... class StringHashTable(HashTable): ... class PyObjectHashTable(HashTable): ... +class IntpHashTable(HashTable): ... 
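+
+# At runtime hashtable.pyx aliases IntpHashTable to Int64HashTable or
+# Int32HashTable, matching the width of np.intp on the platform.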
def duplicated_int64( values: np.ndarray, # const int64_t[:] values diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 3eb7bcc673cd4..6e97c13c644cf 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -65,6 +65,18 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128 include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" + +# map derived hash-map types onto basic hash-map types: +if np.dtype(np.intp) == np.dtype(np.int64): + IntpHashTable = Int64HashTable + unique_label_indices = _unique_label_indices_int64 +elif np.dtype(np.intp) == np.dtype(np.int32): + IntpHashTable = Int32HashTable + unique_label_indices = _unique_label_indices_int32 +else: + raise ValueError(np.dtype(np.intp)) + + cdef class Factorizer: cdef readonly: Py_ssize_t count @@ -168,38 +180,3 @@ cdef class Int64Factorizer(Factorizer): self.count = len(self.uniques) return labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def unique_label_indices(const int64_t[:] labels) -> ndarray: - """ - Indices of the first occurrences of the unique labels - *excluding* -1. equivalent to: - np.unique(labels, return_index=True)[1] - """ - cdef: - int ret = 0 - Py_ssize_t i, n = len(labels) - kh_int64_t *table = kh_init_int64() - Int64Vector idx = Int64Vector() - ndarray[int64_t, ndim=1] arr - Int64VectorData *ud = idx.data - - kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) - - with nogil: - for i in range(n): - kh_put_int64(table, labels[i], &ret) - if ret != 0: - if needs_resize(ud): - with gil: - idx.resize() - append_data_int64(ud, i) - - kh_destroy_int64(table) - - arr = idx.to_array() - arr = arr[np.asarray(labels)[arr].argsort()] - - return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index ceb473a0b06af..fb8ce79a924a4 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna): else: raise TypeError(values.dtype) + + +{{py: + +# name, dtype, ttype, c_type +dtypes = [('Int64', 'int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32', 'int32_t'), ] + +}} + +{{for name, dtype, ttype, c_type in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: + """ + Indices of the first occurrences of the unique labels + *excluding* -1. 
equivalent to: + np.unique(labels, return_index=True)[1] + """ + cdef: + int ret = 0 + Py_ssize_t i, n = len(labels) + kh_{{ttype}}_t *table = kh_init_{{ttype}}() + {{name}}Vector idx = {{name}}Vector() + ndarray[{{c_type}}, ndim=1] arr + {{name}}VectorData *ud = idx.data + + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + + with nogil: + for i in range(n): + kh_put_{{ttype}}(table, labels[i], &ret) + if ret != 0: + if needs_resize(ud): + with gil: + idx.resize() + append_data_{{ttype}}(ud, i) + + kh_destroy_{{ttype}}(table) + + arr = idx.to_array() + arr = arr[np.asarray(labels)[arr].argsort()] + + return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr + +{{endfor}} diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ccb51a0ea2132..a8348b0c5773f 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -261,8 +261,7 @@ def decons_obs_group_ids( out = decons_group_index(obs_ids, shape) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] - # TODO: unique_label_indices only used here, should take ndarray[np.intp] - indexer = unique_label_indices(ensure_int64(comp_ids)) + indexer = unique_label_indices(comp_ids) return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 5ff20051da8c0..8b7304a84c27b 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -44,6 +44,7 @@ def get_allocated_khash_memory(): (ht.UInt16HashTable, np.uint16), (ht.Int8HashTable, np.int8), (ht.UInt8HashTable, np.uint8), + (ht.IntpHashTable, np.intp), ], ) class TestHashTable: @@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix): (np.uint16, "uint16"), (np.int8, "int8"), (np.uint8, "uint8"), + (np.intp, "intp"), ], ) class TestHelpFunctions: @@ -471,6 +473,14 @@ def test_modes_with_nans(): assert np.isnan(modes[0]) +def test_unique_label_indices_intp(writable): + keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp) + keys.flags.writeable = writable + result = ht.unique_label_indices(keys) + expected = np.array([0, 1, 5], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( "dtype, type_suffix", [ diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9a9cd9fa4baaa..4a0d6f2cccc32 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1741,7 +1741,7 @@ def test_quantile(): def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64") + a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp) left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] From 5fe8d7df40f90ab9869a2e37472fa38a7f66419a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Oct 2021 05:55:37 -0700 Subject: [PATCH 25/41] ENH: implement Index.__array_ufunc__ (#43904) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arraylike.py | 4 ++-- pandas/core/arrays/base.py | 5 +++++ pandas/core/arrays/datetimelike.py | 4 ++-- pandas/core/indexes/base.py | 20 ++++++++++++++++++++ pandas/core/indexes/datetimelike.py | 9 --------- pandas/tests/arithmetic/test_datetime64.py | 15 ++++++++++++++- 7 files changed, 44 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 22b49c35e0e68..722d0dcc10041 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -390,6 +390,7 @@ Datetimelike - Bug in 
:func:`to_datetime` with ``format`` and ``pandas.NA`` was raising ``ValueError`` (:issue:`42957`) - :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`) - Bug in :meth:`date_range` and :meth:`bdate_range` do not return right bound when ``start`` = ``end`` and set is closed on one side (:issue:`43394`) +- Bug in inplace addition and subtraction of :class:`DatetimeIndex` or :class:`TimedeltaIndex` with :class:`DatetimeArray` or :class:`TimedeltaArray` (:issue:`43904`) - Timedelta diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 3d209189d97d8..fe09a044566f8 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -357,7 +357,7 @@ def reconstruct(result): return result if "out" in kwargs: - result = _dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs) + result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs) return reconstruct(result) # We still get here with kwargs `axis` for e.g. np.maximum.accumulate @@ -410,7 +410,7 @@ def _standardize_out_kwarg(**kwargs) -> dict: return kwargs -def _dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): +def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): """ If we have an `out` keyword, then call the ufunc without `out` and then set the result into the given `out`. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b17f309e5f9fb..46b0a6873986e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1379,6 +1379,11 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if result is not NotImplemented: return result + if "out" in kwargs: + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2c9796e826825..1f42463cb9f2d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1414,7 +1414,7 @@ def __iadd__(self, other): if not is_period_dtype(self.dtype): # restore freq, which is invalidated by setitem - self._freq = result._freq + self._freq = result.freq return self def __isub__(self, other): @@ -1423,7 +1423,7 @@ def __isub__(self, other): if not is_period_dtype(self.dtype): # restore freq, which is invalidated by setitem - self._freq = result._freq + self._freq = result.freq return self # -------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2b49a88e27961..da953fe46ef1d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -102,6 +102,7 @@ PeriodDtype, ) from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDatetimeIndex, ABCMultiIndex, ABCPeriodIndex, @@ -116,6 +117,7 @@ ) from pandas.core import ( + arraylike, missing, ops, ) @@ -844,6 +846,24 @@ def __array__(self, dtype=None) -> np.ndarray: """ return np.asarray(self._data, dtype=dtype) + def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): + if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): + return NotImplemented + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result 
is not NotImplemented: + return result + + new_inputs = [x if x is not self else x._values for x in inputs] + result = getattr(ufunc, method)(*new_inputs, **kwargs) + if ufunc.nout == 2: + # i.e. np.divmod, np.modf, np.frexp + return tuple(self.__array_wrap__(x) for x in result) + + return self.__array_wrap__(result) + def __array_wrap__(self, result, context=None): """ Gets called after a ufunc and other functions. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 063bb4aafeb75..48171bdef24fd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -672,15 +672,6 @@ def insert(self, loc: int, item): # -------------------------------------------------------------------- # NDArray-Like Methods - def __array_wrap__(self, result, context=None): - """ - Gets called after a ufunc and other functions. - """ - out = super().__array_wrap__(result, context=context) - if isinstance(out, DatetimeTimedeltaMixin) and self.freq is not None: - out = out._with_freq("infer") - return out - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take((), kwargs) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 60a58b7bbea78..0d3f7dcaaf65b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -2163,6 +2163,15 @@ def test_dti_isub_tdi(self, tz_naive_fixture): result -= tdi tm.assert_index_equal(result, expected) + # DTA.__isub__ GH#43904 + dta = dti._data.copy() + dta -= tdi + tm.assert_datetime_array_equal(dta, expected._data) + + out = dti._data.copy() + np.subtract(out, tdi, out=out) + tm.assert_datetime_array_equal(out, expected._data) + msg = "cannot subtract .* from a TimedeltaArray" with pytest.raises(TypeError, match=msg): tdi -= dti @@ -2172,10 +2181,14 @@ def test_dti_isub_tdi(self, tz_naive_fixture): result -= tdi.values tm.assert_index_equal(result, expected) - msg = "cannot subtract a datelike from a TimedeltaArray" + msg = "cannot subtract DatetimeArray from ndarray" with pytest.raises(TypeError, match=msg): tdi.values -= dti + msg = "cannot subtract a datelike from a TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi._values -= dti + # ------------------------------------------------------------- # Binary Operations DatetimeIndex and datetime-like # TODO: A couple other tests belong in this section. 
Move them in From a49977c7097e244832d808dbde7b069edad2cf82 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Oct 2021 05:57:06 -0700 Subject: [PATCH 26/41] TST/REF: share/split index tests (#43905) --- pandas/tests/indexes/datetimelike.py | 12 ++++++++ .../tests/indexes/datetimes/test_datetime.py | 12 -------- .../tests/indexes/datetimes/test_indexing.py | 18 ++++++----- pandas/tests/indexes/datetimes/test_misc.py | 9 ++++++ pandas/tests/indexes/period/test_period.py | 10 ------- .../tests/indexes/timedeltas/test_pickle.py | 11 +++++++ .../indexes/timedeltas/test_timedelta.py | 30 ++++--------------- 7 files changed, 48 insertions(+), 54 deletions(-) create mode 100644 pandas/tests/indexes/timedeltas/test_pickle.py diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 70156092eeabe..ecdbf01fd41c1 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -9,6 +9,18 @@ class DatetimeLike(Base): + def test_isin(self, simple_index): + index = simple_index[:4] + result = index.isin(index) + assert result.all() + + result = index.isin(list(index)) + assert result.all() + + result = index.isin([index[2], 5]) + expected = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + def test_argsort_matches_array(self, simple_index): idx = simple_index idx = idx.insert(1, pd.NaT) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 17b80fbc0afc2..b220ce486f80b 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -154,18 +154,6 @@ def test_groupby_function_tuple_1677(self): result = monthly_group.mean() assert isinstance(result.index[0], tuple) - def test_isin(self): - index = tm.makeDateIndex(4) - result = index.isin(index) - assert result.all() - - result = index.isin(list(index)) - assert result.all() - - tm.assert_almost_equal( - index.isin([index[2], 5]), np.array([False, False, True, False]) - ) - def assert_index_parameters(self, index): assert index.freq == "40960N" assert index.inferred_freq == "40960N" diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 6eaf799ae2779..4ad85f7d4e30f 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -485,19 +485,23 @@ def test_get_loc(self): with pytest.raises(InvalidIndexError, match=r"slice\(None, 2, None\)"): idx.get_loc(slice(2)) - idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) + idx = DatetimeIndex(["2000-01-01", "2000-01-04"]) assert idx.get_loc("2000-01-02", method="nearest") == 0 assert idx.get_loc("2000-01-03", method="nearest") == 1 assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) + def test_get_loc_time_obj(self): # time indexing idx = date_range("2000-01-01", periods=24, freq="H") - tm.assert_numpy_array_equal( - idx.get_loc(time(12)), np.array([12]), check_dtype=False - ) - tm.assert_numpy_array_equal( - idx.get_loc(time(12, 30)), np.array([]), check_dtype=False - ) + + result = idx.get_loc(time(12)) + expected = np.array([12]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + result = idx.get_loc(time(12, 30)) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + msg = "cannot yet lookup inexact labels when key is a time object" with pytest.raises(NotImplementedError, match=msg): with 
tm.assert_produces_warning(FutureWarning, match="deprecated"): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 408ed2db316ca..647f7739b482a 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -156,6 +156,15 @@ def test_range_edges9(self): class TestDatetime64: + def test_no_millisecond_field(self): + msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex.millisecond + + msg = "'DatetimeIndex' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex([]).millisecond + def test_datetimeindex_accessors(self): dti_naive = date_range(freq="D", start=datetime(1998, 1, 1), periods=365) # GH#13303 diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 83c82c18f3d1e..e0f794a188ba3 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -4,7 +4,6 @@ from pandas._libs.tslibs.period import IncompatibleFrequency from pandas import ( - DatetimeIndex, Index, NaT, Period, @@ -49,15 +48,6 @@ def test_where(self): # This is handled in test_indexing pass - def test_no_millisecond_field(self): - msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" - with pytest.raises(AttributeError, match=msg): - DatetimeIndex.millisecond - - msg = "'DatetimeIndex' object has no attribute 'millisecond'" - with pytest.raises(AttributeError, match=msg): - DatetimeIndex([]).millisecond - def test_make_time_series(self): index = period_range(freq="A", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) diff --git a/pandas/tests/indexes/timedeltas/test_pickle.py b/pandas/tests/indexes/timedeltas/test_pickle.py new file mode 100644 index 0000000000000..befe709728bdd --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_pickle.py @@ -0,0 +1,11 @@ +from pandas import timedelta_range +import pandas._testing as tm + + +class TestPickle: + def test_pickle_after_set_freq(self): + tdi = timedelta_range("1 day", periods=4, freq="s") + tdi = tdi._with_freq(None) + + res = tm.round_trip_pickle(tdi) + tm.assert_index_equal(res, tdi) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 33f0565c0b23b..952036428d3c9 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -7,10 +7,10 @@ from pandas import ( Index, Int64Index, + NaT, Series, Timedelta, TimedeltaIndex, - date_range, timedelta_range, ) import pandas._testing as tm @@ -42,26 +42,6 @@ def test_numeric_compat(self): def test_shift(self): pass # this is handled in test_arithmetic.py - def test_pickle_after_set_freq(self): - tdi = timedelta_range("1 day", periods=4, freq="s") - tdi = tdi._with_freq(None) - - res = tm.round_trip_pickle(tdi) - tm.assert_index_equal(res, tdi) - - def test_isin(self): - - index = tm.makeTimedeltaIndex(4) - result = index.isin(index) - assert result.all() - - result = index.isin(list(index)) - assert result.all() - - tm.assert_almost_equal( - index.isin([index[2], 5]), np.array([False, False, True, False]) - ) - def test_misc_coverage(self): rng = timedelta_range("1 day", periods=5) @@ -140,11 +120,11 @@ def test_freq_conversion(self): # doc example # series - td = Series(date_range("20130101", periods=4)) - Series( - date_range("20121201", periods=4) + 
scalar = Timedelta(days=31) + td = Series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", ) - td[2] += timedelta(minutes=5, seconds=3) - td[3] = np.nan result = td / np.timedelta64(1, "D") expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) From 47791717a25654355235fe85cd19bdbf158591c1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Oct 2021 05:57:46 -0700 Subject: [PATCH 27/41] TST/REF: misplaced Index.putmask tests (#43906) --- pandas/tests/indexes/interval/test_base.py | 27 ---------------- .../tests/indexes/interval/test_indexing.py | 26 ++++++++++++++++ pandas/tests/indexes/multi/test_indexing.py | 31 +++++++++++++------ pandas/tests/indexes/multi/test_putmask.py | 17 ---------- pandas/tests/indexes/numeric/test_indexing.py | 2 +- pandas/tests/indexes/ranges/test_indexing.py | 14 +++++++++ pandas/tests/indexes/ranges/test_setops.py | 14 --------- 7 files changed, 63 insertions(+), 68 deletions(-) delete mode 100644 pandas/tests/indexes/multi/test_putmask.py diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 3589fe726b3bb..aa88bca2faec9 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -4,7 +4,6 @@ from pandas import ( IntervalIndex, Series, - date_range, ) import pandas._testing as tm from pandas.tests.indexes.common import Base @@ -66,29 +65,3 @@ def test_getitem_2d_deprecated(self, simple_index): with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): with tm.assert_produces_warning(FutureWarning): idx[:, None] - - -class TestPutmask: - @pytest.mark.parametrize("tz", ["US/Pacific", None]) - def test_putmask_dt64(self, tz): - # GH#37968 - dti = date_range("2016-01-01", periods=9, tz=tz) - idx = IntervalIndex.from_breaks(dti) - mask = np.zeros(idx.shape, dtype=bool) - mask[0:3] = True - - result = idx.putmask(mask, idx[-1]) - expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) - tm.assert_index_equal(result, expected) - - def test_putmask_td64(self): - # GH#37968 - dti = date_range("2016-01-01", periods=9) - tdi = dti - dti[0] - idx = IntervalIndex.from_breaks(tdi) - mask = np.zeros(idx.shape, dtype=bool) - mask[0:3] = True - - result = idx.putmask(mask, idx[-1]) - expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index aa3359d775c5a..8df8eef69e9c9 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -497,3 +497,29 @@ def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): ), ): index.slice_locs(start, stop) + + +class TestPutmask: + @pytest.mark.parametrize("tz", ["US/Pacific", None]) + def test_putmask_dt64(self, tz): + # GH#37968 + dti = date_range("2016-01-01", periods=9, tz=tz) + idx = IntervalIndex.from_breaks(dti) + mask = np.zeros(idx.shape, dtype=bool) + mask[0:3] = True + + result = idx.putmask(mask, idx[-1]) + expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) + tm.assert_index_equal(result, expected) + + def test_putmask_td64(self): + # GH#37968 + dti = date_range("2016-01-01", periods=9) + tdi = dti - dti[0] + idx = IntervalIndex.from_breaks(tdi) + mask = np.zeros(idx.shape, dtype=bool) + mask[0:3] = True + + result = idx.putmask(mask, idx[-1]) + expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) + 
tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index e142cbf89f1bd..405b41c829a2f 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -136,18 +136,31 @@ def test_slice_locs_with_missing_value( assert result == expected -def test_putmask_with_wrong_mask(idx): - # GH18368 +class TestPutmask: + def test_putmask_with_wrong_mask(self, idx): + # GH18368 - msg = "putmask: mask and data must be the same size" - with pytest.raises(ValueError, match=msg): - idx.putmask(np.ones(len(idx) + 1, np.bool_), 1) + msg = "putmask: mask and data must be the same size" + with pytest.raises(ValueError, match=msg): + idx.putmask(np.ones(len(idx) + 1, np.bool_), 1) + + with pytest.raises(ValueError, match=msg): + idx.putmask(np.ones(len(idx) - 1, np.bool_), 1) + + with pytest.raises(ValueError, match=msg): + idx.putmask("foo", 1) + + def test_putmask_multiindex_other(self): + # GH#43212 `value` is also a MultiIndex + + left = MultiIndex.from_tuples([(np.nan, 6), (np.nan, 6), ("a", 4)]) + right = MultiIndex.from_tuples([("a", 1), ("a", 1), ("d", 1)]) + mask = np.array([True, True, False]) - with pytest.raises(ValueError, match=msg): - idx.putmask(np.ones(len(idx) - 1, np.bool_), 1) + result = left.putmask(mask, right) - with pytest.raises(ValueError, match=msg): - idx.putmask("foo", 1) + expected = MultiIndex.from_tuples([right[0], right[1], left[2]]) + tm.assert_index_equal(result, expected) class TestGetIndexer: diff --git a/pandas/tests/indexes/multi/test_putmask.py b/pandas/tests/indexes/multi/test_putmask.py deleted file mode 100644 index 2a24be9003302..0000000000000 --- a/pandas/tests/indexes/multi/test_putmask.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np - -from pandas import MultiIndex -import pandas._testing as tm - - -def test_putmask_multiindex_other(): - # GH#43212 `value` is also a MultiIndex - - left = MultiIndex.from_tuples([(np.nan, 6), (np.nan, 6), ("a", 4)]) - right = MultiIndex.from_tuples([("a", 1), ("a", 1), ("d", 1)]) - mask = np.array([True, True, False]) - - result = left.putmask(mask, right) - - expected = MultiIndex.from_tuples([right[0], right[1], left[2]]) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 4621cbcb9d462..be05d5d8a9cae 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -406,7 +406,7 @@ def test_where(self, klass, index): result = index.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_uin64(self): + def test_where_uint64(self): idx = UInt64Index([0, 6, 2]) mask = np.array([False, True, False]) other = np.array([1], dtype=np.int64) diff --git a/pandas/tests/indexes/ranges/test_indexing.py b/pandas/tests/indexes/ranges/test_indexing.py index b46354939f3c5..f8c3eff0ab80a 100644 --- a/pandas/tests/indexes/ranges/test_indexing.py +++ b/pandas/tests/indexes/ranges/test_indexing.py @@ -77,3 +77,17 @@ def test_take_fill_value(self): msg = "index -5 is out of bounds for (axis 0 with )?size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) + + +class TestWhere: + def test_where_putmask_range_cast(self): + # GH#43240 + idx = RangeIndex(0, 5, name="test") + + mask = np.array([True, True, False, False, False]) + result = idx.putmask(mask, 10) + expected = Int64Index([10, 10, 2, 3, 4], name="test") + 
tm.assert_index_equal(result, expected)
+
+        result = idx.where(~mask, 10)
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py
index 210bcd300b1b0..ba938f82e9d89 100644
--- a/pandas/tests/indexes/ranges/test_setops.py
+++ b/pandas/tests/indexes/ranges/test_setops.py
@@ -354,17 +354,3 @@ def test_symmetric_difference(self):
         result = left.symmetric_difference(right[1:])
         expected = Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14])
         tm.assert_index_equal(result, expected)
-
-    def test_putmask_range_cast(self):
-        # GH#43240
-        idx = RangeIndex(0, 5, name="test")
-        result = idx.putmask(np.array([True, True, False, False, False]), 10)
-        expected = Index([10, 10, 2, 3, 4], name="test")
-        tm.assert_index_equal(result, expected)
-
-    def test_where_range_cast(self):
-        # GH#43240
-        idx = RangeIndex(0, 5, name="test")
-        result = idx.where(np.array([False, False, True, True, True]), 10)
-        expected = Index([10, 10, 2, 3, 4], name="test")
-        tm.assert_index_equal(result, expected)
From d4ae657dfe22cde2825eb595d71db9436beb8635 Mon Sep 17 00:00:00 2001
From: Jernej Makovsek
Date: Thu, 7 Oct 2021 14:58:24 +0200
Subject: [PATCH 28/41] Add clarifications to the docs regarding `to_feather` (#43866)

---
 pandas/core/frame.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2eb66c7db0ba6..4db99c5d7f074 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2565,6 +2565,13 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
             `compression_level`, `chunksize` and `version` keywords.

             .. versionadded:: 1.1.0
+
+        Notes
+        -----
+        This function writes the dataframe as a `feather file
+        <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
+        index. For saving the DataFrame with your custom index, use a method that
+        supports custom indices, e.g. ``to_parquet``.
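+
+        For example, ``df.reset_index().to_feather(path)`` first moves the
+        index into an ordinary column, leaving a default index that feather
+        can store.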
""" from pandas.io.feather_format import to_feather From bde9b111a0ba9def1fb64064de898cfde4682255 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Oct 2021 15:02:09 -0700 Subject: [PATCH 29/41] TST/REF: collect/de-dup index tests (#43914) --- pandas/tests/frame/methods/test_reindex.py | 32 +++++ .../tests/indexes/categorical/test_reindex.py | 42 ------ .../indexes/datetimes/test_date_range.py | 37 ++++- pandas/tests/indexes/datetimes/test_misc.py | 136 ------------------ .../indexes/period/test_partial_slicing.py | 26 ++++ pandas/tests/indexes/period/test_period.py | 25 ---- .../indexes/timedeltas/test_timedelta.py | 41 ++---- pandas/tests/series/methods/test_reindex.py | 8 ++ 8 files changed, 115 insertions(+), 232 deletions(-) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index c6b19547904ec..bee8025275b42 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -1078,3 +1078,35 @@ def test_reindex_datetimelike_to_object(self, dtype): assert res.iloc[-1, 0] is fv assert res.iloc[-1, 1] is fv tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize( + "index_df,index_res,index_exp", + [ + ( + CategoricalIndex([], categories=["A"]), + Index(["A"]), + Index(["A"]), + ), + ( + CategoricalIndex([], categories=["A"]), + Index(["B"]), + Index(["B"]), + ), + ( + CategoricalIndex([], categories=["A"]), + CategoricalIndex(["A"]), + CategoricalIndex(["A"]), + ), + ( + CategoricalIndex([], categories=["A"]), + CategoricalIndex(["B"]), + CategoricalIndex(["B"]), + ), + ], + ) + def test_reindex_not_category(self, index_df, index_res, index_exp): + # GH#28690 + df = DataFrame(index=index_df) + result = df.reindex(index=index_res) + expected = DataFrame(index=index_exp) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py index 0b81d4f88eaf8..72130ef9e4627 100644 --- a/pandas/tests/indexes/categorical/test_reindex.py +++ b/pandas/tests/indexes/categorical/test_reindex.py @@ -1,13 +1,10 @@ import numpy as np -import pytest from pandas import ( Categorical, CategoricalIndex, - DataFrame, Index, Interval, - Series, ) import pandas._testing as tm @@ -66,45 +63,6 @@ def test_reindex_empty_index(self): tm.assert_index_equal(res, Index(["a", "b"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) - def test_reindex_missing_category(self): - # GH: 18185 - ser = Series([1, 2, 3, 1], dtype="category") - msg = r"Cannot setitem on a Categorical with a new category \(-1\)" - with pytest.raises(TypeError, match=msg): - ser.reindex([1, 2, 3, 4, 5], fill_value=-1) - - @pytest.mark.parametrize( - "index_df,index_res,index_exp", - [ - ( - CategoricalIndex([], categories=["A"]), - Index(["A"]), - Index(["A"]), - ), - ( - CategoricalIndex([], categories=["A"]), - Index(["B"]), - Index(["B"]), - ), - ( - CategoricalIndex([], categories=["A"]), - CategoricalIndex(["A"]), - CategoricalIndex(["A"]), - ), - ( - CategoricalIndex([], categories=["A"]), - CategoricalIndex(["B"]), - CategoricalIndex(["B"]), - ), - ], - ) - def test_reindex_not_category(self, index_df, index_res, index_exp): - # GH: 28690 - df = DataFrame(index=index_df) - result = df.reindex(index=index_res) - expected = DataFrame(index=index_exp) - tm.assert_frame_equal(result, expected) - def test_reindex_categorical_added_category(self): # GH 42424 ci = CategoricalIndex( diff --git 
a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 7559d7ce645e0..80c86e0103436 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -121,6 +121,41 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges: + @pytest.mark.parametrize("freq", ["N", "U", "L", "T", "S", "H", "D"]) + def test_date_range_edges(self, freq): + # GH#13672 + td = Timedelta(f"1{freq}") + ts = Timestamp("1970-01-01") + + idx = date_range( + start=ts + td, + end=ts + 4 * td, + freq=freq, + ) + exp = DatetimeIndex( + [ts + n * td for n in range(1, 5)], + freq=freq, + ) + tm.assert_index_equal(idx, exp) + + # start after end + idx = date_range( + start=ts + 4 * td, + end=ts + td, + freq=freq, + ) + exp = DatetimeIndex([], freq=freq) + tm.assert_index_equal(idx, exp) + + # start matches end + idx = date_range( + start=ts + td, + end=ts + td, + freq=freq, + ) + exp = DatetimeIndex([ts + td], freq=freq) + tm.assert_index_equal(idx, exp) + def test_date_range_near_implementation_bound(self): # GH#??? freq = Timedelta(1) @@ -717,7 +752,7 @@ def test_timezone_comparaison_bug(self): result = date_range(start, periods=2, tz="US/Eastern") assert len(result) == 2 - def test_timezone_comparaison_assert(self): + def test_timezone_comparison_assert(self): start = Timestamp("20130220 10:00", tz="US/Eastern") msg = "Inferred time zone not equal to passed time zone" with pytest.raises(AssertionError, match=msg): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 647f7739b482a..f0757d0ba555e 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -19,142 +19,6 @@ from pandas.core.arrays import DatetimeArray -class TestTimeSeries: - def test_range_edges(self): - # GH#13672 - idx = date_range( - start=Timestamp("1970-01-01 00:00:00.000000001"), - end=Timestamp("1970-01-01 00:00:00.000000004"), - freq="N", - ) - exp = DatetimeIndex( - [ - "1970-01-01 00:00:00.000000001", - "1970-01-01 00:00:00.000000002", - "1970-01-01 00:00:00.000000003", - "1970-01-01 00:00:00.000000004", - ], - freq="N", - ) - tm.assert_index_equal(idx, exp) - - def test_range_edges2(self): - - idx = date_range( - start=Timestamp("1970-01-01 00:00:00.000000004"), - end=Timestamp("1970-01-01 00:00:00.000000001"), - freq="N", - ) - exp = DatetimeIndex([], freq="N") - tm.assert_index_equal(idx, exp) - - def test_range_edges3(self): - - idx = date_range( - start=Timestamp("1970-01-01 00:00:00.000000001"), - end=Timestamp("1970-01-01 00:00:00.000000001"), - freq="N", - ) - exp = DatetimeIndex(["1970-01-01 00:00:00.000000001"], freq="N") - tm.assert_index_equal(idx, exp) - - def test_range_edges4(self): - - idx = date_range( - start=Timestamp("1970-01-01 00:00:00.000001"), - end=Timestamp("1970-01-01 00:00:00.000004"), - freq="U", - ) - exp = DatetimeIndex( - [ - "1970-01-01 00:00:00.000001", - "1970-01-01 00:00:00.000002", - "1970-01-01 00:00:00.000003", - "1970-01-01 00:00:00.000004", - ], - freq="U", - ) - tm.assert_index_equal(idx, exp) - - def test_range_edges5(self): - - idx = date_range( - start=Timestamp("1970-01-01 00:00:00.001"), - end=Timestamp("1970-01-01 00:00:00.004"), - freq="L", - ) - exp = DatetimeIndex( - [ - "1970-01-01 00:00:00.001", - "1970-01-01 00:00:00.002", - "1970-01-01 00:00:00.003", - "1970-01-01 00:00:00.004", - ], - freq="L", - ) - tm.assert_index_equal(idx, exp) - - def 
test_range_edges6(self): - idx = date_range( - start=Timestamp("1970-01-01 00:00:01"), - end=Timestamp("1970-01-01 00:00:04"), - freq="S", - ) - exp = DatetimeIndex( - [ - "1970-01-01 00:00:01", - "1970-01-01 00:00:02", - "1970-01-01 00:00:03", - "1970-01-01 00:00:04", - ], - freq="S", - ) - tm.assert_index_equal(idx, exp) - - def test_range_edges7(self): - idx = date_range( - start=Timestamp("1970-01-01 00:01"), - end=Timestamp("1970-01-01 00:04"), - freq="T", - ) - exp = DatetimeIndex( - [ - "1970-01-01 00:01", - "1970-01-01 00:02", - "1970-01-01 00:03", - "1970-01-01 00:04", - ], - freq="T", - ) - tm.assert_index_equal(idx, exp) - - def test_range_edges8(self): - idx = date_range( - start=Timestamp("1970-01-01 01:00"), - end=Timestamp("1970-01-01 04:00"), - freq="H", - ) - exp = DatetimeIndex( - [ - "1970-01-01 01:00", - "1970-01-01 02:00", - "1970-01-01 03:00", - "1970-01-01 04:00", - ], - freq="H", - ) - tm.assert_index_equal(idx, exp) - - def test_range_edges9(self): - idx = date_range( - start=Timestamp("1970-01-01"), end=Timestamp("1970-01-04"), freq="D" - ) - exp = DatetimeIndex( - ["1970-01-01", "1970-01-02", "1970-01-03", "1970-01-04"], freq="D" - ) - tm.assert_index_equal(idx, exp) - - class TestDatetime64: def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 148999d90d554..c565902d080c3 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, + PeriodIndex, Series, date_range, period_range, @@ -11,6 +12,31 @@ class TestPeriodIndex: + def test_getitem_periodindex_duplicates_string_slice(self): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts["2007"] + expected = ts[1:3] + tm.assert_series_equal(result, expected) + result[:] = 1 + assert (ts[1:3] == 1).all() + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts["2007"] + expected = ts[idx == "2007"] + tm.assert_series_equal(result, expected) + + def test_getitem_periodindex_quarter_string(self): + pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") + ser = Series(np.random.rand(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! 
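+        # ("05Q4" is year-first notation for 2005Q4, the same period as
+        # pi[2] == Period("4Q05", freq="Q"))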
+ assert ser["05Q4"] == ser[2] + def test_pindex_slice_index(self): pi = period_range(start="1/1/10", end="12/31/12", freq="M") s = Series(np.random.rand(len(pi)), index=pi) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e0f794a188ba3..e6c31d22e626f 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -245,25 +245,6 @@ def test_is_(self): assert not index.is_(index - 2) assert not index.is_(index - 0) - def test_index_duplicate_periods(self): - # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts["2007"] - expected = ts[1:3] - tm.assert_series_equal(result, expected) - result[:] = 1 - assert (ts[1:3] == 1).all() - - # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") - ts = Series(np.random.randn(len(idx)), index=idx) - - result = ts["2007"] - expected = ts[idx == "2007"] - tm.assert_series_equal(result, expected) - def test_index_unique(self): idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") @@ -292,12 +273,6 @@ def test_pindex_fieldaccessor_nat(self): exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name="name") tm.assert_index_equal(idx.month, exp) - def test_pindex_qaccess(self): - pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") - s = Series(np.random.rand(len(pi)), index=pi).cumsum() - # Todo: fix these accessors! - assert s["05Q4"] == s[2] - def test_pindex_multiples(self): expected = PeriodIndex( ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 952036428d3c9..9672929ecc06b 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -115,46 +115,31 @@ def test_freq_conversion_always_floating(self): res = tdi.to_series().astype("m8[s]") tm.assert_numpy_array_equal(res._values, expected._values) - def test_freq_conversion(self): + def test_freq_conversion(self, index_or_series): # doc example - # series scalar = Timedelta(days=31) - td = Series( + td = index_or_series( [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], dtype="m8[ns]", ) result = td / np.timedelta64(1, "D") - expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) - tm.assert_series_equal(result, expected) - - result = td.astype("timedelta64[D]") - expected = Series([31, 31, 31, np.nan]) - tm.assert_series_equal(result, expected) - - result = td / np.timedelta64(1, "s") - expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) - tm.assert_series_equal(result, expected) - - result = td.astype("timedelta64[s]") - tm.assert_series_equal(result, expected) - - # tdi - td = TimedeltaIndex(td) - - result = td / np.timedelta64(1, "D") - expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) - tm.assert_index_equal(result, expected) + expected = index_or_series( + [31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan] + ) + tm.assert_equal(result, expected) result = td.astype("timedelta64[D]") - expected = Index([31, 31, 31, np.nan]) - tm.assert_index_equal(result, expected) + expected = index_or_series([31, 31, 31, np.nan]) + tm.assert_equal(result, expected) result = td / np.timedelta64(1, "s") - expected = Index([31 * 86400, 31 * 86400, 31 
* 86400 + 5 * 60 + 3, np.nan]) - tm.assert_index_equal(result, expected) + expected = index_or_series( + [31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan] + ) + tm.assert_equal(result, expected) result = td.astype("timedelta64[s]") - tm.assert_index_equal(result, expected) + tm.assert_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 36d3971d10a3d..be9f96c8b509a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -359,3 +359,11 @@ def test_reindex_empty_with_level(values): index=MultiIndex(levels=[["b"], values[1]], codes=[[], []]), dtype="object" ) tm.assert_series_equal(result, expected) + + +def test_reindex_missing_category(): + # GH#18185 + ser = Series([1, 2, 3, 1], dtype="category") + msg = r"Cannot setitem on a Categorical with a new category \(-1\)" + with pytest.raises(TypeError, match=msg): + ser.reindex([1, 2, 3, 4, 5], fill_value=-1) From ecab3a24dc08848ecca0ca48578d919a41935e94 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Oct 2021 16:54:28 -0700 Subject: [PATCH 30/41] BENCH: indexing_engines (#43916) --- asv_bench/benchmarks/indexing_engines.py | 46 ++++++++++++++++++------ 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 0cbc300ee2fc4..60e07a9d1469c 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -35,25 +35,49 @@ class NumericEngineIndexing: params = [ _get_numeric_engines(), ["monotonic_incr", "monotonic_decr", "non_monotonic"], + [True, False], + [10 ** 5, 2 * 10 ** 6], # 2e6 is above SIZE_CUTOFF ] - param_names = ["engine_and_dtype", "index_type"] + param_names = ["engine_and_dtype", "index_type", "unique", "N"] - def setup(self, engine_and_dtype, index_type): + def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype - N = 10 ** 5 - values = list([1] * N + [2] * N + [3] * N) - arr = { - "monotonic_incr": np.array(values, dtype=dtype), - "monotonic_decr": np.array(list(reversed(values)), dtype=dtype), - "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype), - }[index_type] + + if index_type == "monotonic_incr": + if unique: + arr = np.arange(N * 3, dtype=dtype) + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype) + elif index_type == "monotonic_decr": + if unique: + arr = np.arange(N * 3, dtype=dtype)[::-1] + else: + values = list([1] * N + [2] * N + [3] * N) + arr = np.array(values, dtype=dtype)[::-1] + else: + assert index_type == "non_monotonic" + if unique: + arr = np.empty(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) + else: + arr = np.array([1, 2, 3] * N, dtype=dtype) self.data = engine(arr) # code belows avoids populating the mapping etc. while timing. 
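        # (the first get_loc call is what builds the hash table and related
        # metadata, so doing it once here keeps that one-time cost out of the
        # timed runs below)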
From adef17cfd4dc1614d818fe6214528de1cd463035 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 8 Oct 2021 05:31:13 -0700
Subject: [PATCH 31/41] TST: avoid re-running tests 14 times (#43922)

---
 pandas/tests/indexes/common.py      | 32 ----------------------------
 pandas/tests/indexes/test_common.py | 33 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index a8684ca4d3c25..50097ae3787b3 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -525,38 +525,6 @@ def test_format_empty(self):
         assert empty_idx.format() == []
         assert empty_idx.format(name=True) == [""]
 
-    def test_hasnans_isnans(self, index_flat):
-        # GH 11343, added tests for hasnans / isnans
-        index = index_flat
-
-        # cases in indices doesn't include NaN
-        idx = index.copy(deep=True)
-        expected = np.array([False] * len(idx), dtype=bool)
-        tm.assert_numpy_array_equal(idx._isnan, expected)
-        assert idx.hasnans is False
-
-        idx = index.copy(deep=True)
-        values = np.asarray(idx.values)
-
-        if len(index) == 0:
-            return
-        elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
-            return
-        elif isinstance(index, DatetimeIndexOpsMixin):
-            values[1] = iNaT
-        else:
-            values[1] = np.nan
-
-        if isinstance(index, PeriodIndex):
-            idx = type(index)(values, freq=index.freq)
-        else:
-            idx = type(index)(values)
-
-        expected = np.array([False] * len(idx), dtype=bool)
-        expected[1] = True
-        tm.assert_numpy_array_equal(idx._isnan, expected)
-        assert idx.hasnans is True
-
     def test_fillna(self, index):
         # GH 11343
         if len(index) == 0:
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 33aa8bbb942d5..604b68cfcc791 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -12,6 +12,7 @@
 from pandas.compat import IS64
 
 from pandas.core.dtypes.common import (
+    is_integer_dtype,
     is_period_dtype,
     needs_i8_conversion,
 )
@@ -366,6 +367,38 @@ def test_asi8_deprecation(self, index):
         with tm.assert_produces_warning(warn):
             index.asi8
 
+    def test_hasnans_isnans(self, index_flat):
+        # GH#11343, added tests for hasnans / isnans
+        index = index_flat
+
+        # cases in indices doesn't include NaN
+        idx = index.copy(deep=True)
+        expected = np.array([False] * len(idx), dtype=bool)
+        tm.assert_numpy_array_equal(idx._isnan, expected)
+        assert idx.hasnans is False
+
+        idx = index.copy(deep=True)
+        values = np.asarray(idx.values)
+
+        if len(index) == 0:
+            return
+        elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
+            return
+        elif needs_i8_conversion(index.dtype):
+            values[1] = iNaT
+        else:
+            values[1] = np.nan
+
+        if isinstance(index, PeriodIndex):
+            idx = type(index)(values, freq=index.freq)
+        else:
+            idx = type(index)(values)
+
+        expected = np.array([False] * len(idx), dtype=bool)
+        expected[1] = True
+        tm.assert_numpy_array_equal(idx._isnan, expected)
+        assert idx.hasnans is True
+
 
 @pytest.mark.parametrize("na_position", [None, "middle"])
 def test_sort_values_invalid_na_position(index_with_missing, na_position):

From d5716c7ae53c97afed2741d2485e0102b398cc32 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 8 Oct 2021 05:32:03 -0700
Subject: [PATCH 32/41] CLN: unnecessary warning-catching (#43919)

---
 pandas/core/arrays/datetimes.py                | 18 +++++++-----------
 pandas/core/frame.py                           | 10 +++++-----
 pandas/tests/indexes/numeric/test_astype.py    |  6 +++---
 pandas/tests/indexes/numeric/test_indexing.py  |  8 +++++---
 pandas/tests/indexes/numeric/test_join.py      |  4 ++--
 pandas/tests/indexes/numeric/test_numeric.py   |  8 +++++---
 pandas/tests/indexes/numeric/test_setops.py    |  4 ++--
 .../indexes/period/methods/test_astype.py      |  6 ++++--
 pandas/tests/indexes/ranges/test_join.py       |  2 +-
 pandas/tests/indexes/ranges/test_range.py      |  4 ++--
 pandas/tests/indexes/ranges/test_setops.py     |  4 ++--
 .../tests/indexes/timedeltas/test_timedelta.py |  2 +-
 pandas/tests/indexing/test_partial.py          |  6 +++---
 13 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index d9f9c07a4f645..eb7638df301f7 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -621,17 +621,13 @@ def __iter__(self):
             chunksize = 10000
             chunks = (length // chunksize) + 1
 
-            with warnings.catch_warnings():
-                # filter out warnings about Timestamp.freq
-                warnings.filterwarnings("ignore", category=FutureWarning)
-
-                for i in range(chunks):
-                    start_i = i * chunksize
-                    end_i = min((i + 1) * chunksize, length)
-                    converted = ints_to_pydatetime(
-                        data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp"
-                    )
-                    yield from converted
+            for i in range(chunks):
+                start_i = i * chunksize
+                end_i = min((i + 1) * chunksize, length)
+                converted = ints_to_pydatetime(
+                    data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp"
+                )
+                yield from converted
 
     def astype(self, dtype, copy: bool = True):
         # We handle
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4db99c5d7f074..077ac303b8327 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -9361,17 +9361,17 @@ def round(
         """
         from pandas.core.reshape.concat import concat
 
-        def _dict_round(df, decimals):
+        def _dict_round(df: DataFrame, decimals):
             for col, vals in df.items():
                 try:
                     yield _series_round(vals, decimals[col])
                 except KeyError:
                     yield vals
 
-        def _series_round(s, decimals):
-            if is_integer_dtype(s) or is_float_dtype(s):
-                return s.round(decimals)
-            return s
+        def _series_round(ser: Series, decimals: int):
+            if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
+                return ser.round(decimals)
+            return ser
 
         nv.validate_round(args, kwargs)
diff --git a/pandas/tests/indexes/numeric/test_astype.py b/pandas/tests/indexes/numeric/test_astype.py
index bda66856fb57a..89f26e953400d 100644
--- a/pandas/tests/indexes/numeric/test_astype.py
+++ b/pandas/tests/indexes/numeric/test_astype.py
@@ -5,12 +5,12 @@
 
 from pandas.core.dtypes.common import pandas_dtype
 
-from pandas import (
+from pandas import Index
+import pandas._testing as tm
+from pandas.core.indexes.api import (
     Float64Index,
-    Index,
     Int64Index,
 )
-import pandas._testing as tm
 
 
 class TestAstype:
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index be05d5d8a9cae..cb861aaab80f8 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -2,15 +2,17 @@
 import pytest
 
 from pandas import (
-    Float64Index,
     Index,
-    Int64Index,
     RangeIndex,
     Series,
     Timestamp,
-    UInt64Index,
 )
 import pandas._testing as tm
+from pandas.core.indexes.api import (
+    Float64Index,
+    Int64Index,
+    UInt64Index,
+)
 
 
 @pytest.fixture
diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py
index 43d731f8c3142..2a47289b65aad 100644
--- a/pandas/tests/indexes/numeric/test_join.py
+++ b/pandas/tests/indexes/numeric/test_join.py
@@ -1,12 +1,12 @@
 import numpy as np
 import pytest
 
-from pandas import (
+import pandas._testing as tm
+from pandas.core.indexes.api import (
     Index,
     Int64Index,
     UInt64Index,
 )
-import pandas._testing as tm
 
 
 class TestJoinInt64Index:
diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py
index 6d35568b69fac..ec451ac13ec44 100644
--- a/pandas/tests/indexes/numeric/test_numeric.py
+++ b/pandas/tests/indexes/numeric/test_numeric.py
@@ -5,14 +5,16 @@
 
 import pandas as pd
 from pandas import (
-    Float64Index,
     Index,
-    Int64Index,
     NumericIndex,
     Series,
-    UInt64Index,
 )
 import pandas._testing as tm
+from pandas.core.indexes.api import (
+    Float64Index,
+    Int64Index,
+    UInt64Index,
+)
 from pandas.tests.indexes.common import NumericBase
diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py
index 5a7db9858dbad..4045cc0b91313 100644
--- a/pandas/tests/indexes/numeric/test_setops.py
+++ b/pandas/tests/indexes/numeric/test_setops.py
@@ -6,14 +6,14 @@
 import numpy as np
 import pytest
 
-from pandas import (
+import pandas._testing as tm
+from pandas.core.indexes.api import (
     Float64Index,
     Index,
     Int64Index,
     RangeIndex,
     UInt64Index,
 )
-import pandas._testing as tm
 
 
 @pytest.fixture
diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py
index 74f627478a29c..e2340a2db02f7 100644
--- a/pandas/tests/indexes/period/methods/test_astype.py
+++ b/pandas/tests/indexes/period/methods/test_astype.py
@@ -5,15 +5,17 @@
     CategoricalIndex,
     DatetimeIndex,
     Index,
-    Int64Index,
     NaT,
     Period,
     PeriodIndex,
     Timedelta,
-    UInt64Index,
     period_range,
 )
 import pandas._testing as tm
+from pandas.core.indexes.api import (
+    Int64Index,
+    UInt64Index,
+)
 
 
 class TestPeriodIndexAsType:
diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py
index 6668a7c6a3d02..353605da91f94 100644
--- a/pandas/tests/indexes/ranges/test_join.py
+++ b/pandas/tests/indexes/ranges/test_join.py
@@ -2,10 +2,10 @@
 
 from pandas import (
     Index,
-    Int64Index,
     RangeIndex,
 )
 import pandas._testing as tm
+from pandas.core.indexes.api import Int64Index
 
 
 class TestJoin:
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 1b98f3c8194b5..7dcdb627b9abb 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -4,13 +4,13 @@
 from pandas.core.dtypes.common import ensure_platform_int
 
 import pandas as pd
-from pandas import (
+import pandas._testing as tm
+from pandas.core.indexes.api import (
     Float64Index,
     Index,
     Int64Index,
     RangeIndex,
 )
-import pandas._testing as tm
 from pandas.tests.indexes.common import NumericBase
 
 # aliases to make some tests easier to read
diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py
index ba938f82e9d89..6dc47b7fef5ac 100644
--- a/pandas/tests/indexes/ranges/test_setops.py
+++ b/pandas/tests/indexes/ranges/test_setops.py
@@ -6,13 +6,13 @@
 import numpy as np
 import pytest
 
-from pandas import (
+import pandas._testing as tm
+from pandas.core.indexes.api import (
     Index,
     Int64Index,
     RangeIndex,
     UInt64Index,
 )
-import pandas._testing as tm
 
 
 class TestRangeIndexSetOps:
diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py
index 9672929ecc06b..8ceef8186e4ea 100644
--- a/pandas/tests/indexes/timedeltas/test_timedelta.py
+++ b/pandas/tests/indexes/timedeltas/test_timedelta.py
@@ -6,7 +6,6 @@
 import pandas as pd
 from pandas import (
     Index,
-    Int64Index,
     NaT,
     Series,
     Timedelta,
@@ -14,6 +13,7 @@
     timedelta_range,
 )
 import pandas._testing as tm
+from pandas.core.indexes.api import Int64Index
 from pandas.tests.indexes.datetimelike import DatetimeLike
 
 randn = np.random.randn
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
index 172301a2fde84..7b2713ad274c6 100644
--- a/pandas/tests/indexing/test_partial.py
+++ b/pandas/tests/indexing/test_partial.py
@@ -540,9 +540,9 @@ def test_partial_set_empty_frame_empty_consistencies(self):
             date_range(start="2000", periods=20, freq="D"),
             ["2000-01-04", "2000-01-08", "2000-01-12"],
             [
-                Timestamp("2000-01-04", freq="D"),
-                Timestamp("2000-01-08", freq="D"),
-                Timestamp("2000-01-12", freq="D"),
+                Timestamp("2000-01-04"),
+                Timestamp("2000-01-08"),
+                Timestamp("2000-01-12"),
             ],
         ),
         (
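Aside: the `_dict_round` / `_series_round` helpers retyped above are the internals
behind ``DataFrame.round`` with a dict argument: columns named in the dict are
rounded, and any column missing from the dict hits the ``KeyError`` branch and is
passed through unchanged. A small illustration using only the public API:

    import pandas as pd

    df = pd.DataFrame({"a": [1.234, 2.345], "b": [3.456, 4.567]})
    # "a" is rounded to 1 decimal; "b" is not in the dict, so it is untouched
    print(df.round({"a": 1}))
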
From 505ed3f343e0bf4b66afae05494bbaf434f7b927 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 8 Oct 2021 05:32:56 -0700
Subject: [PATCH 33/41] TST/REF: fixturize (#43918)

---
 .../indexes/categorical/test_indexing.py       |  5 ++-
 pandas/tests/indexes/common.py                 |  5 ++-
 pandas/tests/indexes/conftest.py               | 24 +++++++++++
 .../tests/indexes/datetimes/test_datetime.py   | 40 -------------------
 .../tests/indexes/datetimes/test_indexing.py   | 26 +++++++++++-
 .../indexes/datetimes/test_partial_slicing.py  | 17 ++++++++
 pandas/tests/indexes/interval/test_base.py     | 10 ++---
 .../tests/indexes/interval/test_interval.py    |  7 ++--
 pandas/tests/indexes/multi/test_indexing.py    |  7 ++--
 pandas/tests/indexes/numeric/test_indexing.py  |  7 ++--
 pandas/tests/indexes/period/test_indexing.py   |  7 ++--
 .../tests/indexes/period/test_searchsorted.py  |  9 ++---
 .../indexes/timedeltas/test_searchsorted.py    |  9 ++---
 13 files changed, 94 insertions(+), 79 deletions(-)

diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py
index 5d89fd3bb4bc3..798aa7188cb9a 100644
--- a/pandas/tests/indexes/categorical/test_indexing.py
+++ b/pandas/tests/indexes/categorical/test_indexing.py
@@ -300,8 +300,9 @@ def test_get_indexer_same_categories_different_order(self):
 
 
 class TestWhere:
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
-    def test_where(self, klass):
+    def test_where(self, listlike_box_with_tuple):
+        klass = listlike_box_with_tuple
+
        i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
         cond = [True] * len(i)
         expected = i
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 50097ae3787b3..8357595fdaa40 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -378,8 +378,9 @@ def test_numpy_repeat(self, simple_index):
         with pytest.raises(ValueError, match=msg):
             np.repeat(idx, rep, axis=0)
 
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
-    def test_where(self, klass, simple_index):
+    def test_where(self, listlike_box_with_tuple, simple_index):
+        klass = listlike_box_with_tuple
+
         idx = simple_index
         if isinstance(idx, (DatetimeIndex, TimedeltaIndex)):
             # where does not preserve freq
diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
index ac4477e60d5dc..2eae51c62aa0d 100644
--- a/pandas/tests/indexes/conftest.py
+++ b/pandas/tests/indexes/conftest.py
@@ -1,5 +1,11 @@
+import numpy as np
 import pytest
 
+from pandas import (
+    Series,
+    array,
+)
+
 
 @pytest.fixture(params=[None, False])
 def sort(request):
@@ -25,3 +31,21 @@ def freq_sample(request):
     timedelta_range..
     """
     return request.param
+
+
+@pytest.fixture(params=[list, np.array, array, Series])
+def listlike_box(request):
+    """
+    Types that may be passed as the indexer to searchsorted.
+    """
+    return request.param
+
+
+# TODO: not clear if this _needs_ to be different from listlike_box or
+#  if that is just a historical artifact
+@pytest.fixture(params=[list, tuple, np.array, Series])
+def listlike_box_with_tuple(request):
+    """
+    Types that may be passed as the indexer to searchsorted.
+    """
+    return request.param
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index b220ce486f80b..5c85221c5a753 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -17,29 +17,6 @@
 class TestDatetimeIndex:
-    def test_time_loc(self):  # GH8667
-        from datetime import time
-
-        from pandas._libs.index import _SIZE_CUTOFF
-
-        ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
-        key = time(15, 11, 30)
-        start = key.hour * 3600 + key.minute * 60 + key.second
-        step = 24 * 3600
-
-        for n in ns:
-            idx = date_range("2014-11-26", periods=n, freq="S")
-            ts = pd.Series(np.random.randn(n), index=idx)
-            i = np.arange(start, n, step)
-
-            tm.assert_numpy_array_equal(ts.index.get_loc(key), i, check_dtype=False)
-            tm.assert_series_equal(ts[key], ts.iloc[i])
-
-            left, right = ts.copy(), ts.copy()
-            left[key] *= -10
-            right.iloc[i] *= -10
-            tm.assert_series_equal(left, right)
-
     def test_time_overflow_for_32bit_machines(self):
         # GH8943.  On some machines NumPy defaults to np.int32 (for example,
         # 32-bit Linux machines).  In the function _generate_regular_range
@@ -78,13 +55,6 @@ def test_week_of_month_frequency(self):
         expected = DatetimeIndex(dates, freq="WOM-1SAT")
         tm.assert_index_equal(result, expected)
 
-    def test_stringified_slice_with_tz(self):
-        # GH#2658
-        start = "2013-01-07"
-        idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern")
-        df = DataFrame(np.arange(10), index=idx)
-        df["2013-01-14 23:44:34.437768-05:00":]  # no exception here
-
     def test_append_nondatetimeindex(self):
         rng = date_range("1/1/2000", periods=10)
         idx = Index(["a", "b", "c", "d"])
@@ -137,16 +107,6 @@ def test_misc_coverage(self):
         result = rng.groupby(rng.day)
         assert isinstance(list(result.values())[0][0], Timestamp)
 
-    def test_string_index_series_name_converted(self):
-        # #1644
-        df = DataFrame(np.random.randn(10, 4), index=date_range("1/1/2000", periods=10))
-
-        result = df.loc["1/3/2000"]
-        assert result.name == df.index[2]
-
-        result = df.T["1/3/2000"]
-        assert result.name == df.index[2]
-
     def test_groupby_function_tuple_1677(self):
         df = DataFrame(np.random.rand(100), index=date_range("1/1/2000", periods=100))
         monthly_group = df.groupby(lambda x: (x.year, x.month))
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index 4ad85f7d4e30f..c3152b77d39df 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -507,6 +507,30 @@ def test_get_loc_time_obj(self):
             with tm.assert_produces_warning(FutureWarning, match="deprecated"):
                 idx.get_loc(time(12, 30), method="pad")
 
+    def test_get_loc_time_obj2(self):
+        # GH#8667
+
+        from pandas._libs.index import _SIZE_CUTOFF
+
+        ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
+        key = time(15, 11, 30)
+        start = key.hour * 3600 + key.minute * 60 + key.second
+        step = 24 * 3600
+
+        for n in ns:
+            idx = date_range("2014-11-26", periods=n, freq="S")
+            ts = pd.Series(np.random.randn(n), index=idx)
+            locs = np.arange(start, n, step, dtype=np.intp)
+
+            result = ts.index.get_loc(key)
+            tm.assert_numpy_array_equal(result, locs)
+            tm.assert_series_equal(ts[key], ts.iloc[locs])
+
+            left, right = ts.copy(), ts.copy()
+            left[key] *= -10
+            right.iloc[locs] *= -10
+            tm.assert_series_equal(left, right)
+
     def test_get_loc_time_nat(self):
         # GH#35114
         # Case where key's total microseconds happens to match iNaT % 1e6 // 1000
@@ -705,7 +729,7 @@ def test_maybe_cast_slice_duplicate_monotonic(self):
         assert result == expected
 
 
-class TestDatetimeIndex:
+class TestGetValue:
     def test_get_value(self):
         # specifically make sure we have test for np.datetime64 key
         dti = date_range("2016-01-01", periods=3)
diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py
index c5b47053471eb..896c43db5e356 100644
--- a/pandas/tests/indexes/datetimes/test_partial_slicing.py
+++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py
@@ -19,6 +19,23 @@
 class TestSlicing:
+    def test_string_index_series_name_converted(self):
+        # GH#1644
+        df = DataFrame(np.random.randn(10, 4), index=date_range("1/1/2000", periods=10))
+
+        result = df.loc["1/3/2000"]
+        assert result.name == df.index[2]
+
+        result = df.T["1/3/2000"]
+        assert result.name == df.index[2]
+
+    def test_stringified_slice_with_tz(self):
+        # GH#2658
+        start = "2013-01-07"
+        idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern")
+        df = DataFrame(np.arange(10), index=idx)
+        df["2013-01-14 23:44:34.437768-05:00":]  # no exception here
+
     def test_return_type_doesnt_depend_on_monotonicity(self):
         # GH#24892 we get Series back regardless of whether our DTI is monotonic
         dti = date_range(start="2015-5-13 23:59:00", freq="min", periods=3)
diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py
index aa88bca2faec9..411e76ca5d8b7 100644
--- a/pandas/tests/indexes/interval/test_base.py
+++ b/pandas/tests/indexes/interval/test_base.py
@@ -1,10 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import (
-    IntervalIndex,
-    Series,
-)
+from pandas import IntervalIndex
 import pandas._testing as tm
 from pandas.tests.indexes.common import Base
 
@@ -46,8 +43,9 @@ def test_take(self, closed):
         expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed)
         tm.assert_index_equal(result, expected)
 
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
-    def test_where(self, simple_index, klass):
+    def test_where(self, simple_index, listlike_box_with_tuple):
+        klass = listlike_box_with_tuple
+
         idx = simple_index
         cond = [True] * len(idx)
         expected = idx
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
index ce8323199ce62..321d1aa34b9af 100644
--- a/pandas/tests/indexes/interval/test_interval.py
+++ b/pandas/tests/indexes/interval/test_interval.py
@@ -934,15 +934,14 @@ def test_dir():
     assert "str" not in result
 
 
-@pytest.mark.parametrize("klass", [list, np.array, pd.array, pd.Series])
-def test_searchsorted_different_argument_classes(klass):
+def test_searchsorted_different_argument_classes(listlike_box):
     # https://github.com/pandas-dev/pandas/issues/32762
     values = IntervalIndex([Interval(0, 1), Interval(1, 2)])
-    result = values.searchsorted(klass(values))
+    result = values.searchsorted(listlike_box(values))
     expected = np.array([0, 1], dtype=result.dtype)
     tm.assert_numpy_array_equal(result, expected)
 
-    result = values._data.searchsorted(klass(values))
+    result = values._data.searchsorted(listlike_box(values))
     tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index 405b41c829a2f..99322f474dd9e 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -720,13 +720,12 @@ def test_where(self):
         with pytest.raises(NotImplementedError, match=msg):
             i.where(True)
 
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
-    def test_where_array_like(self, klass):
-        i = MultiIndex.from_tuples([("A", 1), ("A", 2)])
+    def test_where_array_like(self, listlike_box_with_tuple):
+        mi = MultiIndex.from_tuples([("A", 1), ("A", 2)])
         cond = [False, True]
         msg = r"\.where is not supported for MultiIndex operations"
         with pytest.raises(NotImplementedError, match=msg):
-            i.where(klass(cond))
+            mi.where(listlike_box_with_tuple(cond))
 
 
 class TestContains:
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index cb861aaab80f8..cc309beef92d6 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -397,15 +397,14 @@ class TestWhere:
             UInt64Index(np.arange(5, dtype="uint64")),
         ],
     )
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
-    def test_where(self, klass, index):
+    def test_where(self, listlike_box_with_tuple, index):
         cond = [True] * len(index)
         expected = index
-        result = index.where(klass(cond))
+        result = index.where(listlike_box_with_tuple(cond))
 
         cond = [False] + [True] * (len(index) - 1)
         expected = Float64Index([index._na_value] + index[1:].tolist())
-        result = index.where(klass(cond))
+        result = index.where(listlike_box_with_tuple(cond))
         tm.assert_index_equal(result, expected)
 
     def test_where_uint64(self):
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index cef045766efcc..78afcf2fdc78a 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -602,17 +602,16 @@ def test_get_indexer2(self):
 
 
 class TestWhere:
-    @pytest.mark.parametrize("klass", [list, tuple, np.array, Series])
-    def test_where(self, klass):
+    def test_where(self, listlike_box_with_tuple):
         i = period_range("20130101", periods=5, freq="D")
         cond = [True] * len(i)
         expected = i
-        result = i.where(klass(cond))
+        result = i.where(listlike_box_with_tuple(cond))
         tm.assert_index_equal(result, expected)
 
         cond = [False] + [True] * (len(i) - 1)
         expected = PeriodIndex([NaT] + i[1:].tolist(), freq="D")
-        result = i.where(klass(cond))
+        result = i.where(listlike_box_with_tuple(cond))
         tm.assert_index_equal(result, expected)
 
     def test_where_other(self):
diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py
index 27e998284c189..b9863d1bb019a 100644
--- a/pandas/tests/indexes/period/test_searchsorted.py
+++ b/pandas/tests/indexes/period/test_searchsorted.py
@@ -7,8 +7,6 @@
     NaT,
     Period,
     PeriodIndex,
-    Series,
-    array,
 )
 import pandas._testing as tm
 
@@ -37,17 +35,16 @@ def test_searchsorted(self, freq):
         with pytest.raises(IncompatibleFrequency, match=msg):
             pidx.searchsorted(Period("2014-01-01", freq="5D"))
 
-    @pytest.mark.parametrize("klass", [list, np.array, array, Series])
-    def test_searchsorted_different_argument_classes(self, klass):
+    def test_searchsorted_different_argument_classes(self, listlike_box):
         pidx = PeriodIndex(
             ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
             freq="D",
         )
-        result = pidx.searchsorted(klass(pidx))
+        result = pidx.searchsorted(listlike_box(pidx))
         expected = np.arange(len(pidx), dtype=result.dtype)
         tm.assert_numpy_array_equal(result, expected)
 
-        result = pidx._data.searchsorted(klass(pidx))
+        result = pidx._data.searchsorted(listlike_box(pidx))
         tm.assert_numpy_array_equal(result, expected)
 
     def test_searchsorted_invalid(self):
diff --git a/pandas/tests/indexes/timedeltas/test_searchsorted.py b/pandas/tests/indexes/timedeltas/test_searchsorted.py
index 8a48da91ef31d..710571ef38397 100644
--- a/pandas/tests/indexes/timedeltas/test_searchsorted.py
+++ b/pandas/tests/indexes/timedeltas/test_searchsorted.py
@@ -2,23 +2,20 @@
 import pytest
 
 from pandas import (
-    Series,
     TimedeltaIndex,
     Timestamp,
-    array,
 )
 import pandas._testing as tm
 
 
 class TestSearchSorted:
-    @pytest.mark.parametrize("klass", [list, np.array, array, Series])
-    def test_searchsorted_different_argument_classes(self, klass):
+    def test_searchsorted_different_argument_classes(self, listlike_box):
         idx = TimedeltaIndex(["1 day", "2 days", "3 days"])
-        result = idx.searchsorted(klass(idx))
+        result = idx.searchsorted(listlike_box(idx))
         expected = np.arange(len(idx), dtype=result.dtype)
         tm.assert_numpy_array_equal(result, expected)
 
-        result = idx._data.searchsorted(klass(idx))
+        result = idx._data.searchsorted(listlike_box(idx))
         tm.assert_numpy_array_equal(result, expected)
 
     @pytest.mark.parametrize(
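Aside: the patch above replaces repeated ``@pytest.mark.parametrize("klass", ...)``
decorators with shared conftest fixtures. A minimal self-contained sketch of the same
pattern; the fixture name ``box`` and the test body are illustrative, not taken from
the patch:

    import numpy as np
    import pytest

    import pandas as pd


    @pytest.fixture(params=[list, tuple, np.array, pd.Series])
    def box(request):
        # each test requesting this fixture runs once per container type
        return request.param


    def test_where_accepts_listlikes(box):
        idx = pd.Index([1.0, 2.0, 3.0])
        cond = [True, False, True]
        result = idx.where(box(cond))
        expected = pd.Index([1.0, np.nan, 3.0])
        pd.testing.assert_index_equal(result, expected)
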
From 9ee956b764768ab6c7d33ad4e50ecfcea924d470 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 8 Oct 2021 17:17:40 -0700
Subject: [PATCH 34/41] BUG: NumericIndex.insert (#43933)

---
 pandas/core/indexes/base.py    |  7 ++++---
 pandas/tests/indexes/common.py | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index da953fe46ef1d..2ff9b3973a526 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6329,10 +6329,11 @@ def insert(self, loc: int, item) -> Index:
 
         arr = np.asarray(self)
 
-        # Use Index constructor to ensure we get tuples cast correctly.
-        item = Index([item], dtype=self.dtype)._values
+        # Use constructor to ensure we get tuples cast correctly.
+        # Use self._constructor instead of Index to retain NumericIndex GH#43921
+        item = self._constructor([item], dtype=self.dtype)._values
         idx = np.concatenate((arr[:loc], item, arr[loc:]))
-        return Index._with_infer(idx, name=self.name)
+        return self._constructor._with_infer(idx, name=self.name)
 
     def drop(self, labels, errors: str_t = "raise") -> Index:
         """
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 8357595fdaa40..7e43664c6b3de 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -793,6 +793,20 @@ def test_format(self, simple_index):
     def test_numeric_compat(self):
         pass  # override Base method
 
+    def test_insert_non_na(self, simple_index):
+        # GH#43921 inserting an element that we know we can hold should
+        #  not change dtype or type (except for RangeIndex)
+        index = simple_index
+
+        result = index.insert(0, index[0])
+
+        cls = type(index)
+        if cls is RangeIndex:
+            cls = Int64Index
+
+        expected = cls([index[0]] + list(index), dtype=index.dtype)
+        tm.assert_index_equal(result, expected)
+
     def test_insert_na(self, nulls_fixture, simple_index):
         # GH 18295 (test missing)
         index = simple_index
@@ -800,6 +814,11 @@ def test_insert_na(self, nulls_fixture, simple_index):
 
         if na_val is pd.NaT:
             expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object)
+        elif type(index) is NumericIndex and index.dtype.kind == "f":
+            # GH#43921
+            expected = NumericIndex(
+                [index[0], np.nan] + list(index[1:]), dtype=index.dtype
+            )
         else:
             expected = Float64Index([index[0], np.nan] + list(index[1:]))
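Aside: the point of routing ``Index.insert`` through ``self._constructor`` above is
that the returned index keeps both the dtype and the Index subclass. A behavior
sketch using only the public API (the assertions hold on current releases as well,
since float data already inferred a float index):

    import pandas as pd

    idx = pd.Index([1.0, 2.0, 3.0])
    result = idx.insert(0, 4.0)

    # dtype and Index type are preserved on insert
    assert result.dtype == idx.dtype
    assert type(result) is type(idx)
    assert list(result) == [4.0, 1.0, 2.0, 3.0]
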
From 1e370aa138c6e0186cd959fe6816187ac9698a3b Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 8 Oct 2021 17:18:23 -0700
Subject: [PATCH 35/41] TST: Skip leaky test on Python 3.10 (#43910)

---
 .github/workflows/python-dev.yml                   | 2 --
 pandas/tests/io/parser/common/test_common_basic.py | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml
index 3a139936fbd22..b32b18b86e9df 100644
--- a/.github/workflows/python-dev.yml
+++ b/.github/workflows/python-dev.yml
@@ -73,8 +73,6 @@ jobs:
       shell: bash
       run: |
         ci/run_tests.sh
-      # GH 41935
-      continue-on-error: true
 
     - name: Publish test results
       uses: actions/upload-artifact@master
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index ff54a378806fa..6d958f46a49dd 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -12,6 +12,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import PY310
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -674,6 +675,11 @@ def test_read_table_equivalency_to_read_csv(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.skipif(
+    PY310,
+    reason="GH41935 This test is leaking only on Python 3.10,"
+    "causing other tests to fail with a cryptic error.",
+)
 @pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
 def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
     # GH#41069

From acb76509cfd87f23807bf408f21b217c4f2d3687 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 8 Oct 2021 17:19:13 -0700
Subject: [PATCH 36/41] ENH: EA.tolist (#43920)

---
 doc/source/reference/extensions.rst      |  1 +
 pandas/core/arrays/base.py               | 17 +++++++++++++++++
 pandas/core/arrays/categorical.py        | 13 +++----------
 pandas/core/base.py                      |  3 ---
 pandas/tests/extension/base/dim2.py      | 11 +++++++++++
 pandas/tests/extension/base/interface.py |  6 ++++++
 6 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index 7b451ed3bf296..e2e8c94ef8fc6 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -60,6 +60,7 @@ objects.
    api.extensions.ExtensionArray.nbytes
    api.extensions.ExtensionArray.ndim
    api.extensions.ExtensionArray.shape
+   api.extensions.ExtensionArray.tolist
 
 Additionally, we have some utility methods for ensuring your object behaves
 correctly.
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 46b0a6873986e..99c4944a1cfa7 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -130,6 +130,7 @@ class ExtensionArray:
     searchsorted
     shift
     take
+    tolist
     unique
     view
     _concat_same_type
@@ -1348,6 +1349,22 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     # ------------------------------------------------------------------------
     # Non-Optimized Default Methods
 
+    def tolist(self) -> list:
+        """
+        Return a list of the values.
+
+        These are each a scalar type, which is a Python scalar
+        (for str, int, float) or a pandas scalar
+        (for Timestamp/Timedelta/Interval/Period)
+
+        Returns
+        -------
+        list
+        """
+        if self.ndim > 1:
+            return [x.tolist() for x in self]
+        return list(self)
+
     def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT:
         indexer = np.delete(np.arange(len(self)), loc)
         return self.take(indexer)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index c0fc172139149..7e3bf33f411bb 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -40,7 +40,6 @@
     Ordered,
     PositionalIndexer2D,
     PositionalIndexerTuple,
-    Scalar,
     ScalarIndexer,
     SequenceIndexer,
     Shape,
@@ -566,17 +565,11 @@ def itemsize(self) -> int:
         """
         return self.categories.itemsize
 
-    def tolist(self) -> list[Scalar]:
+    def to_list(self):
         """
-        Return a list of the values.
-
-        These are each a scalar type, which is a Python scalar
-        (for str, int, float) or a pandas scalar
-        (for Timestamp/Timedelta/Interval/Period)
+        Alias for tolist.
         """
-        return list(self)
-
-    to_list = tolist
+        return self.tolist()
 
     @classmethod
     def _from_inferred_categories(
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 24fa362eea9c3..a1bf448df18c4 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -740,9 +740,6 @@ def tolist(self):
         numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
             nested list of Python scalars.
         """
-        if not isinstance(self._values, np.ndarray):
-            # check for ndarray instead of dtype to catch DTA/TDA
-            return list(self._values)
         return self._values.tolist()
 
     to_list = tolist
diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py
index b56ec23c63569..b80d2a3586b3b 100644
--- a/pandas/tests/extension/base/dim2.py
+++ b/pandas/tests/extension/base/dim2.py
@@ -97,6 +97,17 @@ def test_iter_2d(self, data):
             assert obj.ndim == 1
             assert len(obj) == arr2d.shape[1]
 
+    def test_tolist_2d(self, data):
+        arr2d = data.reshape(1, -1)
+
+        result = arr2d.tolist()
+        expected = [data.tolist()]
+
+        assert isinstance(result, list)
+        assert all(isinstance(x, list) for x in result)
+
+        assert result == expected
+
     def test_concat_2d(self, data):
         left = data.reshape(-1, 1)
         right = left.copy()
diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py
index f51f9f732bace..3e8a754c8c527 100644
--- a/pandas/tests/extension/base/interface.py
+++ b/pandas/tests/extension/base/interface.py
@@ -119,3 +119,9 @@ def test_view(self, data):
 
         # check specifically that the `dtype` kwarg is accepted
         data.view(dtype=None)
+
+    def test_tolist(self, data):
+        result = data.tolist()
+        expected = list(data)
+        assert isinstance(result, list)
+        assert result == expected
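Aside: with the base-class ``tolist`` added above, every ExtensionArray can
round-trip to a plain Python list. A short illustration with a nullable integer
array; this assumes a pandas build that includes this change:

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    values = arr.tolist()

    # scalars come back as Python-level values, missing entries as pd.NA
    assert values[:2] == [1, 2]
    assert values[2] is pd.NA
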
""" - if not isinstance(self._values, np.ndarray): - # check for ndarray instead of dtype to catch DTA/TDA - return list(self._values) return self._values.tolist() to_list = tolist diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index b56ec23c63569..b80d2a3586b3b 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -97,6 +97,17 @@ def test_iter_2d(self, data): assert obj.ndim == 1 assert len(obj) == arr2d.shape[1] + def test_tolist_2d(self, data): + arr2d = data.reshape(1, -1) + + result = arr2d.tolist() + expected = [data.tolist()] + + assert isinstance(result, list) + assert all(isinstance(x, list) for x in result) + + assert result == expected + def test_concat_2d(self, data): left = data.reshape(-1, 1) right = left.copy() diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index f51f9f732bace..3e8a754c8c527 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -119,3 +119,9 @@ def test_view(self, data): # check specifically that the `dtype` kwarg is accepted data.view(dtype=None) + + def test_tolist(self, data): + result = data.tolist() + expected = list(data) + assert isinstance(result, list) + assert result == expected From e12643ea788a9bf23c323580c8dde1557287515b Mon Sep 17 00:00:00 2001 From: rosagold Date: Sat, 9 Oct 2021 03:10:01 +0200 Subject: [PATCH 37/41] fixed rolling for a decreasing index, added a test for that (#43928) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/window/indexers.pyx | 6 +++-- pandas/tests/apply/test_frame_apply.py | 2 -- pandas/tests/window/test_rolling.py | 33 ++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 722d0dcc10041..e638a24f830ef 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -502,6 +502,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`) - Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`, :issue:`43515`) - Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`) +- Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 197345b3ce6ac..3782b55bd19b3 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -81,9 +81,11 @@ def calculate_variable_window_bounds( if center: end_bound = index[0] + index_growth_sign * window_size / 2 for j in range(0, num_values): - if (index[j] < end_bound) or (index[j] == end_bound and right_closed): + if (index[j] - end_bound) * index_growth_sign < 0: end[0] = j + 1 - elif index[j] >= end_bound: + elif (index[j] - end_bound) * index_growth_sign == 0 and right_closed: + end[0] = j + 1 + elif (index[j] - end_bound) * index_growth_sign >= 0: end[0] = j break diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b51b66a0adda0..f1a93714b4c62 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ 
From 3bba3719f057586ab400a6d3d556d16b49d5c340 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Mon, 18 Oct 2021 16:13:11 -0400
Subject: [PATCH 38/41] Added docs

---
 doc/source/user_guide/future_udf_behavior.rst | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 doc/source/user_guide/future_udf_behavior.rst

diff --git a/doc/source/user_guide/future_udf_behavior.rst b/doc/source/user_guide/future_udf_behavior.rst
new file mode 100644
index 0000000000000..294133846bca9
--- /dev/null
+++ b/doc/source/user_guide/future_udf_behavior.rst
@@ -0,0 +1,71 @@
+.. _future_udf_behavior:
+:orphan:
+
+{{ header }}
+
+*******************
+Future UDF Behavior
+*******************
+
+pandas is experimenting with improving the behavior of methods that take a
+user-defined function (UDF). These methods include ``.apply``, ``.agg``, ``.transform``,
+and ``.filter``. The goal is to make these methods behave in a more predictable
+and consistent manner, reducing the complexity of their implementation, and improving
+performance where possible. This page details the differences between the old and
+new behaviors, as well as providing some context behind each change that is being made.
+
+There are a great number of changes that are planned. In order to transition in a
+reasonable manner for users, all changes are behind an experimental "future_udf_behavior"
+option. This is currently experimental and subject to breaking changes without notice.
+Users can opt into the new behavior and provide feedback. Once the improvements have
+been made, this option will be declared no longer experimental. pandas will then raise
+a ``FutureWarning`` that the default value of this option will be set to ``True`` in
+a future version. Once the default is ``True``, users can still override it to ``False``.
+After a sufficient amount of time, pandas will remove this option altogether and only
+the future behavior will remain.
+
+``DataFrame.agg`` with list-likes
+---------------------------------
+
+Previously, using ``DataFrame.agg`` with a list-like argument would transpose the result when
+compared with just providing a single aggregation function.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
+
+    df.agg('sum')
+    df.agg(['sum'])
+
+This transpose no longer occurs, making the result more consistent.
+
+.. ipython:: python
+
+    with pd.option_context('future_udf_behavior', True):
+        result = df.agg(['sum'])
+    result
+
+    with pd.option_context('future_udf_behavior', True):
+        result = df.agg(['sum', 'mean'])
+    result
+
+``DataFrame.groupby(...).agg`` with list-likes
+----------------------------------------------
+
+Previously, using ``DataFrame.groupby(...).agg`` with a list-like argument would put the
+columns as the first level of the resulting hierarchical columns. The result is
+that the columns for each aggregation function are separated, inconsistent with the result
+for a single aggregator.
+
+.. ipython:: python
+
+    df.groupby("a").agg('sum')
+    df.groupby("a").agg(["sum", "min"])
+
+Now the levels are swapped, so that the columns for each aggregation are together.
+
+.. ipython:: python
+
+    with pd.option_context('future_udf_behavior', True):
+        result = df.groupby("a").agg(["sum", "min"])
+    result
From 7abdff9caec9e03fde8d0341aa5857a41701cd4c Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Mon, 18 Oct 2021 16:14:37 -0400
Subject: [PATCH 39/41] Make quotes consistent

---
 doc/source/user_guide/future_udf_behavior.rst | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/doc/source/user_guide/future_udf_behavior.rst b/doc/source/user_guide/future_udf_behavior.rst
index 294133846bca9..5ac78b19594ed 100644
--- a/doc/source/user_guide/future_udf_behavior.rst
+++ b/doc/source/user_guide/future_udf_behavior.rst
@@ -32,21 +32,21 @@ compared with just providing a single aggregation function.
 
 .. ipython:: python
 
-    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
 
-    df.agg('sum')
-    df.agg(['sum'])
+    df.agg("sum")
+    df.agg(["sum"])
 
 This transpose no longer occurs, making the result more consistent.
 
 .. ipython:: python
 
-    with pd.option_context('future_udf_behavior', True):
-        result = df.agg(['sum'])
+    with pd.option_context("future_udf_behavior", True):
+        result = df.agg(["sum"])
     result
 
-    with pd.option_context('future_udf_behavior', True):
-        result = df.agg(['sum', 'mean'])
+    with pd.option_context("future_udf_behavior", True):
+        result = df.agg(["sum", "mean"])
     result
 
 ``DataFrame.groupby(...).agg`` with list-likes
@@ -59,13 +59,13 @@ for a single aggregator.
 
 .. ipython:: python
 
-    df.groupby("a").agg('sum')
+    df.groupby("a").agg("sum")
     df.groupby("a").agg(["sum", "min"])
 
 Now the levels are swapped, so that the columns for each aggregation are together.
 
 .. ipython:: python
 
-    with pd.option_context('future_udf_behavior', True):
+    with pd.option_context("future_udf_behavior", True):
         result = df.groupby("a").agg(["sum", "min"])
     result

From a72a5eb3a68229b6b530133e0258c12aa1a248f3 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Mon, 18 Oct 2021 23:42:13 -0400
Subject: [PATCH 40/41] Fixup docs

---
 doc/source/user_guide/future_udf_behavior.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/user_guide/future_udf_behavior.rst b/doc/source/user_guide/future_udf_behavior.rst
index 5ac78b19594ed..8871f767c9cb5 100644
--- a/doc/source/user_guide/future_udf_behavior.rst
+++ b/doc/source/user_guide/future_udf_behavior.rst
@@ -1,4 +1,5 @@
 .. _future_udf_behavior:
+
 :orphan:
 
 {{ header }}

From afc27ba4c379807358e7ec75d6ec30e8d954cb44 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sun, 7 Nov 2021 17:12:37 -0500
Subject: [PATCH 41/41] Merge cleanup

---
 pandas/core/frame.py                          |  5 ++--
 pandas/tests/apply/test_frame_apply.py        | 26 +++++++++++--------
 .../tests/groupby/aggregate/test_aggregate.py |  5 ++--
 pandas/tests/groupby/aggregate/test_other.py  |  8 ++++--
 pandas/tests/groupby/test_groupby.py          |  3 ++-
 pandas/tests/resample/test_resample_api.py    |  5 +++-
 6 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ebf3428020652..3f46669c61683 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -83,6 +83,7 @@
     doc,
     rewrite_axis_style_signature,
 )
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_ascending,
     validate_axis_style_args,
@@ -10016,7 +10017,7 @@ def _get_data() -> DataFrame:
                     "version this will raise TypeError. Select only valid "
                     "columns before calling the reduction.",
                     FutureWarning,
-                    stacklevel=5,
+                    stacklevel=find_stack_level(),
                 )
                 return out
 
@@ -10049,7 +10050,7 @@ def _get_data() -> DataFrame:
                     "version this will raise TypeError. Select only valid "
                     "columns before calling the reduction.",
                     FutureWarning,
-                    stacklevel=5,
+                    stacklevel=find_stack_level(),
                 )
 
             if hasattr(result, "dtype"):
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 2931be660ea6e..a12ca64a7a0eb 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1112,21 +1112,23 @@ def test_agg_multiple_mixed_no_warning():
         },
         index=["min", "sum"],
     )
-    klass, match = None, None
     if get_option("future_udf_behavior"):
         expected = expected.T
-        klass, match = FutureWarning, "Dropping of nuisance columns"
+        match = "Dropping of nuisance columns"
+    else:
+        match = "did not aggregate successfully"
     # sorted index
-    with tm.assert_produces_warning(klass, match=match, check_stacklevel=False):
+    with tm.assert_produces_warning(FutureWarning, match=match):
         result = mdf.agg(["min", "sum"])
     tm.assert_frame_equal(result, expected)
 
-    klass, match = None, None
     if get_option("future_udf_behavior"):
-        klass, match = FutureWarning, "Dropping of nuisance columns"
+        match = "Dropping of nuisance columns"
+    else:
+        match = "did not aggregate successfully"
 
-    with tm.assert_produces_warning(klass, match=match, check_stacklevel=False):
+    with tm.assert_produces_warning(FutureWarning, match=match, check_stacklevel=False):
         result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"])
 
     # GH40420: the result of .agg should have an index that is sorted
@@ -1242,10 +1244,11 @@ def test_nuiscance_columns():
     expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
     tm.assert_series_equal(result, expected)
 
-    warn = FutureWarning if get_option("future_udf_behavior") else None
-    with tm.assert_produces_warning(
-        warn, match="Select only valid", check_stacklevel=False
-    ):
+    if get_option("future_udf_behavior"):
+        match = "Select only valid"
+    else:
+        match = "did not aggregate successfully"
+    with tm.assert_produces_warning(FutureWarning, match=match):
         result = df.agg(["sum"])
     expected = DataFrame(
         [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"]
@@ -1492,8 +1495,9 @@ def foo(s):
         return s.sum() / 2
 
     aggs = ["sum", foo, "count", "min"]
+    klass = None if get_option("future_udf_behavior") else FutureWarning
     with tm.assert_produces_warning(
-        FutureWarning, match=r"\['item'\] did not aggregate successfully"
+        klass, match=r"\['item'\] did not aggregate successfully"
    ):
         result = df.agg(aggs)
     if get_option("future_udf_behavior"):
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 18e17fd70216a..095c3fbaf10fb 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -390,12 +390,13 @@ def test_multiple_functions_tuples_and_non_tuples(df):
     expected = df.groupby("A")["C"].agg(ex_funcs)
     tm.assert_frame_equal(result, expected)
 
+    klass = None if get_option("future_udf_behavior") else FutureWarning
     with tm.assert_produces_warning(
-        FutureWarning, match=r"\['B'\] did not aggregate successfully"
+        klass, match=r"\['B'\] did not aggregate successfully"
     ):
         result = df.groupby("A").agg(funcs)
     with tm.assert_produces_warning(
-        FutureWarning, match=r"\['B'\] did not aggregate successfully"
+        klass, match=r"\['B'\] did not aggregate successfully"
     ):
         expected = df.groupby("A").agg(ex_funcs)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 35ac60eeac45f..c79878d71f5ae 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -46,16 +46,20 @@ def test_agg_api():
     def peak_to_peak(arr):
         return arr.max() - arr.min()
 
+    if get_option("future_udf_behavior"):
+        msg = "Dropping invalid columns"
+    else:
+        msg = r"\['key2'\] did not aggregate successfully"
     with tm.assert_produces_warning(
         FutureWarning,
-        match=r"\['key2'\] did not aggregate successfully",
+        match=msg,
     ):
         expected = grouped.agg([peak_to_peak])
     expected.columns = ["data1", "data2"]
 
     with tm.assert_produces_warning(
         FutureWarning,
-        match=r"\['key2'\] did not aggregate successfully",
+        match=msg,
     ):
         result = grouped.agg(peak_to_peak)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 6cb2d6484ca4e..2f1fc1efa26c7 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -589,8 +589,9 @@ def test_frame_multi_key_function_list():
     grouped = data.groupby(["A", "B"])
     funcs = [np.mean, np.std]
+    klass = None if get_option("future_udf_behavior") else FutureWarning
     with tm.assert_produces_warning(
-        FutureWarning, match=r"\['C'\] did not aggregate successfully"
+        klass, match=r"\['C'\] did not aggregate successfully"
     ):
         agged = grouped.agg(funcs)
     if get_option("future_udf_behavior"):
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 4be47eaa3c25d..476b29217a8c0 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -355,7 +355,10 @@ def test_agg():
     expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     for t in cases:
-        warn = FutureWarning if t in cases[1:3] else None
+        if t in cases[1:3] and not get_option("future_udf_behavior"):
+            warn = FutureWarning
+        else:
+            warn = None
         with tm.assert_produces_warning(
             warn,
             match=r"\['date'\] did not aggregate successfully",