From ca136de734caa566efbca6b90487a2dd603cd51c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 17:39:11 -0700 Subject: [PATCH 01/11] BUG: IndexError in libreduction --- pandas/_libs/reduction.pyx | 4 ++-- pandas/core/groupby/generic.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0eac0e94f0beb..043d50ba9e68f 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -246,11 +246,11 @@ cdef class SeriesBinGrouper: object res bint initialized = 0 Slider vslider, islider - object name, cached_typ=None, cached_ityp=None + object name, cached_typ = None, cached_ityp = None counts = np.zeros(self.ngroups, dtype=np.int64) - if self.ngroups > 0: + if len(self.bins) > 0: counts[0] = self.bins[0] for i in range(1, self.ngroups): if i == self.ngroups - 1: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a78857423e7e0..316416b93b34a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -263,6 +263,8 @@ def aggregate(self, func=None, *args, **kwargs): return self._python_agg_general(func, *args, **kwargs) except AssertionError: raise + except IndexError: + raise except Exception: result = self._aggregate_named(func, *args, **kwargs) From 14e2433959d1da07bcf55c6522ce4f5b0002af60 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 11:32:13 -0700 Subject: [PATCH 02/11] BUG: fix TypeError raised in maybe_downcast_numeric --- pandas/core/dtypes/cast.py | 2 +- pandas/core/groupby/generic.py | 2 +- pandas/tests/dtypes/cast/test_downcast.py | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index dd001e78c07de..7fcaf60088ad2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -202,7 +202,7 @@ def trans(x): r = result.ravel() arr = np.array([r[0]]) - if isna(arr).any() or not 
np.allclose(arr, trans(arr).astype(dtype), rtol=0): + if isna(arr).any(): # if we have any nulls, then we are done return result diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 316416b93b34a..c0069c1980243 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -261,7 +261,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except AssertionError: + except (AssertionError, TypeError): raise except IndexError: raise diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index d574b03a8c724..9e2eca5259bc3 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -1,3 +1,5 @@ +import decimal + import numpy as np import pytest @@ -25,6 +27,13 @@ "infer", np.array([8, 8, 8, 8, 9], dtype=np.int64), ), + ( + # This is a judgement call, but we do _not_ downcast Decimal + # objects + np.array([decimal.Decimal(0.0)]), + "int64", + np.array([decimal.Decimal(0.0)]), + ), ], ) def test_downcast(arr, expected, dtype): From 37d829b327243f4b541ef6c2085f8ba41c78755d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 09:21:59 -0700 Subject: [PATCH 03/11] BUG: fix AttributeError raised in libreduction --- pandas/_libs/reduction.pyx | 16 ++++++++++------ pandas/core/groupby/generic.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 043d50ba9e68f..cfcaa4d7230ea 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -203,7 +203,8 @@ cdef class SeriesBinGrouper: self.f = f values = series.values - if not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: + # e.g. 
Categorical has no `flags` attribute values = values.copy('C') self.arr = values self.typ = series._constructor @@ -230,7 +231,8 @@ cdef class SeriesBinGrouper: values = dummy.values if values.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values if not index.flags.contiguous: @@ -356,7 +358,8 @@ cdef class SeriesGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values if not index.flags.contiguous: @@ -467,12 +470,13 @@ cdef class Slider: char *orig_data def __init__(self, object values, object buf): - assert(values.ndim == 1) + assert (values.ndim == 1) - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. 
Categorical has no `flags` attribute values = values.copy() - assert(values.dtype == buf.dtype) + assert (values.dtype == buf.dtype) self.values = values self.buf = buf self.stride = values.strides[0] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c0069c1980243..7cff36658cb4b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -261,7 +261,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except (AssertionError, TypeError): + except (AssertionError, TypeError, AttributeError): raise except IndexError: raise From ec9e851c3f7faf2b56784317583c6e0f7406d7a4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 11:43:35 -0700 Subject: [PATCH 04/11] missed one --- pandas/_libs/reduction.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index cfcaa4d7230ea..6663ee828bec8 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -334,7 +334,8 @@ cdef class SeriesGrouper: self.f = f values = series.values - if not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: + # e.g. 
Categorical has no `flags` attribute values = values.copy('C') self.arr = values self.typ = series._constructor From ce2b9ba28230c639ba850a255d9e0780747b26c8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 13:57:19 -0700 Subject: [PATCH 05/11] revert re-raising AttributeError --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7cff36658cb4b..c0069c1980243 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -261,7 +261,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except (AssertionError, TypeError, AttributeError): + except (AssertionError, TypeError): raise except IndexError: raise From 030e6fb4af6a6e8a7c46521bacc9fcc8ed0e8119 Mon Sep 17 00:00:00 2001 From: Grigorios Giannakopoulos Date: Sun, 20 Oct 2019 16:49:54 +0300 Subject: [PATCH 06/11] Add a regression test for the timezone issue (#29097) --- pandas/tests/frame/test_apply.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe034504b8161..4b7439cd40023 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1346,3 +1346,17 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq + + def test_apply_datetime_tz_issue(self): + # GH 29052 + + timestamps = [ + pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = pd.Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) From 
618462f0b48f9c8bdf6b076fe0db93e046ca2bdf Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:26:40 -0400 Subject: [PATCH 07/11] Fix typing errors (#29115) Thanks, @AbhijeetKrishnan --- pandas/tests/frame/test_constructors.py | 6 +++--- setup.cfg | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 583093af6d3e6..aa00cf234d9ee 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -50,13 +50,13 @@ class TestDataFrameConstructors: lambda: DataFrame({}), lambda: DataFrame(()), lambda: DataFrame([]), - lambda: DataFrame((x for x in [])), + lambda: DataFrame((_ for _ in [])), lambda: DataFrame(range(0)), lambda: DataFrame(data=None), lambda: DataFrame(data={}), lambda: DataFrame(data=()), lambda: DataFrame(data=[]), - lambda: DataFrame(data=(x for x in [])), + lambda: DataFrame(data=(_ for _ in [])), lambda: DataFrame(data=range(0)), ], ) @@ -72,7 +72,7 @@ def test_empty_constructor(self, constructor): [ ([[]], RangeIndex(1), RangeIndex(0)), ([[], []], RangeIndex(2), RangeIndex(0)), - ([(x for x in [])], RangeIndex(1), RangeIndex(0)), + ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)), ], ) def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): diff --git a/setup.cfg b/setup.cfg index ca1ca4a7b5733..de251bafd34fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -160,9 +160,6 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.frame.test_constructors] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True From 762d0bcd0831d84335d48636b3a5206c7b01f9b0 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:27:41 -0400 Subject: [PATCH 08/11] Fix typing errors (#29114) --- pandas/tests/dtypes/test_inference.py | 7 +++++-- setup.cfg | 3 --- 2 files changed, 5 
insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index cfa6304909bb7..60afd768195d9 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,7 +75,7 @@ def coerce(request): (iter([1, 2]), True, "iterator"), # noqa: E241 (iter([]), True, "iterator-empty"), # noqa: E241 ((x for x in [1, 2]), True, "generator"), # noqa: E241 - ((x for x in []), True, "generator-empty"), # noqa: E241 + ((_ for _ in []), True, "generator-empty"), # noqa: E241 (Series([1]), True, "Series"), # noqa: E241 (Series([]), True, "Series-empty"), # noqa: E241 (Series(["a"]).str, True, "StringMethods"), # noqa: E241 @@ -288,7 +288,10 @@ class MockFile: assert not is_file(data) -@pytest.mark.parametrize("ll", [collections.namedtuple("Test", list("abc"))(1, 2, 3)]) +test_tuple = collections.namedtuple("Test", ["a", "b", "c"]) + + +@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)]) def test_is_names_tuple_passes(ll): assert inference.is_named_tuple(ll) diff --git a/setup.cfg b/setup.cfg index de251bafd34fb..199ad34626011 100644 --- a/setup.cfg +++ b/setup.cfg @@ -148,9 +148,6 @@ ignore_errors=True [mypy-pandas.tests.dtypes.test_common] ignore_errors=True -[mypy-pandas.tests.dtypes.test_inference] -ignore_errors=True - [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True From f06432e490e3e4dd299aec42d4b91bba12c3d64b Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Mon, 21 Oct 2019 02:49:55 -0400 Subject: [PATCH 09/11] Fix mypy errors (#29108) --- pandas/tests/series/test_constructors.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 65cbf5fcf91d2..ca14f0fd05869 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -45,12 +45,12 @@ class TestSeriesConstructors: (lambda: Series({}), 
True), (lambda: Series(()), False), # creates a RangeIndex (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series((x for x in [])), False), # creates a RangeIndex + (lambda: Series((_ for _ in [])), False), # creates a RangeIndex (lambda: Series(data=None), True), (lambda: Series(data={}), True), (lambda: Series(data=()), False), # creates a RangeIndex (lambda: Series(data=[]), False), # creates a RangeIndex - (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex + (lambda: Series(data=(_ for _ in [])), False), # creates a RangeIndex ], ) def test_empty_constructor(self, constructor, check_index_type): diff --git a/setup.cfg b/setup.cfg index 199ad34626011..f32deff9dafb8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -187,9 +187,6 @@ ignore_errors=True [mypy-pandas.tests.indexing.test_loc] ignore_errors=True -[mypy-pandas.tests.series.test_constructors] -ignore_errors=True - [mypy-pandas.tests.series.test_operators] ignore_errors=True From 6242ddc670f8fe4094baa35b8cf0b94f68df3d6c Mon Sep 17 00:00:00 2001 From: yogendrasoni Date: Mon, 21 Oct 2019 17:27:28 +0530 Subject: [PATCH 10/11] fix #28926 mypy error in pandas\tests\arrays\test_array.py (#28970) --- pandas/core/arrays/period.py | 4 +++- setup.cfg | 9 --------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a21d9e67e49e5..78cc54db4b1b8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -831,7 +831,9 @@ def _raise_on_incompatible(left, right): def period_array( - data: Sequence[Optional[Period]], freq: Optional[Tick] = None, copy: bool = False + data: Sequence[Optional[Period]], + freq: Optional[Union[str, Tick]] = None, + copy: bool = False, ) -> PeriodArray: """ Construct a new PeriodArray from a sequence of Period scalars. 
diff --git a/setup.cfg b/setup.cfg index f32deff9dafb8..c9ba13443e97c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -136,15 +136,9 @@ ignore_errors=True [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True -[mypy-pandas.tests.arrays.test_array] -ignore_errors=True - [mypy-pandas.tests.arrays.test_datetimelike] ignore_errors=True -[mypy-pandas.tests.arrays.test_period] -ignore_errors=True - [mypy-pandas.tests.dtypes.test_common] ignore_errors=True @@ -190,9 +184,6 @@ ignore_errors=True [mypy-pandas.tests.series.test_operators] ignore_errors=True -[mypy-pandas.tests.test_base] -ignore_errors=True - [mypy-pandas.tests.tseries.offsets.test_offsets] ignore_errors=True From 0042cdf8674209470d9dafeab858584305a2ef8a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Oct 2019 08:40:42 -0700 Subject: [PATCH 11/11] port decimal test --- pandas/_libs/reduction.pyx | 4 +-- pandas/core/groupby/generic.py | 9 +++-- pandas/core/groupby/ops.py | 10 ++++++ .../tests/extension/decimal/test_decimal.py | 34 +++++++++++++++++++ 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 6663ee828bec8..7ed131e1c7608 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -248,11 +248,11 @@ cdef class SeriesBinGrouper: object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object name, cached_typ=None, cached_ityp=None counts = np.zeros(self.ngroups, dtype=np.int64) - if len(self.bins) > 0: + if self.ngroups > 0: counts[0] = self.bins[0] for i in range(1, self.ngroups): if i == self.ngroups - 1: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0bcf163ad2f6c..2afb77a619a80 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -262,9 +262,14 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except 
(AssertionError, TypeError, IndexError): + except (AssertionError, TypeError): raise - except Exception: + except (ValueError, KeyError, AttributeError, IndexError): + # TODO: IndexError can be removed here following GH#29106 + # TODO: AttributeError is caused by _index_data hijinx in + # libreduction, can be removed after GH#29160 + # TODO: KeyError is raised in _python_agg_general, + # see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 00e7012b40986..33857dcea328f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -26,6 +26,7 @@ is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, + is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, @@ -659,6 +660,12 @@ def _transform( return result def agg_series(self, obj, func): + if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": + # _aggregate_series_fast would raise TypeError when + # calling libreduction.Slider + # TODO: is the datetime64tz case supposed to go through here? + return self._aggregate_series_pure_python(obj, func) + try: return self._aggregate_series_fast(obj, func) except AssertionError: @@ -677,6 +684,8 @@ def agg_series(self, obj, func): def _aggregate_series_fast(self, obj, func): func = self._is_builtin_func(func) + # TODO: pre-empt this, also pre-empt get_result raising TypeError if we pass + # an EA; for EAs backed by ndarray we may have a performant workaround if obj.index._has_complex_internals: raise TypeError("Incompatible index for Cython grouper") @@ -711,6 +720,7 @@ def _aggregate_series_pure_python(self, obj, func): result[label] = res result = lib.maybe_convert_objects(result, try_float=0) + # TODO: try_cast back to EA? 
return result, counts diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 3ac9d37ccf4f3..3da136899d83c 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -426,3 +426,37 @@ def test_array_ufunc_series_defer(): tm.assert_series_equal(r1, expected) tm.assert_series_equal(r2, expected) + + +def test_groupby_agg(): + # Ensure that the result of agg is inferred to be decimal dtype + # https://github.com/pandas-dev/pandas/issues/29141 + + data = make_data()[:5] + df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)}) + expected = pd.Series(to_decimal([data[0], data[3]])) + + result = df.groupby("id")["decimals"].agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + result = df["decimals"].groupby(df["id"]).agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + + +def test_groupby_agg_ea_method(monkeypatch): + # Ensure that the result of agg is inferred to be decimal dtype + # https://github.com/pandas-dev/pandas/issues/29141 + + def DecimalArray__my_sum(self): + return np.sum(np.array(self)) + + monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False) + + data = make_data()[:5] + df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)}) + expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]])) + + result = df.groupby("id")["decimals"].agg(lambda x: x.values.my_sum()) + tm.assert_series_equal(result, expected, check_names=False) + s = pd.Series(DecimalArray(data)) + result = s.groupby(np.array([0, 0, 0, 1, 1])).agg(lambda x: x.values.my_sum()) + tm.assert_series_equal(result, expected, check_names=False)