From 30c9b83447a6d5d578344ddfd48995ae0f3bb01f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 27 Jun 2020 05:24:03 +0000 Subject: [PATCH 01/47] add values.dtype.kind==f branch to array_with_unit_datetime --- pandas/_libs/tslib.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 44693d60486a9..7ff309b3725d6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -416,7 +416,6 @@ def array_with_unit_to_datetime( m = cast_from_unit(None, unit) if is_raise: - # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate @@ -429,9 +428,17 @@ def array_with_unit_to_datetime( fvalues = iresult.astype('f8') * m need_to_iterate = False + # GH20445 + if values.dtype.kind == "f": + fresult = values.astype('f8', casting='same_kind', copy=False) + # fill by comparing to NPY_NAT constant + mask = fresult == NPY_NAT + fresult[mask] = 0.0 + fvalues = fvalues.astype('f8') * m # FIXME: this line segfaults rn + need_to_iterate = False + # check the bounds if not need_to_iterate: - if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") @@ -599,7 +606,6 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed - # specify error conditions assert is_raise or is_ignore or is_coerce From cf67f903456e8742381b0f83eaa312529602708d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 27 Jun 2020 05:35:09 +0000 Subject: [PATCH 02/47] remove unnecessary styling changes --- pandas/_libs/tslib.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 7ff309b3725d6..faeb5ac829ee6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -416,6 +416,7 @@ def array_with_unit_to_datetime( m = cast_from_unit(None, unit) if is_raise: + # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate @@ -439,6 +440,7 @@ def array_with_unit_to_datetime( # check the bounds if not need_to_iterate: + if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") @@ -606,6 +608,7 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed + # specify error conditions assert is_raise or is_ignore or is_coerce From a69b28cf5ae65c1f4c9df2f80891eb7f1b2f4c94 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 27 Jun 2020 05:48:15 +0000 Subject: [PATCH 03/47] added cast_from_unit definition for float --- pandas/_libs/tslibs/conversion.pyx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0811ba22977fd..2f7c0dc0bbcc0 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -76,6 +76,15 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: frac = round(frac, p) return (base * m) + (frac * m) +cdef inline float cast_from_unit(object ts, str unit) except? -1: + """ return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p """ + cdef: + float m + int p + + # TO DO: fill in body + cpdef inline object precision_from_unit(str unit): """ From 9df9d4d8dbe854636a66f017cc9811618db7bc81 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:36:05 +0000 Subject: [PATCH 04/47] to_datetime: added astyping for floats --- pandas/_libs/tslib.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index faeb5ac829ee6..346c24b901aae 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -435,20 +435,25 @@ def array_with_unit_to_datetime( # fill by comparing to NPY_NAT constant mask = fresult == NPY_NAT fresult[mask] = 0.0 - fvalues = fvalues.astype('f8') * m # FIXME: this line segfaults rn + m_as_float = m + fvalues = fresult.astype('f8') * m_as_float need_to_iterate = False # check the bounds if not need_to_iterate: - if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - result = (iresult * m).astype('M8[ns]') - iresult = result.view('i8') - iresult[mask] = NPY_NAT - return result, tz - + if values.dtype.kind == 'i': + result = (iresult * m).astype('M8[ns]') + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz + elif values.dtype.kind == 'f': + result = (fresult * m_as_float).astype('M8[ns]') + fresult = result.view('f8') + fresult[mask] = NPY_NAT + return result, tz result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') @@ -608,7 +613,6 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed - # specify error conditions assert is_raise or is_ignore or is_coerce From 5746581165bf70fd263bc7b9a56da8ef4cba1ef9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:38:59 +0000 Subject: [PATCH 05/47] revert changes --- pandas/_libs/tslibs/conversion.pyx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2f7c0dc0bbcc0..e9c3ba912298f 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -76,16 +76,6 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: frac = round(frac, p) return (base * m) + (frac * m) -cdef inline float cast_from_unit(object ts, str unit) except? -1: - """ return a casting of the unit represented to nanoseconds - round the fractional part of a float to our precision, p """ - cdef: - float m - int p - - # TO DO: fill in body - - cpdef inline object precision_from_unit(str unit): """ Return a casting of the unit represented to nanoseconds + the precision From 6b9d4de824746ce64b03b6e0ddc8aee66f988240 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:39:35 +0000 Subject: [PATCH 06/47] revert changes --- pandas/_libs/tslibs/conversion.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index e9c3ba912298f..0811ba22977fd 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -76,6 +76,7 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: frac = round(frac, p) return (base * m) + (frac * m) + cpdef inline object precision_from_unit(str unit): """ Return a casting of the unit represented to nanoseconds + the precision From 0e3a8763389a549f35014f7374df37f938f15c7e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:40:45 +0000 Subject: [PATCH 07/47] revert styling change --- pandas/_libs/tslib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 346c24b901aae..9661dda105414 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -613,6 +613,7 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed + # specify error conditions assert is_raise or is_ignore or is_coerce From f1ae8f562db24046c1e75258cc1091ada507347a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:42:29 +0000 Subject: [PATCH 08/47] _libs/tslib.pyx added comments --- pandas/_libs/tslib.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9661dda105414..387b12555c546 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -444,6 +444,7 @@ def array_with_unit_to_datetime( if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") + # GH20445 if values.dtype.kind == 'i': result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') @@ -613,7 +614,7 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed - + # specify error conditions assert is_raise or is_ignore or is_coerce From 572363a2928fa1de6b2ca2789147d5a416710faa Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 01:06:23 +0000 Subject: [PATCH 09/47] revert pandas/_libs/tslib.pyx --- pandas/_libs/tslib.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 7ff309b3725d6..44693d60486a9 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -416,6 +416,7 @@ def array_with_unit_to_datetime( m = cast_from_unit(None, unit) if is_raise: + # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate @@ -428,17 +429,9 @@ def array_with_unit_to_datetime( fvalues = iresult.astype('f8') * m need_to_iterate = False - # GH20445 - if values.dtype.kind == "f": - fresult = values.astype('f8', casting='same_kind', copy=False) - # fill by comparing to NPY_NAT constant - mask = fresult == NPY_NAT - fresult[mask] = 0.0 - fvalues = fvalues.astype('f8') * m # FIXME: this line segfaults rn - need_to_iterate = False - # check the bounds if not need_to_iterate: + if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") @@ -606,6 +599,7 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed + # specify error conditions assert is_raise or is_ignore or is_coerce From 38bac1ada3eadac144c4cbd5e6f61fb4138d05fc Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:43:36 +0000 Subject: [PATCH 10/47] update Grouping.indicies to return for nan values --- pandas/core/groupby/grouper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 67003dffb90bb..3aaea43af3623 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -20,6 +20,7 @@ ) from pandas.core.dtypes.generic import ABCSeries +import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com @@ -558,7 +559,12 @@ def indices(self): return self.grouper.indices values = Categorical(self.grouper) - return values._reverse_indexer() + + # GH35014 + res = values._reverse_indexer() + res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] + print(res) + return res @property def codes(self) -> np.ndarray: From 65a29637e03da98f2485cc70aed9df26fc37a54e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:46:43 +0000 Subject: [PATCH 11/47] updated _GroupBy._get_index to return for nan values --- pandas/core/groupby/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..929efdecdcd1b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -624,7 +624,10 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return self._get_indices([name])[0] + if isna(name): + return [i for i, v in enumerate(self.indices) if isna(v)] + else: + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): @@ -896,6 +899,7 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): + print(f"name={name}, group={group}") raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From 7df44d10f08d1458f449208ffcdb5f74387a5fda Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:51:08 +0000 Subject: [PATCH 12/47] revert accidental changes --- pandas/_libs/tslib.pyx | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 387b12555c546..44693d60486a9 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -429,32 +429,17 @@ def array_with_unit_to_datetime( fvalues = iresult.astype('f8') * m need_to_iterate = False - # GH20445 - if values.dtype.kind == "f": - fresult = values.astype('f8', casting='same_kind', copy=False) - # fill by comparing to NPY_NAT constant - mask = fresult == NPY_NAT - fresult[mask] = 0.0 - m_as_float = m - fvalues = fresult.astype('f8') * m_as_float - need_to_iterate = False - # check the bounds if not need_to_iterate: + if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - # GH20445 - if values.dtype.kind == 'i': - result = (iresult * m).astype('M8[ns]') - iresult = result.view('i8') - iresult[mask] = NPY_NAT - return result, tz - elif values.dtype.kind == 'f': - result = (fresult * m_as_float).astype('M8[ns]') - fresult = result.view('f8') - fresult[mask] = NPY_NAT - return result, tz + result = (iresult * m).astype('M8[ns]') + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz + result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') From 4eb8a17c99a0253e5f5af2d1688c920fc25893b9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:52:48 +0000 Subject: [PATCH 13/47] revert accidental changes --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 929efdecdcd1b..195bd1422c1b3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -899,7 +899,7 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): - print(f"name={name}, group={group}") + raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From 21bb8e745e91e42c031b5cf0a58a3fe85900d608 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:53:20 +0000 Subject: [PATCH 14/47] revert accidental changes --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 195bd1422c1b3..58bfd73f55fd6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -899,7 +899,6 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): - raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From 0daca6677d741f0e89dd22d85008936b47b1de66 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 03:00:02 +0000 Subject: [PATCH 15/47] styling change --- pandas/core/groupby/grouper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3aaea43af3623..550d736329ae5 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -563,7 +563,6 @@ def indices(self): # GH35014 res = values._reverse_indexer() res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] - print(res) return res @property From 0c0e28935e453f28b01791c52bb922179bdb6cc7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 2 Jul 2020 18:27:58 +0000 Subject: [PATCH 16/47] added tests --- pandas/tests/groupby/test_groupby_dropna.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a525d306e9f5..4e65760df50aa 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,27 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +def test_slice_groupby_then_transform(): + # GH35014 + + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=False) + + res = gb.transform(len) + expected = pd.DataFrame({"B": [2, 2, 1, 1]}) + tm.assert_frame_equal(res, expected) + + gb_slice = gb[["B"]] + res = gb_slice.transform(len) + expected = pd.DataFrame({"B": [2, 2, 1, 1]}) + tm.assert_frame_equal(res, expected) + + gb_slice = gb["B"] + res = gb["B"].transform(len) + expected = pd.Series([2, 2, 1, 1]) + tm.assert_series_equal(res, expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From a804909bea0ffe0220be30eed6af53268dae29fd Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 6 Jul 2020 20:27:22 +0000 Subject: [PATCH 17/47] fixed groupby/groupby.py's _get_indicies --- pandas/core/groupby/groupby.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 58bfd73f55fd6..98a7aba3430ca 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -575,6 +575,7 @@ def _get_indices(self, names): Safe get multiple indices, translate keys for datelike to underlying repr. """ + print(f"names={names}") def get_converter(s): # possibly convert to the actual key types @@ -618,16 +619,20 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - return [self.indices.get(name, []) for name in names] + res = [] + for name in names: + if isna(name): + res += [v for k, v in self.indices.items() if isna(k)] + else: + res += [self.indices.get(name, [])] + + return res def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - if isna(name): - return [i for i, v in enumerate(self.indices) if isna(v)] - else: - return self._get_indices([name])[0] + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): From 5e4419e34f5a0a7ea1071055b9ab63b987e3bad8 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 6 Jul 2020 20:28:36 +0000 Subject: [PATCH 18/47] removed debug statement --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 98a7aba3430ca..9be80c24ea167 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -575,7 +575,6 @@ def _get_indices(self, names): Safe get multiple indices, translate keys for datelike to underlying repr. """ - print(f"names={names}") def get_converter(s): # possibly convert to the actual key types From 1e694c97824a58bb4f8356ea04083389172b18f0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 04:52:45 +0000 Subject: [PATCH 19/47] fixed naming error in test --- pandas/tests/groupby/test_groupby_dropna.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4e65760df50aa..f81f6a1bf70c6 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -179,7 +179,9 @@ def test_slice_groupby_then_transform(): gb_slice = gb["B"] res = gb["B"].transform(len) - expected = pd.Series([2, 2, 1, 1]) + expected = pd.Series(data=[2, 2, 1, 1], name="B") + print(f"res={res}") + print(f"expected={expected}") tm.assert_series_equal(res, expected) From d5e1a3b46f88484be5a1b0edab901d6b3e60ebe2 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:14:06 +0000 Subject: [PATCH 20/47] remove type coercion block --- pandas/core/groupby/generic.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dab8475d9580c..618460ecd3026 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -38,7 +38,6 @@ maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -46,7 +45,6 @@ is_bool, is_integer_dtype, is_interval_dtype, - is_numeric_dtype, is_object_dtype, is_scalar, needs_i8_conversion, @@ -528,13 +526,6 @@ def _transform_general( else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* user-defined funcs - # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) - result.name = self._selected_obj.name result.index = self._selected_obj.index return result From 91947c5a6e1a842e697dbed3ab74ea91155e4453 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:41:55 +0000 Subject: [PATCH 21/47] added missing values handing for _GroupBy.get_group method --- pandas/core/groupby/groupby.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9be80c24ea167..8006bd5eb04af 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,6 +54,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +import pandas as pd from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, DatetimeArray @@ -631,7 +632,8 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return self._get_indices([name])[0] + res = self._get_indices([name]) + return res[0] if res else [] @cache_readonly def _selected_obj(self): @@ -809,7 +811,10 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(name) + if pd.isna(name): + inds = self._get_index(np.nan) + else: + inds = self._get_index(name) if not len(inds): raise KeyError(name) From ce80f7cf01d7f8537b8619523e645568032adb2d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:43:44 +0000 Subject: [PATCH 22/47] updated indicies for case dropna=True --- pandas/core/groupby/grouper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 550d736329ae5..7c01f3a58093f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -562,7 +562,10 @@ def indices(self): # GH35014 res = values._reverse_indexer() - res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] + if self.dropna is False: + nan_locs = [i for i, v in enumerate(values) if pd.isna(v)] + if nan_locs: + res[np.nan] = nan_locs return res @property From 5c992d2981d62aa1ead140d1613fe31d5544931b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:45:49 +0000 Subject: [PATCH 23/47] cleaned up syntax --- pandas/core/groupby/grouper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7c01f3a58093f..65bb5508934c3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -562,10 +562,8 @@ def indices(self): # GH35014 res = values._reverse_indexer() - if self.dropna is False: - nan_locs = [i for i, v in enumerate(values) if pd.isna(v)] - if nan_locs: - res[np.nan] = nan_locs + if self.dropna is False and any(pd.isna(v) for v in values): + res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] return res @property From 15215fe58a5dbe761da7fe7c230fc5d3923591e3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:48:12 +0000 Subject: [PATCH 24/47] cleaned up syntax --- pandas/core/groupby/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8006bd5eb04af..823dd75cfa2eb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -811,10 +811,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - if pd.isna(name): - inds = self._get_index(np.nan) - else: - inds = self._get_index(name) + inds = self._get_index(np.nan) if pd.isna(name) else self._get_index(name) if not len(inds): raise KeyError(name) From 30c2fb598f95e7d390aa03786c47c0c33229ba75 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:07:23 +0000 Subject: [PATCH 25/47] removed print statements --- pandas/tests/groupby/test_groupby_dropna.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index f81f6a1bf70c6..3f158e99fab31 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -180,8 +180,6 @@ def test_slice_groupby_then_transform(): gb_slice = gb["B"] res = gb["B"].transform(len) expected = pd.Series(data=[2, 2, 1, 1], name="B") - print(f"res={res}") - print(f"expected={expected}") tm.assert_series_equal(res, expected) From 0746a76606d9952240276279826e1e9ab3f0e131 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:36:42 +0000 Subject: [PATCH 26/47] _transform_general: add a check that we don't accidentally upcast --- pandas/core/groupby/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 618460ecd3026..8d64c7279285d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -38,6 +38,7 @@ maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, + maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -45,6 +46,7 @@ is_bool, is_integer_dtype, is_interval_dtype, + is_numeric_dtype, is_object_dtype, is_scalar, needs_i8_conversion, @@ -526,6 +528,15 @@ def _transform_general( else: result = self.obj._constructor(dtype=np.float64) + # we will only try to coerce the result type if + # we have a numeric dtype, as these are *always* user-defined funcs + # the cython take a different path (and casting) + # make sure we don't accidentally upcast (GH35014) + types = ["bool", "int64", "float64"] + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): + result = maybe_downcast_to_dtype(result, dtype) + result.name = self._selected_obj.name result.index = self._selected_obj.index return result From d4316cde2cf0b41c1da9d2781559c7b2c06dc7bf Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:51:39 +0000 Subject: [PATCH 27/47] _transform_general: add int32, float32 to upcasting check --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8d64c7279285d..21b5670747a27 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -532,7 +532,7 @@ def _transform_general( # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) # make sure we don't accidentally upcast (GH35014) - types = ["bool", "int64", "float64"] + types = ["bool", "int32", "int64", "float32", "float64"] dtype = self._selected_obj.dtype if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): result = maybe_downcast_to_dtype(result, dtype) From 7a4315574eb7e5b3084938cb1ed0a3e643e763a0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:55:35 +0000 Subject: [PATCH 28/47] rewrite for loop as list comprehension --- pandas/core/groupby/groupby.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 823dd75cfa2eb..3ce0154105d23 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -619,12 +619,12 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - res = [] - for name in names: - if isna(name): - res += [v for k, v in self.indices.items() if isna(k)] - else: - res += [self.indices.get(name, [])] + res = [ + [v for k, v in self.indices.items() if isna(k)] + if isna(name) + else self.indices.get(name, []) + for name in names + ] return res From 2bd58859d5ca7fbed6cd03190161d4a5dff6c69c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:05:14 +0000 Subject: [PATCH 29/47] rewrote if statement as dict comp + ternary --- pandas/core/groupby/grouper.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 65bb5508934c3..14ccffba0dad9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -561,10 +561,12 @@ def indices(self): values = Categorical(self.grouper) # GH35014 - res = values._reverse_indexer() - if self.dropna is False and any(pd.isna(v) for v in values): - res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] - return res + reverse_indexer = values._reverse_indexer() + return ( + {**reverse_indexer, pd.NaT: [i for i, v in enumerate(values) if pd.isna(v)]} + if not self.dropna and any(pd.isna(v) for v in values) + else reverse_indexer + ) @property def codes(self) -> np.ndarray: From 550985fc94b64da80ece71dc36ddbf4a7e37f61f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:37:36 +0000 Subject: [PATCH 30/47] fixed small bug in list comp in groupby/groupby.py --- pandas/core/groupby/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3ce0154105d23..d3a4008d0f4be 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -622,10 +622,11 @@ def get_converter(s): res = [ [v for k, v in self.indices.items() if isna(k)] if isna(name) - else self.indices.get(name, []) + else [self.indices.get(name, [])] for name in names ] + print(f"groupby.py res={res}") return res def _get_index(self, name): @@ -811,7 +812,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(np.nan) if pd.isna(name) else self._get_index(name) + inds = self._get_index(pd.NaT) if pd.isna(name) else self._get_index(name) if not len(inds): raise KeyError(name) From 57a8da40620f929140b2deb5c92c53a7a642c6e1 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:40:17 +0000 Subject: [PATCH 31/47] deleted debug statement in groupby/groupby.py --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d3a4008d0f4be..6200335854cdc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -626,7 +626,6 @@ def get_converter(s): for name in names ] - print(f"groupby.py res={res}") return res def _get_index(self, name): From c1e7bcec4796ec348a544679656b2e7a8f142c8b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:52:44 +0000 Subject: [PATCH 32/47] rewrite _get_index using next_iter to set default value --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6200335854cdc..f9ee765b8ab42 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -632,8 +632,7 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - res = self._get_indices([name]) - return res[0] if res else [] + return next(iter(self._get_indices([name])), []) @cache_readonly def _selected_obj(self): From 62f52b8f44c98b4029f3a60044cc1bbe454d4b47 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 15:30:21 +0000 Subject: [PATCH 33/47] update exepcted test_groupby_nat_exclude for new missing values handling --- pandas/tests/groupby/test_groupby.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d040b8e6955a..0299356216c50 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1252,13 +1252,15 @@ def test_groupby_nat_exclude(): } for k in grouped.indices: + if pd.isna(k): + continue # GH 35014 tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) - with pytest.raises(KeyError, match=r"^NaT$"): - grouped.get_group(pd.NaT) + # GH35014 + grouped.get_group(pd.NaT) nan_df = DataFrame( {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} @@ -1268,6 +1270,7 @@ def test_groupby_nat_exclude(): for key in ["nan", "nat"]: grouped = nan_df.groupby(key) + print(f"grouped.__dict__={grouped.__dict__}") assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} From ef3c1992d41af5b00614be6aebe04c7fe83bcd2a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 15:46:14 +0000 Subject: [PATCH 34/47] remove print statement --- pandas/tests/groupby/test_groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0299356216c50..9f0da6b01383a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1270,7 +1270,6 @@ def test_groupby_nat_exclude(): for key in ["nan", "nat"]: grouped = nan_df.groupby(key) - print(f"grouped.__dict__={grouped.__dict__}") assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} From b55021dc79ac4bd0fcdfac029a7e8712a838cd86 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 8 Jul 2020 17:53:03 +0000 Subject: [PATCH 35/47] removed xfail tests --- pandas/tests/io/json/test_pandas.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10f49b9b81528..aef55f63e866e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1250,21 +1250,11 @@ def test_to_json_large_numbers(self, bigNum): json = series.to_json() expected = '{"articleId":' + str(bigNum) + "}" assert json == expected - # GH 20599 - with pytest.raises(ValueError): - json = StringIO(json) - result = read_json(json) - tm.assert_series_equal(series, result) df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) json = df.to_json() expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - # GH 20599 - with pytest.raises(ValueError): - json = StringIO(json) - result = read_json(json) - tm.assert_frame_equal(df, result) def test_read_json_large_numbers(self): # GH18842 From 15c1c333f9acbde47554594c644ee4f1ee2536df Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:16:40 +0000 Subject: [PATCH 36/47] reworked solution --- pandas/core/groupby/groupby.py | 15 +++++---------- pandas/core/groupby/grouper.py | 8 +++----- pandas/tests/groupby/test_groupby.py | 5 ++--- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f9ee765b8ab42..c29ad20ed60a7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -619,21 +619,16 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - res = [ - [v for k, v in self.indices.items() if isna(k)] - if isna(name) - else [self.indices.get(name, [])] - for name in names - ] - - return res + return [self.indices.get(name, []) for name in names] def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return next(iter(self._get_indices([name])), []) - + if isna(name): + return self._get_indices([pd.NaT])[0] + else: + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 14ccffba0dad9..57b7afabfb075 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -562,11 +562,9 @@ def indices(self): # GH35014 reverse_indexer = values._reverse_indexer() - return ( - {**reverse_indexer, pd.NaT: [i for i, v in enumerate(values) if pd.isna(v)]} - if not self.dropna and any(pd.isna(v) for v in values) - else reverse_indexer - ) + res = {**reverse_indexer, pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)])} if not self.dropna and any(pd.isna(v) for v in values) else reverse_indexer + print(f"grouper.py Grouping.indices returns {res}") + return res @property def codes(self) -> np.ndarray: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9f0da6b01383a..5bf2411cf5a0d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1252,15 +1252,14 @@ def test_groupby_nat_exclude(): } for k in grouped.indices: - if pd.isna(k): - continue # GH 35014 tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) # GH35014 - grouped.get_group(pd.NaT) + with pytest.raises(KeyError): + grouped.get_group(pd.NaT) nan_df = DataFrame( {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} From 657c13bf9a12759f11becc52ae58b4d6c7793683 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:20:38 +0000 Subject: [PATCH 37/47] fixed PEP8 issue --- pandas/core/groupby/grouper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 57b7afabfb075..efdaa4afe3511 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -562,9 +562,13 @@ def indices(self): # GH35014 reverse_indexer = values._reverse_indexer() - res = {**reverse_indexer, pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)])} if not self.dropna and any(pd.isna(v) for v in values) else reverse_indexer - print(f"grouper.py Grouping.indices returns {res}") - return res + if not self.dropna and any(pd.isna(v) for v in values): + return { + **reverse_indexer, + pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]) + } + else: + return reverse_indexer @property def codes(self) -> np.ndarray: From 70e3a19995819e9f18fabb66a42b4ae63e28ea94 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:55:42 +0000 Subject: [PATCH 38/47] run pre-commit checks --- pandas/tests/groupby/test_groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5bf2411cf5a0d..0d040b8e6955a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1257,8 +1257,7 @@ def test_groupby_nat_exclude(): tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) - # GH35014 - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) nan_df = DataFrame( From 5ace6acde2f81693754cf5fd49bbe4e86498eecc Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 18:25:52 +0000 Subject: [PATCH 39/47] styling fix --- pandas/core/groupby/grouper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index efdaa4afe3511..0b5593c3e0a5a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -564,11 +564,11 @@ def indices(self): reverse_indexer = values._reverse_indexer() if not self.dropna and any(pd.isna(v) for v in values): return { - **reverse_indexer, - pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]) - } + **reverse_indexer, + pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]), + } else: - return reverse_indexer + return reverse_indexer @property def codes(self) -> np.ndarray: From 90e9b6a108d8b1da13183233aa1e642ee8a47fcc Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 19:10:14 +0000 Subject: [PATCH 40/47] update whatnew + styling improvements --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/groupby.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 986ee371566cd..86915248cbc66 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -990,6 +990,7 @@ Missing - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) - passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c29ad20ed60a7..e206782a948d1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -629,6 +629,7 @@ def _get_index(self, name): return self._get_indices([pd.NaT])[0] else: return self._get_indices([name])[0] + @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy From be0557581e43517ca7b0e3afe2e0f98cddecf8da Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 11 Jul 2020 07:42:12 +0000 Subject: [PATCH 41/47] add read_json tests --- pandas/tests/io/json/test_pandas.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index aef55f63e866e..17f13e0f0050b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1256,6 +1256,24 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected + @pytest.mark.parametrize("bigNum", [2**64 + 1, -(2**64 + 2)]) + def test_read_json_large_numbers(self, bigNum): + # GH20599 + + series = Series(bigNum, dtype=object, index=["articleId"]) + json = '{"articleId":' + str(bigNum) + "}" + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) + tm.assert_series_equal(series, result) + + df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) + json = '{"0":{"articleId":' + str(bigNum) + "}}" + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) + tm.assert_frame_equal(df, result) + def test_read_json_large_numbers(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' From 319ae661eb7898b2b8a0e0c78a64f4f303de8d31 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 08:24:13 -0500 Subject: [PATCH 42/47] Fixups --- pandas/tests/io/json/test_pandas.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 17f13e0f0050b..052aa03f78912 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1256,25 +1256,25 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [2**64 + 1, -(2**64 + 2)]) + @pytest.mark.parametrize("bigNum", [2 ** 64 + 1, -(2 ** 64 + 2)]) def test_read_json_large_numbers(self, bigNum): # GH20599 series = Series(bigNum, dtype=object, index=["articleId"]) json = '{"articleId":' + str(bigNum) + "}" - with pytest.raises(ValueError): - json = StringIO(json) - result = read_json(json) + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) tm.assert_series_equal(series, result) df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) json = '{"0":{"articleId":' + str(bigNum) + "}}" - with pytest.raises(ValueError): - json = StringIO(json) - result = read_json(json) + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) tm.assert_frame_equal(df, result) - def test_read_json_large_numbers(self): + def test_read_json_large_numbers2(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' json = StringIO(json) From 7a78da5c83dab468901b8fa861e08172083d63e3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 14 Jul 2020 19:24:36 +0000 Subject: [PATCH 43/47] fixed git mistake --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/groupby/generic.py | 4 +--- pandas/core/groupby/groupby.py | 8 ++------ pandas/core/groupby/grouper.py | 11 +---------- pandas/tests/groupby/test_groupby_dropna.py | 20 -------------------- 5 files changed, 4 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 64e473acf37c0..a4c107ddefd7b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -992,7 +992,6 @@ Missing - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) - passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) -- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e9558878f3fbb..1f49ee2b0b665 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -548,10 +548,8 @@ def _transform_general( # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - # make sure we don't accidentally upcast (GH35014) - types = ["bool", "int32", "int64", "float32", "float64"] dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): + if is_numeric_dtype(dtype): result = maybe_downcast_to_dtype(result, dtype) result.name = self._selected_obj.name diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e206782a948d1..d039b715b3c08 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,7 +54,6 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna -import pandas as pd from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, DatetimeArray @@ -625,10 +624,7 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - if isna(name): - return self._get_indices([pd.NaT])[0] - else: - return self._get_indices([name])[0] + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): @@ -806,7 +802,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(pd.NaT) if pd.isna(name) else self._get_index(name) + inds = self._get_index(name) if not len(inds): raise KeyError(name) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 0b5593c3e0a5a..76a7c1ccb0b6a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -20,7 +20,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com @@ -560,15 +559,7 @@ def indices(self): values = Categorical(self.grouper) - # GH35014 - reverse_indexer = values._reverse_indexer() - if not self.dropna and any(pd.isna(v) for v in values): - return { - **reverse_indexer, - pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]), - } - else: - return reverse_indexer + return values._reverse_indexer() @property def codes(self) -> np.ndarray: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 3f158e99fab31..bbf71d59be140 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,26 +162,6 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) -def test_slice_groupby_then_transform(): - # GH35014 - - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) - gb = df.groupby("A", dropna=False) - - res = gb.transform(len) - expected = pd.DataFrame({"B": [2, 2, 1, 1]}) - tm.assert_frame_equal(res, expected) - - gb_slice = gb[["B"]] - res = gb_slice.transform(len) - expected = pd.DataFrame({"B": [2, 2, 1, 1]}) - tm.assert_frame_equal(res, expected) - - gb_slice = gb["B"] - res = gb["B"].transform(len) - expected = pd.Series(data=[2, 2, 1, 1], name="B") - tm.assert_series_equal(res, expected) - @pytest.mark.parametrize( "dropna, tuples, outputs", From 128173c30d127661f22993d48c7acfc25ce72a64 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 14 Jul 2020 19:28:15 +0000 Subject: [PATCH 44/47] minimize diff --- pandas/core/groupby/grouper.py | 1 - pandas/tests/groupby/test_groupby_dropna.py | 1 - 2 files changed, 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 76a7c1ccb0b6a..67003dffb90bb 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -558,7 +558,6 @@ def indices(self): return self.grouper.indices values = Categorical(self.grouper) - return values._reverse_indexer() @property diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index bbf71d59be140..1a525d306e9f5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,7 +162,6 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "dropna, tuples, outputs", [ From fc5bce69c53109e2f43b2f8d84be12e08d37a3f5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 14 Jul 2020 22:54:01 +0000 Subject: [PATCH 45/47] fix input to test --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 052aa03f78912..d0fb504c9635f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1256,7 +1256,7 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [2 ** 64 + 1, -(2 ** 64 + 2)]) + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) def test_read_json_large_numbers(self, bigNum): # GH20599 From 1f2fa9e1b26258e6d7f67e554a0978adbfd4acf4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 19:41:20 -0500 Subject: [PATCH 46/47] xfail --- pandas/tests/io/json/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d0fb504c9635f..37ce92047ba1a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1256,7 +1256,8 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + @pytest.mark.parametrize("bigNum", [2**64 + 1, -(2**64 + 2)]) + @pytest.mark.xfail(sys.maxsize <= 2**32, reason="GH-35279") def test_read_json_large_numbers(self, bigNum): # GH20599 From 72a612f5e6321ba5c805110fe905571713d26cfd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 20:42:09 -0500 Subject: [PATCH 47/47] fixup --- pandas/tests/io/json/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 37ce92047ba1a..97b53a6e66575 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1256,8 +1256,8 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [2**64 + 1, -(2**64 + 2)]) - @pytest.mark.xfail(sys.maxsize <= 2**32, reason="GH-35279") + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + @pytest.mark.skipif(sys.maxsize <= 2 ** 32, reason="GH-35279") def test_read_json_large_numbers(self, bigNum): # GH20599