From c5f3a36f40120562d24f1ac2c83ee074f4aa432d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 12:42:00 -0700 Subject: [PATCH 01/14] de-kludge quantile, make interpolate_with_fill understand datetime64 --- pandas/core/internals/blocks.py | 40 +++++++++++-------------------- pandas/core/internals/managers.py | 2 +- pandas/core/missing.py | 15 +++++++++++- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4ca867b1088e7..09729de9501b0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1223,7 +1223,6 @@ def _interpolate_with_fill( fill_value=fill_value, dtype=self.dtype, ) - values = self._try_coerce_result(values) blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) @@ -1526,18 +1525,7 @@ def quantile(self, qs, interpolation="linear", axis=0): # We should always have ndim == 2 becase Series dispatches to DataFrame assert self.ndim == 2 - if self.is_datetimetz: - # TODO: cleanup this special case. - # We need to operate on i8 values for datetimetz - # but `Block.get_values()` returns an ndarray of objects - # right now. We need an API for "values to do numeric-like ops on" - values = self.values.view("M8[ns]") - - # TODO: NonConsolidatableMixin shape - # Usual shape inconsistencies for ExtensionBlocks - values = values[None, :] - else: - values = self.get_values() + values = self.get_values() is_empty = values.shape[axis] == 0 orig_scalar = not is_list_like(qs) @@ -1576,7 +1564,6 @@ def quantile(self, qs, interpolation="linear", axis=0): result = lib.item_from_zerodim(result) ndim = getattr(result, "ndim", None) or 0 - result = self._try_coerce_result(result) return make_block(result, placement=np.arange(len(result)), ndim=ndim) def _replace_coerce( @@ -1710,7 +1697,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) mask = _safe_reshape(mask, new_values.shape) new_values[mask] = new - new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] def _try_cast_result(self, result, dtype=None): @@ -2293,13 +2279,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - if isinstance(result, np.ndarray) and result.dtype.kind == "i": - # needed for _interpolate_with_ffill - result = result.view("M8[ns]") - return result - def to_native_types( self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs ): @@ -2564,6 +2543,19 @@ def equals(self, other): return False return (self.values.view("i8") == other.values.view("i8")).all() + def quantile(self, qs, interpolation="linear", axis=0): + naive = self.values.view("M8[ns]") + + # kludge for 2D block with 1D values + naive = naive.reshape(self.shape) + + blk = self.make_block(naive) + res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) + + # ravel is kludge for 2D block with 1D values, assumes column-like + aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) + return self.make_block_same_class(aware, ndim=res_blk.ndim) + class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () @@ -2639,10 +2631,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_result(self, result): - """ reverse of try_coerce_args / try_operate """ - return result - def should_store(self, value): return issubclass( value.dtype.type, np.timedelta64 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 344d41ed26943..f8511e6445ea5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -908,7 +908,7 @@ def fast_xs(self, loc): # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) for i, rl in enumerate(blk.mgr_locs): - result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + result[rl] = blk.iget((i, loc)) if is_extension_array_dtype(dtype): result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8f0abc91f7aef..9217612b01b9c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -463,6 +463,15 @@ def interpolate_2d( Perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result. """ + if is_datetime64tz_dtype(values): + naive = values.view("M8[ns]") + result = interpolate_2d( + naive, method=method, axis=axis, limit=limit, + fill_value=fill_value, dtype=dtype + ) + return type(values)._from_sequence(result, dtype=values.dtype) + + orig_values = values transf = (lambda x: x) if axis == 0 else (lambda x: x.T) @@ -470,7 +479,7 @@ def interpolate_2d( ndim = values.ndim if values.ndim == 1: if axis != 0: # pragma: no cover - raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") + raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: @@ -490,6 +499,10 @@ def interpolate_2d( if ndim == 1: values = values[0] + if orig_values.dtype.kind == 'M': + # convert float back to datetime64 + values = values.astype(orig_values.dtype) + return values From 519ef0d9eb9d6c0ab92c151d6b62ae361aa8fc14 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 13:23:09 -0700 Subject: [PATCH 02/14] remove no-lomnger-necessary --- pandas/core/internals/blocks.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 09729de9501b0..ed4267793a2a1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2456,15 +2456,7 @@ def _try_coerce_result(self, result): result = self._holder._from_sequence( result.astype(np.int64), freq=None, dtype=self.values.dtype ) - elif result.dtype == "M8[ns]": - # otherwise we get here via quantile and already have M8[ns] - result = self._holder._simple_new( - result, freq=None, dtype=self.values.dtype - ) - elif isinstance(result, np.datetime64): - # also for post-quantile - result = self._box_func(result) return result @property From 0b00695405e113fbbced898fd8a54f52406ae764 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 13:48:02 -0700 Subject: [PATCH 03/14] blackify --- pandas/core/missing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9217612b01b9c..6318bfcb83dd5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -466,8 +466,12 @@ def interpolate_2d( if is_datetime64tz_dtype(values): naive = values.view("M8[ns]") result = interpolate_2d( - naive, method=method, axis=axis, limit=limit, - fill_value=fill_value, dtype=dtype + naive, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=dtype, ) return type(values)._from_sequence(result, dtype=values.dtype) @@ -499,7 +503,7 @@ def interpolate_2d( if ndim == 1: values = values[0] - if orig_values.dtype.kind == 'M': + if orig_values.dtype.kind == "M": # convert float back to datetime64 values = values.astype(orig_values.dtype) From d531e96515041df523daf813fc36fd7757ba3115 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 15:16:43 -0700 Subject: [PATCH 04/14] remove _try_coerce_result --- pandas/core/groupby/ops.py | 11 ++++++++ pandas/core/internals/blocks.py | 48 +-------------------------------- 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index cc8aec4cc243b..bd947180c9d3d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -25,6 +25,7 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, @@ -452,6 +453,16 @@ def wrapper(*args, **kwargs): def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] + if is_datetime64tz_dtype(values): + # TODO: possible need to reshape? kludge can be avoided when + # 2D EA is allowed. + naive = values.view("M8[ns]") + result, names = self._cython_operation( + kind, naive, how=how, axis=axis, min_count=min_count, **kwargs + ) + result = type(values)(result.astype(np.int64), dtype=values.dtype) + return result, names + # can we do this operation with our cython functions # if not raise NotImplementedError diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ed4267793a2a1..b498d8df53144 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -417,9 +417,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): if self._can_hold_element(value): # equivalent: self._try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) - blocks = [ - b.make_block(values=self._try_coerce_result(b.values)) for b in blocks - ] return self._maybe_downcast(blocks, downcast) # we can't process the value, but nothing to do @@ -737,12 +734,7 @@ def _try_coerce_args(self, other): return other - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - return result - def _try_coerce_and_cast_result(self, result, dtype=None): - result = self._try_coerce_result(result) result = self._try_cast_result(result, dtype=dtype) return result @@ -1409,7 +1401,7 @@ def func(cond, values, other): try: fastres = expressions.where(cond, values, other) - return self._try_coerce_result(fastres) + return fastres except Exception as detail: if errors == "raise": raise TypeError( @@ -1874,20 +1866,6 @@ def _slice(self, slicer): return self.values[slicer] - def _try_cast_result(self, result, dtype=None): - """ - if we have an operation that operates on for example floats - we want to try to cast back to our EA here if possible - - result could be a 2-D numpy array, e.g. the result of - a numeric operation; but it must be shape (1, X) because - we by-definition operate on the ExtensionBlocks one-by-one - - result could also be an EA Array itself, in which case it - is already a 1-D array - """ - return result - def formatting_values(self): # Deprecating the ability to override _formatting_values. # Do the warning here, it's only user in pandas, since we @@ -2445,20 +2423,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - if isinstance(result, np.ndarray): - if result.ndim == 2: - # kludge for 2D blocks with 1D EAs - result = result[0, :] - if result.dtype == np.float64: - # needed for post-groupby.median - result = self._holder._from_sequence( - result.astype(np.int64), freq=None, dtype=self.values.dtype - ) - - return result - @property def _box_func(self): return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) @@ -3031,16 +2995,6 @@ def array_dtype(self): """ return np.object_ - def _try_coerce_result(self, result): - """ reverse of try_coerce_args """ - - # GH12564: CategoricalBlock is 1-dim only - # while returned results could be any dim - if (not is_categorical_dtype(result)) and isinstance(result, np.ndarray): - result = _block_shape(result, ndim=self.ndim) - - return result - def to_dense(self): # Categorical.get_values returns a DatetimeIndex for datetime # categories, so we can't simply use `np.asarray(self.values)` like From 4db6e026db311de6c057f153980415823902f97d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 15:42:40 -0700 Subject: [PATCH 05/14] passing --- pandas/core/groupby/generic.py | 2 +- pandas/core/internals/blocks.py | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4dc1dfcae0777..b64aac559a0ac 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -183,7 +183,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): dtype = block.values.dtype # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result, dtype=dtype) + result = block._try_cast_result(result, dtype=dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b498d8df53144..b6fdca82be107 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -734,10 +734,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_and_cast_result(self, result, dtype=None): - result = self._try_cast_result(result, dtype=dtype) - return result - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -931,7 +927,7 @@ def setitem(self, indexer, value): values[indexer] = value # coerce and try to infer the dtypes of the result - values = self._try_coerce_and_cast_result(values, dtype) + #values = self._try_cast_result(values, dtype) if transpose: values = values.T block = self.make_block(values) @@ -1442,8 +1438,8 @@ def func(cond, values, other): result = result.T # try to cast if requested - if try_cast: - result = self._try_cast_result(result) + #if try_cast: + # result = self._try_cast_result(result) return self.make_block(result) @@ -1456,7 +1452,7 @@ def func(cond, values, other): for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) - r = self._try_cast_result(taken) + r = self._try_cast_result(taken) # Note: removing this breaks stuff nb = self.make_block(r.T, placement=self.mgr_locs[m]) result_blocks.append(nb) @@ -2423,10 +2419,6 @@ def _try_coerce_args(self, other): return other - @property - def _box_func(self): - return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - def diff(self, n, axis=0): """1st discrete difference From ed69e699d8e27f21ff37431e7c4088ad680c00b9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 16:13:17 -0700 Subject: [PATCH 06/14] simplify try_cast_result --- pandas/core/groupby/generic.py | 4 +--- pandas/core/internals/blocks.py | 28 +++++++--------------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b64aac559a0ac..8ee637454eb2b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -180,10 +180,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): continue finally: if result is not no_result: - dtype = block.values.dtype - # see if we can cast the block back to the original dtype - result = block._try_cast_result(result, dtype=dtype) + result = block._try_cast_result(result) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b6fdca82be107..506b0849a4250 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -55,7 +55,6 @@ ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, - ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -690,27 +689,15 @@ def _can_hold_element(self, element): return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def _try_cast_result(self, result, dtype=None): + def _try_cast_result(self, result): """ try to cast the result to our original type, we may have roundtripped thru object in the mean-time """ - if dtype is None: - dtype = self.dtype - - if self.is_integer or self.is_bool or self.is_datetime: - pass - elif self.is_float and result.dtype == self.dtype: - # protect against a bool/object showing up here - if isinstance(dtype, str) and dtype == "infer": - return result - - # This is only reached via Block.setitem, where dtype is always - # either "infer", self.dtype, or values.dtype. - assert dtype == self.dtype, (dtype, self.dtype) + if self.is_float and result.dtype == self.dtype: return result # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) + return maybe_downcast_to_dtype(result, self.dtype) def _try_coerce_args(self, other): """ provide coercion to our input arguments """ @@ -1687,7 +1674,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new_values[mask] = new return [self.make_block(values=new_values)] - def _try_cast_result(self, result, dtype=None): + def _try_cast_result(self, result): return result def _get_unstack_items(self, unstacker, new_columns): @@ -1741,7 +1728,8 @@ def __init__(self, values, placement, ndim=None): super().__init__(values, placement, ndim) def _maybe_coerce_values(self, values): - """Unbox to an extension array. + """ + Unbox to an extension array. This will unbox an ExtensionArray stored in an Index or Series. ExtensionArrays pass through. No dtype coercion is done. @@ -1754,9 +1742,7 @@ def _maybe_coerce_values(self, values): ------- ExtensionArray """ - if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values._values - return values + return extract_array(values) @property def _holder(self): From 73bec10b3b3a198cc690edea9046400cd4bfdca4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 16:13:59 -0700 Subject: [PATCH 07/14] remove commented-out --- pandas/core/internals/blocks.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 506b0849a4250..0dd79ba675add 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -913,8 +913,6 @@ def setitem(self, indexer, value): else: values[indexer] = value - # coerce and try to infer the dtypes of the result - #values = self._try_cast_result(values, dtype) if transpose: values = values.T block = self.make_block(values) @@ -1424,10 +1422,6 @@ def func(cond, values, other): if transpose: result = result.T - # try to cast if requested - #if try_cast: - # result = self._try_cast_result(result) - return self.make_block(result) # might need to separate out blocks From 7eb559cde9fe0993564d1ac78dfe02c1bc20b520 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 27 Jul 2019 16:35:34 -0700 Subject: [PATCH 08/14] OK --- pandas/core/internals/blocks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0dd79ba675add..3eddb2f36f4d6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -693,9 +693,17 @@ def _try_cast_result(self, result): """ try to cast the result to our original type, we may have roundtripped thru object in the mean-time """ - if self.is_float and result.dtype == self.dtype: + if result.dtype == self.dtype: return result + if self.dtype.kind == result.dtype.kind: + # don't allow upcasts here (except if empty) + if result.dtype.itemsize <= self.dtype.itemsize and result.size: + return result + + if result.dtype.itemsize > self.dtype.itemsize: + return result.astype(self.dtype) + # may need to change the dtype here return maybe_downcast_to_dtype(result, self.dtype) From 47ef968bb6e6ecc3c1c4ee410c4740d46f1303c4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Aug 2019 21:25:31 -0700 Subject: [PATCH 09/14] OK --- pandas/core/dtypes/cast.py | 63 +++++++++++++++++++++++++++++++-- pandas/core/groupby/generic.py | 5 +-- pandas/core/groupby/ops.py | 4 +++ pandas/core/internals/blocks.py | 10 ++---- 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fd8536e38eee7..74b9d793b334d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -125,13 +125,14 @@ def trans(x): # noqa else: dtype = "object" - if isinstance(dtype, str): dtype = np.dtype(dtype) + try: - # don't allow upcasts here (except if empty) + # This can fail if we have SparseDType, which doesnt have itemsize if dtype.kind == result.dtype.kind: + # don't allow upcasts here (except if empty) if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): return result @@ -194,11 +195,69 @@ def trans(x): # noqa return PeriodArray(result, freq=dtype.freq) except Exception: + # TODO: catch more specific pass return result +# TODO: de-duplicate with maybe_downcast_to_dtype +def maybe_downcast_numeric(result, dtype): + """ try to cast to the specified dtype (e.g. convert back to bool/int + or could be an astype of float64->float32 + """ + + if result.dtype == dtype: + return result + + if dtype.kind == result.dtype.kind: + # don't allow upcasts here (except if empty) + if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): + return result + + if is_bool_dtype(dtype) or is_integer_dtype(dtype): + + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return result.astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + + # if we have any nulls, then we are done + if isna(arr).any() or not np.allclose( + arr, arr.astype(dtype), rtol=0 + ): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance( + r[0], (np.integer, np.floating, np.bool, int, float, bool) + ): + return result + + if ( + issubclass(result.dtype.type, (np.object_, np.number)) + and notna(result).all() + ): + new_result = result.astype(dtype) + try: + if np.allclose(new_result, result, rtol=0): + return new_result + except Exception: + + # comparison of an object dtype with a number type could + # hit here + if (new_result == result).all(): + return new_result + elif issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype): + return result.astype(dtype) + + + return result + + def maybe_upcast_putmask(result, mask, other): """ A safe version of putmask that potentially upcasts the result. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8ee637454eb2b..a8e7707ce8ae4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,7 +21,8 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype +from pandas.core.dtypes.cast import ( + maybe_convert_objects, maybe_downcast_to_dtype, maybe_downcast_numeric) from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, @@ -181,7 +182,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): finally: if result is not no_result: # see if we can cast the block back to the original dtype - result = block._try_cast_result(result) + result = maybe_downcast_numeric(result, block.dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bd947180c9d3d..06026b3574d50 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -463,6 +463,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): result = type(values)(result.astype(np.int64), dtype=values.dtype) return result, names + orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError @@ -576,6 +577,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): result = result.astype("float64") result[mask] = np.nan + if is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) + if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3eddb2f36f4d6..0a4c878034cd5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -22,6 +22,7 @@ maybe_infer_dtype_type, maybe_promote, maybe_upcast, + maybe_downcast_numeric, soft_convert_objects, ) from pandas.core.dtypes.common import ( @@ -693,17 +694,10 @@ def _try_cast_result(self, result): """ try to cast the result to our original type, we may have roundtripped thru object in the mean-time """ + return maybe_downcast_numeric(result, self.dtype) if result.dtype == self.dtype: return result - if self.dtype.kind == result.dtype.kind: - # don't allow upcasts here (except if empty) - if result.dtype.itemsize <= self.dtype.itemsize and result.size: - return result - - if result.dtype.itemsize > self.dtype.itemsize: - return result.astype(self.dtype) - # may need to change the dtype here return maybe_downcast_to_dtype(result, self.dtype) From 1bf2e4a6444f21cbfa2bc250a2608aa623d6340b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Aug 2019 21:29:33 -0700 Subject: [PATCH 10/14] Remove rebase mixup --- pandas/core/dtypes/cast.py | 57 -------------------------------------- 1 file changed, 57 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6c056be6410b7..4bb1deffd9524 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -233,63 +233,6 @@ def trans(x): return result -# TODO: de-duplicate with maybe_downcast_to_dtype -def maybe_downcast_numeric(result, dtype): - """ try to cast to the specified dtype (e.g. convert back to bool/int - or could be an astype of float64->float32 - """ - - if result.dtype == dtype: - return result - - if dtype.kind == result.dtype.kind: - # don't allow upcasts here (except if empty) - if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): - return result - - if is_bool_dtype(dtype) or is_integer_dtype(dtype): - - # if we don't have any elements, just astype it - if not np.prod(result.shape): - return result.astype(dtype) - - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - # if we have any nulls, then we are done - if isna(arr).any() or not np.allclose( - arr, arr.astype(dtype), rtol=0 - ): - return result - - # a comparable, e.g. a Decimal may slip in here - elif not isinstance( - r[0], (np.integer, np.floating, np.bool, int, float, bool) - ): - return result - - if ( - issubclass(result.dtype.type, (np.object_, np.number)) - and notna(result).all() - ): - new_result = result.astype(dtype) - try: - if np.allclose(new_result, result, rtol=0): - return new_result - except Exception: - - # comparison of an object dtype with a number type could - # hit here - if (new_result == result).all(): - return new_result - elif issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype): - return result.astype(dtype) - - - return result - - def maybe_upcast_putmask(result, mask, other): """ A safe version of putmask that potentially upcasts the result. From 53b60b8c96d04a9e245c14cac373fe42eec9d16b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 10:11:23 -0700 Subject: [PATCH 11/14] remove try_cast_result --- pandas/core/groupby/ops.py | 3 ++- pandas/core/internals/blocks.py | 16 +--------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e103bf4be2dae..a1b70dedab9ab 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -454,6 +454,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] orig_values = values + # FIXME: Why is this necessary? Shouldn't it be handled by the + # datetime64tz check already in place below? if is_datetime64tz_dtype(values): # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. @@ -464,7 +466,6 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): result = type(values)(result.astype(np.int64), dtype=values.dtype) return result, names - orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5fa30db790ee0..f877796083b14 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -687,17 +687,6 @@ def _can_hold_element(self, element): return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def _try_cast_result(self, result): - """ try to cast the result to our original type, we may have - roundtripped thru object in the mean-time - """ - return maybe_downcast_numeric(result, self.dtype) - if result.dtype == self.dtype: - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, self.dtype) - def _try_coerce_args(self, other): """ provide coercion to our input arguments """ @@ -1432,7 +1421,7 @@ def func(cond, values, other): for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) - r = self._try_cast_result(taken) # Note: removing this breaks stuff + r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) result_blocks.append(nb) @@ -1665,9 +1654,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new_values[mask] = new return [self.make_block(values=new_values)] - def _try_cast_result(self, result): - return result - def _get_unstack_items(self, unstacker, new_columns): """ Get the placement, values, and mask for a Block unstack. From 087082922664e1502c6b4f87557201c4b02da74f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 11:24:20 -0700 Subject: [PATCH 12/14] isort --- pandas/core/groupby/generic.py | 5 ++++- pandas/core/internals/blocks.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3361df5c53071..ea2bd22cccc3d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -22,7 +22,10 @@ from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( - maybe_convert_objects, maybe_downcast_to_dtype, maybe_downcast_numeric) + maybe_convert_objects, + maybe_downcast_numeric, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7a55a69d5aaf7..8c3cf7cc51495 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,11 +18,11 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, + maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, maybe_upcast, - maybe_downcast_numeric, soft_convert_objects, ) from pandas.core.dtypes.common import ( From 913272bd303eb79b7a6465da9dcd618703b8aef7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 14:20:42 -0700 Subject: [PATCH 13/14] remove redundant check --- pandas/core/groupby/ops.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a1b70dedab9ab..5708c9fb735c2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -454,18 +454,6 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] orig_values = values - # FIXME: Why is this necessary? Shouldn't it be handled by the - # datetime64tz check already in place below? - if is_datetime64tz_dtype(values): - # TODO: possible need to reshape? kludge can be avoided when - # 2D EA is allowed. - naive = values.view("M8[ns]") - result, names = self._cython_operation( - kind, naive, how=how, axis=axis, min_count=min_count, **kwargs - ) - result = type(values)(result.astype(np.int64), dtype=values.dtype) - return result, names - # can we do this operation with our cython functions # if not raise NotImplementedError @@ -585,7 +573,11 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): result = result.astype("float64") result[mask] = np.nan - if is_datetimelike and kind == "aggregate": + if ( + is_datetimelike + and kind == "aggregate" + and not is_datetime64tz_dtype(orig_values.dtype) + ): result = result.astype(orig_values.dtype) if kind == "aggregate" and self._filter_empty_groups and not counts.all(): From 2f1c05e49552f774edbc3bd6e48db0da1fbc628d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Aug 2019 14:36:22 -0700 Subject: [PATCH 14/14] move check back, making it prettier --- pandas/core/groupby/ops.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5708c9fb735c2..b0c629f017dd3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -573,13 +573,6 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): result = result.astype("float64") result[mask] = np.nan - if ( - is_datetimelike - and kind == "aggregate" - and not is_datetime64tz_dtype(orig_values.dtype) - ): - result = result.astype(orig_values.dtype) - if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] @@ -598,6 +591,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) return result, names