From d66032869e2ed7330d9859157d2f464e3e02d4b1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jul 2019 11:38:19 -0700 Subject: [PATCH 01/12] use extract_array --- pandas/core/ops/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 3a5dfe6700bd2..b24a4b2514092 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -39,18 +39,20 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeArray, + ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCTimedeltaArray, + ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna import pandas as pd from pandas._typing import ArrayLike -import pandas.core.common as com +from pandas.core.construction import extract_array from . import missing from .docstrings import ( @@ -1017,7 +1019,7 @@ def wrapper(left, right): right = np.broadcast_to(right, left.shape) right = pd.TimedeltaIndex(right) - assert isinstance(right, (pd.TimedeltaIndex, ABCTimedeltaArray, ABCSeries)) + assert isinstance(right, (ABCTimedeltaIndex, ABCTimedeltaArray, ABCSeries)) try: result = op(left._values, right) except NullFrequencyError: @@ -1035,7 +1037,7 @@ def wrapper(left, right): # does inference in the case where `result` has object-dtype. 
return construct_result(left, result, index=left.index, name=res_name) - elif isinstance(right, (ABCDatetimeArray, pd.DatetimeIndex)): + elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)): result = op(left._values, right) return construct_result(left, result, index=left.index, name=res_name) @@ -1231,7 +1233,7 @@ def wrapper(self, other, axis=None): ) # always return a full value series here - res_values = com.values_from_object(res) + res_values = extract_array(res, extract_numpy=True) return self._constructor( res_values, index=self.index, name=res_name, dtype="bool" ) From 7a7eb3254edb8e342ae3fa2e18096161f10fe4a5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 11:30:44 -0700 Subject: [PATCH 02/12] cleanups --- pandas/_libs/reduction.pyx | 2 +- pandas/_libs/tslibs/timedeltas.pyx | 3 ++- pandas/core/apply.py | 4 ++-- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 15 ++------------- pandas/core/arrays/datetimes.py | 1 - pandas/core/arrays/period.py | 1 - pandas/core/arrays/timedeltas.py | 1 - pandas/core/groupby/groupby.py | 6 +++--- pandas/core/internals/concat.py | 10 ++++------ pandas/tests/groupby/test_bin_groupby.py | 12 +++++++----- 11 files changed, 22 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 739ac0ed397ca..5ad1d71ff8ebb 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -628,7 +628,7 @@ cdef class BlockSlider: arr.shape[1] = 0 -def reduce(arr, f, axis=0, dummy=None, labels=None): +def do_reduce(arr, f, axis=0, dummy=None, labels=None): """ Parameters diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6a32553fe2d38..d24aafae0967d 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1280,7 +1280,8 @@ class Timedelta(_Timedelta): else: raise ValueError( "Value must be Timedelta, string, integer, " - "float, timedelta or convertible") + 
"float, timedelta or convertible, not {typ}" + .format(typ=type(value).__name__)) if is_timedelta64_object(value): value = value.view('i8') diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2246bbfde636d..b7995f20ce2b7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -221,7 +221,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.reduce(self.values, self.f, axis=self.axis) + result = reduction.do_reduce(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +281,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.reduce( + result = reduction.do_reduce( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b16217d5d0a32..d22b4bd4d3f2b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2703,7 +2703,7 @@ def _convert_to_list_like(list_like): elif is_scalar(list_like): return [list_like] else: - # is this reached? + # TODO: is this reached? 
return [list_like] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f86b307e5ede3..599300bc7973a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -57,21 +57,10 @@ class AttributesMixin: _data = None # type: np.ndarray - @property - def _attributes(self): - # Inheriting subclass should implement _attributes as a list of strings - raise AbstractMethodError(self) - @classmethod def _simple_new(cls, values, **kwargs): raise AbstractMethodError(cls) - def _get_attributes_dict(self): - """ - return an attributes dict for my class - """ - return {k: getattr(self, k, None) for k in self._attributes} - @property def _scalar_type(self) -> Type[DatetimeLikeScalar]: """The scalar associated with this datelike @@ -224,8 +213,8 @@ class TimelikeOps: .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ -default 'raise' + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, + default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2e086c8ce8c34..e6f6fada05877 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -328,7 +328,6 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps # ----------------------------------------------------------------- # Constructors - _attributes = ["freq", "tz"] _dtype = None # type: Union[np.dtype, DatetimeTZDtype] _freq = None diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c290391278def..2e72f0dbf9b07 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -161,7 +161,6 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): # array priority higher than numpy scalars __array_priority__ = 1000 - _attributes = ["freq"] _typ = "periodarray" # ABCPeriodArray _scalar_type = Period diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index dd0b9a79c6dca..afd1e8203059e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -199,7 +199,6 @@ def dtype(self): # ---------------------------------------------------------------- # Constructors - _attributes = ["freq"] def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if isinstance(values, (ABCSeries, ABCIndexClass)): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 15b94e59c065c..12b9cf25687cf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1206,7 +1206,7 @@ def mean(self, *args, **kwargs): ) except GroupByError: raise - except Exception: # pragma: no cover + except Exception: with _group_selection_context(self): f = lambda x: x.mean(axis=self.axis, **kwargs) return self._python_agg_general(f) @@ -1232,7 +1232,7 @@ def median(self, **kwargs): ) except GroupByError: raise - except Exception: # pragma: no cover + except Exception: def f(x): if isinstance(x, np.ndarray): 
@@ -2470,7 +2470,7 @@ def groupby(obj, by, **kwds): from pandas.core.groupby.generic import DataFrameGroupBy klass = DataFrameGroupBy - else: # pragma: no cover + else: raise TypeError("invalid type: {}".format(obj)) return klass(obj, by, **kwds) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 121c61d8d3623..7eaec0687b790 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -183,7 +183,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): fill_value = upcasted_na if self.is_na: - if getattr(self.block, "is_object", False): + if self.block.is_object: # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls @@ -191,18 +191,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): if len(values) and values[0] is None: fill_value = None - if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype( - empty_dtype - ): + if self.block.is_datetimetz or is_datetime64tz_dtype(empty_dtype): if self.block is None: array = empty_dtype.construct_array_type() return array( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) pass - elif getattr(self.block, "is_categorical", False): + elif self.block.is_categorical: pass - elif getattr(self.block, "is_extension", False): + elif self.block.is_extension: pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index b240876de92b1..ce249e9de3497 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -126,27 +126,29 @@ def test_int_index(self): from pandas.core.series import Series arr = np.random.randn(100, 4) - result = reduction.reduce(arr, np.sum, labels=Index(np.arange(4))) + result = reduction.do_reduce(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = 
reduction.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + result = reduction.do_reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.reduce(arr, np.sum, dummy=dummy, labels=Index(np.arange(4))) + result = reduction.do_reduce( + arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) + ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.reduce( + result = reduction.do_reduce( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.reduce( + result = reduction.do_reduce( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) assert_almost_equal(result, expected) From b750f06ce3fec1a27b2f3e8ac06708a9082a9880 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 11:53:58 -0700 Subject: [PATCH 03/12] remove apparently-unreachable value_counts --- pandas/core/groupby/generic.py | 125 --------------------------------- 1 file changed, 125 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 811836d0e8a4d..ac390b9388c31 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,13 +23,10 @@ from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, - is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -1195,128 +1192,6 @@ def describe(self, **kwargs): return result.T return result.unstack() - def value_counts( - self, normalize=False, sort=True, ascending=False, bins=None, dropna=True - ): - - from pandas.core.reshape.tile import cut - from pandas.core.reshape.merge import _get_join_indexers - - if bins is not 
None and not np.iterable(bins): - # scalar bins cannot be done at top level - # in a backward compatible way - return self.apply( - Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - ) - - ids, _, _ = self.grouper.group_info - val = self.obj._internal_get_values() - - # groupby removes null keys from groupings - mask = ids != -1 - ids, val = ids[mask], val[mask] - - if bins is None: - lab, lev = algorithms.factorize(val, sort=True) - llab = lambda lab, inc: lab[inc] - else: - - # lab is a Categorical with categories an IntervalIndex - lab = cut(Series(val), bins, include_lowest=True) - lev = lab.cat.categories - lab = lev.take(lab.cat.codes) - llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] - - if is_interval_dtype(lab): - # TODO: should we do this inside II? - sorter = np.lexsort((lab.left, lab.right, ids)) - else: - sorter = np.lexsort((lab, ids)) - - ids, lab = ids[sorter], lab[sorter] - - # group boundaries are where group ids change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - - # new values are where sorted labels change - lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) - inc = np.r_[True, lchanges] - inc[idx] = True # group boundaries are also new values - out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts - - # num. 
of times each group should be repeated - rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) - - # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] - names = self.grouper.names + [self._selection_name] - - if dropna: - mask = labels[-1] != -1 - if mask.all(): - dropna = False - else: - out, labels = out[mask], [label[mask] for label in labels] - - if normalize: - out = out.astype("float") - d = np.diff(np.r_[idx, len(ids)]) - if dropna: - m = ids[lab == -1] - np.add.at(d, m, -1) - acc = rep(d)[mask] - else: - acc = rep(d) - out /= acc - - if sort and bins is None: - cat = ids[inc][mask] if dropna else ids[inc] - sorter = np.lexsort((out if ascending else -out, cat)) - out, labels[-1] = out[sorter], labels[-1][sorter] - - if bins is None: - mi = MultiIndex( - levels=levels, codes=labels, names=names, verify_integrity=False - ) - - if is_integer_dtype(out): - out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) - - # for compat. 
with libgroupby.value_counts need to ensure every - # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype="bool") - for lab in labels[:-1]: - diff |= np.r_[True, lab[1:] != lab[:-1]] - - ncat, nbin = diff.sum(), len(levels[-1]) - - left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - - right = [diff.cumsum() - 1, labels[-1]] - - _, idx = _get_join_indexers(left, right, sort=False, how="left") - out = np.where(idx != -1, out[idx], 0) - - if sort: - sorter = np.lexsort((out if ascending else -out, left[0])) - out, left[-1] = out[sorter], left[-1][sorter] - - # build the multi-index w/ full levels - codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) - codes.append(left[-1]) - - mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) - - if is_integer_dtype(out): - out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) - def count(self): """ Compute count of group, excluding missing values. 
From 64e9897b09566062c4f9c3d667261ec2a2f0c68f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 13:05:00 -0700 Subject: [PATCH 04/12] typing --- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 3 ++- pandas/core/groupby/ops.py | 4 ++-- pandas/core/internals/concat.py | 10 ++++++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 12b9cf25687cf..b40cf13379c2d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -343,7 +343,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__( self, - obj, + obj: NDFrame, keys=None, axis=0, level=None, @@ -360,8 +360,8 @@ def __init__( self._selection = selection - if isinstance(obj, NDFrame): - obj._consolidate_inplace() + assert isinstance(obj, NDFrame), type(obj) + obj._consolidate_inplace() self.level = level diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1d88ebd26b1b6..e7ed348ccb140 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -25,6 +25,7 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby from pandas.core.groupby.ops import BaseGrouper from pandas.core.index import CategoricalIndex, Index, MultiIndex @@ -423,7 +424,7 @@ def groups(self): def _get_grouper( - obj, + obj: NDFrame, key=None, axis=0, level=None, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1484feeeada64..f20c3f702e29d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -906,7 +906,7 @@ def _get_sorted_data(self): return self.data.take(self.sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj): - return sdata.iloc[slice_obj] + raise AbstractMethodError(self) def 
apply(self, f): raise AbstractMethodError(self) @@ -933,7 +933,7 @@ def _chop(self, sdata, slice_obj): if self.axis == 0: return sdata.iloc[slice_obj] else: - return sdata._slice(slice_obj, axis=1) # .loc[:, slice_obj] + return sdata._slice(slice_obj, axis=1) def get_splitter(data, *args, **kwargs): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7eaec0687b790..121c61d8d3623 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -183,7 +183,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): fill_value = upcasted_na if self.is_na: - if self.block.is_object: + if getattr(self.block, "is_object", False): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls @@ -191,16 +191,18 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): if len(values) and values[0] is None: fill_value = None - if self.block.is_datetimetz or is_datetime64tz_dtype(empty_dtype): + if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype( + empty_dtype + ): if self.block is None: array = empty_dtype.construct_array_type() return array( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) pass - elif self.block.is_categorical: + elif getattr(self.block, "is_categorical", False): pass - elif self.block.is_extension: + elif getattr(self.block, "is_extension", False): pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) From 6fd209667490eeff1cfaa2e9058d87e15f1503d7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 18:01:49 -0700 Subject: [PATCH 05/12] add types --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 821c35e0cce2f..b3d3802015528 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3563,7 +3563,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise 
AbstractMethodError(self) - def _slice(self, slobj, axis=0, kind=None): + def _slice(self, slob: slice, axis: int = 0, kind=None): """ Construct a slice of this container. From 84fbabc7699cf967c80e354290a4f96257581fc1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 18:03:13 -0700 Subject: [PATCH 06/12] revert a type --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b3d3802015528..e097cb84e7777 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3563,7 +3563,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise AbstractMethodError(self) - def _slice(self, slob: slice, axis: int = 0, kind=None): + def _slice(self, slob: slice, axis=0, kind=None): """ Construct a slice of this container. From e01f2869c9e4b610578cd2a83a04d0dbadeb35a9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 18:05:39 -0700 Subject: [PATCH 07/12] improve type --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e7ed348ccb140..93d9fa58cdb52 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -21,11 +21,11 @@ ) from pandas.core.dtypes.generic import ABCSeries +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby from pandas.core.groupby.ops import BaseGrouper from pandas.core.index import CategoricalIndex, Index, MultiIndex @@ -424,7 +424,7 @@ def groups(self): def _get_grouper( - obj: NDFrame, + obj: FrameOrSeries, key=None, axis=0, level=None, From 
b809ac6e487b01092290f87e206cad8e2f01be6c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 18:10:25 -0700 Subject: [PATCH 08/12] restore value_counts --- pandas/core/groupby/generic.py | 125 +++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ac390b9388c31..811836d0e8a4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,10 +23,13 @@ from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype from pandas.core.dtypes.common import ( + ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, + is_integer_dtype, + is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -1192,6 +1195,128 @@ def describe(self, **kwargs): return result.T return result.unstack() + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): + + from pandas.core.reshape.tile import cut + from pandas.core.reshape.merge import _get_join_indexers + + if bins is not None and not np.iterable(bins): + # scalar bins cannot be done at top level + # in a backward compatible way + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) + + ids, _, _ = self.grouper.group_info + val = self.obj._internal_get_values() + + # groupby removes null keys from groupings + mask = ids != -1 + ids, val = ids[mask], val[mask] + + if bins is None: + lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] + else: + + # lab is a Categorical with categories an IntervalIndex + lab = cut(Series(val), bins, include_lowest=True) + lev = lab.cat.categories + lab = lev.take(lab.cat.codes) + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] + + if is_interval_dtype(lab): + # TODO: should we do this inside II? 
+ sorter = np.lexsort((lab.left, lab.right, ids)) + else: + sorter = np.lexsort((lab, ids)) + + ids, lab = ids[sorter], lab[sorter] + + # group boundaries are where group ids change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + + # new values are where sorted labels change + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] + inc[idx] = True # group boundaries are also new values + out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts + + # num. of times each group should be repeated + rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) + + # multi-index components + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + names = self.grouper.names + [self._selection_name] + + if dropna: + mask = labels[-1] != -1 + if mask.all(): + dropna = False + else: + out, labels = out[mask], [label[mask] for label in labels] + + if normalize: + out = out.astype("float") + d = np.diff(np.r_[idx, len(ids)]) + if dropna: + m = ids[lab == -1] + np.add.at(d, m, -1) + acc = rep(d)[mask] + else: + acc = rep(d) + out /= acc + + if sort and bins is None: + cat = ids[inc][mask] if dropna else ids[inc] + sorter = np.lexsort((out if ascending else -out, cat)) + out, labels[-1] = out[sorter], labels[-1][sorter] + + if bins is None: + mi = MultiIndex( + levels=levels, codes=labels, names=names, verify_integrity=False + ) + + if is_integer_dtype(out): + out = ensure_int64(out) + return Series(out, index=mi, name=self._selection_name) + + # for compat. 
with libgroupby.value_counts need to ensure every + # bin is present at every index level, null filled with zeros + diff = np.zeros(len(out), dtype="bool") + for lab in labels[:-1]: + diff |= np.r_[True, lab[1:] != lab[:-1]] + + ncat, nbin = diff.sum(), len(levels[-1]) + + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] + + right = [diff.cumsum() - 1, labels[-1]] + + _, idx = _get_join_indexers(left, right, sort=False, how="left") + out = np.where(idx != -1, out[idx], 0) + + if sort: + sorter = np.lexsort((out if ascending else -out, left[0])) + out, left[-1] = out[sorter], left[-1][sorter] + + # build the multi-index w/ full levels + codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) + codes.append(left[-1]) + + mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) + + if is_integer_dtype(out): + out = ensure_int64(out) + return Series(out, index=mi, name=self._selection_name) + def count(self): """ Compute count of group, excluding missing values. From a6dc9e0b2614fba22bce6d3627d463843fb8fd9c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 19:53:40 -0700 Subject: [PATCH 09/12] typo fixup --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e097cb84e7777..ecda9d616960a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3563,7 +3563,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise AbstractMethodError(self) - def _slice(self, slob: slice, axis=0, kind=None): + def _slice(self, slobj: slice, axis=0, kind=None): """ Construct a slice of this container. 
From 5b7a89f8a82e68271477c84ef11cdc9f5be78632 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Aug 2019 20:32:21 -0700 Subject: [PATCH 10/12] mypy fixup --- pandas/core/generic.py | 4 +--- pandas/core/groupby/groupby.py | 4 +++- pandas/core/groupby/grouper.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ecda9d616960a..f6b89a6e90cd8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6190,8 +6190,6 @@ def fillna( axis = 0 axis = self._get_axis_number(axis) - from pandas import DataFrame - if value is None: if self._is_mixed_type and axis == 1: @@ -6254,7 +6252,7 @@ def fillna( new_data = self._data.fillna( value=value, limit=limit, inplace=inplace, downcast=downcast ) - elif isinstance(value, DataFrame) and self.ndim == 2: + elif isinstance(value, ABCDataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: raise ValueError("invalid fill value with a %s" % type(value)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b40cf13379c2d..c9352cfad409c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,14 +29,16 @@ class providing the base-class of operations. 
from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( ensure_float, + is_datetime64_dtype, is_datetime64tz_dtype, is_extension_array_dtype, + is_integer_dtype, is_numeric_dtype, + is_object_dtype, is_scalar, ) from pandas.core.dtypes.missing import isna, notna -from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_object_dtype import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical from pandas.core.base import ( diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 93d9fa58cdb52..e7ed348ccb140 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -21,11 +21,11 @@ ) from pandas.core.dtypes.generic import ABCSeries -from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby from pandas.core.groupby.ops import BaseGrouper from pandas.core.index import CategoricalIndex, Index, MultiIndex @@ -424,7 +424,7 @@ def groups(self): def _get_grouper( - obj: FrameOrSeries, + obj: NDFrame, key=None, axis=0, level=None, From 482f2d037c83c618265d214c09e7bfdcfa655557 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Aug 2019 07:32:36 -0700 Subject: [PATCH 11/12] rename do_reduce-->compute_reduction --- pandas/_libs/reduction.pyx | 2 +- pandas/core/apply.py | 4 ++-- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/tests/groupby/test_bin_groupby.py | 20 ++++++++------------ 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 5ad1d71ff8ebb..f95685c337969 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -628,7 +628,7 @@ cdef class BlockSlider: 
arr.shape[1] = 0 -def do_reduce(arr, f, axis=0, dummy=None, labels=None): +def compute_reduction(arr, f, axis=0, dummy=None, labels=None): """ Parameters diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b7995f20ce2b7..5c8599dbb054b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -221,7 +221,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.do_reduce(self.values, self.f, axis=self.axis) + result = reduction.compute_reduction(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +281,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.do_reduce( + result = reduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 599300bc7973a..ecad58e49bde4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -213,8 +213,8 @@ class TimelikeOps: .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, - default 'raise' + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ +default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. 
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index ce249e9de3497..2195686ee9c7f 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -6,15 +6,13 @@ from pandas.core.dtypes.common import ensure_int64 -from pandas import Index, isna +from pandas import Index, Series, isna from pandas.core.groupby.ops import generate_bins_generic import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal def test_series_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -31,8 +29,6 @@ def test_series_grouper(): def test_series_bin_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -123,32 +119,32 @@ class TestMoments: class TestReducer: def test_int_index(self): - from pandas.core.series import Series - arr = np.random.randn(100, 4) - result = reduction.do_reduce(arr, np.sum, labels=Index(np.arange(4))) + result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.do_reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + result = reduction.compute_reduction( + arr, np.sum, axis=1, labels=Index(np.arange(100)) + ) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.do_reduce( + result = reduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.do_reduce( + result = reduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.do_reduce( + result = reduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) 
assert_almost_equal(result, expected) From 61f0241a1a1e546be23aa32c9078b247a7c324d0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Aug 2019 08:10:05 -0700 Subject: [PATCH 12/12] dummy to force CI