From 24ad2217bdb250eac9ff1e91e1b8c86354783391 Mon Sep 17 00:00:00 2001
From: Richard
Date: Sun, 26 Apr 2020 17:27:34 -0400
Subject: [PATCH 1/4] BUG: DataFrame.groupby with as_index=False shouldn't
 modify grouping columns

---
 asv_bench/benchmarks/arithmetic.py | 53 ++++++
 doc/source/whatsnew/v1.1.0.rst | 63 ++++++-
 pandas/_libs/index.pyx | 5 +-
 pandas/_libs/internals.pyx | 2 +-
 pandas/_libs/lib.pyx | 2 +-
 pandas/_libs/tslibs/base.pxd | 7 -
 pandas/_libs/tslibs/base.pyx | 8 -
 pandas/_libs/tslibs/conversion.pyx | 9 +-
 pandas/_libs/tslibs/nattype.pyx | 7 +-
 pandas/_libs/tslibs/offsets.pyx | 178 +++++++++++++++---
 pandas/_libs/tslibs/period.pxd | 1 +
 pandas/_libs/tslibs/period.pyx | 101 +++++-----
 pandas/core/arrays/categorical.py | 7 +-
 pandas/core/arrays/datetimelike.py | 10 +-
 pandas/core/arrays/interval.py | 3 +-
 pandas/core/arrays/period.py | 7 +-
 pandas/core/dtypes/cast.py | 4 +-
 pandas/core/frame.py | 12 +-
 pandas/core/generic.py | 5 +-
 pandas/core/groupby/generic.py | 68 ++++---
 pandas/core/groupby/groupby.py | 65 +++++--
 pandas/core/groupby/grouper.py | 4 +-
 pandas/core/indexes/base.py | 8 +-
 pandas/core/indexes/datetimes.py | 2 +-
 pandas/core/indexes/multi.py | 6 +-
 pandas/core/internals/construction.py | 2 +-
 pandas/core/internals/managers.py | 63 +++++--
 pandas/core/ops/__init__.py | 6 +-
 pandas/core/ops/array_ops.py | 15 +-
 pandas/core/ops/blockwise.py | 102 ++++++++++
 pandas/core/reshape/pivot.py | 38 ++--
 pandas/plotting/_matplotlib/core.py | 2 +-
 pandas/tests/arithmetic/common.py | 9 +-
 pandas/tests/arithmetic/test_datetime64.py | 6 +-
 pandas/tests/arithmetic/test_timedelta64.py | 3 +-
 .../arrays/categorical/test_analytics.py | 32 ++++
 pandas/tests/frame/test_arithmetic.py | 8 +-
 pandas/tests/groupby/test_function.py | 2 +-
 pandas/tests/groupby/test_groupby.py | 28 +++
 pandas/tests/groupby/test_nunique.py | 10 +-
 pandas/tests/groupby/test_size.py | 10 +-
 pandas/tests/groupby/test_whitelist.py | 2 +-
 pandas/tests/io/test_feather.py | 4 +-
 pandas/tests/scalar/period/test_period.py | 19 +-
 pandas/tests/tseries/offsets/test_ticks.py | 8 +-
 pandas/tseries/frequencies.py | 4 +-
 pandas/tseries/offsets.py | 126 +------------
 47 files changed, 786 insertions(+), 350 deletions(-)
 create mode 100644 pandas/_libs/tslibs/period.pxd
 create mode 100644 pandas/core/ops/blockwise.py

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
index 8aa29468559b2..08a11ba2607a5 100644
--- a/asv_bench/benchmarks/arithmetic.py
+++ b/asv_bench/benchmarks/arithmetic.py
@@ -101,6 +101,59 @@ def time_frame_op_with_series_axis1(self, opname):
         getattr(operator, opname)(self.df, self.ser)
 
 
+class FrameWithFrameWide:
+    # Many-columns, mixed dtypes
+
+    params = [
+        [
+            # GH#32779 has discussion of which operators are included here
+            operator.add,
+            operator.floordiv,
+            operator.gt,
+        ]
+    ]
+    param_names = ["op"]
+
+    def setup(self, op):
+        # we choose dtypes so as to make the blocks
+        #  a) not perfectly matched between right and left
+        #  b) appreciably bigger than single columns
+        n_cols = 2000
+        n_rows = 500
+
+        # construct dataframe with 2 blocks
+        arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
+        arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
+        df = pd.concat(
+            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
+        )
+        # should already be the case, but just to be sure
+        df._consolidate_inplace()
+
+        # TODO: GH#33198 the setting here shouldn't need two steps
np.random.randn(n_rows, int(n_cols / 2)).astype("i8") + arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8") + df2 = pd.concat( + [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)], + axis=1, + ignore_index=True, + ) + # should already be the case, but just to be sure + df2._consolidate_inplace() + + self.left = df + self.right = df2 + + def time_op_different_blocks(self, op): + # blocks (and dtypes) are not aligned + op(self.left, self.right) + + def time_op_same_blocks(self, op): + # blocks (and dtypes) are aligned + op(self.left, self.left) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3b60085e9fa66..a02da69ecf4fb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -561,6 +561,63 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df +.. _whatsnew_110.api_breaking.groupby_nunique: + +Using groupby with ``nunique`` and ``as_index=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now, the grouping columns only appear in the index. This is now consistent with other aggregation functions. (:issue:`32579`) + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=True).nunique() + Out[4]: + a b + a + x 1 1 + y 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=True).nunique() + +.. _whatsnew_110.api_breaking.groupby_as_index_false: + +Using groupby with ``as_index=False`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column. Now, the grouping column remains unchanged. (:issue:`21090`) + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=False).nunique() + Out[4]: + a b + 0 1 1 + 1 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=False).nunique() + .. _whatsnew_110.deprecations: Deprecations @@ -611,7 +668,7 @@ Performance improvements and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - +- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) .. 
 
 .. ---------------------------------------------------------------------------

@@ -816,9 +873,10 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
 - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
 - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
+- Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`)
 - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
 - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`)
-
+- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`)
 
 Reshaping
 ^^^^^^^^^
@@ -883,6 +941,7 @@ Other
 - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
 - Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`)
 - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`)
+- More informative error message with ``np.min`` or ``np.max`` on unordered :class:`Categorical` (:issue:`33115`)
 - Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`)
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index b4dcdaa10d0ef..6b3d1c1ec2c13 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -22,7 +22,8 @@ cnp.import_array()
 
 from pandas._libs cimport util
 
 from pandas._libs.tslibs.nattype cimport c_NaT as NaT
-from pandas._libs.tslibs.base cimport ABCTimestamp, ABCTimedelta, ABCPeriod
+from pandas._libs.tslibs.base cimport ABCTimestamp, ABCTimedelta
+from pandas._libs.tslibs.period cimport is_period_object
 
 from pandas._libs.hashtable cimport HashTable
 
@@ -479,7 +480,7 @@ cdef class PeriodEngine(Int64Engine):
     cdef int64_t _unbox_scalar(self, scalar) except? -1:
         if scalar is NaT:
             return scalar.value
-        if isinstance(scalar, ABCPeriod):
+        if is_period_object(scalar):
             # NB: we assume that we have the correct freq here.
             return scalar.ordinal
         raise TypeError(scalar)

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index 1aa95e92b73d1..db452cb0f1fa4 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -49,7 +49,7 @@ cdef class BlockPlacement:
         else:
             # Cython memoryview interface requires ndarray to be writeable.
arr = np.require(val, dtype=np.int64, requirements='W') - assert arr.ndim == 1 + assert arr.ndim == 1, arr.shape self._as_array = arr self._has_array = True diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bd623a39010f6..bbd19a33fd34e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -74,7 +74,7 @@ from pandas._libs.tslibs.nattype cimport ( from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare -from pandas._libs.tslibs.base cimport is_period_object +from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.missing cimport ( checknull, diff --git a/pandas/_libs/tslibs/base.pxd b/pandas/_libs/tslibs/base.pxd index 0521279025d4f..d8c76542f3457 100644 --- a/pandas/_libs/tslibs/base.pxd +++ b/pandas/_libs/tslibs/base.pxd @@ -6,10 +6,3 @@ cdef class ABCTimedelta(timedelta): cdef class ABCTimestamp(datetime): pass - - -cdef class ABCPeriod: - pass - - -cdef bint is_period_object(object obj) diff --git a/pandas/_libs/tslibs/base.pyx b/pandas/_libs/tslibs/base.pyx index 91178fe3933f7..6a5ee3f784334 100644 --- a/pandas/_libs/tslibs/base.pyx +++ b/pandas/_libs/tslibs/base.pyx @@ -14,11 +14,3 @@ cdef class ABCTimedelta(timedelta): cdef class ABCTimestamp(datetime): pass - - -cdef class ABCPeriod: - pass - - -cdef bint is_period_object(object obj): - return isinstance(obj, ABCPeriod) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 7d8c85b6c8e0b..b8544e2a9f39e 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -13,7 +13,7 @@ from cpython.datetime cimport (datetime, time, tzinfo, PyDateTime_IMPORT) PyDateTime_IMPORT -from pandas._libs.tslibs.base cimport ABCTimestamp, is_period_object +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, @@ -290,10 +290,11 @@ cdef convert_to_tsobject(object ts, object tz, object unit, # Keep the converter same as PyDateTime's ts = datetime.combine(ts, time()) return convert_datetime_to_tsobject(ts, tz) - elif is_period_object(ts): - raise ValueError("Cannot convert Period to Timestamp " - "unambiguously. Use to_timestamp") else: + from .period import Period + if isinstance(ts, Period): + raise ValueError("Cannot convert Period to Timestamp " + "unambiguously. 
Use to_timestamp") raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to ' f'Timestamp') diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 84733f50681e4..6342735888550 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -29,7 +29,6 @@ from pandas._libs.tslibs.np_datetime cimport ( get_timedelta64_value, ) cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.base cimport is_period_object # ---------------------------------------------------------------------- @@ -149,7 +148,7 @@ cdef class _NaT(datetime): elif util.is_offset_object(other): return c_NaT - elif util.is_integer_object(other) or is_period_object(other): + elif util.is_integer_object(other): # For Period compat # TODO: the integer behavior is deprecated, remove it return c_NaT @@ -163,6 +162,7 @@ cdef class _NaT(datetime): return result raise TypeError(f"Cannot add NaT to ndarray with dtype {other.dtype}") + # Includes Period going through here return NotImplemented def __sub__(self, other): @@ -185,7 +185,7 @@ cdef class _NaT(datetime): elif util.is_offset_object(other): return c_NaT - elif util.is_integer_object(other) or is_period_object(other): + elif util.is_integer_object(other): # For Period compat # TODO: the integer behavior is deprecated, remove it return c_NaT @@ -216,6 +216,7 @@ cdef class _NaT(datetime): f"Cannot subtract NaT from ndarray with dtype {other.dtype}" ) + # Includes Period going through here return NotImplemented def __pos__(self): diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b25558e8572fe..17ea389611b84 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -5,6 +5,7 @@ from typing import Any import warnings from cpython.datetime cimport (PyDateTime_IMPORT, PyDateTime_Check, + PyDate_Check, PyDelta_Check, datetime, timedelta, date, time as dt_time) @@ -35,6 +36,8 @@ from pandas._libs.tslibs.np_datetime cimport ( from pandas._libs.tslibs.timezones cimport utc_pytz as UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single +from .timedeltas cimport delta_to_nanoseconds + # --------------------------------------------------------------------- # Constants @@ -87,11 +90,11 @@ for _d in DAYS: # Misc Helpers cdef bint is_offset_object(object obj): - return isinstance(obj, _BaseOffset) + return isinstance(obj, BaseOffset) cdef bint is_tick_object(object obj): - return isinstance(obj, _Tick) + return isinstance(obj, Tick) cdef to_offset(object obj): @@ -99,7 +102,7 @@ cdef to_offset(object obj): Wrap pandas.tseries.frequencies.to_offset to keep centralize runtime imports """ - if isinstance(obj, _BaseOffset): + if isinstance(obj, BaseOffset): return obj from pandas.tseries.frequencies import to_offset return to_offset(obj) @@ -161,10 +164,11 @@ def apply_wraps(func): if other is NaT: return NaT - elif isinstance(other, (timedelta, BaseOffset)): + elif isinstance(other, BaseOffset) or PyDelta_Check(other): # timedelta path return func(self, other) - elif isinstance(other, (datetime, date)) or is_datetime64_object(other): + elif is_datetime64_object(other) or PyDate_Check(other): + # PyDate_Check includes date, datetime other = Timestamp(other) else: # This will end up returning NotImplemented back in __add__ @@ -227,7 +231,6 @@ cdef _wrap_timedelta_result(result): """ if PyDelta_Check(result): # convert Timedelta back to a Tick - from pandas.tseries.offsets import delta_to_tick return delta_to_tick(result) return result @@ -398,7 
+401,7 @@ class ApplyTypeError(TypeError): # --------------------------------------------------------------------- # Base Classes -cdef class _BaseOffset: +cdef class BaseOffset: """ Base class for DateOffset methods that are not overridden by subclasses and will (after pickle errors are resolved) go into a cdef class. @@ -477,6 +480,9 @@ cdef class _BaseOffset: return type(self)(n=1, normalize=self.normalize, **self.kwds) def __add__(self, other): + if not isinstance(self, BaseOffset): + # cython semantics; this is __radd__ + return other.__add__(self) try: return self.apply(other) except ApplyTypeError: @@ -488,6 +494,9 @@ cdef class _BaseOffset: elif type(other) == type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # cython semantics, this is __rsub__ + return (-other).__add__(self) else: # pragma: no cover return NotImplemented @@ -506,6 +515,9 @@ cdef class _BaseOffset: elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # cython semantics, this is __rmul__ + return other.__mul__(self) return NotImplemented def __neg__(self): @@ -657,8 +669,8 @@ cdef class _BaseOffset: # ------------------------------------------------------------------ - # Staticmethod so we can call from _Tick.__init__, will be unnecessary - # once BaseOffset is a cdef class and is inherited by _Tick + # Staticmethod so we can call from Tick.__init__, will be unnecessary + # once BaseOffset is a cdef class and is inherited by Tick @staticmethod def _validate_n(n): """ @@ -758,24 +770,7 @@ cdef class _BaseOffset: return self.n == 1 -class BaseOffset(_BaseOffset): - # Here we add __rfoo__ methods that don't play well with cdef classes - def __rmul__(self, other): - return self.__mul__(other) - - def __radd__(self, other): - return self.__add__(other) - - def __rsub__(self, other): - return (-self).__add__(other) - - -cdef class _Tick(_BaseOffset): - """ - dummy class to mix into tseries.offsets.Tick so that in tslibs.period we - can do isinstance checks on _Tick and avoid importing tseries.offsets - """ - +cdef class Tick(BaseOffset): # ensure that reversed-ops with numpy scalars return NotImplemented __array_priority__ = 1000 _adjust_dst = False @@ -793,13 +788,25 @@ cdef class _Tick(_BaseOffset): "Tick offset with `normalize=True` are not allowed." 
) + @classmethod + def _from_name(cls, suffix=None): + # default _from_name calls cls with no args + if suffix: + raise ValueError(f"Bad freq suffix {suffix}") + return cls() + + def _repr_attrs(self) -> str: + # Since cdef classes have no __dict__, we need to override + return "" + @property def delta(self): - return self.n * self._inc + from .timedeltas import Timedelta + return self.n * Timedelta(self._nanos_inc) @property def nanos(self) -> int64_t: - return self.delta.value + return self.n * self._nanos_inc def is_on_offset(self, dt) -> bool: return True @@ -837,13 +844,63 @@ cdef class _Tick(_BaseOffset): return self.delta.__gt__(other) def __truediv__(self, other): - if not isinstance(self, _Tick): + if not isinstance(self, Tick): # cython semantics mean the args are sometimes swapped result = other.delta.__rtruediv__(self) else: result = self.delta.__truediv__(other) return _wrap_timedelta_result(result) + def __add__(self, other): + if not isinstance(self, Tick): + # cython semantics; this is __radd__ + return other.__add__(self) + + if isinstance(other, Tick): + if type(self) == type(other): + return type(self)(self.n + other.n) + else: + return delta_to_tick(self.delta + other.delta) + try: + return self.apply(other) + except ApplyTypeError: + # Includes pd.Period + return NotImplemented + except OverflowError as err: + raise OverflowError( + f"the add operation between {self} and {other} will overflow" + ) from err + + def apply(self, other): + # Timestamp can handle tz and nano sec, thus no need to use apply_wraps + if isinstance(other, ABCTimestamp): + + # GH#15126 + # in order to avoid a recursive + # call of __add__ and __radd__ if there is + # an exception, when we call using the + operator, + # we directly call the known method + result = other.__add__(self) + if result is NotImplemented: + raise OverflowError + return result + elif other is NaT: + return NaT + elif is_datetime64_object(other) or PyDate_Check(other): + # PyDate_Check includes date, datetime + from pandas import Timestamp + return Timestamp(other) + self + + if PyDelta_Check(other): + return other + self.delta + elif isinstance(other, type(self)): + # TODO: this is reached in tests that specifically call apply, + # but should not be reached "naturally" because __add__ should + # catch this case first. 
+ return type(self)(self.n + other.n) + + raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") + # -------------------------------------------------------------------- # Pickle Methods @@ -855,6 +912,67 @@ cdef class _Tick(_BaseOffset): self.normalize = False +cdef class Day(Tick): + _nanos_inc = 24 * 3600 * 1_000_000_000 + _prefix = "D" + + +cdef class Hour(Tick): + _nanos_inc = 3600 * 1_000_000_000 + _prefix = "H" + + +cdef class Minute(Tick): + _nanos_inc = 60 * 1_000_000_000 + _prefix = "T" + + +cdef class Second(Tick): + _nanos_inc = 1_000_000_000 + _prefix = "S" + + +cdef class Milli(Tick): + _nanos_inc = 1_000_000 + _prefix = "L" + + +cdef class Micro(Tick): + _nanos_inc = 1000 + _prefix = "U" + + +cdef class Nano(Tick): + _nanos_inc = 1 + _prefix = "N" + + +def delta_to_tick(delta: timedelta) -> Tick: + if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: + # nanoseconds only for pd.Timedelta + if delta.seconds == 0: + return Day(delta.days) + else: + seconds = delta.days * 86400 + delta.seconds + if seconds % 3600 == 0: + return Hour(seconds / 3600) + elif seconds % 60 == 0: + return Minute(seconds / 60) + else: + return Second(seconds) + else: + nanos = delta_to_nanoseconds(delta) + if nanos % 1_000_000 == 0: + return Milli(nanos // 1_000_000) + elif nanos % 1000 == 0: + return Micro(nanos // 1000) + else: # pragma: no cover + return Nano(nanos) + + +# -------------------------------------------------------------------- + + class BusinessMixin(BaseOffset): """ Mixin to business types to provide related functions. diff --git a/pandas/_libs/tslibs/period.pxd b/pandas/_libs/tslibs/period.pxd new file mode 100644 index 0000000000000..eb11a4a572e85 --- /dev/null +++ b/pandas/_libs/tslibs/period.pxd @@ -0,0 +1 @@ +cdef bint is_period_object(object obj) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 380231129c9b2..d28585f15e5b5 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -37,8 +37,6 @@ cdef extern from "src/datetime/np_datetime.h": cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.base cimport ABCPeriod, is_period_object - from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info from pandas._libs.tslibs.timedeltas import Timedelta @@ -54,7 +52,6 @@ from pandas._libs.tslibs.ccalendar cimport ( from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, - get_base_alias, get_freq_code, get_freq_str, get_rule_month, @@ -1533,7 +1530,7 @@ class IncompatibleFrequency(ValueError): pass -cdef class _Period(ABCPeriod): +cdef class _Period: cdef readonly: int64_t ordinal @@ -1602,9 +1599,7 @@ cdef class _Period(ABCPeriod): raise IncompatibleFrequency("Input cannot be converted to " f"Period(freq={self.freqstr})") elif util.is_offset_object(other): - freqstr = other.rule_code - base = get_base_alias(freqstr) - if base == self.freq.rule_code: + if other.base == self.freq.base: ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) msg = DIFFERENT_FREQ.format(cls=type(self).__name__, @@ -1615,55 +1610,57 @@ cdef class _Period(ABCPeriod): return NotImplemented def __add__(self, other): - if is_period_object(self): - if (PyDelta_Check(other) or util.is_timedelta64_object(other) or - util.is_offset_object(other)): - return self._add_delta(other) - elif other is NaT: + if not is_period_object(self): + # cython semantics; 
this is analogous to a call to __radd__ + if self is NaT: return NaT - elif util.is_integer_object(other): - ordinal = self.ordinal + other * self.freq.n - return Period(ordinal=ordinal, freq=self.freq) - elif (PyDateTime_Check(other) or - is_period_object(other) or util.is_datetime64_object(other)): - # can't add datetime-like - # GH#17983 - sname = type(self).__name__ - oname = type(other).__name__ - raise TypeError(f"unsupported operand type(s) for +: '{sname}' " - f"and '{oname}'") - else: # pragma: no cover - return NotImplemented - elif is_period_object(other): - # this can be reached via __radd__ because of cython rules - return other + self - else: - return NotImplemented + return other.__add__(self) + + if (PyDelta_Check(other) or util.is_timedelta64_object(other) or + util.is_offset_object(other)): + return self._add_delta(other) + elif other is NaT: + return NaT + elif util.is_integer_object(other): + ordinal = self.ordinal + other * self.freq.n + return Period(ordinal=ordinal, freq=self.freq) + elif (PyDateTime_Check(other) or + is_period_object(other) or util.is_datetime64_object(other)): + # can't add datetime-like + # GH#17983 + sname = type(self).__name__ + oname = type(other).__name__ + raise TypeError(f"unsupported operand type(s) for +: '{sname}' " + f"and '{oname}'") + + return NotImplemented def __sub__(self, other): - if is_period_object(self): - if (PyDelta_Check(other) or util.is_timedelta64_object(other) or - util.is_offset_object(other)): - neg_other = -other - return self + neg_other - elif util.is_integer_object(other): - ordinal = self.ordinal - other * self.freq.n - return Period(ordinal=ordinal, freq=self.freq) - elif is_period_object(other): - if other.freq != self.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) - raise IncompatibleFrequency(msg) - # GH 23915 - mul by base freq since __add__ is agnostic of n - return (self.ordinal - other.ordinal) * self.freq.base - return NotImplemented - elif is_period_object(other): + if not is_period_object(self): + # cython semantics; this is like a call to __rsub__ if self is NaT: return NaT return NotImplemented - else: - return NotImplemented + + elif (PyDelta_Check(other) or util.is_timedelta64_object(other) or + util.is_offset_object(other)): + neg_other = -other + return self + neg_other + elif util.is_integer_object(other): + ordinal = self.ordinal - other * self.freq.n + return Period(ordinal=ordinal, freq=self.freq) + elif is_period_object(other): + if other.freq != self.freq: + msg = DIFFERENT_FREQ.format(cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=other.freqstr) + raise IncompatibleFrequency(msg) + # GH 23915 - mul by base freq since __add__ is agnostic of n + return (self.ordinal - other.ordinal) * self.freq.base + elif other is NaT: + return NaT + + return NotImplemented def asfreq(self, freq, how='E') -> "Period": """ @@ -2463,6 +2460,10 @@ class Period(_Period): return cls._from_ordinal(ordinal, freq) +cdef bint is_period_object(object obj): + return isinstance(obj, _Period) + + cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, int hour, int minute, int second, freq): base, mult = get_freq_code(freq) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2a01ab3802e62..80fe1ac7ce619 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,6 +9,7 @@ from pandas._libs import NaT, algos as libalgos, hashtable as htable from 
pandas._typing import ArrayLike, Dtype, Ordered, Scalar +from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs @@ -2077,7 +2078,7 @@ def _reduce(self, name, axis=0, **kwargs): return func(**kwargs) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def min(self, skipna=True): + def min(self, skipna=True, **kwargs): """ The minimum value of the object. @@ -2096,6 +2097,7 @@ def min(self, skipna=True): ------- min : the minimum of this `Categorical` """ + nv.validate_min((), kwargs) self.check_for_ordered("min") if not len(self._codes): @@ -2112,7 +2114,7 @@ def min(self, skipna=True): return self.categories[pointer] @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def max(self, skipna=True): + def max(self, skipna=True, **kwargs): """ The maximum value of the object. @@ -2131,6 +2133,7 @@ def max(self, skipna=True): ------- max : the maximum of this `Categorical` """ + nv.validate_max((), kwargs) self.check_for_ordered("max") if not len(self._codes): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 708b0ea4da96d..9121e00b3e8d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -98,6 +98,10 @@ def _validate_comparison_value(self, other): @unpack_zerodim_and_defer(opname) def wrapper(self, other): + if self.ndim > 1 and getattr(other, "shape", None) == self.shape: + # TODO: handle 2D-like listlikes + return op(self.ravel(), other.ravel()).reshape(self.shape) + try: other = _validate_comparison_value(self, other) except InvalidComparison: @@ -1308,10 +1312,12 @@ def _addsub_object_array(self, other: np.ndarray, op): """ assert op in [operator.add, operator.sub] if len(other) == 1: + # If both 1D then broadcasting is unambiguous + # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( - "Adding/subtracting array of DateOffsets to " + "Adding/subtracting object-dtype array to " f"{type(self).__name__} not vectorized", PerformanceWarning, ) @@ -1319,7 +1325,7 @@ def _addsub_object_array(self, other: np.ndarray, op): # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) - res_values = op(self.astype("O"), np.array(other)) + res_values = op(self.astype("O"), np.asarray(other)) result = array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) return result diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cf15f36cb03a3..c5366884fbdfe 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -27,7 +27,6 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCExtensionArray, ABCIndexClass, ABCIntervalIndex, ABCPeriodIndex, @@ -767,7 +766,7 @@ def size(self) -> int: # Avoid materializing self.values return self.left.size - def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: return self.copy() diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 3978161829481..f1f8abb9e93a2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -12,6 +12,7 @@ period as libperiod, ) from 
pandas._libs.tslibs.fields import isleapyear_arr +from pandas._libs.tslibs.offsets import Tick, delta_to_tick from pandas._libs.tslibs.period import ( DIFFERENT_FREQ, IncompatibleFrequency, @@ -45,7 +46,7 @@ import pandas.core.common as com from pandas.tseries import frequencies -from pandas.tseries.offsets import DateOffset, Tick, delta_to_tick +from pandas.tseries.offsets import DateOffset def _field_accessor(name: str, alias: int, docstring=None): @@ -666,8 +667,8 @@ def _addsub_int_array( def _add_offset(self, other): assert not isinstance(other, Tick) - base = libfrequencies.get_base_alias(other.rule_code) - if base != self.freq.rule_code: + + if other.base != self.freq.base: raise raise_on_incompatible(self, other) # Note: when calling parent class's _add_timedeltalike_scalar, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0855d9335cc3d..424eb9d673df5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -103,9 +103,9 @@ def is_nested_object(obj) -> bool: This may not be necessarily be performant. """ - if isinstance(obj, ABCSeries) and is_object_dtype(obj): + if isinstance(obj, ABCSeries) and is_object_dtype(obj.dtype): - if any(isinstance(v, ABCSeries) for v in obj.values): + if any(isinstance(v, ABCSeries) for v in obj._values): return True return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31015e3095e7d..2d181e826c2a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -455,6 +455,7 @@ def __init__( mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) + elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): @@ -1159,7 +1160,7 @@ def dot(self, other): left = self.reindex(columns=common, copy=False) right = other.reindex(index=common, copy=False) lvals = left.values - rvals = right.values + rvals = right._values else: left = self lvals = self.values @@ -1891,7 +1892,7 @@ def to_records( if index: if isinstance(self.index, MultiIndex): # array of tuples to numpy cols. 
                 # copy copy copy
-                ix_vals = list(map(np.array, zip(*self.index.values)))
+                ix_vals = list(map(np.array, zip(*self.index._values)))
             else:
                 ix_vals = [self.index.values]
 
@@ -3009,7 +3010,7 @@ def _setitem_frame(self, key, value):
                 raise ValueError("Array conditional must be same shape as self")
             key = self._constructor(key, **self._construct_axes_dict())
 
-        if key.values.size and not is_bool_dtype(key.values):
+        if key.size and not is_bool_dtype(key.values):
             raise TypeError(
                 "Must pass DataFrame or 2-d ndarray with boolean values only"
             )
@@ -5754,10 +5755,11 @@ def _construct_result(self, result) -> "DataFrame":
         -------
         DataFrame
         """
-        out = self._constructor(result, index=self.index, copy=False)
+        out = self._constructor(result, copy=False)
         # Pin columns instead of passing to constructor for compat with
         # non-unique columns case
         out.columns = self.columns
+        out.index = self.index
         return out
 
     def combine(
@@ -7450,7 +7452,7 @@ def applymap(self, func) -> "DataFrame":
         def infer(x):
             if x.empty:
                 return lib.map_infer(x, func)
-            return lib.map_infer(x.astype(object).values, func)
+            return lib.map_infer(x.astype(object)._values, func)
 
         return self.apply(infer)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 85b6a8431617a..5c7d0eae24cee 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1012,6 +1012,9 @@ def rename_axis(self, mapper=lib.no_default, **kwargs):
         index, columns : scalar, list-like, dict-like or function, optional
             A scalar, list-like, dict-like or functions transformations to
             apply to that axis' values.
+            Note that the ``columns`` parameter is not allowed if the
+            object is a Series. This parameter applies only to DataFrame
+            objects.
 
             Use either ``mapper`` and ``axis`` to
             specify the axis to target with ``mapper``, or ``index``
@@ -4980,7 +4983,7 @@ def sample(
             else:
                 raise ValueError("Invalid weights: weights sum to zero")
 
-            weights = weights.values
+            weights = weights._values
 
         # If no frac or n, default to n=1.
if n is None and frac is None: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 69b143febeea2..ea4b6f4e65341 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): v = values[0] - if isinstance(v, (np.ndarray, Index, Series)): + if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = all_indexes_same([x.index for x in values]) @@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): result = self.obj._constructor( stacked_values.T, index=v.index, columns=key_index ) + elif not self.as_index: + # We add grouping column below, so create a frame here + result = DataFrame( + values, index=key_index, columns=[self._selection] + ) else: # GH#1738: values is list of arrays of unequal lengths # fall through to the outer else clause @@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): else: result = result._convert(datetime=True) + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + return self._reindex_output(result) # values are not series or array-like but scalars @@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result): ), ) ) - + columns = result.columns for name, lev, in_axis in izip: - if in_axis: + # GH #28549 + # When using .apply(-), name will be in columns already + if in_axis and name not in columns: result.insert(0, name, lev) def _wrap_aggregated_output( @@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True): 5 ham 5 y >>> df.groupby('id').nunique() - id value1 value2 + value1 value2 id - egg 1 1 1 - ham 1 1 2 - spam 1 2 1 + egg 1 1 + ham 1 2 + spam 2 1 Check for rows with the same id but conflicting values: @@ -1867,37 +1877,37 @@ def nunique(self, dropna: bool = True): 4 ham 5 x 5 ham 5 y """ - obj = self._selected_obj + from pandas.core.reshape.concat import concat - def groupby_series(obj, col=None): - return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( - dropna=dropna - ) + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions - if isinstance(obj, Series): - results = groupby_series(obj) + obj = self._obj_with_exclusions + axis_number = obj._get_axis_number(self.axis) + other_axis = int(not axis_number) + if axis_number == 0: + iter_func = obj.items else: - # TODO: this is duplicative of how GroupBy naturally works - # Try to consolidate with normal wrapping functions - from pandas.core.reshape.concat import concat - - axis_number = obj._get_axis_number(self.axis) - other_axis = int(not axis_number) - if axis_number == 0: - iter_func = obj.items - else: - iter_func = obj.iterrows + iter_func = obj.iterrows - results = [groupby_series(content, label) for label, content in iter_func()] - results = concat(results, axis=1) + results = concat( + [ + SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( + dropna + ) + for label, content in iter_func() + ], + axis=1, + ) - if axis_number == 1: - results = results.T + if axis_number == 1: + results = results.T - results._get_axis(other_axis).names = obj._get_axis(other_axis).names + results._get_axis(other_axis).names = obj._get_axis(other_axis).names if not self.as_index: results.index = ibase.default_index(len(results)) + self._insert_inaxis_grouper_inplace(results) 
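+            # the grouping columns are added back unmodified (GH#21090)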
return results boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 55b9c28c74cb2..604c9071584a3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -35,7 +35,7 @@ class providing the base-class of operations. from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby -from pandas._typing import FrameOrSeries, Scalar +from pandas._typing import F, FrameOrSeries, Scalar from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -666,11 +666,11 @@ def _set_group_selection(self): ): return - ax = self.obj._info_axis groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] if len(groupers): # GH12839 clear selected obj cache when group selection changes + ax = self.obj._info_axis self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") @@ -735,11 +735,11 @@ def _make_wrapper(self, name): # need to setup the selection # as are not passed directly but in the grouper - f = getattr(self._selected_obj, name) + f = getattr(self._obj_with_exclusions, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self._selected_obj), name) + f = getattr(type(self._obj_with_exclusions), name) sig = inspect.signature(f) def wrapper(*args, **kwargs): @@ -762,7 +762,7 @@ def curried(x): return self.apply(curried) try: - return self.apply(curried) + return self._python_apply_general(curried, self._obj_with_exclusions) except TypeError as err: if not re.search( "reduction operation '.*' not allowed for this dtype", str(err) @@ -853,7 +853,7 @@ def f(g): # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: - result = self._python_apply_general(f) + result = self._python_apply_general(f, self._selected_obj) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -864,12 +864,27 @@ def f(g): # on a string grouper column with _group_selection_context(self): - return self._python_apply_general(f) + return self._python_apply_general(f, self._selected_obj) return result - def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) + def _python_apply_general(self, f: F, data: Union[Series, DataFrame]): + """ + Apply function f in python space + + Parameters + ---------- + f : callable + Function to apply + data : Series or DataFrame + Data to apply f to + + Returns + ------- + Series or DataFrame + data after applying f + """ + keys, values, mutated = self.grouper.apply(f, data, self.axis) return self._wrap_applied_output( keys, values, not_indexed_same=mutated or self.mutated @@ -1067,7 +1082,7 @@ def _python_agg_general( output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: - return self._python_apply_general(f) + return self._python_apply_general(f, self._selected_obj) if self.grouper._filter_empty_groups: @@ -1416,8 +1431,18 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. """ - # TODO: implement at Cython level? 
- return np.sqrt(self.var(ddof=ddof)) + result = self.var(ddof=ddof) + if result.ndim == 1: + result = np.sqrt(result) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values + + return result @Substitution(name="groupby") @Appender(_common_see_also) @@ -1464,7 +1489,19 @@ def sem(self, ddof: int = 1): Series or DataFrame Standard error of the mean of values within each group. """ - return self.std(ddof=ddof) / np.sqrt(self.count()) + result = self.std(ddof=ddof) + if result.ndim == 1: + result /= np.sqrt(self.count()) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = ( + result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values + ) + return result @Substitution(name="groupby") @Appender(_common_see_also) @@ -1948,7 +1985,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) sizes, result = grb.size(), grb.nth(n) - mask = (sizes < max_len).values + mask = (sizes < max_len)._values # set the results which don't meet the criteria if len(result) and mask.any(): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9660fb9c2e1b0..39892d87bfd69 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -754,7 +754,9 @@ def is_in_obj(gpr) -> bool: return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError): + except (KeyError, IndexError, ValueError): + # TODO: ValueError: Given date string not likely a datetime. + # should be KeyError? 
return False for i, (gpr, level) in enumerate(zip(keys, levels)): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d9828707b6164..a97407394e7d7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -639,13 +639,15 @@ def astype(self, dtype, copy=True): elif is_categorical_dtype(dtype): from pandas.core.indexes.category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) + return CategoricalIndex( + self._values, name=self.name, dtype=dtype, copy=copy + ) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) try: - casted = self.values.astype(dtype, copy=copy) + casted = self._values.astype(dtype, copy=copy) except (TypeError, ValueError) as err: raise TypeError( f"Cannot cast {type(self).__name__} to dtype {dtype}" @@ -906,7 +908,7 @@ def format(self, name: bool = False, formatter=None, **kwargs): return self._format_with_header(header, **kwargs) def _format_with_header(self, header, na_rep="NaN", **kwargs): - values = self.values + values = self._values from pandas.io.formats.format import format_array diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6f1614d050cad..47c50dd2c7b14 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -440,7 +440,7 @@ def to_series(self, keep_tz=lib.no_default, index=None, name=None): # preserve the tz & copy values = self.copy(deep=True) else: - values = self.values.copy() + values = self._values.view("M8[ns]").copy() return Series(values, index=index, name=name) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f1e1ebcaca1c4..fa6ea60cec4c9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -387,7 +387,7 @@ def _verify_integrity( return new_codes @classmethod - def from_arrays(cls, arrays, sortorder=None, names=lib.no_default): + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiIndex": """ Convert arrays to MultiIndex. @@ -1464,7 +1464,7 @@ def is_monotonic_increasing(self) -> bool: # reversed() because lexsort() wants the most significant key last. 
values = [ - self._get_level_values(i).values for i in reversed(range(len(self.levels))) + self._get_level_values(i)._values for i in reversed(range(len(self.levels))) ] try: sort_order = np.lexsort(values) @@ -2455,7 +2455,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "tolerance not implemented yet for MultiIndex" ) indexer = self._engine.get_indexer( - values=self.values, target=target, method=method, limit=limit + values=self._values, target=target, method=method, limit=limit ) elif method == "nearest": raise NotImplementedError( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b2af149ccf14c..d49f1f154a2c1 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -347,7 +347,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): val = com.dict_compat(val) else: val = dict(val) - val = lib.fast_multiget(val, oindex.values, default=np.nan) + val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4f6d84e52ea54..aa21c0f2bca5e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -475,7 +475,7 @@ def get_axe(block, qs, axes): b.mgr_locs = sb.mgr_locs else: - new_axes[axis] = Index(np.concatenate([ax.values for ax in axes])) + new_axes[axis] = Index(np.concatenate([ax._values for ax in axes])) if transposed: new_axes = new_axes[::-1] @@ -1269,12 +1269,22 @@ def reindex_indexer( return type(self).from_blocks(new_blocks, new_axes) - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): + def _slice_take_blocks_ax0( + self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False + ): """ Slice/take blocks along axis=0. Overloaded for SingleBlock + Parameters + ---------- + slice_or_indexer : slice, ndarray[bool], or list-like of ints + fill_value : scalar, default lib.no_default + only_slice : bool, default False + If True, we always return views on existing arrays, never copies. + This is used when called from ops.blockwise.operate_blockwise. 
+ Returns ------- new_blocks : list of Block @@ -1298,14 +1308,23 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=slice(0, sllen), - fill_value=fill_value, - ) - ] + if not allow_fill and only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + blocks = [ + blk.getitem_block([ml], new_mgr_locs=i) + for i, ml in enumerate(slobj) + ] + return blocks + else: + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=slice(0, sllen), + fill_value=fill_value, + ) + ] if sl_type in ("slice", "mask"): blknos = self.blknos[slobj] @@ -1342,11 +1361,25 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): blocks.append(newblk) else: - blocks.append( - blk.take_nd( - blklocs[mgr_locs.indexer], axis=0, new_mgr_locs=mgr_locs, - ) - ) + # GH#32779 to avoid the performance penalty of copying, + # we may try to only slice + taker = blklocs[mgr_locs.indexer] + max_len = max(len(mgr_locs), taker.max() + 1) + if only_slice: + taker = lib.maybe_indices_to_slice(taker, max_len) + + if isinstance(taker, slice): + nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) + blocks.append(nb) + elif only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + for i, ml in zip(taker, mgr_locs): + nb = blk.getitem_block([i], new_mgr_locs=ml) + blocks.append(nb) + else: + nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) return blocks diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index da1caea13b598..585e6d0eb0811 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -26,6 +26,7 @@ logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.blockwise import operate_blockwise from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( @@ -325,8 +326,9 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + array_op = get_array_op(func, str_rep=str_rep) + bm = operate_blockwise(left, right, array_op) + return type(left)(bm) elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 59ac2a2071f0a..eef42592d2b30 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -6,6 +6,7 @@ from functools import partial import operator from typing import Any, Optional, Tuple +import warnings import numpy as np @@ -120,7 +121,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def define_na_arithmetic_op(op, str_rep: str): +def define_na_arithmetic_op(op, str_rep: Optional[str]): def na_op(x, y): return na_arithmetic_op(x, y, op, str_rep) @@ -191,7 +192,8 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): # NB: We assume that extract_array has already been called # on `left` and `right`. 
lvalues = maybe_upcast_datetimelike_array(left) - rvalues = maybe_upcast_for_op(right, lvalues.shape) + rvalues = maybe_upcast_datetimelike_array(right) + rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): # Timedelta is included because numexpr will fail on it, see GH#31457 @@ -254,8 +256,13 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) + with warnings.catch_warnings(): + # suppress warnings from numpy about element-wise comparison + warnings.simplefilter("ignore", DeprecationWarning) + with np.errstate(all="ignore"): + res_values = na_arithmetic_op( + lvalues, rvalues, op, str_rep, is_cmp=True + ) return res_values diff --git a/pandas/core/ops/blockwise.py b/pandas/core/ops/blockwise.py new file mode 100644 index 0000000000000..f41a30b136637 --- /dev/null +++ b/pandas/core/ops/blockwise.py @@ -0,0 +1,102 @@ +from typing import TYPE_CHECKING, List, Tuple + +import numpy as np + +from pandas._typing import ArrayLike + +if TYPE_CHECKING: + from pandas.core.internals.blocks import Block # noqa:F401 + + +def operate_blockwise(left, right, array_op): + # At this point we have already checked + # assert right._indexed_same(left) + + res_blks: List["Block"] = [] + rmgr = right._mgr + for n, blk in enumerate(left._mgr.blocks): + locs = blk.mgr_locs + blk_vals = blk.values + + left_ea = not isinstance(blk_vals, np.ndarray) + + rblks = rmgr._slice_take_blocks_ax0(locs.indexer, only_slice=True) + + # Assertions are disabled for performance, but should hold: + # if left_ea: + # assert len(locs) == 1, locs + # assert len(rblks) == 1, rblks + # assert rblks[0].shape[0] == 1, rblks[0].shape + + for k, rblk in enumerate(rblks): + right_ea = not isinstance(rblk.values, np.ndarray) + + lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) + + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) + + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + + _reset_block_mgr_locs(nbs, locs) + + res_blks.extend(nbs) + + # Assertions are disabled for performance, but should hold: + # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} + # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + # assert nlocs == len(left.columns), (nlocs, len(left.columns)) + # assert len(slocs) == nlocs, (len(slocs), nlocs) + # assert slocs == set(range(nlocs)), slocs + + new_mgr = type(rmgr)(res_blks, axes=rmgr.axes, do_integrity_check=False) + return new_mgr + + +def _reset_block_mgr_locs(nbs: List["Block"], locs): + """ + Reset mgr_locs to correspond to our original DataFrame. + """ + for nb in nbs: + nblocs = locs.as_array[nb.mgr_locs.indexer] + nb.mgr_locs = nblocs + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + +def _get_same_shape_values( + lblk: "Block", rblk: "Block", left_ea: bool, right_ea: bool +) -> Tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. 
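+
+    If exactly one of the two blocks is backed by an ExtensionArray, the
+    2D ndarray side is sliced down to 1D so that both returned arrays
+    have matching shapes.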
+ """ + lvals = lblk.values + rvals = rblk.values + + # Require that the indexing into lvals be slice-like + assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs + + # TODO(EA2D): with 2D EAs pnly this first clause would be needed + if not (left_ea or right_ea): + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + lvals = lvals[rblk.mgr_locs.indexer, :] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + rvals = rvals[0, :] + + return lvals, rvals diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c8d5eecf0e496..ea5916eff3afa 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,7 +1,18 @@ -from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Union +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, + cast, +) import numpy as np +from pandas._typing import Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -424,19 +435,22 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": +def pivot( + data: "DataFrame", + index: Optional[Union[Label, Sequence[Label]]] = None, + columns: Optional[Union[Label, Sequence[Label]]] = None, + values: Optional[Union[Label, Sequence[Label]]] = None, +) -> "DataFrame": if columns is None: raise TypeError("pivot() missing 1 required argument: 'columns'") - columns = columns if is_list_like(columns) else [columns] + + columns = com.convert_to_list_like(columns) if values is None: - cols: List[str] = [] - if index is None: - pass - elif is_list_like(index): - cols = list(index) + if index is not None: + cols = com.convert_to_list_like(index) else: - cols = [index] + cols = [] cols.extend(columns) append = index is None @@ -444,10 +458,9 @@ def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFram else: if index is None: index = [Series(data.index, name=data.index.name)] - elif is_list_like(index): - index = [data[idx] for idx in index] else: - index = [data[index]] + index = com.convert_to_list_like(index) + index = [data[idx] for idx in index] data_columns = [data[col] for col in columns] index.extend(data_columns) @@ -455,6 +468,7 @@ def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFram if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name + values = cast(Sequence[Label], values) indexed = data._constructor( data[values]._values, index=index, columns=values ) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 19a75eb151782..a049ac99f0e08 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -247,7 +247,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): yield col, values.values @property - def nseries(self): + def nseries(self) -> int: if self.data.ndim == 1: return 1 else: diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index ccc49adc5da82..755fbd0d9036c 100644 --- a/pandas/tests/arithmetic/common.py +++ 
b/pandas/tests/arithmetic/common.py @@ -70,7 +70,14 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "Invalid comparison between|Cannot compare type|not supported between" + msg = "|".join( + [ + "Invalid comparison between", + "Cannot compare type", + "not supported between", + "invalid type promotion", + ] + ) with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 8c480faa4ee81..b3f4d5f5d9ee5 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -962,7 +962,9 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) - warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + warn = None + if box_with_array is not pd.DataFrame or tz_naive_fixture is None: + warn = PerformanceWarning with tm.assert_produces_warning(warn): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1465,7 +1467,7 @@ def test_dt64arr_add_sub_offset_array( other = tm.box_expected(other, box_with_array) warn = PerformanceWarning - if box_with_array is pd.DataFrame and not (tz is None and not box_other): + if box_with_array is pd.DataFrame and tz is not None: warn = None with tm.assert_produces_warning(warn): res = op(dtarr, other) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 65e3c6a07d4f3..904846c5fa099 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -552,8 +552,7 @@ def test_tda_add_dt64_object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(tdi, box) other = tm.box_expected(dti, box) - warn = PerformanceWarning if box is not pd.DataFrame else None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = obj + other.astype(object) tm.assert_equal(result, other) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 83ecebafbea07..4bf9b4b40d0b6 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -1,3 +1,4 @@ +import re import sys import numpy as np @@ -93,6 +94,37 @@ def test_deprecate_numeric_only_min_max(self, method): with tm.assert_produces_warning(expected_warning=FutureWarning): getattr(cat, method)(numeric_only=True) + @pytest.mark.parametrize("method", ["min", "max"]) + def test_numpy_min_max_raises(self, method): + cat = Categorical(["a", "b", "c", "b"], ordered=False) + msg = ( + f"Categorical is not ordered for operation {method}\n" + "you can use .as_ordered() to change the Categorical to an ordered one" + ) + method = getattr(np, method) + with pytest.raises(TypeError, match=re.escape(msg)): + method(cat) + + @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"]) + @pytest.mark.parametrize("method", ["min", "max"]) + def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg): + cat = Categorical(["a", "b", "c", "b"], ordered=True) + msg = ( + f"the '{kwarg}' parameter is not supported in the pandas implementation " + f"of {method}" + ) + kwargs = {kwarg: 42} + method = getattr(np, method) + with pytest.raises(ValueError, match=msg): + 
method(cat, **kwargs) + + @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")]) + def test_numpy_min_max_axis_equals_none(self, method, expected): + cat = Categorical(["a", "b", "c", "b"], ordered=True) + method = getattr(np, method) + result = method(cat, axis=None) + assert result == expected + @pytest.mark.parametrize( "values,categories,exp_mode", [ diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index b9102b1f84c4a..5cb27c697a64d 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -49,9 +49,11 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - msg = re.escape( - "Invalid comparison between dtype=datetime64[ns] and ndarray" - ) + msgs = [ + r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", + "invalid type promotion", + ] + msg = "|".join(msgs) with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 68c8b86250e06..9303a084f1e71 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -280,7 +280,7 @@ def test_non_cython_api(): result = g.mad() tm.assert_frame_equal(result, expected) - expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1]) + expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1]) result = gni.mad() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c88d16e34eab8..90c0d6bd183f2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -658,6 +658,34 @@ def test_groupby_as_index_agg(df): tm.assert_frame_equal(left, right) +def test_ops_not_as_index(reduction_func): + # GH 10355, 21090 + # Using as_index=False should not modify grouped column + + if reduction_func in ("corrwith",): + pytest.skip("Test not applicable") + + if reduction_func in ("nth", "ngroup", "size",): + pytest.skip("Skip until behavior is determined (GH #5755)") + + df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) + expected = getattr(df.groupby("a"), reduction_func)().reset_index() + + g = df.groupby("a", as_index=False) + + result = getattr(g, reduction_func)() + tm.assert_frame_equal(result, expected) + + result = g.agg(reduction_func) + tm.assert_frame_equal(result, expected) + + result = getattr(g["b"], reduction_func)() + tm.assert_frame_equal(result, expected) + + result = g["b"].agg(reduction_func) + tm.assert_frame_equal(result, expected) + + def test_as_index_series_return_frame(df): grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index 952443e0ad23b..1475b1ce2907c 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True): if not as_index: right = right.reset_index(drop=True) - tm.assert_series_equal(left, right, check_names=False) + if as_index: + tm.assert_series_equal(left, right, check_names=False) + else: + tm.assert_frame_equal(left, right, check_names=False) tm.assert_frame_equal(df, original_df) days = date_range("2015-08-23", periods=10) @@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True): def test_nunique(): df = DataFrame({"A": list("abbacc"), 
"B": list("abxacc"), "C": list("abbacx")}) - expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) result = df.groupby("A", as_index=False).nunique() tm.assert_frame_equal(result, expected) # as_index expected.index = list("abc") expected.index.name = "A" + expected = expected.drop(columns="A") result = df.groupby("A").nunique() tm.assert_frame_equal(result, expected) @@ -71,7 +75,7 @@ def test_nunique(): tm.assert_frame_equal(result, expected) # dropna - expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) expected.index.name = "A" result = df.replace({"x": None}).groupby("A").nunique() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 346e6ae6cb9cb..42bccc67fe0f8 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, PeriodIndex, Series import pandas._testing as tm @@ -36,3 +36,11 @@ def test_size_groupby_all_null(): result = df.groupby("A").size() expected = Series(dtype="int64", index=Index([], name="A")) tm.assert_series_equal(result, expected) + + +def test_size_period_index(): + # https://github.com/pandas-dev/pandas/issues/34010 + ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) + grp = ser.groupby(level="A") + result = grp.size() + tm.assert_series_equal(result, ser) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 8e387e9202ef6..6b33049a664de 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe): if new_names: msg = f""" There are uncatgeorized methods defined on the Grouper class: -{names}. +{new_names}. Was a new method recently added? 
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index c29caf13bb629..e59100146249a 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -83,7 +83,9 @@ def test_basic(self):
         if pyarrow_version >= LooseVersion("0.16.1.dev"):
             df["periods"] = pd.period_range("2013", freq="M", periods=3)
             df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
-            df["intervals"] = pd.interval_range(0, 3, 3)
+            # TODO: temporarily disabled due to a regression in pyarrow 0.17.1
+            # https://github.com/pandas-dev/pandas/issues/34255
+            # df["intervals"] = pd.interval_range(0, 3, 3)
 
         assert df.dttz.dtype.tz.zone == "US/Eastern"
         self.check_round_trip(df)
diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index 01b61da099481..e81f2ee55eebd 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -1061,7 +1061,13 @@ def test_add_invalid(self):
         per1 = Period(freq="D", year=2008, month=1, day=1)
         per2 = Period(freq="D", year=2008, month=1, day=2)
 
-        msg = r"unsupported operand type\(s\)"
+        msg = "|".join(
+            [
+                r"unsupported operand type\(s\)",
+                "can only concatenate str",
+                "must be str, not Period",
+            ]
+        )
         with pytest.raises(TypeError, match=msg):
             per1 + "str"
         with pytest.raises(TypeError, match=msg):
@@ -1402,8 +1408,15 @@ def test_sub_offset(self):
 
     @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
     def test_period_addsub_nat(self, freq):
-        assert NaT - Period("2011-01", freq=freq) is NaT
-        assert Period("2011-01", freq=freq) - NaT is NaT
+        per = Period("2011-01", freq=freq)
+
+        # For subtraction, NaT is treated as another Period object
+        assert NaT - per is NaT
+        assert per - NaT is NaT
+
+        # For addition, NaT is treated as offset-like
+        assert NaT + per is NaT
+        assert per + NaT is NaT
 
     def test_period_ops_offset(self):
         p = Period("2011-04-01", freq="D")
diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py
index a37dbbc89f5af..e5b0142dae48b 100644
--- a/pandas/tests/tseries/offsets/test_ticks.py
+++ b/pandas/tests/tseries/offsets/test_ticks.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs.tslibs.offsets import delta_to_tick
+
 from pandas import Timedelta, Timestamp
 import pandas._testing as tm
 
@@ -33,11 +35,11 @@ def test_apply_ticks():
 
 def test_delta_to_tick():
     delta = timedelta(3)
 
-    tick = offsets.delta_to_tick(delta)
+    tick = delta_to_tick(delta)
     assert tick == offsets.Day(3)
 
     td = Timedelta(nanoseconds=5)
-    tick = offsets.delta_to_tick(td)
+    tick = delta_to_tick(td)
     assert tick == Nano(5)
 
 
@@ -234,7 +236,7 @@ def test_tick_division(cls):
         assert not isinstance(result, cls)
         assert result.delta == off.delta / 1000
 
-    if cls._inc < Timedelta(seconds=1):
+    if cls._nanos_inc < Timedelta(seconds=1).value:
         # Case where we end up with a bigger class
         result = off / 0.001
         assert isinstance(result, offsets.Tick)
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index f907c5570bd18..f20734598bc74 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -164,7 +164,7 @@ def to_offset(freq) -> Optional[DateOffset]:
                 )
                 stride = int(stride)
                 offset = _get_offset(name)
-                offset = offset * int(np.fabs(stride) * stride_sign)  # type: ignore
+                offset = offset * int(np.fabs(stride) * stride_sign)
                 if delta is None:
                     delta = offset
                 else:
@@ -218,7 +218,7 @@ def _get_offset(name: str) -> DateOffset:
             klass = prefix_mapping[split[0]]
             # handles case where there's no
suffix (and will TypeError if too # many '-') - offset = klass._from_name(*split[1:]) # type: ignore + offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError) as err: # bad prefix or suffix raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name)) from err diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 364a50be5c291..3dd5f2a2fc4c8 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -6,20 +6,26 @@ import numpy as np from pandas._libs.tslibs import ( - Period, Timedelta, Timestamp, ccalendar, conversion, - delta_to_nanoseconds, frequencies as libfrequencies, offsets as liboffsets, ) -from pandas._libs.tslibs.offsets import ( +from pandas._libs.tslibs.offsets import ( # noqa:F401 ApplyTypeError, BaseOffset, BusinessMixin, CustomMixin, + Day, + Hour, + Micro, + Milli, + Minute, + Nano, + Second, + Tick, apply_index_wraps, apply_wraps, as_datetime, @@ -2125,118 +2131,6 @@ def is_on_offset(self, dt: datetime) -> bool: # --------------------------------------------------------------------- -# Ticks - - -class Tick(liboffsets._Tick, SingleConstructorOffset): - _inc = Timedelta(microseconds=1000) - - def __add__(self, other): - if isinstance(other, Tick): - if type(self) == type(other): - return type(self)(self.n + other.n) - else: - return delta_to_tick(self.delta + other.delta) - elif isinstance(other, Period): - return other + self - try: - return self.apply(other) - except ApplyTypeError: - return NotImplemented - except OverflowError as err: - raise OverflowError( - f"the add operation between {self} and {other} will overflow" - ) from err - - # This is identical to DateOffset.__hash__, but has to be redefined here - # for Python 3, because we've redefined __eq__. - def __hash__(self) -> int: - return hash(self._params) - - def apply(self, other): - # Timestamp can handle tz and nano sec, thus no need to use apply_wraps - if isinstance(other, Timestamp): - - # GH 15126 - # in order to avoid a recursive - # call of __add__ and __radd__ if there is - # an exception, when we call using the + operator, - # we directly call the known method - result = other.__add__(self) - if result is NotImplemented: - raise OverflowError - return result - elif isinstance(other, (datetime, np.datetime64, date)): - return Timestamp(other) + self - - if isinstance(other, timedelta): - return other + self.delta - elif isinstance(other, type(self)): - # TODO: this is reached in tests that specifically call apply, - # but should not be reached "naturally" because __add__ should - # catch this case first. 
- return type(self)(self.n + other.n) - - raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") - - -def delta_to_tick(delta: timedelta) -> Tick: - if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: - # nanoseconds only for pd.Timedelta - if delta.seconds == 0: - return Day(delta.days) - else: - seconds = delta.days * 86400 + delta.seconds - if seconds % 3600 == 0: - return Hour(seconds / 3600) - elif seconds % 60 == 0: - return Minute(seconds / 60) - else: - return Second(seconds) - else: - nanos = delta_to_nanoseconds(delta) - if nanos % 1_000_000 == 0: - return Milli(nanos // 1_000_000) - elif nanos % 1000 == 0: - return Micro(nanos // 1000) - else: # pragma: no cover - return Nano(nanos) - - -class Day(Tick): - _inc = Timedelta(days=1) - _prefix = "D" - - -class Hour(Tick): - _inc = Timedelta(hours=1) - _prefix = "H" - - -class Minute(Tick): - _inc = Timedelta(minutes=1) - _prefix = "T" - - -class Second(Tick): - _inc = Timedelta(seconds=1) - _prefix = "S" - - -class Milli(Tick): - _inc = Timedelta(milliseconds=1) - _prefix = "L" - - -class Micro(Tick): - _inc = Timedelta(microseconds=1) - _prefix = "U" - - -class Nano(Tick): - _inc = Timedelta(nanoseconds=1) - _prefix = "N" - BDay = BusinessDay BMonthEnd = BusinessMonthEnd @@ -2246,7 +2140,7 @@ class Nano(Tick): CDay = CustomBusinessDay prefix_mapping = { - offset._prefix: offset # type: ignore + offset._prefix: offset for offset in [ YearBegin, # 'AS' YearEnd, # 'A' From 3a0e80bd24d7c5fdef31e49ebc20cc3b761c7d7c Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 19 May 2020 19:16:59 -0400 Subject: [PATCH 2/4] Removed old test --- pandas/tests/groupby/test_function.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7ceeddbe82ca5..9303a084f1e71 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -573,28 +573,6 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) -def test_ops_not_as_index(reduction_func): - # GH 10355 - # Using as_index=False should not modify grouped column - - if reduction_func in ("nth", "ngroup", "size",): - pytest.skip("Skip until behavior is determined (GH #5755)") - - if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",): - pytest.xfail( - "_GroupBy._python_apply_general incorrectly modifies grouping columns" - ) - - df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) - expected = getattr(df.groupby("a"), reduction_func)().reset_index() - - result = getattr(df.groupby("a", as_index=False), reduction_func)() - tm.assert_frame_equal(result, expected) - - result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)() - tm.assert_frame_equal(result, expected) - - def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log From 995565ddee7702eaf74fd56c907f382c5adc54a4 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 26 May 2020 16:22:41 -0400 Subject: [PATCH 3/4] Fix typing, whatsnew --- doc/source/whatsnew/v1.1.0.rst | 20 +++++--------------- pandas/core/groupby/groupby.py | 6 ++++-- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a02da69ecf4fb..0cd55ddfabf37 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -561,12 +561,12 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns 
df[['a', 'c']] = 1 df -.. _whatsnew_110.api_breaking.groupby_nunique: +.. _whatsnew_110.api_breaking.groupby_consistency: -Using groupby with ``nunique`` and ``as_index=True`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Consistency across groupby reductions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now, the grouping columns only appear in the index. This is now consistent with other aggregation functions. (:issue:`32579`) +Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now the grouping column(s) only appear in the index, consistent with other reductions. (:issue:`32579`) .. ipython:: python @@ -590,17 +590,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nun df.groupby("a", as_index=True).nunique() -.. _whatsnew_110.api_breaking.groupby_as_index_false: - -Using groupby with ``as_index=False`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column. Now, the grouping column remains unchanged. (:issue:`21090`) - -.. ipython:: python - - df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]}) - df +Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``, ``idxmin``, ``mad``, ``nunique``, ``sem``, ``skew``, or ``std`` would modify the grouping column. Now the grouping column remains unchanged, consistent with other reductions. (:issue:`21090`, :issue:`10355`) *Previous behavior*: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 604c9071584a3..9838cff9b34f9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -35,7 +35,7 @@ class providing the base-class of operations. 
from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby -from pandas._typing import F, FrameOrSeries, Scalar +from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -868,7 +868,9 @@ def f(g): return result - def _python_apply_general(self, f: F, data: Union[Series, DataFrame]): + def _python_apply_general( + self, f: F, data: FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: """ Apply function f in python space From a70ae8de3d5a633ce58abc840d02451ac91ce24f Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 26 May 2020 16:35:22 -0400 Subject: [PATCH 4/4] Removed redundant whatsnew line --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0cd55ddfabf37..b69eda2f5e2b3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -866,7 +866,6 @@ Groupby/resample/rolling - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) -- Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) Reshaping ^^^^^^^^^
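Taken together, the consolidated whatsnew entry in patch 3/4 promises that ``as_index=False`` reductions such as ``std`` and ``sem`` no longer overwrite the grouping column with reduced values, which is why the separate bug note removed in patch 4/4 had become redundant. A quick sketch of the promised behavior (illustrative only, not part of the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1.0, 1.0, 2.0, 3.0]})

    result = df.groupby("a", as_index=False).std()

    # the grouping column keeps its original group labels ...
    assert list(result["a"]) == ["x", "y"]

    # ... and only the value column is reduced (sample std, ddof=1)
    assert np.isclose(result["b"].iloc[1], np.std([2.0, 3.0], ddof=1))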