From b2ff784859e0e22fd4380df7501b949398dabaf4 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 14:28:59 -0800 Subject: [PATCH 1/6] PERF: make DTA/TDA/PA _ndarray the attribute, _data the property --- pandas/core/arrays/datetimelike.py | 21 ++++++++++-- pandas/core/arrays/datetimes.py | 10 +++--- pandas/core/arrays/period.py | 10 +++--- pandas/core/arrays/timedeltas.py | 51 ++++++++++++++++------------- pandas/core/indexes/datetimelike.py | 4 +-- 5 files changed, 59 insertions(+), 37 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bae22505145b5..1616d09020d67 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -253,9 +253,24 @@ def _check_compatible_with( # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat + def __setstate__(self, state): + if isinstance(state, dict): + if "_data" in state and "_ndarray" not in state: + # backward compat, changed what is property vs attribute + state["_ndarray"] = state.pop("_data") + for key, value in state.items(): + setattr(self, key, value) + else: + # PeriodArray, bc it mixes in a cython class + if isinstance(state, tuple) and len(state) == 1: + state = state[0] + self.__setstate__(state) + else: + raise TypeError(state) + @cache_readonly - def _ndarray(self) -> np.ndarray: - return self._data + def _data(self) -> np.ndarray: + return self._ndarray def _from_backing_data( self: DatetimeLikeArrayT, arr: np.ndarray @@ -294,7 +309,7 @@ def asi8(self) -> np.ndarray: An ndarray with int64 dtype. """ # do not cache or you'll create a memory leak - return self._data.view("i8") + return self._ndarray.view("i8") # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 3982a7deca2bb..28e469547fe62 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -261,7 +261,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): if freq is None: freq = values.freq - values = values._data + values = values._ndarray if not isinstance(values, np.ndarray): raise ValueError( @@ -303,7 +303,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): # be incorrect(ish?) for the array as a whole dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) - self._data = values + self._ndarray = values self._dtype = dtype self._freq = freq @@ -320,7 +320,7 @@ def _simple_new( values = values.view(DT64NS_DTYPE) result = object.__new__(cls) - result._data = values + result._ndarray = values result._freq = freq result._dtype = dtype return result @@ -618,7 +618,7 @@ def astype(self, dtype, copy=True): elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype: # unit conversion e.g. 
datetime64[s] - return self._data.astype(dtype) + return self._ndarray.astype(dtype) elif is_period_dtype(dtype): return self.to_period(freq=dtype.freq) @@ -1138,7 +1138,7 @@ def to_period(self, freq=None): freq = res - return PeriodArray._from_datetime64(self._data, freq, tz=self.tz) + return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz) def to_perioddelta(self, freq): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 109be2c67bb1a..96a159c0804c9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -181,6 +181,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): _datetimelike_ops = _field_ops + _object_ops + _bool_ops _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"] + __setstate__ = dtl.DatelikeOps.__setstate__ + # -------------------------------------------------------------------- # Constructors @@ -201,10 +203,10 @@ def __init__(self, values, dtype: Optional[Dtype] = None, freq=None, copy=False) if isinstance(values, type(self)): if freq is not None and freq != values.freq: raise raise_on_incompatible(values, freq) - values, freq = values._data, values.freq + values, freq = values._ndarray, values.freq values = np.array(values, dtype="int64", copy=copy) - self._data = values + self._ndarray = values if freq is None: raise ValueError("freq is not specified and cannot be inferred") self._dtype = PeriodDtype(freq) @@ -347,7 +349,7 @@ def __arrow_array__(self, type=None): if type is not None: if pyarrow.types.is_integer(type): - return pyarrow.array(self._data, mask=self.isna(), type=type) + return pyarrow.array(self._ndarray, mask=self.isna(), type=type) elif isinstance(type, ArrowPeriodType): # ensure we have the same freq if self.freqstr != type.freq: @@ -361,7 +363,7 @@ def __arrow_array__(self, type=None): ) period_type = ArrowPeriodType(self.freqstr) - storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + storage_array = pyarrow.array(self._ndarray, mask=self.isna(), type="int64") return pyarrow.ExtensionArray.from_storage(period_type, storage_array) # -------------------------------------------------------------------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 893644be23a0e..1a4ee52e414b4 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -163,6 +163,8 @@ def dtype(self) -> np.dtype: # ---------------------------------------------------------------- # Constructors + _freq = None + def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): values = extract_array(values) @@ -179,7 +181,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): elif freq and values.freq: freq = to_offset(freq) freq, _ = dtl.validate_inferred_freq(freq, values.freq, False) - values = values._data + values = values._ndarray if not isinstance(values, np.ndarray): msg = ( @@ -211,7 +213,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): if freq: freq = to_offset(freq) - self._data = values + self._ndarray = values self._dtype = dtype self._freq = freq @@ -229,7 +231,7 @@ def _simple_new( values = values.view(TD64NS_DTYPE) result = object.__new__(cls) - result._data = values + result._ndarray = values result._freq = to_offset(freq) result._dtype = TD64NS_DTYPE return result @@ -341,7 +343,7 @@ def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) if dtype.kind == "m": - return astype_td64_unit_conversion(self._data, 
dtype, copy=copy) + return astype_td64_unit_conversion(self._ndarray, dtype, copy=copy) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) @@ -415,8 +417,8 @@ def _formatter(self, boxed=False): def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import get_format_timedelta64 - formatter = get_format_timedelta64(self._data, na_rep) - return np.array([formatter(x) for x in self._data]) + formatter = get_format_timedelta64(self._ndarray, na_rep) + return np.array([formatter(x) for x in self._ndarray]) # ---------------------------------------------------------------- # Arithmetic Methods @@ -485,7 +487,7 @@ def _addsub_object_array(self, other, op): def __mul__(self, other) -> TimedeltaArray: if is_scalar(other): # numpy will accept float and int, raise TypeError for others - result = self._data * other + result = self._ndarray * other freq = None if self.freq is not None and not isna(other): freq = self.freq * other @@ -508,7 +510,7 @@ def __mul__(self, other) -> TimedeltaArray: return type(self)(result) # numpy will accept float or int dtype, raise TypeError for others - result = self._data * other + result = self._ndarray * other return type(self)(result) __rmul__ = __mul__ @@ -526,11 +528,11 @@ def __truediv__(self, other): return result # otherwise, dispatch to Timedelta implementation - return self._data / other + return self._ndarray / other elif lib.is_scalar(other): # assume it is numeric - result = self._data / other + result = self._ndarray / other freq = None if self.freq is not None: # Tick division is not implemented, so operate on Timedelta @@ -546,7 +548,7 @@ def __truediv__(self, other): elif is_timedelta64_dtype(other.dtype): # let numpy handle it - return self._data / other + return self._ndarray / other elif is_object_dtype(other.dtype): # We operate on raveled arrays to avoid problems in inference @@ -568,7 +570,7 @@ def __truediv__(self, other): return result else: - result = self._data / other + result = self._ndarray / other return type(self)(result) @unpack_zerodim_and_defer("__rtruediv__") @@ -583,7 +585,7 @@ def __rtruediv__(self, other): return result # otherwise, dispatch to Timedelta implementation - return other / self._data + return other / self._ndarray elif lib.is_scalar(other): raise TypeError( @@ -599,7 +601,7 @@ def __rtruediv__(self, other): elif is_timedelta64_dtype(other.dtype): # let numpy handle it - return other / self._data + return other / self._ndarray elif is_object_dtype(other.dtype): # Note: unlike in __truediv__, we do not _need_ to do type @@ -626,7 +628,7 @@ def __floordiv__(self, other): return result # dispatch to Timedelta implementation - result = other.__rfloordiv__(self._data) + result = other.__rfloordiv__(self._ndarray) return result # at this point we should only have numeric scalars; anything @@ -670,7 +672,7 @@ def __floordiv__(self, other): return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): - result = self._data // other + result = self._ndarray // other return type(self)(result) else: @@ -690,7 +692,7 @@ def __rfloordiv__(self, other): return result # dispatch to Timedelta implementation - result = other.__floordiv__(self._data) + result = other.__floordiv__(self._ndarray) return result raise TypeError( @@ -760,15 +762,15 @@ def __rdivmod__(self, other): def __neg__(self) -> TimedeltaArray: if self.freq is not None: - return type(self)(-self._data, freq=-self.freq) - return type(self)(-self._data) + return type(self)(-self._ndarray, 
freq=-self.freq) + return type(self)(-self._ndarray) def __pos__(self) -> TimedeltaArray: - return type(self)(self._data, freq=self.freq) + return type(self)(self._ndarray, freq=self.freq) def __abs__(self) -> TimedeltaArray: # Note: freq is not preserved - return type(self)(np.abs(self._data)) + return type(self)(np.abs(self._ndarray)) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods @@ -946,9 +948,12 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values - elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): + elif isinstance(data, ABCTimedeltaIndex): + inferred_freq = data.freq + data = data._data._ndarray + elif isinstance(data, TimedeltaArray): inferred_freq = data.freq - data = data._data + data = data._ndarray elif isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=tslibs.iNaT) elif is_categorical_dtype(data.dtype): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e1f2a40598963..6d5992540ef49 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -150,7 +150,7 @@ def _simple_new( result._cache = {} # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._data + result._index_data = values._ndarray result._reset_identity() return result @@ -165,7 +165,7 @@ def _is_all_dates(self) -> bool: @property def values(self) -> np.ndarray: # Note: PeriodArray overrides this to return an ndarray of objects. - return self._data._data + return self._data._ndarray def __array_wrap__(self, result, context=None): """ From 1a8c218c1814c0b697ca28573598cc48fd5e9e42 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Feb 2021 15:04:08 -0800 Subject: [PATCH 2/6] merge --- .github/workflows/ci.yml | 1 + .pre-commit-config.yaml | 5 + asv_bench/benchmarks/algos/isin.py | 6 +- doc/source/ecosystem.rst | 2 +- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 4 +- doc/source/whatsnew/v1.2.3.rst | 3 + doc/source/whatsnew/v1.3.0.rst | 7 + pandas/_libs/lib.pyx | 2 +- pandas/_libs/tslibs/conversion.pyx | 5 + pandas/_libs/window/indexers.pyx | 2 +- pandas/core/algorithms.py | 399 +-- pandas/core/apply.py | 21 +- pandas/core/array_algos/take.py | 442 +++ pandas/core/arrays/_mixins.py | 11 +- pandas/core/arrays/base.py | 18 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 4 +- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/string_arrow.py | 13 +- pandas/core/arrays/timedeltas.py | 12 +- pandas/core/construction.py | 36 +- pandas/core/dtypes/cast.py | 59 +- pandas/core/frame.py | 5 +- pandas/core/generic.py | 2 + pandas/core/groupby/generic.py | 45 +- pandas/core/groupby/groupby.py | 6 +- pandas/core/internals/__init__.py | 4 +- pandas/core/internals/array_manager.py | 35 +- pandas/core/internals/blocks.py | 8 +- pandas/core/internals/concat.py | 56 +- pandas/core/internals/construction.py | 2 +- pandas/core/internals/managers.py | 61 +- pandas/core/internals/ops.py | 2 +- pandas/core/missing.py | 16 + pandas/core/reshape/concat.py | 4 +- pandas/core/reshape/merge.py | 10 +- pandas/core/reshape/pivot.py | 19 +- pandas/core/series.py | 2 +- pandas/io/formats/style.py | 5 +- pandas/io/json/_json.py | 2 +- pandas/tests/apply/test_frame_apply.py | 2735 +++++++++-------- .../apply/test_frame_apply_relabeling.py | 197 +- 
pandas/tests/apply/test_series_apply.py | 1508 ++++----- .../apply/test_series_apply_relabeling.py | 58 +- pandas/tests/arrays/masked/test_arithmetic.py | 13 + pandas/tests/base/test_conversion.py | 2 +- pandas/tests/dtypes/test_common.py | 2 +- .../tests/frame/indexing/test_categorical.py | 293 -- pandas/tests/frame/indexing/test_getitem.py | 70 + pandas/tests/frame/indexing/test_indexing.py | 463 ++- pandas/tests/frame/indexing/test_setitem.py | 183 ++ pandas/tests/frame/methods/test_drop.py | 2 +- pandas/tests/frame/methods/test_explode.py | 5 - pandas/tests/frame/methods/test_join.py | 10 +- pandas/tests/frame/methods/test_update.py | 9 + .../tests/groupby/aggregate/test_aggregate.py | 10 +- pandas/tests/groupby/test_groupby.py | 53 +- pandas/tests/groupby/test_sample.py | 13 +- .../indexing/test_chaining_and_caching.py | 16 + pandas/tests/indexing/test_datetime.py | 85 - pandas/tests/indexing/test_floats.py | 134 +- pandas/tests/indexing/test_iloc.py | 75 + pandas/tests/indexing/test_indexing.py | 60 +- pandas/tests/indexing/test_loc.py | 269 ++ pandas/tests/io/formats/test_printing.py | 2 +- pandas/tests/io/json/test_compression.py | 12 + pandas/tests/io/test_fsspec.py | 16 +- pandas/tests/resample/test_base.py | 2 +- .../tests/resample/test_resampler_grouper.py | 13 + pandas/tests/reshape/merge/test_join.py | 3 + pandas/tests/reshape/merge/test_merge.py | 25 +- pandas/tests/reshape/test_crosstab.py | 5 +- pandas/tests/reshape/test_cut.py | 22 +- pandas/tests/reshape/test_pivot.py | 5 +- pandas/tests/series/indexing/test_datetime.py | 100 +- pandas/tests/series/indexing/test_getitem.py | 31 + pandas/tests/series/indexing/test_indexing.py | 189 +- pandas/tests/series/indexing/test_setitem.py | 94 + pandas/tests/tools/test_to_timedelta.py | 16 + pandas/tests/window/test_rolling.py | 15 + 81 files changed, 4250 insertions(+), 3907 deletions(-) create mode 100644 pandas/core/array_algos/take.py delete mode 100644 pandas/tests/frame/indexing/test_categorical.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a07149fe8171..461363d295f6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,7 @@ jobs: source activate pandas-dev pytest pandas/tests/frame/methods --array-manager pytest pandas/tests/arithmetic/ --array-manager + pytest pandas/tests/reshape/merge --array-manager # indexing subset (temporary since other tests don't pass yet) pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d433fb08209bf..47a9ae592f940 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -163,6 +163,11 @@ repos: entry: np\.bool[^_8] language: pygrep types_or: [python, cython, rst] + - id: np-object + name: Check for use of np.object instead of np.object_ + entry: np\.object[^_8] + language: pygrep + types_or: [python, cython, rst] - id: no-os-remove name: Check code for instances of os.remove entry: os\.remove diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 5d81d9d0d45a3..75a96e5b691ca 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -104,7 +104,7 @@ def time_isin(self, dtype, exponent, title): class IsinWithRandomFloat: params = [ - [np.float64, np.object], + [np.float64, np.object_], [ 1_300, 2_000, @@ -134,7 +134,7 @@ def time_isin(self, dtype, size, title): class IsinWithArangeSorted: params = [ - [np.float64, np.int64, 
np.uint64, np.object], + [np.float64, np.int64, np.uint64, np.object_], [ 1_000, 2_000, @@ -155,7 +155,7 @@ def time_isin(self, dtype, size): class IsinWithArange: params = [ - [np.float64, np.int64, np.uint64, np.object], + [np.float64, np.int64, np.uint64, np.object_], [ 1_000, 2_000, diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 4b69d5b0c8c77..e72a9d86daeaf 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -156,7 +156,7 @@ A good implementation for Python users is `has2k1/plotnine `__ leverages `Vega -`__ to create plots within Jupyter Notebook. +`__ to create plots within Jupyter Notebook. `Plotly `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 67c74f9a04618..d7c1ca8bca598 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -29,7 +29,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` binary;`ORC Format `__;:ref:`read_orc`; - binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` + binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ce784231a47d2..f5175283cce4e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1755,8 +1755,8 @@ Missing - Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) +- :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object_``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object_``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) - :class:`DataFrame` and :class:`Series` now properly handle numpy masked arrays with hardened masks. Previously, constructing a DataFrame or Series from a masked array with a hard mask would create a pandas object containing the underlying value, rather than the expected NaN. (:issue:`24574`) - Bug in :class:`DataFrame` constructor where ``dtype`` argument was not honored when handling numpy masked record arrays. 
(:issue:`24874`) diff --git a/doc/source/whatsnew/v1.2.3.rst b/doc/source/whatsnew/v1.2.3.rst index d305024909703..28fc83459b69d 100644 --- a/doc/source/whatsnew/v1.2.3.rst +++ b/doc/source/whatsnew/v1.2.3.rst @@ -16,7 +16,10 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`~DataFrame.to_excel` raising ``KeyError`` when giving duplicate columns with ``columns`` attribute (:issue:`39695`) +- Fixed regression in nullable integer unary ops propagating mask on assignment (:issue:`39943`) - Fixed regression in :meth:`DataFrame.__setitem__` not aligning :class:`DataFrame` on right-hand side for boolean indexer (:issue:`39931`) +- Fixed regression in :meth:`~DataFrame.to_json` failing to use ``compression`` with URL-like paths that are internally opened in binary mode or with user-provided file objects that are opened in binary mode (:issue:`39985`) +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7d1adf99198d0..09f235bde5f79 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -269,6 +269,7 @@ Performance improvements - Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) - Performance improvement in :func:`unique` for object data type (:issue:`37615`) - Performance improvement in :class:`core.window.rolling.ExpandingGroupby` aggregation methods (:issue:`39664`) +- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`) .. --------------------------------------------------------------------------- @@ -307,6 +308,7 @@ Timedelta - Bug in constructing :class:`Timedelta` from ``np.timedelta64`` objects with non-nanosecond units that are out of bounds for ``timedelta64[ns]`` (:issue:`38965`) - Bug in constructing a :class:`TimedeltaIndex` incorrectly accepting ``np.datetime64("NaT")`` objects (:issue:`39462`) - Bug in constructing :class:`Timedelta` from input string with only symbols and no digits failed to raise an error (:issue:`39710`) +- Bug in :class:`TimedeltaIndex` and :func:`to_timedelta` failing to raise when passed non-nanosecond ``timedelta64`` arrays that overflow when converting to ``timedelta64[ns]`` (:issue:`40008`) Timezones ^^^^^^^^^ @@ -322,6 +324,7 @@ Numeric - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) - Bug in :func:`select_dtypes` different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`) +- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Conversion @@ -437,6 +440,9 @@ Groupby/resample/rolling - Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing 
``other`` that was longer than each group (:issue:`39591`)
 - Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`)
+- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
+- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
+- Bug in :meth:`DataFrameGroupBy.sample` where error was raised when ``weights`` was specified and the index was an :class:`Int64Index` (:issue:`39927`)
 -

 Reshaping
@@ -452,6 +458,7 @@ Reshaping
 - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
 - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
 - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
+- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)

 Sparse
 ^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index dc8b36a3898b7..d2aa47f65d263 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1468,7 +1468,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
         if is_decimal_array(values):
             return "decimal"

-    elif is_complex(val):
+    elif util.is_complex_object(val):
         if is_complex_array(values):
             return "complex"

diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 0646c58fa84b6..536cb63cc6119 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -239,6 +239,11 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True):
         return result

     unit = get_datetime64_unit(arr.flat[0])
+    if unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+        # without raising explicitly here, we end up with a SystemError
+        # built-in function ensure_datetime64ns returned a result with an error
+        raise ValueError("datetime64/timedelta64 must have a unit specified")
+
     if unit == NPY_FR_ns:
         if copy:
             arr = arr.copy()
diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx
index b8b9a8553161f..67b196b7cb179 100644
--- a/pandas/_libs/window/indexers.pyx
+++ b/pandas/_libs/window/indexers.pyx
@@ -88,7 +88,7 @@ def calculate_variable_window_bounds(

     # left endpoint is closed
     if left_closed:
-        start_bound -= 1
+        start_bound -= 1 * index_growth_sign

     # advance the start bound until we are
     # within the constraint
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 62172de5b7ec2..819e5a1c32d9b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -39,7 +39,6 @@
 from pandas.core.dtypes.cast import (
     construct_1d_object_array_from_listlike,
     infer_dtype_from_array,
-    maybe_promote,
 )
 from pandas.core.dtypes.common import (
     ensure_float64,
@@ -83,6 +82,7 @@
     na_value_for_dtype,
 )

+from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
     array,
     ensure_wrapped_if_datetimelike,
@@ -1414,219 +1414,6 @@ def get_indexer(current_indexer, other_indexer):
 # ---- #

-def 
_view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): - def wrapper( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan - ): - if arr_dtype is not None: - arr = arr.view(arr_dtype) - if out_dtype is not None: - out = out.view(out_dtype) - if fill_wrap is not None: - fill_value = fill_wrap(fill_value) - f(arr, indexer, out, fill_value=fill_value) - - return wrapper - - -def _convert_wrapper(f, conv_dtype): - def wrapper( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan - ): - if conv_dtype == object: - # GH#39755 avoid casting dt64/td64 to integers - arr = ensure_wrapped_if_datetimelike(arr) - arr = arr.astype(conv_dtype) - f(arr, indexer, out, fill_value=fill_value) - - return wrapper - - -def _take_2d_multi_object( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, mask_info -) -> None: - # this is not ideal, performance-wise, but it's better than raising - # an exception (best to optimize in Cython to avoid getting here) - row_idx, col_idx = indexer - if mask_info is not None: - (row_mask, col_mask), (row_needs, col_needs) = mask_info - else: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - if fill_value is not None: - if row_needs: - out[row_mask, :] = fill_value - if col_needs: - out[:, col_mask] = fill_value - for i in range(len(row_idx)): - u_ = row_idx[i] - for j in range(len(col_idx)): - v = col_idx[j] - out[i, j] = arr[u_, v] - - -def _take_nd_object( - arr: np.ndarray, - indexer: np.ndarray, - out: np.ndarray, - axis: int, - fill_value, - mask_info, -): - if mask_info is not None: - mask, needs_masking = mask_info - else: - mask = indexer == -1 - needs_masking = mask.any() - if arr.dtype != out.dtype: - arr = arr.astype(out.dtype) - if arr.shape[axis] > 0: - arr.take(ensure_platform_int(indexer), axis=axis, out=out) - if needs_masking: - outindexer = [slice(None)] * arr.ndim - outindexer[axis] = mask - out[tuple(outindexer)] = fill_value - - -_take_1d_dict = { - ("int8", "int8"): algos.take_1d_int8_int8, - ("int8", "int32"): algos.take_1d_int8_int32, - ("int8", "int64"): algos.take_1d_int8_int64, - ("int8", "float64"): algos.take_1d_int8_float64, - ("int16", "int16"): algos.take_1d_int16_int16, - ("int16", "int32"): algos.take_1d_int16_int32, - ("int16", "int64"): algos.take_1d_int16_int64, - ("int16", "float64"): algos.take_1d_int16_float64, - ("int32", "int32"): algos.take_1d_int32_int32, - ("int32", "int64"): algos.take_1d_int32_int64, - ("int32", "float64"): algos.take_1d_int32_float64, - ("int64", "int64"): algos.take_1d_int64_int64, - ("int64", "float64"): algos.take_1d_int64_float64, - ("float32", "float32"): algos.take_1d_float32_float32, - ("float32", "float64"): algos.take_1d_float32_float64, - ("float64", "float64"): algos.take_1d_float64_float64, - ("object", "object"): algos.take_1d_object_object, - ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_1d_int64_int64, np.int64, np.int64, np.int64 - ), -} - -_take_2d_axis0_dict = { - ("int8", "int8"): algos.take_2d_axis0_int8_int8, - ("int8", "int32"): algos.take_2d_axis0_int8_int32, - ("int8", "int64"): algos.take_2d_axis0_int8_int64, - ("int8", "float64"): algos.take_2d_axis0_int8_float64, - ("int16", "int16"): algos.take_2d_axis0_int16_int16, - ("int16", "int32"): algos.take_2d_axis0_int16_int32, - 
("int16", "int64"): algos.take_2d_axis0_int16_int64, - ("int16", "float64"): algos.take_2d_axis0_int16_float64, - ("int32", "int32"): algos.take_2d_axis0_int32_int32, - ("int32", "int64"): algos.take_2d_axis0_int32_int64, - ("int32", "float64"): algos.take_2d_axis0_int32_float64, - ("int64", "int64"): algos.take_2d_axis0_int64_int64, - ("int64", "float64"): algos.take_2d_axis0_int64_float64, - ("float32", "float32"): algos.take_2d_axis0_float32_float32, - ("float32", "float64"): algos.take_2d_axis0_float32_float64, - ("float64", "float64"): algos.take_2d_axis0_float64_float64, - ("object", "object"): algos.take_2d_axis0_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 - ), -} - -_take_2d_axis1_dict = { - ("int8", "int8"): algos.take_2d_axis1_int8_int8, - ("int8", "int32"): algos.take_2d_axis1_int8_int32, - ("int8", "int64"): algos.take_2d_axis1_int8_int64, - ("int8", "float64"): algos.take_2d_axis1_int8_float64, - ("int16", "int16"): algos.take_2d_axis1_int16_int16, - ("int16", "int32"): algos.take_2d_axis1_int16_int32, - ("int16", "int64"): algos.take_2d_axis1_int16_int64, - ("int16", "float64"): algos.take_2d_axis1_int16_float64, - ("int32", "int32"): algos.take_2d_axis1_int32_int32, - ("int32", "int64"): algos.take_2d_axis1_int32_int64, - ("int32", "float64"): algos.take_2d_axis1_int32_float64, - ("int64", "int64"): algos.take_2d_axis1_int64_int64, - ("int64", "float64"): algos.take_2d_axis1_int64_float64, - ("float32", "float32"): algos.take_2d_axis1_float32_float32, - ("float32", "float64"): algos.take_2d_axis1_float32_float64, - ("float64", "float64"): algos.take_2d_axis1_float64_float64, - ("object", "object"): algos.take_2d_axis1_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 - ), -} - -_take_2d_multi_dict = { - ("int8", "int8"): algos.take_2d_multi_int8_int8, - ("int8", "int32"): algos.take_2d_multi_int8_int32, - ("int8", "int64"): algos.take_2d_multi_int8_int64, - ("int8", "float64"): algos.take_2d_multi_int8_float64, - ("int16", "int16"): algos.take_2d_multi_int16_int16, - ("int16", "int32"): algos.take_2d_multi_int16_int32, - ("int16", "int64"): algos.take_2d_multi_int16_int64, - ("int16", "float64"): algos.take_2d_multi_int16_float64, - ("int32", "int32"): algos.take_2d_multi_int32_int32, - ("int32", "int64"): algos.take_2d_multi_int32_int64, - ("int32", "float64"): algos.take_2d_multi_int32_float64, - ("int64", "int64"): algos.take_2d_multi_int64_int64, - ("int64", "float64"): algos.take_2d_multi_int64_float64, - ("float32", "float32"): algos.take_2d_multi_float32_float32, - ("float32", "float64"): algos.take_2d_multi_float32_float64, - ("float64", "float64"): algos.take_2d_multi_float64_float64, - ("object", "object"): algos.take_2d_multi_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 
- ), -} - - -def _get_take_nd_function( - ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None -): - if ndim <= 2: - tup = (arr_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - return func - - tup = (out_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - func = _convert_wrapper(func, out_dtype) - return func - - def func2(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) - _take_nd_object( - arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info - ) - - return func2 - - def take( arr, indices: np.ndarray, axis: int = 0, allow_fill: bool = False, fill_value=None ): @@ -1720,190 +1507,6 @@ def take( return result -def _take_preprocess_indexer_and_fill_value( - arr: np.ndarray, - indexer: Optional[np.ndarray], - axis: int, - out: Optional[np.ndarray], - fill_value, - allow_fill: bool, -): - mask_info = None - - if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - indexer = ensure_int64(indexer, copy=False) - if not allow_fill: - dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - mask = indexer == -1 - needs_masking = mask.any() - mask_info = mask, needs_masking - if needs_masking: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() - - return indexer, dtype, fill_value, mask_info - - -def take_nd( - arr, - indexer, - axis: int = 0, - out: Optional[np.ndarray] = None, - fill_value=lib.no_default, - allow_fill: bool = True, -): - """ - Specialized Cython take which sets NaN values in one pass - - This dispatches to ``take`` defined on ExtensionArrays. It does not - currently dispatch to ``SparseArray.take`` for sparse ``arr``. - - Parameters - ---------- - arr : array-like - Input array. - indexer : ndarray - 1-D array of indices to take, subarrays corresponding to -1 value - indices are filed with fill_value - axis : int, default 0 - Axis to take from - out : ndarray or None, default None - Optional output array, must be appropriate type to hold input and - fill_value together, if indexer has any -1 value entries; call - maybe_promote to determine this type for any fill_value - fill_value : any, default np.nan - Fill value to replace -1 values with - allow_fill : boolean, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - - Returns - ------- - subarray : array-like - May be the same type as the input, or cast to an ndarray. 
- """ - if fill_value is lib.no_default: - fill_value = na_value_for_dtype(arr.dtype, compat=False) - - if isinstance(arr, ABCExtensionArray): - # Check for EA to catch DatetimeArray, TimedeltaArray - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - - arr = extract_array(arr) - arr = np.asarray(arr) - - indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, axis, out, fill_value, allow_fill - ) - - flip_order = False - if arr.ndim == 2 and arr.flags.f_contiguous: - flip_order = True - - if flip_order: - arr = arr.T - axis = arr.ndim - axis - 1 - if out is not None: - out = out.T - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - if out is None: - out_shape_ = list(arr.shape) - out_shape_[axis] = len(indexer) - out_shape = tuple(out_shape_) - if arr.flags.f_contiguous and axis == arr.ndim - 1: - # minor tweak that can make an order-of-magnitude difference - # for dataframes initialized directly from 2-d ndarrays - # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its - # f-contiguous transpose) - out = np.empty(out_shape, dtype=dtype, order="F") - else: - out = np.empty(out_shape, dtype=dtype) - - func = _get_take_nd_function( - arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info - ) - func(arr, indexer, out, fill_value) - - if flip_order: - out = out.T - return out - - -def take_2d_multi( - arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan -) -> np.ndarray: - """ - Specialized Cython take which sets NaN values in one pass. - """ - # This is only called from one place in DataFrame._reindex_multi, - # so we know indexer is well-behaved. - assert indexer is not None - assert indexer[0] is not None - assert indexer[1] is not None - - row_idx, col_idx = indexer - - row_idx = ensure_int64(row_idx) - col_idx = ensure_int64(col_idx) - indexer = row_idx, col_idx - mask_info = None - - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype: - # check if promotion is actually required based on indexer - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - mask_info = (row_mask, col_mask), (row_needs, col_needs) - - if not (row_needs or col_needs): - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - out_shape = len(row_idx), len(col_idx) - out = np.empty(out_shape, dtype=dtype) - - func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) - if func is None and arr.dtype != out.dtype: - func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) - if func is not None: - func = _convert_wrapper(func, out.dtype) - if func is None: - - def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_object( - arr, indexer, out, fill_value=fill_value, mask_info=mask_info - ) - - func(arr, indexer, out=out, fill_value=fill_value) - return out - - # ------------ # # searchsorted # # ------------ # diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 46b1e5b20ce3a..c7fa298b06a2f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -159,6 +159,10 @@ def f(x): def index(self) -> Index: 
return self.obj.index

+    @property
+    def agg_axis(self) -> Index:
+        return self.obj._get_agg_axis(self.axis)
+
     @abc.abstractmethod
     def apply(self) -> FrameOrSeriesUnion:
         pass
@@ -541,17 +545,26 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]:
         f = self.f
         if not isinstance(f, str):
             return None
+
+        obj = self.obj
+
+        # TODO: GH 39993 - Avoid special-casing by replacing with lambda
+        if f == "size" and isinstance(obj, ABCDataFrame):
+            # Special-cased because DataFrame.size returns a single scalar
+            value = obj.shape[self.axis]
+            return obj._constructor_sliced(value, index=self.agg_axis, name="size")
+
         # Support for `frame.transform('method')`
         # Some methods (shift, etc.) require the axis argument, others
         # don't, so inspect and insert if necessary.
-        func = getattr(self.obj, f, None)
+        func = getattr(obj, f, None)
         if callable(func):
             sig = inspect.getfullargspec(func)
             if "axis" in sig.args:
                 self.kwargs["axis"] = self.axis
             elif self.axis != 0:
                 raise ValueError(f"Operation {f} does not support axis=1")
-        return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs)
+        return obj._try_aggregate_string_function(f, *self.args, **self.kwargs)

     def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
         """
@@ -613,10 +626,6 @@ def values(self):
     def dtypes(self) -> Series:
         return self.obj.dtypes

-    @property
-    def agg_axis(self) -> Index:
-        return self.obj._get_agg_axis(self.axis)
-
     def apply(self) -> FrameOrSeriesUnion:
         """ compute the results """
         # dispatch to agg
diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py
new file mode 100644
index 0000000000000..447167aa8fae5
--- /dev/null
+++ b/pandas/core/array_algos/take.py
@@ -0,0 +1,442 @@
+from __future__ import annotations
+
+from typing import Optional
+
+import numpy as np
+
+from pandas._libs import (
+    algos as libalgos,
+    lib,
+)
+from pandas._typing import ArrayLike
+
+from pandas.core.dtypes.cast import maybe_promote
+from pandas.core.dtypes.common import (
+    ensure_int64,
+    ensure_platform_int,
+)
+from pandas.core.dtypes.missing import na_value_for_dtype
+
+from pandas.core.construction import ensure_wrapped_if_datetimelike
+
+
+def take_nd(
+    arr: ArrayLike,
+    indexer,
+    axis: int = 0,
+    out: Optional[np.ndarray] = None,
+    fill_value=lib.no_default,
+    allow_fill: bool = True,
+) -> ArrayLike:
+
+    """
+    Specialized Cython take which sets NaN values in one pass
+
+    This dispatches to ``take`` defined on ExtensionArrays. It does not
+    currently dispatch to ``SparseArray.take`` for sparse ``arr``.
+
+    Parameters
+    ----------
+    arr : np.ndarray or ExtensionArray
+        Input array.
+    indexer : ndarray
+        1-D array of indices to take, subarrays corresponding to -1 value
+        indices are filled with fill_value
+    axis : int, default 0
+        Axis to take from
+    out : ndarray or None, default None
+        Optional output array, must be appropriate type to hold input and
+        fill_value together, if indexer has any -1 value entries; call
+        maybe_promote to determine this type for any fill_value
+    fill_value : any, default np.nan
+        Fill value to replace -1 values with
+    allow_fill : boolean, default True
+        If False, indexer is assumed to contain no -1 values so no filling
+        will be done. This short-circuits computation of a mask. Result is
+        undefined if allow_fill == False and -1 is present in indexer.
+
+    Returns
+    -------
+    subarray : np.ndarray or ExtensionArray
+        May be the same type as the input, or cast to an ndarray. 
+ """ + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(arr.dtype, compat=False) + + if not isinstance(arr, np.ndarray): + # i.e. ExtensionArray, + # includes for EA to catch DatetimeArray, TimedeltaArray + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + arr = np.asarray(arr) + return _take_nd_ndarray(arr, indexer, axis, out, fill_value, allow_fill) + + +def _take_nd_ndarray( + arr: np.ndarray, + indexer, + axis: int, + out: Optional[np.ndarray], + fill_value, + allow_fill: bool, +) -> np.ndarray: + + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, axis, out, fill_value, allow_fill + ) + + flip_order = False + if arr.ndim == 2 and arr.flags.f_contiguous: + flip_order = True + + if flip_order: + arr = arr.T + axis = arr.ndim - axis - 1 + if out is not None: + out = out.T + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + if out is None: + out_shape_ = list(arr.shape) + out_shape_[axis] = len(indexer) + out_shape = tuple(out_shape_) + if arr.flags.f_contiguous and axis == arr.ndim - 1: + # minor tweak that can make an order-of-magnitude difference + # for dataframes initialized directly from 2-d ndarrays + # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its + # f-contiguous transpose) + out = np.empty(out_shape, dtype=dtype, order="F") + else: + out = np.empty(out_shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + if flip_order: + out = out.T + return out + + +def take_2d_multi( + arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan +) -> np.ndarray: + """ + Specialized Cython take which sets NaN values in one pass. + """ + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. 
+ assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + + func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) + if func is None and arr.dtype != out.dtype: + func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) + if func is not None: + func = _convert_wrapper(func, out.dtype) + + if func is not None: + func(arr, indexer, out=out, fill_value=fill_value) + else: + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) + + return out + + +def _get_take_nd_function( + ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None +): + + if ndim <= 2: + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + tup = (out_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + def func2(arr, indexer, out, fill_value=np.nan): + indexer = ensure_int64(indexer) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) + + return func2 + + +def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ): + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + fill_value = fill_wrap(fill_value) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +def _convert_wrapper(f, conv_dtype): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ): + if conv_dtype == object: + # GH#39755 avoid casting dt64/td64 to integers + arr = ensure_wrapped_if_datetimelike(arr) + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +_take_1d_dict = { + ("int8", "int8"): libalgos.take_1d_int8_int8, + ("int8", "int32"): libalgos.take_1d_int8_int32, + ("int8", "int64"): libalgos.take_1d_int8_int64, + ("int8", "float64"): libalgos.take_1d_int8_float64, + ("int16", "int16"): libalgos.take_1d_int16_int16, + ("int16", "int32"): libalgos.take_1d_int16_int32, + 
("int16", "int64"): libalgos.take_1d_int16_int64, + ("int16", "float64"): libalgos.take_1d_int16_float64, + ("int32", "int32"): libalgos.take_1d_int32_int32, + ("int32", "int64"): libalgos.take_1d_int32_int64, + ("int32", "float64"): libalgos.take_1d_int32_float64, + ("int64", "int64"): libalgos.take_1d_int64_int64, + ("int64", "float64"): libalgos.take_1d_int64_float64, + ("float32", "float32"): libalgos.take_1d_float32_float32, + ("float32", "float64"): libalgos.take_1d_float32_float64, + ("float64", "float64"): libalgos.take_1d_float64_float64, + ("object", "object"): libalgos.take_1d_object_object, + ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), +} + +_take_2d_axis0_dict = { + ("int8", "int8"): libalgos.take_2d_axis0_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis0_int8_int32, + ("int8", "int64"): libalgos.take_2d_axis0_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis0_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis0_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis0_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis0_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis0_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis0_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis0_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis0_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis0_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis0_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis0_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis0_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis0_float64_float64, + ("object", "object"): libalgos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis0_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_axis1_dict = { + ("int8", "int8"): libalgos.take_2d_axis1_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis1_int8_int32, + ("int8", "int64"): libalgos.take_2d_axis1_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis1_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis1_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis1_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis1_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis1_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis1_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis1_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis1_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis1_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis1_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis1_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis1_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis1_float64_float64, + ("object", "object"): libalgos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis1_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): 
_view_wrapper( + libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_multi_dict = { + ("int8", "int8"): libalgos.take_2d_multi_int8_int8, + ("int8", "int32"): libalgos.take_2d_multi_int8_int32, + ("int8", "int64"): libalgos.take_2d_multi_int8_int64, + ("int8", "float64"): libalgos.take_2d_multi_int8_float64, + ("int16", "int16"): libalgos.take_2d_multi_int16_int16, + ("int16", "int32"): libalgos.take_2d_multi_int16_int32, + ("int16", "int64"): libalgos.take_2d_multi_int16_int64, + ("int16", "float64"): libalgos.take_2d_multi_int16_float64, + ("int32", "int32"): libalgos.take_2d_multi_int32_int32, + ("int32", "int64"): libalgos.take_2d_multi_int32_int64, + ("int32", "float64"): libalgos.take_2d_multi_int32_float64, + ("int64", "int64"): libalgos.take_2d_multi_int64_int64, + ("int64", "float64"): libalgos.take_2d_multi_int64_float64, + ("float32", "float32"): libalgos.take_2d_multi_float32_float32, + ("float32", "float64"): libalgos.take_2d_multi_float32_float64, + ("float64", "float64"): libalgos.take_2d_multi_float64_float64, + ("object", "object"): libalgos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_multi_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + + +def _take_nd_object( + arr: np.ndarray, + indexer: np.ndarray, + out: np.ndarray, + axis: int, + fill_value, + mask_info, +): + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + if arr.shape[axis] > 0: + arr.take(ensure_platform_int(indexer), axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +def _take_2d_multi_object( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, mask_info +) -> None: + # this is not ideal, performance-wise, but it's better than raising + # an exception (best to optimize in Cython to avoid getting here) + row_idx, col_idx = indexer + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + if fill_value is not None: + if row_needs: + out[row_mask, :] = fill_value + if col_needs: + out[:, col_mask] = fill_value + for i in range(len(row_idx)): + u_ = row_idx[i] + for j in range(len(col_idx)): + v = col_idx[j] + out[i, j] = arr[u_, v] + + +def _take_preprocess_indexer_and_fill_value( + arr: np.ndarray, + indexer: Optional[np.ndarray], + axis: int, + out: Optional[np.ndarray], + fill_value, + allow_fill: bool, +): + mask_info = None + + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + 
needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 825757ddffee4..57df9ed9ed2d7 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -23,7 +23,6 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import is_dtype_equal -from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import array_equivalent from pandas.core import missing @@ -275,15 +274,7 @@ def fillna( value, method = validate_fillna_kwargs(value, method) mask = self.isna() - - # TODO: share this with EA base class implementation - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] + value = missing.check_value_size(value, mask, len(self)) if mask.any(): if method is not None: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index edc8fa14ca142..3b80c0b189108 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -44,7 +44,6 @@ from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import ( - is_array_like, is_dtype_equal, is_list_like, is_scalar, @@ -58,13 +57,15 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import ops +from pandas.core import ( + missing, + ops, +) from pandas.core.algorithms import ( factorize_array, isin, unique, ) -from pandas.core.missing import get_fill_func from pandas.core.sorting import ( nargminmax, nargsort, @@ -696,18 +697,11 @@ def fillna(self, value=None, method=None, limit=None): value, method = validate_fillna_kwargs(value, method) mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f"expected {len(self)}" - ) - value = value[mask] + value = missing.check_value_size(value, mask, len(self)) if mask.any(): if method is not None: - func = get_fill_func(method) + func = missing.get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1616d09020d67..e476c3566c10f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -156,7 +156,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _infer_matches: Tuple[str, ...] _is_recognized_dtype: Callable[[DtypeObj], bool] _recognized_scalars: Tuple[Type, ...] 
- _data: np.ndarray + _ndarray: np.ndarray def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False): raise AbstractMethodError(self) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d62a05253b265..b16b4b3ae856a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -316,13 +316,13 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): super().__init__(values, mask, copy=copy) def __neg__(self): - return type(self)(-self._data, self._mask) + return type(self)(-self._data, self._mask.copy()) def __pos__(self): return self def __abs__(self): - return type(self)(np.abs(self._data), self._mask) + return type(self)(np.abs(self._data), self._mask.copy()) @classmethod def _from_sequence( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bae14f4e560c2..8cf876fa32d7b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -172,7 +172,7 @@ def __len__(self) -> int: return len(self._data) def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: - return type(self)(~self._data, self._mask) + return type(self)(~self._data, self._mask.copy()) def to_numpy( self, diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e2b0ad372bf88..8441b324515f3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -33,13 +33,13 @@ is_integer_dtype, is_scalar, ) +from pandas.core import missing from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, validate_indices, ) -from pandas.core.missing import get_fill_func try: import pyarrow as pa @@ -380,18 +380,11 @@ def fillna(self, value=None, method=None, limit=None): value, method = validate_fillna_kwargs(value, method) mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. 
Got ({len(value)}) " - f"expected {len(self)}" - ) - value = value[mask] + value = missing.check_value_size(value, mask, len(self)) if mask.any(): if method is not None: - func = get_fill_func(method) + func = missing.get_fill_func(method) new_values = func(self.to_numpy(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1a4ee52e414b4..f7af1bb3da86b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -24,7 +24,10 @@ iNaT, to_offset, ) -from pandas._libs.tslibs.conversion import precision_from_unit +from pandas._libs.tslibs.conversion import ( + ensure_timedelta64ns, + precision_from_unit, +) from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( array_to_timedelta64, @@ -987,8 +990,7 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): elif is_timedelta64_dtype(data.dtype): if data.dtype != TD64NS_DTYPE: # non-nano unit - # TODO: watch out for overflows - data = data.astype(TD64NS_DTYPE) + data = ensure_timedelta64ns(data) copy = False else: @@ -1030,8 +1032,8 @@ def ints_to_td64ns(data, unit="ns"): dtype_str = f"timedelta64[{unit}]" data = data.view(dtype_str) - # TODO: watch out for overflows when converting from lower-resolution - data = data.astype("timedelta64[ns]") + data = ensure_timedelta64ns(data) + # ensure_timedelta64ns makes a copy, so we can avoid re-copying later copy_made = True diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dd75473da6d78..0c0084f2492d3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -456,11 +456,29 @@ def sanitize_array( index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, - raise_cast_failure: bool = False, + raise_cast_failure: bool = True, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. + + Parameters + ---------- + data : Any + index : Index or None, default None + dtype : np.dtype, ExtensionDtype, or None, default None + copy : bool, default False + raise_cast_failure : bool, default True + + Returns + ------- + np.ndarray or ExtensionArray + + Notes + ----- + raise_cast_failure=False is only intended for calls from the + DataFrame constructor, as the dtype keyword there may be interpreted as only + applying to a subset of columns; see GH#24435. """ if isinstance(data, ma.MaskedArray): @@ -521,6 +539,9 @@ def sanitize_array( subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: + # realize e.g. generators + # TODO: non-standard array-likes we can convert to ndarray more efficiently? + data = list(data) subarr = _try_cast(data, dtype, copy, raise_cast_failure) subarr = _sanitize_ndim(subarr, data, dtype, index) @@ -594,13 +615,18 @@ def _maybe_repeat(arr: ArrayLike, index: Optional[Index]) -> ArrayLike: return arr -def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): +def _try_cast( + arr: Union[list, np.ndarray], + dtype: Optional[DtypeObj], + copy: bool, + raise_cast_failure: bool, +) -> ArrayLike: """ Convert input to numpy ndarray and optionally cast to a given dtype. Parameters ---------- - arr : ndarray, list, tuple, iterator (catchall) + arr : ndarray or list Excludes: ExtensionArray, Series, Index.
dtype : np.dtype, ExtensionDtype or None copy : bool @@ -608,6 +634,10 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo raise_cast_failure : bool If True, and if a dtype is specified, raise errors during casting. Otherwise an object array is returned. + + Returns + ------- + np.ndarray or ExtensionArray """ # perf shortcut as this is the most common case if ( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3a6830467ab50..531d784925e9d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1422,7 +1422,7 @@ def maybe_infer_to_datetimelike( v = np.array(v, copy=False) # we only care about object dtypes - if not is_object_dtype(v): + if not is_object_dtype(v.dtype): return value shape = v.shape @@ -1499,7 +1499,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: return value -def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): +def maybe_cast_to_datetime( + value: Union[ExtensionArray, np.ndarray, list], dtype: Optional[DtypeObj] +) -> Union[ExtensionArray, np.ndarray, list]: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1563,26 +1565,28 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): try: if is_datetime64: - value = to_datetime(value, errors="raise") + dti = to_datetime(value, errors="raise") # GH 25843: Remove tz information since the dtype # didn't specify one - if value.tz is not None: - value = value.tz_localize(None) - value = value._values + if dti.tz is not None: + dti = dti.tz_localize(None) + value = dti._values elif is_datetime64tz: # The string check can be removed once issue #13712 # is solved. String data that is passed with a # datetime64tz is assumed to be naive which should # be localized to the timezone. 
is_dt_string = is_string_dtype(value.dtype) - value = to_datetime(value, errors="raise").array - if is_dt_string: + dta = to_datetime(value, errors="raise").array + if dta.tz is not None: + value = dta.astype(dtype, copy=False) + elif is_dt_string: # Strings here are naive, so directly localize - value = value.tz_localize(dtype.tz) + value = dta.tz_localize(dtype.tz) else: # Numeric values are UTC at this point, # so localize and convert - value = value.tz_localize("UTC").tz_convert(dtype.tz) + value = dta.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors="raise")._values except OutOfBoundsDatetime: @@ -1595,6 +1599,8 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): getattr(value, "dtype", None) ) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): + value = cast(np.ndarray, value) + if value.dtype != DT64NS_DTYPE: value = value.astype(DT64NS_DTYPE) ints = np.asarray(value).view("i8") @@ -1603,25 +1609,20 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") - else: - - is_array = isinstance(value, np.ndarray) - - # catch a datetime/timedelta that is not of ns variety - # and no coercion specified - if is_array and value.dtype.kind in ["M", "m"]: + elif isinstance(value, np.ndarray): + if value.dtype.kind in ["M", "m"]: + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified value = sanitize_to_nanoseconds(value) + elif value.dtype == object: + value = maybe_infer_to_datetimelike(value) + + else: # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion - elif not ( - is_array - and not ( - issubclass(value.dtype.type, np.integer) or value.dtype == np.object_ - ) - ): - value = maybe_infer_to_datetimelike(value) + value = maybe_infer_to_datetimelike(value) return value @@ -1835,7 +1836,9 @@ def construct_1d_ndarray_preserving_na( return subarr -def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): +def maybe_cast_to_integer_array( + arr: Union[list, np.ndarray], dtype: np.dtype, copy: bool = False +): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1844,9 +1847,9 @@ def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): Parameters ---------- - arr : array-like + arr : np.ndarray or list The array to cast. - dtype : str, np.dtype + dtype : np.dtype The integer dtype to cast the array to. copy: bool, default False Whether to make a copy of the array before returning. 
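The reworked datetime64tz branch of maybe_cast_to_datetime above now dispatches on whether to_datetime already returned tz-aware data, rather than unconditionally re-localizing. A minimal standalone sketch of that dispatch, assuming a tz-aware target dtype (the helper name cast_to_tz_aware is illustrative only, not part of this patch):

import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype

def cast_to_tz_aware(value, dtype):
    # dtype is assumed to be a pd.DatetimeTZDtype
    dta = pd.to_datetime(value, errors="raise").array
    if dta.tz is not None:
        # input was already tz-aware (e.g. strings carrying UTC offsets):
        # convert to the target zone instead of localizing again
        return dta.astype(dtype, copy=False)
    elif is_string_dtype(np.asarray(value).dtype):
        # naive strings are treated as wall times in the target zone
        return dta.tz_localize(dtype.tz)
    else:
        # numeric/datetime64 values are UTC at this point
        return dta.tz_localize("UTC").tz_convert(dtype.tz)

dtype = pd.DatetimeTZDtype(tz="US/Eastern")
cast_to_tz_aware(["2013-01-01 00:00:00"], dtype)        # localized as wall time
cast_to_tz_aware(["2013-01-01 00:00:00+00:00"], dtype)  # converted from UTC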
@@ -1880,7 +1883,7 @@ def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): assert is_integer_dtype(dtype) try: - if not hasattr(arr, "astype"): + if not isinstance(arr, np.ndarray): casted = np.array(arr, dtype=dtype, copy=copy) else: casted = arr.astype(dtype, copy=copy) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 131a96d10a6d0..2c95e65c70899 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -141,6 +141,7 @@ reconstruct_func, relabel_result, ) +from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -4189,9 +4190,7 @@ def _reindex_multi(self, axes, copy: bool, fill_value) -> DataFrame: if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = algorithms.take_2d_multi( - self.values, indexer, fill_value=fill_value - ) + new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) return self._constructor(new_values, index=new_index, columns=new_columns) else: return self._reindex_with_indexers( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1b7c02cd7a05b..eb4c5c07af2c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6490,6 +6490,7 @@ def fillna( return result.__finalize__(self, method="fillna") @final + @doc(klass=_shared_doc_kwargs["klass"]) def ffill( self: FrameOrSeries, axis=None, @@ -6512,6 +6513,7 @@ def ffill( pad = ffill @final + @doc(klass=_shared_doc_kwargs["klass"]) def bfill( self: FrameOrSeries, axis=None, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c1a277925de2a..4df1d036e5321 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -15,7 +15,6 @@ from functools import partial from textwrap import dedent from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -25,7 +24,6 @@ List, Mapping, Optional, - Sequence, Type, TypeVar, Union, @@ -115,10 +113,6 @@ from pandas.plotting import boxplot_frame_groupby -if TYPE_CHECKING: - from pandas.core.internals import Block - - NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. AggScalar = Union[str, Callable[..., Any]] @@ -450,13 +444,19 @@ def _wrap_transformed_output( return result def _wrap_applied_output( - self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + self, + data: Series, + keys: Index, + values: Optional[List[Any]], + not_indexed_same: bool = False, ) -> FrameOrSeriesUnion: """ Wrap the output of SeriesGroupBy.apply into the expected result. Parameters ---------- + data : Series + Input data for groupby operation. keys : Index Keys of groups that Series was grouped by. 
values : Optional[List[Any]] @@ -471,7 +471,10 @@ def _wrap_applied_output( if len(keys) == 0: # GH #6265 return self.obj._constructor( - [], name=self._selection_name, index=keys, dtype=np.float64 + [], + name=self._selection_name, + index=self.grouper.result_index, + dtype=data.dtype, ) assert values is not None @@ -1074,7 +1077,7 @@ def _cython_agg_general( agg_mgr = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) + return self._wrap_agged_manager(agg_mgr) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1174,7 +1177,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype - new_mgr = data.apply(blk_func, ignore_failures=True) + new_mgr = data.grouped_reduce(blk_func, ignore_failures=True) if not len(new_mgr): raise DataError("No numeric types to aggregate") @@ -1229,9 +1232,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if len(keys) == 0: - return self.obj._constructor(index=keys) + result = self.obj._constructor( + index=self.grouper.result_index, columns=data.columns + ) + result = result.astype(data.dtypes.to_dict(), copy=False) + return result # GH12824 first_not_none = next(com.not_none(*values), None) @@ -1748,17 +1755,17 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: Sequence[Block], items: Index) -> DataFrame: + def _wrap_agged_manager(self, mgr: BlockManager) -> DataFrame: if not self.as_index: - index = np.arange(blocks[0].values.shape[-1]) - mgr = BlockManager(blocks, axes=[items, index]) + index = np.arange(mgr.shape[1]) + mgr.axes[1] = ibase.Index(index) result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - mgr = BlockManager(blocks, axes=[items, index]) + mgr.axes[1] = index result = self.obj._constructor(mgr) if self.axis == 1: @@ -1808,13 +1815,13 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) return counted - new_mgr = data.apply(hfunc) + new_mgr = data.grouped_reduce(hfunc) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in - # _wrap_agged_blocks() returns. GH 35028 + # _wrap_agged_manager() returns. 
GH 35028 with com.temp_setattr(self, "observed", True): - result = self._wrap_agged_blocks(new_mgr.blocks, items=data.items) + result = self._wrap_agged_manager(new_mgr) return self._reindex_output(result, fill_value=0) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e939c184d501a..7bcdb348b8a1e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -981,7 +981,7 @@ def _python_apply_general( keys, values, mutated = self.grouper.apply(f, data, self.axis) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + data, keys, values, not_indexed_same=mutated or self.mutated ) def _iterate_slices(self) -> Iterable[Series]: @@ -1058,7 +1058,7 @@ def _wrap_aggregated_output( def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) @final @@ -3076,7 +3076,7 @@ def sample( if weights is not None: weights = Series(weights, index=self._selected_obj.index) - ws = [weights[idx] for idx in self.indices.values()] + ws = [weights.iloc[idx] for idx in self.indices.values()] else: ws = [None] * self.ngroups diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 132598e03d6c0..054ce8a40288b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -12,7 +12,7 @@ TimeDeltaBlock, make_block, ) -from pandas.core.internals.concat import concatenate_block_managers +from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, @@ -35,7 +35,7 @@ "ArrayManager", "BlockManager", "SingleBlockManager", - "concatenate_block_managers", + "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", "create_block_manager_from_blocks", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e09a434170780..cd8d3e547abbd 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -79,7 +79,7 @@ class ArrayManager(DataManager): ---------- arrays : Sequence of arrays axes : Sequence of Index - do_integrity_check : bool, default True + verify_integrity : bool, default True """ @@ -95,14 +95,14 @@ def __init__( self, arrays: List[Union[np.ndarray, ExtensionArray]], axes: List[Index], - do_integrity_check: bool = True, + verify_integrity: bool = True, ): # Note: we are storing the axes in "_axes" in the (row, columns) order # which contrasts the order how it is stored in BlockManager self._axes = axes self.arrays = arrays - if do_integrity_check: + if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] self._verify_integrity() @@ -480,31 +480,34 @@ def is_view(self) -> bool: def is_single_block(self) -> bool: return False + def _get_data_subset(self, predicate: Callable) -> ArrayManager: + indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] + arrays = [self.arrays[i] for i in indices] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][np.array(indices, dtype="int64")]] + return type(self)(arrays, new_axes, verify_integrity=False) + def get_bool_data(self, copy: bool = False) -> ArrayManager: """ + Select columns that are bool-dtype. 
+ Parameters ---------- copy : bool, default False Whether to copy the blocks """ - mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) + return self._get_data_subset(lambda arr: is_bool_dtype(arr.dtype)) def get_numeric_data(self, copy: bool = False) -> ArrayManager: """ + Select columns that have a numeric dtype. + Parameters ---------- copy : bool, default False Whether to copy the blocks """ - mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] - # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] - return type(self)(arrays, new_axes) + return self._get_data_subset(lambda arr: is_numeric_dtype(arr.dtype)) def copy(self: T, deep=True) -> T: """ @@ -607,7 +610,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: new_axes = list(self._axes) new_axes[axis] = new_axes[axis][slobj] - return type(self)(arrays, new_axes, do_integrity_check=False) + return type(self)(arrays, new_axes, verify_integrity=False) def fast_xs(self, loc: int) -> ArrayLike: """ @@ -831,7 +834,7 @@ def _reindex_indexer( new_axes = list(self._axes) new_axes[axis] = new_axis - return type(self)(new_arrays, new_axes) + return type(self)(new_arrays, new_axes, verify_integrity=False) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -909,7 +912,7 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: new_columns = unstacker.get_new_columns(self._axes[1]) new_axes = [new_index, new_columns] - return type(self)(new_arrays, new_axes, do_integrity_check=False) + return type(self)(new_arrays, new_axes, verify_integrity=False) # TODO # equals diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1fc0329b2a78e..8d5cadce823c7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1186,7 +1186,6 @@ def coerce_to_target_dtype(self, other) -> Block: return self.astype(new_dtype, copy=False) - @final def interpolate( self, method: str = "pad", @@ -1293,11 +1292,10 @@ def _interpolate( # only deal with floats if self.dtype.kind != "f": - if self.dtype.kind not in ["i", "u"]: - return [self] - data = data.astype(np.float64) + # because we already checked can_hold_na, we don't have an integer dtype here + return [self] - if fill_value is None: + if is_valid_na_for_dtype(fill_value, self.dtype): fill_value = self.fill_value if method in ("krogh", "piecewise_polynomial", "pchip"): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 16440f7a4c2bf..a71fdff043212 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -49,7 +49,46 @@ from pandas import Index -def concatenate_block_managers( +def concatenate_array_managers( + mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool +) -> Manager: + """ + Concatenate array managers into one.
+ + Parameters + ---------- + mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + ArrayManager + """ + # reindex all arrays + mgrs = [] + for mgr, indexers in mgrs_indexers: + for ax, indexer in indexers.items(): + mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True) + mgrs.append(mgr) + + if concat_axis == 1: + # concatting along the rows -> concat the reindexed arrays + # TODO(ArrayManager) doesn't yet preserve the correct dtype + arrays = [ + concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) + else: + # concatting along the columns -> combine reindexed arrays in a single manager + assert concat_axis == 0 + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) + + +def concatenate_managers( mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool ) -> Manager: """ @@ -66,20 +105,9 @@ def concatenate_block_managers( ------- BlockManager """ + # TODO(ArrayManager) this assumes that all managers are of the same type if isinstance(mgrs_indexers[0][0], ArrayManager): - - if concat_axis == 1: - # TODO for now only fastpath without indexers - mgrs = [t[0] for t in mgrs_indexers] - arrays = [ - concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0) - for j in range(len(mgrs[0].arrays)) - ] - return ArrayManager(arrays, [axes[1], axes[0]]) - elif concat_axis == 0: - mgrs = [t[0] for t in mgrs_indexers] - arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - return ArrayManager(arrays, [axes[1], axes[0]]) + return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index eb1a7a355f313..9903dab9976c4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -403,7 +403,7 @@ def convert(v): return values -def _homogenize(data, index, dtype: Optional[DtypeObj]): +def _homogenize(data, index: Index, dtype: Optional[DtypeObj]): oindex = None homogenized = [] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9c536abbc7559..81fccc335b7c8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -128,7 +128,7 @@ class BlockManager(DataManager): ---------- blocks: Sequence of Block axes: Sequence of Index - do_integrity_check: bool, default True + verify_integrity: bool, default True Notes ----- @@ -151,7 +151,7 @@ def __init__( self, blocks: Sequence[Block], axes: Sequence[Index], - do_integrity_check: bool = True, + verify_integrity: bool = True, ): self.axes = [ensure_index(ax) for ax in axes] self.blocks: Tuple[Block, ...] = tuple(blocks) @@ -163,7 +163,7 @@ def __init__( f"number of axes ({self.ndim})" ) - if do_integrity_check: + if verify_integrity: self._verify_integrity() # Populate known_consolidate, blknos, and blklocs lazily @@ -176,7 +176,7 @@ def from_blocks(cls, blocks: List[Block], axes: List[Index]): """ Constructor for BlockManager and SingleBlockManager with same signature. 
""" - return cls(blocks, axes, do_integrity_check=False) + return cls(blocks, axes, verify_integrity=False) @property def blknos(self): @@ -282,6 +282,18 @@ def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return algos.take_nd(dtypes, self.blknos, allow_fill=False) + @property + def arrays(self): + """ + Quick access to the backing arrays of the Blocks. + + Only for compatibility with ArrayManager for testing convenience. + Not to be used in actual code, and return value is not the same as the + ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + """ + for blk in self.blocks: + yield blk.values + def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] @@ -391,6 +403,41 @@ def reduce( new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) return new_mgr, indexer + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop blocks where func raises TypeError. + + Returns + ------- + BlockManager + """ + result_blocks: List[Block] = [] + + for blk in self.blocks: + try: + applied = blk.apply(func) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_blocks = extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_blocks[0].values.shape[-1])) + + if ignore_failures: + return self._combine(result_blocks, index=index) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: """ Apply array_op blockwise with another (aligned) BlockManager. 
@@ -748,7 +795,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis][slobj] - bm = type(self)(new_blocks, new_axes, do_integrity_check=False) + bm = type(self)(new_blocks, new_axes, verify_integrity=False) return bm @property @@ -1487,7 +1534,7 @@ def __init__( self, block: Block, axis: Index, - do_integrity_check: bool = False, + verify_integrity: bool = False, fastpath=lib.no_default, ): assert isinstance(block, Block), type(block) @@ -1511,7 +1558,7 @@ def from_blocks(cls, blocks: List[Block], axes: List[Index]) -> SingleBlockManag """ assert len(blocks) == 1 assert len(axes) == 1 - return cls(blocks[0], axes[0], do_integrity_check=False) + return cls(blocks[0], axes[0], verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 70d4f3b91c245..602c4bfd740ca 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -80,7 +80,7 @@ def operate_blockwise( # assert len(slocs) == nlocs, (len(slocs), nlocs) # assert slocs == set(range(nlocs)), slocs - new_mgr = type(right)(res_blks, axes=right.axes, do_integrity_check=False) + new_mgr = type(right)(res_blks, axes=right.axes, verify_integrity=False) return new_mgr diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9ae5f7d1b7497..0b77a6d821c6d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -29,6 +29,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( ensure_float64, + is_array_like, is_integer_dtype, is_numeric_v_string_like, needs_i8_conversion, @@ -39,6 +40,21 @@ from pandas import Index +def check_value_size(value, mask: np.ndarray, length: int): + """ + Validate the size of the values passed to ExtensionArray.fillna. + """ + if is_array_like(value): + if len(value) != length: + raise ValueError( + f"Length of 'value' does not match. 
Got ({len(value)}) " + f"expected {length}" + ) + value = value[mask] + + return value + + def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray: """ Return a masking array of same size/shape as arr diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 92fc4a2e85163..a8c6913cd5d6c 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -43,7 +43,7 @@ get_unanimous_names, ) import pandas.core.indexes.base as ibase -from pandas.core.internals import concatenate_block_managers +from pandas.core.internals import concatenate_managers if TYPE_CHECKING: from pandas import ( @@ -524,7 +524,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) - new_data = concatenate_block_managers( + new_data = concatenate_managers( mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 79d018427aa33..9291dcf552786 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -76,7 +76,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc -from pandas.core.internals import concatenate_block_managers +from pandas.core.internals import concatenate_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -720,7 +720,7 @@ def get_result(self): lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} - result_data = concatenate_block_managers( + result_data = concatenate_managers( [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, @@ -874,12 +874,16 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if take_left is None: lvals = result[name]._values else: + # TODO: can we pin down take_left's type earlier? + take_left = extract_array(take_left, extract_numpy=True) lfill = na_value_for_dtype(take_left.dtype) lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill) if take_right is None: rvals = result[name]._values else: + # TODO: can we pin down take_right's type earlier?
+ take_right = extract_array(take_right, extract_numpy=True) rfill = na_value_for_dtype(take_right.dtype) rvals = algos.take_nd(take_right, right_indexer, fill_value=rfill) @@ -1616,7 +1620,7 @@ def get_result(self): lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} - result_data = concatenate_block_managers( + result_data = concatenate_managers( [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8feb379a82ada..d0026d7acbe65 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -236,14 +236,8 @@ def __internal_pivot_table( ) # discard the top level - if ( - values_passed - and not values_multi - and not table.empty - and (table.columns.nlevels > 1) - ): - table = table[values[0]] - + if values_passed and not values_multi and table.columns.nlevels > 1: + table = table.droplevel(0, axis=1) if len(index) == 0 and len(columns) > 0: table = table.T @@ -650,7 +644,6 @@ def crosstab( **dict(zip(unique_colnames, columns)), } df = DataFrame(data, index=common_idx) - original_df_cols = df.columns if values is None: df["__dummy__"] = 0 @@ -660,7 +653,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - ["__dummy__"], + "__dummy__", index=unique_rownames, columns=unique_colnames, margins=margins, @@ -669,12 +662,6 @@ def crosstab( **kwargs, ) - # GH18321, after pivoting, an extra top level of column index of `__dummy__` is - # created, and this extra level should not be included in the further steps - if not table.empty: - cols_diff = df.columns.difference(original_df_cols)[0] - table = table[cols_diff] - # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/core/series.py b/pandas/core/series.py index 34e9464006b30..50a537aeb8623 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -395,7 +395,7 @@ def __init__( elif copy: data = data.copy() else: - data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True) + data = sanitize_array(data, index, dtype, copy) data = SingleBlockManager.from_array(data, index) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3b0d857217d43..854f41d6b4dc3 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -499,7 +499,7 @@ def _translate(self): head.append(index_header_row) body = [] - for r, idx in enumerate(self.data.index): + for r, row_tup in enumerate(self.data.itertuples()): row_es = [] for c, value in enumerate(rlabels[r]): rid = [ @@ -520,10 +520,9 @@ def _translate(self): es["attributes"] = f'rowspan="{rowspan}"' row_es.append(es) - for c, col in enumerate(self.data.columns): + for c, value in enumerate(row_tup[1:]): cs = [DATA_CLASS, f"row{r}", f"col{c}"] formatter = self._display_funcs[(r, c)] - value = self.data.iloc[r, c] row_dict = { "type": "td", "value": value, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 635a493d03d61..f050a6a086584 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -130,7 +130,7 @@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion with get_handle( - path_or_buf, "wt", compression=compression, storage_options=storage_options + path_or_buf, "w", compression=compression, storage_options=storage_options ) as handles: handles.handle.write(s) else: diff --git 
a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3ac9d98874f86..3532040a2fd7b 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -35,1525 +35,1572 @@ def int_frame_const_col(): return df -class TestDataFrameApply: - def test_apply(self, float_frame): - with np.errstate(all="ignore"): - # ufunc - applied = float_frame.apply(np.sqrt) - tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) - - # aggregator - applied = float_frame.apply(np.mean) - assert applied["A"] == np.mean(float_frame["A"]) - - d = float_frame.index[0] - applied = float_frame.apply(np.mean, axis=1) - assert applied[d] == np.mean(float_frame.xs(d)) - assert applied.index is float_frame.index # want this - - # invalid axis - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - msg = "No axis named 2 for object type DataFrame" - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: x, 2) - - # GH 9573 - df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) - df = df.apply(lambda ts: ts.astype("category")) - - assert df.shape == (4, 2) - assert isinstance(df["c0"].dtype, CategoricalDtype) - assert isinstance(df["c1"].dtype, CategoricalDtype) - - def test_apply_axis1_with_ea(self): - # GH#36785 - df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) - - def test_apply_mixed_datetimelike(self): - # mixed datetimelike - # GH 7778 - df = DataFrame( - { - "A": date_range("20130101", periods=3), - "B": pd.to_timedelta(np.arange(3), unit="s"), - } - ) - result = df.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result, df) - - def test_apply_empty(self, float_frame): - # empty - empty_frame = DataFrame() - - applied = empty_frame.apply(np.sqrt) - assert applied.empty - - applied = empty_frame.apply(np.mean) - assert applied.empty - - no_rows = float_frame[:0] - result = no_rows.apply(lambda x: x.mean()) - expected = Series(np.nan, index=float_frame.columns) - tm.assert_series_equal(result, expected) - - no_cols = float_frame.loc[:, []] - result = no_cols.apply(lambda x: x.mean(), axis=1) - expected = Series(np.nan, index=float_frame.index) - tm.assert_series_equal(result, expected) - - # GH 2476 - expected = DataFrame(index=["a"]) - result = expected.apply(lambda x: x["a"], axis=1) - tm.assert_frame_equal(expected, result) - - def test_apply_with_reduce_empty(self): - # reduce with an empty DataFrame - empty_frame = DataFrame() - - x = [] - result = empty_frame.apply(x.append, axis=1, result_type="expand") - tm.assert_frame_equal(result, empty_frame) - result = empty_frame.apply(x.append, axis=1, result_type="reduce") - expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) - tm.assert_series_equal(result, expected) - - empty_with_cols = DataFrame(columns=["a", "b", "c"]) - result = empty_with_cols.apply(x.append, axis=1, result_type="expand") - tm.assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") - expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) - tm.assert_series_equal(result, expected) - - # Ensure that x.append hasn't been called - assert x == [] - - @pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) - def test_apply_funcs_over_empty(self, func): - # GH 28213 - df = DataFrame(columns=["a", "b", "c"]) - - result = df.apply(getattr(np, func)) - expected = getattr(df, func)() - 
tm.assert_series_equal(result, expected) - - def test_nunique_empty(self): - # GH 28213 - df = DataFrame(columns=["a", "b", "c"]) - - result = df.nunique() - expected = Series(0, index=df.columns) - tm.assert_series_equal(result, expected) - - result = df.T.nunique() - expected = Series([], index=pd.Index([]), dtype=np.float64) - tm.assert_series_equal(result, expected) - - def test_apply_standard_nonunique(self): - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - - result = df.apply(lambda s: s[0], axis=1) - expected = Series([1, 4, 7], ["a", "a", "c"]) - tm.assert_series_equal(result, expected) - - result = df.T.apply(lambda s: s[0], axis=0) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) - @pytest.mark.parametrize( - "args,kwds", - [ - pytest.param([], {}, id="no_args_or_kwds"), - pytest.param([1], {}, id="axis_from_args"), - pytest.param([], {"axis": 1}, id="axis_from_kwds"), - pytest.param([], {"numeric_only": True}, id="optional_kwds"), - pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), - ], +def test_apply(float_frame): + with np.errstate(all="ignore"): + # ufunc + applied = float_frame.apply(np.sqrt) + tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) + + # aggregator + applied = float_frame.apply(np.mean) + assert applied["A"] == np.mean(float_frame["A"]) + + d = float_frame.index[0] + applied = float_frame.apply(np.mean, axis=1) + assert applied[d] == np.mean(float_frame.xs(d)) + assert applied.index is float_frame.index # want this + + # invalid axis + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: x, 2) + + # GH 9573 + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + df = df.apply(lambda ts: ts.astype("category")) + + assert df.shape == (4, 2) + assert isinstance(df["c0"].dtype, CategoricalDtype) + assert isinstance(df["c1"].dtype, CategoricalDtype) + + +def test_apply_axis1_with_ea(): + # GH#36785 + df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + + +def test_apply_mixed_datetimelike(): + # mixed datetimelike + # GH 7778 + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": pd.to_timedelta(np.arange(3), unit="s"), + } ) - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_apply_with_string_funcs(self, request, float_frame, func, args, kwds, how): - if len(args) > 1 and how == "agg": - request.node.add_marker( - pytest.mark.xfail( - reason="agg/apply signature mismatch - agg passes 2nd " - "argument to func" - ) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + + +def test_apply_empty(float_frame): + # empty + empty_frame = DataFrame() + + applied = empty_frame.apply(np.sqrt) + assert applied.empty + + applied = empty_frame.apply(np.mean) + assert applied.empty + + no_rows = float_frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + no_cols = float_frame.loc[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=float_frame.index) + tm.assert_series_equal(result, expected) + + # GH 2476 + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1) + 
tm.assert_frame_equal(expected, result) + + +def test_apply_with_reduce_empty(): + # reduce with an empty DataFrame + empty_frame = DataFrame() + + x = [] + result = empty_frame.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_frame) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_with_cols) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + # Ensure that x.append hasn't been called + assert x == [] + + +@pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) +def test_apply_funcs_over_empty(func): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.apply(getattr(np, func)) + expected = getattr(df, func)() + tm.assert_series_equal(result, expected) + + +def test_nunique_empty(): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.nunique() + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.T.nunique() + expected = Series([], index=pd.Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + +def test_apply_standard_nonunique(): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + + result = df.apply(lambda s: s[0], axis=1) + expected = Series([1, 4, 7], ["a", "a", "c"]) + tm.assert_series_equal(result, expected) + + result = df.T.apply(lambda s: s[0], axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) +@pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], +) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): + if len(args) > 1 and how == "agg": + request.node.add_marker( + pytest.mark.xfail( + reason="agg/apply signature mismatch - agg passes 2nd " + "argument to func" ) - result = getattr(float_frame, how)(func, *args, **kwds) - expected = getattr(float_frame, func)(*args, **kwds) - tm.assert_series_equal(result, expected) + ) + result = getattr(float_frame, how)(func, *args, **kwds) + expected = getattr(float_frame, func)(*args, **kwds) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] +) +def test_apply_str_axis_1_raises(how, args): + # GH 39211 - some ops don't support axis=1 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + msg = f"Operation {how} does not support axis=1" + with pytest.raises(ValueError, match=msg): + df.apply(how, axis=1, args=args) + + +def test_apply_broadcast(float_frame, int_frame_const_col): + + # scalars + result = float_frame.apply(np.mean, result_type="broadcast") + expected = DataFrame([float_frame.mean()], index=float_frame.index) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(np.mean, axis=1, 
result_type="broadcast") + m = float_frame.mean(axis=1) + expected = DataFrame({c: m for c in float_frame.columns}) + tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)] + # lists + result = float_frame.apply( + lambda x: list(range(len(float_frame.columns))), + axis=1, + result_type="broadcast", ) - def test_apply_str_axis_1_raises(self, how, args): - # GH 39211 - some ops don't support axis=1 - df = DataFrame({"a": [1, 2], "b": [3, 4]}) - msg = f"Operation {how} does not support axis=1" - with pytest.raises(ValueError, match=msg): - df.apply(how, axis=1, args=args) - - def test_apply_broadcast(self, float_frame, int_frame_const_col): - - # scalars - result = float_frame.apply(np.mean, result_type="broadcast") - expected = DataFrame([float_frame.mean()], index=float_frame.index) - tm.assert_frame_equal(result, expected) + m = list(range(len(float_frame.columns))) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.mean, axis=1, result_type="broadcast") - m = float_frame.mean(axis=1) - expected = DataFrame({c: m for c in float_frame.columns}) - tm.assert_frame_equal(result, expected) + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) + m = list(range(len(float_frame.index))) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) + tm.assert_frame_equal(result, expected) - # lists - result = float_frame.apply( - lambda x: list(range(len(float_frame.columns))), - axis=1, - result_type="broadcast", - ) - m = list(range(len(float_frame.columns))) - expected = DataFrame( - [m] * len(float_frame.index), - dtype="float64", - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(result, expected) + # preserve columns + df = int_frame_const_col + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + tm.assert_frame_equal(result, df) + + df = int_frame_const_col + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) - result = float_frame.apply( - lambda x: list(range(len(float_frame.index))), result_type="broadcast" - ) - m = list(range(len(float_frame.index))) - expected = DataFrame( - {c: m for c in float_frame.columns}, - dtype="float64", - index=float_frame.index, - ) - tm.assert_frame_equal(result, expected) - # preserve columns - df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") - tm.assert_frame_equal(result, df) +def test_apply_broadcast_error(int_frame_const_col): + df = int_frame_const_col - df = int_frame_const_col - result = df.apply( - lambda x: Series([1, 2, 3], index=list("abc")), + # > 1 ndim + msg = "too many dims to broadcast" + with pytest.raises(ValueError, match=msg): + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), axis=1, result_type="broadcast", ) - expected = df.copy() - tm.assert_frame_equal(result, expected) - def test_apply_broadcast_error(self, int_frame_const_col): - df = int_frame_const_col + # cannot broadcast + msg = "cannot broadcast result" + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - # > 1 ndim - msg = "too many 
dims to broadcast" - with pytest.raises(ValueError, match=msg): - df.apply( - lambda x: np.array([1, 2]).reshape(-1, 2), - axis=1, - result_type="broadcast", - ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - # cannot broadcast - msg = "cannot broadcast result" - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - with pytest.raises(ValueError, match=msg): - df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") +def test_apply_raw(float_frame, mixed_type_frame): + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 - def test_apply_raw(self, float_frame, mixed_type_frame): - def _assert_raw(x): - assert isinstance(x, np.ndarray) - assert x.ndim == 1 + float_frame.apply(_assert_raw, raw=True) + float_frame.apply(_assert_raw, axis=1, raw=True) - float_frame.apply(_assert_raw, raw=True) - float_frame.apply(_assert_raw, axis=1, raw=True) + result0 = float_frame.apply(np.mean, raw=True) + result1 = float_frame.apply(np.mean, axis=1, raw=True) - result0 = float_frame.apply(np.mean, raw=True) - result1 = float_frame.apply(np.mean, axis=1, raw=True) + expected0 = float_frame.apply(lambda x: x.values.mean()) + expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) - expected0 = float_frame.apply(lambda x: x.values.mean()) - expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) + tm.assert_series_equal(result0, expected0) + tm.assert_series_equal(result1, expected1) - tm.assert_series_equal(result0, expected0) - tm.assert_series_equal(result1, expected1) + # no reduction + result = float_frame.apply(lambda x: x * 2, raw=True) + expected = float_frame * 2 + tm.assert_frame_equal(result, expected) - # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) - expected = float_frame * 2 - tm.assert_frame_equal(result, expected) + # Mixed dtype (GH-32423) + mixed_type_frame.apply(_assert_raw, raw=True) + mixed_type_frame.apply(_assert_raw, axis=1, raw=True) - # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, raw=True) - mixed_type_frame.apply(_assert_raw, axis=1, raw=True) - def test_apply_axis1(self, float_frame): - d = float_frame.index[0] - tapplied = float_frame.apply(np.mean, axis=1) - assert tapplied[d] == np.mean(float_frame.xs(d)) - - def test_apply_mixed_dtype_corner(self): - df = DataFrame({"A": ["foo"], "B": [1.0]}) - result = df[:0].apply(np.mean, axis=1) - # the result here is actually kind of ambiguous, should it be a Series - # or a DataFrame? 
- expected = Series(np.nan, index=pd.Index([], dtype="int64")) - tm.assert_series_equal(result, expected) - - df = DataFrame({"A": ["foo"], "B": [1.0]}) - result = df.apply(lambda x: x["A"], axis=1) - expected = Series(["foo"], index=[0]) - tm.assert_series_equal(result, expected) - - result = df.apply(lambda x: x["B"], axis=1) - expected = Series([1.0], index=[0]) - tm.assert_series_equal(result, expected) - - def test_apply_empty_infer_type(self): - no_cols = DataFrame(index=["a", "b", "c"]) - no_index = DataFrame(columns=["a", "b", "c"]) - - def _check(df, f): - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - test_res = f(np.array([], dtype="f8")) - is_reduction = not isinstance(test_res, np.ndarray) - - def _checkit(axis=0, raw=False): - result = df.apply(f, axis=axis, raw=raw) - if is_reduction: - agg_axis = df._get_agg_axis(axis) - assert isinstance(result, Series) - assert result.index is agg_axis - else: - assert isinstance(result, DataFrame) - - _checkit() - _checkit(axis=1) - _checkit(raw=True) - _checkit(axis=0, raw=True) +def test_apply_axis1(float_frame): + d = float_frame.index[0] + tapplied = float_frame.apply(np.mean, axis=1) + assert tapplied[d] == np.mean(float_frame.xs(d)) - with np.errstate(all="ignore"): - _check(no_cols, lambda x: x) - _check(no_cols, lambda x: x.mean()) - _check(no_index, lambda x: x) - _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") - assert isinstance(result, DataFrame) +def test_apply_mixed_dtype_corner(): + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? + expected = Series(np.nan, index=pd.Index([], dtype="int64")) + tm.assert_series_equal(result, expected) - def test_apply_with_args_kwds(self, float_frame): - def add_some(x, howmuch=0): - return x + howmuch + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) + tm.assert_series_equal(result, expected) - def agg_and_add(x, howmuch=0): - return x.mean() + howmuch + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) + tm.assert_series_equal(result, expected) - def subtract_and_divide(x, sub, divide=1): - return (x - sub) / divide - result = float_frame.apply(add_some, howmuch=2) - expected = float_frame.apply(lambda x: x + 2) - tm.assert_frame_equal(result, expected) +def test_apply_empty_infer_type(): + no_cols = DataFrame(index=["a", "b", "c"]) + no_index = DataFrame(columns=["a", "b", "c"]) - result = float_frame.apply(agg_and_add, howmuch=2) - expected = float_frame.apply(lambda x: x.mean() + 2) - tm.assert_series_equal(result, expected) + def _check(df, f): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + test_res = f(np.array([], dtype="f8")) + is_reduction = not isinstance(test_res, np.ndarray) - result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) - expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) - tm.assert_frame_equal(result, expected) + def _checkit(axis=0, raw=False): + result = df.apply(f, axis=axis, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + assert isinstance(result, Series) + assert result.index is agg_axis + else: + assert isinstance(result, DataFrame) - def test_apply_yield_list(self, float_frame): - result = float_frame.apply(list) - 
tm.assert_frame_equal(result, float_frame) + _checkit() + _checkit(axis=1) + _checkit(raw=True) + _checkit(axis=0, raw=True) - def test_apply_reduce_Series(self, float_frame): - float_frame["A"].iloc[::2] = np.nan - expected = float_frame.mean(1) - result = float_frame.apply(np.mean, axis=1) - tm.assert_series_equal(result, expected) + with np.errstate(all="ignore"): + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) - def test_apply_reduce_to_dict(self): - # GH 25196 37544 - data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") + assert isinstance(result, DataFrame) - result0 = data.apply(dict, axis=0) - expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) - tm.assert_series_equal(result0, expected0) - result1 = data.apply(dict, axis=1) - expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) - tm.assert_series_equal(result1, expected1) +def test_apply_with_args_kwds(float_frame): + def add_some(x, howmuch=0): + return x + howmuch - def test_apply_differently_indexed(self): - df = DataFrame(np.random.randn(20, 10)) + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch - result0 = df.apply(Series.describe, axis=0) - expected0 = DataFrame( - {i: v.describe() for i, v in df.items()}, columns=df.columns - ) - tm.assert_frame_equal(result0, expected0) - - result1 = df.apply(Series.describe, axis=1) - expected1 = DataFrame( - {i: v.describe() for i, v in df.T.items()}, columns=df.index - ).T - tm.assert_frame_equal(result1, expected1) - - def test_apply_modify_traceback(self): - data = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide - data.loc[4, "C"] = np.nan + result = float_frame.apply(add_some, howmuch=2) + expected = float_frame.apply(lambda x: x + 2) + tm.assert_frame_equal(result, expected) - def transform(row): - if row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row + result = float_frame.apply(agg_and_add, howmuch=2) + expected = float_frame.apply(lambda x: x.mean() + 2) + tm.assert_series_equal(result, expected) - def transform2(row): - if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row + result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) + expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) + tm.assert_frame_equal(result, expected) - msg = "'float' object has no attribute 'startswith'" - with pytest.raises(AttributeError, match=msg): - data.apply(transform, axis=1) - def test_apply_bug(self): +def test_apply_yield_list(float_frame): + result = float_frame.apply(list) + tm.assert_frame_equal(result, float_frame) - # GH 6125 - positions = DataFrame( - [ - [1, "ABC0", 50], - [1, "YUM0", 20], - [1, "DEF0", 20], - [2, "ABC1", 50], - [2, "YUM1", 20], - [2, "DEF1", 20], + +def test_apply_reduce_Series(float_frame): + float_frame["A"].iloc[::2] = np.nan + expected = 
float_frame.mean(1) + result = float_frame.apply(np.mean, axis=1) + tm.assert_series_equal(result, expected) + + +def test_apply_reduce_to_dict(): + # GH 25196 37544 + data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"]) + + result0 = data.apply(dict, axis=0) + expected0 = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns) + tm.assert_series_equal(result0, expected0) + + result1 = data.apply(dict, axis=1) + expected1 = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index) + tm.assert_series_equal(result1, expected1) + + +def test_apply_differently_indexed(): + df = DataFrame(np.random.randn(20, 10)) + + result0 = df.apply(Series.describe, axis=0) + expected0 = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns) + tm.assert_frame_equal(result0, expected0) + + result1 = df.apply(Series.describe, axis=1) + expected1 = DataFrame( + {i: v.describe() for i, v in df.T.items()}, columns=df.index + ).T + tm.assert_frame_equal(result1, expected1) + + +def test_apply_modify_traceback(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", ], - columns=["a", "market", "position"], - ) + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) - def f(r): - return r["market"] + data.loc[4, "C"] = np.nan - expected = positions.apply(f, axis=1) + def transform(row): + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row - positions = DataFrame( - [ - [datetime(2013, 1, 1), "ABC0", 50], - [datetime(2013, 1, 2), "YUM0", 20], - [datetime(2013, 1, 3), "DEF0", 20], - [datetime(2013, 1, 4), "ABC1", 50], - [datetime(2013, 1, 5), "YUM1", 20], - [datetime(2013, 1, 6), "DEF1", 20], + def transform2(row): + if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): + data.apply(transform, axis=1) + + +def test_apply_bug(): + + # GH 6125 + positions = DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + + def f(r): + return r["market"] + + expected = positions.apply(f, axis=1) + + positions = DataFrame( + [ + [datetime(2013, 1, 1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + result = positions.apply(f, axis=1) + tm.assert_series_equal(result, expected) + + +def test_apply_convert_objects(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", ], - columns=["a", "market", "position"], - ) - result = positions.apply(f, axis=1) - tm.assert_series_equal(result, expected) - - def test_apply_convert_objects(self): - data = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", 
- "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) - result = data.apply(lambda x: x, axis=1) - tm.assert_frame_equal(result._convert(datetime=True), data) + result = data.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result._convert(datetime=True), data) - def test_apply_attach_name(self, float_frame): - result = float_frame.apply(lambda x: x.name) - expected = Series(float_frame.columns, index=float_frame.columns) - tm.assert_series_equal(result, expected) - result = float_frame.apply(lambda x: x.name, axis=1) - expected = Series(float_frame.index, index=float_frame.index) - tm.assert_series_equal(result, expected) +def test_apply_attach_name(float_frame): + result = float_frame.apply(lambda x: x.name) + expected = Series(float_frame.columns, index=float_frame.columns) + tm.assert_series_equal(result, expected) - # non-reductions - result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) - expected = DataFrame( - np.tile(float_frame.columns, (len(float_frame.index), 1)), - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(result, expected) + result = float_frame.apply(lambda x: x.name, axis=1) + expected = Series(float_frame.index, index=float_frame.index) + tm.assert_series_equal(result, expected) - result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = Series( - np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() - ) - expected.index = float_frame.index - tm.assert_series_equal(result, expected) - - def test_apply_multi_index(self, float_frame): - index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) - s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) - result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) - expected = DataFrame( - [[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"] - ) - tm.assert_frame_equal(result, expected, check_like=True) - - def test_apply_dict(self): - - # GH 8735 - A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) - A_dicts = Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]) - B = DataFrame([[0, 1], [2, 3]]) - B_dicts = Series([{0: 0, 1: 2}, {0: 1, 1: 3}]) - fn = lambda x: x.to_dict() - - for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, result_type="reduce") - reduce_false = df.apply(fn, result_type="expand") - reduce_none = df.apply(fn) - - tm.assert_series_equal(reduce_true, dicts) - tm.assert_frame_equal(reduce_false, df) - tm.assert_series_equal(reduce_none, dicts) - - def test_applymap(self, float_frame): - applied = float_frame.applymap(lambda x: x * 2) - tm.assert_frame_equal(applied, float_frame * 2) - float_frame.applymap(type) - - # GH 465: function returning tuples - result = float_frame.applymap(lambda x: (x, x)) - assert isinstance(result["A"][0], tuple) - - # GH 2909: object conversion to float in constructor? 
- df = DataFrame(data=[1, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object - - df = DataFrame(data=[1.0, "a"]) - result = df.applymap(lambda x: x) - assert result.dtypes[0] == object - - # GH 2786 - df = DataFrame(np.random.random((3, 4))) - df2 = df.copy() - cols = ["a", "a", "a", "a"] - df.columns = cols - - expected = df2.applymap(str) - expected.columns = cols - result = df.applymap(str) - tm.assert_frame_equal(result, expected) + # non-reductions + result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) - # datetime/timedelta - df["datetime"] = Timestamp("20130101") - df["timedelta"] = pd.Timedelta("1 min") - result = df.applymap(str) - for f in ["datetime", "timedelta"]: - assert result.loc[0, f] == str(df.loc[0, f]) + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) + expected.index = float_frame.index + tm.assert_series_equal(result, expected) - # GH 8222 - empty_frames = [ - DataFrame(), - DataFrame(columns=list("ABC")), - DataFrame(index=list("ABC")), - DataFrame({"A": [], "B": [], "C": []}), - ] - for frame in empty_frames: - for func in [round, lambda x: x]: - result = frame.applymap(func) - tm.assert_frame_equal(result, frame) - - def test_applymap_na_ignore(self, float_frame): - # GH 23803 - strlen_frame = float_frame.applymap(lambda x: len(str(x))) - float_frame_with_na = float_frame.copy() - mask = np.random.randint(0, 2, size=float_frame.shape, dtype=bool) - float_frame_with_na[mask] = pd.NA - strlen_frame_na_ignore = float_frame_with_na.applymap( - lambda x: len(str(x)), na_action="ignore" - ) - strlen_frame_with_na = strlen_frame.copy() - strlen_frame_with_na[mask] = pd.NA - tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) - - with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): - float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc") - - def test_applymap_box_timestamps(self): - # GH 2689, GH 2627 - ser = Series(date_range("1/1/2000", periods=10)) - - def func(x): - return (x.hour, x.day, x.month) - - # it works! - DataFrame(ser).applymap(func) - - def test_applymap_box(self): - # ufunc will not be boxed. 
Same test cases as the test_map_box - df = DataFrame( - { - "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], - "b": [ - Timestamp("2011-01-01", tz="US/Eastern"), - Timestamp("2011-01-02", tz="US/Eastern"), - ], - "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], - "d": [ - pd.Period("2011-01-01", freq="M"), - pd.Period("2011-01-02", freq="M"), - ], - } - ) - result = df.applymap(lambda x: type(x).__name__) - expected = DataFrame( - { - "a": ["Timestamp", "Timestamp"], - "b": ["Timestamp", "Timestamp"], - "c": ["Timedelta", "Timedelta"], - "d": ["Period", "Period"], - } - ) - tm.assert_frame_equal(result, expected) +def test_apply_multi_index(float_frame): + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"]) + tm.assert_frame_equal(result, expected, check_like=True) - def test_frame_apply_dont_convert_datetime64(self): - from pandas.tseries.offsets import BDay - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) +def test_apply_dict(): - df = df.applymap(lambda x: x + BDay()) - df = df.applymap(lambda x: x + BDay()) + # GH 8735 + A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) + A_dicts = Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]) + B = DataFrame([[0, 1], [2, 3]]) + B_dicts = Series([{0: 0, 1: 2}, {0: 1, 1: 3}]) + fn = lambda x: x.to_dict() - assert df.x1.dtype == "M8[ns]" + for df, dicts in [(A, A_dicts), (B, B_dicts)]: + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") + reduce_none = df.apply(fn) - def test_apply_non_numpy_dtype(self): - # GH 12244 - df = DataFrame( - {"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")} - ) - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) + tm.assert_series_equal(reduce_true, dicts) + tm.assert_frame_equal(reduce_false, df) + tm.assert_series_equal(reduce_none, dicts) - result = df.apply(lambda x: x + pd.Timedelta("1day")) - expected = DataFrame( - {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} - ) - tm.assert_frame_equal(result, expected) - df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) +def test_applymap(float_frame): + applied = float_frame.applymap(lambda x: x * 2) + tm.assert_frame_equal(applied, float_frame * 2) + float_frame.applymap(type) - def test_apply_dup_names_multi_agg(self): - # GH 21063 - df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) - expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) - result = df.agg(["min"]) + # GH 465: function returning tuples + result = float_frame.applymap(lambda x: (x, x)) + assert isinstance(result["A"][0], tuple) - tm.assert_frame_equal(result, expected) + # GH 2909: object conversion to float in constructor? 
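+    # the mixed int/str column should stay object dtype through an identity applymap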
+ df = DataFrame(data=[1, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object - def test_apply_nested_result_axis_1(self): - # GH 13820 - def apply_list(row): - return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + df = DataFrame(data=[1.0, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object - df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) - result = df.apply(apply_list, axis=1) - expected = Series( - [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] - ) - tm.assert_series_equal(result, expected) + # GH 2786 + df = DataFrame(np.random.random((3, 4))) + df2 = df.copy() + cols = ["a", "a", "a", "a"] + df.columns = cols - def test_apply_noreduction_tzaware_object(self): - # https://github.com/pandas-dev/pandas/issues/31505 - df = DataFrame( - {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" - ) - result = df.apply(lambda x: x) - tm.assert_frame_equal(result, df) - result = df.apply(lambda x: x.copy()) - tm.assert_frame_equal(result, df) + expected = df2.applymap(str) + expected.columns = cols + result = df.applymap(str) + tm.assert_frame_equal(result, expected) - def test_apply_function_runs_once(self): - # https://github.com/pandas-dev/pandas/issues/30815 + # datetime/timedelta + df["datetime"] = Timestamp("20130101") + df["timedelta"] = pd.Timedelta("1 min") + result = df.applymap(str) + for f in ["datetime", "timedelta"]: + assert result.loc[0, f] == str(df.loc[0, f]) + + # GH 8222 + empty_frames = [ + DataFrame(), + DataFrame(columns=list("ABC")), + DataFrame(index=list("ABC")), + DataFrame({"A": [], "B": [], "C": []}), + ] + for frame in empty_frames: + for func in [round, lambda x: x]: + result = frame.applymap(func) + tm.assert_frame_equal(result, frame) + + +def test_applymap_na_ignore(float_frame): + # GH 23803 + strlen_frame = float_frame.applymap(lambda x: len(str(x))) + float_frame_with_na = float_frame.copy() + mask = np.random.randint(0, 2, size=float_frame.shape, dtype=bool) + float_frame_with_na[mask] = pd.NA + strlen_frame_na_ignore = float_frame_with_na.applymap( + lambda x: len(str(x)), na_action="ignore" + ) + strlen_frame_with_na = strlen_frame.copy() + strlen_frame_with_na[mask] = pd.NA + tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) - df = DataFrame({"a": [1, 2, 3]}) - names = [] # Save row names function is applied to + with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): + float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc") - def reducing_function(row): - names.append(row.name) - def non_reducing_function(row): - names.append(row.name) - return row +def test_applymap_box_timestamps(): + # GH 2689, GH 2627 + ser = Series(date_range("1/1/2000", periods=10)) - for func in [reducing_function, non_reducing_function]: - del names[:] + def func(x): + return (x.hour, x.day, x.month) - df.apply(func, axis=1) - assert names == list(df.index) + # it works! + DataFrame(ser).applymap(func) - def test_apply_raw_function_runs_once(self): - # https://github.com/pandas-dev/pandas/issues/34506 - df = DataFrame({"a": [1, 2, 3]}) - values = [] # Save row values function is applied to +def test_applymap_box(): + # ufunc will not be boxed. 
Same test cases as the test_map_box + df = DataFrame( + { + "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) - def reducing_function(row): - values.extend(row) + result = df.applymap(lambda x: type(x).__name__) + expected = DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) + tm.assert_frame_equal(result, expected) - def non_reducing_function(row): - values.extend(row) - return row - for func in [reducing_function, non_reducing_function]: - del values[:] +def test_frame_apply_dont_convert_datetime64(): + from pandas.tseries.offsets import BDay - df.apply(func, raw=True, axis=1) - assert values == list(df.a.to_list()) + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) - def test_applymap_function_runs_once(self): + df = df.applymap(lambda x: x + BDay()) + df = df.applymap(lambda x: x + BDay()) - df = DataFrame({"a": [1, 2, 3]}) - values = [] # Save values function is applied to + assert df.x1.dtype == "M8[ns]" - def reducing_function(val): - values.append(val) - def non_reducing_function(val): - values.append(val) - return val +def test_apply_non_numpy_dtype(): + # GH 12244 + df = DataFrame({"dt": date_range("2015-01-01", periods=3, tz="Europe/Brussels")}) + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) - for func in [reducing_function, non_reducing_function]: - del values[:] + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) + tm.assert_frame_equal(result, expected) - df.applymap(func) - assert values == df.a.to_list() + df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) - def test_apply_with_byte_string(self): - # GH 34529 - df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) - expected = DataFrame( - np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object - ) - # After we make the aply we exect a dataframe just - # like the original but with the object datatype - result = df.apply(lambda x: x.astype("object")) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) - def test_apply_category_equalness(self, val): - # Check if categorical comparisons on apply, GH 21239 - df_values = ["asd", None, 12, "asd", "cde", np.NaN] - df = DataFrame({"a": df_values}, dtype="category") +def test_apply_dup_names_multi_agg(): + # GH 21063 + df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) - result = df.a.apply(lambda x: x == val) - expected = Series( - [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" - ) - tm.assert_series_equal(result, expected) - - -class TestInferOutputShape: - # the user has supplied an opaque UDF where - # they are transforming the input that requires - # us to infer the output - - def test_infer_row_shape(self): - # GH 17437 - # if row shape is changing, infer it - df = DataFrame(np.random.rand(10, 2)) - result = df.apply(np.fft.fft, axis=0) - assert result.shape == (10, 2) - - result = df.apply(np.fft.rfft, axis=0) - assert result.shape == (6, 
2) - - def test_with_dictlike_columns(self): - # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) - expected = Series([{"s": 3} for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - df["tm"] = [ - Timestamp("2017-05-01 00:00:00"), - Timestamp("2017-05-02 00:00:00"), - ] - result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) - tm.assert_series_equal(result, expected) - - # compose a series - result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) - expected = Series([{"s": 3}, {"s": 3}]) - tm.assert_series_equal(result, expected) - - # GH 18775 - df = DataFrame() - df["author"] = ["X", "Y", "Z"] - df["publisher"] = ["BBC", "NBC", "N24"] - df["date"] = pd.to_datetime( - ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] - ) - result = df.apply(lambda x: {}, axis=1) - expected = Series([{}, {}, {}]) - tm.assert_series_equal(result, expected) - - def test_with_dictlike_columns_with_infer(self): - # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - result = df.apply( - lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" - ) - expected = DataFrame({"s": [3, 3]}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - df["tm"] = [ - Timestamp("2017-05-01 00:00:00"), - Timestamp("2017-05-02 00:00:00"), - ] - result = df.apply( - lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" - ) - tm.assert_frame_equal(result, expected) - def test_with_listlike_columns(self): - # GH 17348 - df = DataFrame( - { - "a": Series(np.random.randn(4)), - "b": ["a", "list", "of", "words"], - "ts": date_range("2016-10-01", periods=4, freq="H"), - } - ) +def test_apply_nested_result_axis_1(): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] - result = df[["a", "b"]].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) - tm.assert_series_equal(result, expected) + df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) - result = df[["a", "ts"]].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) - tm.assert_series_equal(result, expected) - # GH 18919 - df = DataFrame( - {"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])} - ) - df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) - - result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) - expected = Series([[], ["q"]], index=df.index) - tm.assert_series_equal(result, expected) - - def test_infer_output_shape_columns(self): - # GH 18573 - - df = DataFrame( - { - "number": [1.0, 2.0], - "string": ["foo", "bar"], - "datetime": [ - Timestamp("2017-11-29 03:30:00"), - Timestamp("2017-11-29 03:45:00"), - ], - } - ) - result = df.apply(lambda row: (row.number, row.string), axis=1) - expected = Series([(t.number, t.string) for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - def test_infer_output_shape_listlike_columns(self): - # GH 16353 - - df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) - - result = df.apply(lambda x: [1, 2, 3], axis=1) - expected = Series([[1, 2, 3] for t in df.itertuples()]) - tm.assert_series_equal(result, expected) - - result = df.apply(lambda x: [1, 2], axis=1) - expected = Series([[1, 2] for t in 
df.itertuples()])
-        tm.assert_series_equal(result, expected)
-
-        # GH 17970
-        df = DataFrame({"a": [1, 2, 3]}, index=list("abc"))
-
-        result = df.apply(lambda row: np.ones(1), axis=1)
-        expected = Series([np.ones(1) for t in df.itertuples()], index=df.index)
-        tm.assert_series_equal(result, expected)
-
-        result = df.apply(lambda row: np.ones(2), axis=1)
-        expected = Series([np.ones(2) for t in df.itertuples()], index=df.index)
-        tm.assert_series_equal(result, expected)
-
-        # GH 17892
-        df = DataFrame(
-            {
-                "a": [
-                    Timestamp("2010-02-01"),
-                    Timestamp("2010-02-04"),
-                    Timestamp("2010-02-05"),
-                    Timestamp("2010-02-06"),
-                ],
-                "b": [9, 5, 4, 3],
-                "c": [5, 3, 4, 2],
-                "d": [1, 2, 3, 4],
-            }
-        )
+def test_apply_noreduction_tzaware_object():
+    # https://github.com/pandas-dev/pandas/issues/31505
+    df = DataFrame({"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]")
+    result = df.apply(lambda x: x)
+    tm.assert_frame_equal(result, df)
+    result = df.apply(lambda x: x.copy())
+    tm.assert_frame_equal(result, df)
+

-        def fun(x):
-            return (1, 2)
+def test_apply_function_runs_once():
+    # https://github.com/pandas-dev/pandas/issues/30815

-        result = df.apply(fun, axis=1)
-        expected = Series([(1, 2) for t in df.itertuples()])
-        tm.assert_series_equal(result, expected)
+    df = DataFrame({"a": [1, 2, 3]})
+    names = []  # Save the row names the function is applied to

-    def test_consistent_coerce_for_shapes(self):
-        # we want column names to NOT be propagated
-        # just because the shape matches the input shape
-        df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
+    def reducing_function(row):
+        names.append(row.name)

-        result = df.apply(lambda x: [1, 2, 3], axis=1)
-        expected = Series([[1, 2, 3] for t in df.itertuples()])
-        tm.assert_series_equal(result, expected)
+    def non_reducing_function(row):
+        names.append(row.name)
+        return row

-        result = df.apply(lambda x: [1, 2], axis=1)
-        expected = Series([[1, 2] for t in df.itertuples()])
-        tm.assert_series_equal(result, expected)
+    for func in [reducing_function, non_reducing_function]:
+        del names[:]

-    def test_consistent_names(self, int_frame_const_col):
-        # if a Series is returned, we should use the resulting index names
-        df = int_frame_const_col
+        df.apply(func, axis=1)
+        assert names == list(df.index)

-        result = df.apply(
-            lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1
-        )
-        expected = int_frame_const_col.rename(
-            columns={"A": "test", "B": "other", "C": "cols"}
-        )
-        tm.assert_frame_equal(result, expected)

-        result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1)
-        expected = expected[["test", "other"]]
-        tm.assert_frame_equal(result, expected)
+def test_apply_raw_function_runs_once():
+    # https://github.com/pandas-dev/pandas/issues/34506

-    def test_result_type(self, int_frame_const_col):
-        # result_type should be consistent no matter which
-        # path we take in the code
-        df = int_frame_const_col
+    df = DataFrame({"a": [1, 2, 3]})
+    values = []  # Save the row values the function is applied to

-        result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
-        expected = df.copy()
-        expected.columns = [0, 1, 2]
-        tm.assert_frame_equal(result, expected)
+    def reducing_function(row):
+        values.extend(row)

-        result = df.apply(lambda x: [1, 2], axis=1, result_type="expand")
-        expected = df[["A", "B"]].copy()
-        expected.columns = [0, 1]
-        tm.assert_frame_equal(result, expected)
+    def non_reducing_function(row):
+        values.extend(row)
+        return row

-        # broadcast result
-        result = 
df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast")
-        expected = df.copy()
-        tm.assert_frame_equal(result, expected)
+    for func in [reducing_function, non_reducing_function]:
+        del values[:]

-        columns = ["other", "col", "names"]
-        result = df.apply(
-            lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast"
-        )
-        expected = df.copy()
-        tm.assert_frame_equal(result, expected)
+        df.apply(func, raw=True, axis=1)
+        assert values == df.a.to_list()

-        # series result
-        result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1)
-        expected = df.copy()
-        tm.assert_frame_equal(result, expected)

-        # series result with other index
-        columns = ["other", "col", "names"]
-        result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1)
-        expected = df.copy()
-        expected.columns = columns
-        tm.assert_frame_equal(result, expected)
+def test_applymap_function_runs_once():

-    @pytest.mark.parametrize("result_type", ["foo", 1])
-    def test_result_type_error(self, result_type, int_frame_const_col):
-        # allowed result_type
-        df = int_frame_const_col
+    df = DataFrame({"a": [1, 2, 3]})
+    values = []  # Save the values the function is applied to

-        msg = (
-            "invalid value for result_type, must be one of "
-            "{None, 'reduce', 'broadcast', 'expand'}"
-        )
-        with pytest.raises(ValueError, match=msg):
-            df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type)
+    def reducing_function(val):
+        values.append(val)
+
+    def non_reducing_function(val):
+        values.append(val)
+        return val

-    @pytest.mark.parametrize(
-        "box",
-        [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")],
-        ids=["list", "tuple", "array"],
+    for func in [reducing_function, non_reducing_function]:
+        del values[:]
+
+        df.applymap(func)
+        assert values == df.a.to_list()
+
+
+def test_apply_with_byte_string():
+    # GH 34529
+    df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"])
+    expected = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object)
+    # After the apply we expect a DataFrame just
+    # like the original but with object dtype
+    result = df.apply(lambda x: x.astype("object"))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", ["asd", 12, None, np.NaN])
+def test_apply_category_equalness(val):
+    # Check that categorical comparisons work in apply, GH 21239
+    df_values = ["asd", None, 12, "asd", "cde", np.NaN]
+    df = DataFrame({"a": df_values}, dtype="category")
+
+    result = df.a.apply(lambda x: x == val)
+    expected = Series(
+        [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a"
    )
-    def test_consistency_for_boxed(self, box, int_frame_const_col):
-        # passing an array or list should not affect the output shape
-        df = int_frame_const_col
+    tm.assert_series_equal(result, expected)

-        result = df.apply(lambda x: box([1, 2]), axis=1)
-        expected = Series([box([1, 2]) for t in df.itertuples()])
-        tm.assert_series_equal(result, expected)

-        result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand")
-        expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1})
-        tm.assert_frame_equal(result, expected)
+# the user has supplied an opaque UDF that
+# transforms the input, which requires
+# us to infer the output shape

-class TestDataFrameAggregate:
-    def test_agg_transform(self, axis, float_frame):
-        other_axis = 1 if axis in {0, "index"} else 0
+def test_infer_row_shape():
+    # GH 17437
+    # if row shape is changing, infer it
+    df = DataFrame(np.random.rand(10, 2))
+    result = df.apply(np.fft.fft, 
axis=0) + assert result.shape == (10, 2) - with np.errstate(all="ignore"): + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) - f_abs = np.abs(float_frame) - f_sqrt = np.sqrt(float_frame) - - # ufunc - expected = f_sqrt.copy() - result = float_frame.apply(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - - # list-like - result = float_frame.apply([np.sqrt], axis=axis) - expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["sqrt"]] - ) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both - # functions per series and then concatting - result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) - tm.assert_frame_equal(result, expected) - - def test_transform_and_agg_err(self, axis, float_frame): - # cannot both transform and agg - msg = "cannot combine transform and aggregation operations" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.agg(["max", "sqrt"], axis=axis) - - df = DataFrame({"A": range(5), "B": 5}) - - def f(): - with np.errstate(all="ignore"): - df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - - def test_demo(self): - # demonstration tests - df = DataFrame({"A": range(5), "B": 5}) - - result = df.agg(["min", "max"]) - expected = DataFrame( - {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] - ) - tm.assert_frame_equal(result, expected) - result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) - expected = DataFrame( - {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, - columns=["A", "B"], - index=["max", "min", "sum"], - ) - tm.assert_frame_equal(result.reindex_like(expected), expected) - - def test_agg_with_name_as_column_name(self): - # GH 36212 - Column name is "name" - data = {"name": ["foo", "bar"]} - df = DataFrame(data) - - # result's name should be None - result = df.agg({"name": "count"}) - expected = Series({"name": 2}) - tm.assert_series_equal(result, expected) - - # Check if name is still preserved when aggregating series instead - result = df["name"].agg({"name": "count"}) - expected = Series({"name": 2}, name="name") - tm.assert_series_equal(result, expected) - - def test_agg_multiple_mixed_no_warning(self): - # GH 20909 - mdf = DataFrame( - { - "A": [1, 2, 3], - "B": [1.0, 2.0, 3.0], - "C": ["foo", "bar", "baz"], - "D": pd.date_range("20130101", periods=3), - } - ) - expected = DataFrame( - { - "A": [1, 6], - "B": [1.0, 6.0], - "C": ["bar", "foobarbaz"], - "D": [Timestamp("2013-01-01"), pd.NaT], - }, - index=["min", "sum"], - ) - # sorted index - with tm.assert_produces_warning(None): - result = mdf.agg(["min", "sum"]) +def test_with_dictlike_columns(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - tm.assert_frame_equal(result, expected) + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = 
df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(None): - result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) + # compose a series + result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) + tm.assert_series_equal(result, expected) - # For backwards compatibility, the result's index is - # still sorted by function name, so it's ['min', 'sum'] - # not ['sum', 'min']. - expected = expected[["D", "C", "B", "A"]] - tm.assert_frame_equal(result, expected) + # GH 18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + tm.assert_series_equal(result, expected) - def test_agg_dict_nested_renaming_depr(self): - df = DataFrame({"A": range(5), "B": 5}) +def test_with_dictlike_columns_with_infer(): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + expected = DataFrame({"s": [3, 3]}) + tm.assert_frame_equal(result, expected) - # nested renaming - msg = r"nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + df["tm"] = [ + Timestamp("2017-05-01 00:00:00"), + Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand") + tm.assert_frame_equal(result, expected) - def test_agg_reduce(self, axis, float_frame): - other_axis = 1 if axis in {0, "index"} else 0 - name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() - # all reducers - expected = pd.concat( - [ - float_frame.mean(axis=axis), - float_frame.max(axis=axis), - float_frame.sum(axis=axis), +def test_with_listlike_columns(): + # GH 17348 + df = DataFrame( + { + "a": Series(np.random.randn(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="H"), + } + ) + + result = df[["a", "b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) + tm.assert_series_equal(result, expected) + + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 18919 + df = DataFrame({"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])}) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) + + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) + tm.assert_series_equal(result, expected) + + +def test_infer_output_shape_columns(): + # GH 18573 + + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + Timestamp("2017-11-29 03:30:00"), + Timestamp("2017-11-29 03:45:00"), ], - axis=1, - ) - expected.columns = ["mean", "max", "sum"] - expected = expected.T if axis in {0, "index"} else expected + } + ) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([(t.number, t.string) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - result = float_frame.agg(["mean", "max", "sum"], axis=axis) - tm.assert_frame_equal(result, expected) - # dict input with scalars - func = {name1: "mean", name2: "sum"} - result = 
float_frame.agg(func, axis=axis) - expected = Series( - [ - float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name2].sum(), +def test_infer_output_shape_listlike_columns(): + # GH 16353 + + df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 17970 + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + # GH 17892 + df = DataFrame( + { + "a": [ + Timestamp("2010-02-01"), + Timestamp("2010-02-04"), + Timestamp("2010-02-05"), + Timestamp("2010-02-06"), ], - index=[name1, name2], - ) - tm.assert_series_equal(result, expected) - - # dict input with lists - func = {name1: ["mean"], name2: ["sum"]} - result = float_frame.agg(func, axis=axis) - expected = DataFrame( - { - name1: Series( - [float_frame.loc(other_axis)[name1].mean()], index=["mean"] - ), - name2: Series( - [float_frame.loc(other_axis)[name2].sum()], index=["sum"] - ), - } - ) - expected = expected.T if axis in {1, "columns"} else expected - tm.assert_frame_equal(result, expected) + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) - # dict input with lists with multiple - func = {name1: ["mean", "sum"], name2: ["sum", "max"]} - result = float_frame.agg(func, axis=axis) - expected = pd.concat( - { - name1: Series( - [ - float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name1].sum(), - ], - index=["mean", "sum"], - ), - name2: Series( - [ - float_frame.loc(other_axis)[name2].sum(), - float_frame.loc(other_axis)[name2].max(), - ], - index=["sum", "max"], - ), - }, - axis=1, - ) - expected = expected.T if axis in {1, "columns"} else expected - tm.assert_frame_equal(result, expected) + def fun(x): + return (1, 2) - def test_nuiscance_columns(self): + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) - # GH 15015 - df = DataFrame( - { - "A": [1, 2, 3], - "B": [1.0, 2.0, 3.0], - "C": ["foo", "bar", "baz"], - "D": pd.date_range("20130101", periods=3), - } - ) - result = df.agg("min") - expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns) - tm.assert_series_equal(result, expected) +def test_consistent_coerce_for_shapes(): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) - result = df.agg(["min"]) - expected = DataFrame( - [[1, 1.0, "bar", Timestamp("20130101")]], - index=["min"], - columns=df.columns, - ) + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + +def test_consistent_names(int_frame_const_col): + # if a Series is returned, we should use the resulting index names 
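+    # rather than the input frame's column labels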
+ df = int_frame_const_col + + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] + tm.assert_frame_equal(result, expected) + + +def test_result_type(int_frame_const_col): + # result_type should be consistent no matter which + # path we take in the code + df = int_frame_const_col + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + expected = df.copy() + expected.columns = [0, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + expected = df[["A", "B"]].copy() + expected.columns = [0, 1] + tm.assert_frame_equal(result, expected) + + # broadcast result + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + expected = df.copy() + tm.assert_frame_equal(result, expected) + + columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result with other index + columns = ["other", "col", "names"] + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + expected = df.copy() + expected.columns = columns + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("result_type", ["foo", 1]) +def test_result_type_error(result_type, int_frame_const_col): + # allowed result_type + df = int_frame_const_col + + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): + df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + + +@pytest.mark.parametrize( + "box", + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], +) +def test_consistency_for_boxed(box, int_frame_const_col): + # passing an array or list should not affect the output shape + df = int_frame_const_col + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) + tm.assert_frame_equal(result, expected) + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + expected = f_sqrt.copy() + result = float_frame.apply(np.sqrt, axis=axis) tm.assert_frame_equal(result, expected) - result = df.agg("sum") - expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) - tm.assert_series_equal(result, expected) + # list-like + result = float_frame.apply([np.sqrt], axis=axis) + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + tm.assert_frame_equal(result, expected) - result = df.agg(["sum"]) - 
expected = DataFrame( - [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] - ) + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + result = float_frame.apply([np.abs, np.sqrt], axis=axis) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_non_callable_aggregates(self, how): - # GH 16405 - # 'size' is a property of frame/series - # validate that this is working - # GH 39116 - expand to apply - df = DataFrame( - {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} - ) +def test_transform_and_agg_err(axis, float_frame): + # cannot both transform and agg + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) - # Function aggregate - result = getattr(df, how)({"A": "count"}) - expected = Series({"A": 2}) + df = DataFrame({"A": range(5), "B": 5}) - tm.assert_series_equal(result, expected) + def f(): + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - # Non-function aggregate - result = getattr(df, how)({"A": "size"}) - expected = Series({"A": 3}) - tm.assert_series_equal(result, expected) +def test_demo(): + # demonstration tests + df = DataFrame({"A": range(5), "B": 5}) - # Mix function and non-function aggs - result1 = getattr(df, how)(["count", "size"]) - result2 = getattr(df, how)( - {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} - ) - expected = DataFrame( - { - "A": {"count": 2, "size": 3}, - "B": {"count": 2, "size": 3}, - "C": {"count": 2, "size": 3}, - } - ) + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) + tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) + tm.assert_frame_equal(result.reindex_like(expected), expected) - # Just functional string arg is same as calling df.arg() - result = getattr(df, how)("count") - expected = df.count() - tm.assert_series_equal(result, expected) +def test_agg_with_name_as_column_name(): + # GH 36212 - Column name is "name" + data = {"name": ["foo", "bar"]} + df = DataFrame(data) - # Just a string attribute arg same as calling df.arg - result = getattr(df, how)("size") - expected = df.size + # result's name should be None + result = df.agg({"name": "count"}) + expected = Series({"name": 2}) + tm.assert_series_equal(result, expected) - assert result == expected + # Check if name is still preserved when aggregating series instead + result = df["name"].agg({"name": "count"}) + expected = Series({"name": 2}, name="name") + tm.assert_series_equal(result, expected) - def test_agg_listlike_result(self): - # GH-29587 user defined function returning list-likes - df = DataFrame( - {"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, 
"bar"]} - ) - def func(group_col): - return list(group_col.dropna().unique()) +def test_agg_multiple_mixed_no_warning(): + # GH 20909 + mdf = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": date_range("20130101", periods=3), + } + ) + expected = DataFrame( + { + "A": [1, 6], + "B": [1.0, 6.0], + "C": ["bar", "foobarbaz"], + "D": [Timestamp("2013-01-01"), pd.NaT], + }, + index=["min", "sum"], + ) + # sorted index + with tm.assert_produces_warning(None): + result = mdf.agg(["min", "sum"]) + + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(None): + result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) + + # For backwards compatibility, the result's index is + # still sorted by function name, so it's ['min', 'sum'] + # not ['sum', 'min']. + expected = expected[["D", "C", "B", "A"]] + tm.assert_frame_equal(result, expected) - result = df.agg(func) - expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) - tm.assert_series_equal(result, expected) - result = df.agg([func]) - expected = expected.to_frame("func").T - tm.assert_frame_equal(result, expected) +def test_agg_dict_nested_renaming_depr(): - @pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_agg_reduce(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() + + # all reducers + expected = pd.concat( + [ + float_frame.mean(axis=axis), + float_frame.max(axis=axis), + float_frame.sum(axis=axis), + ], + axis=1, + ) + expected.columns = ["mean", "max", "sum"] + expected = expected.T if axis in {0, "index"} else expected + + result = float_frame.agg(["mean", "max", "sum"], axis=axis) + tm.assert_frame_equal(result, expected) + + # dict input with scalars + func = {name1: "mean", name2: "sum"} + result = float_frame.agg(func, axis=axis) + expected = Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name2].sum(), + ], + index=[name1, name2], + ) + tm.assert_series_equal(result, expected) + + # dict input with lists + func = {name1: ["mean"], name2: ["sum"]} + result = float_frame.agg(func, axis=axis) + expected = DataFrame( + { + name1: Series([float_frame.loc(other_axis)[name1].mean()], index=["mean"]), + name2: Series([float_frame.loc(other_axis)[name2].sum()], index=["sum"]), + } + ) + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + # dict input with lists with multiple + func = {name1: ["mean", "sum"], name2: ["sum", "max"]} + result = float_frame.agg(func, axis=axis) + expected = pd.concat( + { + name1: Series( [ - ("sum", Series(dtype="float64")), - ("max", Series(dtype="float64")), - ("min", Series(dtype="float64")), - ("all", Series(dtype=bool)), - ("any", Series(dtype=bool)), - ("mean", Series(dtype="float64")), - ("prod", Series(dtype="float64")), - ("std", Series(dtype="float64")), - ("var", Series(dtype="float64")), - ("median", Series(dtype="float64")), + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name1].sum(), ], + index=["mean", "sum"], ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), + name2: Series( [ - ("sum", Series([1.0, 3])), - 
("max", Series([1.0, 2])), - ("min", Series([1.0, 1])), - ("all", Series([True, True])), - ("any", Series([True, True])), - ("mean", Series([1, 1.5])), - ("prod", Series([1.0, 2])), - ("std", Series([np.nan, 0.707107])), - ("var", Series([np.nan, 0.5])), - ("median", Series([1, 1.5])), + float_frame.loc(other_axis)[name2].sum(), + float_frame.loc(other_axis)[name2].max(), ], + index=["sum", "max"], ), - ), + }, + axis=1, ) - def test_agg_cython_table(self, df, func, expected, axis): - # GH 21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = df.agg(func, axis=axis) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), - ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), - ], - ), - ), + expected = expected.T if axis in {1, "columns"} else expected + tm.assert_frame_equal(result, expected) + + +def test_nuiscance_columns(): + + # GH 15015 + df = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": date_range("20130101", periods=3), + } ) - def test_agg_cython_table_transform(self, df, func, expected, axis): - # GH 21224 - # test transforming functions in - # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if axis == "columns" or axis == 1: - # operating blockwise doesn't let us preserve dtypes - expected = expected.astype("float64") - - result = df.agg(func, axis=axis) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "df, func, expected", - tm.get_cython_table_params( - DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] - ), + result = df.agg("min") + expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.agg(["min"]) + expected = DataFrame( + [[1, 1.0, "bar", Timestamp("20130101")]], + index=["min"], + columns=df.columns, ) - def test_agg_cython_table_raises(self, df, func, expected, axis): - # GH 21224 - msg = "can't multiply sequence by non-int of type 'str'" - with pytest.raises(expected, match=msg): - df.agg(func, axis=axis) - - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize( - "args, kwargs", - [ - ((1, 2, 3), {}), - ((8, 7, 15), {}), - ((1, 2), {}), - ((1,), {"b": 2}), - ((), {"a": 1, "b": 2}), - ((), {"a": 2, "b": 1}), - ((), {"a": 1, "b": 2, "c": 3}), - ], + tm.assert_frame_equal(result, expected) + + result = df.agg("sum") + expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg(["sum"]) + expected = DataFrame( + [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) - def test_agg_args_kwargs(self, axis, args, kwargs): - def f(x, a, b, c=3): - return x.sum() + (a + b) / c + tm.assert_frame_equal(result, expected) - df = DataFrame([[1, 2], [3, 4]]) - if axis == 0: - expected = Series([5.0, 7.0]) - else: - expected = Series([4.0, 8.0]) - - result = df.agg(f, axis, *args, **kwargs) - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("num_cols", [2, 3, 5]) - def test_frequency_is_original(self, num_cols): - # GH 22150 - index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) - original = index.copy() - df = DataFrame(1, index=index, columns=range(num_cols)) - 
df.apply(lambda x: x) - assert index.freq == original.freq - - def test_apply_datetime_tz_issue(self): - # GH 29052 - - timestamps = [ - Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), - Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), - Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), - ] - df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) - expected = Series(index=timestamps, data=timestamps) - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) - @pytest.mark.parametrize("method", ["min", "max", "sum"]) - def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, method): - # GH 16832 - none_in_first_column_result = getattr(df[["A", "B"]], method)() - none_in_second_column_result = getattr(df[["B", "A"]], method)() - - tm.assert_series_equal( - none_in_first_column_result, none_in_second_column_result - ) +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_non_callable_aggregates(how): + + # GH 16405 + # 'size' is a property of frame/series + # validate that this is working + # GH 39116 - expand to apply + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + + # Function aggregate + result = getattr(df, how)({"A": "count"}) + expected = Series({"A": 2}) + + tm.assert_series_equal(result, expected) + + # Non-function aggregate + result = getattr(df, how)({"A": "size"}) + expected = Series({"A": 3}) + + tm.assert_series_equal(result, expected) + + # Mix function and non-function aggs + result1 = getattr(df, how)(["count", "size"]) + result2 = getattr(df, how)( + {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} + ) + expected = DataFrame( + { + "A": {"count": 2, "size": 3}, + "B": {"count": 2, "size": 3}, + "C": {"count": 2, "size": 3}, + } + ) - @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) - def test_apply_dtype(self, col): - # GH 31466 - df = DataFrame([[1.0, col]], columns=["a", "b"]) - result = df.apply(lambda x: x.dtype) - expected = df.dtypes + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) - tm.assert_series_equal(result, expected) + # Just functional string arg is same as calling df.arg() + result = getattr(df, how)("count") + expected = df.count() + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_size_as_str(how, axis): + # GH 39934 + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + # Just a string attribute arg same as calling df.arg + # on the columns + result = getattr(df, how)("size", axis=axis) + if axis == 0 or axis == "index": + expected = Series(df.shape[0], index=df.columns, name="size") + else: + expected = Series(df.shape[1], index=df.index, name="size") + tm.assert_series_equal(result, expected) + + +def test_agg_listlike_result(): + # GH-29587 user defined function returning list-likes + df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]}) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + 
chain( + tm.get_cython_table_params( + DataFrame(), + [ + ("sum", Series(dtype="float64")), + ("max", Series(dtype="float64")), + ("min", Series(dtype="float64")), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series(dtype="float64")), + ("prod", Series(dtype="float64")), + ("std", Series(dtype="float64")), + ("var", Series(dtype="float64")), + ("median", Series(dtype="float64")), + ], + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), +) +def test_agg_cython_table(df, func, expected, axis): + # GH 21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = df.agg(func, axis=axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + chain( + tm.get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + tm.get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), + ], + ), + ), +) +def test_agg_cython_table_transform(df, func, expected, axis): + # GH 21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + + result = df.agg(func, axis=axis) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df, func, expected", + tm.get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), +) +def test_agg_cython_table_raises(df, func, expected, axis): + # GH 21224 + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(expected, match=msg): + df.agg(func, axis=axis) + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize( + "args, kwargs", + [ + ((1, 2, 3), {}), + ((8, 7, 15), {}), + ((1, 2), {}), + ((1,), {"b": 2}), + ((), {"a": 1, "b": 2}), + ((), {"a": 2, "b": 1}), + ((), {"a": 1, "b": 2, "c": 3}), + ], +) +def test_agg_args_kwargs(axis, args, kwargs): + def f(x, a, b, c=3): + return x.sum() + (a + b) / c + + df = DataFrame([[1, 2], [3, 4]]) + + if axis == 0: + expected = Series([5.0, 7.0]) + else: + expected = Series([4.0, 8.0]) + + result = df.agg(f, axis, *args, **kwargs) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("num_cols", [2, 3, 5]) +def test_frequency_is_original(num_cols): + # GH 22150 + index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) + original = index.copy() + df = DataFrame(1, index=index, columns=range(num_cols)) + df.apply(lambda x: x) + assert index.freq == original.freq + + +def test_apply_datetime_tz_issue(): + # GH 29052 + + timestamps = [ + Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("df", 
[DataFrame({"A": ["a", None], "B": ["c", "d"]})]) +@pytest.mark.parametrize("method", ["min", "max", "sum"]) +def test_consistency_of_aggregates_of_columns_with_missing_values(df, method): + # GH 16832 + none_in_first_column_result = getattr(df[["A", "B"]], method)() + none_in_second_column_result = getattr(df[["B", "A"]], method)() + + tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result) + + +@pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) +def test_apply_dtype(col): + # GH 31466 + df = DataFrame([[1.0, col]], columns=["a", "b"]) + result = df.apply(lambda x: x.dtype) + expected = df.dtypes + + tm.assert_series_equal(result, expected) def test_apply_mutating(): diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 965f69753bdc7..732aff24428ac 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -5,100 +5,103 @@ import pandas._testing as tm -class TestDataFrameNamedAggregate: - def test_agg_relabel(self): - # GH 26513 - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - - # simplest case with one column, one func - result = df.agg(foo=("B", "sum")) - expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) - tm.assert_frame_equal(result, expected) - - # test on same column with different methods - result = df.agg(foo=("B", "sum"), bar=("B", "min")) - expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) - - tm.assert_frame_equal(result, expected) - - def test_agg_relabel_multi_columns_multi_methods(self): - # GH 26513, test on multiple columns with multiple methods - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg( - foo=("A", "sum"), - bar=("B", "mean"), - cat=("A", "min"), - dat=("B", "max"), - f=("A", "max"), - g=("C", "min"), - ) - expected = pd.DataFrame( - { - "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], - "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], - "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_relabel_partial_functions(self): - # GH 26513, test on partial, functools or more complex cases - df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) - expected = pd.DataFrame( - {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=("A", min), - bar=("A", np.min), - cat=("B", max), - dat=("C", "min"), - f=("B", np.sum), - kk=("B", lambda x: min(x)), - ) - expected = pd.DataFrame( - { - "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], - "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], - }, - index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_namedtuple(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - result = df.agg( - foo=pd.NamedAgg("B", "sum"), - bar=pd.NamedAgg("B", min), - cat=pd.NamedAgg(column="B", aggfunc="count"), - fft=pd.NamedAgg("B", aggfunc="max"), - ) - - expected = pd.DataFrame( - {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) - ) - tm.assert_frame_equal(result, expected) - - result = df.agg( - foo=pd.NamedAgg("A", "min"), - 
bar=pd.NamedAgg(column="B", aggfunc="max"), - cat=pd.NamedAgg(column="A", aggfunc="max"), - ) - expected = pd.DataFrame( - {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, - index=pd.Index(["foo", "bar", "cat"]), - ) - tm.assert_frame_equal(result, expected) - - def test_agg_raises(self): - # GH 26513 - df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) - msg = "Must provide" - - with pytest.raises(TypeError, match=msg): - df.agg() +def test_agg_relabel(): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multi_columns_multi_methods(): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_partial_functions(): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_namedtuple(): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_raises(): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index bf8311f992ea5..19e6cda4ebd22 100644 --- a/pandas/tests/apply/test_series_apply.py +++ 
b/pandas/tests/apply/test_series_apply.py @@ -23,821 +23,857 @@ from pandas.core.base import SpecificationError -class TestSeriesApply: - def test_series_map_box_timedelta(self): - # GH#11349 - ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) +def test_series_map_box_timedelta(): + # GH#11349 + ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) - def f(x): - return x.total_seconds() + def f(x): + return x.total_seconds() - ser.map(f) - ser.apply(f) - DataFrame(ser).applymap(f) + ser.map(f) + ser.apply(f) + DataFrame(ser).applymap(f) - def test_apply(self, datetime_series): - with np.errstate(all="ignore"): - tm.assert_series_equal( - datetime_series.apply(np.sqrt), np.sqrt(datetime_series) - ) - # element-wise apply - import math +def test_apply(datetime_series): + with np.errstate(all="ignore"): + tm.assert_series_equal(datetime_series.apply(np.sqrt), np.sqrt(datetime_series)) - tm.assert_series_equal( - datetime_series.apply(math.exp), np.exp(datetime_series) - ) + # element-wise apply + import math - # empty series - s = Series(dtype=object, name="foo", index=Index([], name="bar")) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) + tm.assert_series_equal(datetime_series.apply(math.exp), np.exp(datetime_series)) - # check all metadata (GH 9322) - assert s is not rs - assert s.index is rs.index - assert s.dtype == rs.dtype - assert s.name == rs.name + # empty series + s = Series(dtype=object, name="foo", index=Index([], name="bar")) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) - # index but no data - s = Series(index=[1, 2, 3], dtype=np.float64) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) + # check all metadata (GH 9322) + assert s is not rs + assert s.index is rs.index + assert s.dtype == rs.dtype + assert s.name == rs.name - def test_apply_same_length_inference_bug(self): - s = Series([1, 2]) + # index but no data + s = Series(index=[1, 2, 3], dtype=np.float64) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) - def f(x): - return (x, x + 1) - result = s.apply(f) - expected = s.map(f) - tm.assert_series_equal(result, expected) +def test_apply_same_length_inference_bug(): + s = Series([1, 2]) - s = Series([1, 2, 3]) - result = s.apply(f) - expected = s.map(f) - tm.assert_series_equal(result, expected) + def f(x): + return (x, x + 1) - def test_apply_dont_convert_dtype(self): - s = Series(np.random.randn(10)) - - def f(x): - return x if x > 0 else np.nan - - result = s.apply(f, convert_dtype=False) - assert result.dtype == object - - def test_with_string_args(self, datetime_series): - - for arg in ["sum", "mean", "min", "max", "std"]: - result = datetime_series.apply(arg) - expected = getattr(datetime_series, arg)() - assert result == expected - - def test_apply_args(self): - s = Series(["foo,bar"]) - - result = s.apply(str.split, args=(",",)) - assert result[0] == ["foo", "bar"] - assert isinstance(result[0], list) - - def test_series_map_box_timestamps(self): - # GH#2689, GH#2627 - ser = Series(pd.date_range("1/1/2000", periods=10)) - - def func(x): - return (x.hour, x.day, x.month) - - # it works! - ser.map(func) - ser.apply(func) - - def test_apply_box(self): - # ufunc will not be boxed. 
Same test cases as the test_map_box - vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) - tm.assert_series_equal(res, exp) - - vals = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) - tm.assert_series_equal(res, exp) - - # timedelta - vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") - exp = Series(["Timedelta_1", "Timedelta_2"]) - tm.assert_series_equal(res, exp) - - # period - vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") - exp = Series(["Period_M", "Period_M"]) - tm.assert_series_equal(res, exp) - - def test_apply_datetimetz(self): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( - "Asia/Tokyo" - ) - s = Series(values, name="XX") + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) - result = s.apply(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( - "Asia/Tokyo" - ) - exp = Series(exp_values, name="XX") - tm.assert_series_equal(result, exp) - - # change dtype - # GH 14506 : Returned dtype changed from int32 to int64 - result = s.apply(lambda x: x.hour) - exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) - tm.assert_series_equal(result, exp) - - # not vectorized - def f(x): - if not isinstance(x, pd.Timestamp): - raise ValueError - return str(x.tz) - - result = s.map(f) - exp = Series(["Asia/Tokyo"] * 25, name="XX") - tm.assert_series_equal(result, exp) - - def test_apply_dict_depr(self): - - tsdf = DataFrame( - np.random.randn(10, 3), - columns=["A", "B", "C"], - index=pd.date_range("1/1/2000", periods=10), - ) - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - tsdf.A.agg({"foo": ["sum", "mean"]}) - - def test_apply_categorical(self): - values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) - ser = Series(values, name="XX", index=list("abcdefg")) - result = ser.apply(lambda x: x.lower()) - - # should be categorical dtype when the number of categories are - # the same - values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) - exp = Series(values, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - tm.assert_categorical_equal(result.values, exp.values) - - result = ser.apply(lambda x: "A") - exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - assert result.dtype == object - - @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) - def test_apply_categorical_with_nan_values(self, series): - # GH 20714 bug fixed in: GH 24275 - s = Series(series, dtype="category") - result = s.apply(lambda x: x.split("-")[0]) - result = result.astype(object) - expected = Series(["1", "1", np.NaN], dtype="category") - expected = 
expected.astype(object) - tm.assert_series_equal(result, expected) + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) - def test_apply_empty_integer_series_with_datetime_index(self): - # GH 21245 - s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) - result = s.apply(lambda x: x) - tm.assert_series_equal(result, s) +def test_apply_dont_convert_dtype(): + s = Series(np.random.randn(10)) -class TestSeriesAggregate: - def test_transform(self, string_series): - # transforming functions + def f(x): + return x if x > 0 else np.nan - with np.errstate(all="ignore"): + result = s.apply(f, convert_dtype=False) + assert result.dtype == object - f_sqrt = np.sqrt(string_series) - f_abs = np.abs(string_series) - - # ufunc - result = string_series.apply(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - - # list-like - result = string_series.apply([np.sqrt]) - expected = f_sqrt.to_frame().copy() - expected.columns = ["sqrt"] - tm.assert_frame_equal(result, expected) - - result = string_series.apply(["sqrt"]) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both functions per - # series and then concatting - expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ["sqrt", "absolute"] - result = string_series.apply([np.sqrt, np.abs]) - tm.assert_frame_equal(result, expected) - - # dict, provide renaming - expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ["foo", "bar"] - expected = expected.unstack().rename("series") - - result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) - tm.assert_series_equal(result.reindex_like(expected), expected) - - def test_transform_and_agg_error(self, string_series): - # we are trying to transform with an aggregator - msg = "cannot combine transform and aggregation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg(["sqrt", "max"]) - - msg = "cannot perform both aggregation and transformation" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.agg({"foo": np.sqrt, "bar": "sum"}) - - def test_demo(self): - # demonstration tests - s = Series(range(6), dtype="int64", name="series") - - result = s.agg(["min", "max"]) - expected = Series([0, 5], index=["min", "max"], name="series") - tm.assert_series_equal(result, expected) - result = s.agg({"foo": "min"}) - expected = Series([0], index=["foo"], name="series") - tm.assert_series_equal(result, expected) +def test_with_string_args(datetime_series): + + for arg in ["sum", "mean", "min", "max", "std"]: + result = datetime_series.apply(arg) + expected = getattr(datetime_series, arg)() + assert result == expected - # nested renaming - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - s.agg({"foo": ["min", "max"]}) - def test_multiple_aggregators_with_dict_api(self): +def test_apply_args(): + s = Series(["foo,bar"]) + + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] + assert isinstance(result[0], list) + + +def test_series_map_box_timestamps(): + # GH#2689, GH#2627 + ser = Series(pd.date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + ser.map(func) + ser.apply(func) + + +def test_apply_box(): + # ufunc will not be boxed. 
Same test cases as the test_map_box + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + s = Series(vals) + assert s.dtype == "datetime64[ns]" + # boxed value must be Timestamp instance + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + s = Series(vals) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + s = Series(vals) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") + exp = Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + s = Series(vals) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") + exp = Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + +def test_apply_datetimetz(): + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = Series(values, name="XX") - s = Series(range(6), dtype="int64", name="series") - # nested renaming - msg = "nested renamer is not supported" - with pytest.raises(SpecificationError, match=msg): - s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + result = s.apply(lambda x: x + pd.offsets.Day()) + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) - def test_agg_apply_evaluate_lambdas_the_same(self, string_series): - # test that we are evaluating row-by-row first - # before vectorized evaluation - result = string_series.apply(lambda x: str(x)) - expected = string_series.agg(lambda x: str(x)) - tm.assert_series_equal(result, expected) + # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 + result = s.apply(lambda x: x.hour) + exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) + tm.assert_series_equal(result, exp) + + # not vectorized + def f(x): + if not isinstance(x, pd.Timestamp): + raise ValueError + return str(x.tz) + + result = s.map(f) + exp = Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) - result = string_series.apply(str) - expected = string_series.agg(str) + +def test_apply_dict_depr(): + + tsdf = DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + tsdf.A.agg({"foo": ["sum", "mean"]}) + + +def test_apply_categorical(): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = Series(values, name="XX", index=list("abcdefg")) + result = ser.apply(lambda x: x.lower()) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = Series(["A"] * 7, 
name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == object + + +@pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) +def test_apply_categorical_with_nan_values(series): + # GH 20714 bug fixed in: GH 24275 + s = Series(series, dtype="category") + result = s.apply(lambda x: x.split("-")[0]) + result = result.astype(object) + expected = Series(["1", "1", np.NaN], dtype="category") + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + +def test_apply_empty_integer_series_with_datetime_index(): + # GH 21245 + s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + result = s.apply(lambda x: x) + tm.assert_series_equal(result, s) + + +def test_transform(string_series): + # transforming functions + + with np.errstate(all="ignore"): + + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.apply(np.sqrt) + expected = f_sqrt.copy() tm.assert_series_equal(result, expected) - def test_with_nested_series(self, datetime_series): - # GH 2316 - # .agg with a reducer and a transform, what to do - result = datetime_series.apply( - lambda x: Series([x, x ** 2], index=["x", "x^2"]) - ) - expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) + # list-like + result = string_series.apply([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + result = string_series.apply(["sqrt"]) tm.assert_frame_equal(result, expected) - def test_replicate_describe(self, string_series): - # this also tests a result set that is all scalars - expected = string_series.describe() - result = string_series.apply( - { - "count": "count", - "mean": "mean", - "std": "std", - "min": "min", - "25%": lambda x: x.quantile(0.25), - "50%": "median", - "75%": lambda x: x.quantile(0.75), - "max": "max", - } - ) - tm.assert_series_equal(result, expected) + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "absolute"] + result = string_series.apply([np.sqrt, np.abs]) + tm.assert_frame_equal(result, expected) - def test_reduce(self, string_series): - # reductions with named functions - result = string_series.agg(["sum", "mean"]) - expected = Series( - [string_series.sum(), string_series.mean()], - ["sum", "mean"], - name=string_series.name, - ) - tm.assert_series_equal(result, expected) + # dict, provide renaming + expected = concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") - @pytest.mark.parametrize("how", ["agg", "apply"]) - def test_non_callable_aggregates(self, how): - # test agg using non-callable series attributes - # GH 39116 - expand to apply - s = Series([1, 2, None]) + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) + tm.assert_series_equal(result.reindex_like(expected), expected) - # Calling agg w/ just a string arg same as calling s.arg - result = getattr(s, how)("size") - expected = s.size + +def test_transform_and_agg_error(string_series): + # we are trying to transform with an aggregator + msg = "cannot combine transform and aggregation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) + + msg = 
"cannot perform both aggregation and transformation" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) + + +def test_demo(): + # demonstration tests + s = Series(range(6), dtype="int64", name="series") + + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") + tm.assert_series_equal(result, expected) + + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") + tm.assert_series_equal(result, expected) + + # nested renaming + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) + + +def test_multiple_aggregators_with_dict_api(): + + s = Series(range(6), dtype="int64", name="series") + # nested renaming + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + + +def test_agg_apply_evaluate_lambdas_the_same(string_series): + # test that we are evaluating row-by-row first + # before vectorized evaluation + result = string_series.apply(lambda x: str(x)) + expected = string_series.agg(lambda x: str(x)) + tm.assert_series_equal(result, expected) + + result = string_series.apply(str) + expected = string_series.agg(str) + tm.assert_series_equal(result, expected) + + +def test_with_nested_series(datetime_series): + # GH 2316 + # .agg with a reducer and a transform, what to do + result = datetime_series.apply(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) + tm.assert_frame_equal(result, expected) + + result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + + +def test_replicate_describe(string_series): + # this also tests a result set that is all scalars + expected = string_series.describe() + result = string_series.apply( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } + ) + tm.assert_series_equal(result, expected) + + +def test_reduce(string_series): + # reductions with named functions + result = string_series.agg(["sum", "mean"]) + expected = Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["agg", "apply"]) +def test_non_callable_aggregates(how): + # test agg using non-callable series attributes + # GH 39116 - expand to apply + s = Series([1, 2, None]) + + # Calling agg w/ just a string arg same as calling s.arg + result = getattr(s, how)("size") + expected = s.size + assert result == expected + + # test when mixed w/ callable reducers + result = getattr(s, how)(["size", "count", "mean"]) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + 
("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", "c"), # see GH12863 + ("any", "a"), + ], + ), + ), +) +def test_agg_cython_table(series, func, expected): + # GH21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = series.agg(func) + if is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + else: assert result == expected - # test when mixed w/ callable reducers - result = getattr(s, how)(["size", "count", "mean"]) - expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("sum", 0), - ("max", np.nan), - ("min", np.nan), - ("all", True), - ("any", False), - ("mean", np.nan), - ("prod", 1), - ("std", np.nan), - ("var", np.nan), - ("median", np.nan), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("sum", 6), - ("max", 3), - ("min", 1), - ("all", True), - ("any", True), - ("mean", 2), - ("prod", 6), - ("std", 1), - ("var", 1), - ("median", 2), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), - [ - ("sum", "abc"), - ("max", "c"), - ("min", "a"), - ("all", "c"), # see GH12863 - ("any", "a"), - ], - ), +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series(dtype=np.float64), + [ + ("cumprod", Series([], Index([]), dtype=np.float64)), + ("cumsum", Series([], Index([]), dtype=np.float64)), + ], ), - ) - def test_agg_cython_table(self, series, func, expected): - # GH21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = series.agg(func) - if is_number(expected): - assert np.isclose(result, expected, equal_nan=True) - else: - assert result == expected - - @pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("cumprod", Series([], Index([]), dtype=np.float64)), - ("cumsum", Series([], Index([]), dtype=np.float64)), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("cumprod", Series([np.nan, 1, 2, 6])), - ("cumsum", Series([np.nan, 1, 3, 6])), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] - ), + tm.get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], ), + tm.get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), +) +def test_agg_cython_table_transform(series, func, expected): + # GH21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + result = series.agg(func) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "series, func, expected", + chain( + tm.get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), +) +def test_agg_cython_table_raises(series, func, expected): + # GH21224 + msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + with pytest.raises(expected, match=msg): + # e.g. 
Series('a b'.split()).cumprod() will raise
+        series.agg(func)
+
+
+def test_series_apply_no_suffix_index():
+    # GH36189
+    s = Series([4] * 3)
+    result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
+    expected = Series([12, 12, 12], index=["sum", "<lambda>", "<lambda>"])
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_map(datetime_series):
+    index, data = tm.getMixedTypeDict()
+
+    source = Series(data["B"], index=data["C"])
+    target = Series(data["C"][:4], index=data["D"][:4])
+
+    merged = target.map(source)
+
+    for k, v in merged.items():
+        assert v == source[target[k]]
+
+    # input could be a dict
+    merged = target.map(source.to_dict())
+
+    for k, v in merged.items():
+        assert v == source[target[k]]
+
+    # function
+    result = datetime_series.map(lambda x: x * 2)
+    tm.assert_series_equal(result, datetime_series * 2)
+
+    # GH 10324
+    a = Series([1, 2, 3, 4])
+    b = Series(["even", "odd", "even", "odd"], dtype="category")
+    c = Series(["even", "odd", "even", "odd"])
+
+    exp = Series(["odd", "even", "odd", np.nan], dtype="category")
+    tm.assert_series_equal(a.map(b), exp)
+    exp = Series(["odd", "even", "odd", np.nan])
+    tm.assert_series_equal(a.map(c), exp)
+
+    a = Series(["a", "b", "c", "d"])
+    b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"]))
+    c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"]))
+
+    exp = Series([np.nan, 1, 2, 3])
+    tm.assert_series_equal(a.map(b), exp)
+    exp = Series([np.nan, 1, 2, 3])
+    tm.assert_series_equal(a.map(c), exp)
+
+    a = Series(["a", "b", "c", "d"])
+    b = Series(
+        ["B", "C", "D", "E"],
+        dtype="category",
+        index=pd.CategoricalIndex(["b", "c", "d", "e"]),
    )
+    c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"]))

+    exp = Series(
+        pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"])
    )
+    tm.assert_series_equal(a.map(b), exp)
+    exp = Series([np.nan, "B", "C", "D"])
+    tm.assert_series_equal(a.map(c), exp)


+def test_map_empty(index):
+    if isinstance(index, MultiIndex):
+        pytest.skip("Initializing a Series from a MultiIndex is not supported")

+    s = Series(index)
+    result = s.map({})

+    expected = Series(np.nan, index=s.index)
+    tm.assert_series_equal(result, expected)


+def test_map_compat():
+    # related GH 8024
+    s = Series([True, True, False], index=[1, 2, 3])
+    result = s.map({True: "foo", False: "bar"})
+    expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
+    tm.assert_series_equal(result, expected)


+def test_map_int():
+    left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4})
+    right = Series({1: 11, 2: 22, 3: 33})

+    assert left.dtype == np.float_
+    assert issubclass(right.dtype.type, np.integer)

+    merged = left.map(right)
+    assert merged.dtype == np.float_
+    assert isna(merged["d"])
+    assert not isna(merged["c"])


+def test_map_type_inference():
+    s = Series(range(3))
+    s2 = s.map(lambda x: np.where(x == 0, 0, 1))
+    assert issubclass(s2.dtype.type, np.integer)


+def test_map_decimal(string_series):
+    from decimal import Decimal

+    result = string_series.map(lambda x: Decimal(str(x)))
+    assert result.dtype == np.object_
+    assert isinstance(result[0], Decimal)


+def test_map_na_exclusion():
+    s = Series([1.5, np.nan, 3, np.nan, 5])

-    def test_agg_cython_table_transform(self, series, func, expected):
-        # GH21224
-        # test transforming functions in
-        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-        result = series.agg(func)
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "series, func, expected",
-        chain(
-            tm.get_cython_table_params(
-                Series("a b c".split()),
-                [
-                    ("mean", TypeError),  # mean raises TypeError
-                    ("prod", TypeError),
-                    ("std", TypeError),
-                    ("var", TypeError),
-                    ("median", TypeError),
-                    ("cumprod", TypeError),
-                ],
-            )
-        ),
    )
-    def test_agg_cython_table_raises(self, series, func, expected):
-        # GH21224
-        msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
-        with pytest.raises(expected, match=msg):
-            # e.g. 
Series('a b'.split()).cumprod() will raise
-            series.agg(func)
-
-    def test_series_apply_no_suffix_index(self):
-        # GH36189
-        s = Series([4] * 3)
-        result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
-        expected = Series([12, 12, 12], index=["sum", "<lambda>", "<lambda>"])
-
-        tm.assert_series_equal(result, expected)
-
-
-class TestSeriesMap:
-    def test_map(self, datetime_series):
-        index, data = tm.getMixedTypeDict()
-
-        source = Series(data["B"], index=data["C"])
-        target = Series(data["C"][:4], index=data["D"][:4])
-
-        merged = target.map(source)
-
-        for k, v in merged.items():
-            assert v == source[target[k]]
-
-        # input could be a dict
-        merged = target.map(source.to_dict())
-
-        for k, v in merged.items():
-            assert v == source[target[k]]
-
-        # function
-        result = datetime_series.map(lambda x: x * 2)
-        tm.assert_series_equal(result, datetime_series * 2)
-
-        # GH 10324
-        a = Series([1, 2, 3, 4])
-        b = Series(["even", "odd", "even", "odd"], dtype="category")
-        c = Series(["even", "odd", "even", "odd"])
-
-        exp = Series(["odd", "even", "odd", np.nan], dtype="category")
-        tm.assert_series_equal(a.map(b), exp)
-        exp = Series(["odd", "even", "odd", np.nan])
-        tm.assert_series_equal(a.map(c), exp)
-
-        a = Series(["a", "b", "c", "d"])
-        b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"]))
-        c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"]))
-
-        exp = Series([np.nan, 1, 2, 3])
-        tm.assert_series_equal(a.map(b), exp)
-        exp = Series([np.nan, 1, 2, 3])
-        tm.assert_series_equal(a.map(c), exp)
-
-        a = Series(["a", "b", "c", "d"])
-        b = Series(
-            ["B", "C", "D", "E"],
-            dtype="category",
-            index=pd.CategoricalIndex(["b", "c", "d", "e"]),
-        )
-        c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"]))
-
-        exp = Series(
-            pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"])
-        )
-        tm.assert_series_equal(a.map(b), exp)
-        exp = Series([np.nan, "B", "C", "D"])
-        tm.assert_series_equal(a.map(c), exp)
-
-    def test_map_empty(self, index):
-        if isinstance(index, MultiIndex):
-            pytest.skip("Initializing a Series from a MultiIndex is not supported")
-
-        s = Series(index)
-        result = s.map({})
-
-        expected = Series(np.nan, index=s.index)
-        
tm.assert_series_equal(result, expected) + result = s.map(lambda x: x * 2, na_action="ignore") + exp = s * 2 + tm.assert_series_equal(result, exp) - def test_map_compat(self): - # related GH 8024 - s = Series([True, True, False], index=[1, 2, 3]) - result = s.map({True: "foo", False: "bar"}) - expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) - tm.assert_series_equal(result, expected) - def test_map_int(self): - left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) - right = Series({1: 11, 2: 22, 3: 33}) - - assert left.dtype == np.float_ - assert issubclass(right.dtype.type, np.integer) - - merged = left.map(right) - assert merged.dtype == np.float_ - assert isna(merged["d"]) - assert not isna(merged["c"]) - - def test_map_type_inference(self): - s = Series(range(3)) - s2 = s.map(lambda x: np.where(x == 0, 0, 1)) - assert issubclass(s2.dtype.type, np.integer) - - def test_map_decimal(self, string_series): - from decimal import Decimal - - result = string_series.map(lambda x: Decimal(str(x))) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - def test_map_na_exclusion(self): - s = Series([1.5, np.nan, 3, np.nan, 5]) - - result = s.map(lambda x: x * 2, na_action="ignore") - exp = s * 2 - tm.assert_series_equal(result, exp) - - def test_map_dict_with_tuple_keys(self): - """ - Due to new MultiIndex-ing behaviour in v0.14.0, - dicts with tuple keys passed to map were being - converted to a multi-index, preventing tuple values - from being mapped properly. - """ - # GH 18496 - df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) - label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - - df["labels"] = df["a"].map(label_mappings) - df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index) - # All labels should be filled now - tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) - - def test_map_counter(self): - s = Series(["a", "b", "c"], index=[1, 2, 3]) - counter = Counter() - counter["b"] = 5 - counter["c"] += 1 - result = s.map(counter) - expected = Series([0, 5, 1], index=[1, 2, 3]) - tm.assert_series_equal(result, expected) +def test_map_dict_with_tuple_keys(): + """ + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple values + from being mapped properly. + """ + # GH 18496 + df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - def test_map_defaultdict(self): - s = Series([1, 2, 3], index=["a", "b", "c"]) - default_dict = defaultdict(lambda: "blank") - default_dict[1] = "stuff" - result = s.map(default_dict) - expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) - tm.assert_series_equal(result, expected) + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) - def test_map_dict_na_key(self): - # https://github.com/pandas-dev/pandas/issues/17648 - # Checks that np.nan key is appropriately mapped - s = Series([1, 2, np.nan]) - expected = Series(["a", "b", "c"]) - result = s.map({1: "a", 2: "b", np.nan: "c"}) - tm.assert_series_equal(result, expected) - def test_map_dict_subclass_with_missing(self): - """ - Test Series.map with a dictionary subclass that defines __missing__, - i.e. sets a default value (GH #15999). 
- """ +def test_map_counter(): + s = Series(["a", "b", "c"], index=[1, 2, 3]) + counter = Counter() + counter["b"] = 5 + counter["c"] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) - class DictWithMissing(dict): - def __missing__(self, key): - return "missing" - s = Series([1, 2, 3]) - dictionary = DictWithMissing({3: "three"}) - result = s.map(dictionary) - expected = Series(["missing", "missing", "three"]) - tm.assert_series_equal(result, expected) +def test_map_defaultdict(): + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" + result = s.map(default_dict) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) - def test_map_dict_subclass_without_missing(self): - class DictWithoutMissing(dict): - pass - s = Series([1, 2, 3]) - dictionary = DictWithoutMissing({3: "three"}) - result = s.map(dictionary) - expected = Series([np.nan, np.nan, "three"]) - tm.assert_series_equal(result, expected) +def test_map_dict_na_key(): + # https://github.com/pandas-dev/pandas/issues/17648 + # Checks that np.nan key is appropriately mapped + s = Series([1, 2, np.nan]) + expected = Series(["a", "b", "c"]) + result = s.map({1: "a", 2: "b", np.nan: "c"}) + tm.assert_series_equal(result, expected) - def test_map_abc_mapping(self, non_dict_mapping_subclass): - # https://github.com/pandas-dev/pandas/issues/29733 - # Check collections.abc.Mapping support as mapper for Series.map - s = Series([1, 2, 3]) - not_a_dictionary = non_dict_mapping_subclass({3: "three"}) - result = s.map(not_a_dictionary) - expected = Series([np.nan, np.nan, "three"]) - tm.assert_series_equal(result, expected) - def test_map_abc_mapping_with_missing(self, non_dict_mapping_subclass): - # https://github.com/pandas-dev/pandas/issues/29733 - # Check collections.abc.Mapping support as mapper for Series.map - class NonDictMappingWithMissing(non_dict_mapping_subclass): - def __missing__(self, key): - return "missing" - - s = Series([1, 2, 3]) - not_a_dictionary = NonDictMappingWithMissing({3: "three"}) - result = s.map(not_a_dictionary) - # __missing__ is a dict concept, not a Mapping concept, - # so it should not change the result! - expected = Series([np.nan, np.nan, "three"]) - tm.assert_series_equal(result, expected) +def test_map_dict_subclass_with_missing(): + """ + Test Series.map with a dictionary subclass that defines __missing__, + i.e. sets a default value (GH #15999). 
+ """ - def test_map_box(self): - vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) - tm.assert_series_equal(res, exp) - - vals = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") - exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) - tm.assert_series_equal(res, exp) - - # timedelta - vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") - exp = Series(["Timedelta_1", "Timedelta_2"]) - tm.assert_series_equal(res, exp) - - # period - vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") - exp = Series(["Period_M", "Period_M"]) - tm.assert_series_equal(res, exp) - - def test_map_categorical(self): - values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) - s = Series(values, name="XX", index=list("abcdefg")) - - result = s.map(lambda x: x.lower()) - exp_values = pd.Categorical( - list("abbabcd"), categories=list("dcba"), ordered=True - ) - exp = Series(exp_values, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - tm.assert_categorical_equal(result.values, exp_values) + class DictWithMissing(dict): + def __missing__(self, key): + return "missing" - result = s.map(lambda x: "A") - exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - assert result.dtype == object + s = Series([1, 2, 3]) + dictionary = DictWithMissing({3: "three"}) + result = s.map(dictionary) + expected = Series(["missing", "missing", "three"]) + tm.assert_series_equal(result, expected) - with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN): - s.map(lambda x: x, na_action="ignore") - def test_map_datetimetz(self): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( - "Asia/Tokyo" - ) - s = Series(values, name="XX") +def test_map_dict_subclass_without_missing(): + class DictWithoutMissing(dict): + pass - # keep tz - result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( - "Asia/Tokyo" - ) - exp = Series(exp_values, name="XX") - tm.assert_series_equal(result, exp) - - # change dtype - # GH 14506 : Returned dtype changed from int32 to int64 - result = s.map(lambda x: x.hour) - exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) - tm.assert_series_equal(result, exp) - - with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN): - s.map(lambda x: x, na_action="ignore") - - # not vectorized - def f(x): - if not isinstance(x, pd.Timestamp): - raise ValueError - return str(x.tz) - - result = s.map(f) - exp = Series(["Asia/Tokyo"] * 25, name="XX") - tm.assert_series_equal(result, exp) - - @pytest.mark.parametrize( - "vals,mapping,exp", - [ - (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]), - (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3), - (list(range(3)), {0: 42}, [42] + [np.nan] * 3), - ], + s = Series([1, 
2, 3])
+    dictionary = DictWithoutMissing({3: "three"})
+    result = s.map(dictionary)
+    expected = Series([np.nan, np.nan, "three"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_map_abc_mapping(non_dict_mapping_subclass):
+    # https://github.com/pandas-dev/pandas/issues/29733
+    # Check collections.abc.Mapping support as mapper for Series.map
+    s = Series([1, 2, 3])
+    not_a_dictionary = non_dict_mapping_subclass({3: "three"})
+    result = s.map(not_a_dictionary)
+    expected = Series([np.nan, np.nan, "three"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_map_abc_mapping_with_missing(non_dict_mapping_subclass):
+    # https://github.com/pandas-dev/pandas/issues/29733
+    # Check collections.abc.Mapping support as mapper for Series.map
+    class NonDictMappingWithMissing(non_dict_mapping_subclass):
+        def __missing__(self, key):
+            return "missing"
+
+    s = Series([1, 2, 3])
+    not_a_dictionary = NonDictMappingWithMissing({3: "three"})
+    result = s.map(not_a_dictionary)
+    # __missing__ is a dict concept, not a Mapping concept,
+    # so it should not change the result!
+    expected = Series([np.nan, np.nan, "three"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_map_box():
+    vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
+    s = Series(vals)
+    assert s.dtype == "datetime64[ns]"
+    # boxed value must be Timestamp instance
+    res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
+    exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
+    tm.assert_series_equal(res, exp)
+
+    vals = [
+        pd.Timestamp("2011-01-01", tz="US/Eastern"),
+        pd.Timestamp("2011-01-02", tz="US/Eastern"),
+    ]
+    s = Series(vals)
+    assert s.dtype == "datetime64[ns, US/Eastern]"
+    res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
+    exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
+    tm.assert_series_equal(res, exp)
+
+    # timedelta
+    vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
+    s = Series(vals)
+    assert s.dtype == "timedelta64[ns]"
+    res = s.apply(lambda x: f"{type(x).__name__}_{x.days}")
+    exp = Series(["Timedelta_1", "Timedelta_2"])
+    tm.assert_series_equal(res, exp)
+
+    # period
+    vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
+    s = Series(vals)
+    assert s.dtype == "Period[M]"
+    res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}")
+    exp = Series(["Period_M", "Period_M"])
+    tm.assert_series_equal(res, exp)
+
+
+def test_map_categorical():
+    values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
+    s = Series(values, name="XX", index=list("abcdefg"))
+
+    result = s.map(lambda x: x.lower())
+    exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
+    exp = Series(exp_values, name="XX", index=list("abcdefg"))
+    tm.assert_series_equal(result, exp)
+    tm.assert_categorical_equal(result.values, exp_values)
+
+    result = s.map(lambda x: "A")
+    exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
+    tm.assert_series_equal(result, exp)
+    assert result.dtype == object
+
+    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
+        s.map(lambda x: x, na_action="ignore")
+
+
+def test_map_datetimetz():
+    values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
+        "Asia/Tokyo"
    )
-    def test_map_missing_mixed(self, vals, mapping, exp):
-        # GH20495
-        s = Series(vals + [np.nan])
-        result = s.map(mapping)
-
-        tm.assert_series_equal(result, Series(exp))
-
-    @pytest.mark.parametrize(
-        "dti,exp",
-        [
-            (
-                Series([1, 2], 
index=pd.DatetimeIndex([0, 31536000000])),
-                DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
-            ),
-            (
-                tm.makeTimeSeries(nper=30),
-                DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"),
-            ),
-        ],
+    s = Series(values, name="XX")
+
+    # keep tz
+    result = s.map(lambda x: x + pd.offsets.Day())
+    exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize(
+        "Asia/Tokyo"
    )
-    @pytest.mark.parametrize("aware", [True, False])
-    def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware):
-        # GH 25959
-        # Calling apply on a localized time series should not cause an error
-        if aware:
-            index = dti.tz_localize("UTC").index
-        else:
-            index = dti.index
-        result = Series(index).apply(lambda x: Series([1, 2]))
-        tm.assert_frame_equal(result, exp)
-
-    def test_apply_scaler_on_date_time_index_aware_series(self):
-        # GH 25959
-        # Calling apply on a localized time series should not cause an error
-        series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
-        result = Series(series.index).apply(lambda x: 1)
-        tm.assert_series_equal(result, Series(np.ones(30), dtype="int64"))
-
-    def test_map_float_to_string_precision(self):
-        # GH 13228
-        ser = Series(1 / 3)
-        result = ser.map(lambda val: str(val)).to_dict()
-        expected = {0: "0.3333333333333333"}
-        assert result == expected
+    exp = Series(exp_values, name="XX")
+    tm.assert_series_equal(result, exp)
+
+    # change dtype
+    # GH 14506 : Returned dtype changed from int32 to int64
+    result = s.map(lambda x: x.hour)
+    exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64)
+    tm.assert_series_equal(result, exp)
+
+    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
+        s.map(lambda x: x, na_action="ignore")
+
+    # not vectorized
+    def f(x):
+        if not isinstance(x, pd.Timestamp):
+            raise ValueError
+        return str(x.tz)
+
+    result = s.map(f)
+    exp = Series(["Asia/Tokyo"] * 25, name="XX")
+    tm.assert_series_equal(result, exp)

-    def test_map_with_invalid_na_action_raises(self):
-        # https://github.com/pandas-dev/pandas/issues/32815
-        s = Series([1, 2, 3])
-        msg = "na_action must either be 'ignore' or None"
-        with pytest.raises(ValueError, match=msg):
-            s.map(lambda x: x, na_action="____")
-
-    def test_apply_to_timedelta(self):
-        list_of_valid_strings = ["00:00:01", "00:00:02"]
-        a = pd.to_timedelta(list_of_valid_strings)
-        b = Series(list_of_valid_strings).apply(pd.to_timedelta)
-        # FIXME: dont leave commented-out
-        # Can't compare until apply on a Series gives the correct dtype
-        # assert_series_equal(a, b)
-
-        list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
-
-        a = pd.to_timedelta(list_of_strings)  # noqa
-        b = Series(list_of_strings).apply(pd.to_timedelta)  # noqa
-        # Can't compare until apply on a Series gives the correct dtype
-        # assert_series_equal(a, b)


+@pytest.mark.parametrize(
+    "vals,mapping,exp",
+    [
+        (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]),
+        (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3),
+        (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
+    ],
+)
+def test_map_missing_mixed(vals, mapping, exp):
+    # GH20495
+    s = Series(vals + [np.nan])
+    result = s.map(mapping)
+
+    tm.assert_series_equal(result, Series(exp))
+
+
+@pytest.mark.parametrize(
+    "dti,exp",
+    [
+        (
+            Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
+            DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
+        ),
+        (
+            tm.makeTimeSeries(nper=30),
+            DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("aware", [True, False]) 
+def test_apply_series_on_date_time_index_aware_series(dti, exp, aware):
+    # GH 25959
+    # Calling apply on a localized time series should not cause an error
+    if aware:
+        index = dti.tz_localize("UTC").index
+    else:
+        index = dti.index
+    result = Series(index).apply(lambda x: Series([1, 2]))
+    tm.assert_frame_equal(result, exp)
+
+
+def test_apply_scaler_on_date_time_index_aware_series():
+    # GH 25959
+    # Calling apply on a localized time series should not cause an error
+    series = tm.makeTimeSeries(nper=30).tz_localize("UTC")
+    result = Series(series.index).apply(lambda x: 1)
+    tm.assert_series_equal(result, Series(np.ones(30), dtype="int64"))
+
+
+def test_map_float_to_string_precision():
+    # GH 13228
+    ser = Series(1 / 3)
+    result = ser.map(lambda val: str(val)).to_dict()
+    expected = {0: "0.3333333333333333"}
+    assert result == expected
+
+
+def test_map_with_invalid_na_action_raises():
+    # https://github.com/pandas-dev/pandas/issues/32815
+    s = Series([1, 2, 3])
+    msg = "na_action must either be 'ignore' or None"
+    with pytest.raises(ValueError, match=msg):
+        s.map(lambda x: x, na_action="____")
+
+
+def test_apply_to_timedelta():
+    list_of_valid_strings = ["00:00:01", "00:00:02"]
+    a = pd.to_timedelta(list_of_valid_strings)
+    b = Series(list_of_valid_strings).apply(pd.to_timedelta)
+    # FIXME: don't leave commented-out
+    # Can't compare until apply on a Series gives the correct dtype
+    # assert_series_equal(a, b)
+
+    list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
+
+    a = pd.to_timedelta(list_of_strings)  # noqa
+    b = Series(list_of_strings).apply(pd.to_timedelta)  # noqa
+    # Can't compare until apply on a Series gives the correct dtype
+    # assert_series_equal(a, b)


 @pytest.mark.parametrize(
diff --git a/pandas/tests/apply/test_series_apply_relabeling.py b/pandas/tests/apply/test_series_apply_relabeling.py
index 0b8d2c4e1f26d..c0a285e6eb38c 100644
--- a/pandas/tests/apply/test_series_apply_relabeling.py
+++ b/pandas/tests/apply/test_series_apply_relabeling.py
@@ -2,32 +2,32 @@
 import pandas._testing as tm


-class TestNamedAggregation:
-    def test_relabel_no_duplicated_method(self):
-        # this is to test there is no duplicated method used in agg
-        df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
-
-        result = df["A"].agg(foo="sum")
-        expected = df["A"].agg({"foo": "sum"})
-        tm.assert_series_equal(result, expected)
-
-        result = df["B"].agg(foo="min", bar="max")
-        expected = df["B"].agg({"foo": "min", "bar": "max"})
-        tm.assert_series_equal(result, expected)
-
-        result = df["B"].agg(foo=sum, bar=min, cat="max")
-        expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"})
-        tm.assert_series_equal(result, expected)
-
-    def test_relabel_duplicated_method(self):
-        # this is to test with nested renaming, duplicated method can be used
-        # if they are assigned with different new names
-        df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
-
-        result = df["A"].agg(foo="sum", bar="sum")
-        expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
-        tm.assert_series_equal(result, expected)
-
-        result = df["B"].agg(foo=min, bar="min")
-        expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
-        tm.assert_series_equal(result, expected)
+def test_relabel_no_duplicated_method():
+    # this is to test there is no duplicated method used in agg
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+    result = df["A"].agg(foo="sum")
+    expected = df["A"].agg({"foo": "sum"})
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo="min", bar="max")
+    
expected = df["B"].agg({"foo": "min", "bar": "max"})
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo=sum, bar=min, cat="max")
+    expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"})
+    tm.assert_series_equal(result, expected)
+
+
+def test_relabel_duplicated_method():
+    # this is to test with nested renaming, duplicated method can be used
+    # if they are assigned with different new names
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+    result = df["A"].agg(foo="sum", bar="sum")
+    expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo=min, bar="min")
+    expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py
index 88e11f57a7835..1fc7f824c6daa 100644
--- a/pandas/tests/arrays/masked/test_arithmetic.py
+++ b/pandas/tests/arrays/masked/test_arithmetic.py
@@ -162,3 +162,16 @@ def test_error_len_mismatch(data, all_arithmetic_operators):
     s = pd.Series(data)
     with pytest.raises(ValueError, match="Lengths must match"):
         op(s, other)
+
+
+@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"])
+@pytest.mark.parametrize(
+    "values, dtype", [([1, 2, 3], "Int64"), ([True, False, True], "boolean")]
+)
+def test_unary_op_does_not_propagate_mask(op, values, dtype):
+    # https://github.com/pandas-dev/pandas/issues/39943
+    s = pd.Series(values, dtype=dtype)
+    result = getattr(s, op)()
+    expected = result.copy(deep=True)
+    s[0] = None
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 8be475d5a922a..bb1ff2abd9bc4 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -327,7 +327,7 @@ def test_array_multiindex_raises():
         ),
         # GH#26406 tz is preserved in Categorical[dt64tz]
         (
-            pd.Categorical(pd.date_range("2016-01-01", periods=2, tz="US/Pacific")),
+            pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
             np.array(
                 [
                     Timestamp("2016-01-01", tz="US/Pacific"),
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 2b689364c5002..248798408381e 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -765,7 +765,7 @@ def test_astype_datetime64_bad_dtype_raises(from_type, to_type):

 @pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64])
 def test_astype_object_preserves_datetime_na(from_type):
-    arr = np.array([from_type("NaT")])
+    arr = np.array([from_type("NaT", "ns")])
     result = astype_nansafe(arr, dtype=np.dtype("object"))
     assert isna(result)[0]
diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py
deleted file mode 100644
index 464b24e45abf4..0000000000000
--- a/pandas/tests/frame/indexing/test_categorical.py
+++ /dev/null
@@ -1,293 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas.core.dtypes.dtypes import CategoricalDtype
-
-import pandas as pd
-from pandas import (
-    Categorical,
-    DataFrame,
-    Index,
-    Series,
-)
-import pandas._testing as tm
-
-msg1 = "Cannot setitem on a Categorical with a new category, set the categories first"
-msg2 = "Cannot set a Categorical with another, without identical categories"
-
-
-class TestDataFrameIndexingCategorical:
-    def test_assignment(self):
-        # assignment
-        df = DataFrame(
-            {"value": 
np.array(np.random.randint(0, 10000, 100), dtype="int32")} - ) - labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) - - df = df.sort_values(by=["value"], ascending=True) - s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - d = s.values - df["D"] = d - str(df) - - result = df.dtypes - expected = Series( - [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], - index=["value", "D"], - ) - tm.assert_series_equal(result, expected) - - df["E"] = s - str(df) - - result = df.dtypes - expected = Series( - [ - np.dtype("int32"), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False), - ], - index=["value", "D", "E"], - ) - tm.assert_series_equal(result, expected) - - result1 = df["D"] - result2 = df["E"] - tm.assert_categorical_equal(result1._mgr._block.values, d) - - # sorting - s.name = "E" - tm.assert_series_equal(result2.sort_index(), s.sort_index()) - - cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) - df = DataFrame(Series(cat)) - - @pytest.fixture - def orig(self): - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) - return orig - - @pytest.fixture - def exp_single_row(self): - # The expected values if we change a single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - return exp_single_row - - @pytest.fixture - def exp_multi_row(self): - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - return exp_multi_row - - @pytest.fixture - def exp_parts_cats_col(self): - # changed part of the cats column - cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) - return exp_parts_cats_col - - @pytest.fixture - def exp_single_cats_value(self): - # changed single value in cats col - cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame( - {"cats": cats4, "values": values4}, index=idx4 - ) - return exp_single_cats_value - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - - key = slice(2, 4) - if indexer is tm.loc: - key = slice("j", "k") - - indexer(df)[key, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - df = orig.copy() - with pytest.raises(ValueError, match=msg1): - indexer(df)[key, :] = [["c", 2], ["c", 2]] - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) - def test_loc_iloc_at_iat_setitem_single_value_in_categories( - self, orig, exp_single_cats_value, indexer - ): - # - 
assign a single value -> exp_single_cats_value - df = orig.copy() - - key = (2, 0) - if indexer in [tm.loc, tm.at]: - key = (df.index[2], df.columns[0]) - - # "b" is among the categories for df["cat"}] - indexer(df)[key] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # "c" is not among the categories for df["cat"] - with pytest.raises(ValueError, match=msg1): - indexer(df)[key] = "c" - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_mask_single_value_in_categories( - self, orig, exp_single_cats_value, indexer - ): - # mask with single True - df = orig.copy() - - mask = df.index == "j" - key = 0 - if indexer is tm.loc: - key = df.columns[key] - - indexer(df)[mask, key] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_full_row_non_categorical_rhs( - self, orig, exp_single_row, indexer - ): - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - - key = 2 - if indexer is tm.loc: - key = df.index[2] - - # not categorical dtype, but "b" _is_ among the categories for df["cat"] - indexer(df)[key, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # "c" is not among the categories for df["cat"] - with pytest.raises(ValueError, match=msg1): - indexer(df)[key, :] = ["c", 2] - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_partial_col_categorical_rhs( - self, orig, exp_parts_cats_col, indexer - ): - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - - key = (slice(2, 4), 0) - if indexer is tm.loc: - key = (slice("j", "k"), df.columns[0]) - - # same categories as we currently have in df["cats"] - compat = Categorical(["b", "b"], categories=["a", "b"]) - indexer(df)[key] = compat - tm.assert_frame_equal(df, exp_parts_cats_col) - - # categories do not match df["cat"]'s, but "b" is among them - semi_compat = Categorical(list("bb"), categories=list("abc")) - with pytest.raises(ValueError, match=msg2): - # different categories but holdable values - # -> not sure if this should fail or pass - indexer(df)[key] = semi_compat - - # categories do not match df["cat"]'s, and "c" is not among them - incompat = Categorical(list("cc"), categories=list("abc")) - with pytest.raises(ValueError, match=msg2): - # different values - indexer(df)[key] = incompat - - @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_loc_iloc_setitem_non_categorical_rhs( - self, orig, exp_parts_cats_col, indexer - ): - # assign a part of a column with dtype != categorical -> exp_parts_cats_col - df = orig.copy() - - key = (slice(2, 4), 0) - if indexer is tm.loc: - key = (slice("j", "k"), df.columns[0]) - - # "b" is among the categories for df["cat"] - indexer(df)[key] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - # "c" not part of the categories - with pytest.raises(ValueError, match=msg1): - indexer(df)[key] = ["c", "c"] - - def test_setitem_mask_categorical(self, exp_multi_row): - # fancy indexing - - catsf = Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] - ) - idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) - valuesf = [1, 1, 3, 3, 1, 1, 1] - df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) - - exp_fancy = exp_multi_row.copy() - return_value = exp_fancy["cats"].cat.set_categories( - ["a", "b", "c"], inplace=True - ) - assert return_value is None - - mask = df["cats"] == "c" - df[mask] = ["b", 2] 
- # category c is kept in .categories - tm.assert_frame_equal(df, exp_fancy) - - def test_loc_setitem_categorical_values_partial_column_slice(self): - # Assigning a Category to parts of a int/... column uses the values of - # the Categorical - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) - df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) - - def test_loc_setitem_single_row_categorical(self): - # GH 25495 - df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) - categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) - df.loc[:, "Alpha"] = categories - - result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha") - tm.assert_series_equal(result, expected) - - def test_loc_indexing_preserves_index_category_dtype(self): - # GH 15166 - df = DataFrame( - data=np.arange(2, 22, 2), - index=pd.MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - codes=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"], - ), - ) - - expected = pd.CategoricalIndex( - ["a", "b"], - categories=["a", "b"], - ordered=False, - name="Index1", - dtype="category", - ) - - result = df.index.levels[0] - tm.assert_index_equal(result, expected) - - result = df.loc[["a"]].index.levels[0] - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 7c48c412fd694..290ba67c7d05b 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -81,6 +83,67 @@ def test_getitem_list_missing_key(self): with pytest.raises(KeyError, match=r"\['y'\] not in index"): df[["x", "y", "z"]] + def test_getitem_list_duplicates(self): + # GH#1943 + df = DataFrame(np.random.randn(4, 4), columns=list("AABC")) + df.columns.name = "foo" + + result = df[["B", "C"]] + assert result.columns.name == "foo" + + expected = df.iloc[:, 2:] + tm.assert_frame_equal(result, expected) + + def test_getitem_dupe_cols(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + with pytest.raises(KeyError, match=re.escape(msg)): + df[["baf"]] + + @pytest.mark.parametrize( + "idx_type", + [ + list, + iter, + Index, + set, + lambda l: dict(zip(l, range(len(l)))), + lambda l: dict(zip(l, range(len(l)))).keys(), + ], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"], + ) + @pytest.mark.parametrize("levels", [1, 2]) + def test_getitem_listlike(self, idx_type, levels, float_frame): + # GH#21294 + + if levels == 1: + frame, missing = float_frame, "food" + else: + # MultiIndex columns + frame = DataFrame( + np.random.randn(8, 3), + columns=Index( + [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")], + name=("sth", "sth2"), + ), + ) + missing = ("good", "food") + + keys = [frame.columns[1], frame.columns[0]] + idx = idx_type(keys) + idx_check = list(idx_type(keys)) + + result = frame[idx] + + expected = frame.loc[:, idx_check] + expected.columns.names = frame.columns.names + + tm.assert_frame_equal(result, expected) + + idx = idx_type(keys + [missing]) + with pytest.raises(KeyError, match="not in index"): + frame[idx] + class TestGetitemCallable: def test_getitem_callable(self, float_frame): @@ -258,6 +321,13 @@ def 
test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols): result.dtypes str(result) + def test_getitem_empty_frame_with_boolean(self): + # Test for issue GH#11859 + + df = DataFrame() + df2 = df[df > 0] + tm.assert_frame_equal(df, df2) + class TestGetitemSlice: def test_getitem_slice_float64(self, frame_or_series): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9b6bdbf3a9d60..366ccf2fc9219 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1,7 +1,5 @@ from datetime import ( - date, datetime, - time, timedelta, ) import re @@ -15,6 +13,7 @@ import pandas as pd from pandas import ( + Categorical, DataFrame, DatetimeIndex, Index, @@ -27,9 +26,6 @@ ) import pandas._testing as tm import pandas.core.common as com -from pandas.core.indexing import IndexingError - -from pandas.tseries.offsets import BDay # We pass through a TypeError raised by numpy _slice_msg = "slice indices must be integers or None or have an __index__ method" @@ -53,6 +49,8 @@ def test_getitem(self, float_frame): with pytest.raises(KeyError, match="random"): float_frame["random"] + def test_getitem2(self, float_frame): + df = float_frame.copy() df["$10"] = np.random.randn(len(df)) @@ -65,56 +63,6 @@ def test_getitem(self, float_frame): res = df["@awesome_domain"] tm.assert_numpy_array_equal(ad, res.values) - def test_getitem_dupe_cols(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" - with pytest.raises(KeyError, match=re.escape(msg)): - df[["baf"]] - - @pytest.mark.parametrize( - "idx_type", - [ - list, - iter, - Index, - set, - lambda l: dict(zip(l, range(len(l)))), - lambda l: dict(zip(l, range(len(l)))).keys(), - ], - ids=["list", "iter", "Index", "set", "dict", "dict_keys"], - ) - @pytest.mark.parametrize("levels", [1, 2]) - def test_getitem_listlike(self, idx_type, levels, float_frame): - # GH 21294 - - if levels == 1: - frame, missing = float_frame, "food" - else: - # MultiIndex columns - frame = DataFrame( - np.random.randn(8, 3), - columns=Index( - [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")], - name=("sth", "sth2"), - ), - ) - missing = ("good", "food") - - keys = [frame.columns[1], frame.columns[0]] - idx = idx_type(keys) - idx_check = list(idx_type(keys)) - - result = frame[idx] - - expected = frame.loc[:, idx_check] - expected.columns.names = frame.columns.names - - tm.assert_frame_equal(result, expected) - - idx = idx_type(keys + [missing]) - with pytest.raises(KeyError, match="not in index"): - frame[idx] - def test_setitem_list(self, float_frame): float_frame["E"] = "foo" @@ -135,6 +83,8 @@ def test_setitem_list(self, float_frame): with pytest.raises(ValueError, match=msg): data["A"] = newcolumndata + def test_setitem_list2(self): + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) df.loc[1, ["tt1", "tt2"]] = [1, 2] @@ -235,17 +185,6 @@ def test_setitem_multi_index(self): df[("joe", "last")] = df[("jolie", "first")].loc[i, j] tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) - def test_setitem_other_callable(self): - # GH 13299 - def inc(x): - return x + 1 - - df = DataFrame([[-1, 1], [1, -1]]) - df[df > 0] = inc - - expected = DataFrame([[-1, inc], [inc, -1]]) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "cols, values, expected", [ @@ -490,21 +429,6 @@ def test_setitem(self, float_frame): df.loc[0] = np.nan 
tm.assert_frame_equal(df, expected) - def test_setitem_tuple(self, float_frame): - float_frame["A", "B"] = float_frame["A"] - assert ("A", "B") in float_frame.columns - - result = float_frame["A", "B"] - expected = float_frame["A"] - tm.assert_series_equal(result, expected, check_names=False) - - def test_setitem_always_copy(self, float_frame): - s = float_frame["A"].copy() - float_frame["E"] = s - - float_frame["E"][5:10] = np.nan - assert notna(s[5:10]).all() - def test_setitem_boolean(self, float_frame): df = float_frame.copy() values = float_frame.values @@ -612,15 +536,6 @@ def test_setitem_boolean_column(self, float_frame): tm.assert_frame_equal(float_frame, expected) - def test_frame_setitem_timestamp(self): - # GH#2155 - columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay()) - data = DataFrame(columns=columns, index=range(10)) - t = datetime(2012, 11, 1) - ts = Timestamp(t) - data[ts] = np.nan # works, mostly a smoke-test - assert np.isnan(data[ts]).all() - def test_setitem_corner(self, float_frame): # corner case df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) @@ -698,22 +613,6 @@ def test_setitem_ambig(self): assert len(dm.columns) == 3 assert dm[2].dtype == np.object_ - def test_setitem_clear_caches(self): - # see gh-304 - df = DataFrame( - {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] - ) - df.insert(2, "z", np.nan) - - # cache it - foo = df["z"] - df.loc[df.index[2:], "z"] = 42 - - expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") - - assert df["z"] is not foo - tm.assert_series_equal(df["z"], expected) - def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] @@ -726,7 +625,7 @@ def test_setitem_None(self, float_frame): tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) repr(float_frame) - def test_setitem_empty(self): + def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 df = DataFrame( {"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]} @@ -736,39 +635,6 @@ def test_setitem_empty(self): result.loc[result.b.isna(), "a"] = result.a tm.assert_frame_equal(result, df) - @pytest.mark.parametrize("dtype", ["float", "int64"]) - @pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}]) - def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): - # see gh-10126 - kwargs["dtype"] = dtype - df = DataFrame(**kwargs) - - df2 = df.copy() - df[df > df2] = 47 - tm.assert_frame_equal(df, df2) - - def test_setitem_with_empty_listlike(self): - # GH #17101 - index = Index([], name="idx") - result = DataFrame(columns=["A"], index=index) - result["A"] = [] - expected = DataFrame(columns=["A"], index=index) - tm.assert_index_equal(result.index, expected.index) - - def test_setitem_scalars_no_index(self): - # GH16823 / 17894 - df = DataFrame() - df["foo"] = 1 - expected = DataFrame(columns=["foo"]).astype(np.int64) - tm.assert_frame_equal(df, expected) - - def test_getitem_empty_frame_with_boolean(self): - # Test for issue #11859 - - df = DataFrame() - df2 = df[df > 0] - tm.assert_frame_equal(df, df2) - def test_getitem_fancy_slice_integers_step(self): df = DataFrame(np.random.randn(10, 5)) @@ -926,14 +792,6 @@ def test_getitem_fancy_ints(self, float_frame): expected = float_frame.loc[:, float_frame.columns[[2, 0, 1]]] tm.assert_frame_equal(result, expected) - def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): - with pytest.raises(IndexingError, match="Too many indexers"): 
- float_frame.iloc[:, :, :] - - with pytest.raises(IndexError, match="too many indices for array"): - # GH#32257 we let numpy do validation, get their exception - float_frame.iloc[:, :, :] = 1 - def test_getitem_setitem_boolean_misaligned(self, float_frame): # boolean index misaligned labels mask = float_frame["A"][::-1] > 1 @@ -1244,17 +1102,6 @@ def test_getitem_setitem_ix_bool_keyerror(self): with pytest.raises(KeyError, match=msg): df.loc[True] = 0 - def test_getitem_list_duplicates(self): - # #1943 - df = DataFrame(np.random.randn(4, 4), columns=list("AABC")) - df.columns.name = "foo" - - result = df[["B", "C"]] - assert result.columns.name == "foo" - - expected = df.iloc[:, 2:] - tm.assert_frame_equal(result, expected) - # TODO: rename? remove? def test_single_element_ix_dont_upcast(self, float_frame): float_frame["E"] = 1 @@ -1384,39 +1231,6 @@ def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) assert x[0].dtype == np.dtype("M8[ns]") - def test_iloc_getitem_float_duplicates(self): - df = DataFrame( - np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") - ) - expect = df.iloc[1:] - tm.assert_frame_equal(df.loc[0.2], expect) - - expect = df.iloc[1:, 0] - tm.assert_series_equal(df.loc[0.2, "a"], expect) - - df.index = [1, 0.2, 0.2] - expect = df.iloc[1:] - tm.assert_frame_equal(df.loc[0.2], expect) - - expect = df.iloc[1:, 0] - tm.assert_series_equal(df.loc[0.2, "a"], expect) - - df = DataFrame( - np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list("abc") - ) - expect = df.iloc[1:-1] - tm.assert_frame_equal(df.loc[0.2], expect) - - expect = df.iloc[1:-1, 0] - tm.assert_series_equal(df.loc[0.2, "a"], expect) - - df.index = [0.1, 0.2, 2, 0.2] - expect = df.iloc[[1, -1]] - tm.assert_frame_equal(df.loc[0.2], expect) - - expect = df.iloc[[1, -1], 0] - tm.assert_series_equal(df.loc[0.2, "a"], expect) - def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. 
@@ -1430,18 +1244,6 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - def test_loc_setitem_datetime_coercion(self): - # gh-1048 - df = DataFrame({"c": [Timestamp("2010-10-01")] * 3}) - df.loc[0:1, "c"] = np.datetime64("2008-08-08") - assert Timestamp("2008-08-08") == df.loc[0, "c"] - assert Timestamp("2008-08-08") == df.loc[1, "c"] - df.loc[2, "c"] = date(2005, 5, 5) - with tm.assert_produces_warning(FutureWarning): - # Comparing Timestamp to date obj is deprecated - assert Timestamp("2005-05-05") == df.loc[2, "c"] - assert Timestamp("2005-05-05").date() == df.loc[2, "c"] - def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1464,48 +1266,6 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("idxer", ["var", ["var"]]) - def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): - # GH 11365 - tz = tz_naive_fixture - idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) - expected = DataFrame(1.2, index=idx, columns=["var"]) - result = DataFrame(index=idx, columns=["var"]) - result.loc[:, idxer] = expected - tm.assert_frame_equal(result, expected) - - def test_loc_setitem_time_key(self): - index = date_range("2012-01-01", "2012-01-05", freq="30min") - df = DataFrame(np.random.randn(len(index), 5), index=index) - akey = time(12, 0, 0) - bkey = slice(time(13, 0, 0), time(14, 0, 0)) - ainds = [24, 72, 120, 168] - binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172] - - result = df.copy() - result.loc[akey] = 0 - result = result.loc[akey] - expected = df.loc[akey].copy() - expected.loc[:] = 0 - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.loc[akey] = 0 - result.loc[akey] = df.iloc[ainds] - tm.assert_frame_equal(result, df) - - result = df.copy() - result.loc[bkey] = 0 - result = result.loc[bkey] - expected = df.loc[bkey].copy() - expected.loc[:] = 0 - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.loc[bkey] = 0 - result.loc[bkey] = df.iloc[binds] - tm.assert_frame_equal(result, df) - def test_loc_getitem_index_namedtuple(self): from collections import namedtuple @@ -1533,29 +1293,6 @@ def test_loc_getitem_index_single_double_tuples(self, tpl): expected = DataFrame(index=idx) tm.assert_frame_equal(result, expected) - def test_setitem_boolean_indexing(self): - idx = list(range(3)) - cols = ["A", "B", "C"] - df1 = DataFrame( - index=idx, - columns=cols, - data=np.array( - [[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float - ), - ) - df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) - - expected = DataFrame( - index=idx, - columns=cols, - data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float), - ) - - df1[df1 > 2.0 * df2] = -1 - tm.assert_frame_equal(df1, expected) - with pytest.raises(ValueError, match="Item wrong length"): - df1[df1.index[:-1] > 2] = -1 - def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1664,21 +1401,6 @@ def test_getitem_interval_index_partial_indexing(self): res = df.loc[:, 0.5] tm.assert_series_equal(res, expected) - @pytest.mark.parametrize("indexer", ["A", ["A"], ("A", slice(None))]) - def test_setitem_unsorted_multiindex_columns(self, indexer): - # GH#38601 - mi = MultiIndex.from_tuples([("A", 4), ("B", "3"), ("A", "2")]) - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi) 
- obj = df.copy() - obj.loc[:, indexer] = np.zeros((2, 2), dtype=int) - expected = DataFrame([[0, 2, 0], [0, 5, 0]], columns=mi) - tm.assert_frame_equal(obj, expected) - - df = df.sort_index(1) - df.loc[:, indexer] = np.zeros((2, 2), dtype=int) - expected = expected.sort_index(1) - tm.assert_frame_equal(df, expected) - class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -1687,9 +1409,11 @@ def test_setitem(self, uint64_frame): idx = df["A"].rename("foo") # setitem + assert "C" not in df.columns df["C"] = idx tm.assert_series_equal(df["C"], Series(idx, name="C")) + assert "D" not in df.columns df["D"] = "foo" df["D"] = idx tm.assert_series_equal(df["D"], Series(idx, name="D")) @@ -1748,3 +1472,174 @@ def test_object_casting_indexing_wraps_datetimelike(): assert blk.dtype == "m8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timedelta) + + +msg1 = "Cannot setitem on a Categorical with a new category, set the categories first" +msg2 = "Cannot set a Categorical with another, without identical categories" + + +class TestLocILocDataFrameCategorical: + @pytest.fixture + def orig(self): + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + return orig + + @pytest.fixture + def exp_single_row(self): + # The expected values if we change a single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + return exp_single_row + + @pytest.fixture + def exp_multi_row(self): + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + return exp_multi_row + + @pytest.fixture + def exp_parts_cats_col(self): + # changed part of the cats column + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values3 = [1, 1, 1, 1, 1, 1, 1] + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + return exp_parts_cats_col + + @pytest.fixture + def exp_single_cats_value(self): + # changed single value in cats col + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values4 = [1, 1, 1, 1, 1, 1, 1] + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) + return exp_single_cats_value + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + + key = slice(2, 4) + if indexer is tm.loc: + key = slice("j", "k") + + indexer(df)[key, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + df = orig.copy() + with pytest.raises(ValueError, match=msg1): + indexer(df)[key, :] = [["c", 2], ["c", 2]] + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) + def test_loc_iloc_at_iat_setitem_single_value_in_categories( + self, orig, 
exp_single_cats_value, indexer
+    ):
+        # - assign a single value -> exp_single_cats_value
+        df = orig.copy()
+
+        key = (2, 0)
+        if indexer in [tm.loc, tm.at]:
+            key = (df.index[2], df.columns[0])
+
+        # "b" is among the categories for df["cat"]
+        indexer(df)[key] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+        # "c" is not among the categories for df["cat"]
+        with pytest.raises(ValueError, match=msg1):
+            indexer(df)[key] = "c"
+
+    @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
+    def test_loc_iloc_setitem_mask_single_value_in_categories(
+        self, orig, exp_single_cats_value, indexer
+    ):
+        # mask with single True
+        df = orig.copy()
+
+        mask = df.index == "j"
+        key = 0
+        if indexer is tm.loc:
+            key = df.columns[key]
+
+        indexer(df)[mask, key] = "b"
+        tm.assert_frame_equal(df, exp_single_cats_value)
+
+    @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
+    def test_loc_iloc_setitem_full_row_non_categorical_rhs(
+        self, orig, exp_single_row, indexer
+    ):
+        # - assign a complete row (mixed values) -> exp_single_row
+        df = orig.copy()
+
+        key = 2
+        if indexer is tm.loc:
+            key = df.index[2]
+
+        # not categorical dtype, but "b" _is_ among the categories for df["cat"]
+        indexer(df)[key, :] = ["b", 2]
+        tm.assert_frame_equal(df, exp_single_row)
+
+        # "c" is not among the categories for df["cat"]
+        with pytest.raises(ValueError, match=msg1):
+            indexer(df)[key, :] = ["c", 2]
+
+    @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
+    def test_loc_iloc_setitem_partial_col_categorical_rhs(
+        self, orig, exp_parts_cats_col, indexer
+    ):
+        # assign a part of a column with dtype == categorical ->
+        # exp_parts_cats_col
+        df = orig.copy()
+
+        key = (slice(2, 4), 0)
+        if indexer is tm.loc:
+            key = (slice("j", "k"), df.columns[0])
+
+        # same categories as we currently have in df["cats"]
+        compat = Categorical(["b", "b"], categories=["a", "b"])
+        indexer(df)[key] = compat
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        # categories do not match df["cat"]'s, but "b" is among them
+        semi_compat = Categorical(list("bb"), categories=list("abc"))
+        with pytest.raises(ValueError, match=msg2):
+            # different categories but holdable values
+            # -> not sure if this should fail or pass
+            indexer(df)[key] = semi_compat
+
+        # categories do not match df["cat"]'s, and "c" is not among them
+        incompat = Categorical(list("cc"), categories=list("abc"))
+        with pytest.raises(ValueError, match=msg2):
+            # different values
+            indexer(df)[key] = incompat
+
+    @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
+    def test_loc_iloc_setitem_non_categorical_rhs(
+        self, orig, exp_parts_cats_col, indexer
+    ):
+        # assign a part of a column with dtype != categorical -> exp_parts_cats_col
+        df = orig.copy()
+
+        key = (slice(2, 4), 0)
+        if indexer is tm.loc:
+            key = (slice("j", "k"), df.columns[0])
+
+        # "b" is among the categories for df["cat"]
+        indexer(df)[key] = ["b", "b"]
+        tm.assert_frame_equal(df, exp_parts_cats_col)
+
+        # "c" not part of the categories
+        with pytest.raises(ValueError, match=msg1):
+            indexer(df)[key] = ["c", "c"]
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 6a9d4e6b5ab3c..7e3de9a5ae67c 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import numpy as np
 import pytest

@@ -8,6 +10,7 @@
     is_object_dtype,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     DatetimeTZDtype,
     IntervalDtype,
     PeriodDtype,
@@ -33,6 +36,8 @@ 
import pandas._testing as tm from pandas.core.arrays import SparseArray +from pandas.tseries.offsets import BDay + class TestDataFrameSetItem: @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) @@ -454,6 +459,14 @@ def test_setitem_categorical(self): ) tm.assert_frame_equal(df, expected) + def test_setitem_with_empty_listlike(self): + # GH#17101 + index = Index([], name="idx") + result = DataFrame(columns=["A"], index=index) + result["A"] = [] + expected = DataFrame(columns=["A"], index=index) + tm.assert_index_equal(result.index, expected.index) + class TestSetitemTZAwareValues: @pytest.fixture @@ -534,6 +547,79 @@ def test_setitem_empty_df_duplicate_columns(self): ) tm.assert_frame_equal(df, expected) + def test_setitem_with_expansion_categorical_dtype(self): + # assignment + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) + + df = df.sort_values(by=["value"], ascending=True) + ser = cut(df.value, range(0, 10500, 500), right=False, labels=labels) + cat = ser.values + + # setting with a Categorical + df["D"] = cat + str(df) + + result = df.dtypes + expected = Series( + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) + tm.assert_series_equal(result, expected) + + # setting with a Series + df["E"] = ser + str(df) + + result = df.dtypes + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) + tm.assert_series_equal(result, expected) + + result1 = df["D"] + result2 = df["E"] + tm.assert_categorical_equal(result1._mgr._block.values, cat) + + # sorting + ser.name = "E" + tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + + def test_setitem_scalars_no_index(self): + # GH#16823 / GH#17894 + df = DataFrame() + df["foo"] = 1 + expected = DataFrame(columns=["foo"]).astype(np.int64) + tm.assert_frame_equal(df, expected) + + def test_setitem_newcol_tuple_key(self, float_frame): + assert ( + "A", + "B", + ) not in float_frame.columns + float_frame["A", "B"] = float_frame["A"] + assert ("A", "B") in float_frame.columns + + result = float_frame["A", "B"] + expected = float_frame["A"] + tm.assert_series_equal(result, expected, check_names=False) + + def test_frame_setitem_newcol_timestamp(self): + # GH#2155 + columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay()) + data = DataFrame(columns=columns, index=range(10)) + t = datetime(2012, 11, 1) + ts = Timestamp(t) + data[ts] = np.nan # works, mostly a smoke-test + assert np.isnan(data[ts]).all() + class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): @@ -555,6 +641,17 @@ def test_setitem_callable(self): exp = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) + def test_setitem_other_callable(self): + # GH#13299 + def inc(x): + return x + 1 + + df = DataFrame([[-1, 1], [1, -1]]) + df[df > 0] = inc + + expected = DataFrame([[-1, inc], [inc, -1]]) + tm.assert_frame_equal(df, expected) + class TestDataFrameSetItemBooleanMask: @pytest.mark.parametrize( @@ -584,3 +681,89 @@ def test_setitem_boolean_mask_aligning(self, indexer): mask = df["a"] >= 3 indexer(df)[mask] = indexer(df)[mask].sort_values("a") tm.assert_frame_equal(df, expected) + + def test_setitem_mask_categorical(self): + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + # 
changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) + idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) + valuesf = [1, 1, 3, 3, 1, 1, 1] + df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + return_value = exp_fancy["cats"].cat.set_categories( + ["a", "b", "c"], inplace=True + ) + assert return_value is None + + mask = df["cats"] == "c" + df[mask] = ["b", 2] + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) + + @pytest.mark.parametrize("dtype", ["float", "int64"]) + @pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}]) + def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): + # see GH#10126 + kwargs["dtype"] = dtype + df = DataFrame(**kwargs) + + df2 = df.copy() + df[df > df2] = 47 + tm.assert_frame_equal(df, df2) + + def test_setitem_boolean_indexing(self): + idx = list(range(3)) + cols = ["A", "B", "C"] + df1 = DataFrame( + index=idx, + columns=cols, + data=np.array( + [[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float + ), + ) + df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) + + expected = DataFrame( + index=idx, + columns=cols, + data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float), + ) + + df1[df1 > 2.0 * df2] = -1 + tm.assert_frame_equal(df1, expected) + with pytest.raises(ValueError, match="Item wrong length"): + df1[df1.index[:-1] > 2] = -1 + + +class TestDataFrameSetitemCopyViewSemantics: + def test_setitem_always_copy(self, float_frame): + assert "E" not in float_frame.columns + s = float_frame["A"].copy() + float_frame["E"] = s + + float_frame["E"][5:10] = np.nan + assert notna(s[5:10]).all() + + def test_setitem_clear_caches(self): + # see GH#304 + df = DataFrame( + {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] + ) + df.insert(2, "z", np.nan) + + # cache it + foo = df["z"] + df.loc[df.index[2:], "z"] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") + + assert df["z"] is not foo + tm.assert_series_equal(df["z"], expected) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index dc9a1565aad1e..0f51c4aef79db 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -161,7 +161,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) - @td.skip_array_manager_not_yet_implemented + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_drop_multiindex_not_lexsorted(self): # GH#11640 diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index be80dd49ff1fb..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -1,14 +1,9 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing -pytestmark = td.skip_array_manager_not_yet_implemented - def test_error(): df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 
90456ad949f59..1c7f7e3ff674a 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -15,9 +13,6 @@ ) import pandas._testing as tm -# TODO(ArrayManager) concat with reindexing -pytestmark = td.skip_array_manager_not_yet_implemented - @pytest.fixture def frame_with_period_index(): @@ -240,8 +235,9 @@ def test_join(self, multiindex_dataframe_random_data): b = frame.loc[frame.index[2:], ["B", "C"]] joined = a.join(b, how="outer").reindex(frame.index) - expected = frame.copy() - expected.values[np.isnan(joined.values)] = np.nan + expected = frame.copy().values + expected[np.isnan(joined.values)] = np.nan + expected = DataFrame(expected, index=frame.index, columns=frame.columns) assert not np.isnan(joined.values).all() diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 59e0605cc5a91..408113e9bc417 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -137,3 +137,12 @@ def test_update_datetime_tz(self): result.update(result) expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) + + def test_update_with_different_dtype(self): + # GH#3217 + df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) + df["c"] = np.nan + df["c"].update(Series(["foo"], index=[0])) + + expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 92b7aefa6dd8c..f2f9cfee178d9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe): # DataFrame grouped = tsframe.groupby(tsframe["A"] * np.nan) exp_df = DataFrame( - columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64) + columns=tsframe.columns, + dtype=float, + index=Index([], name="A", dtype=np.float64), ) - tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) - tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) + tm.assert_frame_equal(grouped.sum(), exp_df) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df) def test_agg_grouping_is_list_tuple(ts): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4dce7e8553be4..6731790c89384 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import ( + Categorical, DataFrame, Grouper, Index, @@ -18,6 +19,7 @@ Timestamp, date_range, read_csv, + to_datetime, ) import pandas._testing as tm from pandas.core.base import SpecificationError @@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error(): ) -def test_empty_dataframe_groupby(): - # GH8093 - df = DataFrame(columns=["A", "B", "C"]) - - result = df.groupby("A").sum() - expected = DataFrame(columns=["B", "C"], dtype=np.float64) - expected.index.name = "A" - - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) +@pytest.mark.parametrize( + "values", + [ + 
[True], + [0], + [0.0], + ["a"], + [Categorical([0])], + [to_datetime(0)], + [date_range(0, 1, 1, tz="US/Eastern")], + [pd.array([0], dtype="Int64")], + ], +) +@pytest.mark.parametrize("method", ["attr", "agg", "apply"]) +@pytest.mark.parametrize( + "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] +) +def test_empty_groupby(columns, keys, values, method, op): + # GH8093 & GH26411 + + override_dtype = None + if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": + # sum/product of bools is an integer + override_dtype = "int64" + + df = DataFrame([3 * values], columns=list("ABC")) + df = df.iloc[:0] + + gb = df.groupby(keys)[columns] + if method == "attr": + result = getattr(gb, op)() + else: + result = getattr(gb, method)(op) + + expected = df.set_index(keys)[columns] + if override_dtype is not None: + expected = expected.astype(override_dtype) + if len(keys) == 1: + expected.index.name = keys[0] + tm.assert_equal(result, expected) def test_tuple_as_grouping(): diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/test_sample.py index 13147ca704b56..4b8b0173789ae 100644 --- a/pandas/tests/groupby/test_sample.py +++ b/pandas/tests/groupby/test_sample.py @@ -116,14 +116,19 @@ def test_groupby_sample_without_n_or_frac(): tm.assert_series_equal(result, expected) -def test_groupby_sample_with_weights(): +@pytest.mark.parametrize( + "index, expected_index", + [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])], +) +def test_groupby_sample_with_weights(index, expected_index): + # GH 39927 - tests for integer index needed values = [1] * 2 + [2] * 2 - df = DataFrame({"a": values, "b": values}, index=Index(["w", "x", "y", "z"])) + df = DataFrame({"a": values, "b": values}, index=Index(index)) result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) - expected = DataFrame({"a": values, "b": values}, index=Index(["w", "w", "y", "y"])) + expected = DataFrame({"a": values, "b": values}, index=Index(expected_index)) tm.assert_frame_equal(result, expected) result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) - expected = Series(values, name="b", index=Index(["w", "w", "y", "y"])) + expected = Series(values, name="b", index=Index(expected_index)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 4a66073d4f7a5..49181f0fdee7e 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -457,3 +457,19 @@ def test_cache_updating2(self): tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) + + def test_iloc_setitem_chained_assignment(self): + # GH#3970 + with option_context("chained_assignment", None): + df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) + df["cc"] = 0.0 + + ck = [True] * len(df) + + df["bb"].iloc[0] = 0.13 + + # TODO: unused + df_tmp = df.iloc[ck] # noqa + + df["bb"].iloc[0] = 0.15 + assert df["bb"].iloc[0] == 0.15 diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 28a1098c10d9f..a177c6e18f4cc 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -1,6 +1,3 @@ -import numpy as np -import pytest - import pandas as pd from pandas import ( DataFrame, @@ -55,28 +52,6 @@ def test_indexing_fast_xs(self): expected = df.iloc[4:] 
tm.assert_frame_equal(result, expected) - def test_setitem_with_expansion(self): - # indexing - setting an element - df = DataFrame( - data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), - columns=["time"], - ) - df["new_col"] = ["new", "old"] - df.time = df.set_index("time").index.tz_localize("UTC") - v = df[df.new_col == "new"].set_index("time").index.tz_convert("US/Pacific") - - # trying to set a single element on a part of a different timezone - # this converts to object - df2 = df.copy() - df2.loc[df2.new_col == "new", "time"] = v - - expected = Series([v[0], df.loc[1, "time"]], name="time") - tm.assert_series_equal(df2.time, expected) - - v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") - df.loc[df.new_col == "new", "time"] = v - tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) - def test_consistency_with_tz_aware_scalar(self): # xef gh-12938 # various ways of indexing the same tz-aware scalar @@ -163,48 +138,6 @@ def test_indexing_with_datetimeindex_tz(self): expected = Series([0, 5], index=index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("to_period", [True, False]) - def test_loc_getitem_listlike_of_datetimelike_keys(self, to_period): - # GH 11497 - - idx = date_range("2011-01-01", "2011-01-02", freq="D", name="idx") - if to_period: - idx = idx.to_period("D") - ser = Series([0.1, 0.2], index=idx, name="s") - - keys = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] - if to_period: - keys = [x.to_period("D") for x in keys] - result = ser.loc[keys] - exp = Series([0.1, 0.2], index=idx, name="s") - if not to_period: - exp.index = exp.index._with_freq(None) - tm.assert_series_equal(result, exp, check_index_type=True) - - keys = [ - Timestamp("2011-01-02"), - Timestamp("2011-01-02"), - Timestamp("2011-01-01"), - ] - if to_period: - keys = [x.to_period("D") for x in keys] - exp = Series( - [0.2, 0.2, 0.1], index=Index(keys, name="idx", dtype=idx.dtype), name="s" - ) - result = ser.loc[keys] - tm.assert_series_equal(result, exp, check_index_type=True) - - keys = [ - Timestamp("2011-01-03"), - Timestamp("2011-01-02"), - Timestamp("2011-01-03"), - ] - if to_period: - keys = [x.to_period("D") for x in keys] - - with pytest.raises(KeyError, match="with any missing labels"): - ser.loc[keys] - def test_nanosecond_getitem_setitem_with_tz(self): # GH 11679 data = ["2016-06-28 08:30:00.123456789"] @@ -219,24 +152,6 @@ def test_nanosecond_getitem_setitem_with_tz(self): expected = DataFrame(-1, index=index, columns=["a"]) tm.assert_frame_equal(result, expected) - def test_loc_setitem_with_expansion_and_existing_dst(self): - # GH 18308 - start = Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") - end = Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") - ts = Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") - idx = pd.date_range(start, end, closed="left", freq="H") - assert ts not in idx # i.e. 
result.loc setitem is with-expansion - - result = DataFrame(index=idx, columns=["value"]) - result.loc[ts, "value"] = 12 - expected = DataFrame( - [np.nan] * len(idx) + [12], - index=idx.append(pd.DatetimeIndex([ts])), - columns=["value"], - dtype=object, - ) - tm.assert_frame_equal(result, expected) - def test_getitem_millisecond_resolution(self, frame_or_series): # GH#33589 diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 21ea6fbd2e3c6..a84be049ebff4 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -50,7 +50,7 @@ def check(self, result, original, indexer, getitem): tm.makePeriodIndex, ], ) - def test_scalar_non_numeric(self, index_func, frame_or_series): + def test_scalar_non_numeric(self, index_func, frame_or_series, indexer_sl): # GH 4892 # float_indexers should raise exceptions @@ -61,10 +61,7 @@ def test_scalar_non_numeric(self, index_func, frame_or_series): # getting with pytest.raises(KeyError, match="^3.0$"): - s[3.0] - - with pytest.raises(KeyError, match="^3.0$"): - s.loc[3.0] + indexer_sl(s)[3.0] # contains assert 3.0 not in s @@ -88,11 +85,7 @@ def test_scalar_non_numeric(self, index_func, frame_or_series): else: s2 = s.copy() - s2.loc[3.0] = 10 - assert s2.index.is_object() - - s2 = s.copy() - s2[3.0] = 0 + indexer_sl(s2)[3.0] = 10 assert s2.index.is_object() @pytest.mark.parametrize( @@ -114,7 +107,7 @@ def test_scalar_non_numeric_series_fallback(self, index_func): with pytest.raises(KeyError, match="^3.0$"): s[3.0] - def test_scalar_with_mixed(self): + def test_scalar_with_mixed(self, indexer_sl): s2 = Series([1, 2, 3], index=["a", "b", "c"]) s3 = Series([1, 2, 3], index=["a", "b", 1.5]) @@ -122,36 +115,36 @@ def test_scalar_with_mixed(self): # lookup in a pure string index with an invalid indexer with pytest.raises(KeyError, match="^1.0$"): - s2[1.0] + indexer_sl(s2)[1.0] with pytest.raises(KeyError, match=r"^1\.0$"): - s2.loc[1.0] + indexer_sl(s2)[1.0] - result = s2.loc["b"] + result = indexer_sl(s2)["b"] expected = 2 assert result == expected # mixed index so we have label # indexing with pytest.raises(KeyError, match="^1.0$"): - s3[1.0] + indexer_sl(s3)[1.0] - result = s3[1] - expected = 2 - assert result == expected + if indexer_sl is not tm.loc: + # __getitem__ falls back to positional + result = s3[1] + expected = 2 + assert result == expected with pytest.raises(KeyError, match=r"^1\.0$"): - s3.loc[1.0] + indexer_sl(s3)[1.0] - result = s3.loc[1.5] + result = indexer_sl(s3)[1.5] expected = 3 assert result == expected - @pytest.mark.parametrize( - "idxr,getitem", [(lambda x: x.loc, False), (lambda x: x, True)] - ) @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) - def test_scalar_integer(self, index_func, frame_or_series, idxr, getitem): + def test_scalar_integer(self, index_func, frame_or_series, indexer_sl): + getitem = indexer_sl is not tm.loc # test how scalar float indexers work on int indexes @@ -161,7 +154,7 @@ def test_scalar_integer(self, index_func, frame_or_series, idxr, getitem): # coerce to equal int - result = idxr(obj)[3.0] + result = indexer_sl(obj)[3.0] self.check(result, obj, 3, getitem) if isinstance(obj, Series): @@ -178,12 +171,12 @@ def compare(x, y): expected = Series(100.0, index=range(len(obj)), name=3) s2 = obj.copy() - idxr(s2)[3.0] = 100 + indexer_sl(s2)[3.0] = 100 - result = idxr(s2)[3.0] + result = indexer_sl(s2)[3.0] compare(result, expected) - result = idxr(s2)[3] + result = indexer_sl(s2)[3] compare(result, expected) 
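
Note: the refactors in this file (and in several files below) replace ad-hoc
`lambda x: x.loc` parametrizations with the shared `indexer_sl`/`indexer_sli`
fixtures, so identity checks like `indexer_sl is tm.loc` can dispatch on the
flavor of indexing being exercised ("s" = plain `obj[key]`, "l" = `.loc`,
"i" = `.iloc`). A rough sketch of the pattern, assuming helpers and fixture
params along these lines (the actual definitions live in `pandas._testing`
and `pandas/conftest.py` and may differ in detail):

    import pytest

    import pandas._testing as tm

    # pandas._testing exposes plain module-level helpers of this shape,
    # which is what makes ``indexer_sl is tm.loc`` identity checks work:
    #   def loc(x): return x.loc       # "l": label-based .loc
    #   def iloc(x): return x.iloc     # "i": positional .iloc
    #   def setitem(x): return x       # "s": plain obj[key] get/set

    @pytest.fixture(params=[tm.setitem, tm.loc])
    def indexer_sl(request):
        # parametrize over plain __setitem__/__getitem__ ("s") and .loc ("l")
        return request.param

    @pytest.fixture(params=[tm.setitem, tm.loc, tm.iloc])
    def indexer_sli(request):
        # parametrize over "s", "l", and "i" indexers
        return request.param
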
@pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) @@ -204,7 +197,8 @@ def test_scalar_float(self, frame_or_series): # assert all operations except for iloc are ok indexer = index[3] - for idxr, getitem in [(lambda x: x.loc, False), (lambda x: x, True)]: + for idxr in [tm.loc, tm.setitem]: + getitem = idxr is not tm.loc # getting result = idxr(s)[indexer] @@ -242,7 +236,7 @@ def test_scalar_float(self, frame_or_series): ], ) @pytest.mark.parametrize("idx", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - def test_slice_non_numeric(self, index_func, idx, frame_or_series): + def test_slice_non_numeric(self, index_func, idx, frame_or_series, indexer_sli): # GH 4892 # float_indexers should raise exceptions @@ -252,38 +246,28 @@ def test_slice_non_numeric(self, index_func, idx, frame_or_series): s = gen_obj(frame_or_series, index) # getitem - msg = ( - "cannot do positional indexing " - fr"on {type(index).__name__} with these indexers \[(3|4)\.0\] of " - "type float" - ) + if indexer_sli is tm.iloc: + msg = ( + "cannot do positional indexing " + fr"on {type(index).__name__} with these indexers \[(3|4)\.0\] of " + "type float" + ) + else: + msg = ( + "cannot do slice indexing " + fr"on {type(index).__name__} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of type (float|int)" + ) with pytest.raises(TypeError, match=msg): - s.iloc[idx] - - msg = ( - "cannot do (slice|positional) indexing " - fr"on {type(index).__name__} with these indexers " - r"\[(3|4)(\.0)?\] " - r"of type (float|int)" - ) - for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: - with pytest.raises(TypeError, match=msg): - idxr(s)[idx] + indexer_sli(s)[idx] # setitem - msg = "slice indices must be integers or None or have an __index__ method" + if indexer_sli is tm.iloc: + # otherwise we keep the same message as above + msg = "slice indices must be integers or None or have an __index__ method" with pytest.raises(TypeError, match=msg): - s.iloc[idx] = 0 - - msg = ( - "cannot do (slice|positional) indexing " - fr"on {type(index).__name__} with these indexers " - r"\[(3|4)(\.0)?\] " - r"of type (float|int)" - ) - for idxr in [lambda x: x.loc, lambda x: x]: - with pytest.raises(TypeError, match=msg): - idxr(s)[idx] = 0 + indexer_sli(s)[idx] = 0 def test_slice_integer(self): @@ -469,25 +453,24 @@ def test_float_slice_getitem_with_integer_index_raises(self, idx, index_func): s[idx] @pytest.mark.parametrize("idx", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - def test_slice_float(self, idx, frame_or_series): + def test_slice_float(self, idx, frame_or_series, indexer_sl): # same as above, but for floats index = Index(np.arange(5.0)) + 0.1 s = gen_obj(frame_or_series, index) expected = s.iloc[3:4] - for idxr in [lambda x: x.loc, lambda x: x]: - # getitem - result = idxr(s)[idx] - assert isinstance(result, type(s)) - tm.assert_equal(result, expected) + # getitem + result = indexer_sl(s)[idx] + assert isinstance(result, type(s)) + tm.assert_equal(result, expected) - # setitem - s2 = s.copy() - idxr(s2)[idx] = 0 - result = idxr(s2)[idx].values.ravel() - assert (result == 0).all() + # setitem + s2 = s.copy() + indexer_sl(s2)[idx] = 0 + result = indexer_sl(s2)[idx].values.ravel() + assert (result == 0).all() def test_floating_index_doc_example(self): @@ -564,19 +547,6 @@ def test_floating_misc(self, indexer_sl): result = indexer_sl(s)[[2.5]] tm.assert_series_equal(result, Series([1], index=[2.5])) - def test_floating_tuples(self): - # see gh-13509 - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 
0.1, 0.2], name="foo") - - result = s[0.0] - assert result == (1, 1) - - expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name="foo") - - result = s[0.0] - tm.assert_series_equal(result, expected) - def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index ser = { diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 3b3ea1227ba99..696693ec158ca 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -973,6 +973,73 @@ def test_iloc_setitem_dictionary_value(self): expected = DataFrame({"x": [1, 9], "y": [2.0, 99.0]}) tm.assert_frame_equal(df, expected) + def test_iloc_getitem_float_duplicates(self): + df = DataFrame( + np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") + ) + expect = df.iloc[1:] + tm.assert_frame_equal(df.loc[0.2], expect) + + expect = df.iloc[1:, 0] + tm.assert_series_equal(df.loc[0.2, "a"], expect) + + df.index = [1, 0.2, 0.2] + expect = df.iloc[1:] + tm.assert_frame_equal(df.loc[0.2], expect) + + expect = df.iloc[1:, 0] + tm.assert_series_equal(df.loc[0.2, "a"], expect) + + df = DataFrame( + np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list("abc") + ) + expect = df.iloc[1:-1] + tm.assert_frame_equal(df.loc[0.2], expect) + + expect = df.iloc[1:-1, 0] + tm.assert_series_equal(df.loc[0.2, "a"], expect) + + df.index = [0.1, 0.2, 2, 0.2] + expect = df.iloc[[1, -1]] + tm.assert_frame_equal(df.loc[0.2], expect) + + expect = df.iloc[[1, -1], 0] + tm.assert_series_equal(df.loc[0.2, "a"], expect) + + def test_iloc_setitem_custom_object(self): + # iloc with an object + class TO: + def __init__(self, value): + self.value = value + + def __str__(self) -> str: + return f"[{self.value}]" + + __repr__ = __str__ + + def __eq__(self, other) -> bool: + return self.value == other.value + + def view(self): + return self + + df = DataFrame(index=[0, 1], columns=[0]) + df.iloc[1, 0] = TO(1) + df.iloc[1, 0] = TO(2) + + result = DataFrame(index=[0, 1], columns=[0]) + result.iloc[1, 0] = TO(2) + + tm.assert_frame_equal(result, df) + + # remains object dtype even after setting it back + df = DataFrame(index=[0, 1], columns=[0]) + df.iloc[1, 0] = TO(1) + df.iloc[1, 0] = np.nan + result = DataFrame(index=[0, 1], columns=[0]) + + tm.assert_frame_equal(result, df) + class TestILocErrors: # NB: this test should work for _any_ Series we can pass as @@ -996,6 +1063,14 @@ def test_iloc_float_raises(self, series_with_simple_index, frame_or_series): with pytest.raises(IndexError, match=_slice_iloc_msg): obj.iloc[3.0] = 0 + def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): + with pytest.raises(IndexingError, match="Too many indexers"): + float_frame.iloc[:, :, :] + + with pytest.raises(IndexError, match="too many indices for array"): + # GH#32257 we let numpy do validation, get their exception + float_frame.iloc[:, :, :] = 1 + class TestILocSetItemDuplicateColumns: def test_iloc_setitem_scalar_duplicate_columns(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 34f7ec9418028..f55a0ae2c199b 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -101,12 +101,12 @@ def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli): idxr = indexer_sli(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - if indexer_sli.__name__ == "iloc": + if indexer_sli is tm.iloc: err = ValueError msg = 
f"Cannot set values with ndim > {obj.ndim}" elif ( isinstance(index, pd.IntervalIndex) - and indexer_sli.__name__ == "setitem" + and indexer_sli is tm.setitem and obj.ndim == 1 ): err = AttributeError @@ -138,17 +138,6 @@ def test_inf_upcast(self): expected = pd.Float64Index([1, 2, np.inf]) tm.assert_index_equal(result, expected) - def test_loc_setitem_with_expasnion_inf_upcast_empty(self): - # Test with np.inf in columns - df = DataFrame() - df.loc[0, 0] = 1 - df.loc[1, 1] = 2 - df.loc[0, np.inf] = 3 - - result = df.columns - expected = pd.Float64Index([0, 1, np.inf]) - tm.assert_index_equal(result, expected) - def test_setitem_dtype_upcast(self): # GH3216 @@ -308,12 +297,11 @@ def test_dups_fancy_indexing3(self): result = df.loc[[1, 2], ["a", "b"]] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("case", [tm.getitem, tm.loc]) - def test_duplicate_int_indexing(self, case): + def test_duplicate_int_indexing(self, indexer_sl): # GH 17347 s = Series(range(3), index=[1, 1, 3]) expected = s[1] - result = case(s)[[1]] + result = indexer_sl(s)[[1]] tm.assert_series_equal(result, expected) def test_indexing_mixed_frame_bug(self): @@ -499,40 +487,6 @@ def test_setitem_list(self): tm.assert_frame_equal(result, df) - def test_iloc_setitem_custom_object(self): - # iloc with an object - class TO: - def __init__(self, value): - self.value = value - - def __str__(self) -> str: - return f"[{self.value}]" - - __repr__ = __str__ - - def __eq__(self, other) -> bool: - return self.value == other.value - - def view(self): - return self - - df = DataFrame(index=[0, 1], columns=[0]) - df.iloc[1, 0] = TO(1) - df.iloc[1, 0] = TO(2) - - result = DataFrame(index=[0, 1], columns=[0]) - result.iloc[1, 0] = TO(2) - - tm.assert_frame_equal(result, df) - - # remains object dtype even after setting it back - df = DataFrame(index=[0, 1], columns=[0]) - df.iloc[1, 0] = TO(1) - df.iloc[1, 0] = np.nan - result = DataFrame(index=[0, 1], columns=[0]) - - tm.assert_frame_equal(result, df) - def test_string_slice(self): # GH 14424 # string indexing against datetimelike with object @@ -748,7 +702,7 @@ def test_slice_with_zero_step_raises(self, indexer_sl): with pytest.raises(ValueError, match="slice step cannot be zero"): indexer_sl(ser)[::0] - def test_indexing_assignment_dict_already_exists(self): + def test_loc_setitem_indexing_assignment_dict_already_exists(self): index = Index([-5, 0, 5], name="z") df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8]}, index=index) expected = df.copy() @@ -763,7 +717,7 @@ def test_indexing_assignment_dict_already_exists(self): expected = DataFrame({"x": [1, 2, 9], "y": [2.0, 2.0, 99.0]}, index=index) tm.assert_frame_equal(df, expected) - def test_indexing_dtypes_on_empty(self): + def test_iloc_getitem_indexing_dtypes_on_empty(self): # Check that .iloc returns correct dtypes GH9983 df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]}) df2 = df.iloc[[], :] @@ -772,7 +726,7 @@ def test_indexing_dtypes_on_empty(self): tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0]) @pytest.mark.parametrize("size", [5, 999999, 1000000]) - def test_range_in_series_indexing(self, size): + def test_loc_range_in_series_indexing(self, size): # range can cause an indexing error # GH 11652 s = Series(index=range(size), dtype=np.float64) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 829bba5f2930d..3726bbecde827 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,6 @@ """ test label based indexing with loc """ 
from datetime import ( + date, datetime, time, timedelta, @@ -1141,6 +1142,127 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected): tm.assert_frame_equal(expected, df) + def test_loc_setitem_categorical_values_partial_column_slice(self): + # Assigning a Category to parts of a int/... column uses the values of + # the Categorical + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) + exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) + df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) + df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp) + + def test_loc_setitem_single_row_categorical(self): + # GH#25495 + df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) + categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) + df.loc[:, "Alpha"] = categories + + result = df["Alpha"] + expected = Series(categories, index=df.index, name="Alpha") + tm.assert_series_equal(result, expected) + + def test_loc_setitem_datetime_coercion(self): + # GH#1048 + df = DataFrame({"c": [Timestamp("2010-10-01")] * 3}) + df.loc[0:1, "c"] = np.datetime64("2008-08-08") + assert Timestamp("2008-08-08") == df.loc[0, "c"] + assert Timestamp("2008-08-08") == df.loc[1, "c"] + df.loc[2, "c"] = date(2005, 5, 5) + with tm.assert_produces_warning(FutureWarning): + # Comparing Timestamp to date obj is deprecated + assert Timestamp("2005-05-05") == df.loc[2, "c"] + assert Timestamp("2005-05-05").date() == df.loc[2, "c"] + + @pytest.mark.parametrize("idxer", ["var", ["var"]]) + def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): + # GH#11365 + tz = tz_naive_fixture + idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + expected = DataFrame(1.2, index=idx, columns=["var"]) + result = DataFrame(index=idx, columns=["var"]) + result.loc[:, idxer] = expected + tm.assert_frame_equal(result, expected) + + def test_loc_setitem_time_key(self): + index = date_range("2012-01-01", "2012-01-05", freq="30min") + df = DataFrame(np.random.randn(len(index), 5), index=index) + akey = time(12, 0, 0) + bkey = slice(time(13, 0, 0), time(14, 0, 0)) + ainds = [24, 72, 120, 168] + binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172] + + result = df.copy() + result.loc[akey] = 0 + result = result.loc[akey] + expected = df.loc[akey].copy() + expected.loc[:] = 0 + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[akey] = 0 + result.loc[akey] = df.iloc[ainds] + tm.assert_frame_equal(result, df) + + result = df.copy() + result.loc[bkey] = 0 + result = result.loc[bkey] + expected = df.loc[bkey].copy() + expected.loc[:] = 0 + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[bkey] = 0 + result.loc[bkey] = df.iloc[binds] + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("key", ["A", ["A"], ("A", slice(None))]) + def test_loc_setitem_unsorted_multiindex_columns(self, key): + # GH#38601 + mi = MultiIndex.from_tuples([("A", 4), ("B", "3"), ("A", "2")]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi) + obj = df.copy() + obj.loc[:, key] = np.zeros((2, 2), dtype=int) + expected = DataFrame([[0, 2, 0], [0, 5, 0]], columns=mi) + tm.assert_frame_equal(obj, expected) + + df = df.sort_index(1) + df.loc[:, key] = np.zeros((2, 2), dtype=int) + expected = expected.sort_index(1) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_uint_drop(self, any_int_dtype): + # see GH#18311 + # assigning series.loc[0] = 4 changed series.dtype to int + 
series = Series([1, 2, 3], dtype=any_int_dtype) + series.loc[0] = 4 + expected = Series([4, 2, 3], dtype=any_int_dtype) + tm.assert_series_equal(series, expected) + + def test_loc_setitem_td64_non_nano(self): + # GH#14155 + ser = Series(10 * [np.timedelta64(10, "m")]) + ser.loc[[1, 2, 3]] = np.timedelta64(20, "m") + expected = Series(10 * [np.timedelta64(10, "m")]) + expected.loc[[1, 2, 3]] = Timedelta(np.timedelta64(20, "m")) + tm.assert_series_equal(ser, expected) + + def test_loc_setitem_2d_to_1d_raises(self): + data = np.random.randn(2, 2) + ser = Series(range(2)) + + msg = "|".join( + [ + r"shape mismatch: value array of shape \(2,2\)", + r"cannot reshape array of size 4 into shape \(2,\)", + ] + ) + with pytest.raises(ValueError, match=msg): + ser.loc[range(2)] = data + + msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" + with pytest.raises(ValueError, match=msg): + ser.loc[:] = data + class TestLocWithMultiIndex: @pytest.mark.parametrize( @@ -1270,6 +1392,31 @@ def test_loc_getitem_sorted_index_level_with_duplicates(self): result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) + def test_loc_getitem_preserves_index_level_category_dtype(self): + # GH#15166 + df = DataFrame( + data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + + expected = CategoricalIndex( + ["a", "b"], + categories=["a", "b"], + ordered=False, + name="Index1", + dtype="category", + ) + + result = df.index.levels[0] + tm.assert_index_equal(result, expected) + + result = df.loc[["a"]].index.levels[0] + tm.assert_index_equal(result, expected) + class TestLocSetitemWithExpansion: @pytest.mark.slow @@ -1340,6 +1487,57 @@ def test_loc_setitem_categorical_column_retains_dtype(self, ordered): expected = DataFrame({"A": [1], "B": Categorical(["b"], ordered=ordered)}) tm.assert_frame_equal(result, expected) + def test_loc_setitem_with_expansion_and_existing_dst(self): + # GH#18308 + start = Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") + end = Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") + ts = Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") + idx = pd.date_range(start, end, closed="left", freq="H") + assert ts not in idx # i.e. 
result.loc setitem is with-expansion + + result = DataFrame(index=idx, columns=["value"]) + result.loc[ts, "value"] = 12 + expected = DataFrame( + [np.nan] * len(idx) + [12], + index=idx.append(pd.DatetimeIndex([ts])), + columns=["value"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + def test_setitem_with_expansion(self): + # indexing - setting an element + df = DataFrame( + data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), + columns=["time"], + ) + df["new_col"] = ["new", "old"] + df.time = df.set_index("time").index.tz_localize("UTC") + v = df[df.new_col == "new"].set_index("time").index.tz_convert("US/Pacific") + + # trying to set a single element on a part of a different timezone + # this converts to object + df2 = df.copy() + df2.loc[df2.new_col == "new", "time"] = v + + expected = Series([v[0], df.loc[1, "time"]], name="time") + tm.assert_series_equal(df2.time, expected) + + v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") + df.loc[df.new_col == "new", "time"] = v + tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) + + def test_loc_setitem_with_expansion_inf_upcast_empty(self): + # Test with np.inf in columns + df = DataFrame() + df.loc[0, 0] = 1 + df.loc[1, 1] = 2 + df.loc[0, np.inf] = 3 + + result = df.columns + expected = pd.Float64Index([0, 1, np.inf]) + tm.assert_index_equal(result, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): @@ -1543,6 +1741,35 @@ def test_loc_getitem_partial_slice_non_monotonicity( class TestLabelSlicing: + def test_loc_getitem_slicing_datetimes_frame(self): + # GH#7523 + + # unique + df_unique = DataFrame( + np.arange(4.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]], + ) + + # duplicates + df_dups = DataFrame( + np.arange(5.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]], + ) + + for df in [df_unique, df_dups]: + result = df.loc[datetime(2001, 1, 1, 10) :] + tm.assert_frame_equal(result, df) + result = df.loc[: datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11) :] + expected = df.iloc[1:] + tm.assert_frame_equal(result, expected) + result = df.loc["20010101 11":] + tm.assert_frame_equal(result, expected) + def test_loc_getitem_label_slice_across_dst(self): # GH#21846 idx = date_range( @@ -1791,6 +2018,48 @@ def test_loc_getitem_series_label_list_missing_integer_values(self): with pytest.raises(KeyError, match="with any missing labels"): ser.loc[np.array([9730701000001104, 10047311000001102])] + @pytest.mark.parametrize("to_period", [True, False]) + def test_loc_getitem_listlike_of_datetimelike_keys(self, to_period): + # GH#11497 + + idx = date_range("2011-01-01", "2011-01-02", freq="D", name="idx") + if to_period: + idx = idx.to_period("D") + ser = Series([0.1, 0.2], index=idx, name="s") + + keys = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] + if to_period: + keys = [x.to_period("D") for x in keys] + result = ser.loc[keys] + exp = Series([0.1, 0.2], index=idx, name="s") + if not to_period: + exp.index = exp.index._with_freq(None) + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-01"), + ] + if to_period: + keys = [x.to_period("D") for x in keys] + exp = Series( + [0.2, 0.2, 0.1], index=Index(keys, name="idx", dtype=idx.dtype), 
name="s" + ) + result = ser.loc[keys] + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [ + Timestamp("2011-01-03"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ] + if to_period: + keys = [x.to_period("D") for x in keys] + + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[keys] + @pytest.mark.parametrize( "columns, column_key, expected_columns", diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 2339e21288bb5..24d1973eeda6d 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -121,7 +121,7 @@ def test_ambiguous_width(self): assert adjoined == expected -@td.skip_array_manager_not_yet_implemented +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 6ead81db1fab0..8c69ffedf1df4 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -1,3 +1,5 @@ +from io import BytesIO + import pytest import pandas.util._test_decorators as td @@ -117,3 +119,13 @@ def test_to_json_compression(compression_only, read_infer, to_infer): df.to_json(path, compression=to_compression) result = pd.read_json(path, compression=read_compression) tm.assert_frame_equal(result, df) + + +def test_to_json_compression_mode(compression): + # GH 39985 (read_json does not support user-provided binary files) + expected = pd.DataFrame({"A": [1]}) + + with BytesIO() as buffer: + expected.to_json(buffer, compression=compression) + # df = pd.read_json(buffer, compression=compression) + # tm.assert_frame_equal(expected, df) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index d9575a6ad81e5..d97aaa2ea2763 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -247,12 +247,20 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) -@td.skip_array_manager_not_yet_implemented -def test_json_options(fsspectest): +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) JSON +def test_json_options(fsspectest, compression): df = DataFrame({"a": [0]}) - df.to_json("testmem://afile", storage_options={"test": "json_write"}) + df.to_json( + "testmem://afile", + compression=compression, + storage_options={"test": "json_write"}, + ) assert fsspectest.test[0] == "json_write" - out = read_json("testmem://afile", storage_options={"test": "json_read"}) + out = read_json( + "testmem://afile", + compression=compression, + storage_options={"test": "json_read"}, + ) assert fsspectest.test[0] == "json_read" tm.assert_frame_equal(df, out) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 2244f6eba9479..bf3e6d822ab19 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -218,7 +218,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): getattr(empty_series_dti.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) + # (ex: doing mean with dtype of np.object_) pass diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index a17ed44c4011a..50775b9ef3a47 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ 
b/pandas/tests/resample/test_resampler_grouper.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + TimedeltaIndex, Timestamp, ) import pandas._testing as tm @@ -398,6 +399,18 @@ def test_resample_groupby_agg(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) +def test_empty(keys): + # GH 26411 + df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False) + if len(keys) == 1: + expected.index.name = keys[0] + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index e5499c44be7d7..2ec94d4cebf5a 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -551,6 +553,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d9af59382ae79..e1b1e80a29a43 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -287,17 +287,27 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self): + def test_merge_nocopy(self, using_array_manager): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged["a"] = 6 - assert (left["a"] == 6).all() + if using_array_manager: + # With ArrayManager, setting a column doesn't change the values inplace + # and thus does not propagate the changes to the original left/right + # dataframes -> need to check that no copy was made in a different way + # TODO(ArrayManager) we should be able to simplify this with a .loc + # setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 + # but this currently replaces the array (_setitem_with_indexer_split_path) + assert merged._mgr.arrays[0] is left._mgr.arrays[0] + assert merged._mgr.arrays[2] is right._mgr.arrays[0] + else: + merged["a"] = 6 + assert (left["a"] == 6).all() - merged["d"] = "peekaboo" - assert (right["d"] == "peekaboo").all() + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -1381,7 +1391,10 @@ def test_merge_readonly(self): np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - data1._mgr.blocks[0].values.flags.writeable = False + # make each underlying block array / column array read-only + for arr in data1._mgr.arrays: + arr.flags.writeable = False + data1.merge(data2) # no error diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 86cde3eee874d..1ecb408d49813 
100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self): s2 = Series([4, 5, 6], index=[4, 5, 6]) actual = crosstab(s1, s2) - expected = DataFrame() + expected = DataFrame( + index=Index([], dtype="int64", name="row_0"), + columns=Index([], dtype="int64", name="col_0"), + ) tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 4786b8c35a5b1..56326dd15bd9b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -145,18 +145,28 @@ def test_bins_not_monotonic(): ), ), ( - [np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + [ + np.timedelta64(-1, "ns"), + np.timedelta64(0, "ns"), + np.timedelta64(1, "ns"), + ], np.array( [ - np.timedelta64(-np.iinfo(np.int64).max), - np.timedelta64(0), - np.timedelta64(np.iinfo(np.int64).max), + np.timedelta64(-np.iinfo(np.int64).max, "ns"), + np.timedelta64(0, "ns"), + np.timedelta64(np.iinfo(np.int64).max, "ns"), ] ), IntervalIndex.from_tuples( [ - (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), - (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)), + ( + np.timedelta64(-np.iinfo(np.int64).max, "ns"), + np.timedelta64(0, "ns"), + ), + ( + np.timedelta64(0, "ns"), + np.timedelta64(np.iinfo(np.int64).max, "ns"), + ), ] ), ), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 19eba4305fdf6..8d2b4f2b325c2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) def test_pivot_table_empty_aggfunc(self): - # GH 9186 + # GH 9186 & GH 13483 df = DataFrame( { "A": [2, 2, 3, 3, 2], @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self): } ) result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) - expected = DataFrame() + expected = DataFrame(index=Index([], dtype="int64", name="A")) + expected.columns.name = "D" tm.assert_frame_equal(result, expected) def test_pivot_table_no_column_raises(self): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 347aa8d66405c..8098b195c3838 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -61,89 +61,17 @@ def test_fancy_setitem(): assert (s[48:54] == -3).all() -def test_slicing_datetimes(): - # GH 7523 - - # unique - df = DataFrame( - np.arange(4.0, dtype="float64"), - index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]], - ) - result = df.loc[datetime(2001, 1, 1, 10) :] - tm.assert_frame_equal(result, df) - result = df.loc[: datetime(2001, 1, 4, 10)] - tm.assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] - tm.assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11) :] - expected = df.iloc[1:] - tm.assert_frame_equal(result, expected) - result = df.loc["20010101 11":] - tm.assert_frame_equal(result, expected) - - # duplicates - df = DataFrame( - np.arange(5.0, dtype="float64"), - index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]], - ) - - result = df.loc[datetime(2001, 1, 1, 10) :] - tm.assert_frame_equal(result, df) - result = df.loc[: datetime(2001, 1, 4, 10)] - tm.assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] - 
tm.assert_frame_equal(result, df) - - result = df.loc[datetime(2001, 1, 1, 11) :] - expected = df.iloc[1:] - tm.assert_frame_equal(result, expected) - result = df.loc["20010101 11":] - tm.assert_frame_equal(result, expected) - - -def test_getitem_setitem_datetime_tz_pytz(): - N = 50 - # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") - ts = Series(np.random.randn(N), index=rng) - - # also test Timestamp tz handling, GH #2789 - result = ts.copy() - result["1990-01-01 09:00:00+00:00"] = 0 - result["1990-01-01 09:00:00+00:00"] = ts[4] - tm.assert_series_equal(result, ts) - - result = ts.copy() - result["1990-01-01 03:00:00-06:00"] = 0 - result["1990-01-01 03:00:00-06:00"] = ts[4] - tm.assert_series_equal(result, ts) - - # repeat with datetimes - result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=pytz.timezone("UTC"))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=pytz.timezone("UTC"))] = ts[4] - tm.assert_series_equal(result, ts) - - result = ts.copy() - - # comparison dates with datetime MUST be localized! - date = pytz.timezone("US/Central").localize(datetime(1990, 1, 1, 3)) - result[date] = 0 - result[date] = ts[4] - tm.assert_series_equal(result, ts) - - -def test_getitem_setitem_datetime_tz_dateutil(): - - tz = ( - lambda x: tzutc() if x == "UTC" else gettz(x) - ) # handle special case for utc in dateutil +@pytest.mark.parametrize("tz_source", ["pytz", "dateutil"]) +def test_getitem_setitem_datetime_tz(tz_source): + if tz_source == "pytz": + tzget = pytz.timezone + else: + # handle special case for utc in dateutil + tzget = lambda x: tzutc() if x == "UTC" else gettz(x) N = 50 - # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz="America/New_York") + rng = date_range("1/1/1990", periods=N, freq="H", tz=tzget("US/Eastern")) ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -159,13 +87,15 @@ def test_getitem_setitem_datetime_tz_dateutil(): # repeat with datetimes result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] + result[datetime(1990, 1, 1, 9, tzinfo=tzget("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tzget("UTC"))] = ts[4] tm.assert_series_equal(result, ts) result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = ts[4] + dt = Timestamp(1990, 1, 1, 3).tz_localize(tzget("US/Central")) + dt = dt.to_pydatetime() + result[dt] = 0 + result[dt] = ts[4] tm.assert_series_equal(result, ts) @@ -382,7 +312,7 @@ def test_indexing_with_duplicate_datetimeindex( assert ts[datetime(2000, 1, 6)] == 0 -def test_indexing_over_size_cutoff(monkeypatch): +def test_loc_getitem_over_size_cutoff(monkeypatch): # #1821 monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 1000) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 64d763f410666..e6dfafabbfec2 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -36,6 +36,32 @@ class TestSeriesGetitemScalars: + def test_getitem_float_keys_tuple_values(self): + # see GH#13509 + + # unique Index + ser = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name="foo") + result = ser[0.0] + assert result == (1, 1) + + # non-unique Index + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") + ser = Series([(1, 1), (2, 2), (3, 3)], 
index=[0.0, 0.0, 0.2], name="foo") + + result = ser[0.0] + tm.assert_series_equal(result, expected) + + def test_getitem_unrecognized_scalar(self): + # GH#32684 a scalar key that is not recognized by lib.is_scalar + + # a series that might be produced via `frame.dtypes` + ser = Series([1, 2], index=[np.dtype("O"), np.dtype("i8")]) + + key = ser.index[1] + + result = ser[key] + assert result == 2 + def test_getitem_negative_out_of_bounds(self): ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) @@ -595,3 +621,8 @@ def test_getitem_categorical_str(): with tm.assert_produces_warning(FutureWarning): result = ser.index.get_value(ser, "a") tm.assert_series_equal(result, expected) + + +def test_slice_can_reorder_not_uniquely_indexed(): + ser = Series(1, index=["a", "a", "b", "b", "c"]) + ser[::-1] # it works! diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 49264c5b669d7..34ba20c03b732 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -82,16 +82,6 @@ def test_getitem_setitem_ellipsis(): assert (result == 5).all() -def test_setitem_with_expansion_type_promotion(): - # GH12599 - s = Series(dtype=object) - s["a"] = Timestamp("2016-01-01") - s["b"] = 3.0 - s["c"] = "foo" - expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) - tm.assert_series_equal(s, expected) - - @pytest.mark.parametrize( "result_1, duplicate_item, expected_1", [ @@ -193,40 +183,12 @@ def test_setitem_slicestep(): assert (series[::2] == 0).all() -def test_setitem_not_contained(string_series): - # set item that's not contained - ser = string_series.copy() - ser["foobar"] = 1 - - app = Series([1], index=["foobar"], name="series") - expected = string_series.append(app) - tm.assert_series_equal(ser, expected) - - def test_setslice(datetime_series): sl = datetime_series[5:20] assert len(sl) == len(sl.index) assert sl.index.is_unique is True -def test_loc_setitem_2d_to_1d_raises(): - x = np.random.randn(2, 2) - y = Series(range(2)) - - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) - with pytest.raises(ValueError, match=msg): - y.loc[range(2)] = x - - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" - with pytest.raises(ValueError, match=msg): - y.loc[:] = x - - # FutureWarning from NumPy about [slice(None, 5). 
@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") def test_basic_getitem_setitem_corner(datetime_series): @@ -252,84 +214,7 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, slice(None, None)]] = 2 -@pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) -def test_setitem_with_tz(tz, indexer_sli): - orig = Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz)) - assert orig.dtype == f"datetime64[ns, {tz}]" - - exp = Series( - [ - Timestamp("2016-01-01 00:00", tz=tz), - Timestamp("2011-01-01 00:00", tz=tz), - Timestamp("2016-01-01 02:00", tz=tz), - ] - ) - - # scalar - ser = orig.copy() - indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) - tm.assert_series_equal(ser, exp) - - # vector - vals = Series( - [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], - index=[1, 2], - ) - assert vals.dtype == f"datetime64[ns, {tz}]" - - exp = Series( - [ - Timestamp("2016-01-01 00:00", tz=tz), - Timestamp("2011-01-01 00:00", tz=tz), - Timestamp("2012-01-01 00:00", tz=tz), - ] - ) - - ser = orig.copy() - indexer_sli(ser)[[1, 2]] = vals - tm.assert_series_equal(ser, exp) - - -def test_setitem_with_tz_dst(indexer_sli): - # GH XXX TODO: fill in GH ref - tz = "US/Eastern" - orig = Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz)) - assert orig.dtype == f"datetime64[ns, {tz}]" - - exp = Series( - [ - Timestamp("2016-11-06 00:00-04:00", tz=tz), - Timestamp("2011-01-01 00:00-05:00", tz=tz), - Timestamp("2016-11-06 01:00-05:00", tz=tz), - ] - ) - - # scalar - ser = orig.copy() - indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) - tm.assert_series_equal(ser, exp) - - # vector - vals = Series( - [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], - index=[1, 2], - ) - assert vals.dtype == f"datetime64[ns, {tz}]" - - exp = Series( - [ - Timestamp("2016-11-06 00:00", tz=tz), - Timestamp("2011-01-01 00:00", tz=tz), - Timestamp("2012-01-01 00:00", tz=tz), - ] - ) - - ser = orig.copy() - indexer_sli(ser)[[1, 2]] = vals - tm.assert_series_equal(ser, exp) - - -def test_categorical_assigning_ops(): +def test_setitem_categorical_assigning_ops(): orig = Series(Categorical(["b", "b"], categories=["a", "b"])) s = orig.copy() s[:] = "a" @@ -387,11 +272,6 @@ def test_slice(string_series, object_series): assert (string_series[10:20] == 0).all() -def test_slice_can_reorder_not_uniquely_indexed(): - s = Series(1, index=["a", "a", "b", "b", "c"]) - s[::-1] # it works! 
- - def test_loc_setitem(string_series): inds = string_series.index[[3, 4, 7]] @@ -433,15 +313,6 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) -def test_setitem_td64_non_nano(): - # GH 14155 - ser = Series(10 * [np.timedelta64(10, "m")]) - ser.loc[[1, 2, 3]] = np.timedelta64(20, "m") - expected = Series(10 * [np.timedelta64(10, "m")]) - expected.loc[[1, 2, 3]] = Timedelta(np.timedelta64(20, "m")) - tm.assert_series_equal(ser, expected) - - def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) @@ -460,33 +331,6 @@ def test_underlying_data_conversion(): tm.assert_frame_equal(df, expected) -def test_chained_assignment(): - # GH 3970 - with pd.option_context("chained_assignment", None): - df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) - df["cc"] = 0.0 - - ck = [True] * len(df) - - df["bb"].iloc[0] = 0.13 - - # TODO: unused - df_tmp = df.iloc[ck] # noqa - - df["bb"].iloc[0] = 0.15 - assert df["bb"].iloc[0] == 0.15 - - -def test_setitem_with_expansion_dtype(): - # GH 3217 - df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) - df["c"] = np.nan - df["c"].update(Series(["foo"], index=[0])) - - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) - tm.assert_frame_equal(df, expected) - - def test_preserve_refs(datetime_series): seq = datetime_series[[5, 10, 15]] seq[1] = np.NaN @@ -535,18 +379,16 @@ def test_setitem_mask_promote(): tm.assert_series_equal(ser, expected) -def test_multilevel_preserve_name(): +def test_multilevel_preserve_name(indexer_sl): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) - s = Series(np.random.randn(len(index)), index=index, name="sth") + ser = Series(np.random.randn(len(index)), index=index, name="sth") - result = s["foo"] - result2 = s.loc["foo"] - assert result.name == s.name - assert result2.name == s.name + result = indexer_sl(ser)["foo"] + assert result.name == ser.name """ @@ -554,27 +396,6 @@ def test_multilevel_preserve_name(): """ -def test_uint_drop(any_int_dtype): - # see GH18311 - # assigning series.loc[0] = 4 changed series.dtype to int - series = Series([1, 2, 3], dtype=any_int_dtype) - series.loc[0] = 4 - expected = Series([4, 2, 3], dtype=any_int_dtype) - tm.assert_series_equal(series, expected) - - -def test_getitem_unrecognized_scalar(): - # GH#32684 a scalar key that is not recognized by lib.is_scalar - - # a series that might be produced via `frame.dtypes` - ser = Series([1, 2], index=[np.dtype("O"), np.dtype("i8")]) - - key = ser.index[1] - - result = ser[key] - assert result == 2 - - def test_slice_with_zero_step_raises(index, frame_or_series, indexer_sli): ts = frame_or_series(np.arange(len(index)), index=index) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index ba9593067a412..bbe328114fd20 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -63,6 +63,81 @@ def test_setitem_tuple_with_datetimetz_values(self): expected.iloc[0] = np.nan tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) + def test_setitem_with_tz(self, tz, indexer_sli): + orig = Series(date_range("2016-01-01", freq="H", periods=3, tz=tz)) + assert orig.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-01-01 00:00", tz=tz), + Timestamp("2011-01-01 
00:00", tz=tz), + Timestamp("2016-01-01 02:00", tz=tz), + ] + ) + + # scalar + ser = orig.copy() + indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(ser, exp) + + # vector + vals = Series( + [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-01-01 00:00", tz=tz), + Timestamp("2011-01-01 00:00", tz=tz), + Timestamp("2012-01-01 00:00", tz=tz), + ] + ) + + ser = orig.copy() + indexer_sli(ser)[[1, 2]] = vals + tm.assert_series_equal(ser, exp) + + def test_setitem_with_tz_dst(self, indexer_sli): + # GH XXX TODO: fill in GH ref + tz = "US/Eastern" + orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) + assert orig.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-11-06 00:00-04:00", tz=tz), + Timestamp("2011-01-01 00:00-05:00", tz=tz), + Timestamp("2016-11-06 01:00-05:00", tz=tz), + ] + ) + + # scalar + ser = orig.copy() + indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(ser, exp) + + # vector + vals = Series( + [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-11-06 00:00", tz=tz), + Timestamp("2011-01-01 00:00", tz=tz), + Timestamp("2012-01-01 00:00", tz=tz), + ] + ) + + ser = orig.copy() + indexer_sli(ser)[[1, 2]] = vals + tm.assert_series_equal(ser, exp) + class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): @@ -303,6 +378,25 @@ def test_append_timedelta_does_not_cast(self, td): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) + def test_setitem_with_expansion_type_promotion(self): + # GH#12599 + ser = Series(dtype=object) + ser["a"] = Timestamp("2016-01-01") + ser["b"] = 3.0 + ser["c"] = "foo" + expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + tm.assert_series_equal(ser, expected) + + def test_setitem_not_contained(self, string_series): + # set item that's not contained + ser = string_series.copy() + assert "foobar" not in ser.index + ser["foobar"] = 1 + + app = Series([1], index=["foobar"], name="series") + expected = string_series.append(app) + tm.assert_series_equal(ser, expected) + def test_setitem_scalar_into_readonly_backing_data(): # GH#14359: test that you cannot mutate a read only buffer diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 6ff14087e6259..99ff4e8e6a8dd 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsTimedelta + import pandas as pd from pandas import ( Series, @@ -14,6 +16,7 @@ to_timedelta, ) import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray class TestTimedeltas: @@ -75,6 +78,19 @@ def test_to_timedelta(self): expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) tm.assert_index_equal(result, expected) + def test_to_timedelta_oob_non_nano(self): + arr = np.array([pd.NaT.value + 1], dtype="timedelta64[s]") + + msg = r"Out of bounds for nanosecond timedelta64\[s\] -9223372036854775807" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + to_timedelta(arr) + + with pytest.raises(OutOfBoundsTimedelta, match=msg): + TimedeltaIndex(arr) + + with pytest.raises(OutOfBoundsTimedelta, match=msg): + TimedeltaArray._from_sequence(arr) + def 
test_to_timedelta_dataframe(self): # GH 11776 arr = np.arange(10).reshape(2, 5) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4989e23ed7ba5..fc2e86310dae9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1133,3 +1133,18 @@ def test_rolling_skew_kurt_large_value_range(method, values): def test_invalid_method(): with pytest.raises(ValueError, match="method must be 'table' or 'single"): Series(range(1)).rolling(1, method="foo") + + +@pytest.mark.parametrize("window", [1, "1d"]) +def test_rolling_descending_date_order_with_offset(window, frame_or_series): + # GH#40002 + idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") + obj = frame_or_series(range(1, 4), index=idx) + result = obj.rolling("1d", closed="left").sum() + expected = frame_or_series([np.nan, 1, 2], index=idx) + tm.assert_equal(result, expected) + + result = obj.iloc[::-1].rolling("1d", closed="left").sum() + idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") + expected = frame_or_series([np.nan, 3, 2], index=idx) + tm.assert_equal(result, expected) From 3ac01913d8f00d0a6baa0aa1889fd71ee40cf452 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Feb 2021 10:13:22 -0800 Subject: [PATCH 3/6] PERF: NDArrayBacked in cython --- pandas/_libs/lib.pyx | 142 ++++++++++++++++++++++++++++ pandas/compat/pickle_compat.py | 13 +++ pandas/core/arrays/datetimelike.py | 18 ++-- pandas/core/arrays/datetimes.py | 19 ++-- pandas/core/arrays/period.py | 4 + pandas/core/arrays/timedeltas.py | 15 ++- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/datetimes.py | 3 +- 8 files changed, 185 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d2aa47f65d263..39e2251ca37d0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2639,3 +2639,145 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +cdef class NDArrayBacked: + """ + Implementing these methods in cython improves performance quite a bit. + + import pandas as pd + + from pandas._libs.lib import NDArrayBacked as cls + + dti = pd.date_range("2016-01-01", periods=3) + dta = dti._data + arr = dta._ndarray + + obj = cls._simpler_new(arr, arr.dtype) + + %timeit foo.copy() + 299 ns ± 30 ns per loop # <-- arr underlying ndarray + 530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked + 1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked + 328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__ + 371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simpler_new + + %timeit foo.T + 125 ns ± 6.27 ns per loop # <-- arr underlying ndarray + 226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked + 911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked + 215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simpler_new + + """ + # TODO: implement take in terms of cnp.PyArray_TakeFrom + # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate + + cdef: + readonly ndarray _ndarray + readonly object _dtype + + @classmethod + def _simpler_new(cls, ndarray values, object dtype): + # Note: not _simple_new; for unclear reasons, calling this _simple_new + # and trying to call it from the subclass method using super()... 
fails
+        cdef:
+            NDArrayBacked obj
+        obj = NDArrayBacked.__new__(cls)
+        obj._ndarray = values
+        obj._dtype = dtype
+        return obj
+
+    cpdef NDArrayBacked _from_backing_data(self, ndarray values):
+        # TODO: re-reuse simpler_new if/when it can be cpdef
+        cdef:
+            NDArrayBacked obj
+        obj = NDArrayBacked.__new__(type(self))
+        obj._ndarray = values
+        obj._dtype = self._dtype
+        return obj
+
+    cpdef __setstate__(self, state):
+        if isinstance(state, dict):
+            if "_data" in state:
+                data = state.pop("_data")
+            elif "_ndarray" in state:
+                data = state.pop("_ndarray")
+            else:
+                raise ValueError
+            self._ndarray = data
+            self._dtype = state.pop("_dtype")
+
+            for key, val in state.items():
+                setattr(self, key, val)
+        elif isinstance(state, tuple):
+            if len(state) != 3:
+                raise NotImplementedError(state)
+
+            data, dtype = state[:2]
+            if isinstance(dtype, np.ndarray):
+                dtype, data = data, dtype
+            self._ndarray = data
+            self._dtype = dtype
+
+            if isinstance(state[2], dict):
+                for key, val in state[2].items():
+                    setattr(self, key, val)
+            else:
+                raise NotImplementedError(state)
+        else:
+            raise NotImplementedError(state)
+
+    def __len__(self):
+        return len(self._ndarray)
+
+    @property
+    def shape(self):
+        # object cast bc _ndarray.shape is npy_intp*
+        return (<object>(self._ndarray)).shape
+
+    @property
+    def ndim(self) -> int:
+        return self._ndarray.ndim
+
+    @property
+    def size(self):
+        return self._ndarray.size
+
+    @property
+    def nbytes(self):
+        return self._ndarray.nbytes
+
+    def copy(self):
+        # NPY_ANYORDER -> same order as self._ndarray
+        res_values = cnp.PyArray_NewCopy(self._ndarray, cnp.NPY_ANYORDER)
+        return self._from_backing_data(res_values)
+
+    def delete(self, loc, axis=0):
+        res_values = np.delete(self._ndarray, loc, axis=axis)
+        return self._from_backing_data(res_values)
+
+    def swapaxes(self, axis1, axis2):
+        res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
+        return self._from_backing_data(res_values)
+
+    # TODO: pass NPY_MAXDIMS equiv to axis=None?
+ def repeat(self, repeats, axis: int = 0): + if axis is None: + axis = 0 + res_values = cnp.PyArray_Repeat(self._ndarray, repeats, axis) + return self._from_backing_data(res_values) + + def reshape(self, *args, **kwargs): + res_values = self._ndarray.reshape(*args, **kwargs) + return self._from_backing_data(res_values) + + def ravel(self, order="C"): + # cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order) + # res_values = cnp.PyArray_Ravel(self._ndarray, order) + res_values = self._ndarray.ravel(order) + return self._from_backing_data(res_values) + + @property + def T(self): + res_values = self._ndarray.T + return self._from_backing_data(res_values) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 9d48035213126..852e8de816a4f 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -13,9 +13,15 @@ ) import warnings +import numpy as np + from pandas._libs.tslibs import BaseOffset from pandas import Index +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) if TYPE_CHECKING: from pandas import ( @@ -207,6 +213,13 @@ def load_newobj(self): # compat if issubclass(cls, Index): obj = object.__new__(cls) + elif issubclass(cls, DatetimeArray) and not args: + arr = np.array([], dtype="M8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif issubclass(cls, TimedeltaArray) and not args: + arr = np.array([], dtype="m8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + else: obj = cls.__new__(cls, *args) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e476c3566c10f..162760d92ff2f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -158,9 +158,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _recognized_scalars: Tuple[Type, ...] _ndarray: np.ndarray - def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False): - raise AbstractMethodError(self) - @classmethod def _simple_new( cls: Type[DatetimeLikeArrayT], @@ -254,6 +251,8 @@ def _check_compatible_with( # NDArrayBackedExtensionArray compat def __setstate__(self, state): + # TODO: how is NDArrayBacked.__setstate__ getting called? we + # aren't doing super().__setstate__(state) here if isinstance(state, dict): if "_data" in state and "_ndarray" not in state: # backward compat, changed what is property vs attribute @@ -272,12 +271,6 @@ def __setstate__(self, state): def _data(self) -> np.ndarray: return self._ndarray - def _from_backing_data( - self: DatetimeLikeArrayT, arr: np.ndarray - ) -> DatetimeLikeArrayT: - # Note: we do not retain `freq` - return type(self)._simple_new(arr, dtype=self.dtype) - # ------------------------------------------------------------------ def _box_func(self, x): @@ -1680,11 +1673,16 @@ def strftime(self, date_format): """ -class TimelikeOps(DatetimeLikeArrayMixin): +class TimelikeOps(lib.NDArrayBacked, DatetimeLikeArrayMixin): """ Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. 
""" + def copy(self): + result = lib.NDArrayBacked.copy(self) + result._freq = self._freq + return result + def _round(self, freq, mode, ambiguous, nonexistent): # round the local times if is_datetime64tz_dtype(self.dtype): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 28e469547fe62..ddc39e76113b3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -238,13 +238,13 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None - def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): + def __new__(cls, values, dtype=DT64NS_DTYPE, freq=None, copy=False): if isinstance(values, (ABCSeries, ABCIndex)): values = values._values inferred_freq = getattr(values, "_freq", None) - if isinstance(values, type(self)): + if isinstance(values, cls): # validation dtz = getattr(dtype, "tz", None) if dtz and values.tz is None: @@ -303,12 +303,11 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): # be incorrect(ish?) for the array as a whole dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) - self._ndarray = values - self._dtype = dtype - self._freq = freq + obj = cls._simple_new(values, freq=freq, dtype=dtype) if inferred_freq is None and freq is not None: - type(self)._validate_frequency(self, freq) + cls._validate_frequency(obj, freq) + return obj @classmethod def _simple_new( @@ -319,10 +318,8 @@ def _simple_new( assert values.dtype == "i8" values = values.view(DT64NS_DTYPE) - result = object.__new__(cls) - result._ndarray = values + result = cls._simpler_new(values, dtype) result._freq = freq - result._dtype = dtype return result @classmethod @@ -2005,7 +2002,9 @@ def sequence_to_dt64ns( if is_datetime64tz_dtype(data_dtype): # DatetimeArray -> ndarray tz = _maybe_infer_tz(tz, data.tz) - result = data._data + if isinstance(data, ABCIndex): + data = data._data + result = data._ndarray elif is_datetime64_dtype(data_dtype): # tz-naive DatetimeArray or ndarray[datetime64] diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 96a159c0804c9..97f049b8bfd2e 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -223,6 +223,10 @@ def _simple_new( assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg return cls(values, freq=freq, dtype=dtype) + def _from_backing_data(self: PeriodArray, arr: np.ndarray) -> PeriodArray: + # Note: we do not retain `freq` + return type(self)._simple_new(arr, dtype=self.dtype) + @classmethod def _from_sequence( cls: Type[PeriodArray], diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f7af1bb3da86b..b8e1cbb1d9b72 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -168,14 +168,14 @@ def dtype(self) -> np.dtype: _freq = None - def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): + def __new__(cls, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): values = extract_array(values) inferred_freq = getattr(values, "_freq", None) explicit_none = freq is None freq = freq if freq is not lib.no_default else None - if isinstance(values, type(self)): + if isinstance(values, cls): if explicit_none: # dont inherit from values pass @@ -216,12 +216,11 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): if freq: freq = to_offset(freq) - self._ndarray = values - self._dtype = dtype - self._freq = freq + obj = 
 
         if inferred_freq is None and freq is not None:
-            type(self)._validate_frequency(self, freq)
+            cls._validate_frequency(obj, freq)
+        return obj
 
     @classmethod
     def _simple_new(
@@ -233,10 +232,8 @@ def _simple_new(
             assert values.dtype == "i8"
             values = values.view(TD64NS_DTYPE)
 
-        result = object.__new__(cls)
-        result._ndarray = values
+        result = cls._simpler_new(values, TD64NS_DTYPE)
         result._freq = to_offset(freq)
-        result._dtype = TD64NS_DTYPE
         return result
 
     @classmethod
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 6d5992540ef49..be85839bf93c4 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -651,6 +651,7 @@ def _get_join_freq(self, other):
 
     def _wrap_joined_index(self, joined: np.ndarray, other):
         assert other.dtype == self.dtype, (other.dtype, self.dtype)
+        joined = joined.view(self._data._ndarray.dtype)
         result = super()._wrap_joined_index(joined, other)
         result._data._freq = self._get_join_freq(other)
         return result
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 9ea43d083f5b3..cb700e5ac05e7 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -93,7 +93,8 @@ def _new_DatetimeIndex(cls, d):
             # These are already stored in our DatetimeArray; if they are
             #  also in the pickle and don't match, we have a problem.
             if key in d:
-                assert d.pop(key) == getattr(dta, key)
+                val = d.pop(key)
+                assert val == getattr(dta, key), (key, val, getattr(dta, key))
         result = cls._simple_new(dta, **d)
     else:
         with warnings.catch_warnings():

From 3ec35fb328df129082a64731701750d3bcb09150 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Feb 2021 10:20:01 -0800
Subject: [PATCH 4/6] remove outdated comment

---
 pandas/core/arrays/period.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 97f049b8bfd2e..d14da71a26b7b 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -224,7 +224,6 @@ def _simple_new(
         return cls(values, freq=freq, dtype=dtype)
 
     def _from_backing_data(self: PeriodArray, arr: np.ndarray) -> PeriodArray:
-        # Note: we do not retain `freq`
         return type(self)._simple_new(arr, dtype=self.dtype)
 
     @classmethod

From 7b79120e3b7086d6b0ea7809ef52feb47c4dcd2f Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Feb 2021 10:39:50 -0800
Subject: [PATCH 5/6] update comment

---
 pandas/_libs/lib.pyx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 39e2251ca37d0..11b4677562f25 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2655,15 +2655,17 @@ cdef class NDArrayBacked:
 
     obj = cls._simpler_new(arr, arr.dtype)
 
+    # for foo in [arr, dta, obj]: ...
+    %timeit foo.copy()
-    299 ns ± 30 ns per loop     # <-- arr underlying ndarray
+    299 ns ± 30 ns per loop     # <-- arr underlying ndarray (for reference)
     530 ns ± 9.24 ns per loop   # <-- dta with cython NDArrayBacked
     1.66 µs ± 46.3 ns per loop  # <-- dta without cython NDArrayBacked
     328 ns ± 5.29 ns per loop   # <-- obj with NDArrayBacked.__cinit__
     371 ns ± 6.97 ns per loop   # <-- obj with NDArrayBacked._simpler_new
 
     %timeit foo.T
-    125 ns ± 6.27 ns per loop   # <-- arr underlying ndarray
+    125 ns ± 6.27 ns per loop   # <-- arr underlying ndarray (for reference)
     226 ns ± 7.66 ns per loop   # <-- dta with cython NDArrayBacked
     911 ns ± 16.6 ns per loop   # <-- dta without cython NDArrayBacked
     215 ns ± 4.54 ns per loop   # <-- obj with NDArrayBacked._simpler_new

From 7d3019a030019a5e631394ff5983d9b1b7e31d80 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Feb 2021 19:04:47 -0800
Subject: [PATCH 6/6] mypy fixup

---
 pandas/core/arrays/datetimelike.py | 11 ++++++-----
 pandas/core/dtypes/cast.py         |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 162760d92ff2f..bf20769a5a902 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -167,6 +167,11 @@ def _simple_new(
     ) -> DatetimeLikeArrayT:
         raise AbstractMethodError(cls)
 
+    def __init__(cls, values, dtype=np.dtype("M8[ns]"), freq=None, copy=False):
+        # This is just for mypy
+        # TODO: make default dtype subclass-specific
+        pass
+
     @property
     def _scalar_type(self) -> Type[DatetimeLikeScalar]:
         """
@@ -1767,11 +1772,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
             uniques = self.copy()  # TODO: copy or view?
             if sort and self.freq.n < 0:
                 codes = codes[::-1]
-                # TODO: overload __getitem__, a slice indexer returns same type as self
-                # error: Incompatible types in assignment (expression has type
-                # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
-                # has type "TimelikeOps")
-                uniques = uniques[::-1]  # type: ignore[assignment]
+                uniques = uniques[::-1]
             return codes, uniques
         # FIXME: shouldn't get here; we are ignoring sort
         return super().factorize(na_sentinel=na_sentinel)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index b30dbe32eec4b..27c338a745b50 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1024,7 +1024,10 @@ def astype_dt64_to_dt64tz(
 
             # FIXME: GH#33401 this doesn't match DatetimeArray.astype, which
             #  goes through the `not via_utc` path
-            return values.tz_localize("UTC").tz_convert(dtype.tz)
+            # error: "ExtensionArray" has no attribute "tz_localize"
+            values = values.tz_localize("UTC")  # type:ignore[attr-defined]
+            # error: "ExtensionArray" has no attribute "tz_convert"
+            return values.tz_convert(dtype.tz)  # type:ignore[attr-defined]
 
         else:
             # DatetimeArray/DatetimeIndex.astype behavior
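
To make the pickle-compat contract in this series concrete: both __setstate__
implementations accept old-style dict states keyed by "_data" as well as
new-style ones keyed by "_ndarray", and PeriodArray states arrive wrapped in a
1-tuple (the cython version additionally accepts a (data, dtype, extra-dict)
3-tuple, handled in its tuple branch). A minimal standalone sketch of that
dispatch, illustrative only -- setstate_compat and Dummy are made-up names,
not pandas API:

    import numpy as np

    def setstate_compat(obj, state):
        # normalize the legacy "_data" key to the new "_ndarray" attribute
        if isinstance(state, dict):
            if "_data" in state and "_ndarray" not in state:
                state["_ndarray"] = state.pop("_data")
            for key, value in state.items():
                setattr(obj, key, value)
        elif isinstance(state, tuple) and len(state) == 1:
            # PeriodArray-style states wrap the state dict in a 1-tuple
            setstate_compat(obj, state[0])
        else:
            raise TypeError(state)

    class Dummy:
        pass

    obj = Dummy()
    setstate_compat(obj, {"_data": np.array([0, 1], dtype="M8[ns]"), "_freq": None})
    assert isinstance(obj._ndarray, np.ndarray)  # legacy key renamed on load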