Skip to content

Commit f5b994a

Browse files
committed
Merge remote-tracking branch 'upstream/master' into fix-issue-32621
2 parents b62ad89 + d537d77 commit f5b994a

32 files changed

+1008
-385
lines changed

asv_bench/benchmarks/multiindex_object.py

+28
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,38 @@ def setup(self):
7474
],
7575
dtype=object,
7676
)
77+
self.other_mi_many_mismatches = MultiIndex.from_tuples(
78+
[
79+
(-7, 41),
80+
(-2, 3),
81+
(-0.7, 5),
82+
(0, 0),
83+
(0, 1.5),
84+
(0, 340),
85+
(0, 1001),
86+
(1, -4),
87+
(1, 20),
88+
(1, 1040),
89+
(432, -5),
90+
(432, 17),
91+
(439, 165.5),
92+
(998, -4),
93+
(998, 24065),
94+
(999, 865.2),
95+
(999, 1000),
96+
(1045, -843),
97+
]
98+
)
7799

78100
def time_get_indexer(self):
79101
self.mi_int.get_indexer(self.obj_index)
80102

103+
def time_get_indexer_and_backfill(self):
104+
self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill")
105+
106+
def time_get_indexer_and_pad(self):
107+
self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad")
108+
81109
def time_is_monotonic(self):
82110
self.mi_int.is_monotonic
83111

asv_bench/benchmarks/rolling.py

+22
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,26 @@ def peakmem_fixed(self):
165165
self.roll.max()
166166

167167

168+
class ForwardWindowMethods:
169+
params = (
170+
["DataFrame", "Series"],
171+
[10, 1000],
172+
["int", "float"],
173+
["median", "mean", "max", "min", "kurt", "sum"],
174+
)
175+
param_names = ["constructor", "window_size", "dtype", "method"]
176+
177+
def setup(self, constructor, window_size, dtype, method):
178+
N = 10 ** 5
179+
arr = np.random.random(N).astype(dtype)
180+
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=window_size)
181+
self.roll = getattr(pd, constructor)(arr).rolling(window=indexer)
182+
183+
def time_rolling(self, constructor, window_size, dtype, method):
184+
getattr(self.roll, method)()
185+
186+
def peakmem_rolling(self, constructor, window_size, dtype, method):
187+
getattr(self.roll, method)()
188+
189+
168190
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/reference/window.rst

+1
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,4 @@ Base class for defining custom window boundaries.
8585
:toctree: api/
8686

8787
api.indexers.BaseIndexer
88+
api.indexers.FixedForwardWindowIndexer

doc/source/user_guide/computation.rst

+14
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,20 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other
571571
3 3.0
572572
4 10.0
573573
574+
.. versionadded:: 1.1
575+
576+
For some problems knowledge of the future is available for analysis. For example, this occurs when
577+
each data point is a full time series read from an experiment, and the task is to extract underlying
578+
conditions. In these cases it can be useful to perform forward-looking rolling window computations.
579+
The :class:`FixedForwardWindowIndexer <pandas.api.indexers.FixedForwardWindowIndexer>` class is available for this purpose.
580+
This :class:`BaseIndexer <pandas.api.indexers.BaseIndexer>` subclass implements a closed fixed-width
581+
forward-looking rolling window, and we can use it as follows:
582+
583+
.. ipython:: python
584+
585+
from pandas.api.indexers import FixedForwardWindowIndexer
586+
indexer = FixedForwardWindowIndexer(window_size=2)
587+
df.rolling(indexer, min_periods=1).sum()
574588

575589
.. _stats.rolling_window.endpoints:
576590

doc/source/whatsnew/v1.1.0.rst

+64-1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Other API changes
109109
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
110110
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
111111
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
112+
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
112113
-
113114

114115
Backwards incompatible API changes
@@ -120,6 +121,67 @@ Backwards incompatible API changes
120121
Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
121122
- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
122123
- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
124+
125+
``MultiIndex.get_indexer`` interprets `method` argument differently
126+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127+
128+
This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`).
129+
130+
As an example of this, given:
131+
132+
.. ipython:: python
133+
134+
df = pd.DataFrame({
135+
'a': [0, 0, 0, 0],
136+
'b': [0, 2, 3, 4],
137+
'c': ['A', 'B', 'C', 'D'],
138+
}).set_index(['a', 'b'])
139+
mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]])
140+
141+
The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here:
142+
143+
*pandas >= 0.23, < 1.1.0*
144+
145+
.. code-block:: ipython
146+
147+
In [1]: df.reindex(mi_2, method='backfill')
148+
Out[1]:
149+
c
150+
0 -1 A
151+
0 A
152+
1 D
153+
3 A
154+
4 A
155+
5 C
156+
157+
*pandas < 0.23, >= 1.1.0*
158+
159+
.. ipython:: python
160+
161+
df.reindex(mi_2, method='backfill')
162+
163+
And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here:
164+
165+
*pandas >= 0.23, < 1.1.0*
166+
167+
.. code-block:: ipython
168+
169+
In [1]: df.reindex(mi_2, method='pad')
170+
Out[1]:
171+
c
172+
0 -1 NaN
173+
0 NaN
174+
1 D
175+
3 NaN
176+
4 A
177+
5 C
178+
179+
*pandas < 0.23, >= 1.1.0*
180+
181+
.. ipython:: python
182+
183+
df.reindex(mi_2, method='pad')
184+
123185
-
124186

125187
.. _whatsnew_110.api_breaking.indexing_raises_key_errors:
@@ -274,7 +336,7 @@ Deprecations
274336
version 1.1. All other arguments should be given as keyword
275337
arguments (:issue:`27573`).
276338

277-
-
339+
- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`33385`)
278340

279341
.. ---------------------------------------------------------------------------
280342
@@ -434,6 +496,7 @@ I/O
434496
- Bug in :meth:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (:issue:`33069`)
435497
- Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`)
436498
- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`)
499+
- Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`)
437500

438501
Plotting
439502
^^^^^^^^

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ dependencies:
8686
- lxml
8787

8888
# pd.read_excel, DataFrame.to_excel, pd.ExcelWriter, pd.ExcelFile
89-
- openpyxl<=3.0.1
89+
- openpyxl
9090
- xlrd
9191
- xlsxwriter
9292
- xlwt

pandas/_libs/index.pyx

+104-16
Original file line numberDiff line numberDiff line change
@@ -612,25 +612,113 @@ cdef class BaseMultiIndexCodesEngine:
612612
in zip(self.levels, zip(*target))]
613613
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
614614

615-
def get_indexer(self, object target, object method=None,
616-
object limit=None):
615+
def get_indexer_no_fill(self, object target) -> np.ndarray:
616+
"""
617+
Returns an array giving the positions of each value of `target` in
618+
`self.values`, where -1 represents a value in `target` which does not
619+
appear in `self.values`
620+
621+
Parameters
622+
----------
623+
target : list-like of keys
624+
Each key is a tuple, with a label for each level of the index
625+
626+
Returns
627+
-------
628+
np.ndarray[int64_t, ndim=1] of the indexer of `target` into
629+
`self.values`
630+
"""
617631
lab_ints = self._extract_level_codes(target)
632+
return self._base.get_indexer(self, lab_ints)
618633

619-
# All methods (exact, backfill, pad) directly map to the respective
620-
# methods of the underlying (integers) index...
621-
if method is not None:
622-
# but underlying backfill and pad methods require index and keys
623-
# to be sorted. The index already is (checked in
624-
# Index._get_fill_indexer), sort (integer representations of) keys:
625-
order = np.argsort(lab_ints)
626-
lab_ints = lab_ints[order]
627-
indexer = (getattr(self._base, f'get_{method}_indexer')
628-
(self, lab_ints, limit=limit))
629-
indexer = indexer[order]
630-
else:
631-
indexer = self._base.get_indexer(self, lab_ints)
634+
def get_indexer(self, object target, object values = None,
635+
object method = None, object limit = None) -> np.ndarray:
636+
"""
637+
Returns an array giving the positions of each value of `target` in
638+
`values`, where -1 represents a value in `target` which does not
639+
appear in `values`
632640

633-
return indexer
641+
If `method` is "backfill" then the position for a value in `target`
642+
which does not appear in `values` is that of the next greater value
643+
in `values` (if one exists), and -1 if there is no such value.
644+
645+
Similarly, if the method is "pad" then the position for a value in
646+
`target` which does not appear in `values` is that of the next smaller
647+
value in `values` (if one exists), and -1 if there is no such value.
648+
649+
Parameters
650+
----------
651+
target: list-like of tuples
652+
need not be sorted, but all must have the same length, which must be
653+
the same as the length of all tuples in `values`
654+
values : list-like of tuples
655+
must be sorted and all have the same length. Should be the set of
656+
the MultiIndex's values. Needed only if `method` is not None
657+
method: string
658+
"backfill" or "pad"
659+
limit: int, optional
660+
if provided, limit the number of fills to this value
661+
662+
Returns
663+
-------
664+
np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`,
665+
filled with the `method` (and optionally `limit`) specified
666+
"""
667+
if method is None:
668+
return self.get_indexer_no_fill(target)
669+
670+
assert method in ("backfill", "pad")
671+
cdef:
672+
int64_t i, j, next_code
673+
int64_t num_values, num_target_values
674+
ndarray[int64_t, ndim=1] target_order
675+
ndarray[object, ndim=1] target_values
676+
ndarray[int64_t, ndim=1] new_codes, new_target_codes
677+
ndarray[int64_t, ndim=1] sorted_indexer
678+
679+
target_order = np.argsort(target.values).astype('int64')
680+
target_values = target.values[target_order]
681+
num_values, num_target_values = len(values), len(target_values)
682+
new_codes, new_target_codes = (
683+
np.empty((num_values,)).astype('int64'),
684+
np.empty((num_target_values,)).astype('int64'),
685+
)
686+
687+
# `values` and `target_values` are both sorted, so we walk through them
688+
# and memoize the (ordered) set of indices in the (implicit) merged-and
689+
# sorted list of the two which belong to each of them
690+
# the effect of this is to create a factorization for the (sorted)
691+
# merger of the index values, where `new_codes` and `new_target_codes`
692+
# are the subset of the factors which appear in `values` and `target`,
693+
# respectively
694+
i, j, next_code = 0, 0, 0
695+
while i < num_values and j < num_target_values:
696+
val, target_val = values[i], target_values[j]
697+
if val <= target_val:
698+
new_codes[i] = next_code
699+
i += 1
700+
if target_val <= val:
701+
new_target_codes[j] = next_code
702+
j += 1
703+
next_code += 1
704+
705+
# at this point, at least one should have reached the end
706+
# the remaining values of the other should be added to the end
707+
assert i == num_values or j == num_target_values
708+
while i < num_values:
709+
new_codes[i] = next_code
710+
i += 1
711+
next_code += 1
712+
while j < num_target_values:
713+
new_target_codes[j] = next_code
714+
j += 1
715+
next_code += 1
716+
717+
# get the indexer, and undo the sorting of `target.values`
718+
sorted_indexer = (
719+
algos.backfill if method == "backfill" else algos.pad
720+
)(new_codes, new_target_codes, limit=limit).astype('int64')
721+
return sorted_indexer[np.argsort(target_order)]
634722

635723
def get_loc(self, object key):
636724
if is_definitely_invalid_key(key):

pandas/api/indexers/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
"""
44

55
from pandas.core.indexers import check_array_indexer
6-
from pandas.core.window.indexers import BaseIndexer
6+
from pandas.core.window.indexers import BaseIndexer, FixedForwardWindowIndexer
77

8-
__all__ = ["check_array_indexer", "BaseIndexer"]
8+
__all__ = ["check_array_indexer", "BaseIndexer", "FixedForwardWindowIndexer"]

pandas/core/dtypes/common.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def ensure_categorical(arr):
125125
cat_arr : The original array cast as a Categorical. If it already
126126
is a Categorical, we return as is.
127127
"""
128-
if not is_categorical(arr):
128+
if not is_categorical_dtype(arr.dtype):
129129
from pandas import Categorical
130130

131131
arr = Categorical(arr)
@@ -360,6 +360,12 @@ def is_categorical(arr) -> bool:
360360
>>> is_categorical(pd.CategoricalIndex([1, 2, 3]))
361361
True
362362
"""
363+
warnings.warn(
364+
"is_categorical is deprecated and will be removed in a future version. "
365+
"Use is_categorical_dtype instead",
366+
FutureWarning,
367+
stacklevel=2,
368+
)
363369
return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr)
364370

365371

@@ -1458,7 +1464,7 @@ def is_extension_type(arr) -> bool:
14581464
stacklevel=2,
14591465
)
14601466

1461-
if is_categorical(arr):
1467+
if is_categorical_dtype(arr):
14621468
return True
14631469
elif is_sparse(arr):
14641470
return True

pandas/core/dtypes/dtypes.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,6 @@ def _from_values_or_dtype(
337337
>>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
338338
CategoricalDtype(categories=['x', 'y'], ordered=False)
339339
"""
340-
from pandas.core.dtypes.common import is_categorical
341340

342341
if dtype is not None:
343342
# The dtype argument takes precedence over values.dtype (if any)
@@ -352,7 +351,7 @@ def _from_values_or_dtype(
352351
)
353352
elif not isinstance(dtype, CategoricalDtype):
354353
raise ValueError(f"Cannot not construct CategoricalDtype from {dtype}")
355-
elif is_categorical(values):
354+
elif cls.is_dtype(values):
356355
# If no "dtype" was passed, use the one from "values", but honor
357356
# the "ordered" and "categories" arguments
358357
dtype = values.dtype._from_categorical_dtype(

0 commit comments

Comments
 (0)