Skip to content

Commit 9e52128

Browse files
committed
Merge master.
2 parents d1ecca0 + 08e4baf commit 9e52128

File tree

111 files changed

+3453
-1913
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+3453
-1913
lines changed

asv_bench/benchmarks/algorithms.py

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas._libs import lib
66

77
import pandas as pd
8+
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
89

910
from .pandas_vb_common import tm
1011

@@ -174,4 +175,15 @@ def time_argsort(self, N):
174175
self.array.argsort()
175176

176177

178+
class RemoveDuplicates:
179+
def setup(self):
180+
N = 10 ** 5
181+
na = np.arange(int(N / 2))
182+
self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
183+
self.right = np.concatenate([na, na])
184+
185+
def time_make_duplicates_of_left_unique_in_right(self):
186+
make_duplicates_of_left_unique_in_right(self.left, self.right)
187+
188+
177189
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import string
2+
import sys
13
import warnings
24

35
import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
6769
pd.Categorical(self.series)
6870

6971

72+
class AsType:
73+
def setup(self):
74+
N = 10 ** 5
75+
76+
random_pick = np.random.default_rng().choice
77+
78+
categories = {
79+
"str": list(string.ascii_letters),
80+
"int": np.random.randint(2 ** 16, size=154),
81+
"float": sys.maxsize * np.random.random((38,)),
82+
"timestamp": [
83+
pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
84+
],
85+
}
86+
87+
self.df = pd.DataFrame(
88+
{col: random_pick(cats, N) for col, cats in categories.items()}
89+
)
90+
91+
for col in ("int", "float", "timestamp"):
92+
self.df[col + "_as_str"] = self.df[col].astype(str)
93+
94+
for col in self.df.columns:
95+
self.df[col] = self.df[col].astype("category")
96+
97+
def astype_str(self):
98+
[self.df[col].astype("str") for col in "int float timestamp".split()]
99+
100+
def astype_int(self):
101+
[self.df[col].astype("int") for col in "int_as_str timestamp".split()]
102+
103+
def astype_float(self):
104+
[
105+
self.df[col].astype("float")
106+
for col in "float_as_str int int_as_str timestamp".split()
107+
]
108+
109+
def astype_datetime(self):
110+
self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
111+
112+
70113
class Concat:
71114
def setup(self):
72115
N = 10 ** 5

asv_bench/benchmarks/rolling.py

+13
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,17 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyEWM:
229+
230+
params = ["cython", "numba"]
231+
param_names = ["engine"]
232+
233+
def setup(self, engine):
234+
df = pd.DataFrame({"A": range(50), "B": range(50)})
235+
self.gb_ewm = df.groupby("A").ewm(com=1.0)
236+
237+
def time_groupby_mean(self, engine):
238+
self.gb_ewm.mean(engine=engine)
239+
240+
228241
from .pandas_vb_common import setup # noqa: F401 isort:skip

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s
2525
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2626
# GH#37455 windows py38 build appears to be running out of memory
2727
# skip collection of window tests
28-
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/"
28+
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
2929
fi
3030

3131
echo $PYTEST_CMD

doc/source/ecosystem.rst

+10
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,16 @@ D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
178178
& Google Colab. Here are some demos of the `grid <http://alphatechadmin.pythonanywhere.com/>`__
179179
and `chart-builder <http://alphatechadmin.pythonanywhere.com/charts/4?chart_type=surface&query=&x=date&z=Col0&agg=raw&cpg=false&y=%5B%22security_id%22%5D>`__.
180180

181+
`hvplot <https://hvplot.holoviz.org/index.html>`__
182+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183+
184+
hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews <https://holoviews.org/>`__.
185+
It can be loaded as a native pandas plotting backend via
186+
187+
.. code:: python
188+
189+
pd.set_option("plotting.backend", "hvplot")
190+
181191
.. _ecosystem.ide:
182192

183193
IDE

doc/source/user_guide/indexing.rst

+34
Original file line numberDiff line numberDiff line change
@@ -1158,6 +1158,40 @@ Mask
11581158
s.mask(s >= 0)
11591159
df.mask(df >= 0)
11601160
1161+
.. _indexing.np_where:
1162+
1163+
Setting with enlargement conditionally using NumPy
1164+
----------------------------------------------------------
1165+
1166+
An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
1167+
Combined with setting a new column, you can use it to enlarge a dataframe where the
1168+
values are determined conditionally.
1169+
1170+
Suppose you have two choices to choose from in the following dataframe, and you want to
1171+
set a new column color to 'green' when the second column has 'Z'. You can do the
1172+
following:
1173+
1174+
.. ipython:: python
1175+
1176+
df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
1177+
df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
1178+
df
1179+
1180+
If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Say
1181+
corresponding to three conditions there are three choices of colors, with a fourth color
1182+
as a fallback, you can do the following.
1183+
1184+
.. ipython:: python
1185+
1186+
conditions = [
1187+
(df['col2'] == 'Z') & (df['col1'] == 'A'),
1188+
(df['col2'] == 'Z') & (df['col1'] == 'B'),
1189+
(df['col1'] == 'B')
1190+
]
1191+
choices = ['yellow', 'blue', 'purple']
1192+
df['color'] = np.select(conditions, choices, default='black')
1193+
df
1194+
11611195
.. _indexing.query:
11621196

11631197
The :meth:`~pandas.DataFrame.query` Method

doc/source/user_guide/merging.rst

+1-8
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ behavior:
194194
},
195195
index=[2, 3, 6, 7],
196196
)
197-
result = pd.concat([df1, df4], axis=1, sort=False)
197+
result = pd.concat([df1, df4], axis=1)
198198
199199
200200
.. ipython:: python
@@ -204,13 +204,6 @@ behavior:
204204
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
205205
plt.close("all");
206206
207-
.. warning::
208-
209-
The default behavior with ``join='outer'`` is to sort the other axis
210-
(columns in this case). In a future version of pandas, the default will
211-
be to not sort. We specified ``sort=False`` to opt in to the new
212-
behavior now.
213-
214207
Here is the same thing with ``join='inner'``:
215208

216209
.. ipython:: python

doc/source/user_guide/window.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Concept Method Returned Object
4343
Rolling window ``rolling`` ``Rolling`` Yes Yes
4444
Weighted window ``rolling`` ``Window`` No No
4545
Expanding window ``expanding`` ``Expanding`` No Yes
46-
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No No
46+
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2)
4747
============================= ================= =========================== =========================== ========================
4848

4949
As noted above, some operations support specifying a window based on a time offset:

doc/source/whatsnew/v1.2.0.rst

+30-3
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,23 @@ example where the index name is preserved:
204204
The same is true for :class:`MultiIndex`, but the logic is applied separately on a
205205
level-by-level basis.
206206

207+
.. _whatsnew_120.groupby_ewm:
208+
209+
Groupby supports EWM operations directly
210+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211+
212+
:class:`DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`).
213+
214+
.. ipython:: python
215+
216+
df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)})
217+
df
218+
df.groupby('A').ewm(com=1.0).mean()
219+
220+
Additionally ``mean`` supports execution via `Numba <https://numba.pydata.org/>`__ with
221+
the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency
222+
to use this feature.
223+
207224
.. _whatsnew_120.enhancements.other:
208225

209226
Other enhancements
@@ -221,6 +238,7 @@ Other enhancements
221238
-
222239
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
223240
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
241+
- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
224242
- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
225243
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
226244
- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
@@ -457,7 +475,8 @@ Deprecations
457475
- :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`)
458476
- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`)
459477
- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`)
460-
- Partial slicing on unordered :class:`DatetimeIndexes` with keys, which are not in Index is deprecated and will be removed in a future version (:issue:`18531`)
478+
- Partial slicing on unordered :class:`DatetimeIndex` with keys that are not in the Index is deprecated and will be removed in a future version (:issue:`18531`)
479+
- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
461480
- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
462481

463482
.. ---------------------------------------------------------------------------
@@ -482,6 +501,7 @@ Performance improvements
482501
- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`)
483502
- Faster ``dir`` calls when there are many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
484503
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
504+
- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`)
485505
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
486506
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
487507

@@ -567,7 +587,7 @@ Interval
567587

568588
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`)
569589
- Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`)
570-
-
590+
- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`)
571591
-
572592

573593
Indexing
@@ -590,6 +610,7 @@ Indexing
590610
- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
591611
- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty :class:`DataFrame` with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`)
592612
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
613+
- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`)
593614

594615
Missing
595616
^^^^^^^
@@ -603,6 +624,7 @@ MultiIndex
603624
- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
604625
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
605626
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)
627+
- Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when a non-existent key was given as input (:issue:`18853`)
606628

607629
I/O
608630
^^^
@@ -631,6 +653,8 @@ I/O
631653
- Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`)
632654
- :meth:`to_excel` and :meth:`to_markdown` support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
633655
- Bug in :meth:`read_fwf` was not skipping blank lines (even with ``skip_blank_lines=True``) (:issue:`37758`)
656+
- Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`)
657+
- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other ``read_*`` functions (:issue:`37909`)
634658

635659
Period
636660
^^^^^^
@@ -643,8 +667,10 @@ Plotting
643667
- Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`)
644668
- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`)
645669
- Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`)
670+
- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing :exc:`ValueError` with a :class:`Series` or :class:`DataFrame`
671+
indexed by a :class:`TimedeltaIndex` with a fixed frequency when x-axis lower limit was greater than upper limit (:issue:`37454`)
646672
- Bug in :meth:`DataFrameGroupBy.boxplot` when ``subplots=False``, a ``KeyError`` would be raised (:issue:`16748`)
647-
673+
- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behaviour when no sharey parameter was passed (:issue:`37942`)
648674

649675
Groupby/resample/rolling
650676
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -687,6 +713,7 @@ Reshaping
687713
- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
688714
- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
689715
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
716+
- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the inputs had a non-unique index (:issue:`36263`)
690717

691718
Sparse
692719
^^^^^^

pandas/_libs/interval.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ cdef class IntervalMixin:
179179
return (self.right == self.left) & (self.closed != 'both')
180180

181181
def _check_closed_matches(self, other, name='other'):
182-
"""Check if the closed attribute of `other` matches.
182+
"""
183+
Check if the closed attribute of `other` matches.
183184
184185
Note that 'left' and 'right' are considered different from 'both'.
185186

pandas/_libs/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -636,7 +636,7 @@ cpdef ndarray[object] ensure_string_array(
636636
----------
637637
arr : array-like
638638
The values to be converted to str, if needed.
639-
na_value : Any
639+
na_value : Any, default np.nan
640640
The value to use for na. For example, np.nan or pd.NA.
641641
convert_na_value : bool, default True
642642
If False, existing na values will be used unchanged in the new array.

pandas/_libs/window/aggregations.pyx

+14-7
Original file line numberDiff line numberDiff line change
@@ -1496,15 +1496,17 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights,
14961496
# ----------------------------------------------------------------------
14971497
# Exponentially weighted moving average
14981498

1499-
def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times,
1500-
int64_t halflife):
1499+
def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end,
1500+
int minp, ndarray[int64_t] times, int64_t halflife):
15011501
"""
15021502
Compute exponentially-weighted moving average using halflife and time
15031503
distances.
15041504
15051505
Parameters
15061506
----------
15071507
vals : ndarray[float_64]
1508+
start: ndarray[int_64]
1509+
end: ndarray[int_64]
15081510
minp : int
15091511
times : ndarray[int64]
15101512
halflife : int64
@@ -1552,17 +1554,20 @@ def ewma_time(const float64_t[:] vals, int minp, ndarray[int64_t] times,
15521554
return output
15531555

15541556

1555-
def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp):
1557+
def ewma(float64_t[:] vals, int64_t[:] start, int64_t[:] end, int minp,
1558+
float64_t com, bint adjust, bint ignore_na):
15561559
"""
15571560
Compute exponentially-weighted moving average using center-of-mass.
15581561
15591562
Parameters
15601563
----------
15611564
vals : ndarray (float64 type)
1565+
start: ndarray (int64 type)
1566+
end: ndarray (int64 type)
1567+
minp : int
15621568
com : float64
15631569
adjust : int
15641570
ignore_na : bool
1565-
minp : int
15661571
15671572
Returns
15681573
-------
@@ -1620,19 +1625,21 @@ def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp
16201625
# Exponentially weighted moving covariance
16211626

16221627

1623-
def ewmcov(float64_t[:] input_x, float64_t[:] input_y,
1624-
float64_t com, bint adjust, bint ignore_na, int minp, bint bias):
1628+
def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp,
1629+
float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias):
16251630
"""
16261631
Compute exponentially-weighted moving variance using center-of-mass.
16271632
16281633
Parameters
16291634
----------
16301635
input_x : ndarray (float64 type)
1636+
start: ndarray (int64 type)
1637+
end: ndarray (int64 type)
1638+
minp : int
16311639
input_y : ndarray (float64 type)
16321640
com : float64
16331641
adjust : int
16341642
ignore_na : bool
1635-
minp : int
16361643
bias : int
16371644
16381645
Returns

0 commit comments

Comments
 (0)