Skip to content

Commit 9fc5989

Browse files
Merge remote-tracking branch 'upstream/master' into GH28501
2 parents 5370b83 + 793b635 commit 9fc5989

File tree

139 files changed

+3701
-2672
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+3701
-2672
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
*.log
1313
*.swp
1414
*.pdb
15+
*.zip
1516
.project
1617
.pydevproject
1718
.settings

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ repos:
2626
name: isort (cython)
2727
types: [cython]
2828
- repo: https://github.com/asottile/pyupgrade
29-
rev: v2.7.3
29+
rev: v2.7.4
3030
hooks:
3131
- id: pyupgrade
3232
args: [--py37-plus]

asv_bench/benchmarks/algorithms.py

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas._libs import lib
66

77
import pandas as pd
8+
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
89

910
from .pandas_vb_common import tm
1011

@@ -174,4 +175,15 @@ def time_argsort(self, N):
174175
self.array.argsort()
175176

176177

178+
class RemoveDuplicates:
179+
def setup(self):
180+
N = 10 ** 5
181+
na = np.arange(int(N / 2))
182+
self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
183+
self.right = np.concatenate([na, na])
184+
185+
def time_make_duplicates_of_left_unique_in_right(self):
186+
make_duplicates_of_left_unique_in_right(self.left, self.right)
187+
188+
177189
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import string
2+
import sys
13
import warnings
24

35
import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
6769
pd.Categorical(self.series)
6870

6971

72+
class AsType:
73+
def setup(self):
74+
N = 10 ** 5
75+
76+
random_pick = np.random.default_rng().choice
77+
78+
categories = {
79+
"str": list(string.ascii_letters),
80+
"int": np.random.randint(2 ** 16, size=154),
81+
"float": sys.maxsize * np.random.random((38,)),
82+
"timestamp": [
83+
pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
84+
],
85+
}
86+
87+
self.df = pd.DataFrame(
88+
{col: random_pick(cats, N) for col, cats in categories.items()}
89+
)
90+
91+
for col in ("int", "float", "timestamp"):
92+
self.df[col + "_as_str"] = self.df[col].astype(str)
93+
94+
for col in self.df.columns:
95+
self.df[col] = self.df[col].astype("category")
96+
97+
def astype_str(self):
98+
[self.df[col].astype("str") for col in "int float timestamp".split()]
99+
100+
def astype_int(self):
101+
[self.df[col].astype("int") for col in "int_as_str timestamp".split()]
102+
103+
def astype_float(self):
104+
[
105+
self.df[col].astype("float")
106+
for col in "float_as_str int int_as_str timestamp".split()
107+
]
108+
109+
def astype_datetime(self):
110+
self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
111+
112+
70113
class Concat:
71114
def setup(self):
72115
N = 10 ** 5

asv_bench/benchmarks/rolling.py

+13
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,17 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyEWM:
229+
230+
params = ["cython", "numba"]
231+
param_names = ["engine"]
232+
233+
def setup(self, engine):
234+
df = pd.DataFrame({"A": range(50), "B": range(50)})
235+
self.gb_ewm = df.groupby("A").ewm(com=1.0)
236+
237+
def time_groupby_mean(self, engine):
238+
self.gb_ewm.mean(engine=engine)
239+
240+
228241
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/series_methods.py

+49
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self):
9090
self.s_long_floats.isin(self.vals_long_floats)
9191

9292

93+
class IsInLongSeriesLookUpDominates:
94+
params = [
95+
["int64", "int32", "float64", "float32", "object"],
96+
[5, 1000],
97+
["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
98+
]
99+
param_names = ["dtype", "MaxNumber", "series_type"]
100+
101+
def setup(self, dtype, MaxNumber, series_type):
102+
N = 10 ** 7
103+
if series_type == "random_hits":
104+
np.random.seed(42)
105+
array = np.random.randint(0, MaxNumber, N)
106+
if series_type == "random_misses":
107+
np.random.seed(42)
108+
array = np.random.randint(0, MaxNumber, N) + MaxNumber
109+
if series_type == "monotone_hits":
110+
array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
111+
if series_type == "monotone_misses":
112+
array = np.arange(N) + MaxNumber
113+
self.series = Series(array).astype(dtype)
114+
self.values = np.arange(MaxNumber).astype(dtype)
115+
116+
def time_isin(self, dtypes, MaxNumber, series_type):
117+
self.series.isin(self.values)
118+
119+
120+
class IsInLongSeriesValuesDominate:
121+
params = [
122+
["int64", "int32", "float64", "float32", "object"],
123+
["random", "monotone"],
124+
]
125+
param_names = ["dtype", "series_type"]
126+
127+
def setup(self, dtype, series_type):
128+
N = 10 ** 7
129+
if series_type == "random":
130+
np.random.seed(42)
131+
vals = np.random.randint(0, 10 * N, N)
132+
if series_type == "monotone":
133+
vals = np.arange(N)
134+
self.values = vals.astype(dtype)
135+
M = 10 ** 6 + 1
136+
self.series = Series(np.arange(M)).astype(dtype)
137+
138+
def time_isin(self, dtypes, series_type):
139+
self.series.isin(self.values)
140+
141+
93142
class NSort:
94143

95144
params = ["first", "last", "all"]

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s
2525
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2626
# GH#37455 windows py38 build appears to be running out of memory
2727
# skip collection of window and plotting tests
28-
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/"
28+
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
2929
fi
3030

3131
echo $PYTEST_CMD

doc/source/ecosystem.rst

+10
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,16 @@ D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
178178
& Google Colab. Here are some demos of the `grid <http://alphatechadmin.pythonanywhere.com/>`__
179179
and `chart-builder <http://alphatechadmin.pythonanywhere.com/charts/4?chart_type=surface&query=&x=date&z=Col0&agg=raw&cpg=false&y=%5B%22security_id%22%5D>`__.
180180

181+
`hvplot <https://hvplot.holoviz.org/index.html>`__
182+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183+
184+
hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews <https://holoviews.org/>`__.
185+
It can be loaded as a native pandas plotting backend via
186+
187+
.. code:: python
188+
189+
pd.set_option("plotting.backend", "hvplot")
190+
181191
.. _ecosystem.ide:
182192

183193
IDE

doc/source/user_guide/computation.rst

+7
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,10 @@ parameter:
205205
- ``min`` : lowest rank in the group
206206
- ``max`` : highest rank in the group
207207
- ``first`` : ranks assigned in the order they appear in the array
208+
209+
.. _computation.windowing:
210+
211+
Windowing functions
212+
~~~~~~~~~~~~~~~~~~~
213+
214+
See :ref:`the window operations user guide <window.overview>` for an overview of windowing functions.

doc/source/user_guide/indexing.rst

+34
Original file line numberDiff line numberDiff line change
@@ -1158,6 +1158,40 @@ Mask
11581158
s.mask(s >= 0)
11591159
df.mask(df >= 0)
11601160
1161+
.. _indexing.np_where:
1162+
1163+
Setting with enlargement conditionally using :mod:`numpy`
1164+
----------------------------------------------------------
1165+
1166+
An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`.
1167+
Combined with setting a new column, you can use it to enlarge a dataframe where the
1168+
values are determined conditionally.
1169+
1170+
Suppose you have two choices to choose from in the following dataframe, and you want to
1171+
set a new column color to 'green' when the second column has 'Z'. You can do the
1172+
following:
1173+
1174+
.. ipython:: python
1175+
1176+
df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')})
1177+
df['color'] = np.where(df['col2'] == 'Z', 'green', 'red')
1178+
df
1179+
1180+
If you have multiple conditions, you can use :func:`numpy.select` to achieve that. Say
1181+
corresponding to three conditions there are three choices of colors, with a fourth color
1182+
as a fallback, you can do the following.
1183+
1184+
.. ipython:: python
1185+
1186+
conditions = [
1187+
(df['col2'] == 'Z') & (df['col1'] == 'A'),
1188+
(df['col2'] == 'Z') & (df['col1'] == 'B'),
1189+
(df['col1'] == 'B')
1190+
]
1191+
choices = ['yellow', 'blue', 'purple']
1192+
df['color'] = np.select(conditions, choices, default='black')
1193+
df
1194+
11611195
.. _indexing.query:
11621196

11631197
The :meth:`~pandas.DataFrame.query` Method

doc/source/user_guide/merging.rst

+1-8
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ behavior:
194194
},
195195
index=[2, 3, 6, 7],
196196
)
197-
result = pd.concat([df1, df4], axis=1, sort=False)
197+
result = pd.concat([df1, df4], axis=1)
198198
199199
200200
.. ipython:: python
@@ -204,13 +204,6 @@ behavior:
204204
p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
205205
plt.close("all");
206206
207-
.. warning::
208-
209-
The default behavior with ``join='outer'`` is to sort the other axis
210-
(columns in this case). In a future version of pandas, the default will
211-
be to not sort. We specified ``sort=False`` to opt in to the new
212-
behavior now.
213-
214207
Here is the same thing with ``join='inner'``:
215208

216209
.. ipython:: python

doc/source/user_guide/window.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Concept Method Returned Object
4343
Rolling window ``rolling`` ``Rolling`` Yes Yes
4444
Weighted window ``rolling`` ``Window`` No No
4545
Expanding window ``expanding`` ``Expanding`` No Yes
46-
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No No
46+
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2)
4747
============================= ================= =========================== =========================== ========================
4848

4949
As noted above, some operations support specifying a window based on a time offset:

0 commit comments

Comments
 (0)