Skip to content

Commit 0cff41e

Browse files
authored
Merge branch 'master' into master
2 parents 77fc4a3 + 91abd0a commit 0cff41e

File tree

444 files changed

+14254
-9213
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

444 files changed

+14254
-9213
lines changed

.github/workflows/ci.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
steps:
1919

2020
- name: Setting conda path
21-
run: echo "::add-path::${HOME}/miniconda3/bin"
21+
run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
2222

2323
- name: Checkout
2424
uses: actions/checkout@v1
@@ -98,7 +98,7 @@ jobs:
9898
steps:
9999

100100
- name: Setting conda path
101-
run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}"
101+
run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
102102

103103
- name: Checkout
104104
uses: actions/checkout@v1

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
*.log
1313
*.swp
1414
*.pdb
15+
*.zip
1516
.project
1617
.pydevproject
1718
.settings

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ repos:
2626
name: isort (cython)
2727
types: [cython]
2828
- repo: https://github.com/asottile/pyupgrade
29-
rev: v2.7.3
29+
rev: v2.7.4
3030
hooks:
3131
- id: pyupgrade
3232
args: [--py37-plus]

.travis.yml

+1-6
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,6 @@ matrix:
3535
fast_finish: true
3636

3737
include:
38-
- dist: bionic
39-
python: 3.9-dev
40-
env:
41-
- JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
42-
4338
- env:
4439
- JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1"
4540
services:
@@ -94,7 +89,7 @@ install:
9489
script:
9590
- echo "script start"
9691
- echo "$JOB"
97-
- if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi
92+
- source activate pandas-dev
9893
- ci/run_tests.sh
9994

10095
after_script:

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,5 @@ RUN conda env update -n base -f "$pandas_home/environment.yml"
4343

4444
# Build C extensions and pandas
4545
RUN cd "$pandas_home" \
46-
&& python setup.py build_ext --inplace -j 4 \
46+
&& python setup.py build_ext -j 4 \
4747
&& python -m pip install -e .

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ clean_pyc:
99
-find . -name '*.py[co]' -exec rm {} \;
1010

1111
build: clean_pyc
12-
python setup.py build_ext --inplace
12+
python setup.py build_ext
1313

1414
lint-diff:
1515
git diff upstream/master --name-only -- "*.py" | xargs flake8

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Here are just a few of the things that pandas does well:
6060
and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
6161
- [**Time series**][timeseries]-specific functionality: date range
6262
generation and frequency conversion, moving window statistics,
63-
date shifting and lagging.
63+
date shifting and lagging
6464

6565

6666
[missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data

asv_bench/benchmarks/algorithms.py

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas._libs import lib
66

77
import pandas as pd
8+
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
89

910
from .pandas_vb_common import tm
1011

@@ -174,4 +175,15 @@ def time_argsort(self, N):
174175
self.array.argsort()
175176

176177

178+
class RemoveDuplicates:
179+
def setup(self):
180+
N = 10 ** 5
181+
na = np.arange(int(N / 2))
182+
self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
183+
self.right = np.concatenate([na, na])
184+
185+
def time_make_duplicates_of_left_unique_in_right(self):
186+
make_duplicates_of_left_unique_in_right(self.left, self.right)
187+
188+
177189
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/categoricals.py

+43
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import string
2+
import sys
13
import warnings
24

35
import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
6769
pd.Categorical(self.series)
6870

6971

72+
class AsType:
73+
def setup(self):
74+
N = 10 ** 5
75+
76+
random_pick = np.random.default_rng().choice
77+
78+
categories = {
79+
"str": list(string.ascii_letters),
80+
"int": np.random.randint(2 ** 16, size=154),
81+
"float": sys.maxsize * np.random.random((38,)),
82+
"timestamp": [
83+
pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
84+
],
85+
}
86+
87+
self.df = pd.DataFrame(
88+
{col: random_pick(cats, N) for col, cats in categories.items()}
89+
)
90+
91+
for col in ("int", "float", "timestamp"):
92+
self.df[col + "_as_str"] = self.df[col].astype(str)
93+
94+
for col in self.df.columns:
95+
self.df[col] = self.df[col].astype("category")
96+
97+
def astype_str(self):
98+
[self.df[col].astype("str") for col in "int float timestamp".split()]
99+
100+
def astype_int(self):
101+
[self.df[col].astype("int") for col in "int_as_str timestamp".split()]
102+
103+
def astype_float(self):
104+
[
105+
self.df[col].astype("float")
106+
for col in "float_as_str int int_as_str timestamp".split()
107+
]
108+
109+
def astype_datetime(self):
110+
self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
111+
112+
70113
class Concat:
71114
def setup(self):
72115
N = 10 ** 5

asv_bench/benchmarks/groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ def setup(self):
486486
tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
487487
tmp = np.concatenate((tmp1, tmp2))
488488
arr = np.repeat(tmp, 10)
489-
self.df = DataFrame(dict(a=arr, b=arr))
489+
self.df = DataFrame({"a": arr, "b": arr})
490490

491491
def time_sum(self):
492492
self.df.groupby(["a"])["b"].sum()
+164
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
5+
6+
class IsinAlmostFullWithRandomInt:
7+
params = [
8+
[np.float64, np.int64, np.uint64, np.object],
9+
range(10, 21),
10+
]
11+
param_names = ["dtype", "exponent"]
12+
13+
def setup(self, dtype, exponent):
14+
M = 3 * 2 ** (exponent - 2)
15+
# 0.77-the maximal share of occupied buckets
16+
np.random.seed(42)
17+
self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
18+
self.values = np.random.randint(0, M, M).astype(dtype)
19+
self.values_outside = self.values + M
20+
21+
def time_isin(self, dtype, exponent):
22+
self.s.isin(self.values)
23+
24+
def time_isin_outside(self, dtype, exponent):
25+
self.s.isin(self.values_outside)
26+
27+
28+
class IsinWithRandomFloat:
29+
params = [
30+
[np.float64, np.object],
31+
[
32+
1_300,
33+
2_000,
34+
7_000,
35+
8_000,
36+
70_000,
37+
80_000,
38+
750_000,
39+
900_000,
40+
],
41+
]
42+
param_names = ["dtype", "M"]
43+
44+
def setup(self, dtype, M):
45+
np.random.seed(42)
46+
self.values = np.random.rand(M)
47+
self.s = pd.Series(self.values).astype(dtype)
48+
np.random.shuffle(self.values)
49+
self.values_outside = self.values + 0.1
50+
51+
def time_isin(self, dtype, M):
52+
self.s.isin(self.values)
53+
54+
def time_isin_outside(self, dtype, M):
55+
self.s.isin(self.values_outside)
56+
57+
58+
class IsinWithArangeSorted:
59+
params = [
60+
[np.float64, np.int64, np.uint64, np.object],
61+
[
62+
1_000,
63+
2_000,
64+
8_000,
65+
100_000,
66+
1_000_000,
67+
],
68+
]
69+
param_names = ["dtype", "M"]
70+
71+
def setup(self, dtype, M):
72+
self.s = pd.Series(np.arange(M)).astype(dtype)
73+
self.values = np.arange(M).astype(dtype)
74+
75+
def time_isin(self, dtype, M):
76+
self.s.isin(self.values)
77+
78+
79+
class IsinWithArange:
80+
params = [
81+
[np.float64, np.int64, np.uint64, np.object],
82+
[
83+
1_000,
84+
2_000,
85+
8_000,
86+
],
87+
[-2, 0, 2],
88+
]
89+
param_names = ["dtype", "M", "offset_factor"]
90+
91+
def setup(self, dtype, M, offset_factor):
92+
offset = int(M * offset_factor)
93+
np.random.seed(42)
94+
tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
95+
self.s = tmp.astype(dtype)
96+
self.values = np.arange(M).astype(dtype)
97+
98+
def time_isin(self, dtype, M, offset_factor):
99+
self.s.isin(self.values)
100+
101+
102+
class Float64GroupIndex:
103+
# GH28303
104+
def setup(self):
105+
self.df = pd.date_range(
106+
start="1/1/2018", end="1/2/2018", periods=1e6
107+
).to_frame()
108+
self.group_index = np.round(self.df.index.astype(int) / 1e9)
109+
110+
def time_groupby(self):
111+
self.df.groupby(self.group_index).last()
112+
113+
114+
class UniqueAndFactorizeArange:
115+
params = range(4, 16)
116+
param_names = ["exponent"]
117+
118+
def setup(self, exponent):
119+
a = np.arange(10 ** 4, dtype="float64")
120+
self.a2 = (a + 10 ** exponent).repeat(100)
121+
122+
def time_factorize(self, exponent):
123+
pd.factorize(self.a2)
124+
125+
def time_unique(self, exponent):
126+
pd.unique(self.a2)
127+
128+
129+
class NumericSeriesIndexing:
130+
131+
params = [
132+
(pd.Int64Index, pd.UInt64Index, pd.Float64Index),
133+
(10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
134+
]
135+
param_names = ["index_dtype", "N"]
136+
137+
def setup(self, index, N):
138+
vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
139+
indices = index(vals)
140+
self.data = pd.Series(np.arange(N), index=indices)
141+
142+
def time_loc_slice(self, index, N):
143+
# trigger building of mapping
144+
self.data.loc[:800]
145+
146+
147+
class NumericSeriesIndexingShuffled:
148+
149+
params = [
150+
(pd.Int64Index, pd.UInt64Index, pd.Float64Index),
151+
(10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
152+
]
153+
param_names = ["index_dtype", "N"]
154+
155+
def setup(self, index, N):
156+
vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
157+
np.random.seed(42)
158+
np.random.shuffle(vals)
159+
indices = index(vals)
160+
self.data = pd.Series(np.arange(N), index=indices)
161+
162+
def time_loc_slice(self, index, N):
163+
# trigger building of mapping
164+
self.data.loc[:800]

asv_bench/benchmarks/join_merge.py

+6
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort):
132132
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
133133
self.df_shuf.join(self.df_key2, on="key2", sort=sort)
134134

135+
def time_join_dataframes_cross(self, sort):
136+
self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort)
137+
135138

136139
class JoinIndex:
137140
def setup(self):
@@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort):
205208
def time_merge_dataframe_integer_key(self, sort):
206209
merge(self.df, self.df2, on="key1", sort=sort)
207210

211+
def time_merge_dataframes_cross(self, sort):
212+
merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)
213+
208214

209215
class I8Merge:
210216

asv_bench/benchmarks/reshape.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,10 @@ def setup(self):
103103
nidvars = 20
104104
N = 5000
105105
self.letters = list("ABCD")
106-
yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
106+
yrvars = [
107+
letter + str(num)
108+
for letter, num in product(self.letters, range(1, nyrs + 1))
109+
]
107110
columns = [str(i) for i in range(nidvars)] + yrvars
108111
self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
109112
self.df["id"] = self.df.index

asv_bench/benchmarks/rolling.py

+13
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,17 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyEWM:
229+
230+
params = ["cython", "numba"]
231+
param_names = ["engine"]
232+
233+
def setup(self, engine):
234+
df = pd.DataFrame({"A": range(50), "B": range(50)})
235+
self.gb_ewm = df.groupby("A").ewm(com=1.0)
236+
237+
def time_groupby_mean(self, engine):
238+
self.gb_ewm.mean(engine=engine)
239+
240+
228241
from .pandas_vb_common import setup # noqa: F401 isort:skip

0 commit comments

Comments
 (0)