Skip to content

Commit 58de83c

Browse files
author
Sumanau Sareen
committed
Merge remote-tracking branch 'upstream/master' into BUG-32967
2 parents abef2d5 + 28e0f18 commit 58de83c

File tree

538 files changed

+19897
-15358
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

538 files changed

+19897
-15358
lines changed

.github/workflows/ci.yml

+20-23
Original file line number | Diff line number | Diff line change
@@ -125,35 +125,32 @@ jobs:
125125
- name: Check ipython directive errors
126126
run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log"
127127

128-
- name: Merge website and docs
129-
run: |
130-
mkdir -p pandas_web/docs
131-
cp -r web/build/* pandas_web/
132-
cp -r doc/build/html/* pandas_web/docs/
133-
if: github.event_name == 'push'
134-
135128
- name: Install Rclone
136129
run: sudo apt install rclone -y
137130
if: github.event_name == 'push'
138131

139132
- name: Set up Rclone
140133
run: |
141-
RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf
142-
mkdir -p `dirname $RCLONE_CONFIG_PATH`
143-
echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH
144-
echo "type = swift" >> $RCLONE_CONFIG_PATH
145-
echo "env_auth = false" >> $RCLONE_CONFIG_PATH
146-
echo "auth_version = 3" >> $RCLONE_CONFIG_PATH
147-
echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH
148-
echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH
149-
echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH
150-
echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH
151-
echo "domain = default" >> $RCLONE_CONFIG_PATH
152-
echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH
153-
echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH
154-
echo "region = BHS" >> $RCLONE_CONFIG_PATH
134+
CONF=$HOME/.config/rclone/rclone.conf
135+
mkdir -p `dirname $CONF`
136+
echo "[ovh_host]" > $CONF
137+
echo "type = swift" >> $CONF
138+
echo "env_auth = false" >> $CONF
139+
echo "auth_version = 3" >> $CONF
140+
echo "auth = https://auth.cloud.ovh.net/v3/" >> $CONF
141+
echo "endpoint_type = public" >> $CONF
142+
echo "tenant_domain = default" >> $CONF
143+
echo "tenant = 2977553886518025" >> $CONF
144+
echo "domain = default" >> $CONF
145+
echo "user = w4KGs3pmDxpd" >> $CONF
146+
echo "key = ${{ secrets.ovh_object_store_key }}" >> $CONF
147+
echo "region = BHS" >> $CONF
148+
if: github.event_name == 'push'
149+
150+
- name: Sync web with OVH
151+
run: rclone sync --exclude pandas-docs/** web/build ovh_host:prod
155152
if: github.event_name == 'push'
156153

157-
- name: Sync web
158-
run: rclone sync pandas_web ovh_cloud_pandas_web:dev
154+
- name: Sync dev docs with OVH
155+
run: rclone sync doc/build/html ovh_host:prod/pandas-docs/dev
159156
if: github.event_name == 'push'

LICENSES/HAVEN_LICENSE

+21-2
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,21 @@
1-
YEAR: 2013-2016
2-
COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
1+
# MIT License
2+
3+
Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

asv_bench/benchmarks/algorithms.py

+35-56
Original file line number | Diff line number | Diff line change
@@ -31,83 +31,62 @@ def time_maybe_convert_objects(self):
3131

3232
class Factorize:
3333

34-
params = [[True, False], ["int", "uint", "float", "string"]]
35-
param_names = ["sort", "dtype"]
36-
37-
def setup(self, sort, dtype):
38-
N = 10 ** 5
39-
data = {
40-
"int": pd.Int64Index(np.arange(N).repeat(5)),
41-
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
42-
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
43-
"string": tm.makeStringIndex(N).repeat(5),
44-
}
45-
self.idx = data[dtype]
46-
47-
def time_factorize(self, sort, dtype):
48-
self.idx.factorize(sort=sort)
49-
50-
51-
class FactorizeUnique:
52-
53-
params = [[True, False], ["int", "uint", "float", "string"]]
54-
param_names = ["sort", "dtype"]
34+
params = [
35+
[True, False],
36+
[True, False],
37+
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
38+
]
39+
param_names = ["unique", "sort", "dtype"]
5540

56-
def setup(self, sort, dtype):
41+
def setup(self, unique, sort, dtype):
5742
N = 10 ** 5
5843
data = {
5944
"int": pd.Int64Index(np.arange(N)),
6045
"uint": pd.UInt64Index(np.arange(N)),
61-
"float": pd.Float64Index(np.arange(N)),
46+
"float": pd.Float64Index(np.random.randn(N)),
6247
"string": tm.makeStringIndex(N),
63-
}
64-
self.idx = data[dtype]
65-
assert self.idx.is_unique
66-
67-
def time_factorize(self, sort, dtype):
48+
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
49+
"datetime64[ns, tz]": pd.date_range(
50+
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
51+
),
52+
}[dtype]
53+
if not unique:
54+
data = data.repeat(5)
55+
self.idx = data
56+
57+
def time_factorize(self, unique, sort, dtype):
6858
self.idx.factorize(sort=sort)
6959

7060

7161
class Duplicated:
7262

73-
params = [["first", "last", False], ["int", "uint", "float", "string"]]
74-
param_names = ["keep", "dtype"]
75-
76-
def setup(self, keep, dtype):
77-
N = 10 ** 5
78-
data = {
79-
"int": pd.Int64Index(np.arange(N).repeat(5)),
80-
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
81-
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
82-
"string": tm.makeStringIndex(N).repeat(5),
83-
}
84-
self.idx = data[dtype]
85-
# cache is_unique
86-
self.idx.is_unique
87-
88-
def time_duplicated(self, keep, dtype):
89-
self.idx.duplicated(keep=keep)
90-
91-
92-
class DuplicatedUniqueIndex:
93-
94-
params = ["int", "uint", "float", "string"]
95-
param_names = ["dtype"]
63+
params = [
64+
[True, False],
65+
["first", "last", False],
66+
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
67+
]
68+
param_names = ["unique", "keep", "dtype"]
9669

97-
def setup(self, dtype):
70+
def setup(self, unique, keep, dtype):
9871
N = 10 ** 5
9972
data = {
10073
"int": pd.Int64Index(np.arange(N)),
10174
"uint": pd.UInt64Index(np.arange(N)),
10275
"float": pd.Float64Index(np.random.randn(N)),
10376
"string": tm.makeStringIndex(N),
104-
}
105-
self.idx = data[dtype]
77+
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
78+
"datetime64[ns, tz]": pd.date_range(
79+
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
80+
),
81+
}[dtype]
82+
if not unique:
83+
data = data.repeat(5)
84+
self.idx = data
10685
# cache is_unique
10786
self.idx.is_unique
10887

109-
def time_duplicated_unique(self, dtype):
110-
self.idx.duplicated()
88+
def time_duplicated(self, unique, keep, dtype):
89+
self.idx.duplicated(keep=keep)
11190

11291

11392
class Hashing:

asv_bench/benchmarks/arithmetic.py

+30
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,36 @@ def time_frame_op_with_scalar(self, dtype, scalar, op):
5050
op(self.df, scalar)
5151

5252

53+
class MixedFrameWithSeriesAxis0:
54+
params = [
55+
[
56+
"eq",
57+
"ne",
58+
"lt",
59+
"le",
60+
"ge",
61+
"gt",
62+
"add",
63+
"sub",
64+
"div",
65+
"floordiv",
66+
"mul",
67+
"pow",
68+
]
69+
]
70+
param_names = ["opname"]
71+
72+
def setup(self, opname):
73+
arr = np.arange(10 ** 6).reshape(100, -1)
74+
df = DataFrame(arr)
75+
df["C"] = 1.0
76+
self.df = df
77+
self.ser = df[0]
78+
79+
def time_frame_op_with_series_axis0(self, opname):
80+
getattr(self.df, opname)(self.ser, axis=0)
81+
82+
5383
class Ops:
5484

5585
params = [[True, False], ["default", 1]]

asv_bench/benchmarks/categoricals.py

-3
Original file line number | Diff line number | Diff line change
@@ -258,9 +258,6 @@ def setup(self):
258258
def time_get_loc(self):
259259
self.index.get_loc(self.category)
260260

261-
def time_shape(self):
262-
self.index.shape
263-
264261
def time_shallow_copy(self):
265262
self.index._shallow_copy()
266263

asv_bench/benchmarks/frame_ctor.py

+45
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22

3+
import pandas as pd
34
from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range
45

56
from .pandas_vb_common import tm
@@ -118,4 +119,48 @@ def time_frame_from_range(self):
118119
self.df = DataFrame(self.data)
119120

120121

122+
class FromArrays:
123+
124+
goal_time = 0.2
125+
126+
def setup(self):
127+
N_rows = 1000
128+
N_cols = 1000
129+
self.float_arrays = [np.random.randn(N_rows) for _ in range(N_cols)]
130+
self.sparse_arrays = [
131+
pd.arrays.SparseArray(np.random.randint(0, 2, N_rows), dtype="float64")
132+
for _ in range(N_cols)
133+
]
134+
self.int_arrays = [
135+
pd.array(np.random.randint(1000, size=N_rows), dtype="Int64")
136+
for _ in range(N_cols)
137+
]
138+
self.index = pd.Index(range(N_rows))
139+
self.columns = pd.Index(range(N_cols))
140+
141+
def time_frame_from_arrays_float(self):
142+
self.df = DataFrame._from_arrays(
143+
self.float_arrays,
144+
index=self.index,
145+
columns=self.columns,
146+
verify_integrity=False,
147+
)
148+
149+
def time_frame_from_arrays_int(self):
150+
self.df = DataFrame._from_arrays(
151+
self.int_arrays,
152+
index=self.index,
153+
columns=self.columns,
154+
verify_integrity=False,
155+
)
156+
157+
def time_frame_from_arrays_sparse(self):
158+
self.df = DataFrame._from_arrays(
159+
self.sparse_arrays,
160+
index=self.index,
161+
columns=self.columns,
162+
verify_integrity=False,
163+
)
164+
165+
121166
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/index_cached_properties.py

+3
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ class IndexCache:
77

88
params = [
99
[
10+
"CategoricalIndex",
1011
"DatetimeIndex",
1112
"Float64Index",
1213
"IntervalIndex",
@@ -42,6 +43,8 @@ def setup(self, index_type):
4243
self.idx = pd.Float64Index(range(N))
4344
elif index_type == "UInt64Index":
4445
self.idx = pd.UInt64Index(range(N))
46+
elif index_type == "CategoricalIndex":
47+
self.idx = pd.CategoricalIndex(range(N), range(N))
4548
else:
4649
raise ValueError
4750
assert len(self.idx) == N

asv_bench/benchmarks/index_object.py

-8
Original file line number | Diff line number | Diff line change
@@ -55,14 +55,6 @@ def time_datetime_difference_disjoint(self):
5555
self.datetime_left.difference(self.datetime_right)
5656

5757

58-
class Datetime:
59-
def setup(self):
60-
self.dr = date_range("20000101", freq="D", periods=10000)
61-
62-
def time_is_dates_only(self):
63-
self.dr._is_dates_only
64-
65-
6658
class Range:
6759
def setup(self):
6860
self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)

asv_bench/benchmarks/indexing.py

+5
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,8 @@
1+
"""
2+
These benchmarks are for Series and DataFrame indexing methods. For the
3+
lower-level methods directly on Index and subclasses, see index_object.py,
4+
indexing_engine.py, and index_cached_properties.py
5+
"""
16
import warnings
27

38
import numpy as np

asv_bench/benchmarks/period.py

-3
Original file line number | Diff line number | Diff line change
@@ -85,9 +85,6 @@ def setup(self):
8585
def time_get_loc(self):
8686
self.index.get_loc(self.period)
8787

88-
def time_shape(self):
89-
self.index.shape
90-
9188
def time_shallow_copy(self):
9289
self.index._shallow_copy()
9390

asv_bench/benchmarks/rolling.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ class Methods:
1111
["int", "float"],
1212
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
1313
)
14-
param_names = ["contructor", "window", "dtype", "method"]
14+
param_names = ["constructor", "window", "dtype", "method"]
1515

1616
def setup(self, constructor, window, dtype, method):
1717
N = 10 ** 5
@@ -72,7 +72,7 @@ class ExpandingMethods:
7272
["int", "float"],
7373
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
7474
)
75-
param_names = ["contructor", "window", "dtype", "method"]
75+
param_names = ["constructor", "window", "dtype", "method"]
7676

7777
def setup(self, constructor, dtype, method):
7878
N = 10 ** 5
@@ -86,7 +86,7 @@ def time_expanding(self, constructor, dtype, method):
8686
class EWMMethods:
8787

8888
params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"])
89-
param_names = ["contructor", "window", "dtype", "method"]
89+
param_names = ["constructor", "window", "dtype", "method"]
9090

9191
def setup(self, constructor, window, dtype, method):
9292
N = 10 ** 5
@@ -104,7 +104,7 @@ class VariableWindowMethods(Methods):
104104
["int", "float"],
105105
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
106106
)
107-
param_names = ["contructor", "window", "dtype", "method"]
107+
param_names = ["constructor", "window", "dtype", "method"]
108108

109109
def setup(self, constructor, window, dtype, method):
110110
N = 10 ** 5

asv_bench/benchmarks/sparse.py

-1
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
4545
class SparseDataFrameConstructor:
4646
def setup(self):
4747
N = 1000
48-
self.arr = np.arange(N)
4948
self.sparse = scipy.sparse.rand(N, N, 0.005)
5049

5150
def time_from_scipy(self):

0 commit comments

Comments
 (0)