Skip to content

Commit a500311

Browse files
committed
Merge branch 'main' into enh-stata-non-nano
2 parents 227b7f7 + 92fa9ca commit a500311

File tree

333 files changed

+6368
-4646
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

333 files changed

+6368
-4646
lines changed

.github/workflows/comment-commands.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ jobs:
7777
echo 'EOF' >> $GITHUB_ENV
7878
echo "REGEX=$REGEX" >> $GITHUB_ENV
7979
80-
- uses: actions/github-script@v6
80+
- uses: actions/github-script@v7
8181
env:
8282
BENCH_OUTPUT: ${{env.BENCH_OUTPUT}}
8383
REGEX: ${{env.REGEX}}

.github/workflows/deprecation-tracking-bot.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
env:
2222
DEPRECATION_TRACKER_ISSUE: 50578
2323
steps:
24-
- uses: actions/github-script@v6
24+
- uses: actions/github-script@v7
2525
id: update-deprecation-issue
2626
with:
2727
script: |

.github/workflows/unit-tests.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ defaults:
2323
jobs:
2424
ubuntu:
2525
runs-on: ubuntu-22.04
26-
timeout-minutes: 180
26+
timeout-minutes: 90
2727
strategy:
2828
matrix:
2929
env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
@@ -177,7 +177,7 @@ jobs:
177177
if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}}
178178

179179
macos-windows:
180-
timeout-minutes: 180
180+
timeout-minutes: 90
181181
strategy:
182182
matrix:
183183
os: [macos-latest, windows-latest]
@@ -322,7 +322,7 @@ jobs:
322322
matrix:
323323
os: [ubuntu-22.04, macOS-latest, windows-latest]
324324

325-
timeout-minutes: 180
325+
timeout-minutes: 90
326326

327327
concurrency:
328328
#https://github.community/t/concurrecy-not-work-for-push/183068/7

asv_bench/asv.conf.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
// pip (with all the conda available packages installed first,
4242
// followed by the pip installed packages).
4343
"matrix": {
44-
"Cython": ["0.29.33"],
44+
"Cython": ["3.0.5"],
4545
"matplotlib": [],
4646
"sqlalchemy": [],
4747
"scipy": [],

asv_bench/benchmarks/algorithms.py

+47-53
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from importlib import import_module
22

33
import numpy as np
4-
import pyarrow as pa
54

65
import pandas as pd
76

@@ -20,9 +19,9 @@ class Factorize:
2019
[True, False],
2120
[True, False],
2221
[
23-
"int",
24-
"uint",
25-
"float",
22+
"int64",
23+
"uint64",
24+
"float64",
2625
"object",
2726
"object_str",
2827
"datetime64[ns]",
@@ -36,28 +35,24 @@ class Factorize:
3635

3736
def setup(self, unique, sort, dtype):
3837
N = 10**5
39-
string_index = tm.makeStringIndex(N)
40-
string_arrow = None
41-
if dtype == "string[pyarrow]":
42-
try:
43-
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
44-
except ImportError:
45-
raise NotImplementedError
46-
47-
data = {
48-
"int": pd.Index(np.arange(N), dtype="int64"),
49-
"uint": pd.Index(np.arange(N), dtype="uint64"),
50-
"float": pd.Index(np.random.randn(N), dtype="float64"),
51-
"object_str": string_index,
52-
"object": pd.Index(np.arange(N), dtype="object"),
53-
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
54-
"datetime64[ns, tz]": pd.date_range(
55-
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
56-
),
57-
"Int64": pd.array(np.arange(N), dtype="Int64"),
58-
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
59-
"string[pyarrow]": string_arrow,
60-
}[dtype]
38+
39+
if dtype in ["int64", "uint64", "Int64", "object"]:
40+
data = pd.Index(np.arange(N), dtype=dtype)
41+
elif dtype == "float64":
42+
data = pd.Index(np.random.randn(N), dtype=dtype)
43+
elif dtype == "boolean":
44+
data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
45+
elif dtype == "datetime64[ns]":
46+
data = pd.date_range("2011-01-01", freq="h", periods=N)
47+
elif dtype == "datetime64[ns, tz]":
48+
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
49+
elif dtype == "object_str":
50+
data = tm.makeStringIndex(N)
51+
elif dtype == "string[pyarrow]":
52+
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
53+
else:
54+
raise NotImplementedError
55+
6156
if not unique:
6257
data = data.repeat(5)
6358
self.data = data
@@ -74,9 +69,9 @@ class Duplicated:
7469
[True, False],
7570
["first", "last", False],
7671
[
77-
"int",
78-
"uint",
79-
"float",
72+
"int64",
73+
"uint64",
74+
"float64",
8075
"string",
8176
"datetime64[ns]",
8277
"datetime64[ns, tz]",
@@ -88,22 +83,20 @@ class Duplicated:
8883

8984
def setup(self, unique, keep, dtype):
9085
N = 10**5
91-
data = {
92-
"int": pd.Index(np.arange(N), dtype="int64"),
93-
"uint": pd.Index(np.arange(N), dtype="uint64"),
94-
"float": pd.Index(np.random.randn(N), dtype="float64"),
95-
"string": tm.makeStringIndex(N),
96-
"datetime64[ns]": pd.date_range("2011-01-01", freq="h", periods=N),
97-
"datetime64[ns, tz]": pd.date_range(
98-
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
99-
),
100-
"timestamp[ms][pyarrow]": pd.Index(
101-
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
102-
),
103-
"duration[s][pyarrow]": pd.Index(
104-
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
105-
),
106-
}[dtype]
86+
if dtype in ["int64", "uint64"]:
87+
data = pd.Index(np.arange(N), dtype=dtype)
88+
elif dtype == "float64":
89+
data = pd.Index(np.random.randn(N), dtype="float64")
90+
elif dtype == "string":
91+
data = tm.makeStringIndex(N)
92+
elif dtype == "datetime64[ns]":
93+
data = pd.date_range("2011-01-01", freq="h", periods=N)
94+
elif dtype == "datetime64[ns, tz]":
95+
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
96+
elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
97+
data = pd.Index(np.arange(N), dtype=dtype)
98+
else:
99+
raise NotImplementedError
107100
if not unique:
108101
data = data.repeat(5)
109102
self.idx = data
@@ -181,21 +174,22 @@ class Quantile:
181174
params = [
182175
[0, 0.5, 1],
183176
["linear", "nearest", "lower", "higher", "midpoint"],
184-
["float", "int", "uint"],
177+
["float64", "int64", "uint64"],
185178
]
186179
param_names = ["quantile", "interpolation", "dtype"]
187180

188181
def setup(self, quantile, interpolation, dtype):
189182
N = 10**5
190-
data = {
191-
"int": np.arange(N),
192-
"uint": np.arange(N).astype(np.uint64),
193-
"float": np.random.randn(N),
194-
}
195-
self.idx = pd.Series(data[dtype].repeat(5))
183+
if dtype in ["int64", "uint64"]:
184+
data = np.arange(N, dtype=dtype)
185+
elif dtype == "float64":
186+
data = np.random.randn(N)
187+
else:
188+
raise NotImplementedError
189+
self.ser = pd.Series(data.repeat(5))
196190

197191
def time_quantile(self, quantile, interpolation, dtype):
198-
self.idx.quantile(quantile, interpolation=interpolation)
192+
self.ser.quantile(quantile, interpolation=interpolation)
199193

200194

201195
class SortIntegerArray:

asv_bench/benchmarks/array.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ def time_from_float_array(self):
3131
class IntegerArray:
3232
def setup(self):
3333
N = 250_000
34-
self.values_integer = np.array([1, 0, 1, 0] * N)
35-
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
36-
self.mask = np.array([False, False, True, False] * N)
34+
self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
35+
self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
36+
self.mask = np.tile(np.array([False, False, True, False]), N)
3737

3838
def time_constructor(self):
3939
pd.arrays.IntegerArray(self.data, self.mask)

asv_bench/benchmarks/categoricals.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -260,18 +260,16 @@ class CategoricalSlicing:
260260
def setup(self, index):
261261
N = 10**6
262262
categories = ["a", "b", "c"]
263-
values = [0] * N + [1] * N + [2] * N
264263
if index == "monotonic_incr":
265-
self.data = pd.Categorical.from_codes(values, categories=categories)
264+
codes = np.repeat([0, 1, 2], N)
266265
elif index == "monotonic_decr":
267-
self.data = pd.Categorical.from_codes(
268-
list(reversed(values)), categories=categories
269-
)
266+
codes = np.repeat([2, 1, 0], N)
270267
elif index == "non_monotonic":
271-
self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
268+
codes = np.tile([0, 1, 2], N)
272269
else:
273270
raise ValueError(f"Invalid index param: {index}")
274271

272+
self.data = pd.Categorical.from_codes(codes, categories=categories)
275273
self.scalar = 10000
276274
self.list = list(range(10000))
277275
self.cat_scalar = "b"

asv_bench/benchmarks/frame_methods.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,8 @@ def time_frame_nunique(self):
640640

641641
class SeriesNuniqueWithNan:
642642
def setup(self):
643-
self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
643+
values = 100 * [np.nan] + list(range(100))
644+
self.ser = Series(np.tile(values, 10000), dtype=float)
644645

645646
def time_series_nunique_nan(self):
646647
self.ser.nunique()

asv_bench/benchmarks/gil.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
272272
def setup(self, dtype):
273273
rows = 10000
274274
cols = 50
275-
data = {
276-
"float": DataFrame(np.random.randn(rows, cols)),
277-
"datetime": DataFrame(
275+
if dtype == "float":
276+
df = DataFrame(np.random.randn(rows, cols))
277+
elif dtype == "datetime":
278+
df = DataFrame(
278279
np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
279-
),
280-
"object": DataFrame(
280+
)
281+
elif dtype == "object":
282+
df = DataFrame(
281283
"foo", index=range(rows), columns=["object%03d" for _ in range(5)]
282-
),
283-
}
284+
)
285+
else:
286+
raise NotImplementedError
284287

285288
self.fname = f"__test_{dtype}__.csv"
286-
df = data[dtype]
287289
df.to_csv(self.fname)
288290

289291
@test_parallel(num_threads=2)

asv_bench/benchmarks/groupby.py

+46-1
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
713713
if dtype == "datetime64":
714714
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
715715
else:
716-
data = np.array([1] * N, dtype=dtype)
716+
data = np.ones(N, dtype=dtype)
717717
self.df = DataFrame({"values": data, "key": ["foo"] * N})
718718

719719
def time_rank_ties(self, dtype, tie_method):
@@ -802,6 +802,51 @@ def time_groupby_extra_cat_nosort(self, observed):
802802
self.df_extra_cat.groupby("a", observed=observed, sort=False)["b"].count()
803803

804804

805+
class MultipleCategories:
806+
def setup(self):
807+
N = 10**3
808+
arr = np.random.random(N)
809+
data = {
810+
"a1": Categorical(np.random.randint(10000, size=N)),
811+
"a2": Categorical(np.random.randint(10000, size=N)),
812+
"b": arr,
813+
}
814+
self.df = DataFrame(data)
815+
data = {
816+
"a1": Categorical(np.random.randint(10000, size=N), ordered=True),
817+
"a2": Categorical(np.random.randint(10000, size=N), ordered=True),
818+
"b": arr,
819+
}
820+
self.df_ordered = DataFrame(data)
821+
data = {
822+
"a1": Categorical(np.random.randint(100, size=N), categories=np.arange(N)),
823+
"a2": Categorical(np.random.randint(100, size=N), categories=np.arange(N)),
824+
"b": arr,
825+
}
826+
self.df_extra_cat = DataFrame(data)
827+
828+
def time_groupby_sort(self):
829+
self.df.groupby(["a1", "a2"], observed=False)["b"].count()
830+
831+
def time_groupby_nosort(self):
832+
self.df.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
833+
834+
def time_groupby_ordered_sort(self):
835+
self.df_ordered.groupby(["a1", "a2"], observed=False)["b"].count()
836+
837+
def time_groupby_ordered_nosort(self):
838+
self.df_ordered.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
839+
840+
def time_groupby_extra_cat_sort(self):
841+
self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].count()
842+
843+
def time_groupby_extra_cat_nosort(self):
844+
self.df_extra_cat.groupby(["a1", "a2"], observed=False, sort=False)["b"].count()
845+
846+
def time_groupby_transform(self):
847+
self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].cumsum()
848+
849+
805850
class Datelike:
806851
# GH 14338
807852
params = ["period_range", "date_range", "date_range_tz"]

asv_bench/benchmarks/index_object.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,7 @@ def setup(self, dtype):
161161
self.sorted = self.idx.sort_values()
162162
half = N // 2
163163
self.non_unique = self.idx[:half].append(self.idx[:half])
164-
self.non_unique_sorted = (
165-
self.sorted[:half].append(self.sorted[:half]).sort_values()
166-
)
164+
self.non_unique_sorted = self.sorted[:half].repeat(2)
167165
self.key = self.sorted[N // 4]
168166

169167
def time_boolean_array(self, dtype):

asv_bench/benchmarks/indexing.py

+16
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,10 @@ def time_loc_null_slice_plus_slice(self, unique_levels):
306306
target = (self.tgt_null_slice, self.tgt_slice)
307307
self.df.loc[target, :]
308308

309+
def time_loc_multiindex(self, unique_levels):
310+
target = self.df.index[::10]
311+
self.df.loc[target]
312+
309313
def time_xs_level_0(self, unique_levels):
310314
target = self.tgt_scalar
311315
self.df.xs(target, level=0)
@@ -515,6 +519,18 @@ def time_setitem_list(self):
515519
self.df[[100, 200, 300]] = 100
516520

517521

522+
class SetitemObjectDtype:
523+
# GH#19299
524+
525+
def setup(self):
526+
N = 1000
527+
cols = 500
528+
self.df = DataFrame(index=range(N), columns=range(cols), dtype=object)
529+
530+
def time_setitem_object_dtype(self):
531+
self.df.loc[0, 1] = 1.0
532+
533+
518534
class ChainIndexing:
519535
params = [None, "warn"]
520536
param_names = ["mode"]

0 commit comments

Comments
 (0)