
Commit 445484c

Merge remote-tracking branch 'upstream/main' into bisect

2 parents: 3b18c35 + d0cf9b5

282 files changed: +6296 -3893 lines

.github/workflows/datamanger.yml (-54)

This file was deleted.

.github/workflows/posix.yml (+20 -1)

@@ -26,37 +26,52 @@ jobs:
       matrix:
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
         pattern: ["not single_cpu", "single_cpu"]
+        # Don't test pyarrow v2/3: Causes timeouts in read_csv engine
+        # even if tests are skipped/xfailed
+        pyarrow_version: ["5", "7"]
         include:
           - env_file: actions-38-downstream_compat.yaml
             pattern: "not slow and not network and not single_cpu"
             pytest_target: "pandas/tests/test_downstream.py"
+            name: "Downstream Compat"
           - env_file: actions-38-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
+            name: "Minimum Versions"
           - env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-it"
             lang: "it_IT.utf8"
             lc_all: "it_IT.utf8"
+            name: "Locale: it_IT.utf8"
           - env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-zh-hans"
             lang: "zh_CN.utf8"
             lc_all: "zh_CN.utf8"
+            name: "Locale: zh_CN.utf8"
+          - env_file: actions-38.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_data_manager: "array"
+            name: "Data Manager"
           - env_file: actions-pypy-38.yaml
             pattern: "not slow and not network and not single_cpu"
             test_args: "--max-worker-restart 0"
+            name: "Pypy"
           - env_file: actions-310-numpydev.yaml
             pattern: "not slow and not network and not single_cpu"
             pandas_testing_mode: "deprecate"
             test_args: "-W error"
+            name: "Numpy Dev"
       fail-fast: false
+    name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }}
     env:
       ENV_FILE: ci/deps/${{ matrix.env_file }}
       PATTERN: ${{ matrix.pattern }}
       EXTRA_APT: ${{ matrix.extra_apt || '' }}
       LANG: ${{ matrix.lang || '' }}
       LC_ALL: ${{ matrix.lc_all || '' }}
       PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
+      PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
       TEST_ARGS: ${{ matrix.test_args || '' }}
       PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
@@ -65,7 +80,7 @@ jobs:
       COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }}
       cancel-in-progress: true

     services:
@@ -133,6 +148,10 @@ jobs:
         use-only-tar-bz2: true
       if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support

+    - name: Upgrade Arrow version
+      run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
+      if: ${{ matrix.pyarrow_version }}
+
     - name: Setup PyPy
       uses: actions/setup-python@v2
       with:
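The new job-level `name:` prefers an explicit matrix name and otherwise composes one from the matrix values. A rough Python analogue of that expression (illustrative sketch only, not part of this commit):

# Illustrative: mimics ${{ matrix.name || format('{0} pyarrow={1} {2}', ...) }}
def job_name(matrix: dict) -> str:
    return matrix.get("name") or "{env_file} pyarrow={pyarrow_version} {pattern}".format(**matrix)

print(job_name({"env_file": "actions-38.yaml", "pyarrow_version": "5",
                "pattern": "not single_cpu"}))
# -> actions-38.yaml pyarrow=5 not single_cpu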

.github/workflows/sdist.yml (+2)

@@ -9,11 +9,13 @@ on:
     branches:
       - main
       - 1.4.x
+    types: [labeled, opened, synchronize, reopened]
   paths-ignore:
     - "doc/**"

 jobs:
   build:
+    if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
     runs-on: ubuntu-latest
     timeout-minutes: 60
     defaults:
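The effect of this gate: the sdist build runs for pushes, and for pull requests only when they carry (or have just been given) the 'Build' label. A minimal Python sketch of the same logic (illustrative; the function and argument names are ours, not the workflow's):

# Illustrative sketch of the new `if:` condition.
def should_build(event_name: str, label_name, pr_labels) -> bool:
    # `label_name` is set for `labeled` events; `pr_labels` lists the PR's labels
    return label_name == "Build" or "Build" in pr_labels or event_name == "push"

assert should_build("pull_request", None, ["Build"])
assert should_build("push", None, [])
assert not should_build("pull_request", None, [])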

.pre-commit-config.yaml (+1 -1)

@@ -178,7 +178,7 @@ repos:
         language: python
         files: ^pandas/core/generic\.py$
     -   id: pandas-errors-documented
-        name: Ensure pandas errors are documented in doc/source/reference/general_utility_functions.rst
+        name: Ensure pandas errors are documented in doc/source/reference/testing.rst
         entry: python scripts/pandas_errors_documented.py
         language: python
         files: ^pandas/errors/__init__.py$

Dockerfile (+6 -1)

@@ -1,4 +1,4 @@
-FROM quay.io/condaforge/miniforge3:4.11.0-0
+FROM quay.io/condaforge/miniforge3

 # if you forked pandas, you can pass in your own GitHub username to use your fork
 # i.e. gh_username=myname
@@ -12,6 +12,11 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update \
     && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
     #
+    # Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime")
+    && apt-get -y install tzdata \
+    && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
+    && dpkg-reconfigure -f noninteractive tzdata \
+    #
     # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
     && apt-get -y install git iproute2 procps iproute2 lsb-release \
     #
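The tzdata step exists because some tests resolve the system timezone through /etc/localtime, which a minimal base image lacks. A hypothetical repro of the lookup involved (illustrative only):

# Illustrative: on Linux this resolves the local zone via /etc/localtime;
# on a minimal image without tzdata the file is missing.
from datetime import datetime

print(datetime.now().astimezone().tzinfo)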

asv_bench/benchmarks/groupby.py (+4 -2)

@@ -18,6 +18,7 @@

 method_blocklist = {
     "object": {
+        "diff",
         "median",
         "prod",
         "sem",
@@ -405,7 +406,7 @@ class GroupByMethods:

     param_names = ["dtype", "method", "application", "ncols"]
     params = [
-        ["int", "float", "object", "datetime", "uint"],
+        ["int", "int16", "float", "object", "datetime", "uint"],
         [
             "all",
             "any",
@@ -417,6 +418,7 @@ class GroupByMethods:
             "cumprod",
             "cumsum",
             "describe",
+            "diff",
             "ffill",
             "first",
             "head",
@@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
         values = rng.take(taker, axis=0)
         if dtype == "int":
             key = np.random.randint(0, size, size=size)
-        elif dtype == "uint":
+        elif dtype in ("int16", "uint"):
             key = np.random.randint(0, size, size=size, dtype=dtype)
         elif dtype == "float":
             key = np.concatenate(
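The new `diff` method and `int16` dtype cases exercise operations like the following (sizes here are illustrative, not the benchmark's):

import numpy as np
import pandas as pd

# int16 grouping keys, as in the new benchmark parameter
key = np.random.randint(0, 100, size=10_000, dtype="int16")
df = pd.DataFrame({"key": key, "value": np.random.randn(10_000)})

# per-group first discrete difference, the newly benchmarked method
result = df.groupby("key")["value"].diff()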

asv_bench/benchmarks/indexing.py (+91 -21)

@@ -13,7 +13,6 @@
     CategoricalIndex,
     DataFrame,
     Float64Index,
-    IndexSlice,
     Int64Index,
     IntervalIndex,
     MultiIndex,
@@ -200,28 +199,81 @@ def time_take(self, index):


 class MultiIndexing:
-    def setup(self):
-        mi = MultiIndex.from_product([range(1000), range(1000)])
-        self.s = Series(np.random.randn(1000000), index=mi)
-        self.df = DataFrame(self.s)

-        n = 100000
-        with warnings.catch_warnings(record=True):
-            self.mdt = DataFrame(
-                {
-                    "A": np.random.choice(range(10000, 45000, 1000), n),
-                    "B": np.random.choice(range(10, 400), n),
-                    "C": np.random.choice(range(1, 150), n),
-                    "D": np.random.choice(range(10000, 45000), n),
-                    "x": np.random.choice(range(400), n),
-                    "y": np.random.choice(range(25), n),
-                }
-            )
-        self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
-        self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
+    params = [True, False]
+    param_names = ["unique_levels"]
+
+    def setup(self, unique_levels):
+        self.nlevels = 2
+        if unique_levels:
+            mi = MultiIndex.from_arrays([range(1000000)] * self.nlevels)
+        else:
+            mi = MultiIndex.from_product([range(1000)] * self.nlevels)
+        self.df = DataFrame(np.random.randn(len(mi)), index=mi)
+
+        self.tgt_slice = slice(200, 800)
+        self.tgt_null_slice = slice(None)
+        self.tgt_list = list(range(0, 1000, 10))
+        self.tgt_scalar = 500
+
+        bool_indexer = np.zeros(len(mi), dtype=np.bool_)
+        bool_indexer[slice(0, len(mi), 100)] = True
+        self.tgt_bool_indexer = bool_indexer
+
+    def time_loc_partial_key_slice(self, unique_levels):
+        self.df.loc[self.tgt_slice, :]
+
+    def time_loc_partial_key_null_slice(self, unique_levels):
+        self.df.loc[self.tgt_null_slice, :]
+
+    def time_loc_partial_key_list(self, unique_levels):
+        self.df.loc[self.tgt_list, :]
+
+    def time_loc_partial_key_scalar(self, unique_levels):
+        self.df.loc[self.tgt_scalar, :]
+
+    def time_loc_partial_key_bool_indexer(self, unique_levels):
+        self.df.loc[self.tgt_bool_indexer, :]
+
+    def time_loc_all_slices(self, unique_levels):
+        target = tuple([self.tgt_slice] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_null_slices(self, unique_levels):
+        target = tuple([self.tgt_null_slice] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_lists(self, unique_levels):
+        target = tuple([self.tgt_list] * self.nlevels)
+        self.df.loc[target, :]

-    def time_index_slice(self):
-        self.mdt.loc[self.idx, :]
+    def time_loc_all_scalars(self, unique_levels):
+        target = tuple([self.tgt_scalar] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_bool_indexers(self, unique_levels):
+        target = tuple([self.tgt_bool_indexer] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_slice_plus_null_slice(self, unique_levels):
+        target = (self.tgt_slice, self.tgt_null_slice)
+        self.df.loc[target, :]
+
+    def time_loc_null_slice_plus_slice(self, unique_levels):
+        target = (self.tgt_null_slice, self.tgt_slice)
+        self.df.loc[target, :]
+
+    def time_xs_level_0(self, unique_levels):
+        target = self.tgt_scalar
+        self.df.xs(target, level=0)
+
+    def time_xs_level_1(self, unique_levels):
+        target = self.tgt_scalar
+        self.df.xs(target, level=1)
+
+    def time_xs_full_key(self, unique_levels):
+        target = tuple([self.tgt_scalar] * self.nlevels)
+        self.df.xs(target)


 class IntervalIndexing:
@@ -257,6 +309,24 @@ def time_get_indexer_mismatched_tz(self):
         self.dti.get_indexer(self.dti2)


+class SortedAndUnsortedDatetimeIndexLoc:
+    def setup(self):
+        dti = date_range("2016-01-01", periods=10000, tz="US/Pacific")
+        index = np.array(dti)
+
+        unsorted_index = index.copy()
+        unsorted_index[10] = unsorted_index[20]
+
+        self.df_unsorted = DataFrame(index=unsorted_index, data={"a": 1})
+        self.df_sort = DataFrame(index=index, data={"a": 1})
+
+    def time_loc_unsorted(self):
+        self.df_unsorted.loc["2016-6-11"]
+
+    def time_loc_sorted(self):
+        self.df_sort.loc["2016-6-11"]
+
+
 class CategoricalIndexIndexing:

     params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
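For orientation, the indexer shapes the rewritten MultiIndexing benchmarks time look like this in ordinary use (tiny sizes, illustrative only):

import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_product([range(10)] * 2)
df = pd.DataFrame(np.random.randn(len(mi)), index=mi)

df.loc[slice(2, 8), :]                  # partial key: slice on level 0 only
df.loc[(slice(2, 8), slice(None)), :]   # full key: slice plus null slice
df.xs(5, level=1)                       # cross-section on level 1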

asv_bench/benchmarks/join_merge.py (+13)

@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
         self.left.join(self.right, on="jim")


+class JoinEmpty:
+    def setup(self):
+        N = 100_000
+        self.df = DataFrame({"A": np.arange(N)})
+        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
+
+    def time_inner_join_left_empty(self):
+        self.df_empty.join(self.df, how="inner")
+
+    def time_inner_join_right_empty(self):
+        self.df.join(self.df_empty, how="inner")
+
+
 class JoinNonUnique:
     # outer join of non-unique
     # GH 6329
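In miniature, the new JoinEmpty cases time joins like these (sizes here are illustrative, not the benchmark's N = 100_000):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(5)})
df_empty = pd.DataFrame(columns=["B", "C"], dtype="int64")

df.join(df_empty, how="inner")    # right side empty: empty result
df_empty.join(df, how="inner")    # left side empty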

asv_bench/benchmarks/reindex.py (+7 -1)

@@ -28,16 +28,22 @@ def setup(self):
         index = MultiIndex.from_arrays([level1, level2])
         self.s = Series(np.random.randn(N * K), index=index)
         self.s_subset = self.s[::2]
+        self.s_subset_no_cache = self.s[::2].copy()

     def time_reindex_dates(self):
         self.df.reindex(self.rng_subset)

     def time_reindex_columns(self):
         self.df2.reindex(columns=self.df.columns[1:5])

-    def time_reindex_multiindex(self):
+    def time_reindex_multiindex_with_cache(self):
+        # MultiIndex._values gets cached
         self.s.reindex(self.s_subset.index)

+    def time_reindex_multiindex_no_cache(self):
+        # Copy to avoid MultiIndex._values getting cached
+        self.s.reindex(self.s_subset_no_cache.index.copy())
+

 class ReindexMethod:
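The with_cache/no_cache split isolates the effect of MultiIndex._values caching noted in the comments above. A sketch of the distinction, assuming only the caching behavior those comments describe:

import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_product([range(100), range(100)])
s = pd.Series(np.random.randn(len(mi)), index=mi)
subset_index = s[::2].index

s.reindex(subset_index)          # repeated use reuses the cached ._values
s.reindex(subset_index.copy())   # a fresh copy starts with a cold cache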

azure-pipelines.yml (-1)

@@ -22,7 +22,6 @@ variables:
   PANDAS_CI: 1

 jobs:
-  # Mac and Linux use the same template
   - template: ci/azure/posix.yml
     parameters:
       name: macOS
