Skip to content

Commit 675810f

Browse files
authored
Merge branch 'main' into pandas-devGH-15354-phased
2 parents 3a9c089 + 367f8a1 commit 675810f

File tree

201 files changed

+3400
-2481
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

201 files changed

+3400
-2481
lines changed

.github/workflows/datamanger.yml

-54
This file was deleted.

.github/workflows/posix.yml

+20-1
Original file line numberDiff line numberDiff line change
@@ -26,37 +26,52 @@ jobs:
2626
matrix:
2727
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
2828
pattern: ["not single_cpu", "single_cpu"]
29+
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
30+
# even if tests are skipped/xfailed
31+
pyarrow_version: ["5", "7"]
2932
include:
3033
- env_file: actions-38-downstream_compat.yaml
3134
pattern: "not slow and not network and not single_cpu"
3235
pytest_target: "pandas/tests/test_downstream.py"
36+
name: "Downstream Compat"
3337
- env_file: actions-38-minimum_versions.yaml
3438
pattern: "not slow and not network and not single_cpu"
39+
name: "Minimum Versions"
3540
- env_file: actions-38.yaml
3641
pattern: "not slow and not network and not single_cpu"
3742
extra_apt: "language-pack-it"
3843
lang: "it_IT.utf8"
3944
lc_all: "it_IT.utf8"
45+
name: "Locale: it_IT.utf8"
4046
- env_file: actions-38.yaml
4147
pattern: "not slow and not network and not single_cpu"
4248
extra_apt: "language-pack-zh-hans"
4349
lang: "zh_CN.utf8"
4450
lc_all: "zh_CN.utf8"
51+
name: "Locale: zh_CN.utf8"
52+
- env_file: actions-38.yaml
53+
pattern: "not slow and not network and not single_cpu"
54+
pandas_data_manager: "array"
55+
name: "Data Manager"
4556
- env_file: actions-pypy-38.yaml
4657
pattern: "not slow and not network and not single_cpu"
4758
test_args: "--max-worker-restart 0"
59+
name: "Pypy"
4860
- env_file: actions-310-numpydev.yaml
4961
pattern: "not slow and not network and not single_cpu"
5062
pandas_testing_mode: "deprecate"
5163
test_args: "-W error"
64+
name: "Numpy Dev"
5265
fail-fast: false
66+
name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }}
5367
env:
5468
ENV_FILE: ci/deps/${{ matrix.env_file }}
5569
PATTERN: ${{ matrix.pattern }}
5670
EXTRA_APT: ${{ matrix.extra_apt || '' }}
5771
LANG: ${{ matrix.lang || '' }}
5872
LC_ALL: ${{ matrix.lc_all || '' }}
5973
PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
74+
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
6075
TEST_ARGS: ${{ matrix.test_args || '' }}
6176
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
6277
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
@@ -65,7 +80,7 @@ jobs:
6580
COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
6681
concurrency:
6782
# https://github.community/t/concurrecy-not-work-for-push/183068/7
68-
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
83+
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }}
6984
cancel-in-progress: true
7085

7186
services:
@@ -133,6 +148,10 @@ jobs:
133148
use-only-tar-bz2: true
134149
if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support
135150

151+
- name: Upgrade Arrow version
152+
run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
153+
if: ${{ matrix.pyarrow_version }}
154+
136155
- name: Setup PyPy
137156
uses: actions/setup-python@v2
138157
with:

.github/workflows/sdist.yml

+2
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ on:
99
branches:
1010
- main
1111
- 1.4.x
12+
types: [labeled, opened, synchronize, reopened]
1213
paths-ignore:
1314
- "doc/**"
1415

1516
jobs:
1617
build:
18+
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
1719
runs-on: ubuntu-latest
1820
timeout-minutes: 60
1921
defaults:

asv_bench/benchmarks/groupby.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
method_blocklist = {
2020
"object": {
21+
"diff",
2122
"median",
2223
"prod",
2324
"sem",
@@ -405,7 +406,7 @@ class GroupByMethods:
405406

406407
param_names = ["dtype", "method", "application", "ncols"]
407408
params = [
408-
["int", "float", "object", "datetime", "uint"],
409+
["int", "int16", "float", "object", "datetime", "uint"],
409410
[
410411
"all",
411412
"any",
@@ -417,6 +418,7 @@ class GroupByMethods:
417418
"cumprod",
418419
"cumsum",
419420
"describe",
421+
"diff",
420422
"ffill",
421423
"first",
422424
"head",
@@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
478480
values = rng.take(taker, axis=0)
479481
if dtype == "int":
480482
key = np.random.randint(0, size, size=size)
481-
elif dtype == "uint":
483+
elif dtype in ("int16", "uint"):
482484
key = np.random.randint(0, size, size=size, dtype=dtype)
483485
elif dtype == "float":
484486
key = np.concatenate(

asv_bench/benchmarks/indexing.py

+61-21
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
CategoricalIndex,
1414
DataFrame,
1515
Float64Index,
16-
IndexSlice,
1716
Int64Index,
1817
IntervalIndex,
1918
MultiIndex,
@@ -200,28 +199,69 @@ def time_take(self, index):
200199

201200

202201
class MultiIndexing:
203-
def setup(self):
204-
mi = MultiIndex.from_product([range(1000), range(1000)])
205-
self.s = Series(np.random.randn(1000000), index=mi)
206-
self.df = DataFrame(self.s)
207202

208-
n = 100000
209-
with warnings.catch_warnings(record=True):
210-
self.mdt = DataFrame(
211-
{
212-
"A": np.random.choice(range(10000, 45000, 1000), n),
213-
"B": np.random.choice(range(10, 400), n),
214-
"C": np.random.choice(range(1, 150), n),
215-
"D": np.random.choice(range(10000, 45000), n),
216-
"x": np.random.choice(range(400), n),
217-
"y": np.random.choice(range(25), n),
218-
}
219-
)
220-
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
221-
self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
203+
params = [True, False]
204+
param_names = ["unique_levels"]
205+
206+
def setup(self, unique_levels):
207+
self.ndim = 2
208+
if unique_levels:
209+
mi = MultiIndex.from_arrays([range(1000000)] * self.ndim)
210+
else:
211+
mi = MultiIndex.from_product([range(1000)] * self.ndim)
212+
self.df = DataFrame(np.random.randn(len(mi)), index=mi)
213+
214+
self.tgt_slice = slice(200, 800)
215+
self.tgt_null_slice = slice(None)
216+
self.tgt_list = list(range(0, 1000, 10))
217+
self.tgt_scalar = 500
218+
219+
bool_indexer = np.zeros(len(mi), dtype=np.bool_)
220+
bool_indexer[slice(0, len(mi), 100)] = True
221+
self.tgt_bool_indexer = bool_indexer
222+
223+
def time_loc_partial_key_slice(self, unique_levels):
224+
self.df.loc[self.tgt_slice, :]
225+
226+
def time_loc_partial_key_null_slice(self, unique_levels):
227+
self.df.loc[self.tgt_null_slice, :]
228+
229+
def time_loc_partial_key_list(self, unique_levels):
230+
self.df.loc[self.tgt_list, :]
231+
232+
def time_loc_partial_key_scalar(self, unique_levels):
233+
self.df.loc[self.tgt_scalar, :]
234+
235+
def time_loc_partial_bool_indexer(self, unique_levels):
236+
self.df.loc[self.tgt_bool_indexer, :]
237+
238+
def time_loc_all_slices(self, unique_levels):
239+
target = tuple([self.tgt_slice] * self.ndim)
240+
self.df.loc[target, :]
241+
242+
def time_loc_all_null_slices(self, unique_levels):
243+
target = tuple([self.tgt_null_slice] * self.ndim)
244+
self.df.loc[target, :]
245+
246+
def time_loc_all_lists(self, unique_levels):
247+
target = tuple([self.tgt_list] * self.ndim)
248+
self.df.loc[target, :]
249+
250+
def time_loc_all_scalars(self, unique_levels):
251+
target = tuple([self.tgt_scalar] * self.ndim)
252+
self.df.loc[target, :]
253+
254+
def time_loc_all_bool_indexers(self, unique_levels):
255+
target = tuple([self.tgt_bool_indexer] * self.ndim)
256+
self.df.loc[target, :]
257+
258+
def time_loc_slice_plus_null_slice(self, unique_levels):
259+
target = (self.tgt_slice, self.tgt_null_slice)
260+
self.df.loc[target, :]
222261

223-
def time_index_slice(self):
224-
self.mdt.loc[self.idx, :]
262+
def time_loc_null_slice_plus_slice(self, unique_levels):
263+
target = (self.tgt_null_slice, self.tgt_slice)
264+
self.df.loc[target, :]
225265

226266

227267
class IntervalIndexing:

asv_bench/benchmarks/join_merge.py

+13
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
158158
self.left.join(self.right, on="jim")
159159

160160

161+
class JoinEmpty:
162+
def setup(self):
163+
N = 100_000
164+
self.df = DataFrame({"A": np.arange(N)})
165+
self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
166+
167+
def time_inner_join_left_empty(self):
168+
self.df_empty.join(self.df, how="inner")
169+
170+
def time_inner_join_right_empty(self):
171+
self.df.join(self.df_empty, how="inner")
172+
173+
161174
class JoinNonUnique:
162175
# outer join of non-unique
163176
# GH 6329

azure-pipelines.yml

-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ variables:
2222
PANDAS_CI: 1
2323

2424
jobs:
25-
# Mac and Linux use the same template
2625
- template: ci/azure/posix.yml
2726
parameters:
2827
name: macOS

ci/azure/posix.yml

+11-6
Original file line numberDiff line numberDiff line change
@@ -10,30 +10,35 @@ jobs:
1010
strategy:
1111
matrix:
1212
py38:
13-
ENV_FILE: ci/deps/azure-macos-38.yaml
13+
ENV_FILE: ci/deps/actions-38.yaml
1414
CONDA_PY: "38"
1515

1616
py39:
17-
ENV_FILE: ci/deps/azure-macos-39.yaml
17+
ENV_FILE: ci/deps/actions-39.yaml
1818
CONDA_PY: "39"
1919

2020
py310:
21-
ENV_FILE: ci/deps/azure-macos-310.yaml
21+
ENV_FILE: ci/deps/actions-310.yaml
2222
CONDA_PY: "310"
2323

2424
steps:
2525
- script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
2626
displayName: 'Set conda path'
2727

28+
- script: rm /usr/local/miniconda/pkgs/cache/*.json
29+
displayName: 'Workaround for mamba-org/mamba#488'
30+
2831
- script: ci/setup_env.sh
2932
displayName: 'Setup environment and build pandas'
3033

3134
- script: |
32-
source activate pandas-dev
33-
ci/run_tests.sh
35+
conda run -n pandas-dev --no-capture-output ci/run_tests.sh
3436
displayName: 'Test'
3537
36-
- script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
38+
- script: |
39+
pushd /tmp
40+
conda run -n pandas-dev python -c "import pandas; pandas.show_versions()"
41+
popd
3742
displayName: 'Build versions'
3843
3944
- task: PublishTestResults@2

ci/azure/windows.yml

+10-4
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,22 @@ jobs:
2626
Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
2727
Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin"
2828
displayName: 'Add conda to PATH'
29-
- script: conda update -q -n base conda
30-
displayName: 'Update conda'
29+
- bash: conda install -yv -c conda-forge -n base 'mamba>=0.21.2'
30+
displayName: 'Install mamba'
3131

3232
- bash: |
33-
conda env create -q --file ci\\deps\\actions-$(CONDA_PY).yaml
33+
# See https://github.com/mamba-org/mamba/issues/1370
34+
# See https://github.com/mamba-org/mamba/issues/633
35+
C:\\Miniconda\\condabin\\mamba.bat create -n pandas-dev
36+
C:\\Miniconda\\condabin\\mamba.bat env update -n pandas-dev --file ci\\deps\\actions-$(CONDA_PY).yaml
37+
# TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941
38+
C:\\Miniconda\\condabin\\mamba.bat install -n pandas-dev 'setuptools<60'
39+
C:\\Miniconda\\condabin\\mamba.bat list -n pandas-dev
3440
displayName: 'Create anaconda environment'
3541
- bash: |
3642
source activate pandas-dev
3743
conda list
38-
python setup.py build_ext -q -j 4
44+
python setup.py build_ext -q -j 2
3945
python -m pip install --no-build-isolation -e .
4046
displayName: 'Build'
4147
- bash: |

ci/deps/actions-38-minimum_versions.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ dependencies:
1919
- numpy=1.18.5
2020
- pytz=2020.1
2121

22-
# optional dependencies
22+
# optional dependencies, markupsafe for jinja2
2323
- beautifulsoup4=4.8.2
2424
- blosc=1.20.1
2525
- bottleneck=1.3.1
@@ -29,6 +29,7 @@ dependencies:
2929
- gcsfs=0.6.0
3030
- jinja2=2.11
3131
- lxml=4.5.0
32+
- markupsafe=2.0.1
3233
- matplotlib=3.3.2
3334
- numba=0.50.1
3435
- numexpr=2.7.1

0 commit comments

Comments
 (0)