
Commit 7fbfb65

committed (2 parents: 4dac36a + 18ab444)

File tree

464 files changed: +20010 -14305 lines


.pre-commit-config.yaml (+1 -1)

@@ -3,7 +3,7 @@ repos:
     rev: 19.10b0
     hooks:
     - id: black
-      language_version: python3.7
+      language_version: python3
 -   repo: https://gitlab.com/pycqa/flake8
     rev: 3.7.7
     hooks:

.travis.yml (+21 -3)

@@ -14,6 +14,8 @@ cache:

 env:
   global:
+    # Variable for test workers
+    - PYTEST_WORKERS="auto"
     # create a github personal access token
     # cd pandas-dev/pandas
     # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas
@@ -27,12 +29,21 @@ matrix:
   fast_finish: true

   include:
+    # In allowed failures
+    - dist: bionic
+      python: 3.9-dev
+      env:
+        - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
     - env:
         - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)"

     - env:
         - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)"

+    - arch: arm64
+      env:
+        - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)"
+
     - env:
         - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
       services:
@@ -53,11 +64,18 @@ matrix:
       services:
         - mysql
         - postgresql
+  allow_failures:
+    - arch: arm64
+      env:
+        - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)"
+    - dist: bionic
+      python: 3.9-dev
+      env:
+        - JOB="3.9-dev" PATTERN="(not slow and not network)"

 before_install:
   - echo "before_install"
-  # set non-blocking IO on travis
-  # https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024
+  # Use blocking IO on travis. Ref: https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024
   - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);'
   - source ci/travis_process_gbq_encryption.sh
   - export PATH="$HOME/miniconda3/bin:$PATH"
@@ -83,7 +101,7 @@ install:
 script:
   - echo "script start"
   - echo "$JOB"
-  - source activate pandas-dev
+  - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi
   - ci/run_tests.sh

 after_script:

README.md (+4 -3)

@@ -16,10 +16,11 @@
 [![Downloads](https://anaconda.org/conda-forge/pandas/badges/downloads.svg)](https://pandas.pydata.org)
 [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas)
 [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

 ## What is it?

-**pandas** is a Python package providing fast, flexible, and expressive data
+**pandas** is a Python package that provides fast, flexible, and expressive data
 structures designed to make working with "relational" or "labeled" data both
 easy and intuitive. It aims to be the fundamental high-level building block for
 doing practical, **real world** data analysis in Python. Additionally, it has
@@ -153,11 +154,11 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove
 Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).

 ## Discussion and Development
-Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
+Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.

 ## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)

-All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
+All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome.

 A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub.

asv_bench/benchmarks/algorithms.py (+14 -3)

@@ -34,7 +34,16 @@ class Factorize:
     params = [
         [True, False],
         [True, False],
-        ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
+        [
+            "int",
+            "uint",
+            "float",
+            "string",
+            "datetime64[ns]",
+            "datetime64[ns, tz]",
+            "Int64",
+            "boolean",
+        ],
     ]
     param_names = ["unique", "sort", "dtype"]

@@ -49,13 +58,15 @@ def setup(self, unique, sort, dtype):
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
+            "Int64": pd.array(np.arange(N), dtype="Int64"),
+            "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
         }[dtype]
         if not unique:
             data = data.repeat(5)
-        self.idx = data
+        self.data = data

     def time_factorize(self, unique, sort, dtype):
-        self.idx.factorize(sort=sort)
+        pd.factorize(self.data, sort=sort)


 class Duplicated:
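
The two dtypes added above are pandas' nullable extension types. Not part of the commit: a minimal sketch of what the extended Factorize benchmark now exercises — pd.factorize over Int64 and boolean extension arrays (array sizes and contents here are illustrative).

```python
import numpy as np
import pandas as pd

# Nullable extension arrays, the two dtypes newly added to the benchmark matrix.
int64_arr = pd.array(np.arange(10), dtype="Int64")
bool_arr = pd.array(np.random.randint(0, 2, 10), dtype="boolean")

# pd.factorize returns integer codes plus the unique values;
# sort=True mirrors the benchmark's `sort` parameter.
codes, uniques = pd.factorize(int64_arr, sort=True)
print(codes[:5], uniques[:5])

codes, uniques = pd.factorize(bool_arr, sort=True)
print(codes[:5], uniques)
```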

asv_bench/benchmarks/arithmetic.py (+79 -1)

@@ -101,6 +101,59 @@ def time_frame_op_with_series_axis1(self, opname):
         getattr(operator, opname)(self.df, self.ser)


+class FrameWithFrameWide:
+    # Many-columns, mixed dtypes
+
+    params = [
+        [
+            # GH#32779 has discussion of which operators are included here
+            operator.add,
+            operator.floordiv,
+            operator.gt,
+        ]
+    ]
+    param_names = ["op"]
+
+    def setup(self, op):
+        # we choose dtypes so as to make the blocks
+        #  a) not perfectly match between right and left
+        #  b) appreciably bigger than single columns
+        n_cols = 2000
+        n_rows = 500
+
+        # construct dataframe with 2 blocks
+        arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
+        arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
+        df = pd.concat(
+            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
+        )
+        # should already be the case, but just to be sure
+        df._consolidate_inplace()
+
+        # TODO: GH#33198 the setting here shouldn't need two steps
+        arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
+        arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
+        arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
+        df2 = pd.concat(
+            [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
+            axis=1,
+            ignore_index=True,
+        )
+        # should already be the case, but just to be sure
+        df2._consolidate_inplace()
+
+        self.left = df
+        self.right = df2
+
+    def time_op_different_blocks(self, op):
+        # blocks (and dtypes) are not aligned
+        op(self.left, self.right)
+
+    def time_op_same_blocks(self, op):
+        # blocks (and dtypes) are aligned
+        op(self.left, self.left)
+
+
 class Ops:

     params = [[True, False], ["default", 1]]
@@ -413,7 +466,32 @@ def setup(self, offset):
         self.rng = rng

     def time_apply_index(self, offset):
-        offset.apply_index(self.rng)
+        self.rng + offset
+
+
+class BinaryOpsMultiIndex:
+    params = ["sub", "add", "mul", "div"]
+    param_names = ["func"]
+
+    def setup(self, func):
+        date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S")
+        level_0_names = [str(i) for i in range(30)]
+
+        index = pd.MultiIndex.from_product([level_0_names, date_range])
+        column_names = ["col_1", "col_2"]
+
+        self.df = pd.DataFrame(
+            np.random.rand(len(index), 2), index=index, columns=column_names
+        )
+
+        self.arg_df = pd.DataFrame(
+            np.random.randint(1, 10, (len(level_0_names), 2)),
+            index=level_0_names,
+            columns=column_names,
+        )
+
+    def time_binary_op_multiindex(self, func):
+        getattr(self.df, func)(self.arg_df, level=0)


 from .pandas_vb_common import setup  # noqa: F401 isort:skip
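
Not part of the commit: a minimal sketch of the two patterns the updated arithmetic benchmarks time — adding a DateOffset directly to a DatetimeIndex (what time_apply_index now measures instead of calling offset.apply_index) and a DataFrame binary op aligned on the outer level of a MultiIndex, as in BinaryOpsMultiIndex. Sizes and labels below are illustrative.

```python
import numpy as np
import pandas as pd

# Vectorized offset addition on a DatetimeIndex.
rng = pd.date_range("2020-01-01", periods=5, freq="D")
shifted = rng + pd.offsets.MonthEnd()

# Binary op aligned on level 0 of a MultiIndex: each row of `arg` is
# broadcast across its matching outer-level group of `df`.
level_0 = ["a", "b", "c"]
idx = pd.MultiIndex.from_product([level_0, pd.date_range("2020-01-01", periods=4, freq="H")])
df = pd.DataFrame(np.random.rand(len(idx), 2), index=idx, columns=["col_1", "col_2"])
arg = pd.DataFrame(
    np.random.randint(1, 10, (len(level_0), 2)), index=level_0, columns=["col_1", "col_2"]
)
result = df.sub(arg, level=0)
print(shifted)
print(result.head())
```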

asv_bench/benchmarks/indexing.py (+2 -2)

@@ -158,9 +158,9 @@ def time_boolean_rows_boolean(self):
 class DataFrameNumericIndexing:
     def setup(self):
         self.idx_dupe = np.array(range(30)) * 99
-        self.df = DataFrame(np.random.randn(10000, 5))
+        self.df = DataFrame(np.random.randn(100000, 5))
         self.df_dup = concat([self.df, 2 * self.df, 3 * self.df])
-        self.bool_indexer = [True] * 5000 + [False] * 5000
+        self.bool_indexer = [True] * 50000 + [False] * 50000

     def time_iloc_dups(self):
         self.df_dup.iloc[self.idx_dupe]
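
Not part of the commit: a minimal sketch, at a much smaller size than the enlarged benchmark, of the two operations DataFrameNumericIndexing times — positional lookup into a frame whose index contains duplicate labels, and boolean row selection.

```python
import numpy as np
from pandas import DataFrame, concat

df = DataFrame(np.random.randn(1_000, 5))
df_dup = concat([df, 2 * df, 3 * df])   # index labels now repeat three times
idx_dupe = np.array(range(30)) * 9

rows = df_dup.iloc[idx_dupe]            # .iloc is purely positional, so duplicate labels are fine
mask = [True] * 500 + [False] * 500
subset = df[mask]                       # boolean row selection
print(rows.shape, subset.shape)
```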

asv_bench/benchmarks/io/json.py (+6)

@@ -53,12 +53,18 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def time_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=25000)
+
     def peakmem_read_json_lines(self, index):
         read_json(self.fname, orient="records", lines=True)

     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def peakmem_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=15000)
+

 class ToJSON(BaseIO):
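
Not part of the commit: a minimal sketch of the path the new *_nrows benchmarks cover — with lines=True, read_json can stop after the first nrows JSON lines instead of parsing the whole file. The file name below is hypothetical.

```python
import pandas as pd

# Write a modest line-delimited JSON file to read back.
pd.DataFrame({"a": range(100_000), "b": range(100_000)}).to_json(
    "example.jsonl", orient="records", lines=True
)

# Only the first 25000 lines are parsed.
head = pd.read_json("example.jsonl", orient="records", lines=True, nrows=25_000)
assert len(head) == 25_000
```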

asv_bench/benchmarks/rolling.py (+23)

@@ -186,4 +186,27 @@ def peakmem_rolling(self, constructor, window_size, dtype, method):
         getattr(self.roll, method)()


+class Groupby:
+
+    params = ["sum", "median", "mean", "max", "min", "kurt", "sum"]
+
+    def setup(self, method):
+        N = 1000
+        df = pd.DataFrame(
+            {
+                "A": [str(i) for i in range(N)] * 10,
+                "B": list(range(N)) * 10,
+                "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),
+            }
+        )
+        self.groupby_roll_int = df.groupby("A").rolling(window=2)
+        self.groupby_roll_offset = df.groupby("A").rolling(window="30s", on="C")
+
+    def time_rolling_int(self, method):
+        getattr(self.groupby_roll_int, method)()
+
+    def time_rolling_offset(self, method):
+        getattr(self.groupby_roll_offset, method)()
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
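
Not part of the commit: a minimal sketch of the groupby-rolling pattern the new Groupby benchmark exercises, with both a row-count window and a time-based window anchored on a datetime column (the data is illustrative).

```python
import pandas as pd

df = pd.DataFrame(
    {
        "A": ["x", "x", "x", "y", "y", "y"],
        "B": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        "C": pd.date_range("1900-01-01", freq="1min", periods=6),
    }
)

# Rolling over a fixed number of rows within each group.
by_rows = df.groupby("A")[["B"]].rolling(window=2).sum()

# Rolling over a 30-second window within each group, anchored on the datetime column "C".
by_time = df.groupby("A")[["B", "C"]].rolling(window="30s", on="C").sum()

print(by_rows)
print(by_time)
```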

azure-pipelines.yml (+3)

@@ -5,6 +5,9 @@ trigger:
 pr:
 - master

+variables:
+  PYTEST_WORKERS: auto
+
 jobs:
 # Mac and Linux use the same template
 - template: ci/azure/posix.yml

ci/azure/windows.yml (+1 -1)

@@ -13,7 +13,7 @@ jobs:
         CONDA_PY: "36"
         PATTERN: "not slow and not network"

-      py37_np141:
+      py37_np18:
         ENV_FILE: ci/deps/azure-windows-37.yaml
         CONDA_PY: "37"
         PATTERN: "not slow and not network"

ci/build39.sh (new file, +21)

@@ -0,0 +1,21 @@
+#!/bin/bash -e
+# Special build for python3.9 until numpy puts its own wheels up
+
+sudo apt-get install build-essential gcc xvfb
+pip install --no-deps -U pip wheel setuptools
+pip install python-dateutil pytz pytest pytest-xdist hypothesis
+pip install cython --pre  # https://github.com/cython/cython/issues/3395
+
+git clone https://github.com/numpy/numpy
+cd numpy
+python setup.py build_ext --inplace
+python setup.py install
+cd ..
+rm -rf numpy
+
+python setup.py build_ext --inplace
+python -m pip install --no-build-isolation -e .
+
+python -c "import sys; print(sys.version_info)"
+python -c "import pandas as pd"
+python -c "import hypothesis"

ci/deps/azure-37-numpydev.yaml (+1 -1)

@@ -16,7 +16,7 @@ dependencies:
   - pip:
     - cython==0.29.16  # GH#34014
     - "git+git://github.com/dateutil/dateutil.git"
-    - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"
+    - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple"
     - "--pre"
     - "numpy"
     - "scipy"

ci/deps/azure-windows-37.yaml (+1 -1)

@@ -22,7 +22,7 @@ dependencies:
   - matplotlib=2.2.*
   - moto
   - numexpr
-  - numpy=1.14.*
+  - numpy=1.18.*
   - openpyxl
   - pyarrow=0.14
   - pytables

ci/deps/travis-36-locale.yaml (+1 -1)

@@ -27,7 +27,7 @@ dependencies:
   - numexpr
   - numpy
   - openpyxl
-  - pandas-gbq=0.8.0
+  - pandas-gbq=0.12.0
   - psycopg2=2.6.2
   - pymysql=0.7.11
   - pytables

ci/deps/travis-37-arm64.yaml (new file, +21)

@@ -0,0 +1,21 @@
+name: pandas-dev
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - python=3.7.*
+
+  # tools
+  - cython>=0.29.13
+  - pytest>=5.0.1
+  - pytest-xdist>=1.21
+  - hypothesis>=3.58.0
+
+  # pandas dependencies
+  - botocore>=1.11
+  - numpy
+  - python-dateutil
+  - pytz
+  - pip
+  - pip:
+    - moto

ci/run_tests.sh (+1 -1)

@@ -20,7 +20,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then
     XVFB="xvfb-run "
 fi

-PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
+PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"

 echo $PYTEST_CMD
 sh -c "$PYTEST_CMD"
