Commit b4d00f7

Merge remote-tracking branch 'upstream/master' into 24893-pivot_table

2 parents: 45ffc77 + 261c3a6

112 files changed, +1963 -1049 lines changed

.pre-commit-config.yaml (-4)

@@ -15,7 +15,3 @@ repos:
     hooks:
     - id: isort
       language: python_venv
--   repo: https://github.com/asottile/seed-isort-config
-    rev: v1.9.2
-    hooks:
-    - id: seed-isort-config

asv_bench/asv.conf.json (+2 -1)

@@ -50,12 +50,13 @@
     "xlsxwriter": [],
     "xlrd": [],
     "xlwt": [],
+    "odfpy": [],
     "pytest": [],
     // If using Windows with python 2.7 and want to build using the
     // mingw toolchain (rather than MSVC), uncomment the following line.
     // "libpython": [],
     },
-
+    "conda_channels": ["defaults", "conda-forge"],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional
     // key-value pairs to include/exclude.

asv_bench/benchmarks/io/excel.py (+54 -22)

@@ -1,40 +1,72 @@
 from io import BytesIO
 
 import numpy as np
+from odf.opendocument import OpenDocumentSpreadsheet
+from odf.table import Table, TableCell, TableRow
+from odf.text import P
 
 from pandas import DataFrame, ExcelWriter, date_range, read_excel
 import pandas.util.testing as tm
 
 
-class Excel:
+def _generate_dataframe():
+    N = 2000
+    C = 5
+    df = DataFrame(
+        np.random.randn(N, C),
+        columns=["float{}".format(i) for i in range(C)],
+        index=date_range("20000101", periods=N, freq="H"),
+    )
+    df["object"] = tm.makeStringIndex(N)
+    return df
+
+
+class WriteExcel:
 
     params = ["openpyxl", "xlsxwriter", "xlwt"]
     param_names = ["engine"]
 
     def setup(self, engine):
-        N = 2000
-        C = 5
-        self.df = DataFrame(
-            np.random.randn(N, C),
-            columns=["float{}".format(i) for i in range(C)],
-            index=date_range("20000101", periods=N, freq="H"),
-        )
-        self.df["object"] = tm.makeStringIndex(N)
-        self.bio_read = BytesIO()
-        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
-        self.df.to_excel(self.writer_read, sheet_name="Sheet1")
-        self.writer_read.save()
-        self.bio_read.seek(0)
-
-    def time_read_excel(self, engine):
-        read_excel(self.bio_read)
+        self.df = _generate_dataframe()
 
     def time_write_excel(self, engine):
-        bio_write = BytesIO()
-        bio_write.seek(0)
-        writer_write = ExcelWriter(bio_write, engine=engine)
-        self.df.to_excel(writer_write, sheet_name="Sheet1")
-        writer_write.save()
+        bio = BytesIO()
+        bio.seek(0)
+        writer = ExcelWriter(bio, engine=engine)
+        self.df.to_excel(writer, sheet_name="Sheet1")
+        writer.save()
+
+
+class ReadExcel:
+
+    params = ["xlrd", "openpyxl", "odf"]
+    param_names = ["engine"]
+    fname_excel = "spreadsheet.xlsx"
+    fname_odf = "spreadsheet.ods"
+
+    def _create_odf(self):
+        doc = OpenDocumentSpreadsheet()
+        table = Table(name="Table1")
+        for row in self.df.values:
+            tr = TableRow()
+            for val in row:
+                tc = TableCell(valuetype="string")
+                tc.addElement(P(text=val))
+                tr.addElement(tc)
+            table.addElement(tr)
+
+        doc.spreadsheet.addElement(table)
+        doc.save(self.fname_odf)
+
+    def setup_cache(self):
+        self.df = _generate_dataframe()
+
+        self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
+        self._create_odf()
+
+    def time_read_excel(self, engine):
+        fname = self.fname_odf if engine == "odf" else self.fname_excel
+        read_excel(fname, engine=engine)
 
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip

asv_bench/benchmarks/io/json.py (+2 -2)

@@ -118,15 +118,15 @@ def setup(self, orient, frame):
     def time_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
-    def mem_to_json(self, orient, frame):
+    def peakmem_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
     def time_to_json_wide(self, orient, frame):
         base_df = getattr(self, frame).copy()
         df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
         df.to_json(self.fname, orient=orient)
 
-    def mem_to_json_wide(self, orient, frame):
+    def peakmem_to_json_wide(self, orient, frame):
         base_df = getattr(self, frame).copy()
         df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
         df.to_json(self.fname, orient=orient)
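
The rename above follows asv's method-name conventions: a `time_` prefix measures wall-clock time, `mem_` measures the size of the object the method returns, and `peakmem_` measures the peak resident memory of the process while the method runs, which is the more useful number for I/O benchmarks like these. A minimal sketch of the convention (the class and data here are illustrative, not part of this commit):

    class SumConvention:
        # asv discovers benchmarks by prefix, not by registration:
        #   time_*    -> wall-clock time of the call
        #   mem_*     -> size of the object the method returns
        #   peakmem_* -> peak resident memory of the process during the call
        def setup(self):
            self.data = list(range(1_000_000))

        def time_sum(self):
            sum(self.data)

        def peakmem_sum(self):
            # identical body; only the prefix changes what asv records
            sum(self.data)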

asv_bench/benchmarks/package.py (+25)

@@ -0,0 +1,25 @@
+"""
+Benchmarks for pandas at the package-level.
+"""
+import subprocess
+import sys
+
+from pandas.compat import PY37
+
+
+class TimeImport:
+    def time_import(self):
+        if PY37:
+            # on py37+ the "-X importtime" usage gives us a more precise
+            # measurement of the import time we actually care about,
+            # without the subprocess or interpreter overhead
+            cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
+            p = subprocess.run(cmd, stderr=subprocess.PIPE)
+
+            line = p.stderr.splitlines()[-1]
+            field = line.split(b"|")[-2].strip()
+            total = int(field)  # microseconds
+            return total
+
+        cmd = [sys.executable, "-c", "import pandas as pd"]
+        subprocess.run(cmd, stderr=subprocess.PIPE)
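
For reference, `-X importtime` writes one line per imported module to stderr in the form `import time: <self us> | <cumulative us> | <module>`, with the module requested by `-c` reported last, so the second-to-last `|`-separated field of the final line is the total import time in microseconds. A small sketch of that parse on a made-up sample line (the values are illustrative):

    # Sample stderr line from "python -X importtime" (numbers are made up):
    line = b"import time:      1523 |     834467 | pandas"

    # The cumulative-microseconds column sits just before the module name.
    field = line.split(b"|")[-2].strip()
    total_us = int(field)
    assert total_us == 834467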

asv_bench/benchmarks/rolling.py (+3)

@@ -21,6 +21,9 @@ def setup(self, constructor, window, dtype, method):
     def time_rolling(self, constructor, window, dtype, method):
         getattr(self.roll, method)()
 
+    def peakmem_rolling(self, constructor, window, dtype, method):
+        getattr(self.roll, method)()
+
 
 class ExpandingMethods:

asv_bench/benchmarks/stat_ops.py (+11)

@@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck):
         nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
+        self.df_wide = pd.DataFrame(np.random.randn(1000, 200))
+        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9)
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))
 
     def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
+    def time_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
+    def time_corr_wide_nans(self, method, use_bottleneck):
+        self.df_wide_nans.corr(method=method)
+
+    def peakmem_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
     def time_corr_series(self, method, use_bottleneck):
         self.s.corr(self.s2, method=method)

ci/code_checks.sh (+7 -3)

@@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
     import sys
     import pandas
 
-    blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis',
+    blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis',
                  'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy',
-                 'tables', 'xlrd', 'xlsxwriter', 'xlwt'}
-    mods = blacklist & set(m.split('.')[0] for m in sys.modules)
+                 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'}
+
+    # GH#28227 for some of these check for top-level modules, while others are
+    # more specific (e.g. urllib.request)
+    import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules)
+    mods = blacklist & import_mods
     if mods:
         sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
         sys.exit(len(mods))
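
The union in the new check matters because `sys.modules` keys are full dotted paths: splitting on `'.'` reduces `urllib.request` to `urllib`, which the interpreter imports anyway, so only the unsplit set can flag it, while the split set still catches any submodule of a blacklisted top-level package. A small sketch of both cases (the `loaded` set is illustrative):

    blacklist = {'http', 'urllib.request'}

    # Suppose these ended up in sys.modules after "import pandas":
    loaded = {'urllib', 'urllib.request', 'http', 'http.client', 'pandas.core'}

    top_level = {m.split('.')[0] for m in loaded}  # {'urllib', 'http', 'pandas'}
    import_mods = top_level | loaded               # union keeps the dotted names too

    print(sorted(blacklist & import_mods))         # ['http', 'urllib.request']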

ci/deps/azure-windows-36.yaml (+1 -3)

@@ -1,17 +1,15 @@
 name: pandas-dev
 channels:
-  - defaults
   - conda-forge
+  - defaults
 dependencies:
   - blosc
   - bottleneck
-  - boost-cpp<1.67
   - fastparquet>=0.2.1
   - matplotlib=3.0.2
   - numexpr
   - numpy=1.15.*
   - openpyxl
-  - parquet-cpp
   - pyarrow
   - pytables
   - python-dateutil

doc/source/development/contributing.rst (+38 -3)

@@ -135,9 +135,44 @@ operations. To install pandas from source, you need to compile these C
 extensions, which means you need a C compiler. This process depends on which
 platform you're using.
 
-* Windows: https://devguide.python.org/setup/#windows-compiling
-* Mac: https://devguide.python.org/setup/#macos
-* Unix: https://devguide.python.org/setup/#unix-compiling
+**Windows**
+
+You will need `Build Tools for Visual Studio 2017
+<https://visualstudio.microsoft.com/downloads/>`_.
+
+.. warning::
+    You DO NOT need to install Visual Studio 2019.
+    You only need "Build Tools for Visual Studio 2019" found by
+    scrolling down to "All downloads" -> "Tools for Visual Studio 2019".
+
+**Mac OS**
+
+Information about compiler installation can be found here:
+https://devguide.python.org/setup/#macos
+
+**Unix**
+
+Some Linux distributions will come with a pre-installed C compiler. To find out
+which compilers (and versions) are installed on your system::
+
+    # for Debian/Ubuntu:
+    dpkg --list | grep compiler
+    # for Red Hat/RHEL/CentOS/Fedora:
+    yum list installed | grep -i --color compiler
+
+`GCC (GNU Compiler Collection) <https://gcc.gnu.org/>`_ is a widely used
+compiler, which supports C and a number of other languages. If GCC is listed
+as an installed compiler nothing more is required. If no C compiler is
+installed (or you wish to install a newer version) you can install a compiler
+(GCC in the example code below) with::
+
+    # for recent Debian/Ubuntu:
+    sudo apt install build-essential
+    # for Red Hat/RHEL/CentOS/Fedora:
+    yum groupinstall "Development Tools"
+
+For other Linux distributions, consult your favourite search engine for
+compiler installation instructions.
 
 Let us know if you have any difficulties by opening an issue or reaching out on
 `Gitter`_.

doc/source/ecosystem.rst (+15 -12)

@@ -23,6 +23,21 @@ or `search pypi for pandas <https://pypi.org/search/?q=pandas>`_.
 We'd like to make it easier for users to find these projects, if you know of other
 substantial projects that you feel should be on this list, please let us know.
 
+.. _ecosystem.data_cleaning_and_validation:
+
+Data cleaning and validation
+----------------------------
+
+`pyjanitor <https://github.com/ericmjl/pyjanitor/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pyjanitor provides a clean API for cleaning data, using method chaining.
+
+`Engarde <https://engarde.readthedocs.io/en/latest/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Engarde is a lightweight library used to explicitly state assumptions about your datasets
+and check that they're *actually* true.
 
 .. _ecosystem.stats:
 
@@ -329,18 +344,6 @@ Increasingly, packages are being built on top of pandas to address specific need
 * vaex.from_pandas
 * vaex.to_pandas_df
 
-
-.. _ecosystem.data_validation:
-
-Data validation
----------------
-
-`Engarde <https://engarde.readthedocs.io/en/latest/>`__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Engarde is a lightweight library used to explicitly state your assumptions about your datasets
-and check that they're *actually* true.
-
 .. _ecosystem.extensions:
 
 Extension data types

doc/source/reference/plotting.rst (+4)

@@ -13,10 +13,14 @@ The following functions are contained in the `pandas.plotting` module.
    :toctree: api/
 
    andrews_curves
+   autocorrelation_plot
    bootstrap_plot
+   boxplot
    deregister_matplotlib_converters
    lag_plot
    parallel_coordinates
+   plot_params
    radviz
    register_matplotlib_converters
    scatter_matrix
+   table

doc/source/user_guide/io.rst (+23)

@@ -5047,6 +5047,17 @@ Example of a callable using PostgreSQL `COPY clause
     from io import StringIO
 
     def psql_insert_copy(table, conn, keys, data_iter):
+        """
+        Execute SQL statement inserting data
+
+        Parameters
+        ----------
+        table : pandas.io.sql.SQLTable
+        conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+        keys : list of str
+            Column names
+        data_iter : Iterable that iterates the values to be inserted
+        """
         # gets a DBAPI connection that can provide a cursor
         dbapi_conn = conn.connection
         with dbapi_conn.cursor() as cur:
 
@@ -5080,6 +5091,18 @@ table name and optionally a subset of columns to read.
 
     pd.read_sql_table('data', engine)
 
+.. note::
+
+    Note that pandas infers column dtypes from query outputs, and not by looking
+    up data types in the physical database schema. For example, assume ``userid``
+    is an integer column in a table. Then, intuitively, ``select userid ...`` will
+    return integer-valued series, while ``select cast(userid as text) ...`` will
+    return object-valued (str) series. Accordingly, if the query output is empty,
+    then all resulting columns will be returned as object-valued (since they are
+    most general). If you foresee that your query will sometimes generate an empty
+    result, you may want to explicitly typecast afterwards to ensure dtype
+    integrity.
+
 You can also specify the name of the column as the ``DataFrame`` index,
 and specify a subset of columns to be read.
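
A minimal sketch of the typecast the new note recommends, using an in-memory SQLite database (the table and column names are illustrative):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("sqlite:///:memory:")
    pd.DataFrame({"userid": [1, 2, 3]}).to_sql("data", engine, index=False)

    # A query that happens to match no rows comes back with object dtype.
    df = pd.read_sql("SELECT userid FROM data WHERE userid < 0", engine)

    # Casting afterwards pins the dtype whether or not any rows were returned.
    df["userid"] = df["userid"].astype("int64")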

doc/source/user_guide/options.rst (+1 -1)

@@ -163,7 +163,7 @@ determines how many rows are shown in the truncated repr.
 .. ipython:: python
 
    pd.set_option('max_rows', 8)
-   pd.set_option('max_rows', 4)
+   pd.set_option('min_rows', 4)
    # below max_rows -> all rows shown
    df = pd.DataFrame(np.random.randn(7, 2))
    df

doc/source/whatsnew/v0.25.2.rst (+1)

@@ -78,6 +78,7 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
+- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
 -
 -
 -
