Skip to content

Commit 147d14b

Browse files
committed
Merge branch 'main' of https://github.com/stellalin7/pandas into pylint-48855-C-type-disallowed-name
2 parents 9c676c4 + f9ff379 commit 147d14b

File tree

103 files changed

+326
-526
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+326
-526
lines changed

asv_bench/benchmarks/reshape.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def setup(self):
3636
self.df = DataFrame(data)
3737

3838
def time_reshape_pivot_time_series(self):
39-
self.df.pivot("date", "variable", "value")
39+
self.df.pivot(index="date", columns="variable", values="value")
4040

4141

4242
class SimpleReshape:

doc/scripts/eval_performance.py

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from timeit import repeat as timeit
2+
3+
import numpy as np
4+
import seaborn as sns
5+
6+
from pandas import DataFrame
7+
8+
# Setup code that ``timeit`` executes before each timed statement.
# ``%d`` is filled with the number of rows of the benchmark frame and the
# trailing ``%s`` with a statement defining ``s``, the expression under test.
# The template must contain nothing but valid Python: stray text inside the
# string would be executed as setup code and fail.
setup_common = """from pandas import DataFrame
from numpy.random import randn
df = DataFrame(randn(%d, 3), columns=list('abc'))
%s"""

# Statement defining the arithmetic expression timed through ``df.eval``.
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
14+
15+
16+
def bench_with(n, times=10, repeat=3, engine="numexpr"):
    """Time ``df.eval(s, engine=...)`` on a frame with ``n`` rows.

    The statement is run ``times`` times per repetition and ``repeat``
    repetitions are made.  Returns a NumPy array with one entry per
    repetition, each being that repetition's total time divided by
    ``times``.
    """
    statement = "df.eval(s, engine=%r)" % engine
    setup_code = setup_common % (n, setup_with)
    totals = timeit(statement, setup=setup_code, repeat=repeat, number=times)
    return np.array(totals) / times
28+
29+
30+
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
31+
32+
33+
def bench_subset(n, times=20, repeat=3, engine="numexpr"):
    """Time ``df.query(s, engine=...)`` on a frame with ``n`` rows.

    The statement is run ``times`` times per repetition and ``repeat``
    repetitions are made.  Returns a NumPy array with one entry per
    repetition, each being that repetition's total time divided by
    ``times``.
    """
    statement = "df.query(s, engine=%r)" % engine
    setup_code = setup_common % (n, setup_subset)
    totals = timeit(statement, setup=setup_code, repeat=repeat, number=times)
    return np.array(totals) / times
45+
46+
47+
def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
    """Benchmark ``DataFrame.eval`` and ``DataFrame.query`` over frame sizes.

    Parameters
    ----------
    mn, mx : int
        Base-10 exponents of the smallest and largest frame sizes.
    num : int
        Number of log-spaced frame sizes between ``10**mn`` and ``10**mx``.
    engines : sequence of str
        Engines to time; each one becomes a column of the results.
    verbose : bool
        If truthy, print a progress line for every tenth frame size.

    Returns
    -------
    ev, qu : DataFrame
        Timings for ``eval`` and ``query`` respectively: one row per frame
        size, one column per engine, plus a ``"size"`` column holding the
        number of rows.
    """
    sizes = np.logspace(mn, mx, num=num).round().astype(int)

    # Every cell of the placeholder data is overwritten in the loop below.
    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
    qu = ev.copy(deep=True)

    ev["size"] = qu["size"] = sizes

    for engine in engines:
        for i, n in enumerate(sizes):
            # Logical ``and``, not bitwise ``&``: with ``&`` a truthy
            # non-bool flag such as ``verbose=2`` evaluates to 0 and
            # silently disables the progress output.
            if verbose and i % 10 == 0:
                print("engine: %r, i == %d" % (engine, i))
            ev_times = bench_with(n, times=1, repeat=1, engine=engine)
            ev.loc[i, engine] = np.mean(ev_times)
            qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
            qu.loc[i, engine] = np.mean(qu_times)

    return ev, qu
65+
66+
67+
def plot_perf(df, engines, title, filename=None):
    """Draw a log-log plot of running time against frame size.

    One line is drawn per entry of ``engines``, reading the x values from
    ``df["size"]`` and the y values from the column named after the engine.
    The figure is written to ``filename`` unless it is ``None``.
    """
    from matplotlib.pyplot import figure

    sns.set()
    sns.set_palette("Set2")

    canvas = figure(figsize=(4, 3), dpi=120)
    axes = canvas.add_subplot(111)

    for engine in engines:
        axes.loglog(df["size"], df[engine], label=engine, lw=2)

    axes.set_xlabel("Number of Rows")
    axes.set_ylabel("Time (s)")
    axes.set_title(title)
    axes.legend(loc="best")
    axes.tick_params(top=False, right=False)

    canvas.tight_layout()

    if filename is None:
        return
    canvas.savefig(filename)
89+
90+
91+
if __name__ == "__main__":
    import os

    # This script lives in <repo>/doc/scripts/, so two levels above its
    # directory is the repository root.
    pandas_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    )
    static_path = os.path.join(pandas_dir, "doc", "source", "_static")

    def join(p):
        """Return the path of ``p`` inside the docs ``_static`` directory."""
        return os.path.join(static_path, p)

    engines = "python", "numexpr"

    # Benchmark exactly the engines that are plotted below so the two
    # cannot drift apart.
    ev, qu = bench(engines=engines, verbose=True)

    plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
    plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
-24.7 KB
Binary file not shown.

doc/source/_static/eval-perf.png

10.8 KB
Loading
-21.2 KB
Binary file not shown.

doc/source/_static/query-perf.png

8.79 KB
Loading

doc/source/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@
236236
if ".dev" in version:
237237
switcher_version = "dev"
238238
elif "rc" in version:
239-
switcher_version = version.split("rc")[0] + " (rc)"
239+
switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)"
240240

241241
html_theme_options = {
242242
"external_links": [],

doc/source/development/contributing_environment.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ To test out code changes, you'll need to build pandas from source, which
1010
requires a C/C++ compiler and Python environment. If you're making documentation
1111
changes, you can skip to :ref:`contributing to the documentation <contributing_documentation>` but if you skip
1212
creating the development environment you won't be able to build the documentation
13-
locally before pushing your changes.
13+
locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks <contributing.pre-commit>`.
1414

1515
.. contents:: Table of contents:
1616
:local:

doc/source/user_guide/enhancingperf.rst

+5-19
Original file line numberDiff line numberDiff line change
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
690690
df["a"] = 1
691691
df
692692
693-
The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
694-
whether the query modifies the original frame.
695-
696-
.. ipython:: python
697-
698-
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
699-
df.query("a > 2")
700-
df.query("a > 2", inplace=True)
701-
df
702-
703693
Local variables
704694
~~~~~~~~~~~~~~~
705695

706696
You must *explicitly reference* any local variable that you want to use in an
707-
expression by placing the ``@`` character in front of the name. For example,
697+
expression by placing the ``@`` character in front of the name. This mechanism is
698+
the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
708699

709700
.. ipython:: python
710701
@@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of
820811
:func:`pandas.eval` as function of the size of the frame involved in the
821812
computation. The two lines are two different engines.
822813

814+
..
815+
The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
823816
824817
.. image:: ../_static/eval-perf.png
825818

826-
827-
.. note::
828-
829-
Operations with smallish objects (around 15k-20k rows) are faster using
830-
plain Python:
831-
832-
.. image:: ../_static/eval-perf-small.png
833-
819+
You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
834820

835821
This plot was created using a :class:`DataFrame` with 3 columns each containing
836822
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/indexing.rst

+19-5
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
12401240
renaming your columns to something less ambiguous.
12411241

12421242

1243+
The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines
1244+
whether the query modifies the original frame.
1245+
1246+
.. ipython:: python
1247+
1248+
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
1249+
df.query("a > 2")
1250+
df.query("a > 2", inplace=True)
1251+
df
1252+
1253+
12431254
:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
12441255
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12451256

@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
14381449
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
14391450
large frames.
14401451

1452+
..
1453+
The query-perf.png figure below was generated with /doc/scripts/eval_performance.py
1454+
14411455
.. image:: ../_static/query-perf.png
14421456

1443-
.. note::
14441457

1445-
You will only see the performance benefits of using the ``numexpr`` engine
1446-
with ``DataFrame.query()`` if your frame has more than approximately 200,000
1447-
rows.
14481458

1449-
.. image:: ../_static/query-perf-small.png
1459+
You will only see the performance benefits of using the ``numexpr`` engine
1460+
with ``DataFrame.query()`` if your frame has more than approximately 100,000
1461+
rows.
1462+
1463+
14501464

14511465
This plot was created using a ``DataFrame`` with 3 columns each containing
14521466
floating point values generated using ``numpy.random.randn()``.

doc/source/whatsnew/v2.0.0.rst

+14
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,13 @@ Removal of prior version deprecations/changes
197197
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
198198
- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
199199
- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
200+
- Disallow passing non-keyword arguments to :meth:`DataFrame.set_index` except ``keys`` (:issue:`41495`)
201+
- Disallow passing non-keyword arguments to :meth:`Resampler.interpolate` except ``method`` (:issue:`41699`)
202+
- Disallow passing non-keyword arguments to :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` except ``level`` (:issue:`41496`)
203+
- Disallow passing non-keyword arguments to :meth:`DataFrame.dropna` and :meth:`Series.dropna` (:issue:`41504`)
204+
- Disallow passing non-keyword arguments to :meth:`ExtensionArray.argsort` (:issue:`46134`)
205+
- Disallow passing non-keyword arguments to :meth:`Categorical.sort_values` (:issue:`47618`)
206+
- Disallow passing non-keyword arguments to :meth:`Index.drop_duplicates` and :meth:`Series.drop_duplicates` (:issue:`41485`)
200207
- Disallow passing non-keyword arguments to :meth:`DataFrame.drop_duplicates` except for ``subset`` (:issue:`41485`)
201208
- Disallow passing non-keyword arguments to :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` (:issue:`41506`)
202209
- Disallow passing non-keyword arguments to :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` except for ``method`` (:issue:`41510`)
@@ -209,6 +216,9 @@ Removal of prior version deprecations/changes
209216
- Disallow passing non-keyword arguments to :func:`read_json` except for ``path_or_buf`` (:issue:`27573`)
210217
- Disallow passing non-keyword arguments to :func:`read_sas` except for ``filepath_or_buffer`` (:issue:`47154`)
211218
- Disallow passing non-keyword arguments to :func:`read_stata` except for ``filepath_or_buffer`` (:issue:`48128`)
219+
- Disallow passing non-keyword arguments to :func:`read_csv` except ``filepath_or_buffer`` (:issue:`41485`)
220+
- Disallow passing non-keyword arguments to :func:`read_table` except ``filepath_or_buffer`` (:issue:`41485`)
221+
- Disallow passing non-keyword arguments to :func:`read_fwf` except ``filepath_or_buffer`` (:issue:`44710`)
212222
- Disallow passing non-keyword arguments to :func:`read_xml` except for ``path_or_buffer`` (:issue:`45133`)
213223
- Disallow passing non-keyword arguments to :meth:`Series.mask` and :meth:`DataFrame.mask` except ``cond`` and ``other`` (:issue:`41580`)
214224
- Disallow passing non-keyword arguments to :meth:`DataFrame.to_stata` except for ``path`` (:issue:`48128`)
@@ -243,6 +253,7 @@ Removal of prior version deprecations/changes
243253
- Removed :meth:`Series.str.__iter__` (:issue:`28277`)
244254
- Removed ``pandas.SparseArray`` in favor of :class:`arrays.SparseArray` (:issue:`30642`)
245255
- Removed ``pandas.SparseSeries`` and ``pandas.SparseDataFrame``, including pickle support. (:issue:`30642`)
256+
- Enforced disallowing passing an integer ``fill_value`` to :meth:`DataFrame.shift` and :meth:`Series.shift` with datetime64, timedelta64, or period dtypes (:issue:`32591`)
246257
- Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
247258
- Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`)
248259
- Removed setting Categorical._codes directly (:issue:`41429`)
@@ -253,10 +264,13 @@ Removal of prior version deprecations/changes
253264
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
254265
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
255266
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
267+
- Changed the behavior of :func:`to_datetime` with argument "now" with ``utc=False`` to match ``Timestamp("now")`` (:issue:`18705`)
256268
- Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``, when the data cannot be cast losslessly, the floating point dtype is retained, matching :class:`Series` behavior (:issue:`41170`)
257269
- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
258270
- Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
259271
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
272+
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included. Manually cast them to ``bool`` dtype first to include them (:issue:`46188`)
273+
-
260274

261275
.. ---------------------------------------------------------------------------
262276
.. _whatsnew_200.performance:

pandas/_libs/parsers.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ from pandas._libs.util cimport (
7474
UINT64_MAX,
7575
)
7676

77-
import pandas._libs.lib as lib
77+
from pandas._libs import lib
7878

7979
from pandas._libs.khash cimport (
8080
kh_destroy_float64,

pandas/_libs/tslib.pyx

+5-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import warnings
2-
31
cimport cython
42
from cpython.datetime cimport (
53
PyDate_Check,
@@ -9,8 +7,6 @@ from cpython.datetime cimport (
97
tzinfo,
108
)
119

12-
from pandas.util._exceptions import find_stack_level
13-
1410
# import datetime C API
1511
import_datetime()
1612

@@ -855,17 +851,12 @@ cdef inline bint _parse_today_now(str val, int64_t* iresult, bint utc):
855851
# We delay this check for as long as possible
856852
# because it catches relatively rare cases
857853
if val == "now":
858-
iresult[0] = Timestamp.utcnow().value
859-
if not utc:
854+
if utc:
855+
iresult[0] = Timestamp.utcnow().value
856+
else:
860857
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
861-
warnings.warn(
862-
"The parsing of 'now' in pd.to_datetime without `utc=True` is "
863-
"deprecated. In a future version, this will match Timestamp('now') "
864-
"and Timestamp.now()",
865-
FutureWarning,
866-
stacklevel=find_stack_level(),
867-
)
868-
858+
# Note using Timestamp.now() is faster than Timestamp("now")
859+
iresult[0] = Timestamp.now().value
869860
return True
870861
elif val == "today":
871862
iresult[0] = Timestamp.today().value

pandas/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1727,7 +1727,7 @@ def any_skipna_inferred_dtype(request):
17271727
17281728
Examples
17291729
--------
1730-
>>> import pandas._libs.lib as lib
1730+
>>> from pandas._libs import lib
17311731
>>>
17321732
>>> def test_something(any_skipna_inferred_dtype):
17331733
... inferred_dtype, values = any_skipna_inferred_dtype

pandas/core/arrays/_mixins.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -252,16 +252,11 @@ def _validate_searchsorted_value(
252252
@doc(ExtensionArray.shift)
253253
def shift(self, periods: int = 1, fill_value=None, axis: AxisInt = 0):
254254

255-
fill_value = self._validate_shift_value(fill_value)
255+
fill_value = self._validate_scalar(fill_value)
256256
new_values = shift(self._ndarray, periods, axis, fill_value)
257257

258258
return self._from_backing_data(new_values)
259259

260-
def _validate_shift_value(self, fill_value):
261-
# TODO(2.0): after deprecation in datetimelikearraymixin is enforced,
262-
# we can remove this and use validate_fill_value directly
263-
return self._validate_scalar(fill_value)
264-
265260
def __setitem__(self, key, value) -> None:
266261
key = check_array_indexer(self, key)
267262
value = self._validate_setitem_value(value)

pandas/core/arrays/arrow/array.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,7 @@
2020
pa_version_under6p0,
2121
pa_version_under7p0,
2222
)
23-
from pandas.util._decorators import (
24-
deprecate_nonkeyword_arguments,
25-
doc,
26-
)
23+
from pandas.util._decorators import doc
2724

2825
from pandas.core.dtypes.common import (
2926
is_array_like,
@@ -452,13 +449,12 @@ def isna(self) -> npt.NDArray[np.bool_]:
452449
"""
453450
return self._data.is_null().to_numpy()
454451

455-
@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
456452
def argsort(
457453
self,
454+
*,
458455
ascending: bool = True,
459456
kind: SortKind = "quicksort",
460457
na_position: str = "last",
461-
*args,
462458
**kwargs,
463459
) -> np.ndarray:
464460
order = "ascending" if ascending else "descending"

0 commit comments

Comments
 (0)