
Commit 8a9d428

Merge branch 'master' of https://github.com/pandas-dev/pandas into npbump

2 parents: 4953d26 + 1940fce

File tree: 251 files changed, +6965 / -4349 lines


asv_bench/benchmarks/arithmetic.py (+7, -3)

@@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
         self.ser.add(self.ser, fill_value=4)


-class MixedFrameWithSeriesAxis0:
+class MixedFrameWithSeriesAxis:
     params = [
         [
             "eq",
@@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
             "gt",
             "add",
             "sub",
-            "div",
+            "truediv",
             "floordiv",
             "mul",
             "pow",
@@ -87,15 +87,19 @@ class MixedFrameWithSeriesAxis0:
     param_names = ["opname"]

     def setup(self, opname):
-        arr = np.arange(10 ** 6).reshape(100, -1)
+        arr = np.arange(10 ** 6).reshape(1000, -1)
         df = DataFrame(arr)
         df["C"] = 1.0
         self.df = df
         self.ser = df[0]
+        self.row = df.iloc[0]

     def time_frame_op_with_series_axis0(self, opname):
         getattr(self.df, opname)(self.ser, axis=0)

+    def time_frame_op_with_series_axis1(self, opname):
+        getattr(operator, opname)(self.df, self.ser)
+

 class Ops:

asv_bench/benchmarks/groupby.py (+58)

@@ -660,4 +660,62 @@ def function(values):
         self.grouper.transform(function, engine="cython")


+class AggEngine:
+    def setup(self):
+        N = 10 ** 3
+        data = DataFrame(
+            {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
+            columns=[0, 1],
+        )
+        self.grouper = data.groupby(0)
+
+    def time_series_numba(self):
+        def function(values, index):
+            total = 0
+            for i, value in enumerate(values):
+                if i % 2:
+                    total += value + 5
+                else:
+                    total += value * 2
+            return total
+
+        self.grouper[1].agg(function, engine="numba")
+
+    def time_series_cython(self):
+        def function(values):
+            total = 0
+            for i, value in enumerate(values):
+                if i % 2:
+                    total += value + 5
+                else:
+                    total += value * 2
+            return total
+
+        self.grouper[1].agg(function, engine="cython")
+
+    def time_dataframe_numba(self):
+        def function(values, index):
+            total = 0
+            for i, value in enumerate(values):
+                if i % 2:
+                    total += value + 5
+                else:
+                    total += value * 2
+            return total
+
+        self.grouper.agg(function, engine="numba")
+
+    def time_dataframe_cython(self):
+        def function(values):
+            total = 0
+            for i, value in enumerate(values):
+                if i % 2:
+                    total += value + 5
+                else:
+                    total += value * 2
+            return total
+
+        self.grouper.agg(function, engine="cython")
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
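The paired benchmarks exercise the two engine-specific UDF signatures: ``engine="numba"`` passes ``(values, index)`` as NumPy arrays and JIT-compiles the function, while ``engine="cython"`` passes only the group values. A minimal sketch of that difference, assuming numba is installed and using made-up data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "b"] * 500, "val": np.arange(1000.0)})
grouped = df.groupby("key")["val"]

def total_cython(values):
    # Cython (default) engine: the UDF receives only the group values.
    return values.sum()

def total_numba(values, index):
    # Numba engine: the UDF receives (values, index) as NumPy arrays and
    # is JIT-compiled on first use, then cached for later calls.
    return values.sum()

grouped.agg(total_cython, engine="cython")
grouped.agg(total_numba, engine="numba")  # requires numba to be installed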

asv_bench/benchmarks/rolling.py (+12, -13)

@@ -150,19 +150,18 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
         self.roll.quantile(percentile, interpolation=interpolation)


-class PeakMemFixed:
-    def setup(self):
-        N = 10
-        arr = 100 * np.random.random(N)
-        self.roll = pd.Series(arr).rolling(10)
-
-    def peakmem_fixed(self):
-        # GH 25926
-        # This is to detect memory leaks in rolling operations.
-        # To save time this is only ran on one method.
-        # 6000 iterations is enough for most types of leaks to be detected
-        for x in range(6000):
-            self.roll.max()
+class PeakMemFixedWindowMinMax:
+
+    params = ["min", "max"]
+
+    def setup(self, operation):
+        N = int(1e6)
+        arr = np.random.random(N)
+        self.roll = pd.Series(arr).rolling(2)
+
+    def peakmem_fixed(self, operation):
+        for x in range(5):
+            getattr(self.roll, operation)()


 class ForwardWindowMethods:

asv_bench/benchmarks/stat_ops.py (+2, -2)

@@ -11,8 +11,8 @@ class FrameOps:
     param_names = ["op", "dtype", "axis"]

     def setup(self, op, dtype, axis):
-        if op == "mad" and dtype == "Int64" and axis == 1:
-            # GH-33036
+        if op == "mad" and dtype == "Int64":
+            # GH-33036, GH#33600
             raise NotImplementedError
         values = np.random.randn(100000, 4)
         if dtype == "Int64":

ci/deps/azure-37-numpydev.yaml (+1, -2)

@@ -14,8 +14,7 @@ dependencies:
   - pytz
   - pip
   - pip:
-    - cython==0.29.16
-    # GH#33507 cython 3.0a1 is causing TypeErrors 2020-04-13
+    - cython>=0.29.16
     - "git+git://github.com/dateutil/dateutil.git"
     - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"
     - "--pre"

ci/setup_env.sh (+1, -1)

@@ -128,7 +128,7 @@ conda list pandas
 echo "[Build extensions]"
 python setup.py build_ext -q -i -j2

-# XXX: Some of our environments end up with old versions of pip (10.x)
+# TODO: Some of our environments end up with old versions of pip (10.x)
 # Adding a new enough version of pip to the requirements explodes the
 # solve time. Just using pip to update itself.
 # - py35_macos

doc/source/development/contributing.rst (+1, -1)

@@ -581,7 +581,7 @@ do not make sudden changes to the code that could have the potential to break
 a lot of user code as a result, that is, we need it to be as *backwards compatible*
 as possible to avoid mass breakages.

-Additional standards are outlined on the `pandas code style guide <code_style>`_
+Additional standards are outlined on the :ref:`pandas code style guide <code_style>`

 Optional dependencies
 ---------------------

doc/source/reference/general_utility_functions.rst (+3)

@@ -35,9 +35,12 @@ Exceptions and warnings
 .. autosummary::
    :toctree: api/

+   errors.AccessorRegistrationWarning
    errors.DtypeWarning
    errors.EmptyDataError
    errors.OutOfBoundsDatetime
+   errors.MergeError
+   errors.NumbaUtilError
    errors.ParserError
    errors.ParserWarning
    errors.PerformanceWarning

doc/source/user_guide/basics.rst (+58)

@@ -1781,6 +1781,31 @@ used to sort a pandas object by its index levels.
    # Series
    unsorted_df['three'].sort_index()

+.. _basics.sort_index_key:
+
+.. versionadded:: 1.1.0
+
+Sorting by index also supports a ``key`` parameter that takes a callable
+function to apply to the index being sorted. For `MultiIndex` objects,
+the key is applied per-level to the levels specified by `level`.
+
+.. ipython:: python
+
+   s1 = pd.DataFrame({
+      "a": ['B', 'a', 'C'],
+      "b": [1, 2, 3],
+      "c": [2, 3, 4]
+   }).set_index(list("ab"))
+   s1
+
+.. ipython:: python
+
+   s1.sort_index(level="a")
+   s1.sort_index(level="a", key=lambda idx: idx.str.lower())
+
+For information on key sorting by value, see :ref:`value sorting
+<basics.sort_value_key>`.
+
 .. _basics.sort_values:

 By values
@@ -1813,6 +1838,39 @@ argument:
    s.sort_values()
    s.sort_values(na_position='first')

+.. _basics.sort_value_key:
+
+.. versionadded:: 1.1.0
+
+Sorting also supports a ``key`` parameter that takes a callable function
+to apply to the values being sorted.
+
+.. ipython:: python
+
+   s1 = pd.Series(['B', 'a', 'C'])
+
+.. ipython:: python
+
+   s1.sort_values()
+   s1.sort_values(key=lambda x: x.str.lower())
+
+`key` will be given the :class:`Series` of values and should return a ``Series``
+or array of the same shape with the transformed values. For `DataFrame` objects,
+the key is applied per column, so the key should still expect a Series and return
+a Series, e.g.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]})
+
+.. ipython:: python
+
+   df.sort_values(by='a')
+   df.sort_values(by='a', key=lambda col: col.str.lower())
+
+The name or type of each column can be used to apply different functions to
+different columns.
+
 .. _basics.sort_indexes_and_values:

 By indexes and values
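The closing sentence of this hunk (different functions per column) is not illustrated in the added docs; a minimal sketch of the idea, assuming pandas >= 1.1 and made-up data:

import pandas as pd

df = pd.DataFrame({"a": ["B", "a", "C"], "b": [3, 1, 2]})

# The key callable receives one column (a Series) at a time, so it can
# dispatch on col.name or col.dtype to treat columns differently.
df.sort_values(
    by=["a", "b"],
    key=lambda col: col.str.lower() if col.name == "a" else col,
)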

doc/source/user_guide/computation.rst (+12, -8)

@@ -318,8 +318,8 @@ We provide a number of common statistical functions:
    :meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
    :meth:`~Rolling.quantile`, Sample quantile (value at %)
    :meth:`~Rolling.apply`, Generic apply
-   :meth:`~Rolling.cov`, Unbiased covariance (binary)
-   :meth:`~Rolling.corr`, Correlation (binary)
+   :meth:`~Rolling.cov`, Sample covariance (binary)
+   :meth:`~Rolling.corr`, Sample correlation (binary)

 .. _computation.window_variance.caveats:

@@ -341,6 +341,8 @@ We provide a number of common statistical functions:
    sample variance under the circumstances would result in a biased estimator
    of the variable we are trying to determine.

+   The same caveats apply to using any supported statistical sample methods.
+
 .. _stats.rolling_apply:

 Rolling apply
@@ -380,8 +382,8 @@ and their default values are set to ``False``, ``True`` and ``False`` respective
 .. note::

    In terms of performance, **the first time a function is run using the Numba engine will be slow**
-   as Numba will have some function compilation overhead. However, ``rolling`` objects will cache
-   the function and subsequent calls will be fast. In general, the Numba engine is performant with
+   as Numba will have some function compilation overhead. However, the compiled functions are cached,
+   and subsequent calls will be fast. In general, the Numba engine is performant with
    a larger amount of data points (e.g. 1+ million).

 .. code-block:: ipython
@@ -870,12 +872,12 @@ Method summary
    :meth:`~Expanding.max`, Maximum
    :meth:`~Expanding.std`, Sample standard deviation
    :meth:`~Expanding.var`, Sample variance
-   :meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
-   :meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
+   :meth:`~Expanding.skew`, Sample skewness (3rd moment)
+   :meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
    :meth:`~Expanding.quantile`, Sample quantile (value at %)
    :meth:`~Expanding.apply`, Generic apply
-   :meth:`~Expanding.cov`, Unbiased covariance (binary)
-   :meth:`~Expanding.corr`, Correlation (binary)
+   :meth:`~Expanding.cov`, Sample covariance (binary)
+   :meth:`~Expanding.corr`, Sample correlation (binary)

 .. note::

@@ -884,6 +886,8 @@ Method summary
    windows. See :ref:`this section <computation.window_variance.caveats>` for more
    information.

+   The same caveats apply to using any supported statistical sample methods.
+
 .. currentmodule:: pandas

 Aside from not having a ``window`` parameter, these functions have the same
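For the Numba note above, a minimal sketch of the compile-then-cache behaviour, assuming pandas >= 1.0 with numba installed and illustrative data:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(1_000_000))
roll = s.rolling(10)

def window_mean(values):
    return np.mean(values)

# First call: Numba compiles window_mean, so it is noticeably slow.
roll.apply(window_mean, engine="numba", raw=True)

# Later calls reuse the cached compiled function and are fast.
roll.apply(window_mean, engine="numba", raw=True)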

doc/source/user_guide/enhancingperf.rst (+2, -14)

@@ -396,7 +396,7 @@ Consider the following toy example of doubling each observation:
    1000 loops, best of 3: 233 us per loop

    # Custom function with numba
-   In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
+   In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
    1000 loops, best of 3: 145 us per loop

 Caveats
@@ -599,13 +599,6 @@ identifier.
 The ``inplace`` keyword determines whether this assignment will performed
 on the original ``DataFrame`` or return a copy with the new column.

-.. warning::
-
-   For backwards compatibility, ``inplace`` defaults to ``True`` if not
-   specified. This will change in a future version of pandas - if your
-   code depends on an inplace assignment you should update to explicitly
-   set ``inplace=True``.
-
 .. ipython:: python

    df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
@@ -614,7 +607,7 @@ on the original ``DataFrame`` or return a copy with the new column.
    df.eval('a = 1', inplace=True)
    df

-When ``inplace`` is set to ``False``, a copy of the ``DataFrame`` with the
+When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the
 new or modified columns is returned and the original frame is unchanged.

 .. ipython:: python
@@ -653,11 +646,6 @@ whether the query modifies the original frame.
    df.query('a > 2', inplace=True)
    df

-.. warning::
-
-   Unlike with ``eval``, the default value for ``inplace`` for ``query``
-   is ``False``. This is consistent with prior versions of pandas.
-
 Local variables
 ~~~~~~~~~~~~~~~