Skip to content

Commit b3cf5b3

Browse files
authored
Merge branch 'master' into 33200-groupby-quantile
2 parents e70e3ca + df5eee6 commit b3cf5b3

File tree

111 files changed

+2723
-1304
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+2723
-1304
lines changed

asv_bench/benchmarks/rolling.py

+12-13
Original file line numberDiff line numberDiff line change
@@ -150,19 +150,18 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
150150
self.roll.quantile(percentile, interpolation=interpolation)
151151

152152

153-
class PeakMemFixed:
154-
def setup(self):
155-
N = 10
156-
arr = 100 * np.random.random(N)
157-
self.roll = pd.Series(arr).rolling(10)
158-
159-
def peakmem_fixed(self):
160-
# GH 25926
161-
# This is to detect memory leaks in rolling operations.
162-
# To save time this is only ran on one method.
163-
# 6000 iterations is enough for most types of leaks to be detected
164-
for x in range(6000):
165-
self.roll.max()
153+
class PeakMemFixedWindowMinMax:
154+
155+
params = ["min", "max"]
156+
157+
def setup(self, operation):
158+
N = int(1e6)
159+
arr = np.random.random(N)
160+
self.roll = pd.Series(arr).rolling(2)
161+
162+
def peakmem_fixed(self, operation):
163+
for x in range(5):
164+
getattr(self.roll, operation)()
166165

167166

168167
class ForwardWindowMethods:

ci/deps/azure-37-numpydev.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ dependencies:
1414
- pytz
1515
- pip
1616
- pip:
17-
- cython==0.29.16
18-
# GH#33507 cython 3.0a1 is causing TypeErrors 2020-04-13
17+
- cython>=0.29.16
1918
- "git+git://github.com/dateutil/dateutil.git"
2019
- "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com"
2120
- "--pre"

doc/source/reference/general_utility_functions.rst

+3
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,12 @@ Exceptions and warnings
3535
.. autosummary::
3636
:toctree: api/
3737

38+
errors.AccessorRegistrationWarning
3839
errors.DtypeWarning
3940
errors.EmptyDataError
4041
errors.OutOfBoundsDatetime
42+
errors.MergeError
43+
errors.NumbaUtilError
4144
errors.ParserError
4245
errors.ParserWarning
4346
errors.PerformanceWarning

doc/source/user_guide/basics.rst

+58
Original file line numberDiff line numberDiff line change
@@ -1781,6 +1781,31 @@ used to sort a pandas object by its index levels.
17811781
# Series
17821782
unsorted_df['three'].sort_index()
17831783
1784+
.. _basics.sort_index_key:
1785+
1786+
.. versionadded:: 1.1.0
1787+
1788+
Sorting by index also supports a ``key`` parameter that takes a callable
1789+
function to apply to the index being sorted. For `MultiIndex` objects,
1790+
the key is applied per-level to the levels specified by `level`.
1791+
1792+
.. ipython:: python
1793+
1794+
s1 = pd.DataFrame({
1795+
"a": ['B', 'a', 'C'],
1796+
"b": [1, 2, 3],
1797+
"c": [2, 3, 4]
1798+
}).set_index(list("ab"))
1799+
s1
1800+
1801+
.. ipython:: python
1802+
1803+
s1.sort_index(level="a")
1804+
s1.sort_index(level="a", key=lambda idx: idx.str.lower())
1805+
1806+
For information on key sorting by value, see :ref:`value sorting
1807+
<basics.sort_value_key>`.
1808+
17841809
.. _basics.sort_values:
17851810

17861811
By values
@@ -1813,6 +1838,39 @@ argument:
18131838
s.sort_values()
18141839
s.sort_values(na_position='first')
18151840
1841+
.. _basics.sort_value_key:
1842+
1843+
.. versionadded:: 1.1.0
1844+
1845+
Sorting also supports a ``key`` parameter that takes a callable function
1846+
to apply to the values being sorted.
1847+
1848+
.. ipython:: python
1849+
1850+
s1 = pd.Series(['B', 'a', 'C'])
1851+
1852+
.. ipython:: python
1853+
1854+
s1.sort_values()
1855+
s1.sort_values(key=lambda x: x.str.lower())
1856+
1857+
`key` will be given the :class:`Series` of values and should return a ``Series``
1858+
or array of the same shape with the transformed values. For `DataFrame` objects,
1859+
the key is applied per column, so the key should still expect a Series and return
1860+
a Series, e.g.
1861+
1862+
.. ipython:: python
1863+
1864+
df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]})
1865+
1866+
.. ipython:: python
1867+
1868+
df.sort_values(by='a')
1869+
df.sort_values(by='a', key=lambda col: col.str.lower())
1870+
1871+
The name or type of each column can be used to apply different functions to
1872+
different columns.
1873+
18161874
.. _basics.sort_indexes_and_values:
18171875

18181876
By indexes and values

doc/source/user_guide/enhancingperf.rst

+2-14
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ Consider the following toy example of doubling each observation:
396396
1000 loops, best of 3: 233 us per loop
397397
398398
# Custom function with numba
399-
In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
399+
In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
400400
1000 loops, best of 3: 145 us per loop
401401
402402
Caveats
@@ -599,13 +599,6 @@ identifier.
599599
The ``inplace`` keyword determines whether this assignment will be performed
600600
on the original ``DataFrame`` or return a copy with the new column.
601601

602-
.. warning::
603-
604-
For backwards compatibility, ``inplace`` defaults to ``True`` if not
605-
specified. This will change in a future version of pandas - if your
606-
code depends on an inplace assignment you should update to explicitly
607-
set ``inplace=True``.
608-
609602
.. ipython:: python
610603
611604
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
@@ -614,7 +607,7 @@ on the original ``DataFrame`` or return a copy with the new column.
614607
df.eval('a = 1', inplace=True)
615608
df
616609
617-
When ``inplace`` is set to ``False``, a copy of the ``DataFrame`` with the
610+
When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the
618611
new or modified columns is returned and the original frame is unchanged.
619612

620613
.. ipython:: python
@@ -653,11 +646,6 @@ whether the query modifies the original frame.
653646
df.query('a > 2', inplace=True)
654647
df
655648
656-
.. warning::
657-
658-
Unlike with ``eval``, the default value for ``inplace`` for ``query``
659-
is ``False``. This is consistent with prior versions of pandas.
660-
661649
Local variables
662650
~~~~~~~~~~~~~~~
663651

doc/source/user_guide/reshaping.rst

+11-3
Original file line numberDiff line numberDiff line change
@@ -471,16 +471,24 @@ If ``crosstab`` receives only two Series, it will provide a frequency table.
471471
472472
pd.crosstab(df['A'], df['B'])
473473
474-
Any input passed containing ``Categorical`` data will have **all** of its
475-
categories included in the cross-tabulation, even if the actual data does
476-
not contain any instances of a particular category.
474+
``crosstab`` can also be applied
475+
to ``Categorical`` data.
477476

478477
.. ipython:: python
479478
480479
foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
481480
bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
482481
pd.crosstab(foo, bar)
483482
483+
If you want to include **all** of the categories even if the actual data does
484+
not contain any instances of a particular category, you should set ``dropna=False``.
485+
486+
For example:
487+
488+
.. ipython:: python
489+
490+
pd.crosstab(foo, bar, dropna=False)
491+
484492
Normalization
485493
~~~~~~~~~~~~~
486494

doc/source/whatsnew/v1.1.0.rst

+59-4
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,53 @@ For example:
3636
ser["2014"]
3737
ser.loc["May 2015"]
3838
39+
.. _whatsnew_110.key_sorting:
40+
41+
Sorting with keys
42+
^^^^^^^^^^^^^^^^^
43+
44+
We've added a ``key`` argument to the DataFrame and Series sorting methods, including
45+
:meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`,
46+
and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied
47+
column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`).
48+
See :ref:`sort_values with keys <basics.sort_value_key>` and :ref:`sort_index with keys
49+
<basics.sort_index_key>` for more information.
50+
51+
.. ipython:: python
52+
53+
s = pd.Series(['C', 'a', 'B'])
54+
s
55+
56+
.. ipython:: python
57+
58+
s.sort_values()
59+
60+
61+
Note how this is sorted with capital letters first. If we apply the :meth:`Series.str.lower`
62+
method, we get
63+
64+
.. ipython:: python
65+
66+
s.sort_values(key=lambda x: x.str.lower())
67+
68+
69+
When applied to a `DataFrame`, the key is applied per-column to all columns or a subset if
70+
`by` is specified, e.g.
71+
72+
.. ipython:: python
73+
74+
df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'],
75+
'b': [1, 2, 3, 4, 5, 6]})
76+
df
77+
78+
.. ipython:: python
79+
80+
df.sort_values(by=['a'], key=lambda col: col.str.lower())
81+
82+
83+
For more details, see examples and documentation in :meth:`DataFrame.sort_values`,
84+
:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`.
85+
3986
.. _whatsnew_110.timestamp_fold_support:
4087

4188
Fold argument support in Timestamp constructor
@@ -98,6 +145,8 @@ Other enhancements
98145
This can be used to set a custom compression level, e.g.,
99146
``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
100147
(:issue:`33196`)
148+
- :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`,
149+
such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
101150
- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
102151
- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
103152
-
@@ -175,8 +224,7 @@ Other API changes
175224
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
176225
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
177226
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
178-
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
179-
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
227+
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
180228
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
181229
-
182230

@@ -478,6 +526,7 @@ Categorical
478526
- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
479527
- Bug where :meth:`Categorical.replace` would replace with ``NaN`` whenever the new value and replacement value were equal (:issue:`33288`)
480528
- Bug where an ordered :class:`Categorical` containing only ``NaN`` values would raise rather than returning ``NaN`` when taking the minimum or maximum (:issue:`33450`)
529+
- Bug where :meth:`Series.isna` and :meth:`DataFrame.isna` would raise for categorical dtype when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33594`)
481530

482531
Datetimelike
483532
^^^^^^^^^^^^
@@ -520,7 +569,7 @@ Numeric
520569
- Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`)
521570
- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
522571
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
523-
-
572+
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
524573

525574
Conversion
526575
^^^^^^^^^^
@@ -566,12 +615,14 @@ Indexing
566615
- Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:`33573`)
567616
- Bug in :meth:`Series.__setitem__` with an :class:`IntervalIndex` and a list-like key of integers (:issue:`33473`)
568617
- Bug in :meth:`Series.__getitem__` allowing missing labels with ``np.ndarray``, :class:`Index`, :class:`Series` indexers but not ``list``, these now all raise ``KeyError`` (:issue:`33646`)
618+
- Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`)
569619

570620
Missing
571621
^^^^^^^
572622
- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
573623
- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`)
574624
- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable boolean dtype and with ``skipna=False`` (:issue:`33253`)
625+
- Clarified documentation on interpolate with ``method='akima'``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`)
575626

576627
MultiIndex
577628
^^^^^^^^^^
@@ -621,6 +672,9 @@ I/O
621672
unsupported HDF file (:issue:`9539`)
622673
- Bug in :meth:`~DataFrame.to_parquet` was not raising ``PermissionError`` when writing to a private s3 bucket with invalid creds. (:issue:`27679`)
623674
- Bug in :meth:`~DataFrame.to_csv` was silently failing when writing to an invalid s3 bucket. (:issue:`32486`)
675+
- Bug in :meth:`~DataFrame.read_feather` was raising an ``ArrowIOError`` when reading an s3 or http file path (:issue:`29055`)
676+
- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
677+
- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
624678

625679
Plotting
626680
^^^^^^^^
@@ -645,6 +699,7 @@ Groupby/resample/rolling
645699
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
646700
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
647701
- Bug in :meth:`SeriesGroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`)
702+
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
648703

649704
Reshaping
650705
^^^^^^^^^
@@ -677,7 +732,7 @@ Sparse
677732
ExtensionArray
678733
^^^^^^^^^^^^^^
679734

680-
- Fixed bug where :meth:`Serires.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
735+
- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
681736
-
682737

683738

pandas/_libs/lib.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
cdef bint c_is_list_like(object, bint)
1+
cdef bint c_is_list_like(object, bint) except -1

pandas/_libs/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool:
988988
return c_is_list_like(obj, allow_sets)
989989

990990

991-
cdef inline bint c_is_list_like(object obj, bint allow_sets):
991+
cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
992992
return (
993993
isinstance(obj, abc.Iterable)
994994
# we do not count strings/unicode/bytes as list-like

pandas/_libs/window/aggregations.pyx

+7-9
Original file line numberDiff line numberDiff line change
@@ -971,8 +971,8 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs,
971971
return result
972972

973973

974-
def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
975-
ndarray[int64_t] end, int64_t minp, int64_t win):
974+
def roll_max_fixed(float64_t[:] values, int64_t[:] start,
975+
int64_t[:] end, int64_t minp, int64_t win):
976976
"""
977977
Moving max of 1d array of any numeric type along axis=0 ignoring NaNs.
978978
@@ -988,7 +988,7 @@ def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
988988
make the interval closed on the right, left,
989989
both or neither endpoints
990990
"""
991-
return _roll_min_max_fixed(values, start, end, minp, win, is_max=1)
991+
return _roll_min_max_fixed(values, minp, win, is_max=1)
992992

993993

994994
def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1011,8 +1011,8 @@ def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start,
10111011
return _roll_min_max_variable(values, start, end, minp, is_max=1)
10121012

10131013

1014-
def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
1015-
ndarray[int64_t] end, int64_t minp, int64_t win):
1014+
def roll_min_fixed(float64_t[:] values, int64_t[:] start,
1015+
int64_t[:] end, int64_t minp, int64_t win):
10161016
"""
10171017
Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.
10181018
@@ -1025,7 +1025,7 @@ def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
10251025
index : ndarray, optional
10261026
index for window computation
10271027
"""
1028-
return _roll_min_max_fixed(values, start, end, minp, win, is_max=0)
1028+
return _roll_min_max_fixed(values, minp, win, is_max=0)
10291029

10301030

10311031
def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1112,9 +1112,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values,
11121112
return output
11131113

11141114

1115-
cdef _roll_min_max_fixed(ndarray[numeric] values,
1116-
ndarray[int64_t] starti,
1117-
ndarray[int64_t] endi,
1115+
cdef _roll_min_max_fixed(numeric[:] values,
11181116
int64_t minp,
11191117
int64_t win,
11201118
bint is_max):

0 commit comments

Comments
 (0)