Skip to content

Commit 8275c06

Browse files
authored
Merge branch 'master' into GH28501
2 parents 5370b83 + 446be82 commit 8275c06

File tree

95 files changed

+2076
-1220
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+2076
-1220
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
*.log
1313
*.swp
1414
*.pdb
15+
*.zip
1516
.project
1617
.pydevproject
1718
.settings

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ repos:
2626
name: isort (cython)
2727
types: [cython]
2828
- repo: https://github.com/asottile/pyupgrade
29-
rev: v2.7.3
29+
rev: v2.7.4
3030
hooks:
3131
- id: pyupgrade
3232
args: [--py37-plus]

asv_bench/benchmarks/rolling.py

+13
Original file line numberDiff line numberDiff line change
@@ -225,4 +225,17 @@ def time_rolling_offset(self, method):
225225
getattr(self.groupby_roll_offset, method)()
226226

227227

228+
class GroupbyEWM:
229+
230+
params = ["cython", "numba"]
231+
param_names = ["engine"]
232+
233+
def setup(self, engine):
234+
df = pd.DataFrame({"A": range(50), "B": range(50)})
235+
self.gb_ewm = df.groupby("A").ewm(com=1.0)
236+
237+
def time_groupby_mean(self, engine):
238+
self.gb_ewm.mean(engine=engine)
239+
240+
228241
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/series_methods.py

+49
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,55 @@ def time_isin_long_series_long_values_floats(self):
9090
self.s_long_floats.isin(self.vals_long_floats)
9191

9292

93+
class IsInLongSeriesLookUpDominates:
94+
params = [
95+
["int64", "int32", "float64", "float32", "object"],
96+
[5, 1000],
97+
["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
98+
]
99+
param_names = ["dtype", "MaxNumber", "series_type"]
100+
101+
def setup(self, dtype, MaxNumber, series_type):
102+
N = 10 ** 7
103+
if series_type == "random_hits":
104+
np.random.seed(42)
105+
array = np.random.randint(0, MaxNumber, N)
106+
if series_type == "random_misses":
107+
np.random.seed(42)
108+
array = np.random.randint(0, MaxNumber, N) + MaxNumber
109+
if series_type == "monotone_hits":
110+
array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
111+
if series_type == "monotone_misses":
112+
array = np.arange(N) + MaxNumber
113+
self.series = Series(array).astype(dtype)
114+
self.values = np.arange(MaxNumber).astype(dtype)
115+
116+
def time_isin(self, dtypes, MaxNumber, series_type):
117+
self.series.isin(self.values)
118+
119+
120+
class IsInLongSeriesValuesDominate:
121+
params = [
122+
["int64", "int32", "float64", "float32", "object"],
123+
["random", "monotone"],
124+
]
125+
param_names = ["dtype", "series_type"]
126+
127+
def setup(self, dtype, series_type):
128+
N = 10 ** 7
129+
if series_type == "random":
130+
np.random.seed(42)
131+
vals = np.random.randint(0, 10 * N, N)
132+
if series_type == "monotone":
133+
vals = np.arange(N)
134+
self.values = vals.astype(dtype)
135+
M = 10 ** 6 + 1
136+
self.series = Series(np.arange(M)).astype(dtype)
137+
138+
def time_isin(self, dtypes, series_type):
139+
self.series.isin(self.values)
140+
141+
93142
class NSort:
94143

95144
params = ["first", "last", "all"]

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s
2525
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2626
# GH#37455 windows py38 build appears to be running out of memory
2727
# skip collection of window and plotting tests
28-
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/"
28+
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
2929
fi
3030

3131
echo $PYTEST_CMD

doc/source/ecosystem.rst

+10
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,16 @@ D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
178178
& Google Colab. Here are some demos of the `grid <http://alphatechadmin.pythonanywhere.com/>`__
179179
and `chart-builder <http://alphatechadmin.pythonanywhere.com/charts/4?chart_type=surface&query=&x=date&z=Col0&agg=raw&cpg=false&y=%5B%22security_id%22%5D>`__.
180180

181+
`hvplot <https://hvplot.holoviz.org/index.html>`__
182+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183+
184+
hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews <https://holoviews.org/>`__.
185+
It can be loaded as a native pandas plotting backend via:
186+
187+
.. code:: python
188+
189+
pd.set_option("plotting.backend", "hvplot")
190+
181191
.. _ecosystem.ide:
182192

183193
IDE

doc/source/user_guide/computation.rst

+7
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,10 @@ parameter:
205205
- ``min`` : lowest rank in the group
206206
- ``max`` : highest rank in the group
207207
- ``first`` : ranks assigned in the order they appear in the array
208+
209+
.. _computation.windowing:
210+
211+
Windowing functions
212+
~~~~~~~~~~~~~~~~~~~
213+
214+
See :ref:`the window operations user guide <window.overview>` for an overview of windowing functions.

doc/source/user_guide/window.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Concept Method Returned Object
4343
Rolling window ``rolling`` ``Rolling`` Yes Yes
4444
Weighted window ``rolling`` ``Window`` No No
4545
Expanding window ``expanding`` ``Expanding`` No Yes
46-
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No No
46+
Exponentially Weighted window ``ewm`` ``ExponentialMovingWindow`` No Yes (as of version 1.2)
4747
============================= ================= =========================== =========================== ========================
4848

4949
As noted above, some operations support specifying a window based on a time offset:

doc/source/whatsnew/v1.2.0.rst

+34-3
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,23 @@ example where the index name is preserved:
204204
The same is true for :class:`MultiIndex`, but the logic is applied separately on a
205205
level-by-level basis.
206206

207+
.. _whatsnew_120.groupby_ewm:
208+
209+
Groupby supports EWM operations directly
210+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
211+
212+
:class:`DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`).
213+
214+
.. ipython:: python
215+
216+
df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)})
217+
df
218+
df.groupby('A').ewm(com=1.0).mean()
219+
220+
Additionally, ``mean`` supports execution via `Numba <https://numba.pydata.org/>`__ with
221+
the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency
222+
to use this feature.
223+
207224
.. _whatsnew_120.enhancements.other:
208225

209226
Other enhancements
@@ -457,7 +474,8 @@ Deprecations
457474
- :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`)
458475
- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`)
459476
- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`)
460-
477+
- Partial slicing on an unordered :class:`DatetimeIndex` with keys that are not in the index is deprecated and will be removed in a future version (:issue:`18531`)
478+
- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
461479

462480
.. ---------------------------------------------------------------------------
463481
@@ -482,6 +500,7 @@ Performance improvements
482500
- faster ``dir`` calls when many index labels, e.g. ``dir(ser)`` (:issue:`37450`)
483501
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
484502
- Performance improvement in :meth:`pd.DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`pd.Index.value_counts`)
503+
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements
485504

486505
.. ---------------------------------------------------------------------------
487506
@@ -521,6 +540,7 @@ Timedelta
521540
- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
522541
- Bug in parsing of ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`)
523542
- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`)
543+
- Bug in :class:`Timedelta` incorrectly truncating the sub-second portion of a string input when it has precision higher than nanoseconds (:issue:`36738`)
524544

525545
Timezones
526546
^^^^^^^^^
@@ -561,9 +581,15 @@ Strings
561581

562582
Interval
563583
^^^^^^^^
584+
585+
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`)
564586
- Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`)
565587
-
566-
-
588+
Period
589+
^^^^^^
590+
591+
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Period` dtypes would be converted to object dtypes (:issue:`34871`)
592+
567593

568594
Indexing
569595
^^^^^^^^
@@ -583,6 +609,8 @@ Indexing
583609
- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`)
584610
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
585611
- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
612+
- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty :class:`DataFrame` with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`)
613+
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
586614

587615
Missing
588616
^^^^^^^
@@ -625,16 +653,18 @@ I/O
625653
- :meth:`to_excel` and :meth:`to_markdown` support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
626654
- Bug in :meth:`read_fwf` was not skipping blank lines (even with ``skip_blank_lines=True``) (:issue:`37758`)
627655
- Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`)
656+
- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other ``read_*`` functions (:issue:`37909`)
628657

629658
Plotting
630659
^^^^^^^^
631660

632661
- Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`)
633662
- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`)
634663
- Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`)
664+
- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing :exc:`ValueError` with a :class:`Series` or :class:`DataFrame`
665+
indexed by a :class:`TimedeltaIndex` with a fixed frequency when x-axis lower limit was greater than upper limit (:issue:`37454`)
635666
- Bug in :meth:`DataFrameGroupBy.boxplot` when ``subplots=False``, a KeyError would raise (:issue:`16748`)
636667

637-
638668
Groupby/resample/rolling
639669
^^^^^^^^^^^^^^^^^^^^^^^^
640670

@@ -700,6 +730,7 @@ Other
700730
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype (:issue:`34871`)
701731
- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
702732
- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`)
733+
- Fixed metadata propagation when selecting columns from a DataFrame with ``DataFrame.__getitem__`` (:issue:`28283`)
703734
- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)
704735
- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`)
705736
- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`).

pandas/_libs/interval.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ cdef class IntervalMixin:
179179
return (self.right == self.left) & (self.closed != 'both')
180180

181181
def _check_closed_matches(self, other, name='other'):
182-
"""Check if the closed attribute of `other` matches.
182+
"""
183+
Check if the closed attribute of `other` matches.
183184
184185
Note that 'left' and 'right' are considered different from 'both'.
185186

pandas/_libs/src/klib/khash_python.h

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
1717
// GH 28303 showed, that the simple xoring-version isn't good enough
1818
// See GH 36729 for evaluation of the currently used murmur2-hash version
19+
// An interesting alternative to expensive murmur2-hash would be to change
20+
// the probing strategy and use e.g. the probing strategy from CPython's
21+
// implementation of dicts, which shines for smaller sizes but is more
22+
// predisposed to superlinear running times (see GH 36729 for comparison)
23+
1924

2025
khint64_t PANDAS_INLINE asint64(double key) {
2126
khint64_t val;

pandas/_libs/tslibs/offsets.pyx

+26
Original file line numberDiff line numberDiff line change
@@ -1403,6 +1403,19 @@ cdef class BusinessDay(BusinessMixin):
14031403
cdef class BusinessHour(BusinessMixin):
14041404
"""
14051405
DateOffset subclass representing possibly n business hours.
1406+
1407+
Parameters
1408+
----------
1409+
n : int, default 1
1410+
The number of hours represented.
1411+
normalize : bool, default False
1412+
Normalize start/end dates to midnight before generating date range.
1413+
weekmask : str, default 'Mon Tue Wed Thu Fri'
1414+
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
1415+
start : str, default "09:00"
1416+
Start time of your custom business hour in 24h format.
1417+
end : str, default "17:00"
1418+
End time of your custom business hour in 24h format.
14061419
"""
14071420

14081421
_prefix = "BH"
@@ -3251,6 +3264,19 @@ cdef class CustomBusinessDay(BusinessDay):
32513264
cdef class CustomBusinessHour(BusinessHour):
32523265
"""
32533266
DateOffset subclass representing possibly n custom business hours.
3267+
3268+
Parameters
3269+
----------
3270+
n : int, default 1
3271+
The number of hours represented.
3272+
normalize : bool, default False
3273+
Normalize start/end dates to midnight before generating date range.
3274+
weekmask : str, default 'Mon Tue Wed Thu Fri'
3275+
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
3276+
start : str, default "09:00"
3277+
Start time of your custom business hour in 24h format.
3278+
end : str, default "17:00"
3279+
End time of your custom business hour in 24h format.
32543280
"""
32553281

32563282
_prefix = "CBH"

pandas/_libs/tslibs/timedeltas.pyx

+7-2
Original file line numberDiff line numberDiff line change
@@ -405,9 +405,11 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1:
405405
m = 10**(3 -len(frac)) * 1000 * 1000
406406
elif len(frac) > 3 and len(frac) <= 6:
407407
m = 10**(6 -len(frac)) * 1000
408-
else:
408+
elif len(frac) > 6 and len(frac) <= 9:
409409
m = 10**(9 -len(frac))
410-
410+
else:
411+
m = 1
412+
frac = frac[:9]
411413
r = <int64_t>int(''.join(frac)) * m
412414
result += timedelta_as_neg(r, neg)
413415

@@ -1143,6 +1145,9 @@ class Timedelta(_Timedelta):
11431145
Notes
11441146
-----
11451147
The ``.value`` attribute is always in ns.
1148+
1149+
If the precision is higher than nanoseconds, the precision of the duration is
1150+
truncated to nanoseconds.
11461151
"""
11471152

11481153
def __new__(cls, object value=_no_input, unit=None, **kwargs):

0 commit comments

Comments
 (0)