
Commit 66aeb81

Merge branch 'main' of https://github.com/pandas-dev/pandas into bug#60583
2 parents d7f8c97 + 513e787

21 files changed: +502 −3 lines changed

asv_bench/benchmarks/rolling.py

Lines changed: 13 additions & 1 deletion
@@ -10,7 +10,19 @@ class Methods:
         ["DataFrame", "Series"],
         [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})],
         ["int", "float"],
-        ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"],
+        [
+            "median",
+            "mean",
+            "max",
+            "min",
+            "std",
+            "count",
+            "skew",
+            "kurt",
+            "sum",
+            "sem",
+            "nunique",
+        ],
     )
     param_names = ["constructor", "window_kwargs", "dtype", "method"]

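The new "nunique" entry extends the existing Methods parameter matrix, so asv now times the nunique kernel over the same constructor/window/dtype grid as the other reductions. Roughly, the benchmark builds the window object once in setup and then times a call equivalent to the sketch below (the benchmark's setup code is not part of this diff, so the array size and dtype handling here are assumptions):

    import numpy as np
    import pandas as pd

    # Hypothetical stand-in for what the parametrized benchmark measures
    arr = (100 * np.random.random(100_000)).astype("float64")
    roll = pd.Series(arr).rolling(window=10)
    roll.nunique()  # the method named by the new "nunique" parameter
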
ci/code_checks.sh

Lines changed: 20 additions & 0 deletions
@@ -83,6 +83,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
+        -i "pandas.tseries.offsets.BHalfYearBegin.is_on_offset GL08" \
+        -i "pandas.tseries.offsets.BHalfYearBegin.n GL08" \
+        -i "pandas.tseries.offsets.BHalfYearBegin.normalize GL08" \
+        -i "pandas.tseries.offsets.BHalfYearBegin.rule_code GL08" \
+        -i "pandas.tseries.offsets.BHalfYearBegin.startingMonth GL08" \
+        -i "pandas.tseries.offsets.BHalfYearEnd.is_on_offset GL08" \
+        -i "pandas.tseries.offsets.BHalfYearEnd.n GL08" \
+        -i "pandas.tseries.offsets.BHalfYearEnd.normalize GL08" \
+        -i "pandas.tseries.offsets.BHalfYearEnd.rule_code GL08" \
+        -i "pandas.tseries.offsets.BHalfYearEnd.startingMonth GL08" \
         -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
         -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
         -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \
@@ -185,6 +195,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \
         -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \
         -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \
+        -i "pandas.tseries.offsets.HalfYearBegin.is_on_offset GL08" \
+        -i "pandas.tseries.offsets.HalfYearBegin.n GL08" \
+        -i "pandas.tseries.offsets.HalfYearBegin.normalize GL08" \
+        -i "pandas.tseries.offsets.HalfYearBegin.rule_code GL08" \
+        -i "pandas.tseries.offsets.HalfYearBegin.startingMonth GL08" \
+        -i "pandas.tseries.offsets.HalfYearEnd.is_on_offset GL08" \
+        -i "pandas.tseries.offsets.HalfYearEnd.n GL08" \
+        -i "pandas.tseries.offsets.HalfYearEnd.normalize GL08" \
+        -i "pandas.tseries.offsets.HalfYearEnd.rule_code GL08" \
+        -i "pandas.tseries.offsets.HalfYearEnd.startingMonth GL08" \
         -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \
         -i "pandas.tseries.offsets.Hour.n GL08" \
         -i "pandas.tseries.offsets.Hour.normalize GL08" \

doc/source/reference/offset_frequency.rst

Lines changed: 140 additions & 0 deletions
@@ -776,6 +776,146 @@ Methods
     QuarterBegin.is_year_start
     QuarterBegin.is_year_end

+BHalfYearEnd
+------------
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearEnd
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearEnd.freqstr
+    BHalfYearEnd.kwds
+    BHalfYearEnd.name
+    BHalfYearEnd.nanos
+    BHalfYearEnd.normalize
+    BHalfYearEnd.rule_code
+    BHalfYearEnd.n
+    BHalfYearEnd.startingMonth
+
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearEnd.copy
+    BHalfYearEnd.is_on_offset
+    BHalfYearEnd.is_month_start
+    BHalfYearEnd.is_month_end
+    BHalfYearEnd.is_quarter_start
+    BHalfYearEnd.is_quarter_end
+    BHalfYearEnd.is_year_start
+    BHalfYearEnd.is_year_end
+
+BHalfYearBegin
+--------------
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearBegin
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearBegin.freqstr
+    BHalfYearBegin.kwds
+    BHalfYearBegin.name
+    BHalfYearBegin.nanos
+    BHalfYearBegin.normalize
+    BHalfYearBegin.rule_code
+    BHalfYearBegin.n
+    BHalfYearBegin.startingMonth
+
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    BHalfYearBegin.copy
+    BHalfYearBegin.is_on_offset
+    BHalfYearBegin.is_month_start
+    BHalfYearBegin.is_month_end
+    BHalfYearBegin.is_quarter_start
+    BHalfYearBegin.is_quarter_end
+    BHalfYearBegin.is_year_start
+    BHalfYearBegin.is_year_end
+
+HalfYearEnd
+-----------
+.. autosummary::
+    :toctree: api/
+
+    HalfYearEnd
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    HalfYearEnd.freqstr
+    HalfYearEnd.kwds
+    HalfYearEnd.name
+    HalfYearEnd.nanos
+    HalfYearEnd.normalize
+    HalfYearEnd.rule_code
+    HalfYearEnd.n
+    HalfYearEnd.startingMonth
+
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    HalfYearEnd.copy
+    HalfYearEnd.is_on_offset
+    HalfYearEnd.is_month_start
+    HalfYearEnd.is_month_end
+    HalfYearEnd.is_quarter_start
+    HalfYearEnd.is_quarter_end
+    HalfYearEnd.is_year_start
+    HalfYearEnd.is_year_end
+
+HalfYearBegin
+-------------
+.. autosummary::
+    :toctree: api/
+
+    HalfYearBegin
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    HalfYearBegin.freqstr
+    HalfYearBegin.kwds
+    HalfYearBegin.name
+    HalfYearBegin.nanos
+    HalfYearBegin.normalize
+    HalfYearBegin.rule_code
+    HalfYearBegin.n
+    HalfYearBegin.startingMonth
+
+Methods
+~~~~~~~
+.. autosummary::
+    :toctree: api/
+
+    HalfYearBegin.copy
+    HalfYearBegin.is_on_offset
+    HalfYearBegin.is_month_start
+    HalfYearBegin.is_month_end
+    HalfYearBegin.is_quarter_start
+    HalfYearBegin.is_quarter_end
+    HalfYearBegin.is_year_start
+    HalfYearBegin.is_year_end
+
 BYearEnd
 --------
 .. autosummary::

doc/source/reference/window.rst

Lines changed: 2 additions & 0 deletions
@@ -42,6 +42,7 @@ Rolling window functions
     Rolling.quantile
     Rolling.sem
     Rolling.rank
+    Rolling.nunique


 .. _api.functions_window:
@@ -86,6 +87,7 @@ Expanding window functions
     Expanding.quantile
     Expanding.sem
     Expanding.rank
+    Expanding.nunique


 .. _api.functions_ewm:

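For orientation, a quick sketch of the two newly documented methods, assuming a pandas build that includes this commit (the expanding values match the docstring example added in pandas/core/window/expanding.py below; the rolling values are hand-computed):

    import pandas as pd

    s = pd.Series([1, 4, 2, 3, 5, 3])
    s.expanding().nunique()  # expected: 1.0, 2.0, 3.0, 4.0, 5.0, 5.0
    s.rolling(3).nunique()   # expected: NaN, NaN, 3.0, 3.0, 3.0, 2.0
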
doc/source/user_guide/timeseries.rst

Lines changed: 4 additions & 0 deletions
@@ -891,6 +891,10 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ
     :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE'``, "business quarter end"
     :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin"
     :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter"
+    :class:`~pandas.tseries.offsets.HalfYearEnd`, ``'HYE'``, "calendar half year end"
+    :class:`~pandas.tseries.offsets.HalfYearBegin`, ``'HYS'``, "calendar half year begin"
+    :class:`~pandas.tseries.offsets.BHalfYearEnd`, ``'BHYE'``, "business half year end"
+    :class:`~pandas.tseries.offsets.BHalfYearBegin`, ``'BHYS'``, "business half year begin"
     :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end"
     :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``, "calendar year begin"
     :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end"

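A brief sketch of one of the new aliases in use; the anchoring months are an assumption (the offset defaults are not shown in this diff), so treat the dates as illustrative:

    import pandas as pd

    # Assuming HalfYearEnd anchors on June/December by default
    pd.date_range("2024-01-01", periods=4, freq="HYE")
    # expected half-year ends: 2024-06-30, 2024-12-31, 2025-06-30, 2025-12-31
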
doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ Conversion

 Strings
 ^^^^^^^
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values previously resulted in ``0`` and now results in the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ Other enhancements
 - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
+- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)

pandas/_libs/window/aggregations.pyi

Lines changed: 6 additions & 0 deletions
@@ -89,6 +89,12 @@ def roll_rank(
     method: WindowingRankType,
     ascending: bool,
 ) -> np.ndarray: ...  # np.ndarray[float]
+def roll_nunique(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
 def roll_apply(
     obj: object,
     start: np.ndarray,  # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

Lines changed: 61 additions & 0 deletions
@@ -6,6 +6,7 @@ from libc.math cimport (
     sqrt,
 )
 from libcpp.deque cimport deque
+from libcpp.unordered_map cimport unordered_map

 from pandas._libs.algos cimport TiebreakEnumType

@@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
     return np.asarray(output)


+def roll_nunique(const float64_t[:] values, ndarray[int64_t] start,
+                 ndarray[int64_t] end, int64_t minp) -> np.ndarray:
+    """
+    Rolling number of unique elements in the window
+    """
+    cdef:
+        Py_ssize_t i, j, s, e, N = len(start)
+        int64_t nobs = 0
+        float64_t val
+        float64_t[::1] output
+        unordered_map[float64_t, int64_t] value_counts
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
+        start, end
+    )
+    output = np.empty(N, dtype=np.float64)
+    value_counts = unordered_map[float64_t, int64_t]()
+
+    with nogil:
+        for i in range(N):
+            s = start[i]
+            e = end[i]
+
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
+                if i != 0:
+                    nobs = 0
+                    value_counts.clear()
+
+                # setup
+                for j in range(s, e):
+                    val = values[j]
+                    if val == val:
+                        nobs += 1
+                        value_counts[val] += 1
+
+            else:
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    val = values[j]
+                    if val == val:
+                        value_counts[val] -= 1
+                        if value_counts[val] == 0:
+                            value_counts.erase(val)
+                        nobs -= 1
+
+                # calculate adds
+                for j in range(end[i - 1], e):
+                    val = values[j]
+                    if val == val:
+                        nobs += 1
+                        value_counts[val] += 1
+
+            if nobs >= minp:
+                output[i] = value_counts.size()
+            else:
+                output[i] = NaN
+
+    return np.asarray(output)
+
+
 def roll_apply(object obj,
                ndarray[int64_t] start, ndarray[int64_t] end,
                int64_t minp,

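For readers who do not want to parse the Cython, the kernel keeps a hash map of value -> count and updates it incrementally: when the window bounds move monotonically it subtracts the values that left the window and adds the ones that entered, so each output is just the map's size rather than a full recount. A rough pure-Python sketch of the same bookkeeping, simplified to a fixed window size (names and the fixed-size assumption are illustrative, not part of the diff):

    import numpy as np

    def rolling_nunique_sketch(values, window, min_periods):
        counts = {}  # value -> occurrences inside the current window
        nobs = 0     # non-NaN observations currently in the window
        out = np.full(len(values), np.nan)
        for i, val in enumerate(values):
            if val == val:  # NaN-safe check, mirroring the Cython code
                counts[val] = counts.get(val, 0) + 1
                nobs += 1
            if i >= window:  # drop the value that slid out of the window
                old = values[i - window]
                if old == old:
                    counts[old] -= 1
                    if counts[old] == 0:
                        del counts[old]
                    nobs -= 1
            if nobs >= min_periods:
                out[i] = len(counts)
        return out

    rolling_nunique_sketch([1.0, 4.0, 2.0, 3.0, 5.0, 3.0], window=3, min_periods=3)
    # expected: [nan, nan, 3., 3., 3., 2.]
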
pandas/core/arrays/base.py

Lines changed: 9 additions & 1 deletion
@@ -2628,7 +2628,15 @@ def _groupby_op(
             if op.how not in ["any", "all"]:
                 # Fail early to avoid conversion to object
                 op._get_cython_function(op.kind, op.how, np.dtype(object), False)
-            npvalues = self.to_numpy(object, na_value=np.nan)
+
+            arr = self
+            if op.how == "sum":
+                # https://github.com/pandas-dev/pandas/issues/60229
+                # All NA should result in the empty string.
+                assert "skipna" in kwargs
+                if kwargs["skipna"] and min_count == 0:
+                    arr = arr.fillna("")
+            npvalues = arr.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {self.dtype}"

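A minimal sketch of the behavior change the comment points at (GH#60229), assuming a pandas build that includes this fix; the exact output repr is deliberately left out:

    import pandas as pd

    s = pd.Series([pd.NA, pd.NA], dtype="string")
    # Previously an all-NA string sum with skipna=True came back as 0;
    # with the fillna("") path above it is expected to be the empty string "".
    s.groupby([0, 0]).sum()
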
pandas/core/window/expanding.py

Lines changed: 35 additions & 0 deletions
@@ -927,6 +927,41 @@ def rank(
             numeric_only=numeric_only,
         )

+    @doc(
+        template_header,
+        ".. versionadded:: 3.0.0 \n\n",
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("See Also"),
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """
+        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+        >>> s.expanding().nunique()
+        0    1.0
+        1    2.0
+        2    3.0
+        3    4.0
+        4    5.0
+        5    5.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="expanding",
+        aggregation_description="nunique",
+        agg_method="nunique",
+    )
+    def nunique(
+        self,
+        numeric_only: bool = False,
+    ):
+        return super().nunique(
+            numeric_only=numeric_only,
+        )
+
     @doc(
         template_header,
         create_section_header("Parameters"),
