Skip to content

Commit 24c0c2c

Browse files
committed
Replace NaT with numpy's nat
1 parent 94e7d07 commit 24c0c2c

File tree

3 files changed

+230
-15
lines changed

3 files changed

+230
-15
lines changed

doc/source/whatsnew/v0.25.1.rst.save

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
.. _whatsnew_0251:
2+
3+
What's new in 0.25.1 (July XX, 2019)
4+
------------------------------------
5+
6+
Enhancements
7+
~~~~~~~~~~~~
8+
9+
10+
.. _whatsnew_0251.enhancements.other:
11+
12+
Other enhancements
13+
^^^^^^^^^^^^^^^^^^
14+
15+
-
16+
-
17+
-
18+
19+
.. _whatsnew_0251.bug_fixes:
20+
21+
Bug fixes
22+
~~~~~~~~~
23+
24+
25+
Categorical
26+
^^^^^^^^^^^
27+
28+
- Bug in :meth:`Categorical.fillna` would replace all values, not just those that are ``NaN`` (:issue:`26215`)
29+
-
30+
31+
Datetimelike
32+
^^^^^^^^^^^^
33+
- Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`)
34+
-
35+
-
36+
-
37+
38+
Timedelta
39+
^^^^^^^^^
40+
41+
-
42+
-
43+
-
44+
45+
Timezones
46+
^^^^^^^^^
47+
48+
- Bug in :class:`Index` where a numpy object array with a timezone aware :class:`Timestamp` and ``np.nan`` would not return a :class:`DatetimeIndex` (:issue:`27011`)
49+
-
50+
-
51+
52+
Numeric
53+
^^^^^^^
54+
- Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`)
55+
- Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`)
56+
-
57+
-
58+
59+
Conversion
60+
^^^^^^^^^^
61+
62+
- Improved the warnings for the deprecated methods :meth:`Series.real` and :meth:`Series.imag` (:issue:`27610`)
63+
-
64+
-
65+
66+
Strings
67+
^^^^^^^
68+
69+
-
70+
-
71+
-
72+
73+
74+
Interval
75+
^^^^^^^^
76+
- Bug in :class:`IntervalIndex` where ``dir(obj)`` would raise ``ValueError`` (:issue:`27571`)
77+
-
78+
-
79+
-
80+
81+
Indexing
82+
^^^^^^^^
83+
84+
- Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
85+
- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
86+
- Fix regression in assigning values to a single column of a DataFrame with ``MultiIndex`` columns (:issue:`27841`).
87+
-
88+
89+
Missing
90+
^^^^^^^
91+
92+
-
93+
-
94+
-
95+
96+
MultiIndex
97+
^^^^^^^^^^
98+
99+
-
100+
-
101+
-
102+
103+
I/O
104+
^^^
105+
106+
- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
107+
- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
108+
-
109+
110+
Plotting
111+
^^^^^^^^
112+
113+
- Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`).
114+
- Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`).
115+
-
116+
117+
Groupby/resample/rolling
118+
^^^^^^^^^^^^^^^^^^^^^^^^
119+
120+
- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
121+
- Bug in windowing over read-only arrays (:issue:`27766`)
122+
- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
123+
-
124+
-
125+
126+
Reshaping
127+
^^^^^^^^^
128+
129+
- A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
130+
- Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. (:issue:`27500`)
131+
- :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
132+
- Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`)
133+
134+
Sparse
135+
^^^^^^
136+
- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`)
137+
-
138+
-
139+
-
140+
141+
142+
Build Changes
143+
^^^^^^^^^^^^^
144+
145+
-
146+
-
147+
-
148+
149+
ExtensionArray
150+
^^^^^^^^^^^^^^
151+
152+
-
153+
-
154+
-
155+
156+
Other
157+
^^^^^
158+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`)
159+
-
160+
-
161+
-
162+
163+
.. _whatsnew_0251.contributors:
164+
165+
Contributors
166+
~~~~~~~~~~~~
167+
168+
.. contributors:: v0.25.0..HEAD

pandas/core/groupby/generic.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import pandas.core.algorithms as algorithms
4646
from pandas.core.base import DataError, SpecificationError
4747
import pandas.core.common as com
48+
from pandas.core.index import NaT
4849
from pandas.core.frame import DataFrame
4950
from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs
5051
from pandas.core.groupby import base
@@ -1140,14 +1141,17 @@ def nunique(self, dropna=True):
11401141
Number of unique values within each group.
11411142
"""
11421143
ids, _, _ = self.grouper.group_info
1144+
# breakpoint()
11431145

11441146
val = self.obj._internal_get_values()
1147+
# breakpoint()
11451148

1146-
# GH 27951
1147-
if dropna:
1148-
mask = notna(val)
1149-
ids = ids[mask]
1150-
val = val[mask]
1149+
# # GH 27951
1150+
# breakpoint()
1151+
val[isna(val)] = np.datetime64("NaT")
1152+
# mask = notna(val)
1153+
# ids = ids[mask]
1154+
# val = val[mask]
11511155

11521156
try:
11531157
sorter = np.lexsort((val, ids))
@@ -1159,22 +1163,29 @@ def nunique(self, dropna=True):
11591163
_isna = lambda a: a == -1
11601164
else:
11611165
_isna = isna
1166+
# breakpoint()
11621167

11631168
ids, val = ids[sorter], val[sorter]
1169+
# breakpoint()
11641170

11651171
# group boundaries are where group ids change
11661172
# unique observations are where sorted values change
1173+
# idx: ids at which groups change
11671174
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
1175+
# inc: 1 if it's a new value, 0 else
11681176
inc = np.r_[1, val[1:] != val[:-1]]
1177+
# breakpoint()
11691178

11701179
# 1st item of each group is a new unique observation
11711180
mask = _isna(val)
11721181
if dropna:
1173-
inc[idx] = 1
1174-
inc[mask] = 0
1182+
inc[idx] = 1  # 1st item of each group is definitely unique
1183+
inc[mask] = 0  # NaNs are excluded from the count when dropna=True
11751184
else:
1176-
inc[mask & np.r_[False, mask[:-1]]] = 0
1177-
inc[idx] = 1
1185+
inc[mask & np.r_[False, mask[:-1]]] = 0  # count only the first NaN
1186+
# in each run of consecutive NaNs
1187+
inc[idx] = 1 # As before, first of each group is defo unique
1188+
# breakpoint()
11781189

11791190
out = np.add.reduceat(inc, idx).astype("int64", copy=False)
11801191
if len(ids):
@@ -1188,11 +1199,13 @@ def nunique(self, dropna=True):
11881199
else:
11891200
res = out[1:]
11901201
ri = self.grouper.result_index
1202+
# breakpoint()
11911203

11921204
# we might have duplications among the bins
11931205
if len(res) != len(ri):
11941206
res, out = np.zeros(len(ri), dtype=out.dtype), res
11951207
res[ids[idx]] = out
1208+
# breakpoint()
11961209

11971210
return Series(res, index=ri, name=self._selection_name)
11981211

pandas/tests/groupby/test_function.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -1026,7 +1026,7 @@ def test_nunique_with_timegrouper():
10261026

10271027

10281028
@pytest.mark.parametrize(
1029-
"data, expected",
1029+
"data, dropna, expected",
10301030
[
10311031
(
10321032
DataFrame(
@@ -1041,7 +1041,8 @@ def test_nunique_with_timegrouper():
10411041
],
10421042
}
10431043
),
1044-
1,
1044+
True,
1045+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
10451046
),
10461047
(
10471048
DataFrame(
@@ -1056,14 +1057,47 @@ def test_nunique_with_timegrouper():
10561057
],
10571058
}
10581059
),
1059-
1,
1060+
True,
1061+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
1062+
),
1063+
(
1064+
DataFrame(
1065+
{
1066+
"key": ["x", "x", "x", "y", "y"],
1067+
"data": [
1068+
dt.date(2019, 1, 1),
1069+
NaT,
1070+
dt.date(2019, 1, 1),
1071+
NaT,
1072+
dt.date(2019, 1, 1),
1073+
],
1074+
}
1075+
),
1076+
False,
1077+
Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
1078+
),
1079+
(
1080+
DataFrame(
1081+
{
1082+
"key": ["x", "x", "x", "x", "y"],
1083+
"data": [
1084+
dt.date(2019, 1, 1),
1085+
NaT,
1086+
dt.date(2019, 1, 1),
1087+
NaT,
1088+
dt.date(2019, 1, 1),
1089+
],
1090+
}
1091+
),
1092+
False,
1093+
Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
10601094
),
10611095
],
10621096
)
1063-
def test_nunique_with_NaT(data, expected):
1097+
def test_nunique_with_NaT(data, dropna, expected):
10641098
# GH 27951
1065-
result = data.groupby(["key"])["data"].nunique()[0]
1066-
assert result == expected
1099+
result = data.groupby(["key"])["data"].nunique(dropna=dropna)
1100+
tm.assert_series_equal(result, expected)
10671101

10681102

10691103
def test_nunique_preserves_column_level_names():

0 commit comments

Comments
 (0)