Skip to content

Commit 24c0c2c

Browse files
committed
Replace NaT with numpy's nat
1 parent 94e7d07 commit 24c0c2c

File tree

3 files changed

+230
-15
lines changed

3 files changed

+230
-15
lines changed

doc/source/whatsnew/v0.25.1.rst.save

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
.. _whatsnew_0251:
2+
3+
What's new in 0.25.1 (July XX, 2019)
4+
------------------------------------
5+
6+
Enhancements
7+
~~~~~~~~~~~~
8+
9+
10+
.. _whatsnew_0251.enhancements.other:
11+
12+
Other enhancements
13+
^^^^^^^^^^^^^^^^^^
14+
15+
-
16+
-
17+
-
18+
19+
.. _whatsnew_0251.bug_fixes:
20+
21+
Bug fixes
22+
~~~~~~~~~
23+
24+
25+
Categorical
26+
^^^^^^^^^^^
27+
28+
- Bug in :meth:`Categorical.fillna` would replace all values, not just those that are ``NaN`` (:issue:`26215`)
29+
-
30+
31+
Datetimelike
32+
^^^^^^^^^^^^
33+
- Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`)
34+
-
35+
-
36+
-
37+
38+
Timedelta
39+
^^^^^^^^^
40+
41+
-
42+
-
43+
-
44+
45+
Timezones
46+
^^^^^^^^^
47+
48+
- Bug in :class:`Index` where a numpy object array with a timezone aware :class:`Timestamp` and ``np.nan`` would not return a :class:`DatetimeIndex` (:issue:`27011`)
49+
-
50+
-
51+
52+
Numeric
53+
^^^^^^^
54+
- Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`)
55+
- Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`)
56+
-
57+
-
58+
59+
Conversion
60+
^^^^^^^^^^
61+
62+
- Improved the warnings for the deprecated methods :meth:`Series.real` and :meth:`Series.imag` (:issue:`27610`)
63+
-
64+
-
65+
66+
Strings
67+
^^^^^^^
68+
69+
-
70+
-
71+
-
72+
73+
74+
Interval
75+
^^^^^^^^
76+
- Bug in :class:`IntervalIndex` where ``dir(obj)`` would raise ``ValueError`` (:issue:`27571`)
77+
-
78+
-
79+
-
80+
81+
Indexing
82+
^^^^^^^^
83+
84+
- Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
85+
- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
86+
- Fix regression in assigning values to a single column of a DataFrame with ``MultiIndex`` columns (:issue:`27841`).
87+
-
88+
89+
Missing
90+
^^^^^^^
91+
92+
-
93+
-
94+
-
95+
96+
MultiIndex
97+
^^^^^^^^^^
98+
99+
-
100+
-
101+
-
102+
103+
I/O
104+
^^^
105+
106+
- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
107+
- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
108+
-
109+
110+
Plotting
111+
^^^^^^^^
112+
113+
- Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`).
114+
- Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`).
115+
-
116+
117+
Groupby/resample/rolling
118+
^^^^^^^^^^^^^^^^^^^^^^^^
119+
120+
- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
121+
- Bug in windowing over read-only arrays (:issue:`27766`)
122+
- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
123+
-
124+
-
125+
126+
Reshaping
127+
^^^^^^^^^
128+
129+
- A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
130+
- Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. (:issue:`27500`)
131+
- :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
132+
- Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`)
133+
134+
Sparse
135+
^^^^^^
136+
- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`)
137+
-
138+
-
139+
-
140+
141+
142+
Build Changes
143+
^^^^^^^^^^^^^
144+
145+
-
146+
-
147+
-
148+
149+
ExtensionArray
150+
^^^^^^^^^^^^^^
151+
152+
-
153+
-
154+
-
155+
156+
Other
157+
^^^^^
158+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`)
159+
-
160+
-
161+
-
162+
163+
.. _whatsnew_0251.contributors:
164+
165+
Contributors
166+
~~~~~~~~~~~~
167+
168+
.. contributors:: v0.25.0..HEAD

pandas/core/groupby/generic.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import pandas.core.algorithms as algorithms
4646
from pandas.core.base import DataError, SpecificationError
4747
import pandas.core.common as com
48+
from pandas.core.index import NaT
4849
from pandas.core.frame import DataFrame
4950
from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs
5051
from pandas.core.groupby import base
@@ -1140,14 +1141,17 @@ def nunique(self, dropna=True):
11401141
Number of unique values within each group.
11411142
"""
11421143
ids, _, _ = self.grouper.group_info
1144+
# breakpoint()
11431145

11441146
val = self.obj._internal_get_values()
1147+
# breakpoint()
11451148

1146-
# GH 27951
1147-
if dropna:
1148-
mask = notna(val)
1149-
ids = ids[mask]
1150-
val = val[mask]
1149+
# # GH 27951
1150+
# breakpoint()
1151+
val[isna(val)] = np.datetime64("NaT")
1152+
# mask = notna(val)
1153+
# ids = ids[mask]
1154+
# val = val[mask]
11511155

11521156
try:
11531157
sorter = np.lexsort((val, ids))
@@ -1159,22 +1163,29 @@ def nunique(self, dropna=True):
11591163
_isna = lambda a: a == -1
11601164
else:
11611165
_isna = isna
1166+
# breakpoint()
11621167

11631168
ids, val = ids[sorter], val[sorter]
1169+
# breakpoint()
11641170

11651171
# group boundaries are where group ids change
11661172
# unique observations are where sorted values change
1173+
# idx: ids at which groups change
11671174
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
1175+
# inc: 1 if it's a new value, 0 else
11681176
inc = np.r_[1, val[1:] != val[:-1]]
1177+
# breakpoint()
11691178

11701179
# 1st item of each group is a new unique observation
11711180
mask = _isna(val)
11721181
if dropna:
1173-
inc[idx] = 1
1174-
inc[mask] = 0
1182+
inc[idx] = 1  # 1st item of each group is definitely unique
1183+
inc[mask] = 0  # NaNs are excluded from the count when dropna=True
11751184
else:
1176-
inc[mask & np.r_[False, mask[:-1]]] = 0
1177-
inc[idx] = 1
1185+
inc[mask & np.r_[False, mask[:-1]]] = 0  # count only the first NaN
1186+
# in each run of consecutive NaNs
1187+
inc[idx] = 1 # As before, first of each group is defo unique
1188+
# breakpoint()
11781189

11791190
out = np.add.reduceat(inc, idx).astype("int64", copy=False)
11801191
if len(ids):
@@ -1188,11 +1199,13 @@ def nunique(self, dropna=True):
11881199
else:
11891200
res = out[1:]
11901201
ri = self.grouper.result_index
1202+
# breakpoint()
11911203

11921204
# we might have duplications among the bins
11931205
if len(res) != len(ri):
11941206
res, out = np.zeros(len(ri), dtype=out.dtype), res
11951207
res[ids[idx]] = out
1208+
# breakpoint()
11961209

11971210
return Series(res, index=ri, name=self._selection_name)
11981211

pandas/tests/groupby/test_function.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -1026,7 +1026,7 @@ def test_nunique_with_timegrouper():
10261026

10271027

10281028
@pytest.mark.parametrize(
1029-
"data, expected",
1029+
"data, dropna, expected",
10301030
[
10311031
(
10321032
DataFrame(
@@ -1041,7 +1041,8 @@ def test_nunique_with_timegrouper():
10411041
],
10421042
}
10431043
),
1044-
1,
1044+
True,
1045+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
10451046
),
10461047
(
10471048
DataFrame(
@@ -1056,14 +1057,47 @@ def test_nunique_with_timegrouper():
10561057
],
10571058
}
10581059
),
1059-
1,
1060+
True,
1061+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
1062+
),
1063+
(
1064+
DataFrame(
1065+
{
1066+
"key": ["x", "x", "x", "y", "y"],
1067+
"data": [
1068+
dt.date(2019, 1, 1),
1069+
NaT,
1070+
dt.date(2019, 1, 1),
1071+
NaT,
1072+
dt.date(2019, 1, 1),
1073+
],
1074+
}
1075+
),
1076+
False,
1077+
Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
1078+
),
1079+
(
1080+
DataFrame(
1081+
{
1082+
"key": ["x", "x", "x", "x", "y"],
1083+
"data": [
1084+
dt.date(2019, 1, 1),
1085+
NaT,
1086+
dt.date(2019, 1, 1),
1087+
NaT,
1088+
dt.date(2019, 1, 1),
1089+
],
1090+
}
1091+
),
1092+
False,
1093+
Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
10601094
),
10611095
],
10621096
)
1063-
def test_nunique_with_NaT(data, expected):
1097+
def test_nunique_with_NaT(data, dropna, expected):
10641098
# GH 27951
1065-
result = data.groupby(["key"])["data"].nunique()[0]
1066-
assert result == expected
1099+
result = data.groupby(["key"])["data"].nunique(dropna=dropna)
1100+
tm.assert_series_equal(result, expected)
10671101

10681102

10691103
def test_nunique_preserves_column_level_names():

0 commit comments

Comments
 (0)