
Commit 3605738

BUG: GroupBy.get_group raises ValueError when group key contains NaT
1 parent: d03a22f

8 files changed: +92, -14 lines

doc/source/groupby.rst

(+3, -3)

@@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
-NA group handling
+NA and NaT group handling
 ~~~~~~~~~~~~~~~~~
 
-If there are any NaN values in the grouping key, these will be automatically
-excluded. So there will never be an "NA group". This was not the case in older
+If there are any NaN or NaT values in the grouping key, these will be automatically
+excluded. So there will never be an "NA group" or "NaT group". This was not the case in older
 versions of pandas, but users were generally discarding the NA group anyway
 (and supporting it was an implementation headache).
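
Note: a minimal illustration of the behaviour the updated passage describes (the column and values below are made up for this example, not taken from the docs):

    import pandas as pd

    # Null grouping keys (NaN for floats, NaT for datetimes) are dropped,
    # so neither an "NA group" nor a "NaT group" appears in the result.
    df = pd.DataFrame({'key_dt': pd.to_datetime(['2013-01-01', None, '2013-01-01', '2013-02-01']),
                       'val': [10, 20, 30, 40]})
    df.groupby('key_dt')['val'].sum()   # only the 2013-01-01 and 2013-02-01 groups remain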

doc/source/whatsnew/v0.17.0.txt

(+7)

@@ -66,4 +66,11 @@ Bug Fixes
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
 
+
 - Bug in getting timezone data with ``dateutil`` on various platforms ( :issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
+
+
+
+
+- Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`)
+
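
A short reproduction sketch for the :issue:`6992` entry (the frame below is illustrative, not the example from the original report): before this commit, grouping on a datetime column containing ``NaT`` could make ``get_group`` raise ``ValueError``; with the fix, real keys resolve normally and ``NaT``, which is never a group key, raises ``KeyError`` just like ``NaN``.

    import pandas as pd

    df = pd.DataFrame({'dt': [pd.Timestamp('2013-01-01'), pd.NaT, pd.Timestamp('2013-02-01')],
                       'x': [1, 2, 3]})
    grouped = df.groupby('dt')

    grouped.get_group(pd.Timestamp('2013-01-01'))   # resolves normally after the fix
    try:
        grouped.get_group(pd.NaT)                   # NaT rows are excluded from the groups
    except KeyError:
        pass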

pandas/algos.pyx

(+4, -2)

@@ -61,6 +61,8 @@ cdef extern from "src/headers/math.h":
     int signbit(double)
 
 from pandas import lib
+from pandas import tslib
+cdef object NaT = tslib.NaT
 
 include "skiplist.pyx"
 
@@ -2010,7 +2012,7 @@ def groupby_indices(ndarray values):
         k = labels[i]
 
         # was NaN
-        if k == -1:
+        if k == -1 or k is NaT:
             continue
 
         loc = seen[k]
@@ -2043,7 +2045,7 @@ def group_labels(ndarray[object] values):
         val = values[i]
 
         # is NaN
-        if val != val:
+        if val != val or val is NaT:
             labels[i] = -1
             continue
 
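The two hunks above extend the existing null checks: the "val != val" trick catches float NaN, and the added identity test against the NaT singleton covers datetime-like nulls regardless of how NaT's comparison operators behave. A plain-Python sketch of the distinction (an illustration, not the Cython code itself):

    import numpy as np
    import pandas as pd

    val = np.nan
    val != val        # True: NaN is unequal to itself, so the existing check catches it

    val = pd.NaT
    val is pd.NaT     # True: NaT is a singleton, so an identity check detects it
                      # without relying on its comparison behaviour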

pandas/core/groupby.py

(+5, -1)

@@ -426,7 +426,11 @@ def convert(key, s):
                 return Timestamp(key).asm8
             return key
 
-        sample = next(iter(self.indices))
+        if len(self.indices) > 0:
+            sample = next(iter(self.indices))
+        else:
+            sample = None  # Dummy sample
+
         if isinstance(sample, tuple):
             if not isinstance(name, tuple):
                 msg = ("must supply a tuple to get_group with multiple"
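
The added guard covers the case where every grouping key is null: self.indices is then an empty dict, and next(iter(self.indices)) would raise StopIteration before get_group could report the missing key. A stripped-down sketch of the pattern in isolation (variable names are illustrative):

    indices = {}                        # e.g. all group keys were NaN/NaT

    if len(indices) > 0:
        sample = next(iter(indices))    # any existing key, used to infer the key type
    else:
        sample = None                   # dummy sample; the later key lookup raises KeyError

    # without the guard, next(iter({})) raises StopIteration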

pandas/src/generate_code.py

(+3, -1)

@@ -37,6 +37,8 @@
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport lib
+from lib cimport is_null_datetimelike
 
 cdef int64_t iNaT = get_nat()
 
@@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
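
is_null_datetimelike comes from pandas' lib (imported in the hunk above) and widens the plain _checknull test so that datetime-like nulls such as NaT are also treated as missing group labels. A rough pure-Python approximation of the intended semantics (an illustration under stated assumptions, not the actual Cython implementation):

    import numpy as np
    import pandas as pd

    iNaT = np.iinfo(np.int64).min       # pandas' internal int64 sentinel for NaT

    def null_datetimelike(val):
        # None, float NaN and the NaT singleton are missing
        if val is None or val is pd.NaT:
            return True
        if isinstance(val, float) and val != val:
            return True
        # NaT-valued numpy datetime64/timedelta64 scalars are missing too
        if isinstance(val, (np.datetime64, np.timedelta64)):
            return val.view('int64') == iNaT
        # assumption for this sketch: a raw int64 equal to the sentinel also counts
        if isinstance(val, (int, np.integer)):
            return val == iNaT
        return False

    null_datetimelike(pd.NaT), null_datetimelike(np.nan), null_datetimelike(3)   # (True, True, False)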

pandas/src/generated.pyx

(+8, -6)

@@ -28,6 +28,8 @@ ctypedef unsigned char UChar
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport lib
+from lib cimport is_null_datetimelike
 
 cdef int64_t iNaT = get_nat()
 
@@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels):
     for i in range(length):
        key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]

pandas/tests/test_groupby.py

(+43, -1)

@@ -699,7 +699,6 @@ def test_get_group(self):
         expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
         assert_panel_equal(gp, expected)
 
-
         # GH 5267
         # be datelike friendly
         df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
@@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self):
         result = df.groupby(['foo', 'bar']).mean()
         expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
 
+    def test_groupby_nat_exclude(self):
+        # GH 6992
+        df = pd.DataFrame({'values': np.random.randn(8),
+                           'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'),
+                                  np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')],
+                           'str': [np.nan, 'a', np.nan, 'a',
+                                   np.nan, 'a', np.nan, 'b']})
+        grouped = df.groupby('dt')
+
+        expected = [[1, 7], [3, 5]]
+        keys = sorted(grouped.groups.keys())
+        self.assertEqual(len(keys), 2)
+        for k, e in zip(keys, expected):
+            # grouped.groups keys are np.datetime64 with system tz
+            # not to be affected by tz, only compare values
+            self.assertEqual(grouped.groups[k], e)
+
+        # confirm obj is not filtered
+        tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+        self.assertEqual(grouped.ngroups, 2)
+        expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]),
+                    Timestamp('2013-02-01 00:00:00'): np.array([3, 5])}
+        for k in grouped.indices:
+            self.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+        self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
+        nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
+                            'nat': [pd.NaT, pd.NaT, pd.NaT]})
+        self.assertEqual(nan_df['nan'].dtype, 'float64')
+        self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')
+
+        for key in ['nan', 'nat']:
+            grouped = nan_df.groupby(key)
+            self.assertEqual(grouped.groups, {})
+            self.assertEqual(grouped.ngroups, 0)
+            self.assertEqual(grouped.indices, {})
+            self.assertRaises(KeyError, grouped.get_group, np.nan)
+            self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
     def test_dictify(self):
         dict(iter(self.df.groupby('A')))
         dict(iter(self.df.groupby(['A', 'B'])))

pandas/tests/test_index.py

(+19)

@@ -1858,6 +1858,25 @@ def test_ufunc_compat(self):
         expected = Float64Index(np.sin(np.arange(5,dtype='int64')))
         tm.assert_index_equal(result, expected)
 
+    def test_index_groupby(self):
+        int_idx = Index(range(6))
+        float_idx = Index(np.arange(0, 0.6, 0.1))
+        obj_idx = Index('A B C D E F'.split())
+        dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)
+
+        for idx in [int_idx, float_idx, obj_idx, dt_idx]:
+            to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
+            self.assertEqual(idx.groupby(to_groupby),
+                             {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})
+
+            to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1),
+                                pd.NaT, pd.NaT,
+                                datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values
+
+            ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')]))
+            expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]}
+            self.assertEqual(idx.groupby(to_groupby), expected)
+
 
 class TestFloat64Index(Numeric, tm.TestCase):
     _holder = Float64Index
