BUG: GroupBy.get_group raises ValueError when group key contains NaT

sinhrks · sinhrks · commit 173185235d95 · 2015-05-16T01:08:30.000+09:00
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
-NA group handling
+NA and NaT group handling
 ~~~~~~~~~~~~~~~~~
 
-If there are any NaN values in the grouping key, these will be automatically
-excluded. So there will never be an "NA group". This was not the case in older
+If there are any NaN or NaT values in the grouping key, these will be automatically
+excluded. So there will never be an "NA group" or "NaT group". This was not the case in older
 versions of pandas, but users were generally discarding the NA group anyway
 (and supporting it was an implementation headache).
 
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -68,3 +68,8 @@ Bug Fixes
 
 
 
+
+
+- Bug in ``GroupBy.get_group`` raising ``ValueError`` when the group key contains ``NaT`` (:issue:`6992`)
+
+
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -61,6 +61,8 @@ cdef extern from "src/headers/math.h":
     int signbit(double)
 
 from pandas import lib
+from pandas import tslib
+cdef object NaT = tslib.NaT
 
 include "skiplist.pyx"
 
@@ -2010,7 +2012,7 @@ def groupby_indices(ndarray values):
         k = labels[i]
 
         # was NaN
-        if k == -1:
+        if k == -1 or k is NaT:
             continue
 
         loc = seen[k]
@@ -2043,7 +2045,7 @@ def group_labels(ndarray[object] values):
         val = values[i]
 
         # is NaN
-        if val != val:
+        if val != val or val is NaT:
             labels[i] = -1
             continue
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -426,7 +426,11 @@ def convert(key, s):
                 return Timestamp(key).asm8
             return key
 
-        sample = next(iter(self.indices))
+        if len(self.indices) > 0:
+            sample = next(iter(self.indices))
+        else:
+            sample = None       # Dummy sample
+
         if isinstance(sample, tuple):
             if not isinstance(name, tuple):
                 msg = ("must supply a tuple to get_group with multiple"
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -37,6 +37,8 @@
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport tslib
+from tslib cimport _checknull_with_np_nat
 
 cdef int64_t iNaT = get_nat()
 
@@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
@@ -28,8 +28,11 @@ ctypedef unsigned char UChar
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport tslib
+from tslib cimport _checknull_with_np_nat
 
 cdef int64_t iNaT = get_nat()
+np_NaT = np.datetime64('NaT')
 
 # import datetime C API
 PyDateTime_IMPORT
@@ -2096,7 +2099,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
@@ -2124,7 +2127,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
@@ -2152,7 +2155,7 @@ def groupby_object(ndarray[object] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
@@ -2180,7 +2183,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
@@ -2208,7 +2211,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
@@ -2236,7 +2239,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if _checknull_with_np_nat(key):
             continue
 
         idx = index[i]
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -699,7 +699,6 @@ def test_get_group(self):
         expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
         assert_panel_equal(gp, expected)
 
-
         # GH 5267
         # be datelike friendly
         df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
@@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self):
         result = df.groupby(['foo', 'bar']).mean()
         expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
 
+    def test_groupby_nat_exclude(self):
+        # GH 6992
+        df = pd.DataFrame({'values': np.random.randn(8),
+                   'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'),
+                          np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')],
+                    'str': [np.nan, 'a', np.nan, 'a',
+                          np.nan, 'a', np.nan, 'b']})
+        grouped = df.groupby('dt')
+
+        expected = [[1, 7], [3, 5]]
+        keys = sorted(grouped.groups.keys())
+        self.assertEqual(len(keys), 2)
+        for k, e in zip(keys, expected):
+            # grouped.groups keys are np.datetime64 values in the system tz;
+            # to stay independent of tz, compare only the grouped values
+            self.assertEqual(grouped.groups[k], e)
+
+        # confirm obj is not filtered
+        tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+        self.assertEqual(grouped.ngroups, 2)
+        expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]),
+                    Timestamp('2013-02-01 00:00:00'): np.array([3, 5])}
+        for k in grouped.indices:
+            self.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+        self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
+        nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
+                            'nat': [pd.NaT, pd.NaT, pd.NaT]})
+        self.assertEqual(nan_df['nan'].dtype, 'float64')
+        self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')
+
+        for key in ['nan', 'nat']:
+            grouped = nan_df.groupby(key)
+            self.assertEqual(grouped.groups, {})
+            self.assertEqual(grouped.ngroups, 0)
+            self.assertEqual(grouped.indices, {})
+            self.assertRaises(KeyError, grouped.get_group, np.nan)
+            self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
     def test_dictify(self):
         dict(iter(self.df.groupby('A')))
         dict(iter(self.df.groupby(['A', 'B'])))
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -1858,6 +1858,34 @@ def test_ufunc_compat(self):
         expected = Float64Index(np.sin(np.arange(5,dtype='int64')))
         tm.assert_index_equal(result, expected)
 
+    def test_index_groupby(self):
+        int_idx = Index(range(6))
+        float_idx = Index(np.arange(0, 0.6, 0.1))
+        obj_idx = Index('A B C D E F'.split())
+        dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)
+
+        for idx in [int_idx, float_idx, obj_idx, dt_idx]:
+            to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
+            self.assertEqual(idx.groupby(to_groupby),
+                             {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})
+            self.assertEqual(idx.groupby(to_groupby, dropna=False),
+                             {np.nan:[idx[2], idx[3]], 1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})
+
+            to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1),
+                                pd.NaT, pd.NaT,
+                                datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values
+
+            ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')]))
+            expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]}
+            self.assertEqual(idx.groupby(to_groupby), expected)
+
+            ex_keys = pd.tslib.datetime_to_datetime64(np.array([pd.NaT, Timestamp('2011-11-01'),
+                                                                Timestamp('2011-12-01')]))
+            expected = {ex_keys[0][0]: [idx[2], idx[3]],
+                        ex_keys[0][1]: [idx[0], idx[5]],
+                        ex_keys[0][2]: [idx[1], idx[4]]}
+            self.assertEqual(idx.groupby(to_groupby, dropna=False), expected)
+
 
 class TestFloat64Index(Numeric, tm.TestCase):
     _holder = Float64Index
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
@@ -2061,6 +2061,12 @@ def test_pickle(self):
         self.assertTrue(idx_p[1] is NaT)
         self.assertTrue(idx_p[2] == idx[2])
 
+    def test_indexing_doesnt_change_class(self):
+        idx = Index([1, 2, 3, 'a', 'b', 'c'])
+
+        self.assertTrue(idx[1:3].identical(pd.Index([2, 3], dtype=np.object_)))
+        self.assertTrue(idx[[0,1]].identical(pd.Index([1, 2], dtype=np.object_)))
+
 
 def _simple_ts(start, end, freq='D'):
     rng = date_range(start, end, freq=freq)
diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd
@@ -7,3 +7,4 @@ cdef bint _is_utc(object)
 cdef bint _is_tzlocal(object)
 cdef object _get_dst_info(object)
 cdef bint _nat_scalar_rules[6]
+cdef bint _checknull_with_np_nat(object)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
@@ -646,6 +646,13 @@ cdef inline bint _checknull_with_nat(object val):
     return val is None or (
         PyFloat_Check(val) and val != val) or val is NaT
 
+
+cdef inline bint _checknull_with_np_nat(object val):
+    """ utility to check if a value is None, a float NaN, or equal to np_NaT """
+    return val is None or (
+        PyFloat_Check(val) and val != val) or val == np_NaT
+
+
 cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1:
     return _nat_scalar_rules[op]
 

-Original file line number
+Diff line change
++
++
 +- Bug in GroupBy.get_group raises ValueError when group key contains NaT (:issue:`6992`)
++
++