diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 7ad2641dec52a..c9e18b585c764 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -784,11 +784,11 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
-NA group handling
-~~~~~~~~~~~~~~~~~
+NA and NaT group handling
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-If there are any NaN values in the grouping key, these will be automatically
-excluded. So there will never be an "NA group". This was not the case in older
+If there are any NaN or NaT values in the grouping key, these will be automatically
+excluded. So there will never be an "NA group" or "NaT group". This was not the case in older
 versions of pandas, but users were generally discarding the NA group anyway
 (and supporting it was an implementation headache).
 
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index a7917e81f7057..5d4d149798d21 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -69,13 +69,19 @@ Bug Fixes
 
 - Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
+
 - Bug in getting timezone data with ``dateutil`` on various platforms (:issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
 - Bug in displaying datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`)
+
 - Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetic (:issue:`9926`)
+
 - Bug in `Series.plot(label="LABEL")` not correctly setting the label (:issue:`10119`)
 - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`)
+- Bug in ``GroupBy.get_group`` raises ``ValueError`` when group key contains ``NaT`` (:issue:`6992`)
+
+
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ffc3e6a08221c..51674bad60f5b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -426,7 +426,11 @@ def convert(key, s):
                 return Timestamp(key).asm8
             return key
 
-        sample = next(iter(self.indices))
+        if len(self.indices) > 0:
+            sample = next(iter(self.indices))
+        else:
+            sample = None  # Dummy sample; there are no groups (e.g. all keys are NaN/NaT)
+
         if isinstance(sample, tuple):
             if not isinstance(name, tuple):
                 msg = ("must supply a tuple to get_group with multiple"
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index a0cdc0ff5e841..598cdff30e4f7 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -37,6 +37,8 @@
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport lib
+from lib cimport is_null_datetimelike
 
 cdef int64_t iNaT = get_nat()
 
@@ -673,7 +675,7 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
index 79722a26ebedc..428decd4dca10 100644
--- a/pandas/src/generated.pyx
+++ b/pandas/src/generated.pyx
@@ -28,6 +28,8 @@ ctypedef unsigned char UChar
 
 cimport util
 from util cimport is_array, _checknull, _checknan, get_nat
+cimport lib
+from lib cimport is_null_datetimelike
 
 cdef int64_t iNaT = get_nat()
 
@@ -2096,7 +2098,7 @@ def groupby_float64(ndarray[float64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2124,7 +2126,7 @@ def groupby_float32(ndarray[float32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2152,7 +2154,7 @@ def groupby_object(ndarray[object] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2180,7 +2182,7 @@ def groupby_int32(ndarray[int32_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2208,7 +2210,7 @@ def groupby_int64(ndarray[int64_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
@@ -2236,7 +2238,7 @@ def groupby_bool(ndarray[uint8_t] index, ndarray labels):
     for i in range(length):
         key = util.get_value_1d(labels, i)
 
-        if _checknull(key):
+        if is_null_datetimelike(key):
             continue
 
         idx = index[i]
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index c308308603167..0789e20df3945 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -699,7 +699,6 @@ def test_get_group(self):
         expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
         assert_panel_equal(gp, expected)
 
-        # GH 5267
         # be datelike friendly
         df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013',
@@ -2837,6 +2836,49 @@ def test_groupby_list_infer_array_like(self):
         result = df.groupby(['foo', 'bar']).mean()
         expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
 
+    def test_groupby_nat_exclude(self):
+        # GH 6992
+        df = pd.DataFrame({'values': np.random.randn(8),
+                           'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp('2013-02-01'),
+                                  np.nan, pd.Timestamp('2013-02-01'), np.nan, pd.Timestamp('2013-01-01')],
+                           'str': [np.nan, 'a', np.nan, 'a',
+                                   np.nan, 'a', np.nan, 'b']})
+        grouped = df.groupby('dt')
+
+        expected = [[1, 7], [3, 5]]
+        keys = sorted(grouped.groups.keys())
+        self.assertEqual(len(keys), 2)
+        for k, e in zip(keys, expected):
+            # grouped.groups keys are np.datetime64 with system tz
+            # not to be affected by tz, only compare values
+            self.assertEqual(grouped.groups[k], e)
+
+        # confirm obj is not filtered
+        tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+        self.assertEqual(grouped.ngroups, 2)
+        expected = {Timestamp('2013-01-01 00:00:00'): np.array([1, 7]),
+                    Timestamp('2013-02-01 00:00:00'): np.array([3, 5])}
+        for k in grouped.indices:
+            self.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+        tm.assert_frame_equal(grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+        self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
+        nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
+                            'nat': [pd.NaT, pd.NaT, pd.NaT]})
+        self.assertEqual(nan_df['nan'].dtype, 'float64')
+        self.assertEqual(nan_df['nat'].dtype, 'datetime64[ns]')
+
+        for key in ['nan', 'nat']:
+            grouped = nan_df.groupby(key)
+            self.assertEqual(grouped.groups, {})
+            self.assertEqual(grouped.ngroups, 0)
+            self.assertEqual(grouped.indices, {})
+            self.assertRaises(KeyError, grouped.get_group, np.nan)
+            self.assertRaises(KeyError, grouped.get_group, pd.NaT)
+
     def test_dictify(self):
         dict(iter(self.df.groupby('A')))
         dict(iter(self.df.groupby(['A', 'B'])))
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 444aa2a0bab1e..93299292cf353 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -1858,6 +1858,25 @@ def test_ufunc_compat(self):
         expected = Float64Index(np.sin(np.arange(5,dtype='int64')))
         tm.assert_index_equal(result, expected)
 
+    def test_index_groupby(self):
+        int_idx = Index(range(6))
+        float_idx = Index(np.arange(0, 0.6, 0.1))
+        obj_idx = Index('A B C D E F'.split())
+        dt_idx = pd.date_range('2013-01-01', freq='M', periods=6)
+
+        for idx in [int_idx, float_idx, obj_idx, dt_idx]:
+            to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1])
+            self.assertEqual(idx.groupby(to_groupby),
+                             {1.0: [idx[0], idx[5]], 2.0: [idx[1], idx[4]]})
+
+            to_groupby = Index([datetime(2011, 11, 1), datetime(2011, 12, 1),
+                                pd.NaT, pd.NaT,
+                                datetime(2011, 12, 1), datetime(2011, 11, 1)], tz='UTC').values
+
+            ex_keys = pd.tslib.datetime_to_datetime64(np.array([Timestamp('2011-11-01'), Timestamp('2011-12-01')]))
+            expected = {ex_keys[0][0]: [idx[0], idx[5]], ex_keys[0][1]: [idx[1], idx[4]]}
+            self.assertEqual(idx.groupby(to_groupby), expected)
+
 
 class TestFloat64Index(Numeric, tm.TestCase):
     _holder = Float64Index
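
A minimal usage sketch of the behaviour documented in the doc/source/groupby.rst change above and exercised by the new test_groupby_nat_exclude test, assuming a pandas build with this fix applied (the frame and column names 'values' and 'dt' are illustrative, not part of the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'values': np.arange(4.0),
                       'dt': [pd.NaT, pd.Timestamp('2013-01-01'),
                              pd.NaT, pd.Timestamp('2013-02-01')]})

    grouped = df.groupby('dt')

    # NaT keys are silently excluded, so only the two real dates form groups
    print(grouped.ngroups)                                 # 2
    print(grouped.get_group(pd.Timestamp('2013-01-01')))   # row 1 only

    # there is no "NaT group" to ask for
    try:
        grouped.get_group(pd.NaT)
    except KeyError:
        print('NaT is not a group key')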