From cf1f181b8dcd5f4a7f2989deac9f928c91e43520 Mon Sep 17 00:00:00 2001 From: ajcr Date: Sun, 2 Aug 2015 12:12:41 +0100 Subject: [PATCH] BUG: pd.unique should respect datetime64 and timedelta64 dtypes (GH9431) --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/algorithms.py | 10 ++++++-- pandas/tests/test_algos.py | 44 +++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9a9054fcf0489..16c6c639a489e 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -606,3 +606,4 @@ Bug Fixes - Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`) - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`) +- Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead of the original dtype (:issue:`9431`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c958a70b43089..b0c7ff43bc7d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -36,7 +36,7 @@ def match(to_match, values, na_sentinel=-1): values = np.array(values, dtype='O') f = lambda htype, caster: _match_generic(to_match, values, htype, caster) - result = _hashtable_algo(f, values.dtype) + result = _hashtable_algo(f, values.dtype, np.int64) if na_sentinel != -1: @@ -66,7 +66,7 @@ def unique(values): return _hashtable_algo(f, values.dtype) -def _hashtable_algo(f, dtype): +def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ @@ -74,6 +74,12 @@ def _hashtable_algo(f, dtype): return f(htable.Float64HashTable, com._ensure_float64) elif com.is_integer_dtype(dtype): return f(htable.Int64HashTable, com._ensure_int64) + elif com.is_datetime64_dtype(dtype): + return_dtype = return_dtype or 'M8[ns]' + 
return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) + elif com.is_timedelta64_dtype(dtype): + return_dtype = return_dtype or 'm8[ns]' + return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) else: return f(htable.PyObjectHashTable, com._ensure_object) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 30dcd8631f13a..cf72f0e433634 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -235,6 +235,50 @@ def test_on_index_object(self): tm.assert_almost_equal(result, expected) + def test_datetime64_dtype_array_returned(self): + # GH 9431 + expected = np.array(['2015-01-03T00:00:00.000000000+0000', + '2015-01-01T00:00:00.000000000+0000'], dtype='M8[ns]') + + dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000', + '2015-01-01T00:00:00.000000000+0000', + '2015-01-01T00:00:00.000000000+0000']) + result = algos.unique(dt_index) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + s = pd.Series(dt_index) + result = algos.unique(s) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + arr = s.values + result = algos.unique(arr) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + + def test_timedelta64_dtype_array_returned(self): + # GH 9431 + expected = np.array([31200, 45678, 10000], dtype='m8[ns]') + + td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) + result = algos.unique(td_index) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + s = pd.Series(td_index) + result = algos.unique(s) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + arr = s.values + result = algos.unique(arr) + tm.assert_numpy_array_equal(result, expected) + self.assertEqual(result.dtype, expected.dtype) + + + class TestValueCounts(tm.TestCase): 
_multiprocess_can_split_ = True