From 543524700440ff9ed7e17c6d93fc5937499db6e5 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 12 Aug 2016 22:37:13 +0900 Subject: [PATCH] CLN: Datetimelike._can_hold_na --- pandas/tests/indexes/common.py | 13 +++- pandas/tests/test_base.py | 100 ++++++++++++++++-------------- pandas/tseries/base.py | 7 +-- pandas/tseries/period.py | 9 +++ pandas/tseries/tests/test_base.py | 59 ++++++++++++++++-- 5 files changed, 128 insertions(+), 60 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 687782172693a..2c8031898c78e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -8,6 +8,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, notnull) +from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -319,13 +320,21 @@ def test_get_unique_index(self): if not ind._can_hold_na: continue - vals = ind.values[[0] * 5] - vals[0] = np.nan + if needs_i8_conversion(ind): + vals = ind.asi8[[0] * 5] + vals[0] = pd.tslib.iNaT + else: + vals = ind.values[[0] * 5] + vals[0] = np.nan + vals_unique = vals[:2] idx_nan = ind._shallow_copy(vals) idx_unique_nan = ind._shallow_copy(vals_unique) self.assertTrue(idx_unique_nan.is_unique) + self.assertEqual(idx_nan.dtype, ind.dtype) + self.assertEqual(idx_unique_nan.dtype, ind.dtype) + for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 83b1cd141a61b..eaa316bfd8157 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -9,7 +9,8 @@ import pandas as pd import pandas.compat as compat -from pandas.types.common import is_object_dtype, is_datetimetz +from pandas.types.common import (is_object_dtype, is_datetimetz, + needs_i8_conversion) import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) @@ -17,7 +18,6 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import (FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin) -from pandas.types.common import is_datetime64_dtype from pandas.tseries.base import DatetimeIndexOpsMixin @@ -450,7 +450,6 @@ def test_nanops(self): def test_value_counts_unique_nunique(self): for orig in self.objs: - o = orig.copy() klass = type(o) values = o._values @@ -504,9 +503,10 @@ def test_value_counts_unique_nunique(self): def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = orig.copy() klass = type(o) - values = o.values + values = o._values if not self._allow_na_ops(o): continue @@ -522,34 +522,43 @@ def test_value_counts_unique_nunique_null(self): o[0:2] = pd.tslib.iNaT values = o._values - elif is_datetime64_dtype(o) or isinstance(o, PeriodIndex): + elif needs_i8_conversion(o): values[0:2] = pd.tslib.iNaT + values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original + self.assertEqual(values.dtype, o.dtype) # create repeated values, 'n'th element is repeated by n+1 # times - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous + if isinstance(o, (DatetimeIndex, PeriodIndex)): + expected_index = o.copy() + expected_index.name = None - # resets name from Index - expected_index = pd.Index(o, name=None) # attach name to klass - o = klass(np.repeat(values, range(1, len(o) + 1)), - freq=o.freq, name='a') - elif isinstance(o, Index): - expected_index = pd.Index(values, name=None) - o = klass( - np.repeat(values, range(1, len(o) + 1)), name='a') + o = klass(values.repeat(range(1, len(o) + 1))) + o.name = 'a' else: - expected_index = pd.Index(values, name=None) - idx = np.repeat(o.index.values, range(1, len(o) + 1)) - o = klass( - np.repeat(values, range( - 1, len(o) + 1)), index=idx, name='a') + if is_datetimetz(o): + expected_index = orig._values._shallow_copy(values) + else: + expected_index = pd.Index(values) + expected_index.name = None + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) + # check values correctly have NaN + nanloc = np.zeros(len(o), dtype=np.bool) + nanloc[:3] = True + if isinstance(o, Index): + self.assert_numpy_array_equal(pd.isnull(o), nanloc) + else: + exp = pd.Series(nanloc, o.index, name='a') + self.assert_series_equal(pd.isnull(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], @@ -578,7 +587,9 @@ def test_value_counts_unique_nunique_null(self): self.assertIs(result[0], pd.NaT) else: tm.assert_numpy_array_equal(result[1:], values[2:]) + self.assertTrue(pd.isnull(result[0])) + self.assertEqual(result.dtype, orig.dtype) self.assertEqual(o.nunique(), 8) self.assertEqual(o.nunique(dropna=False), 9) @@ -942,18 +953,14 @@ def test_fillna(self): # # GH 11343 # though Index.fillna and Series.fillna has separate impl, # test here to confirm these works as the same - def get_fill_value(obj): - if isinstance(obj, pd.tseries.base.DatetimeIndexOpsMixin): - return obj.asobject.values[0] - else: - return obj.values[0] - for o in self.objs: - klass = type(o) + for orig in self.objs: + + o = orig.copy() values = o.values # values will not be changed - result = o.fillna(get_fill_value(o)) + result = o.fillna(o.astype(object).values[0]) if isinstance(o, Index): self.assert_index_equal(o, result) else: @@ -962,33 +969,30 @@ def get_fill_value(obj): self.assertFalse(o is result) for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = orig.copy() klass = type(o) - values = o.values.copy() if not self._allow_na_ops(o): continue - # value for filling - fill_value = get_fill_value(o) + if needs_i8_conversion(o): - # special assign to the numpy array - if o.values.dtype == 'datetime64[ns]' or isinstance( - o, PeriodIndex): - values[0:2] = pd.tslib.iNaT + values = o.astype(object).values + fill_value = values[0] + values[0:2] = pd.NaT else: + values = o.values.copy() + fill_value = o.values[0] values[0:2] = null_obj - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous - expected = [fill_value.ordinal] * 2 + list(values[2:]) - expected = klass(ordinal=expected, freq=o.freq) - o = klass(ordinal=values, freq=o.freq) - else: - expected = [fill_value] * 2 + list(values[2:]) - expected = klass(expected) - o = klass(values) + expected = [fill_value] * 2 + list(values[2:]) + + expected = klass(expected) + o = klass(values) + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) result = o.fillna(fill_value) if isinstance(o, Index): diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index c08bb53238e5c..f0c6e334925c4 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -362,6 +362,8 @@ def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) + _can_hold_na = True + _na_value = tslib.NaT """The expected NA value to use with this index.""" @@ -370,11 +372,6 @@ def _isnan(self): """ return if each value is nan""" return (self.asi8 == tslib.iNaT) - @cache_readonly - def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return self._isnan.any() - @property def asobject(self): """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 8bce01b0759fc..4b50a8c0c088b 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -777,6 +777,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return Index.get_indexer(self._int64index, target, method, limit, tolerance) + def _get_unique_index(self, dropna=False): + """ + wrap Index._get_unique_index to handle NaT + """ + res = super(PeriodIndex, self)._get_unique_index(dropna=dropna) + if dropna: + res = res.dropna() + return res + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 26e77d3ad79f3..aa13591a4ff30 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -555,8 +555,8 @@ def test_nonunique_contains(self): def test_order(self): # with freq - idx1 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D', name='idx') + idx1 = DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D', name='idx') idx2 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo', name='tzidx') @@ -798,10 +798,27 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) + for tz in [None, 'US/Eastern', 'UTC']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + class TestTimedeltaIndexOps(Ops): def setUp(self): @@ -1645,10 +1662,26 @@ def test_repeat(self): tm.assert_index_equal(res, exp) self.assertIsNone(res.freq) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) + idx = pd.TimedeltaIndex(['1 days', '2 days']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.TimedeltaIndex(['1 days', 'NaT']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -2593,10 +2626,26 @@ def test_repeat(self): for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) - def test_na_value(self): + def test_nat(self): self.assertIs(pd.PeriodIndex._na_value, pd.NaT) self.assertIs(pd.PeriodIndex([], freq='M')._na_value, pd.NaT) + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + if __name__ == '__main__': import nose