diff --git a/pandas/core/base.py b/pandas/core/base.py
index beffbfb2923db..b3fed959a8522 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -288,21 +288,31 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
                                             tz=getattr(self, 'tz', None))
         return result

-    def unique(self):
+    def unique(self, dropna=False):
         """
         Return array of unique values in the object. Significantly faster than
-        numpy.unique. Includes NA values.
+        numpy.unique. Includes NA values unless ``dropna`` is True.

+        Parameters
+        ----------
+        dropna : boolean, default False
+            If True, call ``self.dropna()`` first so NA values are excluded.
+
         Returns
         -------
         uniques : ndarray
         """
-        from pandas.core.nanops import unique1d
+        if dropna:
+            # strip NA values, then reuse the default (NA-keeping) path
+            return self.dropna().unique()
+
         values = self.values
-        if hasattr(values,'unique'):
+        if hasattr(values, 'unique'):
+            # Categorical Series not supported by unique1d
             return values.unique()

+        from pandas.core.nanops import unique1d
         return unique1d(values)

     def nunique(self, dropna=True):
         """
@@ -319,7 +329,7 @@ def nunique(self, dropna=True):
         -------
         nunique : int
         """
-        return len(self.value_counts(dropna=dropna))
+        return len(self.unique(dropna=dropna))

     def factorize(self, sort=False, na_sentinel=-1):
         """
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index f9ed6c2fecc3c..9db21cdc03afd 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -869,7 +869,7 @@ def mode(self):
                              fastpath=True)
         return result

-    def unique(self):
+    def unique(self, **kwargs):
         """
         Return the unique values.

diff --git a/pandas/core/index.py b/pandas/core/index.py
index 263e6db8c486a..e1593eb2b75b2 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -215,6 +215,16 @@ def is_(self, other):
         # use something other than None to be clearer
         return self._id is getattr(other, '_id', Ellipsis)

+    def dropna(self):
+        """
+        Return the entries of this Index that ``isnull`` does not flag.
+
+        Returns
+        -------
+        dropped : Index
+        """
+        return self[~isnull(self.values)]
+
     def _reset_identity(self):
         """Initializes or resets ``_id`` attribute with new object"""
         self._id = _Identity()
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 8b1f6ce3e7f45..5e3a0201236fd 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -907,6 +907,13 @@ def test_nan_first_take_datetime(self):
         exp = Index([idx[-1], idx[0], idx[1]])
         tm.assert_index_equal(res, exp)

+    def test_dropna(self):
+        idx = Index([np.nan, 'a', np.nan, np.nan, 'b', 'c', np.nan],
+                    name='idx')
+        expected = Index(['a', 'b', 'c'], name='idx')
+        result = idx.dropna()
+        tm.assert_index_equal(result, expected)
+

 class TestFloat64Index(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -1051,6 +1058,12 @@ def test_astype_from_object(self):
         tm.assert_equal(result.dtype, expected.dtype)
         tm.assert_index_equal(result, expected)

+    def test_dropna(self):
+        idx = Float64Index([np.nan, 1.0, np.nan, np.nan, 2.0, 3.0, np.nan])
+        expected = Float64Index([1.0, 2.0, 3.0])
+        result = idx.dropna()
+        tm.assert_index_equal(result, expected)
+

 class TestInt64Index(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -1476,6 +1489,12 @@ def test_slice_keep_name(self):
         idx = Int64Index([1, 2], name='asdf')
         self.assertEqual(idx.name, idx[1:].name)

+    def test_dropna_does_nothing(self):
+        idx = Int64Index([1, 2, 3], name='idx')
+        expected = Int64Index([1, 2, 3], name='idx')
+        result = idx.dropna()
+        tm.assert_index_equal(result, expected)
+

 class TestMultiIndex(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -2948,6 +2967,12 @@ def test_level_setting_resets_attributes(self):
         # if this fails, probably didn't reset the cache correctly.
         assert not ind.is_monotonic

+    def test_dropna_does_nothing(self):
+        idx = MultiIndex.from_tuples([('bar', 'two')])
+        expected = idx
+        result = idx.dropna()
+        tm.assert_index_equal(result, expected)
+

 def test_get_combined_index():
     from pandas.core.index import _get_combined_index
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 518bb4180ec89..cf586d609b513 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -852,6 +852,23 @@ def take(self, indices, axis=0):
             return self[maybe_slice]
         return super(DatetimeIndex, self).take(indices, axis)

+    def unique(self, dropna=False):
+        """
+        Unique values as a DatetimeIndex carrying this index's tz and name
+
+        Parameters
+        ----------
+        dropna : boolean, default False
+            If True, exclude nulls; forwarded to Int64Index.unique.
+
+        Returns
+        -------
+        result : DatetimeIndex
+        """
+        result = Int64Index.unique(self, dropna=dropna)
+        return DatetimeIndex._simple_new(result, tz=self.tz,
+                                         name=self.name)
+
     def union(self, other):
         """
         Specialized union for DatetimeIndex objects. If combine
diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py
index 1659340cfe050..88f47e9515a63 100644
--- a/vb_suite/series_methods.py
+++ b/vb_suite/series_methods.py
@@ -27,3 +27,11 @@
                               's2.nsmallest(3, take_last=False)',
                               setup,
                               start_date=datetime(2014, 1, 25))
+
+series_nunique1 = Benchmark('s1.nunique()',
+                            setup,
+                            start_date=datetime(2014, 1, 25))
+
+series_nunique2 = Benchmark('s2.nunique()',
+                            setup,
+                            start_date=datetime(2014, 1, 25))
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index be9aa03801641..ff6ef904a1d81 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -23,6 +23,7 @@
            'plotting',
            'reindex',
            'replace',
+           'series_methods',
            'sparse',
            'strings',
            'reshape',