diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index 2b76da1434ba3..e5222f9e33c1c 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -75,7 +75,7 @@ Enhancements - +- Add ``dropna`` argument to ``value_counts`` and ``nunique`` (:issue:`5569`). @@ -159,7 +159,7 @@ Bug Fixes - +- Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 954f18ccb69b8..1aec8561807c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -168,7 +168,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): def value_counts(values, sort=True, ascending=False, normalize=False, - bins=None): + bins=None, dropna=True): """ Compute a histogram of the counts of non-null values @@ -184,6 +184,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data + dropna : boolean, default True + Don't include counts of NaN Returns ------- @@ -202,25 +204,31 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.labels - if com.is_integer_dtype(values.dtype): + dtype = values.dtype + if com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): - dtype = values.dtype values = values.view(np.int64) keys, counts = htable.value_count_int64(values) + if dropna: + from pandas.tslib import iNaT + msk = keys != iNaT + keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in - keys = Series(keys, dtype=dtype) + keys = keys.astype(dtype) else: - mask = com.isnull(values) values = com._ensure_object(values) + mask = com.isnull(values) keys, counts = 
htable.value_count_object(values, mask) + if not dropna: + keys = np.insert(keys, 0, np.NaN) + counts = np.insert(counts, 0, mask.sum()) result = Series(counts, index=com._values_from_object(keys)) - if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.levels)), fill_value=0) diff --git a/pandas/core/base.py b/pandas/core/base.py index 6bbcc33c2271b..b43883885e962 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -245,7 +245,7 @@ def min(self): return pandas.core.nanops.nanmin(self.values) def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None): + bins=None, dropna=True): """ Returns object containing counts of unique values. The resulting object will be in descending order so that the first element is the most @@ -263,6 +263,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins : integer, optional Rather than count values, group them into half-open bins, a convenience for pd.cut, only works with numeric data + dropna : boolean, default True + Don't include counts of NaN Returns ------- @@ -270,7 +272,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, """ from pandas.core.algorithms import value_counts return value_counts(self.values, sort=sort, ascending=ascending, - normalize=normalize, bins=bins) + normalize=normalize, bins=bins, dropna=dropna) def unique(self): """ @@ -284,7 +286,7 @@ def unique(self): from pandas.core.nanops import unique1d return unique1d(self.values) - def nunique(self): + def nunique(self, dropna=True): """ Return count of unique elements in the object. Excludes NA values. 
@@ -292,7 +294,7 @@ def nunique(self): ------- nunique : int """ - return len(self.value_counts()) + return len(self.value_counts(dropna=dropna)) def factorize(self, sort=False, na_sentinel=-1): """ diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6828c1d0528ea..ec2c64242f146 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -237,6 +237,19 @@ def test_value_counts_dtypes(self): self.assertRaises(TypeError, lambda s: algos.value_counts(s, bins=1), ['1', 1]) + def test_value_counts_nat(self): + td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]') + dt = pd.to_datetime(['NaT', '2014-01-01']) + + for s in [td, dt]: + vc = algos.value_counts(s) + vc_with_na = algos.value_counts(s, dropna=False) + self.assertEqual(len(vc), 1) + self.assertEqual(len(vc_with_na), 2) + + exp_dt = pd.Series({pd.Timestamp('2014-01-01 00:00:00'): 1}) + tm.assert_series_equal(algos.value_counts(dt), exp_dt) + # TODO same for (timedelta) def test_quantile(): s = Series(np.random.randn(100)) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 4aaab3b2c52a5..6c8dd3478835f 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -292,12 +292,13 @@ def test_value_counts_unique_nunique(self): o = klass(np.repeat(values, range(1, len(o) + 1))) if isinstance(o, DatetimeIndex): - # DatetimeIndex: nan is casted to Nat and included - expected_s = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1]) + expected_s_na = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1]) + expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1]) else: - # nan is excluded - expected_s = Series(range(10, 2, -1), index=values[9:1:-1], dtype='int64') + expected_s_na = Series(list(range(10, 2, -1)) +[3], index=values[9:0:-1], dtype='int64') + expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1], dtype='int64') + tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na) 
tm.assert_series_equal(o.value_counts(), expected_s) # numpy_array_equal cannot compare arrays includes nan @@ -309,10 +310,8 @@ def test_value_counts_unique_nunique(self): else: self.assertTrue(pd.isnull(result[0])) - if isinstance(o, DatetimeIndex): - self.assertEqual(o.nunique(), 9) - else: - self.assertEqual(o.nunique(), 8) + self.assertEqual(o.nunique(), 8) + self.assertEqual(o.nunique(dropna=False), 9) def test_value_counts_inferred(self): klasses = [Index, Series] @@ -406,6 +405,9 @@ def test_value_counts_inferred(self): result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') + tm.assert_series_equal(result, expected_s) + + result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) @@ -415,7 +417,8 @@ def test_value_counts_inferred(self): self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) - self.assertEqual(s.nunique(), 4) + self.assertEqual(s.nunique(), 3) + self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 04210b4f0c88f..ddd6c26748d3e 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -106,16 +106,19 @@ def test_index_unique(self): self.assertEqual(result.name, 'foo') self.assertTrue(result.equals(expected)) - # NaT + # NaT, note this is excluded arr = [ 1370745748 + t for t in range(20) ] + [iNaT] idx = DatetimeIndex(arr * 3) self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) - self.assertEqual(idx.nunique(), 21) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT] idx = DatetimeIndex(arr * 3) self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) - 
self.assertEqual(idx.nunique(), 21) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + def test_index_dupes_contains(self): d = datetime(2011, 12, 5, 20, 30)