Skip to content

Performance improvements for nunique method. #7784

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,21 +288,29 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
tz=getattr(self, 'tz', None))
return result

def unique(self):
def unique(self, dropna=False):
"""
Return array of unique values in the object. Significantly faster than
numpy.unique. Includes NA values.

Parameters
----------
dropna : boolean, default False
Don't include NaN in the result.

Returns
-------
uniques : ndarray
"""
from pandas.core.nanops import unique1d
values = self.values
if hasattr(values,'unique'):
return values.unique()

return unique1d(values)
if dropna:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I bet this hits the AttributeError every time: `values` is a NumPy array here, so you have to call `self.dropna()` instead.

return self.dropna().unique()
else:
if hasattr(self.values, 'unique'):
# Categorical Series not supported by unique1d
return self.values.unique()
else:
from pandas.core.nanops import unique1d
return unique1d(self.values)

def nunique(self, dropna=True):
"""
Expand All @@ -319,7 +327,7 @@ def nunique(self, dropna=True):
-------
nunique : int
"""
return len(self.value_counts(dropna=dropna))
return len(self.unique(dropna=dropna))

def factorize(self, sort=False, na_sentinel=-1):
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@ def mode(self):
fastpath=True)
return result

def unique(self):
def unique(self, **kwargs):
"""
Return the unique values.

Expand Down
10 changes: 10 additions & 0 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,16 @@ def is_(self, other):
# use something other than None to be clearer
return self._id is getattr(other, '_id', Ellipsis)

def dropna(self):
"""
Return Index without null values

Returns
-------
dropped : Index
"""
return self[~isnull(self.values)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should just be `self[~isnull(self)]`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests fail with that change:

raise NotImplementedError("isnull is not defined for MultiIndex")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, OK.

We're going to need a `dropna` test for each index type (except Int64, of course).


def _reset_identity(self):
"""Initializes or resets ``_id`` attribute with new object"""
self._id = _Identity()
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,13 @@ def test_nan_first_take_datetime(self):
exp = Index([idx[-1], idx[0], idx[1]])
tm.assert_index_equal(res, exp)

def test_dropna(self):
idx = Index([np.nan, 'a', np.nan, np.nan, 'b', 'c', np.nan],
name='idx')
expected = Index(['a', 'b', 'c'], name='idx')
result = idx.dropna()
tm.assert_index_equal(result, expected)


class TestFloat64Index(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -1051,6 +1058,12 @@ def test_astype_from_object(self):
tm.assert_equal(result.dtype, expected.dtype)
tm.assert_index_equal(result, expected)

def test_dropna(self):
idx = Float64Index([np.nan, 1.0, np.nan, np.nan, 2.0, 3.0, np.nan])
expected = Float64Index([1.0, 2.0, 3.0])
result = idx.dropna()
tm.assert_index_equal(result, expected)


class TestInt64Index(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -1476,6 +1489,12 @@ def test_slice_keep_name(self):
idx = Int64Index([1, 2], name='asdf')
self.assertEqual(idx.name, idx[1:].name)

def test_dropna_does_nothing(self):
idx = Int64Index([1, 2, 3], name='idx')
expected = Int64Index([1, 2, 3], name='idx')
result = idx.dropna()
tm.assert_index_equal(result, expected)


class TestMultiIndex(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -2948,6 +2967,12 @@ def test_level_setting_resets_attributes(self):
# if this fails, probably didn't reset the cache correctly.
assert not ind.is_monotonic

def test_dropna_does_nothing(self):
idx = MultiIndex.from_tuples([('bar', 'two')])
expected = idx
result = idx.dropna()
tm.assert_index_equal(result, expected)


def test_get_combined_index():
from pandas.core.index import _get_combined_index
Expand Down
17 changes: 17 additions & 0 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,23 @@ def take(self, indices, axis=0):
return self[maybe_slice]
return super(DatetimeIndex, self).take(indices, axis)

def unique(self, dropna=False):
"""
Index.unique with handling for DatetimeIndex metadata

Parameters
----------
dropna : boolean, default False
Don't include NaN in the result.

Returns
-------
result : DatetimeIndex
"""
result = Int64Index.unique(self, dropna=dropna)
return DatetimeIndex._simple_new(result, tz=self.tz,
name=self.name)

def union(self, other):
"""
Specialized union for DatetimeIndex objects. If combine
Expand Down
8 changes: 8 additions & 0 deletions vb_suite/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@
's2.nsmallest(3, take_last=False)',
setup,
start_date=datetime(2014, 1, 25))

series_nunique1 = Benchmark('s1.nunique()',
setup,
start_date=datetime(2014, 1, 25))

series_nunique2 = Benchmark('s2.nunique()',
setup,
start_date=datetime(2014, 1, 25))
1 change: 1 addition & 0 deletions vb_suite/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
'plotting',
'reindex',
'replace',
'series_methods',
'sparse',
'strings',
'reshape',
Expand Down