Skip to content

Implement unique+array parts of 24024 #24527

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,31 @@ is the case with :attr:`Period.end_time`, for example

p.end_time

.. _whatsnew_0240.api_breaking.datetime_unique:

The return type of :meth:`Series.unique` for datetime with timezone values has changed
from a :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays.DatetimeArray` (:issue:`24024`).

.. ipython:: python

ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
pd.Timestamp('2000', tz='UTC')])

*Previous Behavior*:

.. code-block:: ipython

In [3]: ser.unique()
Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object)


*New Behavior*:

.. ipython:: python

ser.unique()


.. _whatsnew_0240.api_breaking.sparse_values:

Sparse Data Structure Refactor
Expand Down
6 changes: 5 additions & 1 deletion pandas/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
"""
from pandas.core.arrays import (
IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray,
PandasArray
PandasArray,
DatetimeArrayMixin as DatetimeArray,
TimedeltaArrayMixin as TimedeltaArray,
)


__all__ = [
'Categorical',
'DatetimeArray',
'IntegerArray',
'IntervalArray',
'PandasArray',
'PeriodArray',
'SparseArray',
'TimedeltaArray',
]
11 changes: 3 additions & 8 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,21 +350,16 @@ def unique(values):
if is_extension_array_dtype(values):
# Dispatch to extension dtype's unique.
return values.unique()
elif is_datetime64tz_dtype(values):
# TODO: merge this check into the previous one following #24024
return values.unique()

original = values
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, dtype, original)

if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
# we are special casing datetime64tz_dtype
# to return an object array of tz-aware Timestamps

# TODO: it must return DatetimeArray with tz in pandas 2.0
uniques = uniques.astype(object).values

return uniques


Expand Down
20 changes: 14 additions & 6 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype,
is_extension_type, is_list_like, is_object_dtype, is_scalar)
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
is_scalar, is_timedelta64_ns_dtype)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -849,12 +850,19 @@ def array(self):
"""
result = self._values

# TODO(DatetimeArray): remove the second clause.
if (not is_extension_array_dtype(result.dtype)
and not is_datetime64tz_dtype(result.dtype)):
from pandas.core.arrays.numpy_ import PandasArray
if (is_datetime64_ns_dtype(result.dtype) or
is_datetime64tz_dtype(result.dtype)):
from pandas.arrays import DatetimeArray
result = DatetimeArray(result)

elif is_timedelta64_ns_dtype(result.dtype):
from pandas.arrays import TimedeltaArray
result = TimedeltaArray(result)

elif not is_extension_array_dtype(result.dtype):
from pandas.core.arrays.numpy_ import PandasArray
result = PandasArray(result)

return result

def to_numpy(self, dtype=None, copy=False):
Expand Down
39 changes: 22 additions & 17 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@

from pandas.core.dtypes.common import (
_is_unorderable_exception, ensure_platform_int, is_bool,
is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type,
is_hashable, is_integer, is_iterator, is_list_like, is_scalar,
is_string_like, is_timedelta64_dtype)
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
from pandas.core.dtypes.missing import (
Expand Down Expand Up @@ -1556,9 +1555,18 @@ def unique(self):

Returns
-------
ndarray or Categorical
The unique values returned as a NumPy array. In case of categorical
data type, returned as a Categorical.
ndarray or ExtensionArray
The unique values returned as a NumPy array. In case of an
extension-array backed Series, a new
:class:`~api.extensions.ExtensionArray` of that type with just
the unique values is returned. This includes

* Categorical
* Period
* Datetime with Timezone
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IntegerNA ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will edit. note otherwise this is verbatim from 24024

* Interval
* Sparse
* IntegerNA

See Also
--------
Expand All @@ -1575,8 +1583,9 @@ def unique(self):

>>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
... for _ in range(3)]).unique()
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
dtype=object)
<DatetimeArrayMixin>
['2016-01-01 00:00:00-05:00']
Length: 1, dtype: datetime64[ns, US/Eastern]

An unordered Categorical will return categories in the order of
appearance.
Expand All @@ -1593,14 +1602,10 @@ def unique(self):
Categories (3, object): [a < b < c]
"""
result = super(Series, self).unique()

if is_datetime64tz_dtype(self.dtype):
# we are special casing datetime64tz_dtype
# to return an object array of tz-aware Timestamps

# TODO: it must return DatetimeArray with tz in pandas 2.0
result = result.astype(object).values

if isinstance(result, DatetimeIndex):
# TODO: This should be unnecessary after Series._values returns
# DatetimeArray
result = result._eadata
return result

def drop_duplicates(self, keep='first', inplace=False):
Expand Down
15 changes: 9 additions & 6 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas._libs import (groupby as libgroupby, algos as libalgos,
hashtable as ht)
from pandas.compat import lrange, range
from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray
import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.util.testing as tm
Expand Down Expand Up @@ -456,9 +457,10 @@ def test_datetime64tz_aware(self):
result = Series(
Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])).unique()
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
expected = DatetimeArray._from_sequence(np.array([
Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
]))
tm.assert_extension_array_equal(result, expected)

result = Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]).unique()
Expand All @@ -469,9 +471,10 @@ def test_datetime64tz_aware(self):
result = pd.unique(
Series(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])))
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
expected = DatetimeArray._from_sequence(np.array([
Timestamp('2016-01-01', tz="US/Eastern"),
]))
tm.assert_extension_array_equal(result, expected)

result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]))
Expand Down
31 changes: 22 additions & 9 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
import pandas.compat as compat
from pandas.core.dtypes.common import (
is_object_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
needs_i8_conversion)
needs_i8_conversion, is_timedelta64_dtype)
import pandas.util.testing as tm
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
PeriodIndex, Timedelta, IntervalIndex, Interval,
CategoricalIndex, Timestamp, DataFrame, Panel)
from pandas.core.arrays import (
DatetimeArrayMixin as DatetimeArray,
TimedeltaArrayMixin as TimedeltaArray,
)
from pandas.compat import StringIO, PYPY, long
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.arrays import PandasArray
Expand Down Expand Up @@ -383,8 +387,12 @@ def test_value_counts_unique_nunique(self):
assert result[0] == orig[0]
for r in result:
assert isinstance(r, Timestamp)
tm.assert_numpy_array_equal(result,
orig._values.astype(object).values)

# TODO(#24024) once orig._values returns DTA, remove
# the `._eadata` below
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in future, can make this a proper form test (e.g. result =, expected = )

tm.assert_numpy_array_equal(
result.astype(object),
orig._values._eadata.astype(object))
else:
tm.assert_numpy_array_equal(result, orig.values)

Expand All @@ -410,7 +418,9 @@ def test_value_counts_unique_nunique_null(self):
else:
o = o.copy()
o[0:2] = iNaT
values = o._values
# TODO(#24024) once Series._values returns DTA, remove
# the `._eadata` here
values = o._values._eadata

elif needs_i8_conversion(o):
values[0:2] = iNaT
Expand All @@ -431,7 +441,7 @@ def test_value_counts_unique_nunique_null(self):
o = klass(values.repeat(range(1, len(o) + 1)))
o.name = 'a'
else:
if is_datetime64tz_dtype(o):
if isinstance(o, DatetimeIndex):
expected_index = orig._values._shallow_copy(values)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these are super-hacks, in future, let's try to remove

else:
expected_index = Index(values)
Expand Down Expand Up @@ -472,8 +482,7 @@ def test_value_counts_unique_nunique_null(self):
Index(values[1:], name='a'))
elif is_datetime64tz_dtype(o):
# unable to compare NaT / nan
vals = values[2:].astype(object).values
tm.assert_numpy_array_equal(result[1:], vals)
tm.assert_extension_array_equal(result[1:], values[2:])
assert result[0] is pd.NaT
else:
tm.assert_numpy_array_equal(result[1:], values[2:])
Expand Down Expand Up @@ -1187,7 +1196,6 @@ def test_ndarray_values(array, expected):

@pytest.mark.parametrize("arr", [
np.array([1, 2, 3]),
np.array([1, 2, 3], dtype="datetime64[ns]"),
])
def test_numpy_array(arr):
ser = pd.Series(arr)
Expand All @@ -1199,7 +1207,12 @@ def test_numpy_array(arr):
def test_numpy_array_all_dtypes(any_numpy_dtype):
ser = pd.Series(dtype=any_numpy_dtype)
result = ser.array
assert isinstance(result, PandasArray)
if is_datetime64_dtype(any_numpy_dtype):
assert isinstance(result, DatetimeArray)
elif is_timedelta64_dtype(any_numpy_dtype):
assert isinstance(result, TimedeltaArray)
else:
assert isinstance(result, PandasArray)


@pytest.mark.parametrize("array, attr", [
Expand Down