Skip to content

BUG: preserve freq in DTI/TDI factorize #33836

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
16 changes: 13 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,16 @@ def _reconstruct_data(values, dtype, original):
-------
Index for extension types, otherwise ndarray casted to dtype
"""
if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
# Catch DatetimeArray/TimedeltaArray
return values

if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
cls = dtype.construct_array_type()
if isinstance(values, cls) and values.dtype == dtype:
return values

values = cls._from_sequence(values)
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)

Expand Down Expand Up @@ -613,9 +621,11 @@ def factorize(

values = _ensure_arraylike(values)
original = values
if not isinstance(values, ABCMultiIndex):
values = extract_array(values, extract_numpy=True)

if is_extension_array_dtype(values.dtype):
values = extract_array(values)
if isinstance(values, ABCExtensionArray):
# Includes DatetimeArray, TimedeltaArray
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,13 @@ def _with_freq(self, freq):
arr._freq = freq
return arr

def factorize(self, na_sentinel=-1):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't want to expose factorize on Index at all.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is already there in the status quo. If you want to remove it, that needs a deprecation cycle, is a separate issue

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/Users/jreback/pandas
bash-3.2$ grep -r factorize pandas/core/indexes/
pandas/core/indexes//multi.py:from pandas.core.arrays.categorical import factorize_from_iterables
pandas/core/indexes//multi.py:    indexer_from_factorized,
pandas/core/indexes//multi.py:        codes, levels = factorize_from_iterables(arrays)
pandas/core/indexes//multi.py:        codes, levels = factorize_from_iterables(iterables)
pandas/core/indexes//multi.py:        codes, uniques = algos.factorize(indexer, sort=True)
pandas/core/indexes//multi.py:            ok_codes, uniques = algos.factorize(indexer[mask], sort=True)
pandas/core/indexes//multi.py:            indexer = indexer_from_factorized(primary, primshp, compress=False)
Binary file pandas/core/indexes//__pycache__/multi.cpython-38.pyc matches

pls show where

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm i guess its in the base class.

In any event. I don't think we actually want to change this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment in existing tests seems to think this is already the behavior: https://github.com/pandas-dev/pandas/pull/33836/files#diff-8cf55ac38b6988b09a7f7f5d7280eb0fL360

Also should improve perf since this short-circuits the expensive part of factorize

if self.freq is not None:
# We must be unique, so can short-circuit (and retain freq)
codes = np.arange(len(self), dtype=np.intp)
return codes, self.copy()
return ExtensionArray.factorize(self, na_sentinel=na_sentinel)


DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin")

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def f(self):
return property(f)


class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps):
class DatetimeArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps):
"""
Pandas ExtensionArray for tz-naive or tz-aware datetime data.

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def f(self):
return property(f)


class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps):
class TimedeltaArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin):
"""
Pandas ExtensionArray for timedelta data.

Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,15 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default):
result._cache = cache
return result

def factorize(self, sort=False, na_sentinel=-1):
if self.freq is not None and sort is False:
# we are unique, so can short-circuit, also can preserve freq
codes = np.arange(len(self), dtype=np.intp)
return codes, self.copy()
# TODO: In the sort=True case we could check for montonic_decreasing
# and operate on self[::-1]
return super().factorize(sort=sort, na_sentinel=na_sentinel)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be duplicating the array version, can't that be reused? (directly, or by having pd.factorize call the array version)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you check this comment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its close; the main difference is that the Index version returns an Index for uniques

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But that is handled in factorize (depending on the input, it will wrap the uniques in an Index or not)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im not clear on what youre suggesting? that we dont need to override this here at all?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you might need to update factorize to preserve freq (it's using _shallow_copy right now)

(or we could decide that factorize is not an operation that should preserve the freq)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you might need to update factorize to preserve freq (it's using _shallow_copy right now)

If we're getting into Index-subclass-specific logic, I think that belongs on the Index subclass.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(or we could decide that factorize is not an operation that should preserve the freq)

I'd be OK with this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're getting into Index-subclass-specific logic, I think that belongs on the Index subclass.

Not necessarily, I would say, as it already has index-specific handling as well.
But there is no "shallow_copy"-like method that preserves attributes like freq ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But there is no "shallow_copy"-like method that preserves attributes like freq ?

There used to be, but it was being used in places where freq shouldnt be retained, so was changed as it was a footgun.


# --------------------------------------------------------------------
# Set Operation Methods

Expand Down
51 changes: 35 additions & 16 deletions pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,12 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# tz must be preserved
idx1 = idx1.tz_localize("Asia/Tokyo")
Expand All @@ -340,6 +342,7 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

idx2 = pd.DatetimeIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"]
Expand All @@ -350,49 +353,65 @@ def test_factorize(self):
arr, idx = idx2.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"])
arr, idx = idx2.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
# GH#33836 freq should be preserved
idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)

arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

arr, idx = pd.factorize(idx3)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

def test_factorize_tz(self, tz_naive_fixture):
def test_factorize_tz(self, tz_naive_fixture, index_or_series):
tz = tz_naive_fixture
# GH#13750
base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz)
idx = base.repeat(5)

exp_arr = np.arange(100, dtype=np.intp).repeat(5)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
obj = index_or_series(idx)

arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
assert res.freq == expected.freq

def test_factorize_dst(self):
def test_factorize_dst(self, index_or_series):
# GH 13750
idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

@pytest.mark.parametrize(
"arr, expected",
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/indexes/timedeltas/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,26 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
# GH#33836 freq should be preserved
idx3 = timedelta_range("1 day", periods=4, freq="s")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

arr, idx = pd.factorize(idx3)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

def test_sort_values(self):

Expand Down