Skip to content

BUG: Preserve categorical dtypes when melting (#15853) #23671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ Other Enhancements
- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)
- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`)
- :meth:`Index.tile`, :meth:`Series.tile`, and :meth:`Categorical.tile` were introduced, parallel to the repeat methods, to ease categorical melting (:issue:`15853`)
- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`)
- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to
reflect changes from the `Pandas-GBQ library version 0.6.0
Expand Down Expand Up @@ -1357,6 +1358,7 @@ Reshaping
- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
- Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have :class:`MultiIndex` columns (:issue:`23033`).
- Bug in :meth:`DataFrame.melt` causing loss of categorical status when melting with categorical id_vars columns (:issue:`15853`).

.. _whatsnew_0240.bug_fixes.sparse:

Expand Down
4 changes: 4 additions & 0 deletions pandas/compat/numpy/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
method='both', max_fname_arg_count=1)

# numpy.tile(A, reps) takes no arguments besides the array and ``reps``
# (unlike numpy.repeat it has no ``axis``), so there are no numpy-compat
# defaults to accept: any extra positional or keyword argument is rejected.
TILE_DEFAULTS = {}
validate_tile = CompatValidator(TILE_DEFAULTS, fname='tile',
                                method='both', max_fname_arg_count=1)

ROUND_DEFAULTS = dict(out=None)
validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
method='both', max_fname_arg_count=1)
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2325,12 +2325,25 @@ def repeat(self, repeats, *args, **kwargs):
See also
--------
numpy.ndarray.repeat

Categorical.tile
"""
nv.validate_repeat(args, kwargs)
codes = self._codes.repeat(repeats)
return self._constructor(values=codes, dtype=self.dtype, fastpath=True)

def tile(self, reps, *args, **kwargs):
    """
    Tile elements of a Categorical.

    The underlying codes are tiled with ``numpy.tile`` and wrapped in a
    new Categorical that reuses this Categorical's dtype.

    Parameters
    ----------
    reps
        Number of repetitions, forwarded unchanged to ``numpy.tile``.
    *args, **kwargs
        Only checked by ``nv.validate_tile``; otherwise unused.

    Returns
    -------
    Categorical
        New Categorical built from the tiled codes.

    See also
    --------
    numpy.tile
    Categorical.repeat
    """
    nv.validate_tile(args, kwargs)
    return self._constructor(values=np.tile(self._codes, reps),
                             dtype=self.dtype, fastpath=True)

# Implement the ExtensionArray interface
@property
def _can_hold_na(self):
Expand Down
44 changes: 43 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
def _shallow_copy(self, values=None, **kwargs):
if values is None:
values = self.values

attributes = self._get_attributes_dict()
attributes.update(kwargs)
if not len(values) and 'dtype' not in kwargs:
Expand All @@ -557,7 +558,6 @@ def _shallow_copy(self, values=None, **kwargs):
# `self.values` returns `self` for tz-aware, so we need to unwrap
# more specifically
values = values.asi8

return self._simple_new(values, **attributes)

def _shallow_copy_with_infer(self, values, **kwargs):
Expand Down Expand Up @@ -822,6 +822,7 @@ def repeat(self, repeats, *args, **kwargs):
--------
Series.repeat : Equivalent function for Series
numpy.repeat : Underlying implementation
Index.tile : repeat the entire index as a group, not by element

Examples
--------
Expand All @@ -836,6 +837,47 @@ def repeat(self, repeats, *args, **kwargs):
nv.validate_repeat(args, kwargs)
return self._shallow_copy(self._values.repeat(repeats))

def tile(self, reps, *args, **kwargs):
    """
    Tile elements of an Index.

    Returns a new index constructed by repeating the current index
    the number of times given by reps.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    reps : int
        The number of repetitions of the element groups.
    *args, **kwargs
        Additional arguments have no effect; they are only checked by
        the numpy compatibility validator.

    Returns
    -------
    pandas.Index
        Newly created Index with tiled elements.

    See Also
    --------
    Series.tile : Equivalent function for Series
    numpy.tile : Underlying implementation
    Index.repeat : repeat the index element by element, not as a group

    Examples
    --------
    >>> idx = pd.Index([1, 2, 3])
    >>> idx
    Int64Index([1, 2, 3], dtype='int64')
    >>> idx.tile(2)
    Int64Index([1, 2, 3, 1, 2, 3], dtype='int64')
    >>> idx.tile(3)
    Int64Index([1, 2, 3, 1, 2, 3, 1, 2, 3], dtype='int64')
    """
    nv.validate_tile(args, kwargs)
    # No ``[:]`` on the values: slicing would only create a view, and
    # the tiled result handed to _shallow_copy is built by np.tile.
    return self._shallow_copy(np.tile(self._values, reps))

_index_shared_docs['where'] = """
.. versionadded:: 0.19.0

Expand Down
12 changes: 12 additions & 0 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,18 @@ def repeat(self, repeats, *args, **kwargs):
return self._shallow_copy(self.asi8.repeat(repeats),
freq=freq)

def tile(self, reps, *args, **kwargs):
    """
    Analogous to numpy.tile

    The ``asi8`` values are tiled and wrapped in a shallow copy of this
    index. ``freq`` is carried over only for period dtype; for every
    other dtype the result is built with ``freq=None``.
    """
    nv.validate_tile(args, kwargs)
    tiled = np.tile(self.asi8, reps)
    new_freq = self.freq if is_period_dtype(self) else None
    return self._shallow_copy(tiled, freq=new_freq)

@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other, to_utc=True)
Expand Down
26 changes: 25 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,11 +989,12 @@ def _set_values(self, key, value):

def repeat(self, repeats, *args, **kwargs):
"""
Repeat elements of an Series. Refer to `numpy.ndarray.repeat`
Repeat elements of a Series. Refer to `numpy.ndarray.repeat`
for more information about the `repeats` argument.

See also
--------
pd.Series.tile
numpy.ndarray.repeat
"""
nv.validate_repeat(args, kwargs)
Expand All @@ -1002,6 +1003,29 @@ def repeat(self, repeats, *args, **kwargs):
return self._constructor(new_values,
index=new_index).__finalize__(self)

def tile(self, reps, *args, **kwargs):
    """
    Tile elements of a Series.

    Refer to `numpy.tile` for more information about the `reps`
    argument, although note that we do not support multidimensional
    tiling of Series.

    Parameters
    ----------
    reps
        Number of repetitions; applied to both the values and the index.
    *args, **kwargs
        Only checked by ``nv.validate_tile``; otherwise unused.

    Returns
    -------
    Series
        New Series with tiled values and a correspondingly tiled index.

    See also
    --------
    Series.repeat
    numpy.tile
    """
    nv.validate_tile(args, kwargs)
    new_index = self.index.tile(reps)
    if is_categorical_dtype(self.dtype):
        # Delegate to Categorical.tile, which tiles the codes and keeps
        # the dtype, rather than rebuilding the array via from_codes.
        new_values = self._values.tile(reps)
    else:
        new_values = np.tile(self._values, reps)

    return self._constructor(new_values,
                             index=new_index).__finalize__(self)

def get_value(self, label, takeable=False):
"""Quickly retrieve single value at passed index label

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,14 @@ def test_numpy_repeat(self):
with pytest.raises(ValueError, match=msg):
np.repeat(cat, 2, axis=1)

def test_tile(self):
    # GH15853
    # tiling must keep the (non-lexical) category order and ordered flag
    categories = ["b", "a"]
    cat = Categorical(["a", "b"], categories=categories, ordered=True)
    result = cat.tile(2)
    expected = Categorical(["a", "b"] * 2, categories=categories,
                           ordered=True)
    tm.assert_categorical_equal(result, expected)

def test_isna(self):
exp = np.array([False, False, True])
c = Categorical(["a", "b", np.nan])
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2485,6 +2485,26 @@ def test_repeat(self):
result = index.repeat(repeats)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("index", [
    pd.Index([1, 2, 3]),
    pd.Index([1.5, 2.5, 3.5]),
    pd.Index(['a', 'b', 'c'])])
def test_tile(self, index):
    # Exercise several Index kinds rather than only an integer Index.
    reps = 2
    # Tiling repeats the whole index as a group, i.e. [x, y, z, x, y, z].
    expected = pd.Index(list(index) * reps)

    result = index.tile(reps)
    tm.assert_index_equal(result, expected)

def test_tile_datetimeindex(self):
    days = ["2018-01-01", "2018-01-02", "2018-01-03"]
    index = pd.date_range(days[0], days[-1])

    tiled = index.tile(2)
    tm.assert_index_equal(tiled, pd.to_datetime(days * 2))

    # Even if reps = 1, verify we lose frequency
    assert index.tile(1).freq is None

@pytest.mark.parametrize("index", [
pd.Index([np.nan]), pd.Index([np.nan, 1]),
pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]),
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/reshape/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,23 @@ def test_pandas_dtypes(self, col):
expected.columns = ['klass', 'col', 'attribute', 'value']
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('id_vars', [['a'], ['b'], ['a', 'b']])
def test_categorical_id_vars(self, id_vars):
    # GH 15853
    col_a = pd.Series(["a", "b", "c", "a", "d"], dtype="category")
    col_b = pd.Series(pd.Categorical([0, 1, 1, 2, 1],
                                     categories=[0, 2, 1, 3],
                                     ordered=True))
    df = DataFrame({"a": col_a, "b": col_b,
                    "c": range(5), "d": np.arange(5.0, 0.0, -1)},
                   columns=["a", "b", "c", "d"])

    melted = df.melt(id_vars=id_vars)

    # each id column is expected to be stacked once per non-id column
    n_value_cols = len(df.columns) - len(id_vars)
    for column in id_vars:
        expected = df[column].tile(n_value_cols).reset_index(drop=True)
        tm.assert_series_equal(melted[column], expected)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also test result itself



class TestLreshape(object):

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1395,6 +1395,27 @@ def test_numpy_repeat(self):
with pytest.raises(ValueError, match=msg):
np.repeat(s, 2, axis=0)

def test_tile(self):
    ser = Series(np.random.randn(3), index=['a', 'b', 'c'])

    result = ser.tile(5)
    # both the values and the index labels are tiled as a group
    expected = Series(np.tile(ser.values, 5),
                      index=np.tile(ser.index.values, 5))
    assert_series_equal(result, expected)

def test_tile_categorical(self):
    values = ["x", "y", "x", "z"]
    s = Series(pd.Categorical(values,
                              categories=["x", "z", "y"],
                              ordered=True))

    # a single repetition returns an equal Series
    assert_series_equal(s, s.tile(1))

    # two repetitions keep categories, ordering, and tile the index too
    result = s.tile(2)
    expected = Series(pd.Categorical(values * 2,
                                     categories=s.cat.categories,
                                     ordered=True),
                      index=[0, 1, 2, 3] * 2)
    assert_series_equal(result, expected)

def test_searchsorted(self):
s = Series([1, 2, 3])

Expand Down