Skip to content

Commit ea233fb

Browse files
committed
BUG: Preserve categorical dtypes when melting (pandas-dev#15853)
Add support for tile and not simply repeat.
1 parent 2d4dd50 commit ea233fb

File tree

10 files changed

+165
-3
lines changed

10 files changed

+165
-3
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1357,6 +1357,7 @@ Reshaping
13571357
- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
13581358
- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
13591359
- Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`).
1360+
- Bug in :meth:`DataFrame.melt` where categorical ``id_vars`` columns lost their categorical dtype in the melted result (:issue:`15853`).
13601361

13611362
.. _whatsnew_0240.bug_fixes.sparse:
13621363

pandas/compat/numpy/function.py

+4
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
211211
validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
212212
method='both', max_fname_arg_count=1)
213213

214+
# Keyword defaults checked by ``validate_tile`` when a pandas ``tile`` method
# receives extra positional/keyword arguments.
# NOTE(review): numpy.tile's documented signature is ``tile(A, reps)`` with no
# ``axis`` keyword -- confirm that an ``axis`` default is really intended here.
TILE_DEFAULTS = dict(axis=None)
215+
validate_tile = CompatValidator(TILE_DEFAULTS, fname='tile',
216+
method='both', max_fname_arg_count=1)
217+
214218
ROUND_DEFAULTS = dict(out=None)
215219
validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
216220
method='both', max_fname_arg_count=1)

pandas/core/arrays/categorical.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -2325,12 +2325,25 @@ def repeat(self, repeats, *args, **kwargs):
23252325
See also
23262326
--------
23272327
numpy.ndarray.repeat
2328-
2328+
Categorical.tile
23292329
"""
23302330
nv.validate_repeat(args, kwargs)
23312331
codes = self._codes.repeat(repeats)
23322332
return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
23332333

2334+
def tile(self, reps, *args, **kwargs):
    """
    Tile the elements of a Categorical.

    The underlying codes are tiled with ``numpy.tile`` and wrapped in a
    new Categorical that reuses this object's dtype.

    Parameters
    ----------
    reps
        Forwarded unchanged to ``numpy.tile`` as its ``reps`` argument.
    *args, **kwargs
        Checked by ``nv.validate_tile``; not otherwise used.

    See also
    --------
    numpy.tile
    Categorical.repeat
    """
    nv.validate_tile(args, kwargs)
    tiled_codes = np.tile(self._codes, reps)
    return self._constructor(values=tiled_codes,
                             dtype=self.dtype,
                             fastpath=True)
2346+
23342347
# Implement the ExtensionArray interface
23352348
@property
23362349
def _can_hold_na(self):

pandas/core/indexes/base.py

+43-1
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
546546
def _shallow_copy(self, values=None, **kwargs):
547547
if values is None:
548548
values = self.values
549+
549550
attributes = self._get_attributes_dict()
550551
attributes.update(kwargs)
551552
if not len(values) and 'dtype' not in kwargs:
@@ -557,7 +558,6 @@ def _shallow_copy(self, values=None, **kwargs):
557558
# `self.values` returns `self` for tz-aware, so we need to unwrap
558559
# more specifically
559560
values = values.asi8
560-
561561
return self._simple_new(values, **attributes)
562562

563563
def _shallow_copy_with_infer(self, values, **kwargs):
@@ -822,6 +822,7 @@ def repeat(self, repeats, *args, **kwargs):
822822
--------
823823
Series.repeat : Equivalent function for Series
824824
numpy.repeat : Underlying implementation
825+
Index.tile : repeat the entire index as a group, not by element
825826
826827
Examples
827828
--------
@@ -836,6 +837,47 @@ def repeat(self, repeats, *args, **kwargs):
836837
nv.validate_repeat(args, kwargs)
837838
return self._shallow_copy(self._values.repeat(repeats))
838839

840+
def tile(self, reps, *args, **kwargs):
    """
    Tile elements of an Index.

    Returns a new index constructed by repeating the current index
    the number of times given by reps.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    reps : int
        The number of repetitions of the element groups.
    *args, **kwargs
        Additional arguments have no effect but might be accepted for
        compatibility with numpy.

    Returns
    -------
    pandas.Index
        Newly created Index with tiled elements.

    See Also
    --------
    Series.tile : Equivalent function for Series
    numpy.tile : Underlying implementation
    Index.repeat : Repeat the index element by element, not as a group

    Examples
    --------
    >>> idx = pd.Index([1, 2, 3])
    >>> idx
    Int64Index([1, 2, 3], dtype='int64')
    >>> idx.tile(2)
    Int64Index([1, 2, 3, 1, 2, 3], dtype='int64')
    >>> idx.tile(3)
    Int64Index([1, 2, 3, 1, 2, 3, 1, 2, 3], dtype='int64')
    """
    nv.validate_tile(args, kwargs)
    # np.tile always builds a new array, so no defensive slice of the
    # values is needed before tiling.
    return self._shallow_copy(np.tile(self._values, reps))
880+
839881
_index_shared_docs['where'] = """
840882
.. versionadded:: 0.19.0
841883

pandas/core/indexes/datetimelike.py

+12
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,18 @@ def repeat(self, repeats, *args, **kwargs):
638638
return self._shallow_copy(self.asi8.repeat(repeats),
639639
freq=freq)
640640

641+
def tile(self, reps, *args, **kwargs):
    """
    Tile the index as a whole; analogous to numpy.tile.

    Parameters
    ----------
    reps
        Forwarded unchanged to ``numpy.tile`` as its ``reps`` argument.
    *args, **kwargs
        Checked by ``nv.validate_tile``; not otherwise used.

    Returns
    -------
    A shallow copy of this index built from the tiled ``asi8`` values.
    ``freq`` is carried over only for period dtypes and is passed as
    None otherwise.
    """
    nv.validate_tile(args, kwargs)
    freq = self.freq if is_period_dtype(self) else None
    tiled_i8 = np.tile(self.asi8, reps)
    return self._shallow_copy(tiled_i8, freq=freq)
652+
641653
@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
642654
def where(self, cond, other=None):
643655
other = _ensure_datetimelike_to_i8(other, to_utc=True)

pandas/core/series.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -989,11 +989,12 @@ def _set_values(self, key, value):
989989

990990
def repeat(self, repeats, *args, **kwargs):
991991
"""
992-
Repeat elements of an Series. Refer to `numpy.ndarray.repeat`
992+
Repeat elements of a Series. Refer to `numpy.ndarray.repeat`
993993
for more information about the `repeats` argument.
994994
995995
See also
996996
--------
997+
pd.Series.tile
997998
numpy.ndarray.repeat
998999
"""
9991000
nv.validate_repeat(args, kwargs)
@@ -1002,6 +1003,29 @@ def repeat(self, repeats, *args, **kwargs):
10021003
return self._constructor(new_values,
10031004
index=new_index).__finalize__(self)
10041005

1006+
def tile(self, reps, *args, **kwargs):
    """
    Tile elements of a Series.

    Refer to `numpy.tile` for more information about the `reps`
    argument, although note that we do not support multidimensional
    tiling of Series.

    Parameters
    ----------
    reps : int
        Number of times the whole Series (values and index) is
        repeated.
    *args, **kwargs
        Checked by ``nv.validate_tile``; accepted only for
        compatibility with numpy.

    Returns
    -------
    Series
        New Series whose values and index are both tiled ``reps``
        times. Categorical values keep their categorical dtype.

    See also
    --------
    Series.repeat
    numpy.tile
    """
    nv.validate_tile(args, kwargs)
    new_index = self.index.tile(reps)
    if is_categorical_dtype(self.dtype):
        # Delegate to Categorical.tile so categories and ordering are
        # preserved without rebuilding the Categorical from its codes.
        new_values = self._values.tile(reps)
    else:
        new_values = np.tile(self._values, reps)

    return self._constructor(new_values,
                             index=new_index).__finalize__(self)
1028+
10051029
def get_value(self, label, takeable=False):
10061030
"""Quickly retrieve single value at passed index label
10071031

pandas/tests/arrays/categorical/test_analytics.py

+7
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,13 @@ def test_numpy_repeat(self):
308308
with pytest.raises(ValueError, match=msg):
309309
np.repeat(cat, 2, axis=1)
310310

311+
def test_tile(self):
    # GH15853
    # Tiling must keep both the category order and the ordered flag.
    categories = ["b", "a"]
    cat = Categorical(["a", "b"], categories=categories, ordered=True)

    result = cat.tile(2)

    expected = Categorical(["a", "b", "a", "b"],
                           categories=categories,
                           ordered=True)
    tm.assert_categorical_equal(result, expected)
317+
311318
def test_isna(self):
312319
exp = np.array([False, False, True])
313320
c = Categorical(["a", "b", np.nan])

pandas/tests/indexes/test_base.py

+20
Original file line numberDiff line numberDiff line change
@@ -2485,6 +2485,26 @@ def test_repeat(self):
24852485
result = index.repeat(repeats)
24862486
tm.assert_index_equal(result, expected)
24872487

2488+
def test_tile(self):
    # The index is repeated as a group, not element by element.
    index = pd.Index([1, 2, 3])

    result = index.tile(2)

    expected = pd.Index([1, 2, 3, 1, 2, 3])
    tm.assert_index_equal(result, expected)
2495+
2496+
def test_tile_datetimeindex(self):
    index = pd.date_range("2018-01-01", "2018-01-03")
    result = index.tile(2)
    expected = pd.to_datetime(["2018-01-01", "2018-01-02",
                               "2018-01-03"] * 2)

    tm.assert_index_equal(result, expected)
    assert result.freq is None

    # Even if reps = 1, verify we lose frequency. The check must be made
    # on the reps=1 result itself, not on the reps=2 result above.
    one_result = index.tile(1)
    assert one_result.freq is None
2507+
24882508
@pytest.mark.parametrize("index", [
24892509
pd.Index([np.nan]), pd.Index([np.nan, 1]),
24902510
pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]),

pandas/tests/reshape/test_melt.py

+16
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,22 @@ def test_pandas_dtypes(self, col):
233233
expected.columns = ['klass', 'col', 'attribute', 'value']
234234
tm.assert_frame_equal(result, expected)
235235

236+
@pytest.mark.parametrize('id_vars', [['a'], ['b'], ['a', 'b']])
def test_categorical_id_vars(self, id_vars):
    # GH 15853
    col_a = pd.Series(["a", "b", "c", "a", "d"], dtype="category")
    col_b = pd.Series(pd.Categorical([0, 1, 1, 2, 1],
                                     categories=[0, 2, 1, 3],
                                     ordered=True))
    df = DataFrame({"a": col_a, "b": col_b,
                    "c": range(5), "d": np.arange(5.0, 0.0, -1)},
                   columns=["a", "b", "c", "d"])

    result = df.melt(id_vars=id_vars)

    # Every id column is expected to be tiled once per non-id column.
    num = len(df.columns) - len(id_vars)
    for column in id_vars:
        expected = df[column].tile(num).reset_index(drop=True)
        tm.assert_series_equal(result[column], expected)
251+
236252

237253
class TestLreshape(object):
238254

pandas/tests/series/test_analytics.py

+23
Original file line numberDiff line numberDiff line change
@@ -1395,6 +1395,29 @@ def test_numpy_repeat(self):
13951395
with pytest.raises(ValueError, match=msg):
13961396
np.repeat(s, 2, axis=0)
13971397

1398+
def test_tile(self):
    # Values and index labels must both be tiled as whole groups.
    s = Series(np.random.randn(3), index=['a', 'b', 'c'])

    result = s.tile(5)

    expected = Series(np.tile(s.values, 5),
                      index=np.tile(s.index.values, 5))
    assert_series_equal(result, expected)
1404+
1405+
def test_tile_categorical(self):
    s = Series(pd.Categorical(["x", "y", "x", "z"],
                              categories=["x", "z", "y"],
                              ordered=True))

    # Tiling once must give back an equal Series.
    res_1 = s.tile(1)
    assert_series_equal(s, res_1)

    # Tiling twice repeats values and index while keeping the dtype.
    res_2 = s.tile(2)
    exp_2 = Series(pd.Categorical(["x", "y", "x", "z"] * 2,
                                  categories=s.cat.categories,
                                  ordered=True),
                   index=[0, 1, 2, 3] * 2)
    assert_series_equal(res_2, exp_2)
1420+
13981421
def test_searchsorted(self):
13991422
s = Series([1, 2, 3])
14001423

0 commit comments

Comments
 (0)