From 2f2eba0b5faa473aa781b55fed6b2a19c8a5e045 Mon Sep 17 00:00:00 2001
From: "D.S. McNeil"
Date: Mon, 12 Nov 2018 19:54:11 -0500
Subject: [PATCH] BUG: Preserve categorical dtypes when melting (#15853)

Also add support for tile and not simply repeat.
---
 doc/source/whatsnew/v0.24.0.txt          |  2 +
 pandas/compat/numpy/function.py          |  4 ++
 pandas/core/arrays/categorical.py        | 15 ++++++-
 pandas/core/indexes/base.py              | 44 ++++++++++++++++++-
 pandas/core/indexes/datetimelike.py      | 12 +++++
 pandas/core/series.py                    | 26 ++++++++++-
 .../arrays/categorical/test_analytics.py |  8 ++++
 pandas/tests/indexes/test_base.py        | 20 +++++++++
 pandas/tests/reshape/test_melt.py        | 17 +++++++
 pandas/tests/series/test_analytics.py    | 21 +++++++++
 10 files changed, 166 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 20496c9fb3f31..0b6086a62d97f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -211,6 +211,7 @@ Other Enhancements
 - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
 - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`)
 - :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`)
+- :meth:`Index.tile`, :meth:`Series.tile`, and :meth:`Categorical.tile` were introduced, parallel to the repeat methods, to ease categorical melting (:issue:`15853`)
 - Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`)
 - :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to
   reflect changes from the `Pandas-GBQ library version 0.6.0
@@ -1357,6 +1358,7 @@ Reshaping
 - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`)
 - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
 - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have :class:`MultiIndex`ed columns (:issue:`23033`).
+- Bug in :meth:`DataFrame.melt` causing loss of categorical status when melting with categorical id_vars columns (:issue:`15853`).
 
 .. _whatsnew_0240.bug_fixes.sparse:
 
diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py
index 30fdeca35faf3..0a38265227b07 100644
--- a/pandas/compat/numpy/function.py
+++ b/pandas/compat/numpy/function.py
@@ -211,6 +211,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
 validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
                                   method='both', max_fname_arg_count=1)
 
+TILE_DEFAULTS = dict(axis=None)
+validate_tile = CompatValidator(TILE_DEFAULTS, fname='tile',
+                                method='both', max_fname_arg_count=1)
+
 ROUND_DEFAULTS = dict(out=None)
 validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
                                  method='both', max_fname_arg_count=1)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 4363f3ccb14e2..5e263dc57fb33 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2325,12 +2325,25 @@ def repeat(self, repeats, *args, **kwargs):
         See also
         --------
         numpy.ndarray.repeat
-
+        Categorical.tile
         """
         nv.validate_repeat(args, kwargs)
         codes = self._codes.repeat(repeats)
         return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
 
+    def tile(self, reps, *args, **kwargs):
+        """
+        Tile elements of a Categorical.
+
+        See Also
+        --------
+        numpy.tile
+        Categorical.repeat
+        """
+        nv.validate_tile(args, kwargs)
+        codes = np.tile(self._codes, reps)
+        return self._constructor(values=codes, dtype=self.dtype, fastpath=True)
+
     # Implement the ExtensionArray interface
     @property
     def _can_hold_na(self):
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 263de57d32f31..c1713a15984a6 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -546,6 +546,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
     def _shallow_copy(self, values=None, **kwargs):
         if values is None:
             values = self.values
+
         attributes = self._get_attributes_dict()
         attributes.update(kwargs)
         if not len(values) and 'dtype' not in kwargs:
@@ -557,7 +558,6 @@ def _shallow_copy(self, values=None, **kwargs):
             # `self.values` returns `self` for tz-aware, so we need to unwrap
             # more specifically
             values = values.asi8
-
         return self._simple_new(values, **attributes)
 
     def _shallow_copy_with_infer(self, values, **kwargs):
@@ -822,6 +822,7 @@ def repeat(self, repeats, *args, **kwargs):
         --------
         Series.repeat : Equivalent function for Series
         numpy.repeat : Underlying implementation
+        Index.tile : repeat the entire index as a group, not by element
 
         Examples
         --------
@@ -836,6 +837,47 @@ def repeat(self, repeats, *args, **kwargs):
         nv.validate_repeat(args, kwargs)
         return self._shallow_copy(self._values.repeat(repeats))
 
+    def tile(self, reps, *args, **kwargs):
+        """
+        Tile elements of an Index.
+
+        Returns a new index constructed by repeating the current index
+        the number of times given by reps.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        reps : int
+            The number of repetitions of the element groups.
+        *args, **kwargs
+            Additional arguments have no effect but might be accepted for
+            compatibility with numpy.
+
+        Returns
+        -------
+        pandas.Index
+            Newly created Index with tiled elements.
+
+        See Also
+        --------
+        Series.tile : Equivalent function for Series
+        numpy.tile : Underlying implementation
+        Index.repeat : repeat the index element by element, not as a group
+
+        Examples
+        --------
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Int64Index([1, 2, 3], dtype='int64')
+        >>> idx.tile(2)
+        Int64Index([1, 2, 3, 1, 2, 3], dtype='int64')
+        >>> idx.tile(3)
+        Int64Index([1, 2, 3, 1, 2, 3, 1, 2, 3], dtype='int64')
+        """
+        nv.validate_tile(args, kwargs)
+        return self._shallow_copy(np.tile(self._values, reps))
+
     _index_shared_docs['where'] = """
         .. versionadded:: 0.19.0
 
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 4547f47314bad..e32fe380320eb 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -638,6 +638,18 @@ def repeat(self, repeats, *args, **kwargs):
         return self._shallow_copy(self.asi8.repeat(repeats),
                                   freq=freq)
 
+    def tile(self, reps, *args, **kwargs):
+        """
+        Analogous to numpy.tile
+        """
+        nv.validate_tile(args, kwargs)
+        if is_period_dtype(self):
+            freq = self.freq
+        else:
+            freq = None
+        return self._shallow_copy(np.tile(self.asi8, reps),
+                                  freq=freq)
+
     @Appender(_index_shared_docs['where'] % _index_doc_kwargs)
     def where(self, cond, other=None):
         other = _ensure_datetimelike_to_i8(other, to_utc=True)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 20e4720a3bde7..ad8616c9a370f 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -989,11 +989,12 @@ def _set_values(self, key, value):
 
     def repeat(self, repeats, *args, **kwargs):
         """
-        Repeat elements of an Series. Refer to `numpy.ndarray.repeat`
+        Repeat elements of a Series. Refer to `numpy.ndarray.repeat`
         for more information about the `repeats` argument.
 
         See also
         --------
+        Series.tile
         numpy.ndarray.repeat
         """
         nv.validate_repeat(args, kwargs)
@@ -1002,6 +1003,29 @@ def repeat(self, repeats, *args, **kwargs):
         return self._constructor(new_values,
                                  index=new_index).__finalize__(self)
 
+    def tile(self, reps, *args, **kwargs):
+        """
+        Tile elements of a Series. Refer to `numpy.tile`
+        for more information about the `reps` argument, although
+        note that we do not support multidimensional tiling of Series.
+
+        See Also
+        --------
+        Series.repeat
+        numpy.tile
+        """
+        nv.validate_tile(args, kwargs)
+        new_index = self.index.tile(reps)
+        if is_categorical_dtype(self.dtype):
+            # Delegate to Categorical.tile, which tiles the codes and
+            # keeps the categories and orderedness of the dtype.
+            new_values = self._values.tile(reps)
+        else:
+            new_values = np.tile(self._values, reps)
+
+        return self._constructor(new_values,
+                                 index=new_index).__finalize__(self)
+
     def get_value(self, label, takeable=False):
         """Quickly retrieve single value at passed index label
 
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index ea6facd66a1a3..248abbbb688bc 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -308,6 +308,14 @@ def test_numpy_repeat(self):
         with pytest.raises(ValueError, match=msg):
             np.repeat(cat, 2, axis=1)
 
+    def test_tile(self):
+        # GH15853
+        cat = Categorical(["a", "b"], categories=["b", "a"], ordered=True)
+        exp = Categorical(["a", "b", "a", "b"], categories=["b", "a"],
+                          ordered=True)
+        res = cat.tile(2)
+        tm.assert_categorical_equal(res, exp)
+
     def test_isna(self):
         exp = np.array([False, False, True])
         c = Categorical(["a", "b", np.nan])
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 4a3efe22926f7..1ce92f0605cf3 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -2485,6 +2485,26 @@ def test_repeat(self):
         result = index.repeat(repeats)
         tm.assert_index_equal(result, expected)
 
+    def test_tile(self):
+        reps = 2
+        index = pd.Index([1, 2, 3])
+        expected = pd.Index([1, 2, 3, 1, 2, 3])
+
+        result = index.tile(reps)
+        tm.assert_index_equal(result, expected)
+
+    def test_tile_datetimeindex(self):
+        index = pd.date_range("2018-01-01", "2018-01-03")
+        result = index.tile(2)
+        expected = pd.to_datetime(["2018-01-01", "2018-01-02",
+                                   "2018-01-03"] * 2)
+
+        tm.assert_index_equal(result, expected)
+
+        # Even if reps = 1, verify we lose frequency
+        one_result = index.tile(1)
+        assert one_result.freq is None
+
     @pytest.mark.parametrize("index", [
         pd.Index([np.nan]), pd.Index([np.nan, 1]),
         pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]),
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 16ecb07c5f413..2a31bdde612ae 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -233,6 +233,23 @@ def test_pandas_dtypes(self, col):
         expected.columns = ['klass', 'col', 'attribute', 'value']
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize('id_vars', [['a'], ['b'], ['a', 'b']])
+    def test_categorical_id_vars(self, id_vars):
+        # GH 15853
+        df = DataFrame({"a": pd.Series(["a", "b", "c", "a", "d"],
+                                       dtype="category"),
+                        "b": pd.Series(pd.Categorical([0, 1, 1, 2, 1],
+                                                      categories=[0, 2, 1, 3],
+                                                      ordered=True)),
+                        "c": range(5), "d": np.arange(5.0, 0.0, -1)},
+                       columns=["a", "b", "c", "d"])
+
+        result = df.melt(id_vars=id_vars)
+        for column in id_vars:
+            num = len(df.columns) - len(id_vars)
+            expected = df[column].tile(num).reset_index(drop=True)
+            tm.assert_series_equal(result[column], expected)
+
 
 class TestLreshape(object):
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index a5a7cc2217864..afc1ec35443c6 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1395,6 +1395,27 @@ def test_numpy_repeat(self):
         with pytest.raises(ValueError, match=msg):
             np.repeat(s, 2, axis=0)
 
+    def test_tile(self):
+        s = Series(np.random.randn(3), index=['a', 'b', 'c'])
+
+        reps = s.tile(5)
+        exp = Series(np.tile(s.values, 5), index=np.tile(s.index.values, 5))
+        assert_series_equal(reps, exp)
+
+    def test_tile_categorical(self):
+        s = Series(pd.Categorical(["x", "y", "x", "z"],
+                                  categories=["x", "z", "y"],
+                                  ordered=True))
+        res_1 = s.tile(1)
+        assert_series_equal(s, res_1)
+
+        res_2 = s.tile(2)
+        exp_2 = Series(pd.Categorical(["x", "y", "x", "z"] * 2,
+                                      categories=s.cat.categories,
+                                      ordered=True),
+                       index=[0, 1, 2, 3] * 2)
+        assert_series_equal(res_2, exp_2)
+
     def test_searchsorted(self):
         s = Series([1, 2, 3])
 