Skip to content

Commit 89c42a3

Browse files
jankatins authored and jreback committed
Categorical: .codes should be immutable
ERR: codes modification raises ValueError always
Categorical: use Categorical.from_codes() in a few places
Categorical: Fix assigning a Categorical to an existing string column
1 parent e474c68 commit 89c42a3

File tree

5 files changed

+83
-29
lines changed

5 files changed

+83
-29
lines changed

doc/source/categorical.rst

+9-12
Original file line number | Diff line number | Diff line change
@@ -548,28 +548,25 @@ relevant columns back to `category` and assign the right levels and level orderi
548548
:suppress:
549549
550550
from pandas.compat import StringIO
551-
csv_file = StringIO()
552551
553552
.. ipython:: python
554553
555554
s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd']))
556555
# rename the levels
557556
s.cat.levels = ["very good", "good", "bad"]
558-
# add new levels at the end
559-
s.cat.levels = list(s.cat.levels) + ["medium", "very bad"]
560-
# reorder the levels
557+
# reorder the levels and add missing levels
561558
s.cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"])
562-
df = pd.DataFrame({"s":s, "vals":[1,2,3,4,5,6]})
563-
df.to_csv(csv_file)
564-
df2 = pd.read_csv(csv_file)
559+
df = pd.DataFrame({"cats":s, "vals":[1,2,3,4,5,6]})
560+
csv = StringIO()
561+
df.to_csv(csv)
562+
df2 = pd.read_csv(StringIO(csv.getvalue()))
565563
df2.dtypes
566-
df2["vals"]
564+
df2["cats"]
567565
# Redo the category
568-
df2["vals"] = df2["vals"].astype("category")
569-
df2["vals"].cat.levels = list(df2["vals"].cat.levels) + ["medium", "very bad"]
570-
df2["vals"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"])
566+
df2["cats"] = df2["cats"].astype("category")
567+
df2["cats"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"])
571568
df2.dtypes
572-
df2["vals"]
569+
df2["cats"]
573570
574571
575572
Missing Data

pandas/core/categorical.py

+17-4
Original file line number | Diff line number | Diff line change
@@ -325,11 +325,24 @@ def from_codes(cls, codes, levels, ordered=True, name=None):
325325
_codes = None
326326

327327
def _get_codes(self):
328-
""" Get the level codes. """
329-
# TODO: return a copy so that no manipulation is possible?
330-
return self._codes
328+
""" Get the level codes.
331329
332-
codes = property(fget=_get_codes, doc=_codes_doc)
330+
Returns
331+
-------
332+
codes : integer array view
333+
A non writable view of the `codes` array.
334+
"""
335+
v = self._codes.view()
336+
v.flags.writeable = False
337+
return v
338+
339+
def _set_codes(self, codes):
340+
"""
341+
Not settable by the user directly
342+
"""
343+
raise ValueError("cannot set Categorical codes directly")
344+
345+
codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc)
333346

334347
_levels = None
335348

pandas/core/internals.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1426,9 +1426,9 @@ def _try_cast(self, element):
14261426
return element
14271427

14281428
def should_store(self, value):
1429-
return not issubclass(value.dtype.type,
1429+
return not (issubclass(value.dtype.type,
14301430
(np.integer, np.floating, np.complexfloating,
1431-
np.datetime64, np.bool_))
1431+
np.datetime64, np.bool_)) or com.is_categorical_dtype(value))
14321432

14331433
def replace(self, to_replace, value, inplace=False, filter=None,
14341434
regex=False):

pandas/tests/test_categorical.py

+47-3
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
import numpy as np
88
import pandas as pd
99

10-
from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp)
10+
from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex,
11+
Timestamp, _np_version_under1p7)
1112

1213
import pandas.core.common as com
1314
import pandas.compat as compat
@@ -345,12 +346,12 @@ def test_remove_unused_levels(self):
345346

346347
def test_nan_handling(self):
347348

348-
# Nans are represented as -1 in labels
349+
# Nans are represented as -1 in codes
349350
c = Categorical(["a","b",np.nan,"a"])
350351
self.assert_numpy_array_equal(c.levels , np.array(["a","b"]))
351352
self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0]))
352353

353-
# If levels have nan included, the label should point to that instead
354+
# If levels have nan included, the code should point to that instead
354355
c = Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])
355356
self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_))
356357
self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0]))
@@ -361,6 +362,36 @@ def test_nan_handling(self):
361362
self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_))
362363
self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0]))
363364

365+
def test_codes_immutable(self):
366+
367+
# Codes should be read only
368+
c = Categorical(["a","b","c","a", np.nan])
369+
exp = np.array([0,1,2,0, -1])
370+
self.assert_numpy_array_equal(c.codes, exp)
371+
372+
# Assignments to codes should raise
373+
def f():
374+
c.codes = np.array([0,1,2,0,1])
375+
self.assertRaises(ValueError, f)
376+
377+
# changes in the codes array should raise
378+
# np 1.6.1 raises RuntimeError rather than ValueError
379+
codes= c.codes
380+
def f():
381+
codes[4] = 1
382+
if _np_version_under1p7:
383+
self.assertRaises(RuntimeError, f)
384+
else:
385+
self.assertRaises(ValueError, f)
386+
387+
# But even after getting the codes, the original array should still be writeable!
388+
c[4] = "a"
389+
exp = np.array([0,1,2,0, 0])
390+
self.assert_numpy_array_equal(c.codes, exp)
391+
c._codes[4] = 2
392+
exp = np.array([0,1,2,0, 2])
393+
self.assert_numpy_array_equal(c.codes, exp)
394+
364395

365396
def test_min_max(self):
366397

@@ -549,6 +580,19 @@ def test_creation_astype(self):
549580
res = s.astype('category')
550581
tm.assert_series_equal(res, exp)
551582

583+
df = pd.DataFrame({"cats":[1,2,3,4,5,6], "vals":[1,2,3,4,5,6]})
584+
cats = Categorical([1,2,3,4,5,6])
585+
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
586+
df["cats"] = df["cats"].astype("category")
587+
tm.assert_frame_equal(exp_df, df)
588+
589+
590+
df = pd.DataFrame({"cats":['a', 'b', 'b', 'a', 'a', 'd'], "vals":[1,2,3,4,5,6]})
591+
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
592+
exp_df = pd.DataFrame({"cats":cats, "vals":[1,2,3,4,5,6]})
593+
df["cats"] = df["cats"].astype("category")
594+
tm.assert_frame_equal(exp_df, df)
595+
552596
def test_sideeffects_free(self):
553597

554598
# Passing a categorical to a Series and then changing values in either the series or the

pandas/tests/test_groupby.py

+8-8
Original file line number | Diff line number | Diff line change
@@ -2901,9 +2901,9 @@ def test_no_dummy_key_names(self):
29012901

29022902
def test_groupby_categorical(self):
29032903
levels = ['foo', 'bar', 'baz', 'qux']
2904-
labels = np.random.randint(0, 4, size=100)
2904+
codes = np.random.randint(0, 4, size=100)
29052905

2906-
cats = Categorical(labels, levels, name='myfactor', fastpath=True)
2906+
cats = Categorical.from_codes(codes, levels, name='myfactor')
29072907

29082908
data = DataFrame(np.random.randn(100, 4))
29092909

@@ -3049,18 +3049,18 @@ def test_cython_median(self):
30493049
def test_groupby_categorical_no_compress(self):
30503050
data = Series(np.random.randn(9))
30513051

3052-
labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
3053-
cats = Categorical(labels, [0, 1, 2], fastpath=True)
3052+
codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
3053+
cats = Categorical.from_codes(codes, [0, 1, 2])
30543054

30553055
result = data.groupby(cats).mean()
3056-
exp = data.groupby(labels).mean()
3056+
exp = data.groupby(codes).mean()
30573057
assert_series_equal(result, exp)
30583058

3059-
labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
3060-
cats = Categorical(labels, [0, 1, 2, 3], fastpath=True)
3059+
codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
3060+
cats = Categorical.from_codes(codes, [0, 1, 2, 3])
30613061

30623062
result = data.groupby(cats).mean()
3063-
exp = data.groupby(labels).mean().reindex(cats.levels)
3063+
exp = data.groupby(codes).mean().reindex(cats.levels)
30643064
assert_series_equal(result, exp)
30653065

30663066
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"])

0 commit comments

Comments
 (0)