Skip to content

Commit a693e7b

Browse files
committed
REF: refactor Factor class
1 parent 6d01c3c commit a693e7b

File tree

9 files changed

+38
-29
lines changed

9 files changed

+38
-29
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ pandas 0.8.0
131131
- Deprecation of offset, time_rule, and timeRule parameters throughout codebase
132132
- Series.append and DataFrame.append no longer check for duplicate indexes
133133
by default, add verify_integrity parameter (#1394)
134+
- Refactor Factor class; old constructor moved to Factor.from_array
134135

135136
**Bug fixes**
136137

pandas/core/factor.py

+23-17
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
1+
# pylint: disable=E1101,W0232
2+
13
import numpy as np
24
import pandas.core.common as com
3-
import pandas.lib as lib
45

56

6-
class Factor(np.ndarray):
7+
class Factor(object):
78
"""
89
Represents a categorical variable in classic R / S-plus fashion
910
1011
Parameters
1112
----------
13+
labels : ndarray of integers
14+
levels : Index-like (unique)
15+
1216
data : array-like
1317
1418
Returns
@@ -17,43 +21,45 @@ class Factor(np.ndarray):
1721
* labels : ndarray
1822
* levels : ndarray
1923
"""
20-
def __new__(cls, data):
24+
def __init__(self, labels, levels):
2125
from pandas.core.index import _ensure_index
26+
27+
levels = _ensure_index(levels)
28+
if not levels.is_unique:
29+
raise ValueError('Factor levels must be unique')
30+
31+
self.labels = labels
32+
self.levels = levels
33+
34+
@classmethod
35+
def from_array(cls, data):
2236
from pandas.core.algorithms import factorize
2337

2438
try:
2539
labels, levels, _ = factorize(data, sort=True)
2640
except TypeError:
2741
labels, levels, _ = factorize(data, sort=False)
2842

29-
labels = labels.view(Factor)
30-
labels.levels = _ensure_index(levels)
31-
return labels
43+
return Factor(labels, levels)
3244

3345
levels = None
3446

35-
def __array_finalize__(self, obj):
36-
self.levels = getattr(obj, 'levels', None)
37-
38-
@property
39-
def labels(self):
40-
return self.view(np.ndarray)
41-
42-
def asarray(self):
43-
return np.asarray(self.levels).take(self.labels)
47+
def __array__(self):
48+
return self.levels.values.take(self.labels)
4449

4550
def __len__(self):
4651
return len(self.labels)
4752

4853
def __repr__(self):
4954
temp = 'Factor:\n%s\nLevels (%d): %s'
50-
values = self.asarray()
55+
values = np.asarray(self)
5156
return temp % (repr(values), len(self.levels), self.levels)
5257

5358
def __getitem__(self, key):
5459
if isinstance(key, (int, np.integer)):
5560
i = self.labels[key]
5661
return self.levels[i]
5762
else:
58-
return np.ndarray.__getitem__(self, key)
63+
return Factor(self.labels[key], self.levels)
64+
5965

pandas/core/index.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1460,7 +1460,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
14601460
levels = []
14611461
labels = []
14621462
for arr in arrays:
1463-
factor = Factor(arr)
1463+
factor = Factor.from_array(arr)
14641464
levels.append(factor.levels)
14651465
labels.append(factor.labels)
14661466

pandas/core/panel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def panel_index(time, panels, names=['time', 'panel']):
7777
(1962, 'C')], dtype=object)
7878
"""
7979
time, panels = _ensure_like_indices(time, panels)
80-
time_factor = Factor(time)
81-
panel_factor = Factor(panels)
80+
time_factor = Factor.from_array(time)
81+
panel_factor = Factor.from_array(panels)
8282

8383
labels = [time_factor.labels, panel_factor.labels]
8484
levels = [time_factor.levels, panel_factor.levels]

pandas/core/reshape.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
586586

587587
def make_column_dummies(data, column, prefix=False, prefix_sep='_'):
588588
from pandas import Factor
589-
factor = Factor(data[column].values)
589+
factor = Factor.from_array(data[column].values)
590590
dummy_mat = np.eye(len(factor.levels)).take(factor.labels, axis=0)
591591

592592
if prefix:
@@ -628,7 +628,7 @@ def make_axis_dummies(frame, axis='minor', transform=None):
628628
labels = frame.index.labels[num]
629629
if transform is not None:
630630
mapped_items = items.map(transform)
631-
factor = Factor(mapped_items.take(labels))
631+
factor = Factor.from_array(mapped_items.take(labels))
632632
labels = factor.labels
633633
items = factor.levels
634634

pandas/io/pytables.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -843,8 +843,8 @@ def _read_panel_table(self, group, where=None):
843843
index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
844844
values = sel.values['values']
845845

846-
major = Factor(index)
847-
minor = Factor(columns)
846+
major = Factor.from_array(index)
847+
minor = Factor.from_array(columns)
848848

849849
J, K = len(major.levels), len(minor.levels)
850850
key = major.labels * K + minor.labels

pandas/tests/test_index.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -1534,7 +1534,8 @@ def test_has_duplicates(self):
15341534
class TestFactor(unittest.TestCase):
15351535

15361536
def setUp(self):
1537-
self.factor = Factor(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
1537+
self.factor = Factor.from_array(['a', 'b', 'b', 'a',
1538+
'a', 'c', 'c', 'c'])
15381539

15391540
def test_getitem(self):
15401541
self.assertEqual(self.factor[0], 'a')
@@ -1543,14 +1544,14 @@ def test_getitem(self):
15431544
subf = self.factor[[0, 1, 2]]
15441545
tm.assert_almost_equal(subf.labels, [0, 1, 1])
15451546

1546-
subf = self.factor[self.factor.asarray() == 'c']
1547+
subf = self.factor[np.asarray(self.factor) == 'c']
15471548
tm.assert_almost_equal(subf.labels, [2, 2, 2])
15481549

15491550
def test_constructor_unsortable(self):
15501551
arr = np.array([1, 2, 3, datetime.now()], dtype='O')
15511552

15521553
# it works!
1553-
factor = Factor(arr)
1554+
factor = Factor.from_array(arr)
15541555

15551556
def test_factor_agg(self):
15561557
import pandas.core.frame as frame

pandas/tools/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1117,7 +1117,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
11171117
names = [None] * len(zipped)
11181118

11191119
if levels is None:
1120-
levels = [Factor(zp).levels for zp in zipped]
1120+
levels = [Factor.from_array(zp).levels for zp in zipped]
11211121
else:
11221122
levels = [_ensure_index(x) for x in levels]
11231123
else:
@@ -1150,7 +1150,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
11501150
levels.extend(concat_index.levels)
11511151
label_list.extend(concat_index.labels)
11521152
else:
1153-
factor = Factor(concat_index)
1153+
factor = Factor.from_array(concat_index)
11541154
levels.append(factor.levels)
11551155
label_list.append(factor.labels)
11561156

pandas/tools/tile.py

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from pandas.core.api import DataFrame, Series
6+
from pandas.core.factor import Factor
67
import pandas.core.algorithms as algos
78
import pandas.core.common as com
89
import pandas.core.nanops as nanops

0 commit comments

Comments
 (0)