Skip to content

Commit 4655828

Browse files
committed
ENH: refactoring to support ordered factors, cut/qcut return factors. #292
1 parent a693e7b commit 4655828

File tree

3 files changed: +43 -40 lines changed

3 files changed: +43 -40 lines changed

pandas/core/factor.py

+6 -3
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,8 @@ def from_array(cls, data):
4444

4545
levels = None
4646

47-
def __array__(self):
48-
return self.levels.values.take(self.labels)
47+
def __array__(self, dtype=None):
48+
return com.take_1d(self.levels, self.labels)
4949

5050
def __len__(self):
5151
return len(self.labels)
@@ -58,7 +58,10 @@ def __repr__(self):
5858
def __getitem__(self, key):
5959
if isinstance(key, (int, np.integer)):
6060
i = self.labels[key]
61-
return self.levels[i]
61+
if i == -1:
62+
return np.nan
63+
else:
64+
return self.levels[i]
6265
else:
6366
return Factor(self.labels[key], self.levels)
6467

pandas/tools/tests/test_tile.py

+28 -28
Original file line number | Diff line number | Diff line change
@@ -22,26 +22,26 @@ def test_simple(self):
2222

2323
def test_bins(self):
2424
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
25-
result, bins = cut(data, 3, labels=False, retbins=True)
26-
assert_equal(result, [1, 1, 1, 2, 3, 1])
25+
result, bins = cut(data, 3, retbins=True)
26+
assert_equal(result.labels, [0, 0, 0, 1, 2, 0])
2727
assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7])
2828

2929
def test_right(self):
3030
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
31-
result, bins = cut(data, 4, right=True, labels=False, retbins=True)
32-
assert_equal(result, [1, 1, 1, 3, 4, 1, 1])
31+
result, bins = cut(data, 4, right=True, retbins=True)
32+
assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 0])
3333
assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7])
3434

3535
def test_noright(self):
3636
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
37-
result, bins = cut(data, 4, right=False, labels=False, retbins=True)
38-
assert_equal(result, [1, 1, 1, 3, 4, 1, 2])
37+
result, bins = cut(data, 4, right=False, retbins=True)
38+
assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 1])
3939
assert_almost_equal(bins, [ 0.2, 2.575, 4.95, 7.325, 9.7095])
4040

4141
def test_arraylike(self):
4242
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
43-
result, bins = cut(data, 3, labels=False, retbins=True)
44-
assert_equal(result, [1, 1, 1, 2, 3, 1])
43+
result, bins = cut(data, 3, retbins=True)
44+
assert_equal(result.labels, [0, 0, 0, 1, 2, 0])
4545
assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7])
4646

4747
def test_bins_not_monotonic(self):
@@ -51,39 +51,39 @@ def test_bins_not_monotonic(self):
5151
def test_labels(self):
5252
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
5353

54-
labels, bins = cut(arr, 4, retbins=True)
55-
distinct_labels = sorted(unique(labels))
56-
ex_labels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
54+
result, bins = cut(arr, 4, retbins=True)
55+
ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
5756
'(0.75, 1]']
58-
self.assertEqual(distinct_labels, ex_labels)
57+
self.assert_(np.array_equal(result.levels, ex_levels))
5958

60-
labels, bins = cut(arr, 4, retbins=True, right=False)
61-
distinct_labels = sorted(unique(labels))
62-
ex_labels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
59+
result, bins = cut(arr, 4, retbins=True, right=False)
60+
ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
6361
'[0.75, 1.001)']
64-
self.assertEqual(distinct_labels, ex_labels)
62+
self.assert_(np.array_equal(result.levels, ex_levels))
6563

6664
def test_label_precision(self):
6765
arr = np.arange(0, 0.73, 0.01)
6866

69-
labels = cut(arr, 4, precision=2)
70-
distinct_labels = sorted(unique(labels))
71-
ex_labels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
67+
result = cut(arr, 4, precision=2)
68+
ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
7269
'(0.54, 0.72]']
73-
self.assertEqual(distinct_labels, ex_labels)
70+
self.assert_(np.array_equal(result.levels, ex_levels))
7471

7572
def test_na_handling(self):
7673
arr = np.arange(0, 0.75, 0.01)
7774
arr[::3] = np.nan
7875

79-
labels = cut(arr, 4)
80-
ex_labels = np.where(com.isnull(arr), np.nan, labels)
76+
result = cut(arr, 4)
8177

82-
tm.assert_almost_equal(labels, ex_labels)
78+
result_arr = np.asarray(result)
8379

84-
labels = cut(arr, 4, labels=False)
85-
ex_labels = np.where(com.isnull(arr), np.nan, labels)
86-
tm.assert_almost_equal(labels, ex_labels)
80+
ex_arr = np.where(com.isnull(arr), np.nan, result_arr)
81+
82+
tm.assert_almost_equal(result_arr, ex_arr)
83+
84+
result = cut(arr, 4, labels=False)
85+
ex_result = np.where(com.isnull(arr), np.nan, result)
86+
tm.assert_almost_equal(result, ex_result)
8787

8888
def test_qcut(self):
8989
arr = np.random.randn(1000)
@@ -94,9 +94,9 @@ def test_qcut(self):
9494

9595
assert_almost_equal(bins, ex_bins)
9696

97-
ex_labels = cut(arr, ex_bins)
97+
ex_levels = cut(arr, ex_bins)
9898

99-
self.assert_(np.array_equal(labels, ex_labels))
99+
self.assert_(np.array_equal(labels, ex_levels))
100100

101101

102102
if __name__ == '__main__':

pandas/tools/tile.py

+9 -9
Original file line number | Diff line number | Diff line change
@@ -153,28 +153,28 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
153153

154154
fmt = lambda v: _format_label(v, precision=precision)
155155
if right:
156-
strings = ['(%s, %s]' % (fmt(x), fmt(y))
156+
levels = ['(%s, %s]' % (fmt(x), fmt(y))
157157
for x, y in zip(labels, labels[1:])]
158158
else:
159-
strings = ['[%s, %s)' % (fmt(x), fmt(y))
159+
levels = ['[%s, %s)' % (fmt(x), fmt(y))
160160
for x, y in zip(labels, labels[1:])]
161161

162-
strings = np.asarray(strings, dtype=object)
162+
levels = np.asarray(levels, dtype=object)
163163

164164
if has_nas:
165165
np.putmask(ids, mask, 0)
166166

167-
labels = com.take_1d(strings, ids - 1)
167+
fac = Factor(ids - 1, levels)
168168
else:
169-
labels = ids
169+
fac = ids
170170
if has_nas:
171-
labels = labels.astype(np.float64)
172-
np.putmask(labels, mask, np.nan)
171+
fac = ids.astype(np.float64)
172+
np.putmask(fac, mask, np.nan)
173173

174174
if not retbins:
175-
return labels
175+
return fac
176176

177-
return labels, bins
177+
return fac, bins
178178

179179

180180
def _format_label(x, precision=3):

0 commit comments

Comments (0)