Skip to content

Commit 76dd27f

Browse files
committed
Merge pull request #7090 from sinhrks/factorize
ENH/CLN: Add factorize to IndexOpsMixin
2 parents 953ba54 + ad80d81 commit 76dd27f

12 files changed

+205
-21
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ Computations / Descriptive Stats
335335
Series.cumsum
336336
Series.describe
337337
Series.diff
338+
Series.factorize
338339
Series.kurt
339340
Series.mad
340341
Series.max
@@ -1040,6 +1041,7 @@ Modifying and Computations
10401041
Index.diff
10411042
Index.drop
10421043
Index.equals
1044+
Index.factorize
10431045
Index.identical
10441046
Index.insert
10451047
Index.order

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ API Changes
203203
ignored (:issue:`6607`)
204204
- Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python
205205
parser when no options are ignored (:issue:`6607`)
206+
- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
206207

207208
Deprecations
208209
~~~~~~~~~~~~
@@ -485,6 +486,7 @@ Bug Fixes
485486
- Bug in cache coherence with chained indexing and slicing; add ``_is_view`` property to ``NDFrame`` to correctly predict
486487
views; mark ``is_copy`` on ``xs`` only if it's an actual copy (and not a view) (:issue:`7084`)
487488
- Bug in DatetimeIndex creation from string ndarray with ``dayfirst=True`` (:issue:`5917`)
489+
- Bug in ``MultiIndex.from_arrays`` where levels created from a ``DatetimeIndex`` did not preserve ``freq`` and ``tz`` (:issue:`7090`)
488490

489491
pandas 0.13.1
490492
-------------

doc/source/v0.14.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ API changes
245245
- add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`)
246246
- accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`), this was a regression
247247
from 0.13.1
248+
- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
248249

249250
.. _whatsnew_0140.sql:
250251

pandas/core/base.py

+22
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,28 @@ def nunique(self):
319319
"""
320320
return len(self.value_counts())
321321

322+
def factorize(self, sort=False, na_sentinel=-1):
323+
"""
324+
Encode the object as an enumerated type or categorical variable
325+
326+
Parameters
327+
----------
328+
sort : boolean, default False
329+
Sort by values
330+
na_sentinel : int, default -1
331+
Value to mark "not found"
332+
333+
Returns
334+
-------
335+
labels : the indexer to the original array
336+
uniques : the unique Index
337+
"""
338+
from pandas.core.algorithms import factorize
339+
from pandas.core.index import Index
340+
labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel)
341+
uniques = Index(uniques)
342+
return labels, uniques
343+
322344
date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
323345
time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
324346
year = _field_accessor('year', "The year of the datetime")

pandas/core/categorical.py

+6-12
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,11 @@ def __init__(self, labels, levels=None, name=None):
8080
if levels is None:
8181
if name is None:
8282
name = getattr(labels, 'name', None)
83-
if isinstance(labels, Index) and hasattr(labels, 'factorize'):
84-
labels, levels = labels.factorize()
83+
if hasattr(labels, 'factorize'):
84+
try:
85+
labels, levels = labels.factorize(sort=True)
86+
except TypeError:
87+
labels, levels = labels.factorize(sort=False)
8588
else:
8689
try:
8790
labels, levels = factorize(labels, sort=True)
@@ -103,16 +106,7 @@ def from_array(cls, data):
103106
Can be an Index or array-like. The levels are assumed to be
104107
the unique values of `data`.
105108
"""
106-
if isinstance(data, Index) and hasattr(data, 'factorize'):
107-
labels, levels = data.factorize()
108-
else:
109-
try:
110-
labels, levels = factorize(data, sort=True)
111-
except TypeError:
112-
labels, levels = factorize(data, sort=False)
113-
114-
return Categorical(labels, levels,
115-
name=getattr(data, 'name', None))
109+
return Categorical(data)
116110

117111
_levels = None
118112

pandas/tests/test_base.py

+42
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,48 @@ def test_value_counts_inferred(self):
398398
self.assert_numpy_array_equal(td.unique(), expected)
399399
self.assertEquals(td.nunique(), 1)
400400

401+
def test_factorize(self):
402+
for o in self.objs:
403+
exp_arr = np.array(range(len(o)))
404+
labels, uniques = o.factorize()
405+
406+
self.assert_numpy_array_equal(labels, exp_arr)
407+
if isinstance(o, Series):
408+
expected = Index(o.values)
409+
self.assert_numpy_array_equal(uniques, expected)
410+
else:
411+
self.assertTrue(uniques.equals(o))
412+
413+
for o in self.objs:
414+
# sort by value, and create duplicates
415+
if isinstance(o, Series):
416+
o.sort()
417+
else:
418+
indexer = o.argsort()
419+
o = o.take(indexer)
420+
n = o[5:].append(o)
421+
422+
exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
423+
labels, uniques = n.factorize(sort=True)
424+
425+
self.assert_numpy_array_equal(labels, exp_arr)
426+
if isinstance(o, Series):
427+
expected = Index(o.values)
428+
self.assert_numpy_array_equal(uniques, expected)
429+
else:
430+
self.assertTrue(uniques.equals(o))
431+
432+
exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])
433+
labels, uniques = n.factorize(sort=False)
434+
self.assert_numpy_array_equal(labels, exp_arr)
435+
436+
if isinstance(o, Series):
437+
expected = Index(np.concatenate([o.values[5:10], o.values[:5]]))
438+
self.assert_numpy_array_equal(uniques, expected)
439+
else:
440+
expected = o[5:].append(o[:5])
441+
self.assertTrue(uniques.equals(expected))
442+
401443

402444
class TestDatetimeIndexOps(Ops):
403445
_allowed = '_allow_datetime_index_ops'

pandas/tests/test_categorical.py

+32
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.core.categorical import Categorical
1111
from pandas.core.index import Index, Int64Index, MultiIndex
1212
from pandas.core.frame import DataFrame
13+
from pandas.tseries.period import PeriodIndex
1314
from pandas.util.testing import assert_almost_equal
1415
import pandas.core.common as com
1516

@@ -180,6 +181,37 @@ def test_empty_print(self):
180181
"Index([], dtype=object)")
181182
self.assertEqual(repr(factor), expected)
182183

184+
def test_periodindex(self):
185+
idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
186+
'2014-03', '2014-03'], freq='M')
187+
cat1 = Categorical.from_array(idx1)
188+
189+
exp_arr = np.array([0, 0, 1, 1, 2, 2])
190+
exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
191+
192+
self.assert_numpy_array_equal(cat1.labels, exp_arr)
193+
self.assert_(cat1.levels.equals(exp_idx))
194+
195+
idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
196+
'2014-03', '2014-01'], freq='M')
197+
cat2 = Categorical.from_array(idx2)
198+
199+
exp_arr = np.array([2, 2, 1, 0, 2, 0])
200+
201+
self.assert_numpy_array_equal(cat2.labels, exp_arr)
202+
self.assert_(cat2.levels.equals(exp_idx))
203+
204+
idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
205+
'2013-08', '2013-07', '2013-05'], freq='M')
206+
cat3 = Categorical.from_array(idx3)
207+
208+
exp_arr = np.array([6, 5, 4, 3, 2, 1, 0])
209+
exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
210+
'2013-10', '2013-11', '2013-12'], freq='M')
211+
212+
self.assert_numpy_array_equal(cat3.labels, exp_arr)
213+
self.assert_(cat3.levels.equals(exp_idx))
214+
183215

184216
if __name__ == '__main__':
185217
import nose

pandas/tests/test_multilevel.py

+11
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,17 @@ def test_multiindex_set_index(self):
18991899
# it works!
19001900
df.set_index(index)
19011901

1902+
def test_datetimeindex(self):
1903+
idx1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'] * 2, tz='Asia/Tokyo')
1904+
idx2 = pd.date_range('2010/01/01', periods=6, freq='M', tz='US/Eastern')
1905+
idx = MultiIndex.from_arrays([idx1, idx2])
1906+
1907+
expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'], tz='Asia/Tokyo')
1908+
1909+
self.assert_(idx.levels[0].equals(expected1))
1910+
self.assert_(idx.levels[1].equals(idx2))
1911+
1912+
19021913
if __name__ == '__main__':
19031914

19041915
import nose

pandas/tseries/index.py

+13
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,19 @@ def to_period(self, freq=None):
806806

807807
return PeriodIndex(self.values, freq=freq, tz=self.tz)
808808

809+
def factorize(self, sort=False, na_sentinel=-1):
810+
"""
811+
Index.factorize with handling for DatetimeIndex metadata
812+
813+
Returns
814+
-------
815+
labels, uniques : the indexer to the original array and the unique values as a DatetimeIndex
816+
"""
817+
from pandas.core.algorithms import factorize
818+
labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel)
819+
uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz)
820+
return labels, uniques
821+
809822
def order(self, return_indexer=False, ascending=True):
810823
"""
811824
Return sorted copy of Index

pandas/tseries/period.py

-9
Original file line numberDiff line numberDiff line change
@@ -739,15 +739,6 @@ def is_full(self):
739739
values = self.values
740740
return ((values[1:] - values[:-1]) < 2).all()
741741

742-
def factorize(self):
743-
"""
744-
Specialized factorize that boxes uniques
745-
"""
746-
from pandas.core.algorithms import factorize
747-
labels, uniques = factorize(self.values)
748-
uniques = PeriodIndex(ordinal=uniques, freq=self.freq)
749-
return labels, uniques
750-
751742
@property
752743
def freqstr(self):
753744
return self.freq

pandas/tseries/tests/test_period.py

+29
Original file line numberDiff line numberDiff line change
@@ -2175,6 +2175,35 @@ def test_slice_keep_name(self):
21752175
idx = period_range('20010101', periods=10, freq='D', name='bob')
21762176
self.assertEqual(idx.name, idx[1:].name)
21772177

2178+
def test_factorize(self):
2179+
idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
2180+
'2014-03', '2014-03'], freq='M')
2181+
2182+
exp_arr = np.array([0, 0, 1, 1, 2, 2])
2183+
exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
2184+
2185+
arr, idx = idx1.factorize()
2186+
self.assert_numpy_array_equal(arr, exp_arr)
2187+
self.assert_(idx.equals(exp_idx))
2188+
2189+
arr, idx = idx1.factorize(sort=True)
2190+
self.assert_numpy_array_equal(arr, exp_arr)
2191+
self.assert_(idx.equals(exp_idx))
2192+
2193+
idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
2194+
'2014-03', '2014-01'], freq='M')
2195+
2196+
exp_arr = np.array([2, 2, 1, 0, 2, 0])
2197+
arr, idx = idx2.factorize(sort=True)
2198+
self.assert_numpy_array_equal(arr, exp_arr)
2199+
self.assert_(idx.equals(exp_idx))
2200+
2201+
exp_arr = np.array([0, 0, 1, 2, 0, 2])
2202+
exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
2203+
arr, idx = idx2.factorize()
2204+
self.assert_numpy_array_equal(arr, exp_arr)
2205+
self.assert_(idx.equals(exp_idx))
2206+
21782207

21792208
def _permute(obj):
21802209
return obj.take(np.random.permutation(len(obj)))

pandas/tseries/tests/test_timeseries.py

+45
Original file line numberDiff line numberDiff line change
@@ -2189,6 +2189,51 @@ def test_join_with_period_index(self):
21892189
'PeriodIndex-ed objects'):
21902190
df.columns.join(s.index, how=join)
21912191

2192+
def test_factorize(self):
2193+
idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02',
2194+
'2014-02', '2014-03', '2014-03'])
2195+
2196+
exp_arr = np.array([0, 0, 1, 1, 2, 2])
2197+
exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
2198+
2199+
arr, idx = idx1.factorize()
2200+
self.assert_numpy_array_equal(arr, exp_arr)
2201+
self.assert_(idx.equals(exp_idx))
2202+
2203+
arr, idx = idx1.factorize(sort=True)
2204+
self.assert_numpy_array_equal(arr, exp_arr)
2205+
self.assert_(idx.equals(exp_idx))
2206+
2207+
# tz must be preserved
2208+
idx1 = idx1.tz_localize('Asia/Tokyo')
2209+
exp_idx = exp_idx.tz_localize('Asia/Tokyo')
2210+
2211+
arr, idx = idx1.factorize()
2212+
self.assert_numpy_array_equal(arr, exp_arr)
2213+
self.assert_(idx.equals(exp_idx))
2214+
2215+
idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01',
2216+
'2014-03', '2014-01'])
2217+
2218+
exp_arr = np.array([2, 2, 1, 0, 2, 0])
2219+
exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
2220+
arr, idx = idx2.factorize(sort=True)
2221+
self.assert_numpy_array_equal(arr, exp_arr)
2222+
self.assert_(idx.equals(exp_idx))
2223+
2224+
exp_arr = np.array([0, 0, 1, 2, 0, 2])
2225+
exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01'])
2226+
arr, idx = idx2.factorize()
2227+
self.assert_numpy_array_equal(arr, exp_arr)
2228+
self.assert_(idx.equals(exp_idx))
2229+
2230+
# freq must be preserved
2231+
idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo')
2232+
exp_arr = np.array([0, 1, 2, 3])
2233+
arr, idx = idx3.factorize()
2234+
self.assert_numpy_array_equal(arr, exp_arr)
2235+
self.assert_(idx.equals(idx3))
2236+
21922237

21932238
class TestDatetime64(tm.TestCase):
21942239
"""

0 commit comments

Comments
 (0)