Skip to content

Commit 59f0ee7

Browse files
committed
ENH: add docs and add match function to API, close #502
1 parent 7353202 commit 59f0ee7

File tree

5 files changed

+63
-7
lines changed

5 files changed

+63
-7
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pandas 0.8.0
3838
- Add support for indexes (dates or otherwise) with duplicates and common
3939
sense indexing/selection functionality
4040
- Series/DataFrame.update methods, in-place variant of combine_first (#961)
41+
- Add ``match`` function to API (#502)
4142

4243
**Improvements to existing features**
4344

pandas/core/algorithms.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -8,29 +8,45 @@
88
import pandas.core.common as com
99
import pandas._tseries as lib
1010

11-
def match(values, index):
11+
def match(to_match, values, na_sentinel=-1):
    """
    Compute the position in `values` of each element of `to_match`

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
        One entry per element of `to_match`: its position in `values`, or
        `na_sentinel` when the element does not occur in `values`

    Examples
    --------
    >>> match([0, 1, 2, 2, 0, 1, 3, 0], [0, 2, 1])
    array([ 0,  2,  1,  1,  0,  2, -1,  0])
    """
    values = np.asarray(values)
    # String-typed arrays are re-boxed as object arrays before dispatching on
    # the dtype (presumably so they take the generic object code path --
    # confirm against _hashtable_algo).
    if issubclass(values.dtype.type, basestring):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype)

    # The lookup marks misses with -1 regardless of na_sentinel, so honour a
    # non-default sentinel here. Valid positions are always >= 0, which makes
    # -1 an unambiguous "not found" marker to remap.
    if na_sentinel != -1:
        result = np.where(result == -1, na_sentinel, result)
    return result
2437

2538
def unique(values):
    """
    Find the distinct values of an input array efficiently; the result is
    not necessarily sorted

    Parameters
    ----------
    values : array-like
        Must expose a ``dtype`` attribute, which selects the implementation

    Returns
    -------
    uniques
    """
    def _run(htype, caster):
        # Bind the input so the dispatcher only has to supply the hash table
        # type and the caster matching the dtype.
        return _unique_generic(values, htype, caster)

    return _hashtable_algo(_run, values.dtype)
@@ -98,7 +114,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
98114
labels, counts = table.get_labels(values, uniques, 0, na_sentinel)
99115

100116
labels = com._ensure_platform_int(labels)
101-
117+
102118
uniques = com._asarray_tuplesafe(uniques)
103119
if sort and len(counts) > 0:
104120
sorter = uniques.argsort()

pandas/core/api.py

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import numpy as np
55

6+
from pandas.core.algorithms import factorize, match, unique
7+
68
from pandas.core.common import isnull, notnull, save, load
79
from pandas.core.factor import Factor
810
from pandas.core.format import set_printoptions

pandas/tests/test_algos.py

+25
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,31 @@
22

33
import numpy as np
44

5+
56
import pandas.core.algorithms as algos
67
import pandas.util.testing as tm
78

9+
10+
class TestMatch(unittest.TestCase):
    """Check algos.match against hand-computed positions."""

    def test_ints(self):
        # 3 does not occur in `values`, so it maps to the default sentinel -1
        values = np.array([0, 2, 1])
        to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])

        result = algos.match(to_match, values)
        expected = np.array([0, 2, 1, 1, 0, 2, -1, 0])
        self.assertTrue(np.array_equal(result, expected))

    def test_strings(self):
        # Plain Python lists of strings are accepted; 'qux' is not in
        # `values`, so it maps to the default sentinel -1
        values = ['foo', 'bar', 'baz']
        to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']

        result = algos.match(to_match, values)
        expected = np.array([1, 0, -1, 0, 1, 2, -1])
        self.assertTrue(np.array_equal(result, expected))
27+
28+
if __name__ == '__main__':
    # Allow running this test module directly as a script via nose.
    # Flags: -vvs = verbose output without capturing stdout, -x = stop at the
    # first error/failure, --pdb / --pdb-failure = drop into the debugger on
    # errors and on assertion failures. exit=False keeps nose from calling
    # sys.exit when the run finishes.
    import nose
    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
                   exit=False)
32+

vb_suite/miscellaneous.py

+12
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,15 @@ def prop(self):
2020
misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly",
2121
ncalls=2000000)
2222

23+
#----------------------------------------------------------------------
24+
# match
25+
26+
# Setup code executed before timing: builds an object array of 1000 strings
# from rands(10) (presumably random 10-character strings -- confirm in
# pandas.util.testing) and a 10000-element array in which each of those
# strings is repeated 10 times.
# NOTE(review): the name `all` shadows the builtin inside the benchmark
# namespace; it is left unchanged because the setup and statement strings are
# runtime text and renaming would alter the recorded benchmark code.
setup = common_setup + """
from pandas.util.testing import rands

uniques = np.array([rands(10) for _ in xrange(1000)], dtype='O')
all = uniques.repeat(10)
"""

# Times matching the repeated strings against the array they were drawn from.
# `match` is not imported here, so it must come from common_setup.
match_strings = Benchmark("match(all, uniques)", setup,
                          start_date=datetime(2012, 5, 12))

0 commit comments

Comments
 (0)