ENH: add docs and add match function to API, close #502

wesm · wesm · commit 59f0ee735638 · 2012-05-12T14:30:24.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -38,6 +38,7 @@ pandas 0.8.0
   - Add support for indexes (dates or otherwise) with duplicates and common
     sense indexing/selection functionality
   - Series/DataFrame.update methods, in-place variant of combine_first (#961)
+  - Add ``match`` function to API (#502)
 
 **Improvements to existing features**
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -8,29 +8,45 @@
 import pandas.core.common as com
 import pandas._tseries as lib
 
-def match(values, index):
+def match(to_match, values, na_sentinel=-1):
     """
-
+    Compute the integer position within values of each element of to_match
 
     Parameters
     ----------
+    to_match : array-like
+        values to find positions of
+    values : array-like
+        Unique set of values
+    na_sentinel : int, default -1
+        Value to mark "not found"; NOTE(review): never read in the function body
+
+    Examples
+    --------
 
     Returns
     -------
-    match : ndarray
+    match : ndarray of integers
     """
-    f = lambda htype, caster: _match_generic(values, index, htype, caster)
-    return _hashtable_algo(f, index.dtype)
+    values = np.asarray(values)
+    if issubclass(values.dtype.type, basestring):
+        values = np.array(values, dtype='O')
+
+    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
+    return _hashtable_algo(f, values.dtype)
 
 def unique(values):
     """
+    Compute unique values (not necessarily sorted) efficiently from input array
+    of values
 
     Parameters
     ----------
+    values : array-like
 
     Returns
     -------
-
+    uniques
     """
     f = lambda htype, caster: _unique_generic(values, htype, caster)
     return _hashtable_algo(f, values.dtype)
@@ -98,7 +114,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     labels, counts = table.get_labels(values, uniques, 0, na_sentinel)
 
     labels = com._ensure_platform_int(labels)
-    
+
     uniques = com._asarray_tuplesafe(uniques)
     if sort and len(counts) > 0:
         sorter = uniques.argsort()
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+from pandas.core.algorithms import factorize, match, unique
+
 from pandas.core.common import isnull, notnull, save, load
 from pandas.core.factor import Factor
 from pandas.core.format import set_printoptions
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -2,6 +2,31 @@
 
 import numpy as np
 
+
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
 
+
+class TestMatch(unittest.TestCase):
+
+    def test_ints(self):
+        values = np.array([0, 2, 1])
+        to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])
+
+        result = algos.match(to_match, values)
+        expected = np.array([0, 2, 1, 1, 0, 2, -1, 0])
+        self.assert_(np.array_equal(result, expected))
+
+    def test_strings(self):
+        values = ['foo', 'bar', 'baz']
+        to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']
+
+        result = algos.match(to_match, values)
+        expected = np.array([1, 0, -1, 0, 1, 2, -1])
+        self.assert_(np.array_equal(result, expected))
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
+
diff --git a/vb_suite/miscellaneous.py b/vb_suite/miscellaneous.py
@@ -20,3 +20,15 @@ def prop(self):
 misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly",
                                 ncalls=2000000)
 
+#----------------------------------------------------------------------
+# match
+
+setup = common_setup + """
+from pandas.util.testing import rands
+
+uniques = np.array([rands(10) for _ in xrange(1000)], dtype='O')
+all = uniques.repeat(10)
+"""
+
+match_strings = Benchmark("match(all, uniques)", setup,
+                          start_date=datetime(2012, 5, 12))