ENH: Added str.normalize to use unicodedata.normalize

sinhrks · sinhrks · commit 84afe26aa513 · 2015-05-04T13:16:58.000+09:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -540,6 +540,7 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.lower
    Series.str.lstrip
    Series.str.match
+   Series.str.normalize
    Series.str.pad
    Series.str.repeat
    Series.str.replace
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -268,6 +268,7 @@ Method Summary
     :meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
     :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
     :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
+    :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize``
     :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum``
     :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha``
     :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit``
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -26,6 +26,8 @@ Enhancements
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
+- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalizes` (:issue:`10031`)
+
 - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 
   The ``.str`` accessor is now available for both ``Series`` and ``Index``.
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -517,9 +517,16 @@ def _make_str_accessor(self):
             raise AttributeError("Can only use .str accessor with string "
                                  "values, which use np.object_ dtype in "
                                  "pandas")
-        elif isinstance(self, Index) and self.inferred_type != 'string':
-            raise AttributeError("Can only use .str accessor with string "
-                                 "values (i.e. inferred_type is 'string')")
+        elif isinstance(self, Index):
+            # see scc/inferrence.pyx which can contain string values
+            allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
+            if self.inferred_type not in allowed_types:
+                message = ("Can only use .str accessor with string values "
+                           "(i.e. inferred_type is 'string', 'unicode' or 'mixed')")
+                raise AttributeError(message)
+            if self.nlevels > 1:
+                message = "Can only use .str accessor with Index, not MultiIndex"
+                raise AttributeError(message)
         return StringMethods(self)
 
     str = AccessorProperty(StringMethods, _make_str_accessor)
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -1206,6 +1206,25 @@ def rfind(self, sub, start=0, end=None):
         result = str_find(self.series, sub, start=start, end=end, side='right')
         return self._wrap_result(result)
 
+    def normalize(self, form):
+        """Return the Unicode normal form for the strings in the Series/Index.
+        For more information on the forms, see the
+        :func:`unicodedata.normalize`.
+
+        Parameters
+        ----------
+        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
+            Unicode form
+
+        Returns
+        -------
+        normalized : Series/Index of objects
+        """
+        import unicodedata
+        f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
+        result = _na_map(f, self.series)
+        return self._wrap_result(result)
+
     _shared_docs['len'] = ("""
     Compute length of each string in the Series/Index.
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -685,6 +685,7 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.isdecimal())
         tm.assert_series_equal(empty_str, empty.str.capitalize())
         tm.assert_series_equal(empty_str, empty.str.swapcase())
+        tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
 
     def test_ismethods(self):
         values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', '  ']
@@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self):
 
         tm.assert_series_equal(result, exp)
 
+    def test_normalize(self):
+        def unistr(codes):
+            # build unicode string from unichr
+            # we cannot use six.u() here because it escapes unicode
+            return ''.join([unichr(c) for c in codes])
+
+        values = ['ABC', # ASCII
+                  unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ
+                  unistr([0xFF11, 0xFF12, 0xFF13]), # １２３
+                  np.nan,
+                  unistr([0xFF71, 0xFF72, 0xFF74])] # ｱｲｴ
+        s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
+
+        normed = [compat.u_safe('ABC'),
+                  compat.u_safe('ABC'),
+                  compat.u_safe('123'),
+                  np.nan,
+                  unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ
+        expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
+
+        result = s.str.normalize('NFKC')
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([compat.u_safe('ABC'),
+                           unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ
+                           unistr([0xFF11, 0xFF12, 0xFF13]), # １２３
+                           np.nan,
+                           unistr([0xFF71, 0xFF72, 0xFF74])], # ｱｲｴ
+                          index=['a', 'b', 'c', 'd', 'e'])
+
+        result = s.str.normalize('NFC')
+        tm.assert_series_equal(result, expected)
+
+        with tm.assertRaisesRegexp(ValueError, "invalid normalization form"):
+            s.str.normalize('xxx')
+
+        s = Index([unistr([0xFF21, 0xFF22, 0xFF23]),  # ＡＢＣ
+                   unistr([0xFF11, 0xFF12, 0xFF13]),  # １２３
+                   unistr([0xFF71, 0xFF72, 0xFF74])]) # ｱｲｴ
+        expected = Index([compat.u_safe('ABC'),
+                          compat.u_safe('123'),
+                          unistr([0x30A2, 0x30A4, 0x30A8])])
+        result = s.str.normalize('NFKC')
+        tm.assert_index_equal(result, expected)
+
     def test_cat_on_filtered_index(self):
         df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]],
                                                      names=['year', 'month']))
@@ -1567,6 +1613,57 @@ def test_cat_on_filtered_index(self):
         self.assertEqual(str_multiple.loc[1], '2011 2 2')
 
 
+    def test_index_str_accessor_visibility(self):
+        from pandas.core.strings import StringMethods
+
+        if not compat.PY3:
+            cases = [(['a', 'b'], 'string'),
+                     (['a', u('b')], 'mixed'),
+                     ([u('a'), u('b')], 'unicode'),
+                     (['a', 'b', 1], 'mixed-integer'),
+                     (['a', 'b', 1.3], 'mixed'),
+                     (['a', 'b', 1.3, 1], 'mixed-integer'),
+                     (['aa', datetime(2011, 1, 1)], 'mixed')]
+        else:
+            cases = [(['a', 'b'], 'string'),
+                     (['a', u('b')], 'string'),
+                     ([u('a'), u('b')], 'string'),
+                     (['a', 'b', 1], 'mixed-integer'),
+                     (['a', 'b', 1.3], 'mixed'),
+                     (['a', 'b', 1.3, 1], 'mixed-integer'),
+                     (['aa', datetime(2011, 1, 1)], 'mixed')]
+        for values, tp in cases:
+            idx = Index(values)
+            self.assertTrue(isinstance(Series(values).str, StringMethods))
+            self.assertTrue(isinstance(idx.str, StringMethods))
+            self.assertEqual(idx.inferred_type, tp)
+
+        for values, tp in cases:
+            idx = Index(values)
+            self.assertTrue(isinstance(Series(values).str, StringMethods))
+            self.assertTrue(isinstance(idx.str, StringMethods))
+            self.assertEqual(idx.inferred_type, tp)
+
+        cases = [([1, np.nan], 'floating'),
+                 ([datetime(2011, 1, 1)], 'datetime64'),
+                 ([timedelta(1)], 'timedelta64')]
+        for values, tp in cases:
+            idx = Index(values)
+            message = 'Can only use .str accessor with string values'
+            with self.assertRaisesRegexp(AttributeError, message):
+                Series(values).str
+            with self.assertRaisesRegexp(AttributeError, message):
+                idx.str
+            self.assertEqual(idx.inferred_type, tp)
+
+        # MultiIndex has mixed dtype, but not allow to use accessor
+        idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
+        self.assertEqual(idx.inferred_type, 'mixed')
+        message = 'Can only use .str accessor with Index, not MultiIndex'
+        with self.assertRaisesRegexp(AttributeError, message):
+            idx.str
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)