ENH: Added str.normalize to use unicodedata.normalize

sinhrks · sinhrks · commit f4c167687386 · 2015-05-01T00:59:00.000+09:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -539,6 +539,7 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.lower
    Series.str.lstrip
    Series.str.match
+   Series.str.normalize
    Series.str.pad
    Series.str.repeat
    Series.str.replace
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -268,6 +268,7 @@ Method Summary
     :meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
     :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
     :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
+    :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize``
     :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum``
     :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha``
     :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit``
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -24,6 +24,8 @@ Enhancements
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
+- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`xxx`)
+
 - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 
   The ``.str`` accessor is now available for both ``Series`` and ``Index``.
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -517,9 +517,10 @@ def _make_str_accessor(self):
             raise AttributeError("Can only use .str accessor with string "
                                  "values, which use np.object_ dtype in "
                                  "pandas")
-        elif isinstance(self, Index) and self.inferred_type != 'string':
-            raise AttributeError("Can only use .str accessor with string "
-                                 "values (i.e. inferred_type is 'string')")
+        elif isinstance(self, Index) and self.inferred_type not in ('string', 'unicode'):
+            message = ("Can only use .str accessor with string " +
+                       "values (i.e. inferred_type is 'string' or 'unicode')")
+            raise AttributeError(message)
         return StringMethods(self)
 
     str = AccessorProperty(StringMethods, _make_str_accessor)
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -1161,6 +1161,24 @@ def rfind(self, sub, start=0, end=None):
         result = str_find(self.series, sub, start=start, end=end, side='right')
         return self._wrap_result(result)
 
+    def normalize(self, form):
+        """Return the Unicode normal form for the strings in the Series/Index.
+        Applies standard :func:`unicodedata.normalize` to each string.
+
+        Parameters
+        ----------
+        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
+            Unicode normalization form, passed to ``unicodedata.normalize``
+
+        Returns
+        -------
+        normalized : Series/Index of objects
+        """
+        import unicodedata
+        f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
+        result = _na_map(f, self.series)
+        return self._wrap_result(result)
+
     _shared_docs['len'] = ("""
     Compute length of each string in array.
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -685,6 +685,7 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.isdecimal())
         tm.assert_series_equal(empty_str, empty.str.capitalize())
         tm.assert_series_equal(empty_str, empty.str.swapcase())
+        tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
 
     def test_ismethods(self):
         values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', '  ']
@@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self):
 
         tm.assert_series_equal(result, exp)
 
+    def test_normalize(self):
+        def unistr(codes):
+            # build a unicode string by joining unichr() of each code point
+            # (six.u() cannot be used here because it escapes unicode)
+            return ''.join([unichr(c) for c in codes])
+
+        values = ['ABC', # ASCII
+                  unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ
+                  unistr([0xFF11, 0xFF12, 0xFF13]), # １２３
+                  np.nan,
+                  unistr([0xFF71, 0xFF72, 0xFF74])] # ｱｲｴ
+        s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
+
+        normed = [compat.u_safe('ABC'),
+                  compat.u_safe('ABC'),
+                  compat.u_safe('123'),
+                  np.nan,
+                  unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ
+        expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
+
+        result = s.str.normalize('NFKC')
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([compat.u_safe('ABC'),
+                           unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ
+                           unistr([0xFF11, 0xFF12, 0xFF13]), # １２３
+                           np.nan,
+                           unistr([0xFF71, 0xFF72, 0xFF74])], # ｱｲｴ
+                          index=['a', 'b', 'c', 'd', 'e'])
+
+        result = s.str.normalize('NFC')
+        tm.assert_series_equal(result, expected)
+
+        with tm.assertRaisesRegexp(ValueError, "invalid normalization form"):
+            s.str.normalize('xxx')
+
+        s = Index([unistr([0xFF21, 0xFF22, 0xFF23]),  # ＡＢＣ
+                   unistr([0xFF11, 0xFF12, 0xFF13]),  # １２３
+                   unistr([0xFF71, 0xFF72, 0xFF74])]) # ｱｲｴ
+        expected = Index([compat.u_safe('ABC'),
+                          compat.u_safe('123'),
+                          unistr([0x30A2, 0x30A4, 0x30A8])])
+        result = s.str.normalize('NFKC')
+        tm.assert_index_equal(result, expected)
+
     def test_cat_on_filtered_index(self):
         df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]],
                                                      names=['year', 'month']))