From 84afe26aa513c29f8511f82dc2dd1679cb3d9176 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 29 Apr 2015 11:06:15 +0900 Subject: [PATCH] ENH: Added str.normalize to use unicodedata.normalize --- doc/source/api.rst | 1 + doc/source/text.rst | 1 + doc/source/whatsnew/v0.16.1.txt | 2 + pandas/core/base.py | 13 ++++- pandas/core/strings.py | 19 +++++++ pandas/tests/test_strings.py | 97 +++++++++++++++++++++++++++++++++ 6 files changed, 130 insertions(+), 3 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d442d8631247c..20c76422ff924 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -540,6 +540,7 @@ strings and apply several methods to it. These can be acccessed like Series.str.lower Series.str.lstrip Series.str.match + Series.str.normalize Series.str.pad Series.str.repeat Series.str.replace diff --git a/doc/source/text.rst b/doc/source/text.rst index dea40fb48748d..359b6d61dbb64 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -268,6 +268,7 @@ Method Summary :meth:`~Series.str.rfind`,Equivalent to ``str.rfind`` :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. 
Equivalent to ``unicodedata.normalize`` :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index d0ffe37e20ddf..c429ea7186fcb 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -26,6 +26,8 @@ Enhancements - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`) - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) +- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`10031`) + - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). The ``.str`` accessor is now available for both ``Series`` and ``Index``. diff --git a/pandas/core/base.py b/pandas/core/base.py index 9c27f3c7a2cc3..2f171cdd6adf3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -517,9 +517,16 @@ def _make_str_accessor(self): raise AttributeError("Can only use .str accessor with string " "values, which use np.object_ dtype in " "pandas") - elif isinstance(self, Index) and self.inferred_type != 'string': - raise AttributeError("Can only use .str accessor with string " - "values (i.e. inferred_type is 'string')") + elif isinstance(self, Index): + # see src/inference.pyx which can contain string values + allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') + if self.inferred_type not in allowed_types: + message = ("Can only use .str accessor with string values " + "(i.e. 
inferred_type is 'string', 'unicode' or 'mixed')") + raise AttributeError(message) + if self.nlevels > 1: + message = "Can only use .str accessor with Index, not MultiIndex" + raise AttributeError(message) return StringMethods(self) str = AccessorProperty(StringMethods, _make_str_accessor) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3506338afd9d4..5cea4c4afe8cc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1206,6 +1206,25 @@ def rfind(self, sub, start=0, end=None): result = str_find(self.series, sub, start=start, end=end, side='right') return self._wrap_result(result) + def normalize(self, form): + """Return the Unicode normal form for the strings in the Series/Index. + For more information on the forms, see the + :func:`unicodedata.normalize`. + + Parameters + ---------- + form : {'NFC', 'NFKC', 'NFD', 'NFKD'} + Unicode form + + Returns + ------- + normalized : Series/Index of objects + """ + import unicodedata + f = lambda x: unicodedata.normalize(form, compat.u_safe(x)) + result = _na_map(f, self.series) + return self._wrap_result(result) + _shared_docs['len'] = (""" Compute length of each string in the Series/Index. 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9283be566bd8f..d3875f0675e9f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -685,6 +685,7 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) + tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) def test_ismethods(self): values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] @@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self): tm.assert_series_equal(result, exp) + def test_normalize(self): + def unistr(codes): + # build unicode string from unichr + # we cannot use six.u() here because it escapes unicode + return ''.join([unichr(c) for c in codes]) + + values = ['ABC', # ASCII + unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + np.nan, + unistr([0xFF71, 0xFF72, 0xFF74])] # アイエ + s = Series(values, index=['a', 'b', 'c', 'd', 'e']) + + normed = [compat.u_safe('ABC'), + compat.u_safe('ABC'), + compat.u_safe('123'), + np.nan, + unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ + expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) + + result = s.str.normalize('NFKC') + tm.assert_series_equal(result, expected) + + expected = Series([compat.u_safe('ABC'), + unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + np.nan, + unistr([0xFF71, 0xFF72, 0xFF74])], # アイエ + index=['a', 'b', 'c', 'd', 'e']) + + result = s.str.normalize('NFC') + tm.assert_series_equal(result, expected) + + with tm.assertRaisesRegexp(ValueError, "invalid normalization form"): + s.str.normalize('xxx') + + s = Index([unistr([0xFF21, 0xFF22, 0xFF23]), # ABC + unistr([0xFF11, 0xFF12, 0xFF13]), # 123 + unistr([0xFF71, 0xFF72, 0xFF74])]) # アイエ + expected = Index([compat.u_safe('ABC'), + compat.u_safe('123'), + unistr([0x30A2, 0x30A4, 0x30A8])]) + result = 
s.str.normalize('NFKC') + tm.assert_index_equal(result, expected) + def test_cat_on_filtered_index(self): df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]], names=['year', 'month'])) @@ -1567,6 +1613,57 @@ def test_cat_on_filtered_index(self): self.assertEqual(str_multiple.loc[1], '2011 2 2') + def test_index_str_accessor_visibility(self): + from pandas.core.strings import StringMethods + + if not compat.PY3: + cases = [(['a', 'b'], 'string'), + (['a', u('b')], 'mixed'), + ([u('a'), u('b')], 'unicode'), + (['a', 'b', 1], 'mixed-integer'), + (['a', 'b', 1.3], 'mixed'), + (['a', 'b', 1.3, 1], 'mixed-integer'), + (['aa', datetime(2011, 1, 1)], 'mixed')] + else: + cases = [(['a', 'b'], 'string'), + (['a', u('b')], 'string'), + ([u('a'), u('b')], 'string'), + (['a', 'b', 1], 'mixed-integer'), + (['a', 'b', 1.3], 'mixed'), + (['a', 'b', 1.3, 1], 'mixed-integer'), + (['aa', datetime(2011, 1, 1)], 'mixed')] + for values, tp in cases: + idx = Index(values) + self.assertTrue(isinstance(Series(values).str, StringMethods)) + self.assertTrue(isinstance(idx.str, StringMethods)) + self.assertEqual(idx.inferred_type, tp) + + for values, tp in cases: + idx = Index(values) + self.assertTrue(isinstance(Series(values).str, StringMethods)) + self.assertTrue(isinstance(idx.str, StringMethods)) + self.assertEqual(idx.inferred_type, tp) + + cases = [([1, np.nan], 'floating'), + ([datetime(2011, 1, 1)], 'datetime64'), + ([timedelta(1)], 'timedelta64')] + for values, tp in cases: + idx = Index(values) + message = 'Can only use .str accessor with string values' + with self.assertRaisesRegexp(AttributeError, message): + Series(values).str + with self.assertRaisesRegexp(AttributeError, message): + idx.str + self.assertEqual(idx.inferred_type, tp) + + # MultiIndex has mixed dtype, but not allow to use accessor + idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')]) + self.assertEqual(idx.inferred_type, 'mixed') + message = 'Can only use .str accessor with Index, not 
MultiIndex' + with self.assertRaisesRegexp(AttributeError, message): + idx.str + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)