Skip to content

Commit f4c1676

Browse files
committed
ENH: Added str.normalize to use unicodedata.normalize
1 parent 845cec9 commit f4c1676

File tree

6 files changed

+72
-3
lines changed

6 files changed

+72
-3
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ strings and apply several methods to it. These can be accessed like
539539
Series.str.lower
540540
Series.str.lstrip
541541
Series.str.match
542+
Series.str.normalize
542543
Series.str.pad
543544
Series.str.repeat
544545
Series.str.replace

doc/source/text.rst

+1
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ Method Summary
268268
:meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
269269
:meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
270270
:meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
271+
:meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize``
271272
:meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum``
272273
:meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha``
273274
:meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit``

doc/source/whatsnew/v0.16.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ Enhancements
2424

2525
- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as standard ``str`` (:issue:`9766`)
2626
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
27+
- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`xxx`)
28+
2729
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
2830

2931
The ``.str`` accessor is now available for both ``Series`` and ``Index``.

pandas/core/base.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,10 @@ def _make_str_accessor(self):
517517
raise AttributeError("Can only use .str accessor with string "
518518
"values, which use np.object_ dtype in "
519519
"pandas")
520-
elif isinstance(self, Index) and self.inferred_type != 'string':
521-
raise AttributeError("Can only use .str accessor with string "
522-
"values (i.e. inferred_type is 'string')")
520+
elif isinstance(self, Index) and self.inferred_type not in ('string', 'unicode'):
521+
message = ("Can only use .str accessor with string " +
522+
"values (i.e. inferred_type is 'string' or 'unicode')")
523+
raise AttributeError(message)
523524
return StringMethods(self)
524525

525526
str = AccessorProperty(StringMethods, _make_str_accessor)

pandas/core/strings.py

+18
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,24 @@ def rfind(self, sub, start=0, end=None):
11611161
result = str_find(self.series, sub, start=start, end=end, side='right')
11621162
return self._wrap_result(result)
11631163

1164+
def normalize(self, form):
1165+
"""Return the Unicode normal form for the strings in the Series/Index.
1166+
Equivalent to standard :func:`unicodedata.normalize`.
1167+
1168+
Parameters
1169+
----------
1170+
form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
1171+
Unicode form
1172+
1173+
Returns
1174+
-------
1175+
normalized : Series/Index of objects
1176+
"""
1177+
import unicodedata
1178+
f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
1179+
result = _na_map(f, self.series)
1180+
return self._wrap_result(result)
1181+
11641182
_shared_docs['len'] = ("""
11651183
Compute length of each string in array.
11661184

pandas/tests/test_strings.py

+46
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,7 @@ def test_empty_str_methods(self):
685685
tm.assert_series_equal(empty_str, empty.str.isdecimal())
686686
tm.assert_series_equal(empty_str, empty.str.capitalize())
687687
tm.assert_series_equal(empty_str, empty.str.swapcase())
688+
tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
688689

689690
def test_ismethods(self):
690691
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
@@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self):
15491550

15501551
tm.assert_series_equal(result, exp)
15511552

1553+
def test_normalize(self):
1554+
def unistr(codes):
1555+
# build unicode string from unichr
1556+
# we cannot use six.u() here because it escapes unicode
1557+
return ''.join([unichr(c) for c in codes])
1558+
1559+
values = ['ABC', # ASCII
1560+
unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ (fullwidth Latin)
1561+
unistr([0xFF11, 0xFF12, 0xFF13]), # １２３ (fullwidth digits)
1562+
np.nan,
1563+
unistr([0xFF71, 0xFF72, 0xFF74])] # ｱｲｴ (halfwidth katakana)
1564+
s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
1565+
1566+
normed = [compat.u_safe('ABC'),
1567+
compat.u_safe('ABC'),
1568+
compat.u_safe('123'),
1569+
np.nan,
1570+
unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ
1571+
expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
1572+
1573+
result = s.str.normalize('NFKC')
1574+
tm.assert_series_equal(result, expected)
1575+
1576+
expected = Series([compat.u_safe('ABC'),
1577+
unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ (fullwidth Latin)
1578+
unistr([0xFF11, 0xFF12, 0xFF13]), # １２３ (fullwidth digits)
1579+
np.nan,
1580+
unistr([0xFF71, 0xFF72, 0xFF74])], # ｱｲｴ (halfwidth katakana)
1581+
index=['a', 'b', 'c', 'd', 'e'])
1582+
1583+
result = s.str.normalize('NFC')
1584+
tm.assert_series_equal(result, expected)
1585+
1586+
with tm.assertRaisesRegexp(ValueError, "invalid normalization form"):
1587+
s.str.normalize('xxx')
1588+
1589+
s = Index([unistr([0xFF21, 0xFF22, 0xFF23]), # ＡＢＣ (fullwidth Latin)
1590+
unistr([0xFF11, 0xFF12, 0xFF13]), # １２３ (fullwidth digits)
1591+
unistr([0xFF71, 0xFF72, 0xFF74])]) # ｱｲｴ (halfwidth katakana)
1592+
expected = Index([compat.u_safe('ABC'),
1593+
compat.u_safe('123'),
1594+
unistr([0x30A2, 0x30A4, 0x30A8])])
1595+
result = s.str.normalize('NFKC')
1596+
tm.assert_index_equal(result, expected)
1597+
15521598
def test_cat_on_filtered_index(self):
15531599
df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]],
15541600
names=['year', 'month']))

0 commit comments

Comments
 (0)