Skip to content

Commit 84afe26

Browse files
committed
ENH: Added str.normalize to use unicodedata.normalize
1 parent b7c3271 commit 84afe26

File tree

6 files changed

+130
-3
lines changed

6 files changed

+130
-3
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ strings and apply several methods to it. These can be accessed like
540540
Series.str.lower
541541
Series.str.lstrip
542542
Series.str.match
543+
Series.str.normalize
543544
Series.str.pad
544545
Series.str.repeat
545546
Series.str.replace

doc/source/text.rst

+1
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ Method Summary
268268
:meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
269269
:meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
270270
:meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
271+
:meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize``
271272
:meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum``
272273
:meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha``
273274
:meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit``

doc/source/whatsnew/v0.16.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Enhancements
2626
- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave the same as standard ``str`` (:issue:`9766`)
2727
- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
2828
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
29+
- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`10031`)
30+
2931
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
3032

3133
The ``.str`` accessor is now available for both ``Series`` and ``Index``.

pandas/core/base.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,16 @@ def _make_str_accessor(self):
517517
raise AttributeError("Can only use .str accessor with string "
518518
"values, which use np.object_ dtype in "
519519
"pandas")
520-
elif isinstance(self, Index) and self.inferred_type != 'string':
521-
raise AttributeError("Can only use .str accessor with string "
522-
"values (i.e. inferred_type is 'string')")
520+
elif isinstance(self, Index):
521+
# see src/inference.pyx for the inferred types which can contain string values
522+
allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
523+
if self.inferred_type not in allowed_types:
524+
message = ("Can only use .str accessor with string values "
525+
"(i.e. inferred_type is 'string', 'unicode' or 'mixed')")
526+
raise AttributeError(message)
527+
if self.nlevels > 1:
528+
message = "Can only use .str accessor with Index, not MultiIndex"
529+
raise AttributeError(message)
523530
return StringMethods(self)
524531

525532
str = AccessorProperty(StringMethods, _make_str_accessor)

pandas/core/strings.py

+19
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,25 @@ def rfind(self, sub, start=0, end=None):
12061206
result = str_find(self.series, sub, start=start, end=end, side='right')
12071207
return self._wrap_result(result)
12081208

1209+
def normalize(self, form):
1210+
"""Return the Unicode normal form for the strings in the Series/Index.
1211+
For more information on the forms, see
1212+
:func:`unicodedata.normalize`.
1213+
1214+
Parameters
1215+
----------
1216+
form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
1217+
Unicode form
1218+
1219+
Returns
1220+
-------
1221+
normalized : Series/Index of objects
1222+
"""
1223+
import unicodedata
1224+
f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
1225+
result = _na_map(f, self.series)
1226+
return self._wrap_result(result)
1227+
12091228
_shared_docs['len'] = ("""
12101229
Compute length of each string in the Series/Index.
12111230

pandas/tests/test_strings.py

+97
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,7 @@ def test_empty_str_methods(self):
685685
tm.assert_series_equal(empty_str, empty.str.isdecimal())
686686
tm.assert_series_equal(empty_str, empty.str.capitalize())
687687
tm.assert_series_equal(empty_str, empty.str.swapcase())
688+
tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
688689

689690
def test_ismethods(self):
690691
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
@@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self):
15491550

15501551
tm.assert_series_equal(result, exp)
15511552

1553+
def test_normalize(self):
1554+
def unistr(codes):
1555+
# build unicode string from unichr
1556+
# we cannot use six.u() here because it escapes unicode
1557+
return ''.join([unichr(c) for c in codes])
1558+
1559+
values = ['ABC', # ASCII
1560+
unistr([0xFF21, 0xFF22, 0xFF23]), # ABC
1561+
unistr([0xFF11, 0xFF12, 0xFF13]), # 123
1562+
np.nan,
1563+
unistr([0xFF71, 0xFF72, 0xFF74])] # アイエ
1564+
s = Series(values, index=['a', 'b', 'c', 'd', 'e'])
1565+
1566+
normed = [compat.u_safe('ABC'),
1567+
compat.u_safe('ABC'),
1568+
compat.u_safe('123'),
1569+
np.nan,
1570+
unistr([0x30A2, 0x30A4, 0x30A8])] # アイエ
1571+
expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
1572+
1573+
result = s.str.normalize('NFKC')
1574+
tm.assert_series_equal(result, expected)
1575+
1576+
expected = Series([compat.u_safe('ABC'),
1577+
unistr([0xFF21, 0xFF22, 0xFF23]), # ABC
1578+
unistr([0xFF11, 0xFF12, 0xFF13]), # 123
1579+
np.nan,
1580+
unistr([0xFF71, 0xFF72, 0xFF74])], # アイエ
1581+
index=['a', 'b', 'c', 'd', 'e'])
1582+
1583+
result = s.str.normalize('NFC')
1584+
tm.assert_series_equal(result, expected)
1585+
1586+
with tm.assertRaisesRegexp(ValueError, "invalid normalization form"):
1587+
s.str.normalize('xxx')
1588+
1589+
s = Index([unistr([0xFF21, 0xFF22, 0xFF23]), # ABC
1590+
unistr([0xFF11, 0xFF12, 0xFF13]), # 123
1591+
unistr([0xFF71, 0xFF72, 0xFF74])]) # アイエ
1592+
expected = Index([compat.u_safe('ABC'),
1593+
compat.u_safe('123'),
1594+
unistr([0x30A2, 0x30A4, 0x30A8])])
1595+
result = s.str.normalize('NFKC')
1596+
tm.assert_index_equal(result, expected)
1597+
15521598
def test_cat_on_filtered_index(self):
15531599
df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]],
15541600
names=['year', 'month']))
@@ -1567,6 +1613,57 @@ def test_cat_on_filtered_index(self):
15671613
self.assertEqual(str_multiple.loc[1], '2011 2 2')
15681614

15691615

1616+
def test_index_str_accessor_visibility(self):
1617+
from pandas.core.strings import StringMethods
1618+
1619+
if not compat.PY3:
1620+
cases = [(['a', 'b'], 'string'),
1621+
(['a', u('b')], 'mixed'),
1622+
([u('a'), u('b')], 'unicode'),
1623+
(['a', 'b', 1], 'mixed-integer'),
1624+
(['a', 'b', 1.3], 'mixed'),
1625+
(['a', 'b', 1.3, 1], 'mixed-integer'),
1626+
(['aa', datetime(2011, 1, 1)], 'mixed')]
1627+
else:
1628+
cases = [(['a', 'b'], 'string'),
1629+
(['a', u('b')], 'string'),
1630+
([u('a'), u('b')], 'string'),
1631+
(['a', 'b', 1], 'mixed-integer'),
1632+
(['a', 'b', 1.3], 'mixed'),
1633+
(['a', 'b', 1.3, 1], 'mixed-integer'),
1634+
(['aa', datetime(2011, 1, 1)], 'mixed')]
1635+
for values, tp in cases:
1636+
idx = Index(values)
1637+
self.assertTrue(isinstance(Series(values).str, StringMethods))
1638+
self.assertTrue(isinstance(idx.str, StringMethods))
1639+
self.assertEqual(idx.inferred_type, tp)
1640+
1641+
for values, tp in cases:
1642+
idx = Index(values)
1643+
self.assertTrue(isinstance(Series(values).str, StringMethods))
1644+
self.assertTrue(isinstance(idx.str, StringMethods))
1645+
self.assertEqual(idx.inferred_type, tp)
1646+
1647+
cases = [([1, np.nan], 'floating'),
1648+
([datetime(2011, 1, 1)], 'datetime64'),
1649+
([timedelta(1)], 'timedelta64')]
1650+
for values, tp in cases:
1651+
idx = Index(values)
1652+
message = 'Can only use .str accessor with string values'
1653+
with self.assertRaisesRegexp(AttributeError, message):
1654+
Series(values).str
1655+
with self.assertRaisesRegexp(AttributeError, message):
1656+
idx.str
1657+
self.assertEqual(idx.inferred_type, tp)
1658+
1659+
# MultiIndex has mixed dtype, but is not allowed to use the accessor
1660+
idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
1661+
self.assertEqual(idx.inferred_type, 'mixed')
1662+
message = 'Can only use .str accessor with Index, not MultiIndex'
1663+
with self.assertRaisesRegexp(AttributeError, message):
1664+
idx.str
1665+
1666+
15701667
if __name__ == '__main__':
15711668
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
15721669
exit=False)

0 commit comments

Comments
 (0)