-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Added str.normalize to use unicodedata.normalize #10031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -517,9 +517,16 @@ def _make_str_accessor(self): | |
raise AttributeError("Can only use .str accessor with string " | ||
"values, which use np.object_ dtype in " | ||
"pandas") | ||
elif isinstance(self, Index) and self.inferred_type != 'string': | ||
raise AttributeError("Can only use .str accessor with string " | ||
"values (i.e. inferred_type is 'string')") | ||
elif isinstance(self, Index): | ||
# see src/inference.pyx for the inferred types which can contain string values | ||
allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') | ||
if self.inferred_type not in allowed_types: | ||
message = ("Can only use .str accessor with string values " | ||
"(i.e. inferred_type is 'string', 'unicode' or 'mixed')") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you can accept
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback It is to be compat with current
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i c...ok |
||
raise AttributeError(message) | ||
if self.nlevels > 1: | ||
message = "Can only use .str accessor with Index, not MultiIndex" | ||
raise AttributeError(message) | ||
return StringMethods(self) | ||
|
||
str = AccessorProperty(StringMethods, _make_str_accessor) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1206,6 +1206,25 @@ def rfind(self, sub, start=0, end=None): | |
result = str_find(self.series, sub, start=start, end=end, side='right') | ||
return self._wrap_result(result) | ||
|
||
def normalize(self, form): | ||
"""Return the Unicode normal form for the strings in the Series/Index. | ||
For more information on the forms, see | ||
:func:`unicodedata.normalize`. | ||
|
||
Parameters | ||
---------- | ||
form : {'NFC', 'NFKC', 'NFD', 'NFKD'} | ||
Unicode normalization form passed to :func:`unicodedata.normalize` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add something like: "For more information on the forms, see the :func: |
||
|
||
Returns | ||
------- | ||
normalized : Series/Index of objects | ||
""" | ||
import unicodedata | ||
# NOTE(review): compat.u_safe presumably coerces each value to unicode first -- confirm | ||
f = lambda x: unicodedata.normalize(form, compat.u_safe(x)) | ||
# NOTE(review): _na_map appears to pass missing values through (test_normalize expects NaN to stay NaN) | ||
result = _na_map(f, self.series) | ||
return self._wrap_result(result) | ||
|
||
_shared_docs['len'] = (""" | ||
Compute length of each string in the Series/Index. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -685,6 +685,7 @@ def test_empty_str_methods(self): | |
tm.assert_series_equal(empty_str, empty.str.isdecimal()) | ||
tm.assert_series_equal(empty_str, empty.str.capitalize()) | ||
tm.assert_series_equal(empty_str, empty.str.swapcase()) | ||
tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) | ||
|
||
def test_ismethods(self): | ||
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] | ||
|
@@ -1549,6 +1550,51 @@ def test_encode_decode_errors(self): | |
|
||
tm.assert_series_equal(result, exp) | ||
|
||
def test_normalize(self): | ||
# Exercise Series.str.normalize / Index.str.normalize with NFKC, NFC and an invalid form | ||
def unistr(codes): | ||
# build a unicode string from a sequence of code points via unichr | ||
# we cannot use six.u() here because it escapes unicode | ||
return ''.join([unichr(c) for c in codes]) | ||
|
||
values = ['ABC', # ASCII | ||
unistr([0xFF21, 0xFF22, 0xFF23]), # fullwidth "ABC" (U+FF21..U+FF23) | ||
unistr([0xFF11, 0xFF12, 0xFF13]), # fullwidth "123" (U+FF11..U+FF13) | ||
np.nan, | ||
unistr([0xFF71, 0xFF72, 0xFF74])] # halfwidth katakana a, i, e (U+FF71, U+FF72, U+FF74) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know to what extent we want to really have such unicode characters in our source files? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an alternative expression of normal unicode string, such as "u'ABC'" to work both on 2.x and 3.x. Unable to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once we remove Python 3.2 support we can finally use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @shoyer 0.17? see #9118, maybe we should just decide when the 'finally' will be @sinhrks sorry to be unclear, I just meant the unicode in the comment (what the unistr forms). I know our source files are unicode (or at least this one is), but I was just wondering to what extent we should also really use such characters (eg for when people with older or misconfigured editors looking at this file). But probably not a big deal |
||
s = Series(values, index=['a', 'b', 'c', 'd', 'e']) | ||
|
||
# expected NFKC result: fullwidth/halfwidth forms become their ordinary counterparts, NaN stays NaN | ||
normed = [compat.u_safe('ABC'), | ||
compat.u_safe('ABC'), | ||
compat.u_safe('123'), | ||
np.nan, | ||
unistr([0x30A2, 0x30A4, 0x30A8])] # ordinary katakana a, i, e (U+30A2, U+30A4, U+30A8) | ||
expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) | ||
|
||
result = s.str.normalize('NFKC') | ||
tm.assert_series_equal(result, expected) | ||
|
||
# expected NFC result: the fullwidth/halfwidth inputs come back unchanged | ||
expected = Series([compat.u_safe('ABC'), | ||
unistr([0xFF21, 0xFF22, 0xFF23]), # fullwidth "ABC" (U+FF21..U+FF23) | ||
unistr([0xFF11, 0xFF12, 0xFF13]), # fullwidth "123" (U+FF11..U+FF13) | ||
np.nan, | ||
unistr([0xFF71, 0xFF72, 0xFF74])], # halfwidth katakana a, i, e (U+FF71, U+FF72, U+FF74) | ||
index=['a', 'b', 'c', 'd', 'e']) | ||
|
||
result = s.str.normalize('NFC') | ||
tm.assert_series_equal(result, expected) | ||
|
||
# an unknown form name must raise ValueError | ||
with tm.assertRaisesRegexp(ValueError, "invalid normalization form"): | ||
s.str.normalize('xxx') | ||
|
||
# the same NFKC normalization through the Index accessor | ||
s = Index([unistr([0xFF21, 0xFF22, 0xFF23]), # fullwidth "ABC" (U+FF21..U+FF23) | ||
unistr([0xFF11, 0xFF12, 0xFF13]), # fullwidth "123" (U+FF11..U+FF13) | ||
unistr([0xFF71, 0xFF72, 0xFF74])]) # halfwidth katakana a, i, e (U+FF71, U+FF72, U+FF74) | ||
expected = Index([compat.u_safe('ABC'), | ||
compat.u_safe('123'), | ||
unistr([0x30A2, 0x30A4, 0x30A8])]) | ||
result = s.str.normalize('NFKC') | ||
tm.assert_index_equal(result, expected) | ||
|
||
def test_cat_on_filtered_index(self): | ||
df = DataFrame(index=MultiIndex.from_product([[2011, 2012], [1,2,3]], | ||
names=['year', 'month'])) | ||
|
@@ -1567,6 +1613,57 @@ def test_cat_on_filtered_index(self): | |
self.assertEqual(str_multiple.loc[1], '2011 2 2') | ||
|
||
|
||
def test_index_str_accessor_visibility(self):
    """Check which Index/Series contents expose the ``.str`` accessor.

    Values whose Index ``inferred_type`` is 'string', 'unicode', 'mixed'
    or 'mixed-integer' must yield a ``StringMethods`` accessor on both
    Series and Index.  Float, datetime and timedelta values must raise
    ``AttributeError`` on both, and a MultiIndex must raise
    ``AttributeError`` with its own dedicated message.
    """
    from pandas.core.strings import StringMethods

    # The expected inferred_type for str/unicode mixtures differs between
    # Python 2 and Python 3, so the positive cases are listed per version.
    if not compat.PY3:
        cases = [(['a', 'b'], 'string'),
                 (['a', u('b')], 'mixed'),
                 ([u('a'), u('b')], 'unicode'),
                 (['a', 'b', 1], 'mixed-integer'),
                 (['a', 'b', 1.3], 'mixed'),
                 (['a', 'b', 1.3, 1], 'mixed-integer'),
                 (['aa', datetime(2011, 1, 1)], 'mixed')]
    else:
        cases = [(['a', 'b'], 'string'),
                 (['a', u('b')], 'string'),
                 ([u('a'), u('b')], 'string'),
                 (['a', 'b', 1], 'mixed-integer'),
                 (['a', 'b', 1.3], 'mixed'),
                 (['a', 'b', 1.3, 1], 'mixed-integer'),
                 (['aa', datetime(2011, 1, 1)], 'mixed')]

    # Positive cases: the accessor is available on both Series and Index.
    for values, tp in cases:
        idx = Index(values)
        self.assertTrue(isinstance(Series(values).str, StringMethods))
        self.assertTrue(isinstance(idx.str, StringMethods))
        self.assertEqual(idx.inferred_type, tp)

    # Negative cases: non-string values must not expose the accessor.
    cases = [([1, np.nan], 'floating'),
             ([datetime(2011, 1, 1)], 'datetime64'),
             ([timedelta(1)], 'timedelta64')]
    for values, tp in cases:
        idx = Index(values)
        message = 'Can only use .str accessor with string values'
        with self.assertRaisesRegexp(AttributeError, message):
            Series(values).str
        with self.assertRaisesRegexp(AttributeError, message):
            idx.str
        self.assertEqual(idx.inferred_type, tp)

    # A MultiIndex has 'mixed' inferred_type, but is not allowed to use
    # the accessor.
    idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
    self.assertEqual(idx.inferred_type, 'mixed')
    message = 'Can only use .str accessor with Index, not MultiIndex'
    with self.assertRaisesRegexp(AttributeError, message):
        idx.str
|
||
|
||
# Allow running this test module directly through nose's runmodule entry point | ||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
exit=False) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor typo, should be "which behaves the same as"