Skip to content

Commit 61334b8

Browse files
chmullig authored and wesm committed
ENH: add encoding/decoding error handling
When encoding or decoding strings, you can now pass an error-handling string via the new `errors` argument. This works the same as error handling for other encode/decode functions: it defaults to 'strict', but you can pass 'ignore', 'replace', etc. Extends work done in #1706.
1 parent bdbca8e commit 61334b8

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

pandas/core/strings.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -519,35 +519,37 @@ def str_get(arr, i):
519519
return _na_map(f, arr)
520520

521521

522-
def str_decode(arr, encoding):
522+
def str_decode(arr, encoding, errors="strict"):
523523
"""
524524
Decode character string to unicode using indicated encoding
525525
526526
Parameters
527527
----------
528528
encoding : string
529+
errors : string
529530
530531
Returns
531532
-------
532533
decoded : array
533534
"""
534-
f = lambda x: x.decode(encoding)
535+
f = lambda x: x.decode(encoding, errors)
535536
return _na_map(f, arr)
536537

537538

538-
def str_encode(arr, encoding):
539+
def str_encode(arr, encoding, errors="strict"):
539540
"""
540-
Encode character string to unicode using indicated encoding
541+
Encode character string to some other encoding using indicated encoding
541542
542543
Parameters
543544
----------
544545
encoding : string
546+
errors : string
545547
546548
Returns
547549
-------
548550
encoded : array
549551
"""
550-
f = lambda x: x.encode(encoding)
552+
f = lambda x: x.encode(encoding, errors)
551553
return _na_map(f, arr)
552554

553555

@@ -675,13 +677,13 @@ def slice_replace(self, i=None, j=None):
675677
raise NotImplementedError
676678

677679
@copy(str_decode)
678-
def decode(self, encoding):
679-
result = str_decode(self.series, encoding)
680+
def decode(self, encoding, errors="strict"):
681+
result = str_decode(self.series, encoding, errors)
680682
return self._wrap_result(result)
681683

682684
@copy(str_encode)
683-
def encode(self, encoding):
684-
result = str_encode(self.series, encoding)
685+
def encode(self, encoding, errors="strict"):
686+
result = str_encode(self.series, encoding, errors)
685687
return self._wrap_result(result)
686688

687689
count = _pat_wrapper(str_count, flags=True)

pandas/tests/test_strings.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,7 @@ def test_match_findall_flags(self):
690690
self.assertEquals(result[0], True)
691691

692692
def test_encode_decode(self):
693-
base = Series([u'a', u'b', u'\xe4'])
693+
base = Series([u'a', u'b', u'a\xe4'])
694694
series = base.str.encode('utf-8')
695695

696696
f = lambda x: x.decode('utf-8')
@@ -699,6 +699,25 @@ def test_encode_decode(self):
699699

700700
tm.assert_series_equal(result, exp)
701701

702+
def test_encode_decode_errors(self):
703+
encodeBase = Series([u'a', u'b', u'a\x9d'])
704+
with self.assertRaises(UnicodeEncodeError):
705+
encodeBase.str.encode('cp1252')
706+
707+
f = lambda x: x.encode('cp1252', 'ignore')
708+
result = encodeBase.str.encode('cp1252', 'ignore')
709+
exp = encodeBase.map(f)
710+
tm.assert_series_equal(result, exp)
711+
712+
decodeBase = Series(['a', 'b', 'a\x9d'])
713+
with self.assertRaises(UnicodeDecodeError):
714+
decodeBase.str.encode('cp1252')
715+
f = lambda x: x.decode('cp1252', 'ignore')
716+
result = decodeBase.str.decode('cp1252', 'ignore')
717+
exp = decodeBase.map(f)
718+
719+
tm.assert_series_equal(result, exp)
720+
702721
if __name__ == '__main__':
703722
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
704723
exit=False)

0 commit comments

Comments
 (0)