diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cc3cc631b9575..89a74ab00048d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1075,3 +1075,5 @@ Bug Fixes - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. + +- Bug in ``read_csv()``, where aliases for ``utf-xx`` (e.g. ``UTF-xx``, ``UTF_xx``, ``utf_xx``) raised ``UnicodeDecodeError`` (:issue:`13549`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5372203318d69..c2117e206564c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -350,6 +350,9 @@ def _validate_nrows(nrows): def _read(filepath_or_buffer, kwds): "Generic reader of line files." encoding = kwds.get('encoding', None) + if encoding is not None: + encoding = re.sub('_', '-', encoding).lower() + kwds['encoding'] = encoding # If the input could be a filename, check for a recognizable compression # extension. 
If we're reading from a URL, the `get_filepath_or_buffer` diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 96eb0ec6fd7a2..5972569cf020b 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -8,6 +8,7 @@ import re import sys from datetime import datetime +from io import BytesIO import nose import numpy as np @@ -1583,3 +1584,13 @@ def test_temporary_file(self): new_file.close() expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) + + def test_read_csv_utf_aliases(self): + # see gh issue 13549 + expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']}) + for byte in [8, 16]: + for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']: + encoding = fmt.format(byte) + data = 'mb_num,multibyte\n4.8,test'.encode(encoding) + result = self.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected)