BUG: fix read_csv c engine to accept unicode aliases for encoding (pandas-dev#14060)

nateGeorge · jorisvandenbossche · commit 6645b2b11a82 · 2016-08-22T22:47:18.000+02:00
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -1095,3 +1095,5 @@ Bug Fixes
 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
 - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
 - Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment.
+
+- Bug in ``read_csv()`` where aliases for the ``utf-xx`` encodings (e.g. ``UTF-xx``, ``UTF_xx``, ``utf_xx``) raised ``UnicodeDecodeError`` (:issue:`13549`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -343,6 +343,9 @@ def _validate_nrows(nrows):
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
     encoding = kwds.get('encoding', None)
+    if encoding is not None:
+        encoding = re.sub('_', '-', encoding).lower()
+        kwds['encoding'] = encoding
 
     # If the input could be a filename, check for a recognizable compression
     # extension.  If we're reading from a URL, the `get_filepath_or_buffer`
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1583,3 +1583,13 @@ def test_temporary_file(self):
         new_file.close()
         expected = DataFrame([[0, 0]])
         tm.assert_frame_equal(result, expected)
+
+    def test_read_csv_utf_aliases(self):
+        # GH 13549: utf-8/utf-16 spelled with '-' or '_', lower or upper case
+        expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']})
+        for byte in [8, 16]:
+            for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']:
+                encoding = fmt.format(byte)
+                data = 'mb_num,multibyte\n4.8,test'.encode(encoding)
+                result = self.read_csv(BytesIO(data), encoding=encoding)
+                tm.assert_frame_equal(result, expected)