BUG: read_csv throws UnicodeDecodeError with unicode aliases #13571


Closed
Changes from 10 commits
62 commits
d485c4a
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 5, 2016
ae62350
BUG: `read_csv` throws UnicodeDecodeError with unicode
nateGeorge Jul 6, 2016
36bcdd8
Merge branch 'master' of github.com:pydata/pandas into fix/read_csv-u…
nateGeorge Jul 6, 2016
285ccf9
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
173c38b
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
78d46d6
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
35dfb13
chore: matched master
nateGeorge Jul 12, 2016
71f084e
DOC: add pd.read_csv bug #13549
nateGeorge Jul 12, 2016
da8fce4
TST: out-> result and tm.ensure_clean
nateGeorge Jul 12, 2016
1825486
TST: conform to PEP8
nateGeorge Jul 12, 2016
1d30333
TST: condense test_read_utf_aliases test
nateGeorge Jul 12, 2016
4f680d7
Merge branch 'master' of github.com:pydata/pandas into fix/read_csv-u…
nateGeorge Jul 12, 2016
b582195
Merge branch 'master' of github.com:pydata/pandas into fix/read_csv-u…
nateGeorge Jul 13, 2016
e26c92a
CLN: remove unnecessary BytesIO import
nateGeorge Jul 13, 2016
d14b69e
CLN: remove unnecessary csv write line
nateGeorge Jul 13, 2016
eeb7011
Merge branch 'master' of github.com:pydata/pandas into fix/read_csv-u…
nateGeorge Jul 13, 2016
b8d78c4
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 5, 2016
75869f4
BUG: `read_csv` throws UnicodeDecodeError with unicode
nateGeorge Jul 6, 2016
9c88919
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
6725536
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
671ad41
BUG: `read_csv` throws UnicodeDecodeError with unicode aliases
nateGeorge Jul 6, 2016
3c4a798
BUG: Groupby.nth includes group key inconsistently #12839
adneu Jul 6, 2016
5675b82
In gbq, use googleapiclient instead of apiclient #13454 (#13458)
parthea Jul 7, 2016
ff6117e
RLS: switch master from 0.18.2 to 0.19.0 (#13586)
jorisvandenbossche Jul 8, 2016
b983957
BUG: Datetime64Formatter not respecting ``formatter``
haleemur Jul 8, 2016
451c054
BUG: Fix TimeDelta to Timedelta (#13600)
yui-knk Jul 9, 2016
33278a9
COMPAT: 32-bit compat fixes mainly in testing
jreback Jul 7, 2016
181cecd
BUG: DatetimeIndex - Period shows ununderstandable error
sinhrks Jul 10, 2016
a2e5d54
ENH: add downcast to pd.to_numeric
gfyoung Jul 10, 2016
6c8b21b
CLN: remove radd workaround in ops.py
sinhrks Jul 10, 2016
5d99cff
DEPR: rename Timestamp.offset to .freq
sinhrks Jul 10, 2016
8e7904f
CLN: Remove the engine parameter in CSVFormatter and to_csv
gfyoung Jun 10, 2016
a07b5d3
BUG: Block/DTI doesnt handle tzlocal properly
sinhrks Jul 10, 2016
ff2a335
BUG: Series contains NaT with object dtype comparison incorrect (#13592)
sinhrks Jul 11, 2016
1f8cc7f
CLN/TST: Add tests for nan/nat mixed input (#13477)
sinhrks Jul 11, 2016
f743eb3
BUG: groupby apply on selected columns yielding scalar (GH13568) (#13…
jorisvandenbossche Jul 11, 2016
e161699
TST: Clean up tests of DataFrame.sort_{index,values} (#13496)
IamJeffG Jul 11, 2016
5765b92
DOC: add pd.read_csv bug #13549
nateGeorge Jul 12, 2016
ac18b36
TST: out-> result and tm.ensure_clean
nateGeorge Jul 12, 2016
1fc6b90
TST: conform to PEP8
nateGeorge Jul 12, 2016
6b0e2ca
TST: condense test_read_utf_aliases test
nateGeorge Jul 12, 2016
41a6fae
DOC: asfreq clarify original NaNs are not filled (GH9963) (#13617)
jorisvandenbossche Jul 12, 2016
f730e60
BUG: Invalid Timedelta op may raise ValueError
sinhrks Jul 12, 2016
05a2d04
CLN: Cleanup ops.py
sinhrks Jul 12, 2016
c4e93bd
CLN: Removed outtype in DataFrame.to_dict (#13627)
gfyoung Jul 12, 2016
430273d
CLN: Fix compile time warnings
yui-knk Jul 13, 2016
1fa91b9
CLN: remove unnecessary BytesIO import
nateGeorge Jul 13, 2016
e379e9f
CLN: remove unnecessary csv write line
nateGeorge Jul 13, 2016
a35521e
Pin IPython for doc build to 4.x (see #13639)
jorisvandenbossche Jul 13, 2016
6c09821
CLN: reorg type inference & introspection
jreback Jul 13, 2016
5584dff
BLD: included pandas.api.* in setup.py (#13640)
gfyoung Jul 13, 2016
9463dee
docs: add note about read_csv() bug
nateGeorge Aug 15, 2016
5198179
cln: trying to merge with master
nateGeorge Aug 15, 2016
3c30cd0
CLN: merge with master
nateGeorge Aug 15, 2016
e77ac2d
Merge branch 'fix/read_csv-utf-aliases' of github.com:nateGeorge/pand…
nateGeorge Aug 19, 2016
69ab536
CLN: reset to master branch
nateGeorge Aug 19, 2016
1eb478d
Merge branch 'master' of github.com:pydata/pandas into fix/read_csv-u…
nateGeorge Aug 19, 2016
a2f178f
CLN: fix small diff from upstream/master
nateGeorge Aug 19, 2016
8e05f7e
BUG: _read encoding fix
nateGeorge Aug 19, 2016
ab153d5
DOC: add note on read_csv bug
nateGeorge Aug 19, 2016
0c1de9f
TST: add test for read_csv with unicode bug
nateGeorge Aug 19, 2016
77ec966
CLN: fix indents and spacings
nateGeorge Aug 19, 2016
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -549,3 +549,5 @@ Bug Fixes
- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)

- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`)

- Bug in ``pd.read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised ``UnicodeDecodeError`` (:issue:`13549`)
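For context on the whatsnew entry above: Python's own codec registry already treats these spellings as aliases of one codec, which is why normalizing them before the parser compares encoding names is safe. A stdlib-only sketch (no pandas required) showing each alias resolving to the canonical name:

```python
import codecs

# Each alias spelling should resolve to the same canonical codec name.
for byte in (8, 16):
    canonical = 'utf-' + str(byte)
    for alias in ('utf-%d' % byte, 'utf_%d' % byte,
                  'UTF-%d' % byte, 'UTF_%d' % byte):
        assert codecs.lookup(alias).name == canonical
```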
4 changes: 4 additions & 0 deletions pandas/io/parsers.py
@@ -339,6 +339,10 @@ def _validate_nrows(nrows):
def _read(filepath_or_buffer, kwds):
"Generic reader of line files."
encoding = kwds.get('encoding', None)
if encoding is not None:
encoding = re.sub('_', '-', encoding).lower()
kwds['encoding'] = encoding

skipfooter = kwds.pop('skipfooter', None)
if skipfooter is not None:
kwds['skip_footer'] = skipfooter
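The two lines added to `_read` above can be exercised in isolation. This is a minimal sketch of the same normalization; the helper name is illustrative, not part of the patch:

```python
import re

def normalize_encoding(encoding):
    # Same transformation the patch applies in _read():
    # replace underscores with hyphens, then lowercase.
    if encoding is not None:
        encoding = re.sub('_', '-', encoding).lower()
    return encoding

assert normalize_encoding('UTF_16') == 'utf-16'
assert normalize_encoding('utf_8') == 'utf-8'
assert normalize_encoding(None) is None
```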
78 changes: 49 additions & 29 deletions pandas/io/tests/parser/common.py
@@ -1339,8 +1339,8 @@ def test_compact_ints_use_unsigned(self):
'b': np.array([9], dtype=np.int64),
'c': np.array([258], dtype=np.int64),
})
out = self.read_csv(StringIO(data))
tm.assert_frame_equal(out, expected)
result = self.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)

expected = DataFrame({
'a': np.array([1], dtype=np.int8),
@@ -1351,14 +1351,14 @@ def test_compact_ints_use_unsigned(self):
# default behaviour for 'use_unsigned'
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
out = self.read_csv(StringIO(data), compact_ints=True)
tm.assert_frame_equal(out, expected)
result = self.read_csv(StringIO(data), compact_ints=True)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
out = self.read_csv(StringIO(data), compact_ints=True,
use_unsigned=False)
tm.assert_frame_equal(out, expected)
result = self.read_csv(StringIO(data), compact_ints=True,
use_unsigned=False)
tm.assert_frame_equal(result, expected)

expected = DataFrame({
'a': np.array([1], dtype=np.uint8),
@@ -1368,9 +1368,9 @@

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
out = self.read_csv(StringIO(data), compact_ints=True,
use_unsigned=True)
tm.assert_frame_equal(out, expected)
result = self.read_csv(StringIO(data), compact_ints=True,
use_unsigned=True)
tm.assert_frame_equal(result, expected)

def test_compact_ints_as_recarray(self):
data = ('0,1,0,0\n'
@@ -1399,27 +1399,28 @@ def test_as_recarray(self):
data = 'a,b\n1,a\n2,b'
expected = np.array([(1, 'a'), (2, 'b')],
dtype=[('a', '<i8'), ('b', 'O')])
out = self.read_csv(StringIO(data), as_recarray=True)
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(StringIO(data), as_recarray=True)
tm.assert_numpy_array_equal(result, expected)

# index_col ignored
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
data = 'a,b\n1,a\n2,b'
expected = np.array([(1, 'a'), (2, 'b')],
dtype=[('a', '<i8'), ('b', 'O')])
out = self.read_csv(StringIO(data), as_recarray=True, index_col=0)
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(
StringIO(data), as_recarray=True, index_col=0)
tm.assert_numpy_array_equal(result, expected)

# respects names
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
data = '1,a\n2,b'
expected = np.array([(1, 'a'), (2, 'b')],
dtype=[('a', '<i8'), ('b', 'O')])
out = self.read_csv(StringIO(data), names=['a', 'b'],
header=None, as_recarray=True)
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(StringIO(data), names=['a', 'b'],
header=None, as_recarray=True)
tm.assert_numpy_array_equal(result, expected)

# header order is respected even though it conflicts
# with the natural ordering of the column names
@@ -1428,16 +1429,17 @@ def test_as_recarray(self):
data = 'b,a\n1,a\n2,b'
expected = np.array([(1, 'a'), (2, 'b')],
dtype=[('b', '<i8'), ('a', 'O')])
out = self.read_csv(StringIO(data), as_recarray=True)
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(StringIO(data), as_recarray=True)
tm.assert_numpy_array_equal(result, expected)

# overrides the squeeze parameter
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
data = 'a\n1'
expected = np.array([(1,)], dtype=[('a', '<i8')])
out = self.read_csv(StringIO(data), as_recarray=True, squeeze=True)
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(
StringIO(data), as_recarray=True, squeeze=True)
tm.assert_numpy_array_equal(result, expected)

# does data conversions before doing recarray conversion
with tm.assert_produces_warning(
@@ -1446,18 +1448,18 @@ def test_as_recarray(self):
conv = lambda x: int(x) + 1
expected = np.array([(2, 'a'), (3, 'b')],
dtype=[('a', '<i8'), ('b', 'O')])
out = self.read_csv(StringIO(data), as_recarray=True,
converters={'a': conv})
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(StringIO(data), as_recarray=True,
converters={'a': conv})
tm.assert_numpy_array_equal(result, expected)

# filters by usecols before doing recarray conversion
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
data = 'a,b\n1,a\n2,b'
expected = np.array([(1,), (2,)], dtype=[('a', '<i8')])
out = self.read_csv(StringIO(data), as_recarray=True,
usecols=['a'])
tm.assert_numpy_array_equal(out, expected)
result = self.read_csv(StringIO(data), as_recarray=True,
usecols=['a'])
tm.assert_numpy_array_equal(result, expected)

def test_memory_map(self):
mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
@@ -1467,5 +1469,23 @@ def test_memory_map(self):
'c': ['I', 'II', 'III']
})

out = self.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(out, expected)
result = self.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(result, expected)

def test_read_csv_utf_aliases(self):
# see gh issue 13549
path = 'test.csv'
Review comment (Contributor):

Use the context manager

    with tm.ensure_clean(path) as path:

and remove the os.remove(..) call.

expected = DataFrame({'A': [0, 1], 'B': [2, 3],
Review comment (@gfyoung, Member, Jul 12, 2016):

1. We like to have tests that are as compact as possible. Do we really need to have this many rows for this test? Can we get away with just one? This becomes pertinent for my next point:

2. To make these tests as unit-like as possible, we would prefer NOT to use to_csv (if possible) and follow the StringIO(data) paradigm. I believe that is possible here because you can encode strings as utf-8 or utf-16.

Reply (Contributor Author):

I suppose we could do one row as

    expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']})

I used BytesIO because I don't think StringIO can support different encodings (I tried and wasn't able to get StringIO to work).

'multibyte_test': ['testing123', 'bananabis'],
'mb_nums': [154.868, 457.8798]})
with tm.ensure_clean(path) as path:
for byte in [8, 16]:
expected.to_csv(path, encoding='utf-' + str(byte), index=False)
for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']:
encoding = fmt.format(byte)
for engine in ['c', 'python', None]:
Review comment (@gfyoung, Member, Jul 12, 2016):

This is not necessary (nor is the engine keyword). The test suite will cover both engines (and the default case is not needed here). That's why it's self.read_csv and not read_csv. You are in fact running the test TWICE for each engine the way you have written it. You just need to write:

    # 'path' can most likely be changed as I referenced above
    result = self.read_csv(path, encoding=encoding)
    tm.assert_frame_equal(result, expected)

Reply (Contributor Author):

Alright.

result = self.read_csv(
path,
engine=engine,
encoding=encoding)
tm.assert_frame_equal(result, expected)
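On the StringIO-versus-BytesIO point raised in the review: StringIO holds already-decoded text and cannot carry a specific byte encoding, while BytesIO holds raw bytes and can. A stdlib-only sketch (pandas not required) round-tripping CSV text through both encodings the test covers:

```python
import io

data = u'mb_num,multibyte\n4.8,test\n'
for enc in ('utf-8', 'utf-16'):
    # Encode to bytes as a file written with this encoding would be,
    # then read back through BytesIO and decode.
    buf = io.BytesIO(data.encode(enc))
    assert buf.read().decode(enc) == data
```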