BUG: GH13219 Fixed. Allow unicode values in usecols

hassanshamim · jreback · commit fcd73ad2e748 · 2016-06-01T07:11:55.000-04:00
closes pandas-dev#13219 closes pandas-dev#13233
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -338,18 +338,19 @@ Bug Fixes
 - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`)
 - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`)
 - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
-- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
 
 
 
 
-- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
 - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
 
 
 
+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
+- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
 - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
 - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`)
+- Bug in ``pd.read_csv()`` that prevented the ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
 
 
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -882,12 +882,13 @@ def _validate_usecols_arg(usecols):
     or strings (column by name). Raises a ValueError
     if that is not the case.
     """
+    msg = ("The elements of 'usecols' must "
+           "either be all strings, all unicode, or all integers")
+
     if usecols is not None:
         usecols_dtype = lib.infer_dtype(usecols)
-        if usecols_dtype not in ('integer', 'string'):
-            raise ValueError(("The elements of 'usecols' "
-                              "must either be all strings "
-                              "or all integers"))
+        if usecols_dtype not in ('integer', 'string', 'unicode'):
+            raise ValueError(msg)
 
     return usecols
 
diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -6,6 +6,7 @@
 """
 
 from datetime import datetime
+import nose
 
 import pandas.util.testing as tm
 
@@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self):
         1000,2000,3000
         4000,5000,6000
         """
-        msg = ("The elements of \'usecols\' "
-               "must either be all strings "
-               "or all integers")
+        msg = ("The elements of 'usecols' must "
+               "either be all strings, all unicode, or all integers")
         usecols = [0, 'b', 2]
 
         with tm.assertRaisesRegexp(ValueError, msg):
@@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
                            usecols=[3, 0, 2],
                            parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_unicode_strings(self):
+        # see gh-13219
+
+        s = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        data = {
+            'AAA': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'BBB': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_single_byte_unicode_strings(self):
+        # see gh-13219
+
+        s = '''A,B,C,D
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        data = {
+            'A': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'B': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_mixed_encoding_strings(self):
+        s = '''AAA,BBB,CCC,DDD
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        msg = ("The elements of 'usecols' must "
+               "either be all strings, all unicode, or all integers")
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
+
+    def test_usecols_with_multibyte_characters(self):
+        s = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        data = {
+            'あああ': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'いい': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
+        tm.assert_frame_equal(df, expected)
+
+    def test_usecols_with_multibyte_unicode_characters(self):
+        raise nose.SkipTest('TODO: see gh-13253')
+
+        s = '''あああ,いい,ううう,ええええ
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+        data = {
+            'あああ': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'いい': {0: 8, 1: 2, 2: 7}
+        }
+        expected = DataFrame(data)
+
+        df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
+        tm.assert_frame_equal(df, expected)