From c30eeb52193da00b5efcc32c9580be2dd6fd791e Mon Sep 17 00:00:00 2001 From: Hassan Shamim Date: Thu, 19 May 2016 14:13:04 -0700 Subject: [PATCH] BUG: GH13219 Fixed. Allow unicode values in usecols --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 9 +-- pandas/io/tests/parser/usecols.py | 106 +++++++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 33a48671a9b65..a36e9d5989b6b 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -372,3 +372,4 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 394fe1a98880a..d2c9efa4927f7 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -882,12 +882,13 @@ def _validate_usecols_arg(usecols): or strings (column by name). Raises a ValueError if that is not the case. 
""" + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + if usecols is not None: usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('integer', 'string'): - raise ValueError(("The elements of 'usecols' " - "must either be all strings " - "or all integers")) + if usecols_dtype not in ('integer', 'string', 'unicode'): + raise ValueError(msg) return usecols diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 06275c168becd..0d3ae95f0d1d4 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -6,6 +6,7 @@ """ from datetime import datetime +import nose import pandas.util.testing as tm @@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self): 1000,2000,3000 4000,5000,6000 """ - msg = ("The elements of \'usecols\' " - "must either be all strings " - "or all integers") + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") usecols = [0, 'b', 2] with tm.assertRaisesRegexp(ValueError, msg): @@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self): usecols=[3, 0, 2], parse_dates=parse_dates) tm.assert_frame_equal(df, expected) + + def test_usecols_with_unicode_strings(self): + # see gh-13219 + + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'AAA': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'BBB': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_single_byte_unicode_strings(self): + # see gh-13219 + + s = '''A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'A': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'B': {0: 8, 1: 2, 2: 7} + } + expected = 
DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'A', u'B']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_mixed_encoding_strings(self): + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) + + def test_usecols_with_multibyte_characters(self): + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=['あああ', 'いい']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_multibyte_unicode_characters(self): + raise nose.SkipTest('TODO: see gh-13253') + + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) + tm.assert_frame_equal(df, expected)