Skip to content

Commit fcd73ad

Browse files
hassanshamim authored and jreback committed
BUG: GH13219 Fixed. Allow unicode values in usecols
closes pandas-dev#13219 closes pandas-dev#13233
1 parent 45bab82 commit fcd73ad

File tree

3 files changed

+111
-9
lines changed

3 files changed

+111
-9
lines changed

doc/source/whatsnew/v0.18.2.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -338,18 +338,19 @@ Bug Fixes
338338
- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`)
339339
- Bug in ``.resample(..)`` with a ``PeriodIndex`` not appropriately retaining its type or name with an empty ``DataFrame`` (:issue:`13212`)
340340
- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
341-
- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
342341

343342

344343

345344

346-
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
347345
- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
348346

349347

350348

349+
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
350+
- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
351351
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
352352
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`)
353+
- Bug in ``pd.read_csv()`` in which the ``usecols`` kwarg did not accept single-byte unicode strings (:issue:`13219`)
353354

354355

355356

pandas/io/parsers.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -882,12 +882,13 @@ def _validate_usecols_arg(usecols):
882882
or strings (column by name). Raises a ValueError
883883
if that is not the case.
884884
"""
885+
msg = ("The elements of 'usecols' must "
886+
"either be all strings, all unicode, or all integers")
887+
885888
if usecols is not None:
886889
usecols_dtype = lib.infer_dtype(usecols)
887-
if usecols_dtype not in ('integer', 'string'):
888-
raise ValueError(("The elements of 'usecols' "
889-
"must either be all strings "
890-
"or all integers"))
890+
if usecols_dtype not in ('integer', 'string', 'unicode'):
891+
raise ValueError(msg)
891892

892893
return usecols
893894

pandas/io/tests/parser/usecols.py

+103-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""
77

88
from datetime import datetime
9+
import nose
910

1011
import pandas.util.testing as tm
1112

@@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self):
2223
1000,2000,3000
2324
4000,5000,6000
2425
"""
25-
msg = ("The elements of \'usecols\' "
26-
"must either be all strings "
27-
"or all integers")
26+
msg = ("The elements of 'usecols' must "
27+
"either be all strings, all unicode, or all integers")
2828
usecols = [0, 'b', 2]
2929

3030
with tm.assertRaisesRegexp(ValueError, msg):
@@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self):
254254
usecols=[3, 0, 2],
255255
parse_dates=parse_dates)
256256
tm.assert_frame_equal(df, expected)
257+
258+
def test_usecols_with_unicode_strings(self):
259+
# see gh-13219
260+
261+
s = '''AAA,BBB,CCC,DDD
262+
0.056674973,8,True,a
263+
2.613230982,2,False,b
264+
3.568935038,7,False,a
265+
'''
266+
267+
data = {
268+
'AAA': {
269+
0: 0.056674972999999997,
270+
1: 2.6132309819999997,
271+
2: 3.5689350380000002
272+
},
273+
'BBB': {0: 8, 1: 2, 2: 7}
274+
}
275+
expected = DataFrame(data)
276+
277+
df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
278+
tm.assert_frame_equal(df, expected)
279+
280+
def test_usecols_with_single_byte_unicode_strings(self):
281+
# see gh-13219
282+
283+
s = '''A,B,C,D
284+
0.056674973,8,True,a
285+
2.613230982,2,False,b
286+
3.568935038,7,False,a
287+
'''
288+
289+
data = {
290+
'A': {
291+
0: 0.056674972999999997,
292+
1: 2.6132309819999997,
293+
2: 3.5689350380000002
294+
},
295+
'B': {0: 8, 1: 2, 2: 7}
296+
}
297+
expected = DataFrame(data)
298+
299+
df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
300+
tm.assert_frame_equal(df, expected)
301+
302+
def test_usecols_with_mixed_encoding_strings(self):
303+
s = '''AAA,BBB,CCC,DDD
304+
0.056674973,8,True,a
305+
2.613230982,2,False,b
306+
3.568935038,7,False,a
307+
'''
308+
309+
msg = ("The elements of 'usecols' must "
310+
"either be all strings, all unicode, or all integers")
311+
312+
with tm.assertRaisesRegexp(ValueError, msg):
313+
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
314+
315+
with tm.assertRaisesRegexp(ValueError, msg):
316+
self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])
317+
318+
def test_usecols_with_multibyte_characters(self):
319+
s = '''あああ,いい,ううう,ええええ
320+
0.056674973,8,True,a
321+
2.613230982,2,False,b
322+
3.568935038,7,False,a
323+
'''
324+
data = {
325+
'あああ': {
326+
0: 0.056674972999999997,
327+
1: 2.6132309819999997,
328+
2: 3.5689350380000002
329+
},
330+
'いい': {0: 8, 1: 2, 2: 7}
331+
}
332+
expected = DataFrame(data)
333+
334+
df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
335+
tm.assert_frame_equal(df, expected)
336+
337+
def test_usecols_with_multibyte_unicode_characters(self):
338+
raise nose.SkipTest('TODO: see gh-13253')
339+
340+
s = '''あああ,いい,ううう,ええええ
341+
0.056674973,8,True,a
342+
2.613230982,2,False,b
343+
3.568935038,7,False,a
344+
'''
345+
data = {
346+
'あああ': {
347+
0: 0.056674972999999997,
348+
1: 2.6132309819999997,
349+
2: 3.5689350380000002
350+
},
351+
'いい': {0: 8, 1: 2, 2: 7}
352+
}
353+
expected = DataFrame(data)
354+
355+
df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
356+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)