Skip to content

Commit 27349b9

Browse files
committed
BUG: dataframe loading with duplicated columns and usecols pandas-dev#11823
1 parent bc3ad9a commit 27349b9

File tree

4 files changed

+23
-9
lines changed

4 files changed

+23
-9
lines changed

doc/source/whatsnew/v0.18.1.txt

+2
Original file line number | Diff line number | Diff line change
@@ -143,3 +143,5 @@ Bug Fixes
143143

144144

145145
- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
146+
147+
- Bug in ``read_csv`` when ``usecols`` is used together with duplicated column names (:issue:`11823`)

pandas/io/parsers.py

+3-7
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
"""
44
from __future__ import print_function
55
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
6-
from pandas import compat
6+
from pandas import compat, unique
77
from collections import defaultdict
88
import re
99
import csv
@@ -1788,12 +1788,8 @@ def _handle_usecols(self, columns, usecols_key):
17881788
if len(columns) > 1:
17891789
raise ValueError("If using multiple headers, usecols must "
17901790
"be integers.")
1791-
col_indices = []
1792-
for u in self.usecols:
1793-
if isinstance(u, string_types):
1794-
col_indices.append(usecols_key.index(u))
1795-
else:
1796-
col_indices.append(u)
1791+
col_indices = Index(usecols_key).get_indexer_for(
1792+
unique(self.usecols))
17971793
else:
17981794
col_indices = self.usecols
17991795

pandas/io/tests/test_parsers.py

+14
Original file line number | Diff line number | Diff line change
@@ -2342,6 +2342,20 @@ def test_usecols(self):
23422342
expected = expected[['a', 'b']]
23432343
tm.assert_frame_equal(result, expected)
23442344

2345+
# 11823: usecols vs no usecols
2346+
result = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
2347+
header=None, usecols=['a', 'a', 'b'])
2348+
expected = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
2349+
header=None)
2350+
tm.assert_frame_equal(result, expected)
2351+
2352+
# 11823: c vs python engine
2353+
result_c = pd.read_csv(StringIO(data), engine='c', header=None,
2354+
names=['a', 'a', 'b'], usecols=['a','a','b'])
2355+
result_py = pd.read_csv(StringIO(data), engine='python', header=None,
2356+
names=['a', 'a', 'b'], usecols=['a','a','b'])
2357+
tm.assert_frame_equal(result_c, result_py)
2358+
23452359
# length conflict, passed names and usecols disagree
23462360
self.assertRaises(ValueError, self.read_csv, StringIO(data),
23472361
names=['a', 'b'], usecols=[1], header=None)

pandas/parser.pyx

+4-2
Original file line number | Diff line number | Diff line change
@@ -280,7 +280,8 @@ cdef class TextReader:
280280
object compression
281281
object mangle_dupe_cols
282282
object tupleize_cols
283-
set noconvert, usecols
283+
set noconvert
284+
list usecols
284285

285286
def __cinit__(self, source,
286287
delimiter=b',',
@@ -409,7 +410,8 @@ cdef class TextReader:
409410
# suboptimal
410411
if usecols is not None:
411412
self.has_usecols = 1
412-
self.usecols = set(usecols)
413+
self.usecols = list(usecols)
414+
#self.usecols = set(usecols)
413415

414416
# XXX
415417
if skip_footer > 0:

0 commit comments

Comments
 (0)