BUG: dataframe loading with duplicated columns and usecols pandas-dev#11823

sxwang · sxwang · commit 27349b9d74bd · 2016-03-19T20:19:47.000-07:00
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -143,3 +143,5 @@ Bug Fixes
 
 
 - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
+
+- Bug in ``read_csv`` when ``usecols`` is specified together with duplicated column names (:issue:`11823`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -3,7 +3,7 @@
 """
 from __future__ import print_function
 from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
-from pandas import compat
+from pandas import compat, unique
 from collections import defaultdict
 import re
 import csv
@@ -1788,12 +1788,8 @@ def _handle_usecols(self, columns, usecols_key):
                 if len(columns) > 1:
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
-                col_indices = []
-                for u in self.usecols:
-                    if isinstance(u, string_types):
-                        col_indices.append(usecols_key.index(u))
-                    else:
-                        col_indices.append(u)
+                col_indices = Index(usecols_key).get_indexer_for(
+                    unique(self.usecols))
             else:
                 col_indices = self.usecols
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2342,6 +2342,20 @@ def test_usecols(self):
         expected = expected[['a', 'b']]
         tm.assert_frame_equal(result, expected)
 
+        # 11823: usecols vs no usecols
+        result = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
+                               header=None, usecols=['a', 'a', 'b'])
+        expected = self.read_csv(StringIO(data), names=['a', 'a', 'b'],
+                                 header=None)
+        tm.assert_frame_equal(result, expected)
+        
+        # 11823: c vs python engine
+        result_c = pd.read_csv(StringIO(data), engine='c', header=None, 
+                            names=['a', 'a', 'b'], usecols=['a','a','b'])
+        result_py = pd.read_csv(StringIO(data), engine='python', header=None, 
+                            names=['a', 'a', 'b'], usecols=['a','a','b'])
+        tm.assert_frame_equal(result_c, result_py)
+
         # length conflict, passed names and usecols disagree
         self.assertRaises(ValueError, self.read_csv, StringIO(data),
                           names=['a', 'b'], usecols=[1], header=None)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -280,7 +280,8 @@ cdef class TextReader:
         object compression
         object mangle_dupe_cols
         object tupleize_cols
-        set noconvert, usecols
+        set noconvert 
+        list usecols
 
     def __cinit__(self, source,
                   delimiter=b',',
@@ -409,7 +410,8 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            self.usecols = set(usecols)
+            self.usecols = list(usecols)
+            #self.usecols = set(usecols)
 
         # XXX
         if skip_footer > 0:

Original file line number	Diff line number	Diff line change
`@@ -143,3 +143,5 @@ Bug Fixes`
`143`	`143`
`144`	`144`
`145`	`145`	- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
	`146`	`+`
	`147`	+- Bug in ``read_csv`` when ``usecols`` is specified together with duplicated column names (:issue:`11823`)