Skip to content

Commit 4bef0e0

Browse files
committed
BUG: wrong index name during read_csv if using usecols
Closes #4201. If the user passes usecols but not names, ensure that the inferred names refer to the used columns, not to all of the document's columns.
1 parent 2b5e525 commit 4bef0e0

File tree

3 files changed

+42
-4
lines changed

3 files changed

+42
-4
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ Bug Fixes
480480
- Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
481481
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
482482
- Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
483+
- Fixed wrong index name during read_csv if using usecols. Applies to the C parser only. (:issue:`4201`)
483484

484485
pandas 0.12.0
485486
-------------

pandas/io/parsers.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Module contains tools for processing files into DataFrames or other objects
33
"""
44
from __future__ import print_function
5-
from pandas.compat import range, lrange, StringIO, lzip, zip
5+
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types
66
from pandas import compat
77
import re
88
import csv
@@ -15,7 +15,6 @@
1515
import datetime
1616
import pandas.core.common as com
1717
from pandas.core.config import get_option
18-
from pandas import compat
1918
from pandas.io.date_converters import generic_parser
2019
from pandas.io.common import get_filepath_or_buffer
2120

@@ -24,7 +23,7 @@
2423
import pandas.lib as lib
2524
import pandas.tslib as tslib
2625
import pandas.parser as _parser
27-
from pandas.tseries.period import Period
26+
2827

2928
_parser_params = """Also supports optionally iterating or breaking of the file
3029
into chunks.
@@ -982,7 +981,19 @@ def __init__(self, src, **kwds):
982981
else:
983982
self.names = lrange(self._reader.table_width)
984983

985-
# XXX
984+
# If the names were inferred (not passed by user) and usecols is defined,
985+
# then ensure names refers to the used columns, not the document's columns.
986+
if self.usecols and passed_names:
987+
col_indices = []
988+
for u in self.usecols:
989+
if isinstance(u, string_types):
990+
col_indices.append(self.names.index(u))
991+
else:
992+
col_indices.append(u)
993+
self.names = [n for i, n in enumerate(self.names) if i in col_indices]
994+
if len(self.names) < len(self.usecols):
995+
raise ValueError("Usecols do not match names.")
996+
986997
self._set_noconvert_columns()
987998

988999
self.orig_names = self.names

pandas/io/tests/test_parsers.py

+26
Original file line numberDiff line numberDiff line change
@@ -1865,6 +1865,32 @@ def test_parse_integers_above_fp_precision(self):
18651865

18661866
self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers']))
18671867

1868+
def test_usecols_index_col_conflict(self):
1869+
# Issue 4201: test that index_col (given as an integer position or as a name) refers to the columns selected by usecols
1870+
data = """SecId,Time,Price,P2,P3
1871+
10000,2013-5-11,100,10,1
1872+
500,2013-5-12,101,11,1
1873+
"""
1874+
expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)])
1875+
expected.index.name = 'Time'
1876+
1877+
df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0)
1878+
tm.assert_frame_equal(expected, df)
1879+
1880+
df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time')
1881+
tm.assert_frame_equal(expected, df)
1882+
1883+
df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time')
1884+
tm.assert_frame_equal(expected, df)
1885+
1886+
df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0)
1887+
tm.assert_frame_equal(expected, df)
1888+
1889+
expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)})
1890+
expected = expected.set_index(['Price', 'P2'])
1891+
df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
1892+
tm.assert_frame_equal(expected, df)
1893+
18681894

18691895
class TestPythonParser(ParserTests, unittest.TestCase):
18701896

0 commit comments

Comments
 (0)