Skip to content

BUG: wrong index name during read_csv if using usecols #5003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ Bug Fixes
- Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
- Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
- Fixed wrong index name in ``read_csv`` when ``usecols`` is specified. Applies to the C parser only. (:issue:`4201`)

pandas 0.12.0
-------------
Expand Down
19 changes: 15 additions & 4 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Module contains tools for processing files into DataFrames or other objects
"""
from __future__ import print_function
from pandas.compat import range, lrange, StringIO, lzip, zip
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types
from pandas import compat
import re
import csv
Expand All @@ -15,7 +15,6 @@
import datetime
import pandas.core.common as com
from pandas.core.config import get_option
from pandas import compat
from pandas.io.date_converters import generic_parser
from pandas.io.common import get_filepath_or_buffer

Expand All @@ -24,7 +23,7 @@
import pandas.lib as lib
import pandas.tslib as tslib
import pandas.parser as _parser
from pandas.tseries.period import Period


_parser_params = """Also supports optionally iterating or breaking of the file
into chunks.
Expand Down Expand Up @@ -982,7 +981,19 @@ def __init__(self, src, **kwds):
else:
self.names = lrange(self._reader.table_width)

# XXX
# If the names were inferred (not passed by user) and usecols is defined,
# then ensure names refers to the used columns, not the document's columns.
if self.usecols and passed_names:
col_indices = []
for u in self.usecols:
if isinstance(u, string_types):
col_indices.append(self.names.index(u))
else:
col_indices.append(u)
self.names = [n for i, n in enumerate(self.names) if i in col_indices]
if len(self.names) < len(self.usecols):
raise ValueError("Usecols do not match names.")

self._set_noconvert_columns()

self.orig_names = self.names
Expand Down
26 changes: 26 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,32 @@ def test_parse_integers_above_fp_precision(self):

self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers']))

def test_usecols_index_col_conflict(self):
# Regression test for issue 4201: when ``usecols`` restricts the columns
# that are read, ``index_col`` (given as a position or as a name) must be
# resolved against the selected columns, and the resulting index must
# carry the matching column name.
data = """SecId,Time,Price,P2,P3
10000,2013-5-11,100,10,1
500,2013-5-12,101,11,1
"""
# Expected frame for the first four cases: a single 'Price' column
# indexed by the parsed 'Time' values, with the index named 'Time'.
expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)])
expected.index.name = 'Time'

# usecols by name, index_col by position: position 0 is 'Time', the first
# of the used columns (not 'SecId', the first column in the data).
df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0)
tm.assert_frame_equal(expected, df)

# usecols by name, index_col by name.
df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time')
tm.assert_frame_equal(expected, df)

# usecols by position, index_col by name.
df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time')
tm.assert_frame_equal(expected, df)

# usecols by position, index_col by position.
df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0)
tm.assert_frame_equal(expected, df)

# Multi-column index given as a list of names: 'Price' and 'P2' become the
# index levels and 'P3' is the only remaining data column.
expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)})
expected = expected.set_index(['Price', 'P2'])
df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
tm.assert_frame_equal(expected, df)


class TestPythonParser(ParserTests, unittest.TestCase):

Expand Down