Skip to content

Commit 6bb558d

Browse files
committed
BUG: Patch missing data handling with usecols
Closes gh-6710. Closes gh-8985.
1 parent c71f214 commit 6bb558d

File tree

5 files changed

+44
-3
lines changed

5 files changed

+44
-3
lines changed

doc/source/io.rst

+13
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,19 @@ You can elect to skip bad lines:
12151215
0 1 2 3
12161216
1 8 9 10
12171217
1218+
You can also use the ``usecols`` parameter to eliminate extraneous column
1219+
data that appear in some lines but not others:
1220+
1221+
.. code-block:: ipython
1222+
1223+
In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
1224+
1225+
Out[30]:
1226+
a b c
1227+
0 1 2 3
1228+
1 4 5 6
1229+
2 8 9 10
1230+
12181231
.. _io.quoting:
12191232

12201233
Quoting and Escape Characters

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ Bug Fixes
306306
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly cast (:issue:`14941`, :issue:`15005`)
307307
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
308308
- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
309+
- Bug in ``pd.read_csv()`` in which rows with missing or extra fields were being improperly handled with ``usecols`` (:issue:`6710`, :issue:`8985`)
309310

310311
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
311312

pandas/io/parsers.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2295,11 +2295,12 @@ def _infer_columns(self):
22952295
columns = [lrange(ncols)]
22962296
columns = self._handle_usecols(columns, columns[0])
22972297
else:
2298-
if self.usecols is None or len(names) == num_original_columns:
2298+
if self.usecols is None or len(names) >= num_original_columns:
22992299
columns = self._handle_usecols([names], names)
23002300
num_original_columns = len(names)
23012301
else:
2302-
if self.usecols and len(names) != len(self.usecols):
2302+
if (not callable(self.usecols) and
2303+
len(names) != len(self.usecols)):
23032304
raise ValueError(
23042305
'Number of passed names did not match number of '
23052306
'header fields in the file'

pandas/io/tests/parser/usecols.py

+25
Original file line numberDiff line numberDiff line change
@@ -440,3 +440,28 @@ def test_callable_usecols(self):
440440
expected = DataFrame()
441441
df = self.read_csv(StringIO(s), usecols=lambda x: False)
442442
tm.assert_frame_equal(df, expected)
443+
444+
def test_incomplete_first_row(self):
445+
# see gh-6710
446+
data = '1,2\n1,2,3'
447+
names = ['a', 'b', 'c']
448+
expected = DataFrame({'a': [1, 1],
449+
'c': [np.nan, 3]})
450+
451+
usecols = ['a', 'c']
452+
df = self.read_csv(StringIO(data), names=names, usecols=usecols)
453+
tm.assert_frame_equal(df, expected)
454+
455+
usecols = lambda x: x in ['a', 'c']
456+
df = self.read_csv(StringIO(data), names=names, usecols=usecols)
457+
tm.assert_frame_equal(df, expected)
458+
459+
def test_uneven_length_cols(self):
460+
# see gh-8985
461+
usecols = [0, 1, 2]
462+
data = '19,29,39\n' * 2 + '10,20,30,40'
463+
expected = DataFrame([[19, 29, 39],
464+
[19, 29, 39],
465+
[10, 20, 30]])
466+
df = self.read_csv(StringIO(data), header=None, usecols=usecols)
467+
tm.assert_frame_equal(df, expected)

pandas/parser.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -1317,7 +1317,8 @@ cdef class TextReader:
13171317

13181318
cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
13191319
if self.has_usecols and self.names is not None:
1320-
if len(self.names) == len(self.usecols):
1320+
if (not callable(self.usecols) and
1321+
len(self.names) == len(self.usecols)):
13211322
return self.names[nused]
13221323
else:
13231324
return self.names[i - self.leading_cols]

0 commit comments

Comments
 (0)