Skip to content

BUG: Add warning if rows have more columns than expected #33782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b042377
Add warnings when rows in csv file have too many values
mproszewska Apr 25, 2020
89a04c5
Remove unused variable
mproszewska Apr 25, 2020
23c9109
Add helper function
mproszewska May 5, 2020
77537c2
Add comma in test
mproszewska May 5, 2020
5c0dfb4
Merge branch 'master' into csv
mproszewska May 10, 2020
9bb7a86
Include index_col and usecols in check
mproszewska May 10, 2020
2d661e8
Run black
mproszewska May 10, 2020
61d66ab
Add docstring
mproszewska May 10, 2020
c94b45e
PERF: Remove unnecessary copies in sorting functions
mproszewska May 15, 2020
0ab450b
Run tests
mproszewska May 16, 2020
54c7304
Run tests
mproszewska May 16, 2020
e00993d
Move function
mproszewska May 22, 2020
6d72a34
Add asv
mproszewska May 22, 2020
5ba54a6
Run black
mproszewska May 22, 2020
2766270
Remove asv
mproszewska May 22, 2020
91176ca
Merge branch 'perf'
mproszewska May 24, 2020
412cd45
Run tests
mproszewska May 28, 2020
f748b78
Merge remote-tracking branch 'upstream/master'
mproszewska May 28, 2020
c04c494
Merge remote-tracking branch 'upstream/master'
mproszewska Jun 1, 2020
f1807ee
Merge branch 'master' into csv
mproszewska Jun 1, 2020
4d7c568
Remove newline
mproszewska Jun 1, 2020
bbe77ca
Fix
mproszewska Jun 3, 2020
d9aa319
Add asv
mproszewska Jun 5, 2020
0afb1b1
Fix
mproszewska Jun 5, 2020
35539d0
Add warnings when rows in csv file have too many values
mproszewska Apr 25, 2020
358113b
Remove unused variable
mproszewska Apr 25, 2020
ab22429
Add helper function
mproszewska May 5, 2020
996213d
Add comma in test
mproszewska May 5, 2020
17d9b12
Include index_col and usecols in check
mproszewska May 10, 2020
44a5da5
Run black
mproszewska May 10, 2020
c191274
Add docstring
mproszewska May 10, 2020
0567294
Move function
mproszewska May 22, 2020
31c9bd0
Run tests
mproszewska May 28, 2020
9a84498
Remove newline
mproszewska Jun 1, 2020
459250b
Fix
mproszewska Jun 3, 2020
cd0ad9e
Merge branch 'csv' of https://github.com/mproszewska/pandas into csv
mproszewska Oct 8, 2020
cd1239f
Resolve conflicts
mproszewska Oct 8, 2020
6ad230a
Merge branch 'master' into csv
mproszewska Oct 8, 2020
18f3767
Run black
mproszewska Oct 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
from pandas.core.arrays import Categorical
Expand Down Expand Up @@ -2151,15 +2151,19 @@ def read(self, nrows=None):

# columns as list
alldata = [x[1] for x in data]

if len(names) != len(data) and notna(data[len(names) :]).any():
warnings.warn(
"Expected {} columns instead of {}".format(len(names), len(data)),
ParserWarning,
stacklevel=2,
)
data = {k: v for k, (i, v) in zip(names, data)}

names, data = self._do_date_conversions(names, data)
index, names = self._make_index(data, alldata, names)

# maybe create a mi on the columns
names = self._maybe_make_multi_index_columns(names, self.col_names)

return index, names, data

def _filter_usecols(self, names):
Expand Down Expand Up @@ -2508,6 +2512,13 @@ def read(self, rows=None):
content = content[1:]

alldata = self._rows_to_cols(content)
if len(columns) != len(alldata) and notna(alldata[len(columns) :]).any():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we need the notna check?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the example mentioned in the linked issue, an additional comma was added to one row. I assumed that extra trailing commas are common, so we can ignore them and not raise a warning.
I'm using notna to check whether the data that won't be included contains only NaN values.

warnings.warn(
"Expected {} columns instead of {}".format(len(columns), len(alldata)),
ParserWarning,
stacklevel=2,
)

data = self._exclude_implicit_index(alldata)

columns = self._maybe_dedup_names(self.columns)
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pytest

from pandas._libs.tslib import Timestamp
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning
import pandas.util._test_decorators as td

from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
Expand Down Expand Up @@ -2124,5 +2124,13 @@ def test_no_header_two_extra_columns(all_parsers):
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
stream = StringIO("foo,bar,baz,bam,blah")
parser = all_parsers
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
tm.assert_frame_equal(df, ref)
with tm.assert_produces_warning(ParserWarning):
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
tm.assert_frame_equal(df, ref)


def test_first_row_length(all_parsers):
    """A first data row with one extra field must trigger a ParserWarning."""
    # The header declares three columns, but the first data row carries a
    # fourth value ("X"); the remaining rows match the header width.
    csv_text = "col1,col2,col3\n0,1,2,X\n4,5,6\n6,7,8"
    with tm.assert_produces_warning(ParserWarning):
        all_parsers.read_csv(StringIO(csv_text), index_col=False)