Skip to content

BUG: Add warning if rows have more columns than expected #33782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b042377
Add warnings when rows in csv file have too many values
mproszewska Apr 25, 2020
89a04c5
Remove unused variable
mproszewska Apr 25, 2020
23c9109
Add helper function
mproszewska May 5, 2020
77537c2
Add comma in test
mproszewska May 5, 2020
5c0dfb4
Merge branch 'master' into csv
mproszewska May 10, 2020
9bb7a86
Include index_col and usecols in check
mproszewska May 10, 2020
2d661e8
Run black
mproszewska May 10, 2020
61d66ab
Add docstring
mproszewska May 10, 2020
c94b45e
PERF: Remove unnecessary copies in sorting functions
mproszewska May 15, 2020
0ab450b
Run tests
mproszewska May 16, 2020
54c7304
Run tests
mproszewska May 16, 2020
e00993d
Move function
mproszewska May 22, 2020
6d72a34
Add asv
mproszewska May 22, 2020
5ba54a6
Run black
mproszewska May 22, 2020
2766270
Remove asv
mproszewska May 22, 2020
91176ca
Merge branch 'perf'
mproszewska May 24, 2020
412cd45
Run tests
mproszewska May 28, 2020
f748b78
Merge remote-tracking branch 'upstream/master'
mproszewska May 28, 2020
c04c494
Merge remote-tracking branch 'upstream/master'
mproszewska Jun 1, 2020
f1807ee
Merge branch 'master' into csv
mproszewska Jun 1, 2020
4d7c568
Remove newline
mproszewska Jun 1, 2020
bbe77ca
Fix
mproszewska Jun 3, 2020
d9aa319
Add asv
mproszewska Jun 5, 2020
0afb1b1
Fix
mproszewska Jun 5, 2020
35539d0
Add warnings when rows in csv file have too many values
mproszewska Apr 25, 2020
358113b
Remove unused variable
mproszewska Apr 25, 2020
ab22429
Add helper function
mproszewska May 5, 2020
996213d
Add comma in test
mproszewska May 5, 2020
17d9b12
Include index_col and usecols in check
mproszewska May 10, 2020
44a5da5
Run black
mproszewska May 10, 2020
c191274
Add docstring
mproszewska May 10, 2020
0567294
Move function
mproszewska May 22, 2020
31c9bd0
Run tests
mproszewska May 28, 2020
9a84498
Remove newline
mproszewska Jun 1, 2020
459250b
Fix
mproszewska Jun 3, 2020
cd0ad9e
Merge branch 'csv' of https://github.com/mproszewska/pandas into csv
mproszewska Oct 8, 2020
cd1239f
Resolve conflicts
mproszewska Oct 8, 2020
6ad230a
Merge branch 'master' into csv
mproszewska Oct 8, 2020
18f3767
Run black
mproszewska Oct 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def _generate_dataframe():
N = 2000
N = 20000
C = 5
df = DataFrame(
np.random.randn(N, C),
Expand Down Expand Up @@ -69,5 +69,9 @@ def time_read_excel(self, engine):
fname = self.fname_odf if engine == "odf" else self.fname_excel
read_excel(fname, engine=engine)

def nrows_read_excel(self, engine):
    """
    Read a single row from the benchmark workbook with the given engine.

    The ODF fixture file is used for the "odf" engine and the Excel fixture
    file for every other engine.

    NOTE(review): asv appears to collect benchmarks by name prefix
    (``time_``, ``mem_``, ``peakmem_``, ``track_``); this method has none of
    them, so it may never be executed -- confirm and rename if intended.
    """
    if engine == "odf":
        path = self.fname_odf
    else:
        path = self.fname_excel
    read_excel(path, engine=engine, nrows=1)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
34 changes: 32 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
from pandas.core.arrays import Categorical
Expand Down Expand Up @@ -1322,6 +1322,31 @@ def _validate_parse_dates_arg(parse_dates):
return parse_dates


def _check_unexpected_data(columns, data, index_col):
    """
    Warn when parsed data holds more columns than the header accounts for.

    The expected number of columns is the number of column names plus the
    number of index columns. A ``ParserWarning`` is emitted only when there
    are surplus columns and ``notna`` reports at least one non-missing value
    among them. Nothing is returned.

    Parameters
    ----------
    columns : list
        List that contains column names.
    data : array-like
        Object that contains column data, one entry per parsed column.
    index_col : list, scalar, None or False
        Columns to use as the index. ``None`` and ``False`` mean there are no
        index columns; a single label or position counts as one column.
    """
    if index_col is None or index_col is False:
        index_count = 0
    elif isinstance(index_col, str) or not hasattr(index_col, "__len__"):
        # A lone position/label is one index column; calling len() on it
        # would either fail (int) or count characters (str).
        index_count = 1
    else:
        index_count = len(index_col)

    expected_columns = len(columns) + index_count
    actual_columns = len(data)

    # Only surplus columns can carry unexpected values; fewer columns than
    # expected never triggers this warning.
    if actual_columns <= expected_columns:
        return

    if notna(data[expected_columns:]).any():
        warnings.warn(
            "Expected {} columns instead of {}".format(
                expected_columns, actual_columns
            ),
            ParserWarning,
            stacklevel=2,
        )


class ParserBase:
def __init__(self, kwds):
self.names = kwds.get("names")
Expand Down Expand Up @@ -2136,6 +2161,8 @@ def read(self, nrows=None):

# columns as list
alldata = [x[1] for x in data]
if self.usecols is None:
_check_unexpected_data(names, data, self.index_col)

data = {k: v for k, (i, v) in zip(names, data)}

Expand All @@ -2144,7 +2171,6 @@ def read(self, nrows=None):

# maybe create a mi on the columns
names = self._maybe_make_multi_index_columns(names, self.col_names)

return index, names, data

def _filter_usecols(self, names):
Expand Down Expand Up @@ -2495,6 +2521,10 @@ def read(self, rows=None):
content = content[1:]

alldata = self._rows_to_cols(content)

if self.usecols is None:
_check_unexpected_data(columns, alldata, self.index_col)

data = self._exclude_implicit_index(alldata)

columns = self._maybe_dedup_names(self.columns)
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pytest

from pandas._libs.tslib import Timestamp
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning
import pandas.util._test_decorators as td

from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context
Expand Down Expand Up @@ -1071,8 +1071,8 @@ def test_trailing_delimiters(all_parsers):
4,5,6,
7,8,9,"""
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)

with tm.assert_produces_warning(ParserWarning):
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -2178,7 +2178,8 @@ def test_no_header_two_extra_columns(all_parsers):
ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
stream = StringIO("foo,bar,baz,bam,blah")
parser = all_parsers
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
with tm.assert_produces_warning(ParserWarning):
df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
tm.assert_frame_equal(df, ref)


Expand Down Expand Up @@ -2241,3 +2242,10 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


def test_first_row_length(all_parsers):
    # Three header names, but the first data row carries a fourth field
    # ("X"); reading it is expected to produce a ParserWarning.
    csv_text = "col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8"
    buffer = StringIO(csv_text)
    with tm.assert_produces_warning(ParserWarning):
        all_parsers.read_csv(buffer, index_col=False)