diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..e9776ff2c641e 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def nrows_read_excel(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ede4fdc5e1d8b..209883dab3ce6 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -49,7 +49,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms from pandas.core.arrays import Categorical @@ -1322,6 +1322,31 @@ def _validate_parse_dates_arg(parse_dates): return parse_dates +def _check_unexpected_data(columns, data, index_col): + """ + Checks if the number of columns in data matches the expected number. + Raises a warning if those numbers don't match. + + Parameters + ---------- + columns : list + List that contains column names. + data : array-like + Object that contains column data. + index_col : list or False, optional + Columns to use as the index. 
+ """ + if index_col is None or index_col is False: + index_col = [] + expected_columns = len(columns) + len(index_col) + if expected_columns != len(data) and notna(data[expected_columns:]).any(): + warnings.warn( + "Expected {} columns instead of {}".format(expected_columns, len(data)), + ParserWarning, + stacklevel=2, + ) + + class ParserBase: def __init__(self, kwds): self.names = kwds.get("names") @@ -2136,6 +2161,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data] + if self.usecols is None: + _check_unexpected_data(names, data, self.index_col) data = {k: v for k, (i, v) in zip(names, data)} @@ -2144,7 +2171,6 @@ def read(self, nrows=None): # maybe create a mi on the columns names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, data def _filter_usecols(self, names): @@ -2495,6 +2521,10 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) + + if self.usecols is None: + _check_unexpected_data(columns, alldata, self.index_col) + data = self._exclude_implicit_index(alldata) columns = self._maybe_dedup_names(self.columns) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index a6a9e5c5610f2..f199267d0d462 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,7 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.errors import DtypeWarning, EmptyDataError, ParserError +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context @@ -1071,8 +1071,8 @@ def test_trailing_delimiters(all_parsers): 4,5,6, 7,8,9,""" parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=False) - + with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv(StringIO(data), index_col=False) 
expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) tm.assert_frame_equal(result, expected) @@ -2178,7 +2178,8 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) @@ -2241,3 +2242,10 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): with pytest.raises(ValueError, match=msg): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + + +def test_first_row_length(all_parsers): + stream = StringIO("col1,col2,col3\n0,1,2,X\n4,5,6,\n6,7,8") + parser = all_parsers + with tm.assert_produces_warning(ParserWarning): + parser.read_csv(stream, index_col=False)